pan_tiling.c revision 7ec681f3
1/*
2 * Copyright (c) 2011-2013 Luc Verhaegen <libv@skynet.be>
3 * Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
4 * Copyright (c) 2018 Vasily Khoruzhick <anarsoul@gmail.com>
5 * Copyright (c) 2019 Collabora, Ltd.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sub license,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 * DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28#include "pan_tiling.h"
29#include <stdbool.h>
30#include "util/macros.h"
31
32/* This file implements software encode/decode of the tiling format used for
33 * textures and framebuffers primarily on Utgard GPUs. Names for this format
34 * include "Utgard-style tiling", "(Mali) swizzled textures", and
35 * "U-interleaved" (the former two names being used in the community
36 * Lima/Panfrost drivers; the latter name used internally at Arm).
37 * Conceptually, like any tiling scheme, the pixel reordering attempts to
38 * preserve 2D spatial locality, to improve cache locality in both horizontal
39 * and vertical directions.
40 *
41 * This format is tiled: first, the image dimensions must be aligned to 16
42 * pixels in each axis. Once aligned, the image is divided into 16x16 tiles.
43 * This size harmonizes with other properties of the GPU; on Midgard,
44 * framebuffer tiles are logically 16x16 (this is the tile size used in
45 * Transaction Elimination and the minimum tile size used in Hierarchical
46 * Tiling). Conversely, for a standard 4 bytes-per-pixel format (like
47 * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line
48 * size.
49 *
50 * Within each 16x16 block, the bits are reordered according to this pattern:
51 *
52 * | y3 | (x3 ^ y3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
53 *
54 * Basically, interleaving the X and Y bits, with XORs thrown in for every
55 * adjacent bit pair.
56 *
57 * This is cheap to implement both encode/decode in both hardware and software.
58 * In hardware, lines are simply rerouted to reorder and some XOR gates are
59 * thrown in. Software has to be a bit more clever.
60 *
61 * In software, the trick is to divide the pattern into two lines:
62 *
63 *    | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
64 *  ^ |  0 | x3 |  0 | x2 |  0 | x1 |  0 | x0 |
65 *
66 * That is, duplicate the bits of the Y and space out the bits of the X. The
67 * top line is a function only of Y, so it can be calculated once per row and
68 * stored in a register. The bottom line is simply X with the bits spaced out.
69 * Spacing out the X is easy enough with a LUT, or by subtracting+ANDing the
70 * mask pattern (abusing carry bits).
71 *
72 * This format is also supported on Midgard GPUs, where it *can* be used for
73 * textures and framebuffers. That said, in practice it is usually used as a
74 * fallback layout; Midgard introduces Arm FrameBuffer Compression, which is
75 * significantly more efficient than Utgard-style tiling and preferred for both
76 * textures and framebuffers, where possible. For unsupported texture types,
77 * for instance sRGB textures and framebuffers, this tiling scheme is used at a
78 * performance penalty, as AFBC is not compatible.
79 */
80
/* Lookup table that doubles every bit of a 4-bit value: bit n of the index is
 * copied to both bit 2n and bit 2n+1 of the entry, so index 0b1010 maps to
 * 0b11001100 (0xCC). Indexed with the low four bits of Y, the entry supplies
 * the Y contribution to the in-tile index: one copy lands in the plain Y
 * position and the other in the neighbouring XOR position. */

const uint32_t bit_duplication[16] = {
   0x00, /* 0000 */
   0x03, /* 0001 */
   0x0C, /* 0010 */
   0x0F, /* 0011 */
   0x30, /* 0100 */
   0x33, /* 0101 */
   0x3C, /* 0110 */
   0x3F, /* 0111 */
   0xC0, /* 1000 */
   0xC3, /* 1001 */
   0xCC, /* 1010 */
   0xCF, /* 1011 */
   0xF0, /* 1100 */
   0xF3, /* 1101 */
   0xFC, /* 1110 */
   0xFF, /* 1111 */
};
104
/* Lookup table that spreads the bits of a 4-bit value apart: bit n of the
 * index moves to bit 2n of the entry and the odd bits stay zero. Indexed with
 * the low four bits of X, the entry is the X contribution that gets XORed
 * into the in-tile index. */

const unsigned space_4[16] = {
   0x00, /* 0000 */
   0x01, /* 0001 */
   0x04, /* 0010 */
   0x05, /* 0011 */
   0x10, /* 0100 */
   0x11, /* 0101 */
   0x14, /* 0110 */
   0x15, /* 0111 */
   0x40, /* 1000 */
   0x41, /* 1001 */
   0x44, /* 1010 */
   0x45, /* 1011 */
   0x50, /* 1100 */
   0x51, /* 1101 */
   0x54, /* 1110 */
   0x55  /* 1111 */
};
125
/* Tile geometry: the scheme uses square 16x16 tiles, i.e. 256 pixels each. */

enum {
   TILE_WIDTH = 16,
   TILE_HEIGHT = 16,
   PIXELS_PER_TILE = TILE_WIDTH * TILE_HEIGHT
};
131
/* 128-bit pixel container for tiling bpp128 formats. The only operations ever
 * applied to it are whole-value copies and sizeof, so a packed pair of 64-bit
 * halves is a sufficient emulation; when the compiler offers a native 128-bit
 * integer we may as well prefer that. */

#ifndef __SIZEOF_INT128__
typedef struct {
   uint64_t lo;
   uint64_t hi;
} __attribute__((packed)) pan_uint128_t;
#else
typedef __uint128_t pan_uint128_t;
#endif
145
/* 3-byte pixel container for 24 bits-per-pixel formats. Packing removes the
 * tail padding so that sizeof is exactly 3; like the 128-bit container it is
 * only ever copied as a whole. */
typedef struct __attribute__((packed)) {
   uint16_t lo;
   uint8_t hi;
} pan_uint24_t;
150
151/* Optimized routine to tile an aligned ((w & 0xF) == 0) texture. Explanation:
152 *
153 * dest_start precomputes the offset to the beginning of the first horizontal
154 * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
155 * stored linearly, so we get the X tile number by shifting and then multiply
156 * by the bytes per tile.
157 *
158 * We iterate across the pixels we're trying to store in source-order. For each
159 * row in the destination image, we figure out which row of 16x16 block we're
160 * in, by slicing off the lower 4-bits (block_y).
161 *
162 * dest then precomputes the location of the top-left corner of the block the
163 * row starts in. In pixel coordinates (where the origin is the top-left),
164 * (block_y, 0) is the top-left corner of the leftmost tile in this row.  While
165 * pixels are reordered within a block, the blocks themselves are stored
166 * linearly, so multiplying block_y by the pixel stride of the destination
167 * image equals the byte offset of that top-left corner of the block this row
168 * is in.
169 *
170 * On the other hand, the source is linear so we compute the locations of the
171 * start and end of the row in the source by a simple linear addressing.
172 *
173 * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0
174 * y0] value. Since this is constant across a row, we look it up per-row and
175 * store in expanded_y.
176 *
177 * Finally, we iterate each row in source order. In the outer loop, we iterate
178 * each 16 pixel tile. Within each tile, we iterate the 16 pixels (this should
179 * be unrolled), calculating the index within the tile and writing.
180 */
181
182#define TILED_ACCESS_TYPE(pixel_t, shift) \
183static ALWAYS_INLINE void \
184panfrost_access_tiled_image_##pixel_t \
185                              (void *dst, void *src, \
186                               uint16_t sx, uint16_t sy, \
187                               uint16_t w, uint16_t h, \
188                               uint32_t dst_stride, \
189                               uint32_t src_stride, \
190                               bool is_store) \
191{ \
192   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
193   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
194      uint16_t block_y = y & ~0x0f; \
195      uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \
196      pixel_t *source = src + (src_y * src_stride); \
197      pixel_t *source_end = source + w; \
198      unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
199      for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
200         for (uint8_t i = 0; i < 16; ++i) { \
201            unsigned index = expanded_y ^ (space_4[i] << shift); \
202            if (is_store) \
203                *((pixel_t *) (dest + index)) = *(source++); \
204            else \
205                *(source++) = *((pixel_t *) (dest + index)); \
206         } \
207      } \
208   } \
209} \
210
211TILED_ACCESS_TYPE(uint8_t, 0);
212TILED_ACCESS_TYPE(uint16_t, 1);
213TILED_ACCESS_TYPE(uint32_t, 2);
214TILED_ACCESS_TYPE(uint64_t, 3);
215TILED_ACCESS_TYPE(pan_uint128_t, 4);
216
/* Statement macro: slow-path copy of an arbitrary rectangle between a linear
 * buffer and a tiled buffer, one pixel_t at a time, with no alignment
 * requirement on the rectangle.
 *
 *   pixel_t     type whose size is one element (pixel or compressed block)
 *   is_store    nonzero copies linear -> tiled, zero copies tiled -> linear
 *   tile_shift  log2 of the tile edge in elements: 4 gives 16x16 tiles,
 *               2 gives 4x4 tiles
 *
 * The expansion reads these names from the enclosing scope: dst (tiled
 * image), src (linear image, pointing at the first element of the rectangle),
 * sx, sy, w, h (rectangle in elements), dst_stride and src_stride (bytes).
 *
 * Byte offsets are applied through uint8_t pointers: arithmetic directly on
 * void * is a GNU extension rather than ISO C. */
#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
   const unsigned mask = (1 << (tile_shift)) - 1; \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      /* Top of the tile row holding this line, and its byte offset. */ \
      unsigned block_y = y & ~mask; \
      unsigned block_start_s = block_y * dst_stride; \
      unsigned source_start = src_y * src_stride; \
      /* Y contribution to the in-tile index, constant along the line. */ \
      unsigned expanded_y = bit_duplication[y & mask]; \
 \
      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
         /* Elements preceding this tile within its tile row. */ \
         unsigned block_x_s = (x >> (tile_shift)) * (1 << ((tile_shift) * 2)); \
         unsigned index = expanded_y ^ space_4[x & mask]; \
         uint8_t *source = (uint8_t *) src + source_start + \
                           sizeof(pixel_t) * src_x; \
         uint8_t *dest = (uint8_t *) dst + block_start_s + \
                         sizeof(pixel_t) * (block_x_s + index); \
 \
         pixel_t *outp = (pixel_t *) ((is_store) ? dest : source); \
         pixel_t *inp = (pixel_t *) ((is_store) ? source : dest); \
         *outp = *inp; \
      } \
   } \
}
237
/* Statement macro: run TILED_UNALIGNED_TYPE with the element type matching
 * `bpp` (bits per element, read from the enclosing scope). `store` and `shift`
 * are forwarded unchanged. A bpp other than 8, 16, 24, 32, 64 or 128 copies
 * nothing. */
#define TILED_UNALIGNED_TYPES(store, shift) { \
   switch (bpp) { \
   case 8: \
      TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
      break; \
   case 16: \
      TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
      break; \
   case 24: \
      TILED_UNALIGNED_TYPE(pan_uint24_t, store, shift) \
      break; \
   case 32: \
      TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
      break; \
   case 64: \
      TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
      break; \
   case 128: \
      TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
      break; \
   default: \
      break; \
   } \
}
252
253/*
254 * Perform a generic access to a tiled image with a given format. This works
255 * even for block-compressed images on entire blocks at a time. sx/sy/w/h are
256 * specified in pixels, not blocks, but our internal routines work in blocks,
257 * so we divide here. Alignment is assumed.
258 */
259static void
260panfrost_access_tiled_image_generic(void *dst, void *src,
261                               unsigned sx, unsigned sy,
262                               unsigned w, unsigned h,
263                               uint32_t dst_stride,
264                               uint32_t src_stride,
265                               const struct util_format_description *desc,
266                               bool _is_store)
267{
268   unsigned bpp = desc->block.bits;
269
270   /* Convert units */
271   sx /= desc->block.width;
272   sy /= desc->block.height;
273   w = DIV_ROUND_UP(w, desc->block.width);
274   h = DIV_ROUND_UP(h, desc->block.height);
275
276   if (desc->block.width > 1) {
277      if (_is_store)
278         TILED_UNALIGNED_TYPES(true, 2)
279      else
280         TILED_UNALIGNED_TYPES(false, 2)
281   } else {
282      if (_is_store)
283         TILED_UNALIGNED_TYPES(true, 4)
284      else
285         TILED_UNALIGNED_TYPES(false, 4)
286   }
287}
288
/* Address inside the linear buffer `src` of the pixel at image coordinates
 * (_x, _y), where `src` itself points at the pixel (orig_x, orig_y). Reads
 * orig_x, orig_y, src_stride (bytes per row) and bpp (bits per pixel) from the
 * scope in which it is expanded. */
#define OFFSET(src, _x, _y) \
   ((void *) ((uint8_t *) (src) + \
              ((_y) - orig_y) * src_stride + \
              (((_x) - orig_x) * (bpp / 8))))
290
291static ALWAYS_INLINE void
292panfrost_access_tiled_image(void *dst, void *src,
293                           unsigned x, unsigned y,
294                           unsigned w, unsigned h,
295                           uint32_t dst_stride,
296                           uint32_t src_stride,
297                           enum pipe_format format,
298                           bool is_store)
299{
300   const struct util_format_description *desc = util_format_description(format);
301
302   if (desc->block.width > 1 || desc->block.bits == 24) {
303      panfrost_access_tiled_image_generic(dst, (void *) src,
304            x, y, w, h,
305            dst_stride, src_stride, desc, is_store);
306
307      return;
308   }
309
310   unsigned bpp = desc->block.bits;
311   unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
312   unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
313   unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
314   unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;
315
316   /* First, tile the top portion */
317
318   unsigned orig_x = x, orig_y = y;
319
320   if (first_full_tile_y != y) {
321      unsigned dist = MIN2(first_full_tile_y - y, h);
322
323      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
324            x, y, w, dist,
325            dst_stride, src_stride, desc, is_store);
326
327      if (dist == h)
328         return;
329
330      y += dist;
331      h -= dist;
332   }
333
334   /* Next, the bottom portion */
335   if (last_full_tile_y != (y + h)) {
336      unsigned dist = (y + h) - last_full_tile_y;
337
338      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
339            x, last_full_tile_y, w, dist,
340            dst_stride, src_stride, desc, is_store);
341
342      h -= dist;
343   }
344
345   /* The left portion */
346   if (first_full_tile_x != x) {
347      unsigned dist = MIN2(first_full_tile_x - x, w);
348
349      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
350            x, y, dist, h,
351            dst_stride, src_stride, desc, is_store);
352
353      if (dist == w)
354         return;
355
356      x += dist;
357      w -= dist;
358   }
359
360   /* Finally, the right portion */
361   if (last_full_tile_x != (x + w)) {
362      unsigned dist = (x + w) - last_full_tile_x;
363
364      panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
365            last_full_tile_x, y, dist, h,
366            dst_stride, src_stride, desc, is_store);
367
368      w -= dist;
369   }
370
371   if (bpp == 8)
372      panfrost_access_tiled_image_uint8_t(dst,  OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
373   else if (bpp == 16)
374      panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
375   else if (bpp == 32)
376      panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
377   else if (bpp == 64)
378      panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
379   else if (bpp == 128)
380      panfrost_access_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
381}
382
383/**
384 * Access a tiled image (load or store). Note: the region of interest (x, y, w,
385 * h) is specified in pixels, not blocks. It is expected that these quantities
386 * are aligned to the block size.
387 */
388void
389panfrost_store_tiled_image(void *dst, const void *src,
390                           unsigned x, unsigned y,
391                           unsigned w, unsigned h,
392                           uint32_t dst_stride,
393                           uint32_t src_stride,
394                           enum pipe_format format)
395{
396    panfrost_access_tiled_image(dst, (void *) src,
397        x, y, w, h,
398        dst_stride, src_stride, format, true);
399}
400
401void
402panfrost_load_tiled_image(void *dst, const void *src,
403                           unsigned x, unsigned y,
404                           unsigned w, unsigned h,
405                           uint32_t dst_stride,
406                           uint32_t src_stride,
407                           enum pipe_format format)
408{
409   panfrost_access_tiled_image((void *) src, dst,
410       x, y, w, h,
411       src_stride, dst_stride, format, false);
412}
413