pan_tiling.c revision 7ec681f3
/*
 * Copyright (c) 2011-2013 Luc Verhaegen <libv@skynet.be>
 * Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 * Copyright (c) 2018 Vasily Khoruzhick <anarsoul@gmail.com>
 * Copyright (c) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include "pan_tiling.h"
#include <stdbool.h>
#include "util/macros.h"

/* This file implements software encode/decode of the tiling format used for
 * textures and framebuffers primarily on Utgard GPUs. Names for this format
 * include "Utgard-style tiling", "(Mali) swizzled textures", and
 * "U-interleaved" (the former two names being used in the community
 * Lima/Panfrost drivers; the latter name used internally at Arm).
 * Conceptually, like any tiling scheme, the pixel reordering attempts to
 * exploit 2D spatial locality, to improve cache locality in both the
 * horizontal and vertical directions.
 *
 * This format is tiled: first, the image dimensions must be aligned to 16
 * pixels in each axis. Once aligned, the image is divided into 16x16 tiles.
 * This size harmonizes with other properties of the GPU; on Midgard,
 * framebuffer tiles are logically 16x16 (this is the tile size used in
 * Transaction Elimination and the minimum tile size used in Hierarchical
 * Tiling). Conversely, for a standard 4 bytes-per-pixel format (like
 * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line
 * size.
 *
 * Within each 16x16 block, the bits are reordered according to this pattern:
 *
 * | y3 | (x3 ^ y3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
 *
 * Basically, interleaving the X and Y bits, with XORs thrown in for every
 * adjacent bit pair.
 *
 * Both encode and decode are cheap to implement in hardware and in software.
 * In hardware, the address lines are simply rerouted to reorder the bits and
 * some XOR gates are thrown in. Software has to be a bit more clever.
 *
 * In software, the trick is to divide the pattern into two lines:
 *
 *   | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
 * ^ |  0 | x3 |  0 | x2 |  0 | x1 |  0 | x0 |
 *
 * That is, duplicate the bits of the Y and space out the bits of the X. The
 * top line is a function only of Y, so it can be calculated once per row and
 * stored in a register. The bottom line is simply X with the bits spaced out.
 * Spacing out the X is easy enough with a LUT, or by subtracting+ANDing the
 * mask pattern (abusing carry bits).
 *
 * This format is also supported on Midgard GPUs, where it *can* be used for
 * textures and framebuffers. That said, in practice it is usually used as a
 * fallback layout; Midgard introduces Arm FrameBuffer Compression (AFBC),
 * which is significantly more efficient than Utgard-style tiling and
 * preferred for both textures and framebuffers, where possible. For
 * unsupported texture types, for instance sRGB textures and framebuffers,
 * this tiling scheme is used at a performance penalty, as AFBC is not
 * compatible.
 */
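
/* Purely illustrative sketch, not used by the routines below: the within-tile
 * index of a pixel can be computed directly from the bit pattern above. For
 * example, (x, y) = (5, 3) has x = 0b0101 and y = 0b0011, so the interleaved
 * index is 0b00011110 = 30, which equals bit_duplication[3] ^ space_4[5] with
 * the tables defined below. As for the subtract+AND trick mentioned above:
 * with mask = 0b01010101, subtracting the mask from a spaced-out X is the
 * same as ORing ones into the gap bits and adding one, so the carry ripples
 * across the gaps; (spaced_x - mask) & mask therefore steps to the spacing of
 * x + 1. The helper name below is hypothetical. */

static inline unsigned
pan_reference_tile_index(unsigned x, unsigned y)
{
   unsigned index = 0;

   for (unsigned bit = 0; bit < 4; ++bit) {
      unsigned xb = (x >> bit) & 1;
      unsigned yb = (y >> bit) & 1;

      /* Each (x, y) bit pair contributes | y | (y ^ x) | to the output */
      index |= ((yb << 1) | (xb ^ yb)) << (bit * 2);
   }

   return index;
}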

/* Given the lower 4 bits of the Y coordinate, we would like to
 * duplicate every bit over. So instead of 0b1010, we would like
 * 0b11001100. The idea is that the bits in the Y-only places get Y, and the
 * bits in the XOR places *also* get a Y. */

const uint32_t bit_duplication[16] = {
   0b00000000,
   0b00000011,
   0b00001100,
   0b00001111,
   0b00110000,
   0b00110011,
   0b00111100,
   0b00111111,
   0b11000000,
   0b11000011,
   0b11001100,
   0b11001111,
   0b11110000,
   0b11110011,
   0b11111100,
   0b11111111,
};

/* Space out the bits of a 4-bit nibble */

const unsigned space_4[16] = {
   0b0000000,
   0b0000001,
   0b0000100,
   0b0000101,
   0b0010000,
   0b0010001,
   0b0010100,
   0b0010101,
   0b1000000,
   0b1000001,
   0b1000100,
   0b1000101,
   0b1010000,
   0b1010001,
   0b1010100,
   0b1010101
};

/* The scheme uses 16x16 tiles */

#define TILE_WIDTH 16
#define TILE_HEIGHT 16
#define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)

/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type
 * need only support copies and sizeof, so emulating with a packed structure
 * works well enough, but if there's a native 128-bit type we may as well
 * prefer that. */

#ifdef __SIZEOF_INT128__
typedef __uint128_t pan_uint128_t;
#else
typedef struct {
   uint64_t lo;
   uint64_t hi;
} __attribute__((packed)) pan_uint128_t;
#endif

typedef struct {
   uint16_t lo;
   uint8_t hi;
} __attribute__((packed)) pan_uint24_t;

/* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation:
 *
 * dest_start precomputes the offset to the beginning of the first horizontal
 * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
 * stored linearly, so we get the X tile number by shifting and then
 * multiplying by the bytes per tile.
 *
 * We iterate across the pixels we're trying to store in source order. For
 * each row in the destination image, we figure out which row of 16x16 blocks
 * we're in by masking off the lower 4 bits (block_y).
 *
 * dest then precomputes the location of the top-left corner of the block the
 * row starts in. In pixel coordinates (where the origin is the top-left),
 * (x = 0, y = block_y) is the top-left corner of the leftmost tile in this
 * row. While pixels are reordered within a block, the blocks themselves are
 * stored linearly, so multiplying block_y by the byte stride of one pixel row
 * of the destination image gives the byte offset of the top-left corner of
 * the block this row is in.
 *
 * On the other hand, the source is linear, so we compute the locations of the
 * start and end of the row in the source by simple linear addressing.
 *
 * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0
 * y0] value. Since this is constant across a row, we look it up per-row and
 * store it in expanded_y.
 *
 * Finally, we iterate each row in source order. In the outer loop, we iterate
 * over each 16-pixel tile. Within each tile, we iterate over the 16 pixels
 * (this should be unrolled), calculating the index within the tile and
 * reading or writing.
 */
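
/* A concrete worked example of the addressing above (values derived from the
 * tables and formula in this file, assuming dst_stride is the byte stride of
 * one pixel row): for a 32 bits-per-pixel image (shift = 2) whose aligned
 * width is 64 pixels, dst_stride = 64 * 4 = 256 bytes. The pixel at image
 * coordinates (x, y) = (35, 20) lands in the tile at column 35 >> 4 = 2,
 * row 20 >> 4 = 1. That tile begins at byte
 * (20 & ~0xF) * 256 + 2 * PIXELS_PER_TILE * 4 = 4096 + 2048 = 6144. Within
 * the tile, the index is bit_duplication[20 & 0xF] ^ space_4[35 & 0xF] =
 * 0b00110000 ^ 0b0000101 = 53 pixels, i.e. a byte offset of 53 << 2 = 212,
 * so the pixel lives at byte 6144 + 212 = 6356 of the tiled image. */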

#define TILED_ACCESS_TYPE(pixel_t, shift) \
static ALWAYS_INLINE void \
panfrost_access_tiled_image_##pixel_t \
                              (void *dst, void *src, \
                               uint16_t sx, uint16_t sy, \
                               uint16_t w, uint16_t h, \
                               uint32_t dst_stride, \
                               uint32_t src_stride, \
                               bool is_store) \
{ \
   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      uint16_t block_y = y & ~0x0f; \
      uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \
      pixel_t *source = src + (src_y * src_stride); \
      pixel_t *source_end = source + w; \
      unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
      for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
         for (uint8_t i = 0; i < 16; ++i) { \
            unsigned index = expanded_y ^ (space_4[i] << shift); \
            if (is_store) \
               *((pixel_t *) (dest + index)) = *(source++); \
            else \
               *(source++) = *((pixel_t *) (dest + index)); \
         } \
      } \
   } \
} \

TILED_ACCESS_TYPE(uint8_t, 0);
TILED_ACCESS_TYPE(uint16_t, 1);
TILED_ACCESS_TYPE(uint32_t, 2);
TILED_ACCESS_TYPE(uint64_t, 3);
TILED_ACCESS_TYPE(pan_uint128_t, 4);

#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
   const unsigned mask = (1 << tile_shift) - 1; \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      unsigned block_y = y & ~mask; \
      unsigned block_start_s = block_y * dst_stride; \
      unsigned source_start = src_y * src_stride; \
      unsigned expanded_y = bit_duplication[y & mask]; \
      \
      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
         unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
         unsigned index = expanded_y ^ space_4[x & mask]; \
         uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
         uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
         \
         pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
         pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
         *outp = *inp; \
      } \
   } \
}

#define TILED_UNALIGNED_TYPES(store, shift) { \
   if (bpp == 8) \
      TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
   else if (bpp == 16) \
      TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
   else if (bpp == 24) \
      TILED_UNALIGNED_TYPE(pan_uint24_t, store, shift) \
   else if (bpp == 32) \
      TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
   else if (bpp == 64) \
      TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
   else if (bpp == 128) \
      TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
}

/*
 * Perform a generic access to a tiled image with a given format. This works
 * even for block-compressed images on entire blocks at a time. sx/sy/w/h are
 * specified in pixels, not blocks, but our internal routines work in blocks,
 * so we divide here. Alignment is assumed.
 */
static void
panfrost_access_tiled_image_generic(void *dst, void *src,
                                    unsigned sx, unsigned sy,
                                    unsigned w, unsigned h,
                                    uint32_t dst_stride,
                                    uint32_t src_stride,
                                    const struct util_format_description *desc,
                                    bool _is_store)
{
   unsigned bpp = desc->block.bits;

   /* Convert units */
   sx /= desc->block.width;
   sy /= desc->block.height;
   w = DIV_ROUND_UP(w, desc->block.width);
   h = DIV_ROUND_UP(h, desc->block.height);

   if (desc->block.width > 1) {
      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 2)
      else
         TILED_UNALIGNED_TYPES(false, 2)
   } else {
      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 4)
      else
         TILED_UNALIGNED_TYPES(false, 4)
   }
}

#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8)))
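
/* Descriptive note (added for orientation): the accessor below splits an
 * arbitrary rectangle into up to five pieces, since the optimized routines
 * above require 16-pixel alignment. Roughly:
 *
 *    +----------------------------+
 *    |        unaligned top       |
 *    +------+--------------+------+
 *    | left |   aligned    | right|
 *    |      |   interior   |      |
 *    +------+--------------+------+
 *    |      unaligned bottom      |
 *    +----------------------------+
 *
 * The four unaligned borders go through the generic per-pixel path; the
 * tile-aligned interior that remains is handled by the per-bpp optimized
 * routines. */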

static ALWAYS_INLINE void
panfrost_access_tiled_image(void *dst, void *src,
                            unsigned x, unsigned y,
                            unsigned w, unsigned h,
                            uint32_t dst_stride,
                            uint32_t src_stride,
                            enum pipe_format format,
                            bool is_store)
{
   const struct util_format_description *desc = util_format_description(format);

   if (desc->block.width > 1 || desc->block.bits == 24) {
      panfrost_access_tiled_image_generic(dst, (void *) src,
                                          x, y, w, h,
                                          dst_stride, src_stride, desc, is_store);

      return;
   }

   unsigned bpp = desc->block.bits;
   unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
   unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
   unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
   unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;

   /* First, tile the top portion */

   unsigned orig_x = x, orig_y = y;

   if (first_full_tile_y != y) {
      unsigned dist = MIN2(first_full_tile_y - y, h);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
                                          x, y, w, dist,
                                          dst_stride, src_stride, desc, is_store);

      if (dist == h)
         return;

      y += dist;
      h -= dist;
   }

   /* Next, the bottom portion */
   if (last_full_tile_y != (y + h)) {
      unsigned dist = (y + h) - last_full_tile_y;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
                                          x, last_full_tile_y, w, dist,
                                          dst_stride, src_stride, desc, is_store);

      h -= dist;
   }

   /* The left portion */
   if (first_full_tile_x != x) {
      unsigned dist = MIN2(first_full_tile_x - x, w);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
                                          x, y, dist, h,
                                          dst_stride, src_stride, desc, is_store);

      if (dist == w)
         return;

      x += dist;
      w -= dist;
   }

   /* Finally, the right portion */
   if (last_full_tile_x != (x + w)) {
      unsigned dist = (x + w) - last_full_tile_x;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
                                          last_full_tile_x, y, dist, h,
                                          dst_stride, src_stride, desc, is_store);

      w -= dist;
   }

   if (bpp == 8)
      panfrost_access_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 16)
      panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 32)
      panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 64)
      panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 128)
      panfrost_access_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
}

/**
 * Access a tiled image (load or store). Note: the region of interest (x, y, w,
 * h) is specified in pixels, not blocks. It is expected that these quantities
 * are aligned to the block size.
 */
void
panfrost_store_tiled_image(void *dst, const void *src,
                           unsigned x, unsigned y,
                           unsigned w, unsigned h,
                           uint32_t dst_stride,
                           uint32_t src_stride,
                           enum pipe_format format)
{
   panfrost_access_tiled_image(dst, (void *) src,
                               x, y, w, h,
                               dst_stride, src_stride, format, true);
}

void
panfrost_load_tiled_image(void *dst, const void *src,
                          unsigned x, unsigned y,
                          unsigned w, unsigned h,
                          uint32_t dst_stride,
                          uint32_t src_stride,
                          enum pipe_format format)
{
   panfrost_access_tiled_image((void *) src, dst,
                               x, y, w, h,
                               src_stride, dst_stride, format, false);
}
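
/* A minimal usage sketch, not part of this file: staging a linear RGBA8
 * buffer into a tiled texture. This assumes the tiled allocation is padded to
 * 16 pixels in both dimensions and that both strides are byte strides of a
 * single pixel row; the variable names are hypothetical.
 *
 *    unsigned width = 100, height = 60;
 *    unsigned aligned_width = ALIGN_POT(width, 16);
 *    unsigned aligned_height = ALIGN_POT(height, 16);
 *    uint32_t dst_stride = aligned_width * 4;
 *    uint32_t src_stride = width * 4;
 *    void *tiled = malloc(aligned_height * dst_stride);
 *
 *    panfrost_store_tiled_image(tiled, linear_pixels, 0, 0, width, height,
 *                               dst_stride, src_stride,
 *                               PIPE_FORMAT_R8G8B8A8_UNORM);
 *
 * Reading back is symmetric with panfrost_load_tiled_image, swapping the
 * roles of the linear and tiled buffers (and of the two strides).
 */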