1/* 2 * Mesa 3-D graphics library 3 * 4 * Copyright 2012 Intel Corporation 5 * Copyright 2013 Google 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a 8 * copy of this software and associated documentation files (the 9 * "Software"), to deal in the Software without restriction, including 10 * without limitation the rights to use, copy, modify, merge, publish, 11 * distribute, sublicense, and/or sell copies of the Software, and to 12 * permit persons to whom the Software is furnished to do so, subject to 13 * the following conditions: 14 * 15 * The above copyright notice and this permission notice (including the 16 * next paragraph) shall be included in all copies or substantial portions 17 * of the Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 * 27 * Authors: 28 * Chad Versace <chad.versace@linux.intel.com> 29 * Frank Henigman <fjhenigman@google.com> 30 */ 31 32#include <string.h> 33 34#include "util/macros.h" 35#include "main/macros.h" 36 37#include "isl_priv.h" 38 39#if defined(__SSSE3__) 40#include <tmmintrin.h> 41#elif defined(__SSE2__) 42#include <emmintrin.h> 43#endif 44 45#define FILE_DEBUG_FLAG DEBUG_TEXTURE 46 47#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b) 48#define ALIGN_UP(a, b) ALIGN(a, b) 49 50/* Tile dimensions. Width and span are in bytes, height is in pixels (i.e. 51 * unitless). A "span" is the most number of bytes we can copy from linear 52 * to tiled without needing to calculate a new destination address. 
/* A "span" is the longest linear stretch within a tile: the most bytes we
 * can copy before a new destination address must be computed.
 */
static const uint32_t xtile_width = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span = 64;
static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;

/* Rotate right by 'd' bits.  'd' must satisfy 0 < d < 32 (a shift by 32
 * would be undefined behavior); the only caller below uses d == 8.
 */
static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   return (n >> d) | (n << (32 - d));
}

/* Handle conflicting declaration and conflicting macro in netbsd.
 * Note: a disabled (#if 0) open-coded fallback used to live here; the
 * builtin is always used now, so the dead code has been removed.
 */
#undef bswap32
#define bswap32(n) __builtin_bswap32(n)

/**
 * Copy RGBA to BGRA - swap R and B.
 *
 * 'bytes' must be a multiple of 4.  For each 32-bit pixel, byte-swapping
 * and then rotating right by 8 exchanges bytes 0 and 2 (R and B) while
 * leaving G and A in place.
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   uint32_t *d = dst;
   uint32_t const *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      *d = ror(bswap32(*s), 8);
      d += 1;
      s += 1;
      bytes -= 4;
   }
   return dst;
}

#ifdef __SSSE3__
/* pshufb control: swaps bytes 0 and 2 within every 32-bit pixel. */
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

/* Copy 16 bytes with R/B swapped; 'dst' must be 16-byte aligned. */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   _mm_store_si128(dst,
                   _mm_shuffle_epi8(_mm_loadu_si128(src),
                                    *(__m128i *)rgba8_permutation));
}

/* Copy 16 bytes with R/B swapped; 'src' must be 16-byte aligned. */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   _mm_storeu_si128(dst,
                    _mm_shuffle_epi8(_mm_load_si128(src),
                                     *(__m128i *)rgba8_permutation));
}

#elif defined(__SSE2__)
/* Copy 16 bytes with R/B swapped; 'dst' must be 16-byte aligned.
 * SSE2 has no byte shuffle, so mask A/G out, swap the 16-bit R and B
 * lanes within each pixel, and OR the halves back together.
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_loadu_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_store_si128((__m128i *)dst, dstreg);
}

/* Copy 16 bytes with R/B swapped; 'src' must be 16-byte aligned. */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_load_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_storeu_si128((__m128i *)dst, dstreg);
}
#endif

/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 *
 * Note: when the 16-byte loop runs, the returned pointer is the advanced
 * 'dst', not the original - callers here ignore the return value.
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   /* Fast path for the common whole-span (64-byte) copy. */
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(dst +  0, src +  0);
      rgba8_copy_16_aligned_dst(dst + 16, src + 16);
      rgba8_copy_16_aligned_dst(dst + 32, src + 32);
      rgba8_copy_16_aligned_dst(dst + 48, src + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   /* Scalar tail, or the whole copy when no SSE is available. */
   rgba8_copy(dst, src, bytes);

   return dst;
}

/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 */
193 */ 194static inline void * 195rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes) 196{ 197 assert(bytes == 0 || !(((uintptr_t)src) & 0xf)); 198 199#if defined(__SSSE3__) || defined(__SSE2__) 200 if (bytes == 64) { 201 rgba8_copy_16_aligned_src(dst + 0, src + 0); 202 rgba8_copy_16_aligned_src(dst + 16, src + 16); 203 rgba8_copy_16_aligned_src(dst + 32, src + 32); 204 rgba8_copy_16_aligned_src(dst + 48, src + 48); 205 return dst; 206 } 207 208 while (bytes >= 16) { 209 rgba8_copy_16_aligned_src(dst, src); 210 src += 16; 211 dst += 16; 212 bytes -= 16; 213 } 214#endif 215 216 rgba8_copy(dst, src, bytes); 217 218 return dst; 219} 220 221/** 222 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3). 223 * These ranges are in bytes, i.e. pixels * bytes-per-pixel. 224 * The first and last ranges must be shorter than a "span" (the longest linear 225 * stretch within a tile) and the middle must equal a whole number of spans. 226 * Ranges may be empty. The region copied must land entirely within one tile. 227 * 'dst' is the start of the tile and 'src' is the corresponding 228 * address to copy from, though copying begins at (x0, y0). 229 * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero. 230 * Swizzling flips bit 6 in the copy destination offset, when certain other 231 * bits are set in it. 232 */ 233typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 234 uint32_t y0, uint32_t y1, 235 char *dst, const char *src, 236 int32_t linear_pitch, 237 uint32_t swizzle_bit, 238 isl_memcpy_type copy_type); 239 240/** 241 * Copy texture data from linear to X tile layout. 242 * 243 * \copydoc tile_copy_fn 244 * 245 * The mem_copy parameters allow the user to specify an alternative mem_copy 246 * function that, for instance, may do RGBA -> BGRA swizzling. 
/* The unaligned-edge copier (mem_copy) must handle any alignment, while
 * mem_copy_align16 may assume 16-byte alignment on whichever side (source
 * or destination) is tiled.
 */
static inline void
linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* The copy destination offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy destination offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      /* Leading unaligned part [x0,x1), possibly empty. */
      mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);

      /* Span-aligned middle [x1,x2), a whole number of spans. */
      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
      }

      /* Trailing part [x2,x3); 'xo' equals x2 here because [x1,x2) is a
       * whole number of spans (asserted by the callers).
       */
      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

      src += src_pitch;
   }
}

/**
 * Copy texture data from linear to Y tile layout.
 *
 * \copydoc tile_copy_fn
 *
 * Rows are processed in three groups: a leading group up to the next
 * multiple of 4 rows, a middle group copied 4 rows at a time (unrolled),
 * and a trailing group of fewer than 4 rows.
 */
static inline void
linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y3,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile).  Thus the destination offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy destination offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   /* Split [y0,y3) into [y0,y1), [y1,y2), [y2,y3): the middle range is the
    * largest 4-row-aligned part and is copied with the unrolled loop below.
    */
   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the destination offset control swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   /* Leading rows until the row index is a multiple of 4. */
   if (y0 != y1) {
      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

         src += src_pitch;
      }
   }

   /* Middle rows: 4 rows per iteration, manually unrolled. */
   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      if (x0 != x1) {
         mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
      }

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      if (x2 != x3) {
         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
      }

      src += 4 * src_pitch;
   }

   /* Trailing rows (fewer than 4). */
   if (y2 != y3) {
      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

         src += src_pitch;
      }
   }
}

/**
 * Copy texture data from X tile layout to linear.
 *
 * \copydoc tile_copy_fn
 *
 * Mirror image of linear_to_xtiled: the tiled address arithmetic is applied
 * to the source side instead of the destination side.
 */
static inline void
xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* The copy destination offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy destination offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
      }

      /* 'xo' equals x2 here; see linear_to_xtiled. */
      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

      dst += dst_pitch;
   }
}

/**
 * Copy texture data from Y tile layout to linear.
 *
 * \copydoc tile_copy_fn
 *
 * Mirror image of linear_to_ytiled, with the same leading / 4-row-unrolled /
 * trailing row grouping.
 */
static inline void
ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y3,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile).  Thus the destination offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy destination offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   /* [y0,y1) leading rows, [y1,y2) 4-row-aligned middle, [y2,y3) trailing. */
   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the destination offset control swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   if (y0 != y1) {
      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

         dst += dst_pitch;
      }
   }

   /* Middle rows: 4 rows per iteration, manually unrolled. */
   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      if (x0 != x1) {
         mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 + yo + 1 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 * column_width) ^ swizzle0), x1 - x0);
      }

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      if (x2 != x3) {
         mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), x3 - x2);
      }

      dst += 4 * dst_pitch;
   }

   if (y2 != y3) {
      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

         dst += dst_pitch;
      }
   }
}

#if defined(INLINE_SSE41)
/* memcpy variant that reads through the movntdqa streaming-load path.
 * Only exact 16- and 64-byte copies use the streaming loads; other sizes
 * (the unaligned tile edges, always < 64 bytes) fall back to memcpy.
 */
static ALWAYS_INLINE void *
_memcpy_streaming_load(void *dest, const void *src, size_t count)
{
   if (count == 16) {
      __m128i val = _mm_stream_load_si128((__m128i *)src);
      _mm_storeu_si128((__m128i *)dest, val);
      return dest;
   } else if (count == 64) {
      __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
      __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
      __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
      __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
      _mm_storeu_si128(((__m128i *)dest) + 0, val0);
      _mm_storeu_si128(((__m128i *)dest) + 1, val1);
      _mm_storeu_si128(((__m128i *)dest) + 2, val2);
      _mm_storeu_si128(((__m128i *)dest) + 3, val3);
      return dest;
   } else {
      assert(count < 64); /* and (count < 16) for ytiled */
      return memcpy(dest, src, count);
   }
}
#endif

/* Map an isl_memcpy_type to the concrete copy function used for the
 * unaligned tile edges.  The *_faster wrappers below compare the result
 * against known functions to pick specialized aligned copiers as well.
 */
static isl_mem_copy_fn
choose_copy_function(isl_memcpy_type copy_type)
{
   switch(copy_type) {
   case ISL_MEMCPY:
      return memcpy;
   case ISL_MEMCPY_BGRA8:
      return rgba8_copy;
   case ISL_MEMCPY_STREAMING_LOAD:
#if defined(INLINE_SSE41)
      return _memcpy_streaming_load;
#else
      unreachable("ISL_MEMCOPY_STREAMING_LOAD requires sse4.1");
#endif
   case ISL_MEMCPY_INVALID:
      unreachable("invalid copy_type");
   }
   unreachable("unhandled copy_type");
   return NULL;
}

/**
 * Copy texture data from linear to X tile layout, faster.
 *
 * Same as \ref linear_to_xtiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
626 * 627 * \copydoc tile_copy_fn 628 */ 629static FLATTEN void 630linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 631 uint32_t y0, uint32_t y1, 632 char *dst, const char *src, 633 int32_t src_pitch, 634 uint32_t swizzle_bit, 635 isl_memcpy_type copy_type) 636{ 637 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type); 638 639 if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) { 640 if (mem_copy == memcpy) 641 return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height, 642 dst, src, src_pitch, swizzle_bit, memcpy, memcpy); 643 else if (mem_copy == rgba8_copy) 644 return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height, 645 dst, src, src_pitch, swizzle_bit, 646 rgba8_copy, rgba8_copy_aligned_dst); 647 else 648 unreachable("not reached"); 649 } else { 650 if (mem_copy == memcpy) 651 return linear_to_xtiled(x0, x1, x2, x3, y0, y1, 652 dst, src, src_pitch, swizzle_bit, 653 memcpy, memcpy); 654 else if (mem_copy == rgba8_copy) 655 return linear_to_xtiled(x0, x1, x2, x3, y0, y1, 656 dst, src, src_pitch, swizzle_bit, 657 rgba8_copy, rgba8_copy_aligned_dst); 658 else 659 unreachable("not reached"); 660 } 661 linear_to_xtiled(x0, x1, x2, x3, y0, y1, 662 dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy); 663} 664 665/** 666 * Copy texture data from linear to Y tile layout, faster. 667 * 668 * Same as \ref linear_to_ytiled but faster, because it passes constant 669 * parameters for common cases, allowing the compiler to inline code 670 * optimized for those cases. 
671 * 672 * \copydoc tile_copy_fn 673 */ 674static FLATTEN void 675linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 676 uint32_t y0, uint32_t y1, 677 char *dst, const char *src, 678 int32_t src_pitch, 679 uint32_t swizzle_bit, 680 isl_memcpy_type copy_type) 681{ 682 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type); 683 684 if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) { 685 if (mem_copy == memcpy) 686 return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height, 687 dst, src, src_pitch, swizzle_bit, memcpy, memcpy); 688 else if (mem_copy == rgba8_copy) 689 return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height, 690 dst, src, src_pitch, swizzle_bit, 691 rgba8_copy, rgba8_copy_aligned_dst); 692 else 693 unreachable("not reached"); 694 } else { 695 if (mem_copy == memcpy) 696 return linear_to_ytiled(x0, x1, x2, x3, y0, y1, 697 dst, src, src_pitch, swizzle_bit, memcpy, memcpy); 698 else if (mem_copy == rgba8_copy) 699 return linear_to_ytiled(x0, x1, x2, x3, y0, y1, 700 dst, src, src_pitch, swizzle_bit, 701 rgba8_copy, rgba8_copy_aligned_dst); 702 else 703 unreachable("not reached"); 704 } 705 linear_to_ytiled(x0, x1, x2, x3, y0, y1, 706 dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy); 707} 708 709/** 710 * Copy texture data from X tile layout to linear, faster. 711 * 712 * Same as \ref xtile_to_linear but faster, because it passes constant 713 * parameters for common cases, allowing the compiler to inline code 714 * optimized for those cases. 
715 * 716 * \copydoc tile_copy_fn 717 */ 718static FLATTEN void 719xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 720 uint32_t y0, uint32_t y1, 721 char *dst, const char *src, 722 int32_t dst_pitch, 723 uint32_t swizzle_bit, 724 isl_memcpy_type copy_type) 725{ 726 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type); 727 728 if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) { 729 if (mem_copy == memcpy) 730 return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height, 731 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); 732 else if (mem_copy == rgba8_copy) 733 return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height, 734 dst, src, dst_pitch, swizzle_bit, 735 rgba8_copy, rgba8_copy_aligned_src); 736#if defined(INLINE_SSE41) 737 else if (mem_copy == _memcpy_streaming_load) 738 return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height, 739 dst, src, dst_pitch, swizzle_bit, 740 memcpy, _memcpy_streaming_load); 741#endif 742 else 743 unreachable("not reached"); 744 } else { 745 if (mem_copy == memcpy) 746 return xtiled_to_linear(x0, x1, x2, x3, y0, y1, 747 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); 748 else if (mem_copy == rgba8_copy) 749 return xtiled_to_linear(x0, x1, x2, x3, y0, y1, 750 dst, src, dst_pitch, swizzle_bit, 751 rgba8_copy, rgba8_copy_aligned_src); 752#if defined(INLINE_SSE41) 753 else if (mem_copy == _memcpy_streaming_load) 754 return xtiled_to_linear(x0, x1, x2, x3, y0, y1, 755 dst, src, dst_pitch, swizzle_bit, 756 memcpy, _memcpy_streaming_load); 757#endif 758 else 759 unreachable("not reached"); 760 } 761 xtiled_to_linear(x0, x1, x2, x3, y0, y1, 762 dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy); 763} 764 765/** 766 * Copy texture data from Y tile layout to linear, faster. 
767 * 768 * Same as \ref ytile_to_linear but faster, because it passes constant 769 * parameters for common cases, allowing the compiler to inline code 770 * optimized for those cases. 771 * 772 * \copydoc tile_copy_fn 773 */ 774static FLATTEN void 775ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 776 uint32_t y0, uint32_t y1, 777 char *dst, const char *src, 778 int32_t dst_pitch, 779 uint32_t swizzle_bit, 780 isl_memcpy_type copy_type) 781{ 782 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type); 783 784 if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) { 785 if (mem_copy == memcpy) 786 return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height, 787 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); 788 else if (mem_copy == rgba8_copy) 789 return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height, 790 dst, src, dst_pitch, swizzle_bit, 791 rgba8_copy, rgba8_copy_aligned_src); 792#if defined(INLINE_SSE41) 793 else if (copy_type == ISL_MEMCPY_STREAMING_LOAD) 794 return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height, 795 dst, src, dst_pitch, swizzle_bit, 796 memcpy, _memcpy_streaming_load); 797#endif 798 else 799 unreachable("not reached"); 800 } else { 801 if (mem_copy == memcpy) 802 return ytiled_to_linear(x0, x1, x2, x3, y0, y1, 803 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); 804 else if (mem_copy == rgba8_copy) 805 return ytiled_to_linear(x0, x1, x2, x3, y0, y1, 806 dst, src, dst_pitch, swizzle_bit, 807 rgba8_copy, rgba8_copy_aligned_src); 808#if defined(INLINE_SSE41) 809 else if (copy_type == ISL_MEMCPY_STREAMING_LOAD) 810 return ytiled_to_linear(x0, x1, x2, x3, y0, y1, 811 dst, src, dst_pitch, swizzle_bit, 812 memcpy, _memcpy_streaming_load); 813#endif 814 else 815 unreachable("not reached"); 816 } 817 ytiled_to_linear(x0, x1, x2, x3, y0, y1, 818 dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy); 819} 820 821/** 822 * Copy from linear to tiled texture. 
/* See \ref tile_copy_fn for the coordinate and swizzle conventions used by
 * the per-tile copy functions invoked below.
 */
static void
intel_linear_to_tiled(uint32_t xt1, uint32_t xt2,
                      uint32_t yt1, uint32_t yt2,
                      char *dst, const char *src,
                      uint32_t dst_pitch, int32_t src_pitch,
                      bool has_swizzling,
                      enum isl_tiling tiling,
                      isl_memcpy_type copy_type)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   /* Pick tile geometry and the matching single-tile copier. */
   if (tiling == ISL_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = linear_to_xtiled_faster;
   } else if (tiling == ISL_TILING_Y0) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = linear_to_ytiled_faster;
   } else {
      unreachable("unsupported tiling");
   }

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP  (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP  (yt2, th);

   /* Loop over all tiles to which we have something to copy.
    * 'xt' and 'yt' are the origin of the destination tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier.  The destination
          * tile starts at xt * th + yt * dst_pitch bytes; the source is
          * linear and addressed relative to (xt1, yt1).
          */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t)xt * th + (ptrdiff_t)yt * dst_pitch,
                   src + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * src_pitch,
                   src_pitch,
                   swizzle_bit,
                   copy_type);
      }
   }
}

/**
 * Copy from tiled to linear texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'dst' is the address of (xt1, yt1) in the destination linear texture.
 * 'src' is the address of (0, 0) in the source tiled texture.
 */
static void
intel_tiled_to_linear(uint32_t xt1, uint32_t xt2,
                      uint32_t yt1, uint32_t yt2,
                      char *dst, const char *src,
                      int32_t dst_pitch, uint32_t src_pitch,
                      bool has_swizzling,
                      enum isl_tiling tiling,
                      isl_memcpy_type copy_type)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   /* Pick tile geometry and the matching single-tile copier. */
   if (tiling == ISL_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = xtiled_to_linear_faster;
   } else if (tiling == ISL_TILING_Y0) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = ytiled_to_linear_faster;
   } else {
      unreachable("unsupported tiling");
   }

#if defined(INLINE_SSE41)
   if (copy_type == ISL_MEMCPY_STREAMING_LOAD) {
      /* The hidden cacheline sized register used by movntdqa can apparently
       * give you stale data, so do an mfence to invalidate it.
       */
      _mm_mfence();
   }
#endif

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP  (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP  (yt2, th);

   /* Loop over all tiles to which we have something to copy.
    * 'xt' and 'yt' are the origin of the destination tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier.  Here the source
          * is tiled (xt * th + yt * src_pitch bytes per tile origin) and
          * the destination is linear, addressed relative to (xt1, yt1).
          */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * dst_pitch,
                   src + (ptrdiff_t)xt * th + (ptrdiff_t)yt * src_pitch,
                   dst_pitch,
                   swizzle_bit,
                   copy_type);
      }
   }
}