1 /* 2 * Copyright (c) 2011 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 * 23 * Authors: 24 * Chris Wilson <chris (at) chris-wilson.co.uk> 25 * 26 */ 27 28 #ifdef HAVE_CONFIG_H 29 #include "config.h" 30 #endif 31 32 #include "sna.h" 33 34 #if __x86_64__ 35 #define USE_SSE2 1 36 #endif 37 38 #if USE_SSE2 39 #include <xmmintrin.h> 40 41 #if __x86_64__ 42 #define have_sse2() 1 43 #else 44 enum { 45 MMX = 0x1, 46 MMX_EXTENSIONS = 0x2, 47 SSE = 0x6, 48 SSE2 = 0x8, 49 CMOV = 0x10 50 }; 51 52 #ifdef __GNUC__ 53 static unsigned int 54 detect_cpu_features(void) 55 { 56 unsigned int features; 57 unsigned int result = 0; 58 59 char vendor[13]; 60 vendor[0] = 0; 61 vendor[12] = 0; 62 63 asm ( 64 "pushf\n" 65 "pop %%eax\n" 66 "mov %%eax, %%ecx\n" 67 "xor $0x00200000, %%eax\n" 68 "push %%eax\n" 69 "popf\n" 70 "pushf\n" 71 "pop %%eax\n" 72 "mov $0x0, %%edx\n" 73 "xor %%ecx, %%eax\n" 74 "jz 1f\n" 75 76 "mov $0x00000000, %%eax\n" 77 "push %%ebx\n" 78 "cpuid\n" 79 "mov %%ebx, %%eax\n" 80 "pop %%ebx\n" 81 "mov %%eax, %1\n" 82 "mov %%edx, %2\n" 83 "mov %%ecx, %3\n" 84 "mov $0x00000001, %%eax\n" 85 "push %%ebx\n" 86 "cpuid\n" 87 "pop %%ebx\n" 88 "1:\n" 89 "mov %%edx, %0\n" 90 : "=r" (result), "=m" (vendor[0]), "=m" (vendor[4]), "=m" (vendor[8]) 91 :: "%eax", "%ecx", "%edx"); 92 93 features = 0; 94 if (result) { 95 /* result now contains the standard feature bits */ 96 if (result & (1 << 15)) 97 features |= CMOV; 98 if (result & (1 << 23)) 99 features |= MMX; 100 if (result & (1 << 25)) 101 features |= SSE; 102 if (result & (1 << 26)) 103 features |= SSE2; 104 } 105 return features; 106 } 107 #else 108 static unsigned int detect_cpu_features(void) { return 0; } 109 #endif 110 111 static bool have_sse2(void) 112 { 113 static int sse2_present = -1; 114 115 if (sse2_present == -1) 116 sse2_present = detect_cpu_features() & SSE2; 117 118 return sse2_present; 119 } 120 #endif 121 122 static inline __m128i 123 xmm_create_mask_32(uint32_t mask) 124 { 125 return _mm_set_epi32(mask, mask, mask, mask); 126 } 127 128 static inline __m128i 129 xmm_load_128u(const __m128i *src) 130 { 131 return _mm_loadu_si128(src); 132 } 133 134 static inline void 135 xmm_save_128(__m128i *dst, __m128i data) 136 { 137 _mm_store_si128(dst, data); 138 } 139 #endif 140 141 fast void 142 memcpy_blt(const void *src, void *dst, int bpp, 143 int32_t src_stride, int32_t dst_stride, 144 int16_t src_x, int16_t src_y, 145 int16_t dst_x, int16_t dst_y, 146 uint16_t width, uint16_t height) 147 { 148 const uint8_t *src_bytes; 149 uint8_t *dst_bytes; 150 int byte_width; 151 152 assert(src); 153 assert(dst); 154 assert(width && height); 155 assert(bpp >= 8); 156 assert(width*bpp <= 8*src_stride); 157 assert(width*bpp <= 8*dst_stride); 158 159 DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 160 __FUNCTION__, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 161 162 bpp /= 8; 163 164 src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp; 165 dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp; 166 167 byte_width = width * bpp; 168 if (byte_width == src_stride && byte_width == dst_stride) { 169 byte_width *= height; 170 height = 1; 171 } 172 173 switch (byte_width) { 174 case 1: 175 do { 176 *dst_bytes = *src_bytes; 177 src_bytes += src_stride; 178 dst_bytes += dst_stride; 179 } while (--height); 180 break; 181 182 case 2: 183 do { 184 *(uint16_t *)dst_bytes = *(const uint16_t *)src_bytes; 185 src_bytes += src_stride; 186 dst_bytes += dst_stride; 187 } while (--height); 188 break; 189 190 case 4: 191 do { 192 *(uint32_t *)dst_bytes = *(const uint32_t *)src_bytes; 193 src_bytes += src_stride; 194 dst_bytes += dst_stride; 195 } while (--height); 196 break; 197 198 case 8: 199 do { 200 *(uint64_t *)dst_bytes = *(const uint64_t *)src_bytes; 201 src_bytes += src_stride; 202 dst_bytes += dst_stride; 203 } while (--height); 204 break; 205 case 16: 206 do { 207 ((uint64_t *)dst_bytes)[0] = ((const uint64_t *)src_bytes)[0]; 208 ((uint64_t *)dst_bytes)[1] = ((const uint64_t *)src_bytes)[1]; 209 src_bytes += src_stride; 210 dst_bytes += dst_stride; 211 } while (--height); 212 break; 213 214 default: 215 do { 216 memcpy(dst_bytes, src_bytes, byte_width); 217 src_bytes += src_stride; 218 dst_bytes += dst_stride; 219 } while (--height); 220 break; 221 } 222 } 223 224 static fast_memcpy void 225 memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp, 226 int32_t src_stride, int32_t dst_stride, 227 int16_t src_x, int16_t src_y, 228 int16_t dst_x, int16_t dst_y, 229 uint16_t width, uint16_t height) 230 { 231 const unsigned tile_width = 512; 232 const unsigned tile_height = 8; 233 const unsigned tile_size = 4096; 234 235 const unsigned cpp = bpp / 8; 236 const unsigned tile_pixels = tile_width / cpp; 237 const unsigned tile_shift = ffs(tile_pixels) - 1; 238 const unsigned tile_mask = tile_pixels - 1; 239 240 DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 241 __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 242 assert(src != dst); 243 244 if (src_x | src_y) 245 src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; 246 assert(src_stride >= width * cpp); 247 src_stride -= width * cpp; 248 249 while (height--) { 250 unsigned w = width * cpp; 251 uint8_t *tile_row = dst; 252 253 tile_row += dst_y / tile_height * dst_stride * tile_height; 254 tile_row += (dst_y & (tile_height-1)) * tile_width; 255 if (dst_x) { 256 tile_row += (dst_x >> tile_shift) * tile_size; 257 if (dst_x & tile_mask) { 258 const unsigned x = (dst_x & tile_mask) * cpp; 259 const unsigned len = min(tile_width - x, w); 260 memcpy(tile_row + x, src, len); 261 262 tile_row += tile_size; 263 src = (const uint8_t *)src + len; 264 w -= len; 265 } 266 } 267 while (w >= tile_width) { 268 memcpy(tile_row, src, tile_width); 269 270 tile_row += tile_size; 271 src = (const uint8_t *)src + tile_width; 272 w -= tile_width; 273 } 274 memcpy(tile_row, src, w); 275 src = (const uint8_t *)src + src_stride + w; 276 dst_y++; 277 } 278 } 279 280 static fast_memcpy void 281 memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp, 282 int32_t src_stride, int32_t dst_stride, 283 int16_t src_x, int16_t src_y, 284 int16_t dst_x, int16_t dst_y, 285 uint16_t width, uint16_t height) 286 { 287 const unsigned tile_width = 512; 288 const unsigned tile_height = 8; 289 const unsigned tile_size = 4096; 290 291 const unsigned cpp = bpp / 8; 292 const unsigned tile_pixels = tile_width / cpp; 293 const unsigned tile_shift = ffs(tile_pixels) - 1; 294 const unsigned tile_mask = tile_pixels - 1; 295 296 DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 297 __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 298 assert(src != dst); 299 300 if (dst_x | dst_y) 301 dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; 302 assert(dst_stride >= width * cpp); 303 dst_stride -= width * cpp; 304 305 while (height--) { 306 unsigned w = width * cpp; 307 const uint8_t *tile_row = src; 308 309 tile_row += src_y / tile_height * src_stride * tile_height; 310 tile_row += (src_y & (tile_height-1)) * tile_width; 311 if (src_x) { 312 tile_row += (src_x >> tile_shift) * tile_size; 313 if (src_x & tile_mask) { 314 const unsigned x = (src_x & tile_mask) * cpp; 315 const unsigned len = min(tile_width - x, w); 316 memcpy(dst, tile_row + x, len); 317 318 tile_row += tile_size; 319 dst = (uint8_t *)dst + len; 320 w -= len; 321 } 322 } 323 while (w >= tile_width) { 324 memcpy(dst, tile_row, tile_width); 325 326 tile_row += tile_size; 327 dst = (uint8_t *)dst + tile_width; 328 w -= tile_width; 329 } 330 memcpy(dst, tile_row, w); 331 dst = (uint8_t *)dst + dst_stride + w; 332 src_y++; 333 } 334 } 335 336 fast_memcpy static void 337 memcpy_to_tiled_x__swizzle_9(const void *src, void *dst, int bpp, 338 int32_t src_stride, int32_t dst_stride, 339 int16_t src_x, int16_t src_y, 340 int16_t dst_x, int16_t dst_y, 341 uint16_t width, uint16_t height) 342 { 343 const unsigned tile_width = 512; 344 const unsigned tile_height = 8; 345 const unsigned tile_size = 4096; 346 347 const unsigned cpp = bpp / 8; 348 const unsigned stride_tiles = dst_stride / tile_width; 349 const unsigned swizzle_pixels = 64 / cpp; 350 const unsigned tile_pixels = ffs(tile_width / cpp) - 1; 351 const unsigned tile_mask = (1 << tile_pixels) - 1; 352 353 unsigned x, y; 354 355 DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 356 __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 357 358 src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; 359 360 for (y = 0; y < height; ++y) { 361 const uint32_t dy = y + dst_y; 362 const uint32_t tile_row = 363 (dy / tile_height * stride_tiles * tile_size + 364 (dy & (tile_height-1)) * tile_width); 365 const uint8_t *src_row = (const uint8_t *)src + src_stride * y; 366 uint32_t dx = dst_x, offset; 367 368 x = width * cpp; 369 if (dx & (swizzle_pixels - 1)) { 370 const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); 371 const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; 372 offset = tile_row + 373 (dx >> tile_pixels) * tile_size + 374 (dx & tile_mask) * cpp; 375 offset ^= (offset >> 3) & 64; 376 377 memcpy((char *)dst + offset, src_row, length * cpp); 378 379 src_row += length * cpp; 380 x -= length * cpp; 381 dx += length; 382 } 383 while (x >= 64) { 384 offset = tile_row + 385 (dx >> tile_pixels) * tile_size + 386 (dx & tile_mask) * cpp; 387 offset ^= (offset >> 3) & 64; 388 389 memcpy((char *)dst + offset, src_row, 64); 390 391 src_row += 64; 392 x -= 64; 393 dx += swizzle_pixels; 394 } 395 if (x) { 396 offset = tile_row + 397 (dx >> tile_pixels) * tile_size + 398 (dx & tile_mask) * cpp; 399 offset ^= (offset >> 3) & 64; 400 memcpy((char *)dst + offset, src_row, x); 401 } 402 } 403 } 404 405 fast_memcpy static void 406 memcpy_from_tiled_x__swizzle_9(const void *src, void *dst, int bpp, 407 int32_t src_stride, int32_t dst_stride, 408 int16_t src_x, int16_t src_y, 409 int16_t dst_x, int16_t dst_y, 410 uint16_t width, uint16_t height) 411 { 412 const unsigned tile_width = 512; 413 const unsigned tile_height = 8; 414 const unsigned tile_size = 4096; 415 416 const unsigned cpp = bpp / 8; 417 const unsigned stride_tiles = src_stride / tile_width; 418 const unsigned swizzle_pixels = 64 / cpp; 419 const unsigned tile_pixels = ffs(tile_width / cpp) - 1; 420 const unsigned tile_mask = (1 << tile_pixels) - 1; 421 422 unsigned x, y; 423 424 DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 425 __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 426 427 dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; 428 429 for (y = 0; y < height; ++y) { 430 const uint32_t sy = y + src_y; 431 const uint32_t tile_row = 432 (sy / tile_height * stride_tiles * tile_size + 433 (sy & (tile_height-1)) * tile_width); 434 uint8_t *dst_row = (uint8_t *)dst + dst_stride * y; 435 uint32_t sx = src_x, offset; 436 437 x = width * cpp; 438 if (sx & (swizzle_pixels - 1)) { 439 const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels); 440 const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx; 441 offset = tile_row + 442 (sx >> tile_pixels) * tile_size + 443 (sx & tile_mask) * cpp; 444 offset ^= (offset >> 3) & 64; 445 446 memcpy(dst_row, (const char *)src + offset, length * cpp); 447 448 dst_row += length * cpp; 449 x -= length * cpp; 450 sx += length; 451 } 452 while (x >= 64) { 453 offset = tile_row + 454 (sx >> tile_pixels) * tile_size + 455 (sx & tile_mask) * cpp; 456 offset ^= (offset >> 3) & 64; 457 458 memcpy(dst_row, (const char *)src + offset, 64); 459 460 dst_row += 64; 461 x -= 64; 462 sx += swizzle_pixels; 463 } 464 if (x) { 465 offset = tile_row + 466 (sx >> tile_pixels) * tile_size + 467 (sx & tile_mask) * cpp; 468 offset ^= (offset >> 3) & 64; 469 memcpy(dst_row, (const char *)src + offset, x); 470 } 471 } 472 } 473 474 fast_memcpy static void 475 memcpy_to_tiled_x__swizzle_9_10(const void *src, void *dst, int bpp, 476 int32_t src_stride, int32_t dst_stride, 477 int16_t src_x, int16_t src_y, 478 int16_t dst_x, int16_t dst_y, 479 uint16_t width, uint16_t height) 480 { 481 const unsigned tile_width = 512; 482 const unsigned tile_height = 8; 483 const unsigned tile_size = 4096; 484 485 const unsigned cpp = bpp / 8; 486 const unsigned stride_tiles = dst_stride / tile_width; 487 const unsigned swizzle_pixels = 64 / cpp; 488 const unsigned tile_pixels = ffs(tile_width / cpp) - 1; 489 const unsigned tile_mask = (1 << tile_pixels) - 1; 490 491 unsigned x, y; 492 493 DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 494 __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 495 496 src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; 497 498 for (y = 0; y < height; ++y) { 499 const uint32_t dy = y + dst_y; 500 const uint32_t tile_row = 501 (dy / tile_height * stride_tiles * tile_size + 502 (dy & (tile_height-1)) * tile_width); 503 const uint8_t *src_row = (const uint8_t *)src + src_stride * y; 504 uint32_t dx = dst_x, offset; 505 506 x = width * cpp; 507 if (dx & (swizzle_pixels - 1)) { 508 const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); 509 const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; 510 offset = tile_row + 511 (dx >> tile_pixels) * tile_size + 512 (dx & tile_mask) * cpp; 513 offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; 514 515 memcpy((char *)dst + offset, src_row, length * cpp); 516 517 src_row += length * cpp; 518 x -= length * cpp; 519 dx += length; 520 } 521 while (x >= 64) { 522 offset = tile_row + 523 (dx >> tile_pixels) * tile_size + 524 (dx & tile_mask) * cpp; 525 offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; 526 527 memcpy((char *)dst + offset, src_row, 64); 528 529 src_row += 64; 530 x -= 64; 531 dx += swizzle_pixels; 532 } 533 if (x) { 534 offset = tile_row + 535 (dx >> tile_pixels) * tile_size + 536 (dx & tile_mask) * cpp; 537 offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; 538 memcpy((char *)dst + offset, src_row, x); 539 } 540 } 541 } 542 543 fast_memcpy static void 544 memcpy_from_tiled_x__swizzle_9_10(const void *src, void *dst, int bpp, 545 int32_t src_stride, int32_t dst_stride, 546 int16_t src_x, int16_t src_y, 547 int16_t dst_x, int16_t dst_y, 548 uint16_t width, uint16_t height) 549 { 550 const unsigned tile_width = 512; 551 const unsigned tile_height = 8; 552 const unsigned tile_size = 4096; 553 554 const unsigned cpp = bpp / 8; 555 const unsigned stride_tiles = src_stride / tile_width; 556 const unsigned swizzle_pixels = 64 / cpp; 557 const unsigned tile_pixels = ffs(tile_width / cpp) - 1; 558 const unsigned tile_mask = (1 << tile_pixels) - 1; 559 560 unsigned x, y; 561 562 DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 563 __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 564 565 dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; 566 567 for (y = 0; y < height; ++y) { 568 const uint32_t sy = y + src_y; 569 const uint32_t tile_row = 570 (sy / tile_height * stride_tiles * tile_size + 571 (sy & (tile_height-1)) * tile_width); 572 uint8_t *dst_row = (uint8_t *)dst + dst_stride * y; 573 uint32_t sx = src_x, offset; 574 575 x = width * cpp; 576 if (sx & (swizzle_pixels - 1)) { 577 const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels); 578 const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx; 579 offset = tile_row + 580 (sx >> tile_pixels) * tile_size + 581 (sx & tile_mask) * cpp; 582 offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; 583 584 memcpy(dst_row, (const char *)src + offset, length * cpp); 585 586 dst_row += length * cpp; 587 x -= length * cpp; 588 sx += length; 589 } 590 while (x >= 64) { 591 offset = tile_row + 592 (sx >> tile_pixels) * tile_size + 593 (sx & tile_mask) * cpp; 594 offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; 595 596 memcpy(dst_row, (const char *)src + offset, 64); 597 598 dst_row += 64; 599 x -= 64; 600 sx += swizzle_pixels; 601 } 602 if (x) { 603 offset = tile_row + 604 (sx >> tile_pixels) * tile_size + 605 (sx & tile_mask) * cpp; 606 offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; 607 memcpy(dst_row, (const char *)src + offset, x); 608 } 609 } 610 } 611 612 fast_memcpy static void 613 memcpy_to_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp, 614 int32_t src_stride, int32_t dst_stride, 615 int16_t src_x, int16_t src_y, 616 int16_t dst_x, int16_t dst_y, 617 uint16_t width, uint16_t height) 618 { 619 const unsigned tile_width = 512; 620 const unsigned tile_height = 8; 621 const unsigned tile_size = 4096; 622 623 const unsigned cpp = bpp / 8; 624 const unsigned stride_tiles = dst_stride / tile_width; 625 const unsigned swizzle_pixels = 64 / cpp; 626 const unsigned tile_pixels = ffs(tile_width / cpp) - 1; 627 const unsigned tile_mask = (1 << tile_pixels) - 1; 628 629 unsigned x, y; 630 631 DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 632 __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 633 634 src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; 635 636 for (y = 0; y < height; ++y) { 637 const uint32_t dy = y + dst_y; 638 const uint32_t tile_row = 639 (dy / tile_height * stride_tiles * tile_size + 640 (dy & (tile_height-1)) * tile_width); 641 const uint8_t *src_row = (const uint8_t *)src + src_stride * y; 642 uint32_t dx = dst_x, offset; 643 644 x = width * cpp; 645 if (dx & (swizzle_pixels - 1)) { 646 const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); 647 const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; 648 offset = tile_row + 649 (dx >> tile_pixels) * tile_size + 650 (dx & tile_mask) * cpp; 651 offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; 652 memcpy((char *)dst + offset, src_row, length * cpp); 653 654 src_row += length * cpp; 655 x -= length * cpp; 656 dx += length; 657 } 658 while (x >= 64) { 659 offset = tile_row + 660 (dx >> tile_pixels) * tile_size + 661 (dx & tile_mask) * cpp; 662 offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; 663 664 memcpy((char *)dst + offset, src_row, 64); 665 666 src_row += 64; 667 x -= 64; 668 dx += swizzle_pixels; 669 } 670 if (x) { 671 offset = tile_row + 672 (dx >> tile_pixels) * tile_size + 673 (dx & tile_mask) * cpp; 674 offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; 675 memcpy((char *)dst + offset, src_row, x); 676 } 677 } 678 } 679 680 fast_memcpy static void 681 memcpy_from_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp, 682 int32_t src_stride, int32_t dst_stride, 683 int16_t src_x, int16_t src_y, 684 int16_t dst_x, int16_t dst_y, 685 uint16_t width, uint16_t height) 686 { 687 const unsigned tile_width = 512; 688 const unsigned tile_height = 8; 689 const unsigned tile_size = 4096; 690 691 const unsigned cpp = bpp / 8; 692 const unsigned stride_tiles = src_stride / tile_width; 693 const unsigned swizzle_pixels = 64 / cpp; 694 const unsigned tile_pixels = ffs(tile_width / cpp) - 1; 695 const unsigned tile_mask = (1 << tile_pixels) - 1; 696 697 unsigned x, y; 698 699 DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 700 __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 701 702 dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; 703 704 for (y = 0; y < height; ++y) { 705 const uint32_t sy = y + src_y; 706 const uint32_t tile_row = 707 (sy / tile_height * stride_tiles * tile_size + 708 (sy & (tile_height-1)) * tile_width); 709 uint8_t *dst_row = (uint8_t *)dst + dst_stride * y; 710 uint32_t sx = src_x, offset; 711 712 x = width * cpp; 713 if (sx & (swizzle_pixels - 1)) { 714 const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels); 715 const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx; 716 offset = tile_row + 717 (sx >> tile_pixels) * tile_size + 718 (sx & tile_mask) * cpp; 719 offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; 720 memcpy(dst_row, (const char *)src + offset, length * cpp); 721 722 dst_row += length * cpp; 723 x -= length * cpp; 724 sx += length; 725 } 726 while (x >= 64) { 727 offset = tile_row + 728 (sx >> tile_pixels) * tile_size + 729 (sx & tile_mask) * cpp; 730 offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; 731 732 memcpy(dst_row, (const char *)src + offset, 64); 733 734 dst_row += 64; 735 x -= 64; 736 sx += swizzle_pixels; 737 } 738 if (x) { 739 offset = tile_row + 740 (sx >> tile_pixels) * tile_size + 741 (sx & tile_mask) * cpp; 742 offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; 743 memcpy(dst_row, (const char *)src + offset, x); 744 } 745 } 746 } 747 748 void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling) 749 { 750 switch (swizzling) { 751 default: 752 DBG(("%s: unknown swizzling, %d\n", __FUNCTION__, swizzling)); 753 break; 754 case I915_BIT_6_SWIZZLE_NONE: 755 DBG(("%s: no swizzling\n", __FUNCTION__)); 756 kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0; 757 kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0; 758 break; 759 case I915_BIT_6_SWIZZLE_9: 760 DBG(("%s: 6^9 swizzling\n", __FUNCTION__)); 761 kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9; 762 kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9; 763 break; 764 case I915_BIT_6_SWIZZLE_9_10: 765 DBG(("%s: 6^9^10 swizzling\n", __FUNCTION__)); 766 kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10; 767 kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_10; 768 break; 769 case I915_BIT_6_SWIZZLE_9_11: 770 DBG(("%s: 6^9^11 swizzling\n", __FUNCTION__)); 771 kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_11; 772 kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_11; 773 break; 774 } 775 } 776 777 void 778 memmove_box(const void *src, void *dst, 779 int bpp, int32_t stride, 780 const BoxRec *box, 781 int dx, int dy) 782 { 783 #define FORCE_MEMMOVE 0 784 union { 785 uint8_t u8; 786 uint16_t u16; 787 uint32_t u32; 788 uint64_t u64; 789 } tmp; 790 const uint8_t *src_bytes; 791 uint8_t *dst_bytes; 792 int width, height; 793 794 assert(src); 795 assert(dst); 796 assert(src != dst); 797 assert(bpp >= 8); 798 assert(box->x2 > box->x1); 799 assert(box->y2 > box->y1); 800 801 DBG(("%s: box=(%d, %d), (%d, %d), pitch=%d, bpp=%d, dx=%d, dy=%d\n", 802 __FUNCTION__, 803 box->x1, box->y1, box->x2, box->y2, 804 stride, bpp, dx, dy)); 805 806 bpp /= 8; 807 width = box->y1 * stride + box->x1 * bpp; 808 src_bytes = (const uint8_t *)src + width; 809 dst_bytes = (uint8_t *)dst + width; 810 assert(dst_bytes != src_bytes); 811 812 width = (box->x2 - box->x1) * bpp; 813 height = (box->y2 - box->y1); 814 assert(width <= stride); 815 if (width == stride) { 816 width *= height; 817 height = 1; 818 } 819 820 if (dy >= 0) { 821 switch (width) { 822 case 1: 823 do { 824 *dst_bytes = tmp.u8 = *src_bytes; 825 src_bytes += stride; 826 dst_bytes += stride; 827 } while (--height); 828 break; 829 830 case 2: 831 do { 832 *(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes; 833 src_bytes += stride; 834 dst_bytes += stride; 835 } while (--height); 836 break; 837 838 case 4: 839 do { 840 *(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes; 841 src_bytes += stride; 842 dst_bytes += stride; 843 } while (--height); 844 break; 845 846 case 8: 847 do { 848 *(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes; 849 src_bytes += stride; 850 dst_bytes += stride; 851 } while (--height); 852 break; 853 854 default: 855 if (FORCE_MEMMOVE || 856 (dst_bytes < src_bytes + width && 857 src_bytes < dst_bytes + width)) { 858 do { 859 memmove(dst_bytes, src_bytes, width); 860 src_bytes += stride; 861 dst_bytes += stride; 862 } while (--height); 863 } else { 864 do { 865 memcpy(dst_bytes, src_bytes, width); 866 src_bytes += stride; 867 dst_bytes += stride; 868 } while (--height); 869 } 870 break; 871 } 872 } else { 873 src_bytes += (height-1) * stride; 874 dst_bytes += (height-1) * stride; 875 876 switch (width) { 877 case 1: 878 do { 879 *dst_bytes = tmp.u8 = *src_bytes; 880 src_bytes -= stride; 881 dst_bytes -= stride; 882 } while (--height); 883 break; 884 885 case 2: 886 do { 887 *(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes; 888 src_bytes -= stride; 889 dst_bytes -= stride; 890 } while (--height); 891 break; 892 893 case 4: 894 do { 895 *(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes; 896 src_bytes -= stride; 897 dst_bytes -= stride; 898 } while (--height); 899 break; 900 901 case 8: 902 do { 903 *(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes; 904 src_bytes -= stride; 905 dst_bytes -= stride; 906 } while (--height); 907 break; 908 909 default: 910 if (FORCE_MEMMOVE || 911 (dst_bytes < src_bytes + width && 912 src_bytes < dst_bytes + width)) { 913 do { 914 memmove(dst_bytes, src_bytes, width); 915 src_bytes -= stride; 916 dst_bytes -= stride; 917 } while (--height); 918 } else { 919 do { 920 memcpy(dst_bytes, src_bytes, width); 921 src_bytes -= stride; 922 dst_bytes -= stride; 923 } while (--height); 924 } 925 break; 926 } 927 } 928 } 929 930 void 931 memcpy_xor(const void *src, void *dst, int bpp, 932 int32_t src_stride, int32_t dst_stride, 933 int16_t src_x, int16_t src_y, 934 int16_t dst_x, int16_t dst_y, 935 uint16_t width, uint16_t height, 936 uint32_t and, uint32_t or) 937 { 938 const uint8_t *src_bytes; 939 uint8_t *dst_bytes; 940 int i, w; 941 942 assert(width && height); 943 assert(bpp >= 8); 944 assert(width*bpp <= 8*src_stride); 945 assert(width*bpp <= 8*dst_stride); 946 947 DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d, bpp=%d, and=%x, xor=%x\n", 948 __FUNCTION__, 949 src_x, src_y, dst_x, dst_y, 950 width, height, 951 src_stride, dst_stride, 952 bpp, and, or)); 953 954 bpp /= 8; 955 src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp; 956 dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp; 957 958 if (and == 0xffffffff) { 959 switch (bpp) { 960 case 1: 961 if (width & 1) { 962 do { 963 for (i = 0; i < width; i++) 964 dst_bytes[i] = src_bytes[i] | or; 965 966 src_bytes += src_stride; 967 dst_bytes += dst_stride; 968 } while (--height); 969 break; 970 } else { 971 width /= 2; 972 or |= or << 8; 973 } 974 case 2: 975 if (width & 1) { 976 do { 977 uint16_t *d = (uint16_t *)dst_bytes; 978 const uint16_t *s = (const uint16_t *)src_bytes; 979 980 for (i = 0; i < width; i++) 981 d[i] = s[i] | or; 982 983 src_bytes += src_stride; 984 dst_bytes += dst_stride; 985 } while (--height); 986 break; 987 } else { 988 width /= 2; 989 or |= or << 16; 990 } 991 case 4: 992 w = width; 993 if (w * 4 == dst_stride && dst_stride == src_stride) { 994 w *= height; 995 height = 1; 996 } 997 998 #if USE_SSE2 999 if (have_sse2()) { 1000 do { 1001 uint32_t *d = (uint32_t *)dst_bytes; 1002 const uint32_t *s = (const uint32_t *)src_bytes; 1003 __m128i mask = xmm_create_mask_32(or); 1004 1005 i = w; 1006 while (i && (uintptr_t)d & 15) { 1007 *d++ = *s++ | or; 1008 i--; 1009 } 1010 1011 while (i >= 16) { 1012 __m128i xmm1, xmm2, xmm3, xmm4; 1013 1014 xmm1 = xmm_load_128u((const __m128i*)s + 0); 1015 xmm2 = xmm_load_128u((const __m128i*)s + 1); 1016 xmm3 = xmm_load_128u((const __m128i*)s + 2); 1017 xmm4 = xmm_load_128u((const __m128i*)s + 3); 1018 1019 xmm_save_128((__m128i*)d + 0, 1020 _mm_or_si128(xmm1, mask)); 1021 xmm_save_128((__m128i*)d + 1, 1022 _mm_or_si128(xmm2, mask)); 1023 xmm_save_128((__m128i*)d + 2, 1024 _mm_or_si128(xmm3, mask)); 1025 xmm_save_128((__m128i*)d + 3, 1026 _mm_or_si128(xmm4, mask)); 1027 1028 d += 16; 1029 s += 16; 1030 i -= 16; 1031 } 1032 1033 if (i & 8) { 1034 __m128i xmm1, xmm2; 1035 1036 xmm1 = xmm_load_128u((const __m128i*)s + 0); 1037 xmm2 = xmm_load_128u((const __m128i*)s + 1); 1038 1039 xmm_save_128((__m128i*)d + 0, 1040 _mm_or_si128(xmm1, mask)); 1041 xmm_save_128((__m128i*)d + 1, 1042 _mm_or_si128(xmm2, mask)); 1043 d += 8; 1044 s += 8; 1045 i -= 8; 1046 } 1047 1048 if (i & 4) { 1049 xmm_save_128((__m128i*)d, 1050 _mm_or_si128(xmm_load_128u((const __m128i*)s), 1051 mask)); 1052 1053 d += 4; 1054 s += 4; 1055 i -= 4; 1056 } 1057 1058 while (i) { 1059 *d++ = *s++ | or; 1060 i--; 1061 } 1062 1063 src_bytes += src_stride; 1064 dst_bytes += dst_stride; 1065 } while (--height); 1066 } else 1067 #else 1068 do { 1069 uint32_t *d = (uint32_t *)dst_bytes; 1070 uint32_t *s = (uint32_t *)src_bytes; 1071 1072 for (i = 0; i < w; i++) 1073 d[i] = s[i] | or; 1074 1075 src_bytes += src_stride; 1076 dst_bytes += dst_stride; 1077 } while (--height); 1078 #endif 1079 break; 1080 } 1081 } else { 1082 switch (bpp) { 1083 case 1: 1084 do { 1085 for (i = 0; i < width; i++) 1086 dst_bytes[i] = (src_bytes[i] & and) | or; 1087 1088 src_bytes += src_stride; 1089 dst_bytes += dst_stride; 1090 } while (--height); 1091 break; 1092 1093 case 2: 1094 do { 1095 uint16_t *d = (uint16_t *)dst_bytes; 1096 const uint16_t *s = (const uint16_t *)src_bytes; 1097 1098 for (i = 0; i < width; i++) 1099 d[i] = (s[i] & and) | or; 1100 1101 src_bytes += src_stride; 1102 dst_bytes += dst_stride; 1103 } while (--height); 1104 break; 1105 1106 case 4: 1107 do { 1108 uint32_t *d = (uint32_t *)dst_bytes; 1109 const uint32_t *s = (const uint32_t *)src_bytes; 1110 1111 for (i = 0; i < width; i++) 1112 d[i] = (s[i] & and) | or; 1113 1114 src_bytes += src_stride; 1115 dst_bytes += dst_stride; 1116 } while (--height); 1117 break; 1118 } 1119 } 1120 } 1121