1428d7b3dSmrg/* 2428d7b3dSmrg * Copyright (c) 2011 Intel Corporation 3428d7b3dSmrg * 4428d7b3dSmrg * Permission is hereby granted, free of charge, to any person obtaining a 5428d7b3dSmrg * copy of this software and associated documentation files (the "Software"), 6428d7b3dSmrg * to deal in the Software without restriction, including without limitation 7428d7b3dSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8428d7b3dSmrg * and/or sell copies of the Software, and to permit persons to whom the 9428d7b3dSmrg * Software is furnished to do so, subject to the following conditions: 10428d7b3dSmrg * 11428d7b3dSmrg * The above copyright notice and this permission notice (including the next 12428d7b3dSmrg * paragraph) shall be included in all copies or substantial portions of the 13428d7b3dSmrg * Software. 14428d7b3dSmrg * 15428d7b3dSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16428d7b3dSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17428d7b3dSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18428d7b3dSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19428d7b3dSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20428d7b3dSmrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21428d7b3dSmrg * SOFTWARE. 22428d7b3dSmrg * 23428d7b3dSmrg * Authors: 24428d7b3dSmrg * Chris Wilson <chris@chris-wilson.co.uk> 25428d7b3dSmrg * 26428d7b3dSmrg */ 27428d7b3dSmrg 28428d7b3dSmrg#ifdef HAVE_CONFIG_H 29428d7b3dSmrg#include "config.h" 30428d7b3dSmrg#endif 31428d7b3dSmrg 32428d7b3dSmrg#include "sna.h" 33428d7b3dSmrg 34428d7b3dSmrg#if __x86_64__ 35428d7b3dSmrg#define USE_SSE2 1 36428d7b3dSmrg#endif 37428d7b3dSmrg 38428d7b3dSmrg#if USE_SSE2 39428d7b3dSmrg#include <xmmintrin.h> 40428d7b3dSmrg 41428d7b3dSmrg#if __x86_64__ 42428d7b3dSmrg#define have_sse2() 1 43428d7b3dSmrg#else 44428d7b3dSmrgenum { 45428d7b3dSmrg MMX = 0x1, 46428d7b3dSmrg MMX_EXTENSIONS = 0x2, 47428d7b3dSmrg SSE = 0x6, 48428d7b3dSmrg SSE2 = 0x8, 49428d7b3dSmrg CMOV = 0x10 50428d7b3dSmrg}; 51428d7b3dSmrg 52428d7b3dSmrg#ifdef __GNUC__ 53428d7b3dSmrgstatic unsigned int 54428d7b3dSmrgdetect_cpu_features(void) 55428d7b3dSmrg{ 56428d7b3dSmrg unsigned int features; 57428d7b3dSmrg unsigned int result = 0; 58428d7b3dSmrg 59428d7b3dSmrg char vendor[13]; 60428d7b3dSmrg vendor[0] = 0; 61428d7b3dSmrg vendor[12] = 0; 62428d7b3dSmrg 63428d7b3dSmrg asm ( 64428d7b3dSmrg "pushf\n" 65428d7b3dSmrg "pop %%eax\n" 66428d7b3dSmrg "mov %%eax, %%ecx\n" 67428d7b3dSmrg "xor $0x00200000, %%eax\n" 68428d7b3dSmrg "push %%eax\n" 69428d7b3dSmrg "popf\n" 70428d7b3dSmrg "pushf\n" 71428d7b3dSmrg "pop %%eax\n" 72428d7b3dSmrg "mov $0x0, %%edx\n" 73428d7b3dSmrg "xor %%ecx, %%eax\n" 74428d7b3dSmrg "jz 1f\n" 75428d7b3dSmrg 76428d7b3dSmrg "mov $0x00000000, %%eax\n" 77428d7b3dSmrg "push %%ebx\n" 78428d7b3dSmrg "cpuid\n" 79428d7b3dSmrg "mov %%ebx, %%eax\n" 80428d7b3dSmrg "pop %%ebx\n" 81428d7b3dSmrg "mov %%eax, %1\n" 82428d7b3dSmrg "mov %%edx, %2\n" 83428d7b3dSmrg "mov %%ecx, %3\n" 84428d7b3dSmrg "mov $0x00000001, %%eax\n" 85428d7b3dSmrg "push %%ebx\n" 86428d7b3dSmrg "cpuid\n" 87428d7b3dSmrg "pop %%ebx\n" 88428d7b3dSmrg "1:\n" 89428d7b3dSmrg "mov %%edx, %0\n" 90428d7b3dSmrg : "=r" (result), "=m" (vendor[0]), "=m" (vendor[4]), "=m" (vendor[8]) 91428d7b3dSmrg :: "%eax", "%ecx", "%edx"); 92428d7b3dSmrg 93428d7b3dSmrg features = 0; 94428d7b3dSmrg if (result) { 95428d7b3dSmrg /* result now contains the standard feature bits */ 96428d7b3dSmrg if (result & (1 << 15)) 97428d7b3dSmrg features |= CMOV; 98428d7b3dSmrg if (result & (1 << 23)) 99428d7b3dSmrg features |= MMX; 100428d7b3dSmrg if (result & (1 << 25)) 101428d7b3dSmrg features |= SSE; 102428d7b3dSmrg if (result & (1 << 26)) 103428d7b3dSmrg features |= SSE2; 104428d7b3dSmrg } 105428d7b3dSmrg return features; 106428d7b3dSmrg} 107428d7b3dSmrg#else 108428d7b3dSmrgstatic unsigned int detect_cpu_features(void) { return 0; } 109428d7b3dSmrg#endif 110428d7b3dSmrg 111428d7b3dSmrgstatic bool have_sse2(void) 112428d7b3dSmrg{ 113428d7b3dSmrg static int sse2_present = -1; 114428d7b3dSmrg 115428d7b3dSmrg if (sse2_present == -1) 116428d7b3dSmrg sse2_present = detect_cpu_features() & SSE2; 117428d7b3dSmrg 118428d7b3dSmrg return sse2_present; 119428d7b3dSmrg} 120428d7b3dSmrg#endif 121428d7b3dSmrg 122428d7b3dSmrgstatic inline __m128i 123428d7b3dSmrgxmm_create_mask_32(uint32_t mask) 124428d7b3dSmrg{ 125428d7b3dSmrg return _mm_set_epi32(mask, mask, mask, mask); 126428d7b3dSmrg} 127428d7b3dSmrg 128428d7b3dSmrgstatic inline __m128i 129428d7b3dSmrgxmm_load_128u(const __m128i *src) 130428d7b3dSmrg{ 131428d7b3dSmrg return _mm_loadu_si128(src); 132428d7b3dSmrg} 133428d7b3dSmrg 134428d7b3dSmrgstatic inline void 135428d7b3dSmrgxmm_save_128(__m128i *dst, __m128i data) 136428d7b3dSmrg{ 137428d7b3dSmrg _mm_store_si128(dst, data); 138428d7b3dSmrg} 139428d7b3dSmrg#endif 140428d7b3dSmrg 141428d7b3dSmrgfast void 142428d7b3dSmrgmemcpy_blt(const void *src, void *dst, int bpp, 143428d7b3dSmrg int32_t src_stride, int32_t dst_stride, 144428d7b3dSmrg int16_t src_x, int16_t src_y, 145428d7b3dSmrg int16_t dst_x, int16_t dst_y, 146428d7b3dSmrg uint16_t width, uint16_t height) 147428d7b3dSmrg{ 148428d7b3dSmrg const uint8_t *src_bytes; 149428d7b3dSmrg uint8_t *dst_bytes; 150428d7b3dSmrg int byte_width; 151428d7b3dSmrg 152428d7b3dSmrg assert(src); 153428d7b3dSmrg assert(dst); 154428d7b3dSmrg assert(width && height); 155428d7b3dSmrg assert(bpp >= 8); 156428d7b3dSmrg assert(width*bpp <= 8*src_stride); 157428d7b3dSmrg assert(width*bpp <= 8*dst_stride); 158428d7b3dSmrg 159428d7b3dSmrg DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 160428d7b3dSmrg __FUNCTION__, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 161428d7b3dSmrg 162428d7b3dSmrg bpp /= 8; 163428d7b3dSmrg 164428d7b3dSmrg src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp; 165428d7b3dSmrg dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp; 166428d7b3dSmrg 167428d7b3dSmrg byte_width = width * bpp; 168428d7b3dSmrg if (byte_width == src_stride && byte_width == dst_stride) { 169428d7b3dSmrg byte_width *= height; 170428d7b3dSmrg height = 1; 171428d7b3dSmrg } 172428d7b3dSmrg 173428d7b3dSmrg switch (byte_width) { 174428d7b3dSmrg case 1: 175428d7b3dSmrg do { 176428d7b3dSmrg *dst_bytes = *src_bytes; 177428d7b3dSmrg src_bytes += src_stride; 178428d7b3dSmrg dst_bytes += dst_stride; 179428d7b3dSmrg } while (--height); 180428d7b3dSmrg break; 181428d7b3dSmrg 182428d7b3dSmrg case 2: 183428d7b3dSmrg do { 184428d7b3dSmrg *(uint16_t *)dst_bytes = *(const uint16_t *)src_bytes; 185428d7b3dSmrg src_bytes += src_stride; 186428d7b3dSmrg dst_bytes += dst_stride; 187428d7b3dSmrg } while (--height); 188428d7b3dSmrg break; 189428d7b3dSmrg 190428d7b3dSmrg case 4: 191428d7b3dSmrg do { 192428d7b3dSmrg *(uint32_t *)dst_bytes = *(const uint32_t *)src_bytes; 193428d7b3dSmrg src_bytes += src_stride; 194428d7b3dSmrg dst_bytes += dst_stride; 195428d7b3dSmrg } while (--height); 196428d7b3dSmrg break; 197428d7b3dSmrg 198428d7b3dSmrg case 8: 199428d7b3dSmrg do { 200428d7b3dSmrg *(uint64_t *)dst_bytes = *(const uint64_t *)src_bytes; 201428d7b3dSmrg src_bytes += src_stride; 202428d7b3dSmrg dst_bytes += dst_stride; 203428d7b3dSmrg } while (--height); 204428d7b3dSmrg break; 205428d7b3dSmrg case 16: 206428d7b3dSmrg do { 207428d7b3dSmrg ((uint64_t *)dst_bytes)[0] = ((const uint64_t *)src_bytes)[0]; 208428d7b3dSmrg ((uint64_t *)dst_bytes)[1] = ((const uint64_t *)src_bytes)[1]; 209428d7b3dSmrg src_bytes += src_stride; 210428d7b3dSmrg dst_bytes += dst_stride; 211428d7b3dSmrg } while (--height); 212428d7b3dSmrg break; 213428d7b3dSmrg 214428d7b3dSmrg default: 215428d7b3dSmrg do { 216428d7b3dSmrg memcpy(dst_bytes, src_bytes, byte_width); 217428d7b3dSmrg src_bytes += src_stride; 218428d7b3dSmrg dst_bytes += dst_stride; 219428d7b3dSmrg } while (--height); 220428d7b3dSmrg break; 221428d7b3dSmrg } 222428d7b3dSmrg} 223428d7b3dSmrg 224428d7b3dSmrgstatic fast_memcpy void 225428d7b3dSmrgmemcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp, 226428d7b3dSmrg int32_t src_stride, int32_t dst_stride, 227428d7b3dSmrg int16_t src_x, int16_t src_y, 228428d7b3dSmrg int16_t dst_x, int16_t dst_y, 229428d7b3dSmrg uint16_t width, uint16_t height) 230428d7b3dSmrg{ 231428d7b3dSmrg const unsigned tile_width = 512; 232428d7b3dSmrg const unsigned tile_height = 8; 233428d7b3dSmrg const unsigned tile_size = 4096; 234428d7b3dSmrg 235428d7b3dSmrg const unsigned cpp = bpp / 8; 236428d7b3dSmrg const unsigned tile_pixels = tile_width / cpp; 237428d7b3dSmrg const unsigned tile_shift = ffs(tile_pixels) - 1; 238428d7b3dSmrg const unsigned tile_mask = tile_pixels - 1; 239428d7b3dSmrg 240428d7b3dSmrg DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 241428d7b3dSmrg __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 242428d7b3dSmrg assert(src != dst); 243428d7b3dSmrg 244428d7b3dSmrg if (src_x | src_y) 245428d7b3dSmrg src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; 246428d7b3dSmrg assert(src_stride >= width * cpp); 247428d7b3dSmrg src_stride -= width * cpp; 248428d7b3dSmrg 249428d7b3dSmrg while (height--) { 250428d7b3dSmrg unsigned w = width * cpp; 251428d7b3dSmrg uint8_t *tile_row = dst; 252428d7b3dSmrg 253428d7b3dSmrg tile_row += dst_y / tile_height * dst_stride * tile_height; 254428d7b3dSmrg tile_row += (dst_y & (tile_height-1)) * tile_width; 255428d7b3dSmrg if (dst_x) { 256428d7b3dSmrg tile_row += (dst_x >> tile_shift) * tile_size; 257428d7b3dSmrg if (dst_x & tile_mask) { 258428d7b3dSmrg const unsigned x = (dst_x & tile_mask) * cpp; 259428d7b3dSmrg const unsigned len = min(tile_width - x, w); 260428d7b3dSmrg memcpy(tile_row + x, src, len); 261428d7b3dSmrg 262428d7b3dSmrg tile_row += tile_size; 263428d7b3dSmrg src = (const uint8_t *)src + len; 264428d7b3dSmrg w -= len; 265428d7b3dSmrg } 266428d7b3dSmrg } 267428d7b3dSmrg while (w >= tile_width) { 268428d7b3dSmrg memcpy(tile_row, src, tile_width); 269428d7b3dSmrg 270428d7b3dSmrg tile_row += tile_size; 271428d7b3dSmrg src = (const uint8_t *)src + tile_width; 272428d7b3dSmrg w -= tile_width; 273428d7b3dSmrg } 274428d7b3dSmrg memcpy(tile_row, src, w); 275428d7b3dSmrg src = (const uint8_t *)src + src_stride + w; 276428d7b3dSmrg dst_y++; 277428d7b3dSmrg } 278428d7b3dSmrg} 279428d7b3dSmrg 280428d7b3dSmrgstatic fast_memcpy void 281428d7b3dSmrgmemcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp, 282428d7b3dSmrg int32_t src_stride, int32_t dst_stride, 283428d7b3dSmrg int16_t src_x, int16_t src_y, 284428d7b3dSmrg int16_t dst_x, int16_t dst_y, 285428d7b3dSmrg uint16_t width, uint16_t height) 286428d7b3dSmrg{ 287428d7b3dSmrg const unsigned tile_width = 512; 288428d7b3dSmrg const unsigned tile_height = 8; 289428d7b3dSmrg const unsigned tile_size = 4096; 290428d7b3dSmrg 291428d7b3dSmrg const unsigned cpp = bpp / 8; 292428d7b3dSmrg const unsigned tile_pixels = tile_width / cpp; 293428d7b3dSmrg const unsigned tile_shift = ffs(tile_pixels) - 1; 294428d7b3dSmrg const unsigned tile_mask = tile_pixels - 1; 295428d7b3dSmrg 296428d7b3dSmrg DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 297428d7b3dSmrg __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 298428d7b3dSmrg assert(src != dst); 299428d7b3dSmrg 300428d7b3dSmrg if (dst_x | dst_y) 301428d7b3dSmrg dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; 302428d7b3dSmrg assert(dst_stride >= width * cpp); 303428d7b3dSmrg dst_stride -= width * cpp; 304428d7b3dSmrg 305428d7b3dSmrg while (height--) { 306428d7b3dSmrg unsigned w = width * cpp; 307428d7b3dSmrg const uint8_t *tile_row = src; 308428d7b3dSmrg 309428d7b3dSmrg tile_row += src_y / tile_height * src_stride * tile_height; 310428d7b3dSmrg tile_row += (src_y & (tile_height-1)) * tile_width; 311428d7b3dSmrg if (src_x) { 312428d7b3dSmrg tile_row += (src_x >> tile_shift) * tile_size; 313428d7b3dSmrg if (src_x & tile_mask) { 314428d7b3dSmrg const unsigned x = (src_x & tile_mask) * cpp; 315428d7b3dSmrg const unsigned len = min(tile_width - x, w); 316428d7b3dSmrg memcpy(dst, tile_row + x, len); 317428d7b3dSmrg 318428d7b3dSmrg tile_row += tile_size; 319428d7b3dSmrg dst = (uint8_t *)dst + len; 320428d7b3dSmrg w -= len; 321428d7b3dSmrg } 322428d7b3dSmrg } 323428d7b3dSmrg while (w >= tile_width) { 324428d7b3dSmrg memcpy(dst, tile_row, tile_width); 325428d7b3dSmrg 326428d7b3dSmrg tile_row += tile_size; 327428d7b3dSmrg dst = (uint8_t *)dst + tile_width; 328428d7b3dSmrg w -= tile_width; 329428d7b3dSmrg } 330428d7b3dSmrg memcpy(dst, tile_row, w); 331428d7b3dSmrg dst = (uint8_t *)dst + dst_stride + w; 332428d7b3dSmrg src_y++; 333428d7b3dSmrg } 334428d7b3dSmrg} 335428d7b3dSmrg 336428d7b3dSmrgfast_memcpy static void 337428d7b3dSmrgmemcpy_to_tiled_x__swizzle_9(const void *src, void *dst, int bpp, 338428d7b3dSmrg int32_t src_stride, int32_t dst_stride, 339428d7b3dSmrg int16_t src_x, int16_t src_y, 340428d7b3dSmrg int16_t dst_x, int16_t dst_y, 341428d7b3dSmrg uint16_t width, uint16_t height) 342428d7b3dSmrg{ 343428d7b3dSmrg const unsigned tile_width = 512; 344428d7b3dSmrg const unsigned tile_height = 8; 345428d7b3dSmrg const unsigned tile_size = 4096; 346428d7b3dSmrg 347428d7b3dSmrg const unsigned cpp = bpp / 8; 348428d7b3dSmrg const unsigned stride_tiles = dst_stride / tile_width; 349428d7b3dSmrg const unsigned swizzle_pixels = 64 / cpp; 350428d7b3dSmrg const unsigned tile_pixels = ffs(tile_width / cpp) - 1; 351428d7b3dSmrg const unsigned tile_mask = (1 << tile_pixels) - 1; 352428d7b3dSmrg 353428d7b3dSmrg unsigned x, y; 354428d7b3dSmrg 355428d7b3dSmrg DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 356428d7b3dSmrg __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 357428d7b3dSmrg 358428d7b3dSmrg src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; 359428d7b3dSmrg 360428d7b3dSmrg for (y = 0; y < height; ++y) { 361428d7b3dSmrg const uint32_t dy = y + dst_y; 362428d7b3dSmrg const uint32_t tile_row = 363428d7b3dSmrg (dy / tile_height * stride_tiles * tile_size + 364428d7b3dSmrg (dy & (tile_height-1)) * tile_width); 365428d7b3dSmrg const uint8_t *src_row = (const uint8_t *)src + src_stride * y; 366428d7b3dSmrg uint32_t dx = dst_x, offset; 367428d7b3dSmrg 368428d7b3dSmrg x = width * cpp; 369428d7b3dSmrg if (dx & (swizzle_pixels - 1)) { 370428d7b3dSmrg const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); 371428d7b3dSmrg const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; 372428d7b3dSmrg offset = tile_row + 373428d7b3dSmrg (dx >> tile_pixels) * tile_size + 374428d7b3dSmrg (dx & tile_mask) * cpp; 375428d7b3dSmrg offset ^= (offset >> 3) & 64; 376428d7b3dSmrg 377428d7b3dSmrg memcpy((char *)dst + offset, src_row, length * cpp); 378428d7b3dSmrg 379428d7b3dSmrg src_row += length * cpp; 380428d7b3dSmrg x -= length * cpp; 381428d7b3dSmrg dx += length; 382428d7b3dSmrg } 383428d7b3dSmrg while (x >= 64) { 384428d7b3dSmrg offset = tile_row + 385428d7b3dSmrg (dx >> tile_pixels) * tile_size + 386428d7b3dSmrg (dx & tile_mask) * cpp; 387428d7b3dSmrg offset ^= (offset >> 3) & 64; 388428d7b3dSmrg 389428d7b3dSmrg memcpy((char *)dst + offset, src_row, 64); 390428d7b3dSmrg 391428d7b3dSmrg src_row += 64; 392428d7b3dSmrg x -= 64; 393428d7b3dSmrg dx += swizzle_pixels; 394428d7b3dSmrg } 395428d7b3dSmrg if (x) { 396428d7b3dSmrg offset = tile_row + 397428d7b3dSmrg (dx >> tile_pixels) * tile_size + 398428d7b3dSmrg (dx & tile_mask) * cpp; 399428d7b3dSmrg offset ^= (offset >> 3) & 64; 400428d7b3dSmrg memcpy((char *)dst + offset, src_row, x); 401428d7b3dSmrg } 402428d7b3dSmrg } 403428d7b3dSmrg} 404428d7b3dSmrg 405428d7b3dSmrgfast_memcpy static void 406428d7b3dSmrgmemcpy_from_tiled_x__swizzle_9(const void *src, void *dst, int bpp, 407428d7b3dSmrg int32_t src_stride, int32_t dst_stride, 408428d7b3dSmrg int16_t src_x, int16_t src_y, 409428d7b3dSmrg int16_t dst_x, int16_t dst_y, 410428d7b3dSmrg uint16_t width, uint16_t height) 411428d7b3dSmrg{ 412428d7b3dSmrg const unsigned tile_width = 512; 413428d7b3dSmrg const unsigned tile_height = 8; 414428d7b3dSmrg const unsigned tile_size = 4096; 415428d7b3dSmrg 416428d7b3dSmrg const unsigned cpp = bpp / 8; 417428d7b3dSmrg const unsigned stride_tiles = src_stride / tile_width; 418428d7b3dSmrg const unsigned swizzle_pixels = 64 / cpp; 419428d7b3dSmrg const unsigned tile_pixels = ffs(tile_width / cpp) - 1; 420428d7b3dSmrg const unsigned tile_mask = (1 << tile_pixels) - 1; 421428d7b3dSmrg 422428d7b3dSmrg unsigned x, y; 423428d7b3dSmrg 424428d7b3dSmrg DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 425428d7b3dSmrg __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 426428d7b3dSmrg 427428d7b3dSmrg dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; 428428d7b3dSmrg 429428d7b3dSmrg for (y = 0; y < height; ++y) { 430428d7b3dSmrg const uint32_t sy = y + src_y; 431428d7b3dSmrg const uint32_t tile_row = 432428d7b3dSmrg (sy / tile_height * stride_tiles * tile_size + 433428d7b3dSmrg (sy & (tile_height-1)) * tile_width); 434428d7b3dSmrg uint8_t *dst_row = (uint8_t *)dst + dst_stride * y; 435428d7b3dSmrg uint32_t sx = src_x, offset; 436428d7b3dSmrg 437428d7b3dSmrg x = width * cpp; 438428d7b3dSmrg if (sx & (swizzle_pixels - 1)) { 439428d7b3dSmrg const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels); 440428d7b3dSmrg const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx; 441428d7b3dSmrg offset = tile_row + 442428d7b3dSmrg (sx >> tile_pixels) * tile_size + 443428d7b3dSmrg (sx & tile_mask) * cpp; 444428d7b3dSmrg offset ^= (offset >> 3) & 64; 445428d7b3dSmrg 446428d7b3dSmrg memcpy(dst_row, (const char *)src + offset, length * cpp); 447428d7b3dSmrg 448428d7b3dSmrg dst_row += length * cpp; 449428d7b3dSmrg x -= length * cpp; 450428d7b3dSmrg sx += length; 451428d7b3dSmrg } 452428d7b3dSmrg while (x >= 64) { 453428d7b3dSmrg offset = tile_row + 454428d7b3dSmrg (sx >> tile_pixels) * tile_size + 455428d7b3dSmrg (sx & tile_mask) * cpp; 456428d7b3dSmrg offset ^= (offset >> 3) & 64; 457428d7b3dSmrg 458428d7b3dSmrg memcpy(dst_row, (const char *)src + offset, 64); 459428d7b3dSmrg 460428d7b3dSmrg dst_row += 64; 461428d7b3dSmrg x -= 64; 462428d7b3dSmrg sx += swizzle_pixels; 463428d7b3dSmrg } 464428d7b3dSmrg if (x) { 465428d7b3dSmrg offset = tile_row + 466428d7b3dSmrg (sx >> tile_pixels) * tile_size + 467428d7b3dSmrg (sx & tile_mask) * cpp; 468428d7b3dSmrg offset ^= (offset >> 3) & 64; 469428d7b3dSmrg memcpy(dst_row, (const char *)src + offset, x); 470428d7b3dSmrg } 471428d7b3dSmrg } 472428d7b3dSmrg} 473428d7b3dSmrg 474428d7b3dSmrgfast_memcpy static void 475428d7b3dSmrgmemcpy_to_tiled_x__swizzle_9_10(const void *src, void *dst, int bpp, 476428d7b3dSmrg int32_t src_stride, int32_t dst_stride, 477428d7b3dSmrg int16_t src_x, int16_t src_y, 478428d7b3dSmrg int16_t dst_x, int16_t dst_y, 479428d7b3dSmrg uint16_t width, uint16_t height) 480428d7b3dSmrg{ 481428d7b3dSmrg const unsigned tile_width = 512; 482428d7b3dSmrg const unsigned tile_height = 8; 483428d7b3dSmrg const unsigned tile_size = 4096; 484428d7b3dSmrg 485428d7b3dSmrg const unsigned cpp = bpp / 8; 486428d7b3dSmrg const unsigned stride_tiles = dst_stride / tile_width; 487428d7b3dSmrg const unsigned swizzle_pixels = 64 / cpp; 488428d7b3dSmrg const unsigned tile_pixels = ffs(tile_width / cpp) - 1; 489428d7b3dSmrg const unsigned tile_mask = (1 << tile_pixels) - 1; 490428d7b3dSmrg 491428d7b3dSmrg unsigned x, y; 492428d7b3dSmrg 493428d7b3dSmrg DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 494428d7b3dSmrg __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 495428d7b3dSmrg 496428d7b3dSmrg src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; 497428d7b3dSmrg 498428d7b3dSmrg for (y = 0; y < height; ++y) { 499428d7b3dSmrg const uint32_t dy = y + dst_y; 500428d7b3dSmrg const uint32_t tile_row = 501428d7b3dSmrg (dy / tile_height * stride_tiles * tile_size + 502428d7b3dSmrg (dy & (tile_height-1)) * tile_width); 503428d7b3dSmrg const uint8_t *src_row = (const uint8_t *)src + src_stride * y; 504428d7b3dSmrg uint32_t dx = dst_x, offset; 505428d7b3dSmrg 506428d7b3dSmrg x = width * cpp; 507428d7b3dSmrg if (dx & (swizzle_pixels - 1)) { 508428d7b3dSmrg const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); 509428d7b3dSmrg const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; 510428d7b3dSmrg offset = tile_row + 511428d7b3dSmrg (dx >> tile_pixels) * tile_size + 512428d7b3dSmrg (dx & tile_mask) * cpp; 513428d7b3dSmrg offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; 514428d7b3dSmrg 515428d7b3dSmrg memcpy((char *)dst + offset, src_row, length * cpp); 516428d7b3dSmrg 517428d7b3dSmrg src_row += length * cpp; 518428d7b3dSmrg x -= length * cpp; 519428d7b3dSmrg dx += length; 520428d7b3dSmrg } 521428d7b3dSmrg while (x >= 64) { 522428d7b3dSmrg offset = tile_row + 523428d7b3dSmrg (dx >> tile_pixels) * tile_size + 524428d7b3dSmrg (dx & tile_mask) * cpp; 525428d7b3dSmrg offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; 526428d7b3dSmrg 527428d7b3dSmrg memcpy((char *)dst + offset, src_row, 64); 528428d7b3dSmrg 529428d7b3dSmrg src_row += 64; 530428d7b3dSmrg x -= 64; 531428d7b3dSmrg dx += swizzle_pixels; 532428d7b3dSmrg } 533428d7b3dSmrg if (x) { 534428d7b3dSmrg offset = tile_row + 535428d7b3dSmrg (dx >> tile_pixels) * tile_size + 536428d7b3dSmrg (dx & tile_mask) * cpp; 537428d7b3dSmrg offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; 538428d7b3dSmrg memcpy((char *)dst + offset, src_row, x); 539428d7b3dSmrg } 540428d7b3dSmrg } 541428d7b3dSmrg} 542428d7b3dSmrg 543428d7b3dSmrgfast_memcpy static void 544428d7b3dSmrgmemcpy_from_tiled_x__swizzle_9_10(const void *src, void *dst, int bpp, 545428d7b3dSmrg int32_t src_stride, int32_t dst_stride, 546428d7b3dSmrg int16_t src_x, int16_t src_y, 547428d7b3dSmrg int16_t dst_x, int16_t dst_y, 548428d7b3dSmrg uint16_t width, uint16_t height) 549428d7b3dSmrg{ 550428d7b3dSmrg const unsigned tile_width = 512; 551428d7b3dSmrg const unsigned tile_height = 8; 552428d7b3dSmrg const unsigned tile_size = 4096; 553428d7b3dSmrg 554428d7b3dSmrg const unsigned cpp = bpp / 8; 555428d7b3dSmrg const unsigned stride_tiles = src_stride / tile_width; 556428d7b3dSmrg const unsigned swizzle_pixels = 64 / cpp; 557428d7b3dSmrg const unsigned tile_pixels = ffs(tile_width / cpp) - 1; 558428d7b3dSmrg const unsigned tile_mask = (1 << tile_pixels) - 1; 559428d7b3dSmrg 560428d7b3dSmrg unsigned x, y; 561428d7b3dSmrg 562428d7b3dSmrg DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 563428d7b3dSmrg __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 564428d7b3dSmrg 565428d7b3dSmrg dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; 566428d7b3dSmrg 567428d7b3dSmrg for (y = 0; y < height; ++y) { 568428d7b3dSmrg const uint32_t sy = y + src_y; 569428d7b3dSmrg const uint32_t tile_row = 570428d7b3dSmrg (sy / tile_height * stride_tiles * tile_size + 571428d7b3dSmrg (sy & (tile_height-1)) * tile_width); 572428d7b3dSmrg uint8_t *dst_row = (uint8_t *)dst + dst_stride * y; 573428d7b3dSmrg uint32_t sx = src_x, offset; 574428d7b3dSmrg 575428d7b3dSmrg x = width * cpp; 576428d7b3dSmrg if (sx & (swizzle_pixels - 1)) { 577428d7b3dSmrg const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels); 578428d7b3dSmrg const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx; 579428d7b3dSmrg offset = tile_row + 580428d7b3dSmrg (sx >> tile_pixels) * tile_size + 581428d7b3dSmrg (sx & tile_mask) * cpp; 582428d7b3dSmrg offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; 583428d7b3dSmrg 584428d7b3dSmrg memcpy(dst_row, (const char *)src + offset, length * cpp); 585428d7b3dSmrg 586428d7b3dSmrg dst_row += length * cpp; 587428d7b3dSmrg x -= length * cpp; 588428d7b3dSmrg sx += length; 589428d7b3dSmrg } 590428d7b3dSmrg while (x >= 64) { 591428d7b3dSmrg offset = tile_row + 592428d7b3dSmrg (sx >> tile_pixels) * tile_size + 593428d7b3dSmrg (sx & tile_mask) * cpp; 594428d7b3dSmrg offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; 595428d7b3dSmrg 596428d7b3dSmrg memcpy(dst_row, (const char *)src + offset, 64); 597428d7b3dSmrg 598428d7b3dSmrg dst_row += 64; 599428d7b3dSmrg x -= 64; 600428d7b3dSmrg sx += swizzle_pixels; 601428d7b3dSmrg } 602428d7b3dSmrg if (x) { 603428d7b3dSmrg offset = tile_row + 604428d7b3dSmrg (sx >> tile_pixels) * tile_size + 605428d7b3dSmrg (sx & tile_mask) * cpp; 606428d7b3dSmrg offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; 607428d7b3dSmrg memcpy(dst_row, (const char *)src + offset, x); 608428d7b3dSmrg } 609428d7b3dSmrg } 610428d7b3dSmrg} 611428d7b3dSmrg 612428d7b3dSmrgfast_memcpy static void 613428d7b3dSmrgmemcpy_to_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp, 614428d7b3dSmrg int32_t src_stride, int32_t dst_stride, 615428d7b3dSmrg int16_t src_x, int16_t src_y, 616428d7b3dSmrg int16_t dst_x, int16_t dst_y, 617428d7b3dSmrg uint16_t width, uint16_t height) 618428d7b3dSmrg{ 619428d7b3dSmrg const unsigned tile_width = 512; 620428d7b3dSmrg const unsigned tile_height = 8; 621428d7b3dSmrg const unsigned tile_size = 4096; 622428d7b3dSmrg 623428d7b3dSmrg const unsigned cpp = bpp / 8; 624428d7b3dSmrg const unsigned stride_tiles = dst_stride / tile_width; 625428d7b3dSmrg const unsigned swizzle_pixels = 64 / cpp; 626428d7b3dSmrg const unsigned tile_pixels = ffs(tile_width / cpp) - 1; 627428d7b3dSmrg const unsigned tile_mask = (1 << tile_pixels) - 1; 628428d7b3dSmrg 629428d7b3dSmrg unsigned x, y; 630428d7b3dSmrg 631428d7b3dSmrg DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 632428d7b3dSmrg __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 633428d7b3dSmrg 634428d7b3dSmrg src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; 635428d7b3dSmrg 636428d7b3dSmrg for (y = 0; y < height; ++y) { 637428d7b3dSmrg const uint32_t dy = y + dst_y; 638428d7b3dSmrg const uint32_t tile_row = 639428d7b3dSmrg (dy / tile_height * stride_tiles * tile_size + 640428d7b3dSmrg (dy & (tile_height-1)) * tile_width); 641428d7b3dSmrg const uint8_t *src_row = (const uint8_t *)src + src_stride * y; 642428d7b3dSmrg uint32_t dx = dst_x, offset; 643428d7b3dSmrg 644428d7b3dSmrg x = width * cpp; 645428d7b3dSmrg if (dx & (swizzle_pixels - 1)) { 646428d7b3dSmrg const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); 647428d7b3dSmrg const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; 648428d7b3dSmrg offset = tile_row + 649428d7b3dSmrg (dx >> tile_pixels) * tile_size + 650428d7b3dSmrg (dx & tile_mask) * cpp; 651428d7b3dSmrg offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; 652428d7b3dSmrg memcpy((char *)dst + offset, src_row, length * cpp); 653428d7b3dSmrg 654428d7b3dSmrg src_row += length * cpp; 655428d7b3dSmrg x -= length * cpp; 656428d7b3dSmrg dx += length; 657428d7b3dSmrg } 658428d7b3dSmrg while (x >= 64) { 659428d7b3dSmrg offset = tile_row + 660428d7b3dSmrg (dx >> tile_pixels) * tile_size + 661428d7b3dSmrg (dx & tile_mask) * cpp; 662428d7b3dSmrg offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; 663428d7b3dSmrg 664428d7b3dSmrg memcpy((char *)dst + offset, src_row, 64); 665428d7b3dSmrg 666428d7b3dSmrg src_row += 64; 667428d7b3dSmrg x -= 64; 668428d7b3dSmrg dx += swizzle_pixels; 669428d7b3dSmrg } 670428d7b3dSmrg if (x) { 671428d7b3dSmrg offset = tile_row + 672428d7b3dSmrg (dx >> tile_pixels) * tile_size + 673428d7b3dSmrg (dx & tile_mask) * cpp; 674428d7b3dSmrg offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; 675428d7b3dSmrg memcpy((char *)dst + offset, src_row, x); 676428d7b3dSmrg } 677428d7b3dSmrg } 678428d7b3dSmrg} 679428d7b3dSmrg 680428d7b3dSmrgfast_memcpy static void 681428d7b3dSmrgmemcpy_from_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp, 682428d7b3dSmrg int32_t src_stride, int32_t dst_stride, 683428d7b3dSmrg int16_t src_x, int16_t src_y, 684428d7b3dSmrg int16_t dst_x, int16_t dst_y, 685428d7b3dSmrg uint16_t width, uint16_t height) 686428d7b3dSmrg{ 687428d7b3dSmrg const unsigned tile_width = 512; 688428d7b3dSmrg const unsigned tile_height = 8; 689428d7b3dSmrg const unsigned tile_size = 4096; 690428d7b3dSmrg 691428d7b3dSmrg const unsigned cpp = bpp / 8; 692428d7b3dSmrg const unsigned stride_tiles = src_stride / tile_width; 693428d7b3dSmrg const unsigned swizzle_pixels = 64 / cpp; 694428d7b3dSmrg const unsigned tile_pixels = ffs(tile_width / cpp) - 1; 695428d7b3dSmrg const unsigned tile_mask = (1 << tile_pixels) - 1; 696428d7b3dSmrg 697428d7b3dSmrg unsigned x, y; 698428d7b3dSmrg 699428d7b3dSmrg DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 700428d7b3dSmrg __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 701428d7b3dSmrg 702428d7b3dSmrg dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; 703428d7b3dSmrg 704428d7b3dSmrg for (y = 0; y < height; ++y) { 705428d7b3dSmrg const uint32_t sy = y + src_y; 706428d7b3dSmrg const uint32_t tile_row = 707428d7b3dSmrg (sy / tile_height * stride_tiles * tile_size + 708428d7b3dSmrg (sy & (tile_height-1)) * tile_width); 709428d7b3dSmrg uint8_t *dst_row = (uint8_t *)dst + dst_stride * y; 710428d7b3dSmrg uint32_t sx = src_x, offset; 711428d7b3dSmrg 712428d7b3dSmrg x = width * cpp; 713428d7b3dSmrg if (sx & (swizzle_pixels - 1)) { 714428d7b3dSmrg const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels); 715428d7b3dSmrg const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx; 716428d7b3dSmrg offset = tile_row + 717428d7b3dSmrg (sx >> tile_pixels) * tile_size + 718428d7b3dSmrg (sx & tile_mask) * cpp; 719428d7b3dSmrg offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; 720428d7b3dSmrg memcpy(dst_row, (const char *)src + offset, length * cpp); 721428d7b3dSmrg 722428d7b3dSmrg dst_row += length * cpp; 723428d7b3dSmrg x -= length * cpp; 724428d7b3dSmrg sx += length; 725428d7b3dSmrg } 726428d7b3dSmrg while (x >= 64) { 727428d7b3dSmrg offset = tile_row + 728428d7b3dSmrg (sx >> tile_pixels) * tile_size + 729428d7b3dSmrg (sx & tile_mask) * cpp; 730428d7b3dSmrg offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; 731428d7b3dSmrg 732428d7b3dSmrg memcpy(dst_row, (const char *)src + offset, 64); 733428d7b3dSmrg 734428d7b3dSmrg dst_row += 64; 735428d7b3dSmrg x -= 64; 736428d7b3dSmrg sx += swizzle_pixels; 737428d7b3dSmrg } 738428d7b3dSmrg if (x) { 739428d7b3dSmrg offset = tile_row + 740428d7b3dSmrg (sx >> tile_pixels) * tile_size + 741428d7b3dSmrg (sx & tile_mask) * cpp; 742428d7b3dSmrg offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; 743428d7b3dSmrg memcpy(dst_row, (const char *)src + offset, x); 744428d7b3dSmrg } 745428d7b3dSmrg } 746428d7b3dSmrg} 747428d7b3dSmrg 748428d7b3dSmrgvoid choose_memcpy_tiled_x(struct kgem *kgem, int swizzling) 749428d7b3dSmrg{ 750428d7b3dSmrg switch (swizzling) { 751428d7b3dSmrg default: 752428d7b3dSmrg DBG(("%s: unknown swizzling, %d\n", __FUNCTION__, swizzling)); 753428d7b3dSmrg break; 754428d7b3dSmrg case I915_BIT_6_SWIZZLE_NONE: 755428d7b3dSmrg DBG(("%s: no swizzling\n", __FUNCTION__)); 756428d7b3dSmrg kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0; 757428d7b3dSmrg kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0; 758428d7b3dSmrg break; 759428d7b3dSmrg case I915_BIT_6_SWIZZLE_9: 760428d7b3dSmrg DBG(("%s: 6^9 swizzling\n", __FUNCTION__)); 761428d7b3dSmrg kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9; 762428d7b3dSmrg kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9; 763428d7b3dSmrg break; 764428d7b3dSmrg case I915_BIT_6_SWIZZLE_9_10: 765428d7b3dSmrg DBG(("%s: 6^9^10 swizzling\n", __FUNCTION__)); 766428d7b3dSmrg kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10; 767428d7b3dSmrg kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_10; 768428d7b3dSmrg break; 769428d7b3dSmrg case I915_BIT_6_SWIZZLE_9_11: 770428d7b3dSmrg DBG(("%s: 6^9^11 swizzling\n", __FUNCTION__)); 771428d7b3dSmrg kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_11; 772428d7b3dSmrg kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_11; 773428d7b3dSmrg break; 774428d7b3dSmrg } 775428d7b3dSmrg} 776428d7b3dSmrg 777428d7b3dSmrgvoid 778428d7b3dSmrgmemmove_box(const void *src, void *dst, 779428d7b3dSmrg int bpp, int32_t stride, 780428d7b3dSmrg const BoxRec *box, 781428d7b3dSmrg int dx, int dy) 782428d7b3dSmrg{ 783428d7b3dSmrg#define FORCE_MEMMOVE 0 784428d7b3dSmrg union { 785428d7b3dSmrg uint8_t u8; 786428d7b3dSmrg uint16_t u16; 787428d7b3dSmrg uint32_t u32; 788428d7b3dSmrg uint64_t u64; 789428d7b3dSmrg } tmp; 790428d7b3dSmrg const uint8_t *src_bytes; 791428d7b3dSmrg uint8_t *dst_bytes; 792428d7b3dSmrg int width, height; 793428d7b3dSmrg 794428d7b3dSmrg assert(src); 795428d7b3dSmrg assert(dst); 796428d7b3dSmrg assert(src != dst); 797428d7b3dSmrg assert(bpp >= 8); 798428d7b3dSmrg assert(box->x2 > box->x1); 799428d7b3dSmrg assert(box->y2 > box->y1); 800428d7b3dSmrg 801428d7b3dSmrg DBG(("%s: box=(%d, %d), (%d, %d), pitch=%d, bpp=%d, dx=%d, dy=%d\n", 802428d7b3dSmrg __FUNCTION__, 803428d7b3dSmrg box->x1, box->y1, box->x2, box->y2, 804428d7b3dSmrg stride, bpp, dx, dy)); 805428d7b3dSmrg 806428d7b3dSmrg bpp /= 8; 807428d7b3dSmrg width = box->y1 * stride + box->x1 * bpp; 808428d7b3dSmrg src_bytes = (const uint8_t *)src + width; 809428d7b3dSmrg dst_bytes = (uint8_t *)dst + width; 810428d7b3dSmrg assert(dst_bytes != src_bytes); 811428d7b3dSmrg 812428d7b3dSmrg width = (box->x2 - box->x1) * bpp; 813428d7b3dSmrg height = (box->y2 - box->y1); 814428d7b3dSmrg assert(width <= stride); 815428d7b3dSmrg if (width == stride) { 816428d7b3dSmrg width *= height; 817428d7b3dSmrg height = 1; 818428d7b3dSmrg } 819428d7b3dSmrg 820428d7b3dSmrg if (dy >= 0) { 821428d7b3dSmrg switch (width) { 822428d7b3dSmrg case 1: 823428d7b3dSmrg do { 824428d7b3dSmrg *dst_bytes = tmp.u8 = *src_bytes; 825428d7b3dSmrg src_bytes += stride; 826428d7b3dSmrg dst_bytes += stride; 827428d7b3dSmrg } while (--height); 828428d7b3dSmrg break; 829428d7b3dSmrg 830428d7b3dSmrg case 2: 831428d7b3dSmrg do { 832428d7b3dSmrg *(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes; 833428d7b3dSmrg src_bytes += stride; 834428d7b3dSmrg dst_bytes += stride; 835428d7b3dSmrg } while (--height); 836428d7b3dSmrg break; 837428d7b3dSmrg 838428d7b3dSmrg case 4: 839428d7b3dSmrg do { 840428d7b3dSmrg *(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes; 841428d7b3dSmrg src_bytes += stride; 842428d7b3dSmrg dst_bytes += stride; 843428d7b3dSmrg } while (--height); 844428d7b3dSmrg break; 845428d7b3dSmrg 846428d7b3dSmrg case 8: 847428d7b3dSmrg do { 848428d7b3dSmrg *(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes; 849428d7b3dSmrg src_bytes += stride; 850428d7b3dSmrg dst_bytes += stride; 851428d7b3dSmrg } while (--height); 852428d7b3dSmrg break; 853428d7b3dSmrg 854428d7b3dSmrg default: 855428d7b3dSmrg if (FORCE_MEMMOVE || 856428d7b3dSmrg (dst_bytes < src_bytes + width && 857428d7b3dSmrg src_bytes < dst_bytes + width)) { 858428d7b3dSmrg do { 859428d7b3dSmrg memmove(dst_bytes, src_bytes, width); 860428d7b3dSmrg src_bytes += stride; 861428d7b3dSmrg dst_bytes += stride; 862428d7b3dSmrg } while (--height); 863428d7b3dSmrg } else { 864428d7b3dSmrg do { 865428d7b3dSmrg memcpy(dst_bytes, src_bytes, width); 866428d7b3dSmrg src_bytes += stride; 867428d7b3dSmrg dst_bytes += stride; 868428d7b3dSmrg } while (--height); 869428d7b3dSmrg } 870428d7b3dSmrg break; 871428d7b3dSmrg } 872428d7b3dSmrg } else { 873428d7b3dSmrg src_bytes += (height-1) * stride; 874428d7b3dSmrg dst_bytes += (height-1) * stride; 875428d7b3dSmrg 876428d7b3dSmrg switch (width) { 877428d7b3dSmrg case 1: 878428d7b3dSmrg do { 879428d7b3dSmrg *dst_bytes = tmp.u8 = *src_bytes; 880428d7b3dSmrg src_bytes -= stride; 881428d7b3dSmrg dst_bytes -= stride; 882428d7b3dSmrg } while (--height); 883428d7b3dSmrg break; 884428d7b3dSmrg 885428d7b3dSmrg case 2: 886428d7b3dSmrg do { 887428d7b3dSmrg *(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes; 888428d7b3dSmrg src_bytes -= stride; 889428d7b3dSmrg dst_bytes -= stride; 890428d7b3dSmrg } while (--height); 891428d7b3dSmrg break; 892428d7b3dSmrg 893428d7b3dSmrg case 4: 894428d7b3dSmrg do { 895428d7b3dSmrg *(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes; 896428d7b3dSmrg src_bytes -= stride; 897428d7b3dSmrg dst_bytes -= stride; 898428d7b3dSmrg } while (--height); 899428d7b3dSmrg break; 900428d7b3dSmrg 901428d7b3dSmrg case 8: 902428d7b3dSmrg do { 903428d7b3dSmrg *(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes; 904428d7b3dSmrg src_bytes -= stride; 905428d7b3dSmrg dst_bytes -= stride; 906428d7b3dSmrg } while (--height); 907428d7b3dSmrg break; 908428d7b3dSmrg 909428d7b3dSmrg default: 910428d7b3dSmrg if (FORCE_MEMMOVE || 911428d7b3dSmrg (dst_bytes < src_bytes + width && 912428d7b3dSmrg src_bytes < dst_bytes + width)) { 913428d7b3dSmrg do { 914428d7b3dSmrg memmove(dst_bytes, src_bytes, width); 915428d7b3dSmrg src_bytes -= stride; 916428d7b3dSmrg dst_bytes -= stride; 917428d7b3dSmrg } while (--height); 918428d7b3dSmrg } else { 919428d7b3dSmrg do { 920428d7b3dSmrg memcpy(dst_bytes, src_bytes, width); 921428d7b3dSmrg src_bytes -= stride; 922428d7b3dSmrg dst_bytes -= stride; 923428d7b3dSmrg } while (--height); 924428d7b3dSmrg } 925428d7b3dSmrg break; 926428d7b3dSmrg } 927428d7b3dSmrg } 928428d7b3dSmrg} 929428d7b3dSmrg 930428d7b3dSmrgvoid 931428d7b3dSmrgmemcpy_xor(const void *src, void *dst, int bpp, 932428d7b3dSmrg int32_t src_stride, int32_t dst_stride, 933428d7b3dSmrg int16_t src_x, int16_t src_y, 934428d7b3dSmrg int16_t dst_x, int16_t dst_y, 935428d7b3dSmrg uint16_t width, uint16_t height, 936428d7b3dSmrg uint32_t and, uint32_t or) 937428d7b3dSmrg{ 938428d7b3dSmrg const uint8_t *src_bytes; 939428d7b3dSmrg uint8_t *dst_bytes; 940428d7b3dSmrg int i, w; 941428d7b3dSmrg 942428d7b3dSmrg assert(width && height); 943428d7b3dSmrg assert(bpp >= 8); 944428d7b3dSmrg assert(width*bpp <= 8*src_stride); 945428d7b3dSmrg assert(width*bpp <= 8*dst_stride); 946428d7b3dSmrg 947428d7b3dSmrg DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d, bpp=%d, and=%x, xor=%x\n", 948428d7b3dSmrg __FUNCTION__, 949428d7b3dSmrg src_x, src_y, dst_x, dst_y, 950428d7b3dSmrg width, height, 951428d7b3dSmrg src_stride, dst_stride, 952428d7b3dSmrg bpp, and, or)); 953428d7b3dSmrg 954428d7b3dSmrg bpp /= 8; 955428d7b3dSmrg src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp; 956428d7b3dSmrg dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp; 957428d7b3dSmrg 958428d7b3dSmrg if (and == 0xffffffff) { 959428d7b3dSmrg switch (bpp) { 960428d7b3dSmrg case 1: 961428d7b3dSmrg if (width & 1) { 962428d7b3dSmrg do { 963428d7b3dSmrg for (i = 0; i < width; i++) 964428d7b3dSmrg dst_bytes[i] = src_bytes[i] | or; 965428d7b3dSmrg 966428d7b3dSmrg src_bytes += src_stride; 967428d7b3dSmrg dst_bytes += dst_stride; 968428d7b3dSmrg } while (--height); 969428d7b3dSmrg break; 970428d7b3dSmrg } else { 971428d7b3dSmrg width /= 2; 972428d7b3dSmrg or |= or << 8; 973428d7b3dSmrg } 974428d7b3dSmrg case 2: 975428d7b3dSmrg if (width & 1) { 976428d7b3dSmrg do { 977428d7b3dSmrg uint16_t *d = (uint16_t *)dst_bytes; 978428d7b3dSmrg const uint16_t *s = (const uint16_t *)src_bytes; 979428d7b3dSmrg 980428d7b3dSmrg for (i = 0; i < width; i++) 981428d7b3dSmrg d[i] = s[i] | or; 982428d7b3dSmrg 983428d7b3dSmrg src_bytes += src_stride; 984428d7b3dSmrg dst_bytes += dst_stride; 985428d7b3dSmrg } while (--height); 986428d7b3dSmrg break; 987428d7b3dSmrg } else { 988428d7b3dSmrg width /= 2; 989428d7b3dSmrg or |= or << 16; 990428d7b3dSmrg } 991428d7b3dSmrg case 4: 992428d7b3dSmrg w = width; 993428d7b3dSmrg if (w * 4 == dst_stride && dst_stride == src_stride) { 994428d7b3dSmrg w *= height; 995428d7b3dSmrg height = 1; 996428d7b3dSmrg } 997428d7b3dSmrg 998428d7b3dSmrg#if USE_SSE2 999428d7b3dSmrg if (have_sse2()) { 1000428d7b3dSmrg do { 1001428d7b3dSmrg uint32_t *d = (uint32_t *)dst_bytes; 1002428d7b3dSmrg const uint32_t *s = (const uint32_t *)src_bytes; 1003428d7b3dSmrg __m128i mask = xmm_create_mask_32(or); 1004428d7b3dSmrg 1005428d7b3dSmrg i = w; 1006428d7b3dSmrg while (i && (uintptr_t)d & 15) { 1007428d7b3dSmrg *d++ = *s++ | or; 1008428d7b3dSmrg i--; 1009428d7b3dSmrg } 1010428d7b3dSmrg 1011428d7b3dSmrg while (i >= 16) { 1012428d7b3dSmrg __m128i xmm1, xmm2, xmm3, xmm4; 1013428d7b3dSmrg 1014428d7b3dSmrg xmm1 = xmm_load_128u((const __m128i*)s + 0); 1015428d7b3dSmrg xmm2 = xmm_load_128u((const __m128i*)s + 1); 1016428d7b3dSmrg xmm3 = xmm_load_128u((const __m128i*)s + 2); 1017428d7b3dSmrg xmm4 = xmm_load_128u((const __m128i*)s + 3); 1018428d7b3dSmrg 1019428d7b3dSmrg xmm_save_128((__m128i*)d + 0, 1020428d7b3dSmrg _mm_or_si128(xmm1, mask)); 1021428d7b3dSmrg xmm_save_128((__m128i*)d + 1, 1022428d7b3dSmrg _mm_or_si128(xmm2, mask)); 1023428d7b3dSmrg xmm_save_128((__m128i*)d + 2, 1024428d7b3dSmrg _mm_or_si128(xmm3, mask)); 1025428d7b3dSmrg xmm_save_128((__m128i*)d + 3, 1026428d7b3dSmrg _mm_or_si128(xmm4, mask)); 1027428d7b3dSmrg 1028428d7b3dSmrg d += 16; 1029428d7b3dSmrg s += 16; 1030428d7b3dSmrg i -= 16; 1031428d7b3dSmrg } 1032428d7b3dSmrg 1033428d7b3dSmrg if (i & 8) { 1034428d7b3dSmrg __m128i xmm1, xmm2; 1035428d7b3dSmrg 1036428d7b3dSmrg xmm1 = xmm_load_128u((const __m128i*)s + 0); 1037428d7b3dSmrg xmm2 = xmm_load_128u((const __m128i*)s + 1); 1038428d7b3dSmrg 1039428d7b3dSmrg xmm_save_128((__m128i*)d + 0, 1040428d7b3dSmrg _mm_or_si128(xmm1, mask)); 1041428d7b3dSmrg xmm_save_128((__m128i*)d + 1, 1042428d7b3dSmrg _mm_or_si128(xmm2, mask)); 1043428d7b3dSmrg d += 8; 1044428d7b3dSmrg s += 8; 1045428d7b3dSmrg i -= 8; 1046428d7b3dSmrg } 1047428d7b3dSmrg 1048428d7b3dSmrg if (i & 4) { 1049428d7b3dSmrg xmm_save_128((__m128i*)d, 1050428d7b3dSmrg _mm_or_si128(xmm_load_128u((const __m128i*)s), 1051428d7b3dSmrg mask)); 1052428d7b3dSmrg 1053428d7b3dSmrg d += 4; 1054428d7b3dSmrg s += 4; 1055428d7b3dSmrg i -= 4; 1056428d7b3dSmrg } 1057428d7b3dSmrg 1058428d7b3dSmrg while (i) { 1059428d7b3dSmrg *d++ = *s++ | or; 1060428d7b3dSmrg i--; 1061428d7b3dSmrg } 1062428d7b3dSmrg 1063428d7b3dSmrg src_bytes += src_stride; 1064428d7b3dSmrg dst_bytes += dst_stride; 1065428d7b3dSmrg } while (--height); 1066428d7b3dSmrg } else 1067428d7b3dSmrg#else 1068428d7b3dSmrg do { 1069428d7b3dSmrg uint32_t *d = (uint32_t *)dst_bytes; 1070428d7b3dSmrg uint32_t *s = (uint32_t *)src_bytes; 1071428d7b3dSmrg 1072428d7b3dSmrg for (i = 0; i < w; i++) 1073428d7b3dSmrg d[i] = s[i] | or; 1074428d7b3dSmrg 1075428d7b3dSmrg src_bytes += src_stride; 1076428d7b3dSmrg dst_bytes += dst_stride; 1077428d7b3dSmrg } while (--height); 1078428d7b3dSmrg#endif 1079428d7b3dSmrg break; 1080428d7b3dSmrg } 1081428d7b3dSmrg } else { 1082428d7b3dSmrg switch (bpp) { 1083428d7b3dSmrg case 1: 1084428d7b3dSmrg do { 1085428d7b3dSmrg for (i = 0; i < width; i++) 1086428d7b3dSmrg dst_bytes[i] = (src_bytes[i] & and) | or; 1087428d7b3dSmrg 1088428d7b3dSmrg src_bytes += src_stride; 1089428d7b3dSmrg dst_bytes += dst_stride; 1090428d7b3dSmrg } while (--height); 1091428d7b3dSmrg break; 1092428d7b3dSmrg 1093428d7b3dSmrg case 2: 1094428d7b3dSmrg do { 1095428d7b3dSmrg uint16_t *d = (uint16_t *)dst_bytes; 1096428d7b3dSmrg const uint16_t *s = (const uint16_t *)src_bytes; 1097428d7b3dSmrg 1098428d7b3dSmrg for (i = 0; i < width; i++) 1099428d7b3dSmrg d[i] = (s[i] & and) | or; 1100428d7b3dSmrg 1101428d7b3dSmrg src_bytes += src_stride; 1102428d7b3dSmrg dst_bytes += dst_stride; 1103428d7b3dSmrg } while (--height); 1104428d7b3dSmrg break; 1105428d7b3dSmrg 1106428d7b3dSmrg case 4: 1107428d7b3dSmrg do { 1108428d7b3dSmrg uint32_t *d = (uint32_t *)dst_bytes; 1109428d7b3dSmrg const uint32_t *s = (const uint32_t *)src_bytes; 1110428d7b3dSmrg 1111428d7b3dSmrg for (i = 0; i < width; i++) 1112428d7b3dSmrg d[i] = (s[i] & and) | or; 1113428d7b3dSmrg 1114428d7b3dSmrg src_bytes += src_stride; 1115428d7b3dSmrg dst_bytes += dst_stride; 1116428d7b3dSmrg } while (--height); 1117428d7b3dSmrg break; 1118428d7b3dSmrg } 1119428d7b3dSmrg } 1120428d7b3dSmrg} 1121