103b705cfSriastradh/* 203b705cfSriastradh * Copyright (c) 2011 Intel Corporation 303b705cfSriastradh * 403b705cfSriastradh * Permission is hereby granted, free of charge, to any person obtaining a 503b705cfSriastradh * copy of this software and associated documentation files (the "Software"), 603b705cfSriastradh * to deal in the Software without restriction, including without limitation 703b705cfSriastradh * the rights to use, copy, modify, merge, publish, distribute, sublicense, 803b705cfSriastradh * and/or sell copies of the Software, and to permit persons to whom the 903b705cfSriastradh * Software is furnished to do so, subject to the following conditions: 1003b705cfSriastradh * 1103b705cfSriastradh * The above copyright notice and this permission notice (including the next 1203b705cfSriastradh * paragraph) shall be included in all copies or substantial portions of the 1303b705cfSriastradh * Software. 1403b705cfSriastradh * 1503b705cfSriastradh * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1603b705cfSriastradh * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1703b705cfSriastradh * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1803b705cfSriastradh * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1903b705cfSriastradh * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 2003b705cfSriastradh * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 2103b705cfSriastradh * SOFTWARE. 2203b705cfSriastradh * 2303b705cfSriastradh * Authors: 2403b705cfSriastradh * Chris Wilson <chris@chris-wilson.co.uk> 2503b705cfSriastradh * 2603b705cfSriastradh */ 2703b705cfSriastradh 2803b705cfSriastradh#ifdef HAVE_CONFIG_H 2903b705cfSriastradh#include "config.h" 3003b705cfSriastradh#endif 3103b705cfSriastradh 3203b705cfSriastradh#include "sna.h" 33fe8aea9eSmrg#include <pixman.h> 3403b705cfSriastradh 35fe8aea9eSmrg#if defined(sse2) 36fe8aea9eSmrg#pragma GCC push_options 37fe8aea9eSmrg#pragma GCC target("sse2,inline-all-stringops,fpmath=sse") 38fe8aea9eSmrg#pragma GCC optimize("Ofast") 3903b705cfSriastradh#include <xmmintrin.h> 4003b705cfSriastradh 4103b705cfSriastradh#if __x86_64__ 4203b705cfSriastradh#define have_sse2() 1 4303b705cfSriastradh#else 4403b705cfSriastradhstatic bool have_sse2(void) 4503b705cfSriastradh{ 4603b705cfSriastradh static int sse2_present = -1; 4703b705cfSriastradh 4803b705cfSriastradh if (sse2_present == -1) 49fe8aea9eSmrg sse2_present = sna_cpu_detect() & SSE2; 5003b705cfSriastradh 5103b705cfSriastradh return sse2_present; 5203b705cfSriastradh} 5303b705cfSriastradh#endif 5403b705cfSriastradh 55fe8aea9eSmrgstatic force_inline __m128i 5603b705cfSriastradhxmm_create_mask_32(uint32_t mask) 5703b705cfSriastradh{ 5803b705cfSriastradh return _mm_set_epi32(mask, mask, mask, mask); 5903b705cfSriastradh} 6003b705cfSriastradh 61fe8aea9eSmrgstatic force_inline __m128i 62fe8aea9eSmrgxmm_load_128(const __m128i *src) 63fe8aea9eSmrg{ 64fe8aea9eSmrg return _mm_load_si128(src); 65fe8aea9eSmrg} 66fe8aea9eSmrg 67fe8aea9eSmrgstatic force_inline __m128i 6803b705cfSriastradhxmm_load_128u(const __m128i *src) 6903b705cfSriastradh{ 7003b705cfSriastradh return _mm_loadu_si128(src); 7103b705cfSriastradh} 7203b705cfSriastradh 73fe8aea9eSmrgstatic force_inline void 7403b705cfSriastradhxmm_save_128(__m128i *dst, __m128i data) 7503b705cfSriastradh{ 7603b705cfSriastradh _mm_store_si128(dst, data); 7703b705cfSriastradh} 78fe8aea9eSmrg 79fe8aea9eSmrgstatic force_inline void 80fe8aea9eSmrgxmm_save_128u(__m128i *dst, __m128i data) 81fe8aea9eSmrg{ 82fe8aea9eSmrg _mm_storeu_si128(dst, data); 83fe8aea9eSmrg} 84fe8aea9eSmrg 85fe8aea9eSmrgstatic force_inline void 86fe8aea9eSmrgto_sse128xN(uint8_t *dst, const uint8_t *src, int bytes) 87fe8aea9eSmrg{ 88fe8aea9eSmrg int i; 89fe8aea9eSmrg 90fe8aea9eSmrg for (i = 0; i < bytes / 128; i++) { 91fe8aea9eSmrg __m128i xmm0, xmm1, xmm2, xmm3; 92fe8aea9eSmrg __m128i xmm4, xmm5, xmm6, xmm7; 93fe8aea9eSmrg 94fe8aea9eSmrg xmm0 = xmm_load_128u((const __m128i*)src + 0); 95fe8aea9eSmrg xmm1 = xmm_load_128u((const __m128i*)src + 1); 96fe8aea9eSmrg xmm2 = xmm_load_128u((const __m128i*)src + 2); 97fe8aea9eSmrg xmm3 = xmm_load_128u((const __m128i*)src + 3); 98fe8aea9eSmrg xmm4 = xmm_load_128u((const __m128i*)src + 4); 99fe8aea9eSmrg xmm5 = xmm_load_128u((const __m128i*)src + 5); 100fe8aea9eSmrg xmm6 = xmm_load_128u((const __m128i*)src + 6); 101fe8aea9eSmrg xmm7 = xmm_load_128u((const __m128i*)src + 7); 102fe8aea9eSmrg 103fe8aea9eSmrg xmm_save_128((__m128i*)dst + 0, xmm0); 104fe8aea9eSmrg xmm_save_128((__m128i*)dst + 1, xmm1); 105fe8aea9eSmrg xmm_save_128((__m128i*)dst + 2, xmm2); 106fe8aea9eSmrg xmm_save_128((__m128i*)dst + 3, xmm3); 107fe8aea9eSmrg xmm_save_128((__m128i*)dst + 4, xmm4); 108fe8aea9eSmrg xmm_save_128((__m128i*)dst + 5, xmm5); 109fe8aea9eSmrg xmm_save_128((__m128i*)dst + 6, xmm6); 110fe8aea9eSmrg xmm_save_128((__m128i*)dst + 7, xmm7); 111fe8aea9eSmrg 112fe8aea9eSmrg dst += 128; 113fe8aea9eSmrg src += 128; 114fe8aea9eSmrg } 115fe8aea9eSmrg} 116fe8aea9eSmrg 117fe8aea9eSmrgstatic force_inline void 118fe8aea9eSmrgto_sse64(uint8_t *dst, const uint8_t *src) 119fe8aea9eSmrg{ 120fe8aea9eSmrg __m128i xmm1, xmm2, xmm3, xmm4; 121fe8aea9eSmrg 122fe8aea9eSmrg xmm1 = xmm_load_128u((const __m128i*)src + 0); 123fe8aea9eSmrg xmm2 = xmm_load_128u((const __m128i*)src + 1); 124fe8aea9eSmrg xmm3 = xmm_load_128u((const __m128i*)src + 2); 125fe8aea9eSmrg xmm4 = xmm_load_128u((const __m128i*)src + 3); 126fe8aea9eSmrg 127fe8aea9eSmrg xmm_save_128((__m128i*)dst + 0, xmm1); 128fe8aea9eSmrg xmm_save_128((__m128i*)dst + 1, xmm2); 129fe8aea9eSmrg xmm_save_128((__m128i*)dst + 2, xmm3); 130fe8aea9eSmrg xmm_save_128((__m128i*)dst + 3, xmm4); 131fe8aea9eSmrg} 132fe8aea9eSmrg 133fe8aea9eSmrgstatic force_inline void 134fe8aea9eSmrgto_sse32(uint8_t *dst, const uint8_t *src) 135fe8aea9eSmrg{ 136fe8aea9eSmrg __m128i xmm1, xmm2; 137fe8aea9eSmrg 138fe8aea9eSmrg xmm1 = xmm_load_128u((const __m128i*)src + 0); 139fe8aea9eSmrg xmm2 = xmm_load_128u((const __m128i*)src + 1); 140fe8aea9eSmrg 141fe8aea9eSmrg xmm_save_128((__m128i*)dst + 0, xmm1); 142fe8aea9eSmrg xmm_save_128((__m128i*)dst + 1, xmm2); 143fe8aea9eSmrg} 144fe8aea9eSmrg 145fe8aea9eSmrgstatic force_inline void 146fe8aea9eSmrgto_sse16(uint8_t *dst, const uint8_t *src) 147fe8aea9eSmrg{ 148fe8aea9eSmrg xmm_save_128((__m128i*)dst, xmm_load_128u((const __m128i*)src)); 149fe8aea9eSmrg} 150fe8aea9eSmrg 151fe8aea9eSmrgstatic void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len) 152fe8aea9eSmrg{ 153fe8aea9eSmrg assert(len); 154fe8aea9eSmrg if ((uintptr_t)dst & 15) { 155fe8aea9eSmrg if (len <= 16 - ((uintptr_t)dst & 15)) { 156fe8aea9eSmrg memcpy(dst, src, len); 157fe8aea9eSmrg return; 158fe8aea9eSmrg } 159fe8aea9eSmrg 160fe8aea9eSmrg if ((uintptr_t)dst & 1) { 161fe8aea9eSmrg assert(len >= 1); 162fe8aea9eSmrg *dst++ = *src++; 163fe8aea9eSmrg len--; 164fe8aea9eSmrg } 165fe8aea9eSmrg if ((uintptr_t)dst & 2) { 166fe8aea9eSmrg assert(((uintptr_t)dst & 1) == 0); 167fe8aea9eSmrg assert(len >= 2); 168fe8aea9eSmrg *(uint16_t *)dst = *(const uint16_t *)src; 169fe8aea9eSmrg dst += 2; 170fe8aea9eSmrg src += 2; 171fe8aea9eSmrg len -= 2; 172fe8aea9eSmrg } 173fe8aea9eSmrg if ((uintptr_t)dst & 4) { 174fe8aea9eSmrg assert(((uintptr_t)dst & 3) == 0); 175fe8aea9eSmrg assert(len >= 4); 176fe8aea9eSmrg *(uint32_t *)dst = *(const uint32_t *)src; 177fe8aea9eSmrg dst += 4; 178fe8aea9eSmrg src += 4; 179fe8aea9eSmrg len -= 4; 180fe8aea9eSmrg } 181fe8aea9eSmrg if ((uintptr_t)dst & 8) { 182fe8aea9eSmrg assert(((uintptr_t)dst & 7) == 0); 183fe8aea9eSmrg assert(len >= 8); 184fe8aea9eSmrg *(uint64_t *)dst = *(const uint64_t *)src; 185fe8aea9eSmrg dst += 8; 186fe8aea9eSmrg src += 8; 187fe8aea9eSmrg len -= 8; 188fe8aea9eSmrg } 189fe8aea9eSmrg } 190fe8aea9eSmrg 191fe8aea9eSmrg assert(((uintptr_t)dst & 15) == 0); 192fe8aea9eSmrg while (len >= 64) { 193fe8aea9eSmrg to_sse64(dst, src); 194fe8aea9eSmrg dst += 64; 195fe8aea9eSmrg src += 64; 196fe8aea9eSmrg len -= 64; 197fe8aea9eSmrg } 198fe8aea9eSmrg if (len == 0) 199fe8aea9eSmrg return; 200fe8aea9eSmrg 201fe8aea9eSmrg if (len & 32) { 202fe8aea9eSmrg to_sse32(dst, src); 203fe8aea9eSmrg dst += 32; 204fe8aea9eSmrg src += 32; 205fe8aea9eSmrg } 206fe8aea9eSmrg if (len & 16) { 207fe8aea9eSmrg to_sse16(dst, src); 208fe8aea9eSmrg dst += 16; 209fe8aea9eSmrg src += 16; 210fe8aea9eSmrg } 211fe8aea9eSmrg if (len & 8) { 212fe8aea9eSmrg *(uint64_t *)dst = *(uint64_t *)src; 213fe8aea9eSmrg dst += 8; 214fe8aea9eSmrg src += 8; 215fe8aea9eSmrg } 216fe8aea9eSmrg if (len & 4) { 217fe8aea9eSmrg *(uint32_t *)dst = *(uint32_t *)src; 218fe8aea9eSmrg dst += 4; 219fe8aea9eSmrg src += 4; 220fe8aea9eSmrg } 221fe8aea9eSmrg memcpy(dst, src, len & 3); 222fe8aea9eSmrg} 223fe8aea9eSmrg 224fe8aea9eSmrgstatic void 225fe8aea9eSmrgmemcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, 226fe8aea9eSmrg int32_t src_stride, int32_t dst_stride, 227fe8aea9eSmrg int16_t src_x, int16_t src_y, 228fe8aea9eSmrg int16_t dst_x, int16_t dst_y, 229fe8aea9eSmrg uint16_t width, uint16_t height) 230fe8aea9eSmrg{ 231fe8aea9eSmrg const unsigned tile_width = 512; 232fe8aea9eSmrg const unsigned tile_height = 8; 233fe8aea9eSmrg const unsigned tile_size = 4096; 234fe8aea9eSmrg 235fe8aea9eSmrg const unsigned cpp = bpp / 8; 236fe8aea9eSmrg const unsigned tile_pixels = tile_width / cpp; 237fe8aea9eSmrg const unsigned tile_shift = ffs(tile_pixels) - 1; 238fe8aea9eSmrg const unsigned tile_mask = tile_pixels - 1; 239fe8aea9eSmrg 240fe8aea9eSmrg unsigned offset_x, length_x; 241fe8aea9eSmrg 242fe8aea9eSmrg DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 243fe8aea9eSmrg __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 244fe8aea9eSmrg assert(src != dst); 245fe8aea9eSmrg 246fe8aea9eSmrg if (src_x | src_y) 247fe8aea9eSmrg src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; 248fe8aea9eSmrg width *= cpp; 249fe8aea9eSmrg assert(src_stride >= width); 250fe8aea9eSmrg 251fe8aea9eSmrg if (dst_x & tile_mask) { 252fe8aea9eSmrg offset_x = (dst_x & tile_mask) * cpp; 253fe8aea9eSmrg length_x = min(tile_width - offset_x, width); 254fe8aea9eSmrg } else 255fe8aea9eSmrg length_x = 0; 256fe8aea9eSmrg dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size; 257fe8aea9eSmrg 258fe8aea9eSmrg while (height--) { 259fe8aea9eSmrg unsigned w = width; 260fe8aea9eSmrg const uint8_t *src_row = src; 261fe8aea9eSmrg uint8_t *tile_row = dst; 262fe8aea9eSmrg 263fe8aea9eSmrg src = (const uint8_t *)src + src_stride; 264fe8aea9eSmrg 265fe8aea9eSmrg tile_row += dst_y / tile_height * dst_stride * tile_height; 266fe8aea9eSmrg tile_row += (dst_y & (tile_height-1)) * tile_width; 267fe8aea9eSmrg dst_y++; 268fe8aea9eSmrg 269fe8aea9eSmrg if (length_x) { 270fe8aea9eSmrg to_memcpy(tile_row + offset_x, src_row, length_x); 271fe8aea9eSmrg 272fe8aea9eSmrg tile_row += tile_size; 273fe8aea9eSmrg src_row = (const uint8_t *)src_row + length_x; 274fe8aea9eSmrg w -= length_x; 275fe8aea9eSmrg } 276fe8aea9eSmrg while (w >= tile_width) { 277fe8aea9eSmrg assert(((uintptr_t)tile_row & (tile_width - 1)) == 0); 278fe8aea9eSmrg to_sse128xN(assume_aligned(tile_row, tile_width), 279fe8aea9eSmrg src_row, tile_width); 280fe8aea9eSmrg tile_row += tile_size; 281fe8aea9eSmrg src_row = (const uint8_t *)src_row + tile_width; 282fe8aea9eSmrg w -= tile_width; 283fe8aea9eSmrg } 284fe8aea9eSmrg if (w) { 285fe8aea9eSmrg assert(((uintptr_t)tile_row & (tile_width - 1)) == 0); 286fe8aea9eSmrg to_memcpy(assume_aligned(tile_row, tile_width), 287fe8aea9eSmrg src_row, w); 288fe8aea9eSmrg } 289fe8aea9eSmrg } 290fe8aea9eSmrg} 291fe8aea9eSmrg 292fe8aea9eSmrgstatic force_inline void 293fe8aea9eSmrgfrom_sse128xNu(uint8_t *dst, const uint8_t *src, int bytes) 294fe8aea9eSmrg{ 295fe8aea9eSmrg int i; 296fe8aea9eSmrg 297fe8aea9eSmrg assert(((uintptr_t)src & 15) == 0); 298fe8aea9eSmrg 299fe8aea9eSmrg for (i = 0; i < bytes / 128; i++) { 300fe8aea9eSmrg __m128i xmm0, xmm1, xmm2, xmm3; 301fe8aea9eSmrg __m128i xmm4, xmm5, xmm6, xmm7; 302fe8aea9eSmrg 303fe8aea9eSmrg xmm0 = xmm_load_128((const __m128i*)src + 0); 304fe8aea9eSmrg xmm1 = xmm_load_128((const __m128i*)src + 1); 305fe8aea9eSmrg xmm2 = xmm_load_128((const __m128i*)src + 2); 306fe8aea9eSmrg xmm3 = xmm_load_128((const __m128i*)src + 3); 307fe8aea9eSmrg xmm4 = xmm_load_128((const __m128i*)src + 4); 308fe8aea9eSmrg xmm5 = xmm_load_128((const __m128i*)src + 5); 309fe8aea9eSmrg xmm6 = xmm_load_128((const __m128i*)src + 6); 310fe8aea9eSmrg xmm7 = xmm_load_128((const __m128i*)src + 7); 311fe8aea9eSmrg 312fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 0, xmm0); 313fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 1, xmm1); 314fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 2, xmm2); 315fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 3, xmm3); 316fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 4, xmm4); 317fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 5, xmm5); 318fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 6, xmm6); 319fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 7, xmm7); 320fe8aea9eSmrg 321fe8aea9eSmrg dst += 128; 322fe8aea9eSmrg src += 128; 323fe8aea9eSmrg } 324fe8aea9eSmrg} 325fe8aea9eSmrg 326fe8aea9eSmrgstatic force_inline void 327fe8aea9eSmrgfrom_sse128xNa(uint8_t *dst, const uint8_t *src, int bytes) 328fe8aea9eSmrg{ 329fe8aea9eSmrg int i; 330fe8aea9eSmrg 331fe8aea9eSmrg assert(((uintptr_t)dst & 15) == 0); 332fe8aea9eSmrg assert(((uintptr_t)src & 15) == 0); 333fe8aea9eSmrg 334fe8aea9eSmrg for (i = 0; i < bytes / 128; i++) { 335fe8aea9eSmrg __m128i xmm0, xmm1, xmm2, xmm3; 336fe8aea9eSmrg __m128i xmm4, xmm5, xmm6, xmm7; 337fe8aea9eSmrg 338fe8aea9eSmrg xmm0 = xmm_load_128((const __m128i*)src + 0); 339fe8aea9eSmrg xmm1 = xmm_load_128((const __m128i*)src + 1); 340fe8aea9eSmrg xmm2 = xmm_load_128((const __m128i*)src + 2); 341fe8aea9eSmrg xmm3 = xmm_load_128((const __m128i*)src + 3); 342fe8aea9eSmrg xmm4 = xmm_load_128((const __m128i*)src + 4); 343fe8aea9eSmrg xmm5 = xmm_load_128((const __m128i*)src + 5); 344fe8aea9eSmrg xmm6 = xmm_load_128((const __m128i*)src + 6); 345fe8aea9eSmrg xmm7 = xmm_load_128((const __m128i*)src + 7); 346fe8aea9eSmrg 347fe8aea9eSmrg xmm_save_128((__m128i*)dst + 0, xmm0); 348fe8aea9eSmrg xmm_save_128((__m128i*)dst + 1, xmm1); 349fe8aea9eSmrg xmm_save_128((__m128i*)dst + 2, xmm2); 350fe8aea9eSmrg xmm_save_128((__m128i*)dst + 3, xmm3); 351fe8aea9eSmrg xmm_save_128((__m128i*)dst + 4, xmm4); 352fe8aea9eSmrg xmm_save_128((__m128i*)dst + 5, xmm5); 353fe8aea9eSmrg xmm_save_128((__m128i*)dst + 6, xmm6); 354fe8aea9eSmrg xmm_save_128((__m128i*)dst + 7, xmm7); 355fe8aea9eSmrg 356fe8aea9eSmrg dst += 128; 357fe8aea9eSmrg src += 128; 358fe8aea9eSmrg } 359fe8aea9eSmrg} 360fe8aea9eSmrg 361fe8aea9eSmrgstatic force_inline void 362fe8aea9eSmrgfrom_sse64u(uint8_t *dst, const uint8_t *src) 363fe8aea9eSmrg{ 364fe8aea9eSmrg __m128i xmm1, xmm2, xmm3, xmm4; 365fe8aea9eSmrg 366fe8aea9eSmrg assert(((uintptr_t)src & 15) == 0); 367fe8aea9eSmrg 368fe8aea9eSmrg xmm1 = xmm_load_128((const __m128i*)src + 0); 369fe8aea9eSmrg xmm2 = xmm_load_128((const __m128i*)src + 1); 370fe8aea9eSmrg xmm3 = xmm_load_128((const __m128i*)src + 2); 371fe8aea9eSmrg xmm4 = xmm_load_128((const __m128i*)src + 3); 372fe8aea9eSmrg 373fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 0, xmm1); 374fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 1, xmm2); 375fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 2, xmm3); 376fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 3, xmm4); 377fe8aea9eSmrg} 378fe8aea9eSmrg 379fe8aea9eSmrgstatic force_inline void 380fe8aea9eSmrgfrom_sse64a(uint8_t *dst, const uint8_t *src) 381fe8aea9eSmrg{ 382fe8aea9eSmrg __m128i xmm1, xmm2, xmm3, xmm4; 383fe8aea9eSmrg 384fe8aea9eSmrg assert(((uintptr_t)dst & 15) == 0); 385fe8aea9eSmrg assert(((uintptr_t)src & 15) == 0); 386fe8aea9eSmrg 387fe8aea9eSmrg xmm1 = xmm_load_128((const __m128i*)src + 0); 388fe8aea9eSmrg xmm2 = xmm_load_128((const __m128i*)src + 1); 389fe8aea9eSmrg xmm3 = xmm_load_128((const __m128i*)src + 2); 390fe8aea9eSmrg xmm4 = xmm_load_128((const __m128i*)src + 3); 391fe8aea9eSmrg 392fe8aea9eSmrg xmm_save_128((__m128i*)dst + 0, xmm1); 393fe8aea9eSmrg xmm_save_128((__m128i*)dst + 1, xmm2); 394fe8aea9eSmrg xmm_save_128((__m128i*)dst + 2, xmm3); 395fe8aea9eSmrg xmm_save_128((__m128i*)dst + 3, xmm4); 396fe8aea9eSmrg} 397fe8aea9eSmrg 398fe8aea9eSmrgstatic force_inline void 399fe8aea9eSmrgfrom_sse32u(uint8_t *dst, const uint8_t *src) 400fe8aea9eSmrg{ 401fe8aea9eSmrg __m128i xmm1, xmm2; 402fe8aea9eSmrg 403fe8aea9eSmrg xmm1 = xmm_load_128((const __m128i*)src + 0); 404fe8aea9eSmrg xmm2 = xmm_load_128((const __m128i*)src + 1); 405fe8aea9eSmrg 406fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 0, xmm1); 407fe8aea9eSmrg xmm_save_128u((__m128i*)dst + 1, xmm2); 408fe8aea9eSmrg} 409fe8aea9eSmrg 410fe8aea9eSmrgstatic force_inline void 411fe8aea9eSmrgfrom_sse32a(uint8_t *dst, const uint8_t *src) 412fe8aea9eSmrg{ 413fe8aea9eSmrg __m128i xmm1, xmm2; 414fe8aea9eSmrg 415fe8aea9eSmrg assert(((uintptr_t)dst & 15) == 0); 416fe8aea9eSmrg assert(((uintptr_t)src & 15) == 0); 417fe8aea9eSmrg 418fe8aea9eSmrg xmm1 = xmm_load_128((const __m128i*)src + 0); 419fe8aea9eSmrg xmm2 = xmm_load_128((const __m128i*)src + 1); 420fe8aea9eSmrg 421fe8aea9eSmrg xmm_save_128((__m128i*)dst + 0, xmm1); 422fe8aea9eSmrg xmm_save_128((__m128i*)dst + 1, xmm2); 423fe8aea9eSmrg} 424fe8aea9eSmrg 425fe8aea9eSmrgstatic force_inline void 426fe8aea9eSmrgfrom_sse16u(uint8_t *dst, const uint8_t *src) 427fe8aea9eSmrg{ 428fe8aea9eSmrg assert(((uintptr_t)src & 15) == 0); 429fe8aea9eSmrg 430fe8aea9eSmrg xmm_save_128u((__m128i*)dst, xmm_load_128((const __m128i*)src)); 431fe8aea9eSmrg} 432fe8aea9eSmrg 433fe8aea9eSmrgstatic force_inline void 434fe8aea9eSmrgfrom_sse16a(uint8_t *dst, const uint8_t *src) 435fe8aea9eSmrg{ 436fe8aea9eSmrg assert(((uintptr_t)dst & 15) == 0); 437fe8aea9eSmrg assert(((uintptr_t)src & 15) == 0); 438fe8aea9eSmrg 439fe8aea9eSmrg xmm_save_128((__m128i*)dst, xmm_load_128((const __m128i*)src)); 440fe8aea9eSmrg} 441fe8aea9eSmrg 442fe8aea9eSmrgstatic void 443fe8aea9eSmrgmemcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, 444fe8aea9eSmrg int32_t src_stride, int32_t dst_stride, 445fe8aea9eSmrg int16_t src_x, int16_t src_y, 446fe8aea9eSmrg int16_t dst_x, int16_t dst_y, 447fe8aea9eSmrg uint16_t width, uint16_t height) 448fe8aea9eSmrg{ 449fe8aea9eSmrg const unsigned tile_width = 512; 450fe8aea9eSmrg const unsigned tile_height = 8; 451fe8aea9eSmrg const unsigned tile_size = 4096; 452fe8aea9eSmrg 453fe8aea9eSmrg const unsigned cpp = bpp / 8; 454fe8aea9eSmrg const unsigned tile_pixels = tile_width / cpp; 455fe8aea9eSmrg const unsigned tile_shift = ffs(tile_pixels) - 1; 456fe8aea9eSmrg const unsigned tile_mask = tile_pixels - 1; 457fe8aea9eSmrg 458fe8aea9eSmrg unsigned length_x, offset_x; 459fe8aea9eSmrg 460fe8aea9eSmrg DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 461fe8aea9eSmrg __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 462fe8aea9eSmrg assert(src != dst); 463fe8aea9eSmrg 464fe8aea9eSmrg if (dst_x | dst_y) 465fe8aea9eSmrg dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; 466fe8aea9eSmrg width *= cpp; 467fe8aea9eSmrg assert(dst_stride >= width); 468fe8aea9eSmrg if (src_x & tile_mask) { 469fe8aea9eSmrg offset_x = (src_x & tile_mask) * cpp; 470fe8aea9eSmrg length_x = min(tile_width - offset_x, width); 471fe8aea9eSmrg dst_stride -= width; 472fe8aea9eSmrg dst_stride += (width - length_x) & 15; 473fe8aea9eSmrg } else { 474fe8aea9eSmrg offset_x = 0; 475fe8aea9eSmrg dst_stride -= width & ~15; 476fe8aea9eSmrg } 477fe8aea9eSmrg assert(dst_stride >= 0); 478fe8aea9eSmrg src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size; 479fe8aea9eSmrg 480fe8aea9eSmrg while (height--) { 481fe8aea9eSmrg unsigned w = width; 482fe8aea9eSmrg const uint8_t *tile_row = src; 483fe8aea9eSmrg 484fe8aea9eSmrg tile_row += src_y / tile_height * src_stride * tile_height; 485fe8aea9eSmrg tile_row += (src_y & (tile_height-1)) * tile_width; 486fe8aea9eSmrg src_y++; 487fe8aea9eSmrg 488fe8aea9eSmrg if (offset_x) { 489fe8aea9eSmrg memcpy(dst, tile_row + offset_x, length_x); 490fe8aea9eSmrg tile_row += tile_size; 491fe8aea9eSmrg dst = (uint8_t *)dst + length_x; 492fe8aea9eSmrg w -= length_x; 493fe8aea9eSmrg } 494fe8aea9eSmrg 495fe8aea9eSmrg if ((uintptr_t)dst & 15) { 496fe8aea9eSmrg while (w >= tile_width) { 497fe8aea9eSmrg from_sse128xNu(dst, 498fe8aea9eSmrg assume_aligned(tile_row, tile_width), 499fe8aea9eSmrg tile_width); 500fe8aea9eSmrg tile_row += tile_size; 501fe8aea9eSmrg dst = (uint8_t *)dst + tile_width; 502fe8aea9eSmrg w -= tile_width; 503fe8aea9eSmrg } 504fe8aea9eSmrg while (w >= 64) { 505fe8aea9eSmrg from_sse64u(dst, tile_row); 506fe8aea9eSmrg tile_row += 64; 507fe8aea9eSmrg dst = (uint8_t *)dst + 64; 508fe8aea9eSmrg w -= 64; 509fe8aea9eSmrg } 510fe8aea9eSmrg if (w & 32) { 511fe8aea9eSmrg from_sse32u(dst, tile_row); 512fe8aea9eSmrg tile_row += 32; 513fe8aea9eSmrg dst = (uint8_t *)dst + 32; 514fe8aea9eSmrg } 515fe8aea9eSmrg if (w & 16) { 516fe8aea9eSmrg from_sse16u(dst, tile_row); 517fe8aea9eSmrg tile_row += 16; 518fe8aea9eSmrg dst = (uint8_t *)dst + 16; 519fe8aea9eSmrg } 520fe8aea9eSmrg memcpy(dst, assume_aligned(tile_row, 16), w & 15); 521fe8aea9eSmrg } else { 522fe8aea9eSmrg while (w >= tile_width) { 523fe8aea9eSmrg from_sse128xNa(assume_aligned(dst, 16), 524fe8aea9eSmrg assume_aligned(tile_row, tile_width), 525fe8aea9eSmrg tile_width); 526fe8aea9eSmrg tile_row += tile_size; 527fe8aea9eSmrg dst = (uint8_t *)dst + tile_width; 528fe8aea9eSmrg w -= tile_width; 529fe8aea9eSmrg } 530fe8aea9eSmrg while (w >= 64) { 531fe8aea9eSmrg from_sse64a(dst, tile_row); 532fe8aea9eSmrg tile_row += 64; 533fe8aea9eSmrg dst = (uint8_t *)dst + 64; 534fe8aea9eSmrg w -= 64; 535fe8aea9eSmrg } 536fe8aea9eSmrg if (w & 32) { 537fe8aea9eSmrg from_sse32a(dst, tile_row); 538fe8aea9eSmrg tile_row += 32; 539fe8aea9eSmrg dst = (uint8_t *)dst + 32; 540fe8aea9eSmrg } 541fe8aea9eSmrg if (w & 16) { 542fe8aea9eSmrg from_sse16a(dst, tile_row); 543fe8aea9eSmrg tile_row += 16; 544fe8aea9eSmrg dst = (uint8_t *)dst + 16; 545fe8aea9eSmrg } 546fe8aea9eSmrg memcpy(assume_aligned(dst, 16), 547fe8aea9eSmrg assume_aligned(tile_row, 16), 548fe8aea9eSmrg w & 15); 549fe8aea9eSmrg } 550fe8aea9eSmrg dst = (uint8_t *)dst + dst_stride; 551fe8aea9eSmrg } 552fe8aea9eSmrg} 553fe8aea9eSmrg 554fe8aea9eSmrgstatic void 555fe8aea9eSmrgmemcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, 556fe8aea9eSmrg int32_t src_stride, int32_t dst_stride, 557fe8aea9eSmrg int16_t src_x, int16_t src_y, 558fe8aea9eSmrg int16_t dst_x, int16_t dst_y, 559fe8aea9eSmrg uint16_t width, uint16_t height) 560fe8aea9eSmrg{ 561fe8aea9eSmrg const unsigned tile_width = 512; 562fe8aea9eSmrg const unsigned tile_height = 8; 563fe8aea9eSmrg const unsigned tile_size = 4096; 564fe8aea9eSmrg 565fe8aea9eSmrg const unsigned cpp = bpp / 8; 566fe8aea9eSmrg const unsigned tile_pixels = tile_width / cpp; 567fe8aea9eSmrg const unsigned tile_shift = ffs(tile_pixels) - 1; 568fe8aea9eSmrg const unsigned tile_mask = tile_pixels - 1; 569fe8aea9eSmrg 570fe8aea9eSmrg unsigned ox, lx; 571fe8aea9eSmrg 572fe8aea9eSmrg DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 573fe8aea9eSmrg __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 574fe8aea9eSmrg assert(src != dst); 575fe8aea9eSmrg 576fe8aea9eSmrg width *= cpp; 577fe8aea9eSmrg dst_stride *= tile_height; 578fe8aea9eSmrg src_stride *= tile_height; 579fe8aea9eSmrg 580fe8aea9eSmrg assert((dst_x & tile_mask) == (src_x & tile_mask)); 581fe8aea9eSmrg if (dst_x & tile_mask) { 582fe8aea9eSmrg ox = (dst_x & tile_mask) * cpp; 583fe8aea9eSmrg lx = min(tile_width - ox, width); 584fe8aea9eSmrg assert(lx != 0); 585fe8aea9eSmrg } else 586fe8aea9eSmrg lx = 0; 587fe8aea9eSmrg 588fe8aea9eSmrg if (dst_x) 589fe8aea9eSmrg dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size; 590fe8aea9eSmrg if (src_x) 591fe8aea9eSmrg src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size; 592fe8aea9eSmrg 593fe8aea9eSmrg while (height--) { 594fe8aea9eSmrg const uint8_t *src_row; 595fe8aea9eSmrg uint8_t *dst_row; 596fe8aea9eSmrg unsigned w = width; 597fe8aea9eSmrg 598fe8aea9eSmrg dst_row = dst; 599fe8aea9eSmrg dst_row += dst_y / tile_height * dst_stride; 600fe8aea9eSmrg dst_row += (dst_y & (tile_height-1)) * tile_width; 601fe8aea9eSmrg dst_y++; 602fe8aea9eSmrg 603fe8aea9eSmrg src_row = src; 604fe8aea9eSmrg src_row += src_y / tile_height * src_stride; 605fe8aea9eSmrg src_row += (src_y & (tile_height-1)) * tile_width; 606fe8aea9eSmrg src_y++; 607fe8aea9eSmrg 608fe8aea9eSmrg if (lx) { 609fe8aea9eSmrg to_memcpy(dst_row + ox, src_row + ox, lx); 610fe8aea9eSmrg dst_row += tile_size; 611fe8aea9eSmrg src_row += tile_size; 612fe8aea9eSmrg w -= lx; 613fe8aea9eSmrg } 614fe8aea9eSmrg while (w >= tile_width) { 615fe8aea9eSmrg assert(((uintptr_t)dst_row & (tile_width - 1)) == 0); 616fe8aea9eSmrg assert(((uintptr_t)src_row & (tile_width - 1)) == 0); 617fe8aea9eSmrg to_sse128xN(assume_aligned(dst_row, tile_width), 618fe8aea9eSmrg assume_aligned(src_row, tile_width), 619fe8aea9eSmrg tile_width); 620fe8aea9eSmrg dst_row += tile_size; 621fe8aea9eSmrg src_row += tile_size; 622fe8aea9eSmrg w -= tile_width; 623fe8aea9eSmrg } 624fe8aea9eSmrg if (w) { 625fe8aea9eSmrg assert(((uintptr_t)dst_row & (tile_width - 1)) == 0); 626fe8aea9eSmrg assert(((uintptr_t)src_row & (tile_width - 1)) == 0); 627fe8aea9eSmrg to_memcpy(assume_aligned(dst_row, tile_width), 628fe8aea9eSmrg assume_aligned(src_row, tile_width), 629fe8aea9eSmrg w); 630fe8aea9eSmrg } 631fe8aea9eSmrg } 632fe8aea9eSmrg} 633fe8aea9eSmrg 634fe8aea9eSmrg#pragma GCC push_options 63503b705cfSriastradh#endif 63603b705cfSriastradh 63703b705cfSriastradhfast void 63803b705cfSriastradhmemcpy_blt(const void *src, void *dst, int bpp, 63903b705cfSriastradh int32_t src_stride, int32_t dst_stride, 64003b705cfSriastradh int16_t src_x, int16_t src_y, 64103b705cfSriastradh int16_t dst_x, int16_t dst_y, 64203b705cfSriastradh uint16_t width, uint16_t height) 64303b705cfSriastradh{ 64403b705cfSriastradh const uint8_t *src_bytes; 64503b705cfSriastradh uint8_t *dst_bytes; 64603b705cfSriastradh int byte_width; 64703b705cfSriastradh 64803b705cfSriastradh assert(src); 64903b705cfSriastradh assert(dst); 65003b705cfSriastradh assert(width && height); 65103b705cfSriastradh assert(bpp >= 8); 65203b705cfSriastradh assert(width*bpp <= 8*src_stride); 65303b705cfSriastradh assert(width*bpp <= 8*dst_stride); 65403b705cfSriastradh 65503b705cfSriastradh DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 65603b705cfSriastradh __FUNCTION__, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 65703b705cfSriastradh 65803b705cfSriastradh bpp /= 8; 65903b705cfSriastradh 66003b705cfSriastradh src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp; 66103b705cfSriastradh dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp; 66203b705cfSriastradh 66303b705cfSriastradh byte_width = width * bpp; 66403b705cfSriastradh if (byte_width == src_stride && byte_width == dst_stride) { 66503b705cfSriastradh byte_width *= height; 66603b705cfSriastradh height = 1; 66703b705cfSriastradh } 66803b705cfSriastradh 66903b705cfSriastradh switch (byte_width) { 67003b705cfSriastradh case 1: 67103b705cfSriastradh do { 67203b705cfSriastradh *dst_bytes = *src_bytes; 67303b705cfSriastradh src_bytes += src_stride; 67403b705cfSriastradh dst_bytes += dst_stride; 67503b705cfSriastradh } while (--height); 67603b705cfSriastradh break; 67703b705cfSriastradh 67803b705cfSriastradh case 2: 67903b705cfSriastradh do { 68003b705cfSriastradh *(uint16_t *)dst_bytes = *(const uint16_t *)src_bytes; 68103b705cfSriastradh src_bytes += src_stride; 68203b705cfSriastradh dst_bytes += dst_stride; 68303b705cfSriastradh } while (--height); 68403b705cfSriastradh break; 68503b705cfSriastradh 68603b705cfSriastradh case 4: 68703b705cfSriastradh do { 68803b705cfSriastradh *(uint32_t *)dst_bytes = *(const uint32_t *)src_bytes; 68903b705cfSriastradh src_bytes += src_stride; 69003b705cfSriastradh dst_bytes += dst_stride; 69103b705cfSriastradh } while (--height); 69203b705cfSriastradh break; 69303b705cfSriastradh 69403b705cfSriastradh case 8: 69503b705cfSriastradh do { 69603b705cfSriastradh *(uint64_t *)dst_bytes = *(const uint64_t *)src_bytes; 69703b705cfSriastradh src_bytes += src_stride; 69803b705cfSriastradh dst_bytes += dst_stride; 69903b705cfSriastradh } while (--height); 70003b705cfSriastradh break; 70103b705cfSriastradh case 16: 70203b705cfSriastradh do { 70303b705cfSriastradh ((uint64_t *)dst_bytes)[0] = ((const uint64_t *)src_bytes)[0]; 70403b705cfSriastradh ((uint64_t *)dst_bytes)[1] = ((const uint64_t *)src_bytes)[1]; 70503b705cfSriastradh src_bytes += src_stride; 70603b705cfSriastradh dst_bytes += dst_stride; 70703b705cfSriastradh } while (--height); 70803b705cfSriastradh break; 70903b705cfSriastradh 71003b705cfSriastradh default: 71103b705cfSriastradh do { 71203b705cfSriastradh memcpy(dst_bytes, src_bytes, byte_width); 71303b705cfSriastradh src_bytes += src_stride; 71403b705cfSriastradh dst_bytes += dst_stride; 71503b705cfSriastradh } while (--height); 71603b705cfSriastradh break; 71703b705cfSriastradh } 71803b705cfSriastradh} 71903b705cfSriastradh 72003b705cfSriastradhstatic fast_memcpy void 72103b705cfSriastradhmemcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp, 72203b705cfSriastradh int32_t src_stride, int32_t dst_stride, 72303b705cfSriastradh int16_t src_x, int16_t src_y, 72403b705cfSriastradh int16_t dst_x, int16_t dst_y, 72503b705cfSriastradh uint16_t width, uint16_t height) 72603b705cfSriastradh{ 72703b705cfSriastradh const unsigned tile_width = 512; 72803b705cfSriastradh const unsigned tile_height = 8; 72903b705cfSriastradh const unsigned tile_size = 4096; 73003b705cfSriastradh 73103b705cfSriastradh const unsigned cpp = bpp / 8; 73242542f5fSchristos const unsigned tile_pixels = tile_width / cpp; 73342542f5fSchristos const unsigned tile_shift = ffs(tile_pixels) - 1; 73442542f5fSchristos const unsigned tile_mask = tile_pixels - 1; 73503b705cfSriastradh 73603b705cfSriastradh DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 73703b705cfSriastradh __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 73842542f5fSchristos assert(src != dst); 73942542f5fSchristos 74042542f5fSchristos if (src_x | src_y) 74142542f5fSchristos src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; 74242542f5fSchristos assert(src_stride >= width * cpp); 74342542f5fSchristos src_stride -= width * cpp; 74442542f5fSchristos 74542542f5fSchristos while (height--) { 74642542f5fSchristos unsigned w = width * cpp; 74742542f5fSchristos uint8_t *tile_row = dst; 74842542f5fSchristos 74942542f5fSchristos tile_row += dst_y / tile_height * dst_stride * tile_height; 75042542f5fSchristos tile_row += (dst_y & (tile_height-1)) * tile_width; 75142542f5fSchristos if (dst_x) { 75242542f5fSchristos tile_row += (dst_x >> tile_shift) * tile_size; 75342542f5fSchristos if (dst_x & tile_mask) { 75442542f5fSchristos const unsigned x = (dst_x & tile_mask) * cpp; 75542542f5fSchristos const unsigned len = min(tile_width - x, w); 756fe8aea9eSmrg memcpy(assume_misaligned(tile_row + x, tile_width, x), 757fe8aea9eSmrg src, len); 75842542f5fSchristos 75942542f5fSchristos tile_row += tile_size; 76042542f5fSchristos src = (const uint8_t *)src + len; 76142542f5fSchristos w -= len; 76242542f5fSchristos } 76303b705cfSriastradh } 76442542f5fSchristos while (w >= tile_width) { 765fe8aea9eSmrg memcpy(assume_aligned(tile_row, tile_width), 766fe8aea9eSmrg src, tile_width); 76742542f5fSchristos tile_row += tile_size; 76842542f5fSchristos src = (const uint8_t *)src + tile_width; 76942542f5fSchristos w -= tile_width; 77003b705cfSriastradh } 771fe8aea9eSmrg memcpy(assume_aligned(tile_row, tile_width), src, w); 77242542f5fSchristos src = (const uint8_t *)src + src_stride + w; 77342542f5fSchristos dst_y++; 77403b705cfSriastradh } 77503b705cfSriastradh} 77603b705cfSriastradh 77703b705cfSriastradhstatic fast_memcpy void 77803b705cfSriastradhmemcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp, 77903b705cfSriastradh int32_t src_stride, int32_t dst_stride, 78003b705cfSriastradh int16_t src_x, int16_t src_y, 78103b705cfSriastradh int16_t dst_x, int16_t dst_y, 78203b705cfSriastradh uint16_t width, uint16_t height) 78303b705cfSriastradh{ 78403b705cfSriastradh const unsigned tile_width = 512; 78503b705cfSriastradh const unsigned tile_height = 8; 78603b705cfSriastradh const unsigned tile_size = 4096; 78703b705cfSriastradh 78803b705cfSriastradh const unsigned cpp = bpp / 8; 78942542f5fSchristos const unsigned tile_pixels = tile_width / cpp; 79042542f5fSchristos const unsigned tile_shift = ffs(tile_pixels) - 1; 79142542f5fSchristos const unsigned tile_mask = tile_pixels - 1; 79203b705cfSriastradh 79303b705cfSriastradh DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 79403b705cfSriastradh __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 79542542f5fSchristos assert(src != dst); 79642542f5fSchristos 79742542f5fSchristos if (dst_x | dst_y) 79842542f5fSchristos dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; 79942542f5fSchristos assert(dst_stride >= width * cpp); 80042542f5fSchristos dst_stride -= width * cpp; 80142542f5fSchristos 80242542f5fSchristos while (height--) { 80342542f5fSchristos unsigned w = width * cpp; 80442542f5fSchristos const uint8_t *tile_row = src; 80542542f5fSchristos 80642542f5fSchristos tile_row += src_y / tile_height * src_stride * tile_height; 80742542f5fSchristos tile_row += (src_y & (tile_height-1)) * tile_width; 80842542f5fSchristos if (src_x) { 80942542f5fSchristos tile_row += (src_x >> tile_shift) * tile_size; 81042542f5fSchristos if (src_x & tile_mask) { 81142542f5fSchristos const unsigned x = (src_x & tile_mask) * cpp; 81242542f5fSchristos const unsigned len = min(tile_width - x, w); 813fe8aea9eSmrg memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len); 81442542f5fSchristos 81542542f5fSchristos tile_row += tile_size; 81642542f5fSchristos dst = (uint8_t *)dst + len; 81742542f5fSchristos w -= len; 81842542f5fSchristos } 81903b705cfSriastradh } 82042542f5fSchristos while (w >= tile_width) { 821fe8aea9eSmrg memcpy(dst, 822fe8aea9eSmrg assume_aligned(tile_row, tile_width), 823fe8aea9eSmrg tile_width); 82403b705cfSriastradh 82542542f5fSchristos tile_row += tile_size; 82642542f5fSchristos dst = (uint8_t *)dst + tile_width; 82742542f5fSchristos w -= tile_width; 82803b705cfSriastradh } 829fe8aea9eSmrg memcpy(dst, assume_aligned(tile_row, tile_width), w); 83042542f5fSchristos dst = (uint8_t *)dst + dst_stride + w; 83142542f5fSchristos src_y++; 83203b705cfSriastradh } 83303b705cfSriastradh} 83403b705cfSriastradh 835fe8aea9eSmrgstatic fast_memcpy void 836fe8aea9eSmrgmemcpy_between_tiled_x__swizzle_0(const void *src, void *dst, int bpp, 837fe8aea9eSmrg int32_t src_stride, int32_t dst_stride, 838fe8aea9eSmrg int16_t src_x, int16_t src_y, 839fe8aea9eSmrg int16_t dst_x, int16_t dst_y, 840fe8aea9eSmrg uint16_t width, uint16_t height) 84103b705cfSriastradh{ 84203b705cfSriastradh const unsigned tile_width = 512; 84303b705cfSriastradh const unsigned tile_height = 8; 84403b705cfSriastradh const unsigned tile_size = 4096; 84503b705cfSriastradh 84603b705cfSriastradh const unsigned cpp = bpp / 8; 847fe8aea9eSmrg const unsigned tile_pixels = tile_width / cpp; 848fe8aea9eSmrg const unsigned tile_shift = ffs(tile_pixels) - 1; 849fe8aea9eSmrg const unsigned tile_mask = tile_pixels - 1; 85003b705cfSriastradh 85103b705cfSriastradh DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 85203b705cfSriastradh __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 853fe8aea9eSmrg assert(src != dst); 854fe8aea9eSmrg assert((dst_x & tile_mask) == (src_x & tile_mask)); 85503b705cfSriastradh 856fe8aea9eSmrg while (height--) { 857fe8aea9eSmrg unsigned w = width * cpp; 858fe8aea9eSmrg uint8_t *dst_row = dst; 859fe8aea9eSmrg const uint8_t *src_row = src; 86003b705cfSriastradh 861fe8aea9eSmrg dst_row += dst_y / tile_height * dst_stride * tile_height; 862fe8aea9eSmrg dst_row += (dst_y & (tile_height-1)) * tile_width; 863fe8aea9eSmrg if (dst_x) 864fe8aea9eSmrg dst_row += (dst_x >> tile_shift) * tile_size; 865fe8aea9eSmrg dst_y++; 86603b705cfSriastradh 867fe8aea9eSmrg src_row += src_y / tile_height * src_stride * tile_height; 868fe8aea9eSmrg src_row += (src_y & (tile_height-1)) * tile_width; 869fe8aea9eSmrg if (src_x) 870fe8aea9eSmrg src_row += (src_x >> tile_shift) * tile_size; 871fe8aea9eSmrg src_y++; 87203b705cfSriastradh 873fe8aea9eSmrg if (dst_x & tile_mask) { 874fe8aea9eSmrg const unsigned x = (dst_x & tile_mask) * cpp; 875fe8aea9eSmrg const unsigned len = min(tile_width - x, w); 87603b705cfSriastradh 877fe8aea9eSmrg memcpy(assume_misaligned(dst_row + x, tile_width, x), 878fe8aea9eSmrg assume_misaligned(src_row + x, tile_width, x), 879fe8aea9eSmrg len); 88003b705cfSriastradh 881fe8aea9eSmrg dst_row += tile_size; 882fe8aea9eSmrg src_row += tile_size; 883fe8aea9eSmrg w -= len; 88403b705cfSriastradh } 88503b705cfSriastradh 886fe8aea9eSmrg while (w >= tile_width) { 887fe8aea9eSmrg memcpy(assume_aligned(dst_row, tile_width), 888fe8aea9eSmrg assume_aligned(src_row, tile_width), 889fe8aea9eSmrg tile_width); 890fe8aea9eSmrg dst_row += tile_size; 891fe8aea9eSmrg src_row += tile_size; 892fe8aea9eSmrg w -= tile_width; 89303b705cfSriastradh } 894fe8aea9eSmrg memcpy(assume_aligned(dst_row, tile_width), 895fe8aea9eSmrg assume_aligned(src_row, tile_width), 896fe8aea9eSmrg w); 89703b705cfSriastradh } 89803b705cfSriastradh} 89903b705cfSriastradh 900fe8aea9eSmrg#define memcpy_to_tiled_x(swizzle) \ 901fe8aea9eSmrgfast_memcpy static void \ 902fe8aea9eSmrgmemcpy_to_tiled_x__##swizzle (const void *src, void *dst, int bpp, \ 903fe8aea9eSmrg int32_t src_stride, int32_t dst_stride, \ 904fe8aea9eSmrg int16_t src_x, int16_t src_y, \ 905fe8aea9eSmrg int16_t dst_x, int16_t dst_y, \ 906fe8aea9eSmrg uint16_t width, uint16_t height) \ 907fe8aea9eSmrg{ \ 908fe8aea9eSmrg const unsigned tile_width = 512; \ 909fe8aea9eSmrg const unsigned tile_height = 8; \ 910fe8aea9eSmrg const unsigned tile_size = 4096; \ 911fe8aea9eSmrg const unsigned cpp = bpp / 8; \ 912fe8aea9eSmrg const unsigned stride_tiles = dst_stride / tile_width; \ 913fe8aea9eSmrg const unsigned swizzle_pixels = 64 / cpp; \ 914fe8aea9eSmrg const unsigned tile_pixels = ffs(tile_width / cpp) - 1; \ 915fe8aea9eSmrg const unsigned tile_mask = (1 << tile_pixels) - 1; \ 916fe8aea9eSmrg unsigned x, y; \ 917fe8aea9eSmrg DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", \ 918fe8aea9eSmrg __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); \ 919fe8aea9eSmrg src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; \ 920fe8aea9eSmrg for (y = 0; y < height; ++y) { \ 921fe8aea9eSmrg const uint32_t dy = y + dst_y; \ 922fe8aea9eSmrg const uint32_t tile_row = \ 923fe8aea9eSmrg (dy / tile_height * stride_tiles * tile_size + \ 924fe8aea9eSmrg (dy & (tile_height-1)) * tile_width); \ 925fe8aea9eSmrg const uint8_t *src_row = (const uint8_t *)src + src_stride * y; \ 926fe8aea9eSmrg uint32_t dx = dst_x; \ 927fe8aea9eSmrg x = width * cpp; \ 928fe8aea9eSmrg if (dx & (swizzle_pixels - 1)) { \ 929fe8aea9eSmrg const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); \ 930fe8aea9eSmrg const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; \ 931fe8aea9eSmrg uint32_t offset = \ 932fe8aea9eSmrg tile_row + \ 933fe8aea9eSmrg (dx >> tile_pixels) * tile_size + \ 934fe8aea9eSmrg (dx & tile_mask) * cpp; \ 935fe8aea9eSmrg memcpy((char *)dst + swizzle(offset), src_row, length * cpp); \ 936fe8aea9eSmrg src_row += length * cpp; \ 937fe8aea9eSmrg x -= length * cpp; \ 938fe8aea9eSmrg dx += length; \ 939fe8aea9eSmrg } \ 940fe8aea9eSmrg while (x >= 64) { \ 941fe8aea9eSmrg uint32_t offset = \ 942fe8aea9eSmrg tile_row + \ 943fe8aea9eSmrg (dx >> tile_pixels) * tile_size + \ 944fe8aea9eSmrg (dx & tile_mask) * cpp; \ 945fe8aea9eSmrg memcpy(assume_aligned((char *)dst+swizzle(offset),64), \ 946fe8aea9eSmrg src_row, 64); \ 947fe8aea9eSmrg src_row += 64; \ 948fe8aea9eSmrg x -= 64; \ 949fe8aea9eSmrg dx += swizzle_pixels; \ 950fe8aea9eSmrg } \ 951fe8aea9eSmrg if (x) { \ 952fe8aea9eSmrg uint32_t offset = \ 953fe8aea9eSmrg tile_row + \ 954fe8aea9eSmrg (dx >> tile_pixels) * tile_size + \ 955fe8aea9eSmrg (dx & tile_mask) * cpp; \ 956fe8aea9eSmrg memcpy(assume_aligned((char *)dst + swizzle(offset), 64), src_row, x); \ 957fe8aea9eSmrg } \ 958fe8aea9eSmrg } \ 959fe8aea9eSmrg} 96003b705cfSriastradh 961fe8aea9eSmrg#define memcpy_from_tiled_x(swizzle) \ 962fe8aea9eSmrgfast_memcpy static void \ 963fe8aea9eSmrgmemcpy_from_tiled_x__##swizzle (const void *src, void *dst, int bpp, \ 964fe8aea9eSmrg int32_t src_stride, int32_t dst_stride, \ 965fe8aea9eSmrg int16_t src_x, int16_t src_y, \ 966fe8aea9eSmrg int16_t dst_x, int16_t dst_y, \ 967fe8aea9eSmrg uint16_t width, uint16_t height) \ 968fe8aea9eSmrg{ \ 969fe8aea9eSmrg const unsigned tile_width = 512; \ 970fe8aea9eSmrg const unsigned tile_height = 8; \ 971fe8aea9eSmrg const unsigned tile_size = 4096; \ 972fe8aea9eSmrg const unsigned cpp = bpp / 8; \ 973fe8aea9eSmrg const unsigned stride_tiles = src_stride / tile_width; \ 974fe8aea9eSmrg const unsigned swizzle_pixels = 64 / cpp; \ 975fe8aea9eSmrg const unsigned tile_pixels = ffs(tile_width / cpp) - 1; \ 976fe8aea9eSmrg const unsigned tile_mask = (1 << tile_pixels) - 1; \ 977fe8aea9eSmrg unsigned x, y; \ 978fe8aea9eSmrg DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", \ 979fe8aea9eSmrg __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); \ 980fe8aea9eSmrg dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; \ 981fe8aea9eSmrg for (y = 0; y < height; ++y) { \ 982fe8aea9eSmrg const uint32_t sy = y + src_y; \ 983fe8aea9eSmrg const uint32_t tile_row = \ 984fe8aea9eSmrg (sy / tile_height * stride_tiles * tile_size + \ 985fe8aea9eSmrg (sy & (tile_height-1)) * tile_width); \ 986fe8aea9eSmrg uint8_t *dst_row = (uint8_t *)dst + dst_stride * y; \ 987fe8aea9eSmrg uint32_t sx = src_x; \ 988fe8aea9eSmrg x = width * cpp; \ 989fe8aea9eSmrg if (sx & (swizzle_pixels - 1)) { \ 990fe8aea9eSmrg const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels); \ 991fe8aea9eSmrg const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx; \ 992fe8aea9eSmrg uint32_t offset = \ 993fe8aea9eSmrg tile_row + \ 994fe8aea9eSmrg (sx >> tile_pixels) * tile_size + \ 995fe8aea9eSmrg (sx & tile_mask) * cpp; \ 996fe8aea9eSmrg memcpy(dst_row, (const char *)src + swizzle(offset), length * cpp); \ 997fe8aea9eSmrg dst_row += length * cpp; \ 998fe8aea9eSmrg x -= length * cpp; \ 999fe8aea9eSmrg sx += length; \ 1000fe8aea9eSmrg } \ 1001fe8aea9eSmrg while (x >= 64) { \ 1002fe8aea9eSmrg uint32_t offset = \ 1003fe8aea9eSmrg tile_row + \ 1004fe8aea9eSmrg (sx >> tile_pixels) * tile_size + \ 1005fe8aea9eSmrg (sx & tile_mask) * cpp; \ 1006fe8aea9eSmrg memcpy(dst_row, assume_aligned((const char *)src + swizzle(offset), 64), 64); \ 1007fe8aea9eSmrg dst_row += 64; \ 1008fe8aea9eSmrg x -= 64; \ 1009fe8aea9eSmrg sx += swizzle_pixels; \ 1010fe8aea9eSmrg } \ 1011fe8aea9eSmrg if (x) { \ 1012fe8aea9eSmrg uint32_t offset = \ 1013fe8aea9eSmrg tile_row + \ 1014fe8aea9eSmrg (sx >> tile_pixels) * tile_size + \ 1015fe8aea9eSmrg (sx & tile_mask) * cpp; \ 1016fe8aea9eSmrg memcpy(dst_row, assume_aligned((const char *)src + swizzle(offset), 64), x); \ 1017fe8aea9eSmrg } \ 1018fe8aea9eSmrg } \ 1019fe8aea9eSmrg} 102003b705cfSriastradh 1021fe8aea9eSmrg#define swizzle_9(X) ((X) ^ (((X) >> 3) & 64)) 1022fe8aea9eSmrgmemcpy_to_tiled_x(swizzle_9) 1023fe8aea9eSmrgmemcpy_from_tiled_x(swizzle_9) 1024fe8aea9eSmrg#undef swizzle_9 102503b705cfSriastradh 1026fe8aea9eSmrg#define swizzle_9_10(X) ((X) ^ ((((X) ^ ((X) >> 1)) >> 3) & 64)) 1027fe8aea9eSmrgmemcpy_to_tiled_x(swizzle_9_10) 1028fe8aea9eSmrgmemcpy_from_tiled_x(swizzle_9_10) 1029fe8aea9eSmrg#undef swizzle_9_10 103003b705cfSriastradh 1031fe8aea9eSmrg#define swizzle_9_11(X) ((X) ^ ((((X) ^ ((X) >> 2)) >> 3) & 64)) 1032fe8aea9eSmrgmemcpy_to_tiled_x(swizzle_9_11) 1033fe8aea9eSmrgmemcpy_from_tiled_x(swizzle_9_11) 1034fe8aea9eSmrg#undef swizzle_9_11 103503b705cfSriastradh 1036fe8aea9eSmrg#define swizzle_9_10_11(X) ((X) ^ ((((X) ^ ((X) >> 1) ^ ((X) >> 2)) >> 3) & 64)) 1037fe8aea9eSmrgmemcpy_to_tiled_x(swizzle_9_10_11) 1038fe8aea9eSmrgmemcpy_from_tiled_x(swizzle_9_10_11) 1039fe8aea9eSmrg#undef swizzle_9_10_11 104003b705cfSriastradh 1041fe8aea9eSmrgstatic fast_memcpy void 1042fe8aea9eSmrgmemcpy_to_tiled_x__gen2(const void *src, void *dst, int bpp, 1043fe8aea9eSmrg int32_t src_stride, int32_t dst_stride, 1044fe8aea9eSmrg int16_t src_x, int16_t src_y, 1045fe8aea9eSmrg int16_t dst_x, int16_t dst_y, 1046fe8aea9eSmrg uint16_t width, uint16_t height) 104703b705cfSriastradh{ 1048fe8aea9eSmrg const unsigned tile_width = 128; 1049fe8aea9eSmrg const unsigned tile_height = 16; 1050fe8aea9eSmrg const unsigned tile_size = 2048; 105103b705cfSriastradh 105203b705cfSriastradh const unsigned cpp = bpp / 8; 1053fe8aea9eSmrg const unsigned tile_pixels = tile_width / cpp; 1054fe8aea9eSmrg const unsigned tile_shift = ffs(tile_pixels) - 1; 1055fe8aea9eSmrg const unsigned tile_mask = tile_pixels - 1; 105603b705cfSriastradh 105703b705cfSriastradh DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 105803b705cfSriastradh __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 1059fe8aea9eSmrg assert(src != dst); 106003b705cfSriastradh 1061fe8aea9eSmrg if (src_x | src_y) 1062fe8aea9eSmrg src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; 1063fe8aea9eSmrg assert(src_stride >= width * cpp); 1064fe8aea9eSmrg src_stride -= width * cpp; 106503b705cfSriastradh 1066fe8aea9eSmrg while (height--) { 1067fe8aea9eSmrg unsigned w = width * cpp; 1068fe8aea9eSmrg uint8_t *tile_row = dst; 106903b705cfSriastradh 1070fe8aea9eSmrg tile_row += dst_y / tile_height * dst_stride * tile_height; 1071fe8aea9eSmrg tile_row += (dst_y & (tile_height-1)) * tile_width; 1072fe8aea9eSmrg if (dst_x) { 1073fe8aea9eSmrg tile_row += (dst_x >> tile_shift) * tile_size; 1074fe8aea9eSmrg if (dst_x & tile_mask) { 1075fe8aea9eSmrg const unsigned x = (dst_x & tile_mask) * cpp; 1076fe8aea9eSmrg const unsigned len = min(tile_width - x, w); 1077fe8aea9eSmrg memcpy(assume_misaligned(tile_row + x, tile_width, x), src, len); 107803b705cfSriastradh 1079fe8aea9eSmrg tile_row += tile_size; 1080fe8aea9eSmrg src = (const uint8_t *)src + len; 1081fe8aea9eSmrg w -= len; 1082fe8aea9eSmrg } 108303b705cfSriastradh } 1084fe8aea9eSmrg while (w >= tile_width) { 1085fe8aea9eSmrg memcpy(assume_aligned(tile_row, tile_width), 1086fe8aea9eSmrg src, tile_width); 108703b705cfSriastradh 1088fe8aea9eSmrg tile_row += tile_size; 1089fe8aea9eSmrg src = (const uint8_t *)src + tile_width; 1090fe8aea9eSmrg w -= tile_width; 109103b705cfSriastradh } 1092fe8aea9eSmrg memcpy(assume_aligned(tile_row, tile_width), src, w); 1093fe8aea9eSmrg src = (const uint8_t *)src + src_stride + w; 1094fe8aea9eSmrg dst_y++; 109503b705cfSriastradh } 109603b705cfSriastradh} 109703b705cfSriastradh 1098fe8aea9eSmrgstatic fast_memcpy void 1099fe8aea9eSmrgmemcpy_from_tiled_x__gen2(const void *src, void *dst, int bpp, 1100fe8aea9eSmrg int32_t src_stride, int32_t dst_stride, 1101fe8aea9eSmrg int16_t src_x, int16_t src_y, 1102fe8aea9eSmrg int16_t dst_x, int16_t dst_y, 1103fe8aea9eSmrg uint16_t width, uint16_t height) 110403b705cfSriastradh{ 1105fe8aea9eSmrg const unsigned tile_width = 128; 1106fe8aea9eSmrg const unsigned tile_height = 16; 1107fe8aea9eSmrg const unsigned tile_size = 2048; 110803b705cfSriastradh 110903b705cfSriastradh const unsigned cpp = bpp / 8; 1110fe8aea9eSmrg const unsigned tile_pixels = tile_width / cpp; 1111fe8aea9eSmrg const unsigned tile_shift = ffs(tile_pixels) - 1; 1112fe8aea9eSmrg const unsigned tile_mask = tile_pixels - 1; 111303b705cfSriastradh 111403b705cfSriastradh DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", 111503b705cfSriastradh __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); 1116fe8aea9eSmrg assert(src != dst); 111703b705cfSriastradh 1118fe8aea9eSmrg if (dst_x | dst_y) 1119fe8aea9eSmrg dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; 1120fe8aea9eSmrg assert(dst_stride >= width * cpp); 1121fe8aea9eSmrg dst_stride -= width * cpp; 1122fe8aea9eSmrg 1123fe8aea9eSmrg while (height--) { 1124fe8aea9eSmrg unsigned w = width * cpp; 1125fe8aea9eSmrg const uint8_t *tile_row = src; 112603b705cfSriastradh 1127fe8aea9eSmrg tile_row += src_y / tile_height * src_stride * tile_height; 1128fe8aea9eSmrg tile_row += (src_y & (tile_height-1)) * tile_width; 1129fe8aea9eSmrg if (src_x) { 1130fe8aea9eSmrg tile_row += (src_x >> tile_shift) * tile_size; 1131fe8aea9eSmrg if (src_x & tile_mask) { 1132fe8aea9eSmrg const unsigned x = (src_x & tile_mask) * cpp; 1133fe8aea9eSmrg const unsigned len = min(tile_width - x, w); 1134fe8aea9eSmrg memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len); 113503b705cfSriastradh 1136fe8aea9eSmrg tile_row += tile_size; 1137fe8aea9eSmrg dst = (uint8_t *)dst + len; 1138fe8aea9eSmrg w -= len; 1139fe8aea9eSmrg } 114003b705cfSriastradh } 1141fe8aea9eSmrg while (w >= tile_width) { 1142fe8aea9eSmrg memcpy(dst, 1143fe8aea9eSmrg assume_aligned(tile_row, tile_width), 1144fe8aea9eSmrg tile_width); 1145fe8aea9eSmrg 1146fe8aea9eSmrg tile_row += tile_size; 1147fe8aea9eSmrg dst = (uint8_t *)dst + tile_width; 1148fe8aea9eSmrg w -= tile_width; 114903b705cfSriastradh } 1150fe8aea9eSmrg memcpy(dst, assume_aligned(tile_row, tile_width), w); 1151fe8aea9eSmrg dst = (uint8_t *)dst + dst_stride + w; 1152fe8aea9eSmrg src_y++; 115303b705cfSriastradh } 115403b705cfSriastradh} 115503b705cfSriastradh 1156fe8aea9eSmrgvoid choose_memcpy_tiled_x(struct kgem *kgem, int swizzling, unsigned cpu) 115703b705cfSriastradh{ 1158fe8aea9eSmrg if (kgem->gen < 030) { 1159fe8aea9eSmrg if (swizzling == I915_BIT_6_SWIZZLE_NONE) { 1160fe8aea9eSmrg DBG(("%s: gen2, no swizzling\n", __FUNCTION__)); 1161fe8aea9eSmrg kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__gen2; 1162fe8aea9eSmrg kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__gen2; 1163fe8aea9eSmrg } else 1164fe8aea9eSmrg DBG(("%s: no detiling with swizzle functions for gen2\n", __FUNCTION__)); 1165fe8aea9eSmrg return; 1166fe8aea9eSmrg } 1167fe8aea9eSmrg 116803b705cfSriastradh switch (swizzling) { 116903b705cfSriastradh default: 117003b705cfSriastradh DBG(("%s: unknown swizzling, %d\n", __FUNCTION__, swizzling)); 117103b705cfSriastradh break; 117203b705cfSriastradh case I915_BIT_6_SWIZZLE_NONE: 117303b705cfSriastradh DBG(("%s: no swizzling\n", __FUNCTION__)); 1174fe8aea9eSmrg#if defined(sse2) 1175fe8aea9eSmrg if (cpu & SSE2) { 1176fe8aea9eSmrg kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0__sse2; 1177fe8aea9eSmrg kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0__sse2; 1178fe8aea9eSmrg kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0__sse2; 1179fe8aea9eSmrg } else 1180fe8aea9eSmrg#endif 1181fe8aea9eSmrg { 1182fe8aea9eSmrg kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0; 1183fe8aea9eSmrg kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0; 1184fe8aea9eSmrg kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0; 1185fe8aea9eSmrg } 118603b705cfSriastradh break; 118703b705cfSriastradh case I915_BIT_6_SWIZZLE_9: 118803b705cfSriastradh DBG(("%s: 6^9 swizzling\n", __FUNCTION__)); 118903b705cfSriastradh kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9; 119003b705cfSriastradh kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9; 119103b705cfSriastradh break; 119203b705cfSriastradh case I915_BIT_6_SWIZZLE_9_10: 119303b705cfSriastradh DBG(("%s: 6^9^10 swizzling\n", __FUNCTION__)); 119403b705cfSriastradh kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10; 119503b705cfSriastradh kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_10; 119603b705cfSriastradh break; 119703b705cfSriastradh case I915_BIT_6_SWIZZLE_9_11: 119803b705cfSriastradh DBG(("%s: 6^9^11 swizzling\n", __FUNCTION__)); 119903b705cfSriastradh kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_11; 120003b705cfSriastradh kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_11; 120103b705cfSriastradh break; 1202fe8aea9eSmrg case I915_BIT_6_SWIZZLE_9_10_11: 1203fe8aea9eSmrg DBG(("%s: 6^9^10^11 swizzling\n", __FUNCTION__)); 1204fe8aea9eSmrg kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10_11; 1205fe8aea9eSmrg kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_10_11; 1206fe8aea9eSmrg break; 120703b705cfSriastradh } 120803b705cfSriastradh} 120903b705cfSriastradh 121003b705cfSriastradhvoid 121103b705cfSriastradhmemmove_box(const void *src, void *dst, 121203b705cfSriastradh int bpp, int32_t stride, 121303b705cfSriastradh const BoxRec *box, 121403b705cfSriastradh int dx, int dy) 121503b705cfSriastradh{ 121642542f5fSchristos#define FORCE_MEMMOVE 0 121703b705cfSriastradh union { 121803b705cfSriastradh uint8_t u8; 121903b705cfSriastradh uint16_t u16; 122003b705cfSriastradh uint32_t u32; 122103b705cfSriastradh uint64_t u64; 122203b705cfSriastradh } tmp; 122303b705cfSriastradh const uint8_t *src_bytes; 122403b705cfSriastradh uint8_t *dst_bytes; 122503b705cfSriastradh int width, height; 122603b705cfSriastradh 122703b705cfSriastradh assert(src); 122803b705cfSriastradh assert(dst); 122942542f5fSchristos assert(src != dst); 123003b705cfSriastradh assert(bpp >= 8); 123103b705cfSriastradh assert(box->x2 > box->x1); 123203b705cfSriastradh assert(box->y2 > box->y1); 123303b705cfSriastradh 123403b705cfSriastradh DBG(("%s: box=(%d, %d), (%d, %d), pitch=%d, bpp=%d, dx=%d, dy=%d\n", 123503b705cfSriastradh __FUNCTION__, 123603b705cfSriastradh box->x1, box->y1, box->x2, box->y2, 123703b705cfSriastradh stride, bpp, dx, dy)); 123803b705cfSriastradh 123903b705cfSriastradh bpp /= 8; 124003b705cfSriastradh width = box->y1 * stride + box->x1 * bpp; 124103b705cfSriastradh src_bytes = (const uint8_t *)src + width; 124203b705cfSriastradh dst_bytes = (uint8_t *)dst + width; 124342542f5fSchristos assert(dst_bytes != src_bytes); 124403b705cfSriastradh 124503b705cfSriastradh width = (box->x2 - box->x1) * bpp; 124603b705cfSriastradh height = (box->y2 - box->y1); 124742542f5fSchristos assert(width <= stride); 124803b705cfSriastradh if (width == stride) { 124903b705cfSriastradh width *= height; 125003b705cfSriastradh height = 1; 125103b705cfSriastradh } 125203b705cfSriastradh 125303b705cfSriastradh if (dy >= 0) { 125403b705cfSriastradh switch (width) { 125503b705cfSriastradh case 1: 125603b705cfSriastradh do { 125703b705cfSriastradh *dst_bytes = tmp.u8 = *src_bytes; 125803b705cfSriastradh src_bytes += stride; 125903b705cfSriastradh dst_bytes += stride; 126003b705cfSriastradh } while (--height); 126103b705cfSriastradh break; 126203b705cfSriastradh 126303b705cfSriastradh case 2: 126403b705cfSriastradh do { 126503b705cfSriastradh *(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes; 126603b705cfSriastradh src_bytes += stride; 126703b705cfSriastradh dst_bytes += stride; 126803b705cfSriastradh } while (--height); 126903b705cfSriastradh break; 127003b705cfSriastradh 127103b705cfSriastradh case 4: 127203b705cfSriastradh do { 127303b705cfSriastradh *(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes; 127403b705cfSriastradh src_bytes += stride; 127503b705cfSriastradh dst_bytes += stride; 127603b705cfSriastradh } while (--height); 127703b705cfSriastradh break; 127803b705cfSriastradh 127903b705cfSriastradh case 8: 128003b705cfSriastradh do { 128103b705cfSriastradh *(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes; 128203b705cfSriastradh src_bytes += stride; 128303b705cfSriastradh dst_bytes += stride; 128403b705cfSriastradh } while (--height); 128503b705cfSriastradh break; 128603b705cfSriastradh 128703b705cfSriastradh default: 128842542f5fSchristos if (FORCE_MEMMOVE || 128942542f5fSchristos (dst_bytes < src_bytes + width && 129042542f5fSchristos src_bytes < dst_bytes + width)) { 129103b705cfSriastradh do { 129203b705cfSriastradh memmove(dst_bytes, src_bytes, width); 129303b705cfSriastradh src_bytes += stride; 129403b705cfSriastradh dst_bytes += stride; 129503b705cfSriastradh } while (--height); 129603b705cfSriastradh } else { 129703b705cfSriastradh do { 129803b705cfSriastradh memcpy(dst_bytes, src_bytes, width); 129903b705cfSriastradh src_bytes += stride; 130003b705cfSriastradh dst_bytes += stride; 130103b705cfSriastradh } while (--height); 130203b705cfSriastradh } 130303b705cfSriastradh break; 130403b705cfSriastradh } 130503b705cfSriastradh } else { 130603b705cfSriastradh src_bytes += (height-1) * stride; 130703b705cfSriastradh dst_bytes += (height-1) * stride; 130803b705cfSriastradh 130903b705cfSriastradh switch (width) { 131003b705cfSriastradh case 1: 131103b705cfSriastradh do { 131203b705cfSriastradh *dst_bytes = tmp.u8 = *src_bytes; 131303b705cfSriastradh src_bytes -= stride; 131403b705cfSriastradh dst_bytes -= stride; 131503b705cfSriastradh } while (--height); 131603b705cfSriastradh break; 131703b705cfSriastradh 131803b705cfSriastradh case 2: 131903b705cfSriastradh do { 132003b705cfSriastradh *(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes; 132103b705cfSriastradh src_bytes -= stride; 132203b705cfSriastradh dst_bytes -= stride; 132303b705cfSriastradh } while (--height); 132403b705cfSriastradh break; 132503b705cfSriastradh 132603b705cfSriastradh case 4: 132703b705cfSriastradh do { 132803b705cfSriastradh *(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes; 132903b705cfSriastradh src_bytes -= stride; 133003b705cfSriastradh dst_bytes -= stride; 133103b705cfSriastradh } while (--height); 133203b705cfSriastradh break; 133303b705cfSriastradh 133403b705cfSriastradh case 8: 133503b705cfSriastradh do { 133603b705cfSriastradh *(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes; 133703b705cfSriastradh src_bytes -= stride; 133803b705cfSriastradh dst_bytes -= stride; 133903b705cfSriastradh } while (--height); 134003b705cfSriastradh break; 134103b705cfSriastradh 134203b705cfSriastradh default: 134342542f5fSchristos if (FORCE_MEMMOVE || 134442542f5fSchristos (dst_bytes < src_bytes + width && 134542542f5fSchristos src_bytes < dst_bytes + width)) { 134603b705cfSriastradh do { 134703b705cfSriastradh memmove(dst_bytes, src_bytes, width); 134803b705cfSriastradh src_bytes -= stride; 134903b705cfSriastradh dst_bytes -= stride; 135003b705cfSriastradh } while (--height); 135103b705cfSriastradh } else { 135203b705cfSriastradh do { 135303b705cfSriastradh memcpy(dst_bytes, src_bytes, width); 135403b705cfSriastradh src_bytes -= stride; 135503b705cfSriastradh dst_bytes -= stride; 135603b705cfSriastradh } while (--height); 135703b705cfSriastradh } 135803b705cfSriastradh break; 135903b705cfSriastradh } 136003b705cfSriastradh } 136103b705cfSriastradh} 136203b705cfSriastradh 136303b705cfSriastradhvoid 136403b705cfSriastradhmemcpy_xor(const void *src, void *dst, int bpp, 136503b705cfSriastradh int32_t src_stride, int32_t dst_stride, 136603b705cfSriastradh int16_t src_x, int16_t src_y, 136703b705cfSriastradh int16_t dst_x, int16_t dst_y, 136803b705cfSriastradh uint16_t width, uint16_t height, 136903b705cfSriastradh uint32_t and, uint32_t or) 137003b705cfSriastradh{ 137103b705cfSriastradh const uint8_t *src_bytes; 137203b705cfSriastradh uint8_t *dst_bytes; 137342542f5fSchristos int i, w; 137403b705cfSriastradh 137503b705cfSriastradh assert(width && height); 137603b705cfSriastradh assert(bpp >= 8); 137703b705cfSriastradh assert(width*bpp <= 8*src_stride); 137803b705cfSriastradh assert(width*bpp <= 8*dst_stride); 137903b705cfSriastradh 138003b705cfSriastradh DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d, bpp=%d, and=%x, xor=%x\n", 138103b705cfSriastradh __FUNCTION__, 138203b705cfSriastradh src_x, src_y, dst_x, dst_y, 138303b705cfSriastradh width, height, 138403b705cfSriastradh src_stride, dst_stride, 138503b705cfSriastradh bpp, and, or)); 138603b705cfSriastradh 138703b705cfSriastradh bpp /= 8; 138803b705cfSriastradh src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp; 138903b705cfSriastradh dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp; 139003b705cfSriastradh 139103b705cfSriastradh if (and == 0xffffffff) { 139203b705cfSriastradh switch (bpp) { 139303b705cfSriastradh case 1: 139403b705cfSriastradh if (width & 1) { 139503b705cfSriastradh do { 139603b705cfSriastradh for (i = 0; i < width; i++) 139703b705cfSriastradh dst_bytes[i] = src_bytes[i] | or; 139803b705cfSriastradh 139903b705cfSriastradh src_bytes += src_stride; 140003b705cfSriastradh dst_bytes += dst_stride; 140103b705cfSriastradh } while (--height); 140203b705cfSriastradh break; 140303b705cfSriastradh } else { 140403b705cfSriastradh width /= 2; 140503b705cfSriastradh or |= or << 8; 140603b705cfSriastradh } 140703b705cfSriastradh case 2: 140803b705cfSriastradh if (width & 1) { 140903b705cfSriastradh do { 141003b705cfSriastradh uint16_t *d = (uint16_t *)dst_bytes; 141103b705cfSriastradh const uint16_t *s = (const uint16_t *)src_bytes; 141203b705cfSriastradh 141303b705cfSriastradh for (i = 0; i < width; i++) 141403b705cfSriastradh d[i] = s[i] | or; 141503b705cfSriastradh 141603b705cfSriastradh src_bytes += src_stride; 141703b705cfSriastradh dst_bytes += dst_stride; 141803b705cfSriastradh } while (--height); 141903b705cfSriastradh break; 142003b705cfSriastradh } else { 142103b705cfSriastradh width /= 2; 142203b705cfSriastradh or |= or << 16; 142303b705cfSriastradh } 142403b705cfSriastradh case 4: 142542542f5fSchristos w = width; 142642542f5fSchristos if (w * 4 == dst_stride && dst_stride == src_stride) { 142742542f5fSchristos w *= height; 142803b705cfSriastradh height = 1; 142903b705cfSriastradh } 143003b705cfSriastradh 1431fe8aea9eSmrg#if defined(sse2) && __x86_64__ 143203b705cfSriastradh if (have_sse2()) { 143303b705cfSriastradh do { 143403b705cfSriastradh uint32_t *d = (uint32_t *)dst_bytes; 143503b705cfSriastradh const uint32_t *s = (const uint32_t *)src_bytes; 143603b705cfSriastradh __m128i mask = xmm_create_mask_32(or); 143703b705cfSriastradh 143842542f5fSchristos i = w; 143903b705cfSriastradh while (i && (uintptr_t)d & 15) { 144003b705cfSriastradh *d++ = *s++ | or; 144103b705cfSriastradh i--; 144203b705cfSriastradh } 144303b705cfSriastradh 144403b705cfSriastradh while (i >= 16) { 144503b705cfSriastradh __m128i xmm1, xmm2, xmm3, xmm4; 144603b705cfSriastradh 144703b705cfSriastradh xmm1 = xmm_load_128u((const __m128i*)s + 0); 144803b705cfSriastradh xmm2 = xmm_load_128u((const __m128i*)s + 1); 144903b705cfSriastradh xmm3 = xmm_load_128u((const __m128i*)s + 2); 145003b705cfSriastradh xmm4 = xmm_load_128u((const __m128i*)s + 3); 145103b705cfSriastradh 145203b705cfSriastradh xmm_save_128((__m128i*)d + 0, 145303b705cfSriastradh _mm_or_si128(xmm1, mask)); 145403b705cfSriastradh xmm_save_128((__m128i*)d + 1, 145503b705cfSriastradh _mm_or_si128(xmm2, mask)); 145603b705cfSriastradh xmm_save_128((__m128i*)d + 2, 145703b705cfSriastradh _mm_or_si128(xmm3, mask)); 145803b705cfSriastradh xmm_save_128((__m128i*)d + 3, 145903b705cfSriastradh _mm_or_si128(xmm4, mask)); 146003b705cfSriastradh 146103b705cfSriastradh d += 16; 146203b705cfSriastradh s += 16; 146303b705cfSriastradh i -= 16; 146403b705cfSriastradh } 146503b705cfSriastradh 146603b705cfSriastradh if (i & 8) { 146703b705cfSriastradh __m128i xmm1, xmm2; 146803b705cfSriastradh 146903b705cfSriastradh xmm1 = xmm_load_128u((const __m128i*)s + 0); 147003b705cfSriastradh xmm2 = xmm_load_128u((const __m128i*)s + 1); 147103b705cfSriastradh 147203b705cfSriastradh xmm_save_128((__m128i*)d + 0, 147303b705cfSriastradh _mm_or_si128(xmm1, mask)); 147403b705cfSriastradh xmm_save_128((__m128i*)d + 1, 147503b705cfSriastradh _mm_or_si128(xmm2, mask)); 147603b705cfSriastradh d += 8; 147703b705cfSriastradh s += 8; 147803b705cfSriastradh i -= 8; 147903b705cfSriastradh } 148003b705cfSriastradh 148103b705cfSriastradh if (i & 4) { 148203b705cfSriastradh xmm_save_128((__m128i*)d, 148303b705cfSriastradh _mm_or_si128(xmm_load_128u((const __m128i*)s), 148403b705cfSriastradh mask)); 148503b705cfSriastradh 148603b705cfSriastradh d += 4; 148703b705cfSriastradh s += 4; 148803b705cfSriastradh i -= 4; 148903b705cfSriastradh } 149003b705cfSriastradh 149103b705cfSriastradh while (i) { 149203b705cfSriastradh *d++ = *s++ | or; 149303b705cfSriastradh i--; 149403b705cfSriastradh } 149503b705cfSriastradh 149603b705cfSriastradh src_bytes += src_stride; 149703b705cfSriastradh dst_bytes += dst_stride; 149803b705cfSriastradh } while (--height); 149903b705cfSriastradh } else 150003b705cfSriastradh#else 150103b705cfSriastradh do { 150203b705cfSriastradh uint32_t *d = (uint32_t *)dst_bytes; 150303b705cfSriastradh uint32_t *s = (uint32_t *)src_bytes; 150403b705cfSriastradh 150542542f5fSchristos for (i = 0; i < w; i++) 150603b705cfSriastradh d[i] = s[i] | or; 150703b705cfSriastradh 150803b705cfSriastradh src_bytes += src_stride; 150903b705cfSriastradh dst_bytes += dst_stride; 151003b705cfSriastradh } while (--height); 151103b705cfSriastradh#endif 151203b705cfSriastradh break; 151303b705cfSriastradh } 151403b705cfSriastradh } else { 151503b705cfSriastradh switch (bpp) { 151603b705cfSriastradh case 1: 151703b705cfSriastradh do { 151803b705cfSriastradh for (i = 0; i < width; i++) 151903b705cfSriastradh dst_bytes[i] = (src_bytes[i] & and) | or; 152003b705cfSriastradh 152103b705cfSriastradh src_bytes += src_stride; 152203b705cfSriastradh dst_bytes += dst_stride; 152303b705cfSriastradh } while (--height); 152403b705cfSriastradh break; 152503b705cfSriastradh 152603b705cfSriastradh case 2: 152703b705cfSriastradh do { 152803b705cfSriastradh uint16_t *d = (uint16_t *)dst_bytes; 152903b705cfSriastradh const uint16_t *s = (const uint16_t *)src_bytes; 153003b705cfSriastradh 153103b705cfSriastradh for (i = 0; i < width; i++) 153203b705cfSriastradh d[i] = (s[i] & and) | or; 153303b705cfSriastradh 153403b705cfSriastradh src_bytes += src_stride; 153503b705cfSriastradh dst_bytes += dst_stride; 153603b705cfSriastradh } while (--height); 153703b705cfSriastradh break; 153803b705cfSriastradh 153903b705cfSriastradh case 4: 154003b705cfSriastradh do { 154103b705cfSriastradh uint32_t *d = (uint32_t *)dst_bytes; 154203b705cfSriastradh const uint32_t *s = (const uint32_t *)src_bytes; 154303b705cfSriastradh 154403b705cfSriastradh for (i = 0; i < width; i++) 154503b705cfSriastradh d[i] = (s[i] & and) | or; 154603b705cfSriastradh 154703b705cfSriastradh src_bytes += src_stride; 154803b705cfSriastradh dst_bytes += dst_stride; 154903b705cfSriastradh } while (--height); 155003b705cfSriastradh break; 155103b705cfSriastradh } 155203b705cfSriastradh } 155303b705cfSriastradh} 1554fe8aea9eSmrg 1555fe8aea9eSmrg#define BILINEAR_INTERPOLATION_BITS 4 1556fe8aea9eSmrgstatic inline int 1557fe8aea9eSmrgbilinear_weight(pixman_fixed_t x) 1558fe8aea9eSmrg{ 1559fe8aea9eSmrg return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) & 1560fe8aea9eSmrg ((1 << BILINEAR_INTERPOLATION_BITS) - 1); 1561fe8aea9eSmrg} 1562fe8aea9eSmrg 1563fe8aea9eSmrg#if BILINEAR_INTERPOLATION_BITS <= 4 1564fe8aea9eSmrg/* Inspired by Filter_32_opaque from Skia */ 1565fe8aea9eSmrgstatic inline uint32_t 1566fe8aea9eSmrgbilinear_interpolation(uint32_t tl, uint32_t tr, 1567fe8aea9eSmrg uint32_t bl, uint32_t br, 1568fe8aea9eSmrg int distx, int disty) 1569fe8aea9eSmrg{ 1570fe8aea9eSmrg int distxy, distxiy, distixy, distixiy; 1571fe8aea9eSmrg uint32_t lo, hi; 1572fe8aea9eSmrg 1573fe8aea9eSmrg distx <<= (4 - BILINEAR_INTERPOLATION_BITS); 1574fe8aea9eSmrg disty <<= (4 - BILINEAR_INTERPOLATION_BITS); 1575fe8aea9eSmrg 1576fe8aea9eSmrg distxy = distx * disty; 1577fe8aea9eSmrg distxiy = (distx << 4) - distxy; /* distx * (16 - disty) */ 1578fe8aea9eSmrg distixy = (disty << 4) - distxy; /* disty * (16 - distx) */ 1579fe8aea9eSmrg distixiy = 1580fe8aea9eSmrg 16 * 16 - (disty << 4) - 1581fe8aea9eSmrg (distx << 4) + distxy; /* (16 - distx) * (16 - disty) */ 1582fe8aea9eSmrg 1583fe8aea9eSmrg lo = (tl & 0xff00ff) * distixiy; 1584fe8aea9eSmrg hi = ((tl >> 8) & 0xff00ff) * distixiy; 1585fe8aea9eSmrg 1586fe8aea9eSmrg lo += (tr & 0xff00ff) * distxiy; 1587fe8aea9eSmrg hi += ((tr >> 8) & 0xff00ff) * distxiy; 1588fe8aea9eSmrg 1589fe8aea9eSmrg lo += (bl & 0xff00ff) * distixy; 1590fe8aea9eSmrg hi += ((bl >> 8) & 0xff00ff) * distixy; 1591fe8aea9eSmrg 1592fe8aea9eSmrg lo += (br & 0xff00ff) * distxy; 1593fe8aea9eSmrg hi += ((br >> 8) & 0xff00ff) * distxy; 1594fe8aea9eSmrg 1595fe8aea9eSmrg return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff); 1596fe8aea9eSmrg} 1597fe8aea9eSmrg#elif SIZEOF_LONG > 4 1598fe8aea9eSmrgstatic inline uint32_t 1599fe8aea9eSmrgbilinear_interpolation(uint32_t tl, uint32_t tr, 1600fe8aea9eSmrg uint32_t bl, uint32_t br, 1601fe8aea9eSmrg int distx, int disty) 1602fe8aea9eSmrg{ 1603fe8aea9eSmrg uint64_t distxy, distxiy, distixy, distixiy; 1604fe8aea9eSmrg uint64_t tl64, tr64, bl64, br64; 1605fe8aea9eSmrg uint64_t f, r; 1606fe8aea9eSmrg 1607fe8aea9eSmrg distx <<= (8 - BILINEAR_INTERPOLATION_BITS); 1608fe8aea9eSmrg disty <<= (8 - BILINEAR_INTERPOLATION_BITS); 1609fe8aea9eSmrg 1610fe8aea9eSmrg distxy = distx * disty; 1611fe8aea9eSmrg distxiy = distx * (256 - disty); 1612fe8aea9eSmrg distixy = (256 - distx) * disty; 1613fe8aea9eSmrg distixiy = (256 - distx) * (256 - disty); 1614fe8aea9eSmrg 1615fe8aea9eSmrg /* Alpha and Blue */ 1616fe8aea9eSmrg tl64 = tl & 0xff0000ff; 1617fe8aea9eSmrg tr64 = tr & 0xff0000ff; 1618fe8aea9eSmrg bl64 = bl & 0xff0000ff; 1619fe8aea9eSmrg br64 = br & 0xff0000ff; 1620fe8aea9eSmrg 1621fe8aea9eSmrg f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; 1622fe8aea9eSmrg r = f & 0x0000ff0000ff0000ull; 1623fe8aea9eSmrg 1624fe8aea9eSmrg /* Red and Green */ 1625fe8aea9eSmrg tl64 = tl; 1626fe8aea9eSmrg tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull); 1627fe8aea9eSmrg 1628fe8aea9eSmrg tr64 = tr; 1629fe8aea9eSmrg tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull); 1630fe8aea9eSmrg 1631fe8aea9eSmrg bl64 = bl; 1632fe8aea9eSmrg bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull); 1633fe8aea9eSmrg 1634fe8aea9eSmrg br64 = br; 1635fe8aea9eSmrg br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull); 1636fe8aea9eSmrg 1637fe8aea9eSmrg f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; 1638fe8aea9eSmrg r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull); 1639fe8aea9eSmrg 1640fe8aea9eSmrg return (uint32_t)(r >> 16); 1641fe8aea9eSmrg} 1642fe8aea9eSmrg#else 1643fe8aea9eSmrgstatic inline uint32_t 1644fe8aea9eSmrgbilinear_interpolation(uint32_t tl, uint32_t tr, 1645fe8aea9eSmrg uint32_t bl, uint32_t br, 1646fe8aea9eSmrg int distx, int disty) 1647fe8aea9eSmrg{ 1648fe8aea9eSmrg int distxy, distxiy, distixy, distixiy; 1649fe8aea9eSmrg uint32_t f, r; 1650fe8aea9eSmrg 1651fe8aea9eSmrg distx <<= (8 - BILINEAR_INTERPOLATION_BITS); 1652fe8aea9eSmrg disty <<= (8 - BILINEAR_INTERPOLATION_BITS); 1653fe8aea9eSmrg 1654fe8aea9eSmrg distxy = distx * disty; 1655fe8aea9eSmrg distxiy = (distx << 8) - distxy; /* distx * (256 - disty) */ 1656fe8aea9eSmrg distixy = (disty << 8) - distxy; /* disty * (256 - distx) */ 1657fe8aea9eSmrg distixiy = 1658fe8aea9eSmrg 256 * 256 - (disty << 8) - 1659fe8aea9eSmrg (distx << 8) + distxy; /* (256 - distx) * (256 - disty) */ 1660fe8aea9eSmrg 1661fe8aea9eSmrg /* Blue */ 1662fe8aea9eSmrg r = ((tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy + 1663fe8aea9eSmrg (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy); 1664fe8aea9eSmrg 1665fe8aea9eSmrg /* Green */ 1666fe8aea9eSmrg f = ((tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy + 1667fe8aea9eSmrg (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy); 1668fe8aea9eSmrg r |= f & 0xff000000; 1669fe8aea9eSmrg 1670fe8aea9eSmrg tl >>= 16; 1671fe8aea9eSmrg tr >>= 16; 1672fe8aea9eSmrg bl >>= 16; 1673fe8aea9eSmrg br >>= 16; 1674fe8aea9eSmrg r >>= 16; 1675fe8aea9eSmrg 1676fe8aea9eSmrg /* Red */ 1677fe8aea9eSmrg f = ((tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy + 1678fe8aea9eSmrg (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy); 1679fe8aea9eSmrg r |= f & 0x00ff0000; 1680fe8aea9eSmrg 1681fe8aea9eSmrg /* Alpha */ 1682fe8aea9eSmrg f = ((tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy + 1683fe8aea9eSmrg (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy); 1684fe8aea9eSmrg r |= f & 0xff000000; 1685fe8aea9eSmrg 1686fe8aea9eSmrg return r; 1687fe8aea9eSmrg} 1688fe8aea9eSmrg#endif 1689fe8aea9eSmrg 1690fe8aea9eSmrgstatic inline uint32_t convert_pixel(const uint8_t *p, int x) 1691fe8aea9eSmrg{ 1692fe8aea9eSmrg return ((uint32_t *)p)[x]; 1693fe8aea9eSmrg} 1694fe8aea9eSmrg 1695fe8aea9eSmrgfast void 1696fe8aea9eSmrgaffine_blt(const void *src, void *dst, int bpp, 1697fe8aea9eSmrg int16_t src_x, int16_t src_y, 1698fe8aea9eSmrg int16_t src_width, int16_t src_height, 1699fe8aea9eSmrg int32_t src_stride, 1700fe8aea9eSmrg int16_t dst_x, int16_t dst_y, 1701fe8aea9eSmrg uint16_t dst_width, uint16_t dst_height, 1702fe8aea9eSmrg int32_t dst_stride, 1703fe8aea9eSmrg const struct pixman_f_transform *t) 1704fe8aea9eSmrg{ 1705fe8aea9eSmrg static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; 1706fe8aea9eSmrg const pixman_fixed_t ux = pixman_double_to_fixed(t->m[0][0]); 1707fe8aea9eSmrg const pixman_fixed_t uy = pixman_double_to_fixed(t->m[1][0]); 1708fe8aea9eSmrg int i, j; 1709fe8aea9eSmrg 1710fe8aea9eSmrg assert(bpp == 32); 1711fe8aea9eSmrg 1712fe8aea9eSmrg for (j = 0; j < dst_height; j++) { 1713fe8aea9eSmrg pixman_fixed_t x, y; 1714fe8aea9eSmrg struct pixman_f_vector v; 1715fe8aea9eSmrg uint32_t *b; 1716fe8aea9eSmrg 1717fe8aea9eSmrg /* reference point is the center of the pixel */ 1718fe8aea9eSmrg v.v[0] = dst_x + 0.5; 1719fe8aea9eSmrg v.v[1] = dst_y + j + 0.5; 1720fe8aea9eSmrg v.v[2] = 1.0; 1721fe8aea9eSmrg 1722fe8aea9eSmrg pixman_f_transform_point_3d(t, &v); 1723fe8aea9eSmrg 1724fe8aea9eSmrg x = pixman_double_to_fixed(v.v[0]); 1725fe8aea9eSmrg x += pixman_int_to_fixed(src_x - dst_x); 1726fe8aea9eSmrg y = pixman_double_to_fixed(v.v[1]); 1727fe8aea9eSmrg y += pixman_int_to_fixed(src_y - dst_y); 1728fe8aea9eSmrg 1729fe8aea9eSmrg b = (uint32_t*)((uint8_t *)dst + (dst_y + j) * dst_stride + dst_x * bpp / 8); 1730fe8aea9eSmrg for (i = 0; i < dst_width; i++) { 1731fe8aea9eSmrg const uint8_t *row1; 1732fe8aea9eSmrg const uint8_t *row2; 1733fe8aea9eSmrg int x1, y1, x2, y2; 1734fe8aea9eSmrg uint32_t tl, tr, bl, br; 1735fe8aea9eSmrg int32_t fx, fy; 1736fe8aea9eSmrg 1737fe8aea9eSmrg x1 = x - pixman_fixed_1/2; 1738fe8aea9eSmrg y1 = y - pixman_fixed_1/2; 1739fe8aea9eSmrg 1740fe8aea9eSmrg fx = bilinear_weight(x1); 1741fe8aea9eSmrg fy = bilinear_weight(y1); 1742fe8aea9eSmrg 1743fe8aea9eSmrg x1 = pixman_fixed_to_int(x1); 1744fe8aea9eSmrg x2 = x1 + 1; 1745fe8aea9eSmrg y1 = pixman_fixed_to_int(y1); 1746fe8aea9eSmrg y2 = y1 + 1; 1747fe8aea9eSmrg 1748fe8aea9eSmrg if (x1 >= src_width || x2 < 0 || 1749fe8aea9eSmrg y1 >= src_height || y2 < 0) { 1750fe8aea9eSmrg b[i] = 0; 1751fe8aea9eSmrg goto next; 1752fe8aea9eSmrg } 1753fe8aea9eSmrg 1754fe8aea9eSmrg if (y2 == 0) { 1755fe8aea9eSmrg row1 = zero; 1756fe8aea9eSmrg } else { 1757fe8aea9eSmrg row1 = (uint8_t *)src + src_stride * y1; 1758fe8aea9eSmrg row1 += bpp / 8 * x1; 1759fe8aea9eSmrg } 1760fe8aea9eSmrg 1761fe8aea9eSmrg if (y1 == src_height - 1) { 1762fe8aea9eSmrg row2 = zero; 1763fe8aea9eSmrg } else { 1764fe8aea9eSmrg row2 = (uint8_t *)src + src_stride * y2; 1765fe8aea9eSmrg row2 += bpp / 8 * x1; 1766fe8aea9eSmrg } 1767fe8aea9eSmrg 1768fe8aea9eSmrg if (x2 == 0) { 1769fe8aea9eSmrg tl = 0; 1770fe8aea9eSmrg bl = 0; 1771fe8aea9eSmrg } else { 1772fe8aea9eSmrg tl = convert_pixel(row1, 0); 1773fe8aea9eSmrg bl = convert_pixel(row2, 0); 1774fe8aea9eSmrg } 1775fe8aea9eSmrg 1776fe8aea9eSmrg if (x1 == src_width - 1) { 1777fe8aea9eSmrg tr = 0; 1778fe8aea9eSmrg br = 0; 1779fe8aea9eSmrg } else { 1780fe8aea9eSmrg tr = convert_pixel(row1, 1); 1781fe8aea9eSmrg br = convert_pixel(row2, 1); 1782fe8aea9eSmrg } 1783fe8aea9eSmrg 1784fe8aea9eSmrg b[i] = bilinear_interpolation(tl, tr, bl, br, fx, fy); 1785fe8aea9eSmrg 1786fe8aea9eSmrgnext: 1787fe8aea9eSmrg x += ux; 1788fe8aea9eSmrg y += uy; 1789fe8aea9eSmrg } 1790fe8aea9eSmrg } 1791fe8aea9eSmrg} 1792