103b705cfSriastradh/*
203b705cfSriastradh * Copyright (c) 2011 Intel Corporation
303b705cfSriastradh *
403b705cfSriastradh * Permission is hereby granted, free of charge, to any person obtaining a
503b705cfSriastradh * copy of this software and associated documentation files (the "Software"),
603b705cfSriastradh * to deal in the Software without restriction, including without limitation
703b705cfSriastradh * the rights to use, copy, modify, merge, publish, distribute, sublicense,
803b705cfSriastradh * and/or sell copies of the Software, and to permit persons to whom the
903b705cfSriastradh * Software is furnished to do so, subject to the following conditions:
1003b705cfSriastradh *
1103b705cfSriastradh * The above copyright notice and this permission notice (including the next
1203b705cfSriastradh * paragraph) shall be included in all copies or substantial portions of the
1303b705cfSriastradh * Software.
1403b705cfSriastradh *
1503b705cfSriastradh * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1603b705cfSriastradh * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1703b705cfSriastradh * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
1803b705cfSriastradh * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1903b705cfSriastradh * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2003b705cfSriastradh * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2103b705cfSriastradh * SOFTWARE.
2203b705cfSriastradh *
2303b705cfSriastradh * Authors:
2403b705cfSriastradh *    Chris Wilson <chris@chris-wilson.co.uk>
2503b705cfSriastradh *
2603b705cfSriastradh */
2703b705cfSriastradh
2803b705cfSriastradh#ifdef HAVE_CONFIG_H
2903b705cfSriastradh#include "config.h"
3003b705cfSriastradh#endif
3103b705cfSriastradh
3203b705cfSriastradh#include "sna.h"
33fe8aea9eSmrg#include <pixman.h>
3403b705cfSriastradh
35fe8aea9eSmrg#if defined(sse2)
36fe8aea9eSmrg#pragma GCC push_options
37fe8aea9eSmrg#pragma GCC target("sse2,inline-all-stringops,fpmath=sse")
38fe8aea9eSmrg#pragma GCC optimize("Ofast")
3903b705cfSriastradh#include <xmmintrin.h>
4003b705cfSriastradh
4103b705cfSriastradh#if __x86_64__
4203b705cfSriastradh#define have_sse2() 1
4303b705cfSriastradh#else
4403b705cfSriastradhstatic bool have_sse2(void)
4503b705cfSriastradh{
4603b705cfSriastradh	static int sse2_present = -1;
4703b705cfSriastradh
4803b705cfSriastradh	if (sse2_present == -1)
49fe8aea9eSmrg		sse2_present = sna_cpu_detect() & SSE2;
5003b705cfSriastradh
5103b705cfSriastradh	return sse2_present;
5203b705cfSriastradh}
5303b705cfSriastradh#endif
5403b705cfSriastradh
55fe8aea9eSmrgstatic force_inline __m128i
5603b705cfSriastradhxmm_create_mask_32(uint32_t mask)
5703b705cfSriastradh{
5803b705cfSriastradh	return _mm_set_epi32(mask, mask, mask, mask);
5903b705cfSriastradh}
6003b705cfSriastradh
61fe8aea9eSmrgstatic force_inline __m128i
62fe8aea9eSmrgxmm_load_128(const __m128i *src)
63fe8aea9eSmrg{
64fe8aea9eSmrg	return _mm_load_si128(src);
65fe8aea9eSmrg}
66fe8aea9eSmrg
67fe8aea9eSmrgstatic force_inline __m128i
6803b705cfSriastradhxmm_load_128u(const __m128i *src)
6903b705cfSriastradh{
7003b705cfSriastradh	return _mm_loadu_si128(src);
7103b705cfSriastradh}
7203b705cfSriastradh
73fe8aea9eSmrgstatic force_inline void
7403b705cfSriastradhxmm_save_128(__m128i *dst, __m128i data)
7503b705cfSriastradh{
7603b705cfSriastradh	_mm_store_si128(dst, data);
7703b705cfSriastradh}
78fe8aea9eSmrg
79fe8aea9eSmrgstatic force_inline void
80fe8aea9eSmrgxmm_save_128u(__m128i *dst, __m128i data)
81fe8aea9eSmrg{
82fe8aea9eSmrg	_mm_storeu_si128(dst, data);
83fe8aea9eSmrg}
84fe8aea9eSmrg
85fe8aea9eSmrgstatic force_inline void
86fe8aea9eSmrgto_sse128xN(uint8_t *dst, const uint8_t *src, int bytes)
87fe8aea9eSmrg{
88fe8aea9eSmrg	int i;
89fe8aea9eSmrg
90fe8aea9eSmrg	for (i = 0; i < bytes / 128; i++) {
91fe8aea9eSmrg		__m128i xmm0, xmm1, xmm2, xmm3;
92fe8aea9eSmrg		__m128i xmm4, xmm5, xmm6, xmm7;
93fe8aea9eSmrg
94fe8aea9eSmrg		xmm0 = xmm_load_128u((const __m128i*)src + 0);
95fe8aea9eSmrg		xmm1 = xmm_load_128u((const __m128i*)src + 1);
96fe8aea9eSmrg		xmm2 = xmm_load_128u((const __m128i*)src + 2);
97fe8aea9eSmrg		xmm3 = xmm_load_128u((const __m128i*)src + 3);
98fe8aea9eSmrg		xmm4 = xmm_load_128u((const __m128i*)src + 4);
99fe8aea9eSmrg		xmm5 = xmm_load_128u((const __m128i*)src + 5);
100fe8aea9eSmrg		xmm6 = xmm_load_128u((const __m128i*)src + 6);
101fe8aea9eSmrg		xmm7 = xmm_load_128u((const __m128i*)src + 7);
102fe8aea9eSmrg
103fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 0, xmm0);
104fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 1, xmm1);
105fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 2, xmm2);
106fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 3, xmm3);
107fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 4, xmm4);
108fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 5, xmm5);
109fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 6, xmm6);
110fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 7, xmm7);
111fe8aea9eSmrg
112fe8aea9eSmrg		dst += 128;
113fe8aea9eSmrg		src += 128;
114fe8aea9eSmrg	}
115fe8aea9eSmrg}
116fe8aea9eSmrg
117fe8aea9eSmrgstatic force_inline void
118fe8aea9eSmrgto_sse64(uint8_t *dst, const uint8_t *src)
119fe8aea9eSmrg{
120fe8aea9eSmrg	__m128i xmm1, xmm2, xmm3, xmm4;
121fe8aea9eSmrg
122fe8aea9eSmrg	xmm1 = xmm_load_128u((const __m128i*)src + 0);
123fe8aea9eSmrg	xmm2 = xmm_load_128u((const __m128i*)src + 1);
124fe8aea9eSmrg	xmm3 = xmm_load_128u((const __m128i*)src + 2);
125fe8aea9eSmrg	xmm4 = xmm_load_128u((const __m128i*)src + 3);
126fe8aea9eSmrg
127fe8aea9eSmrg	xmm_save_128((__m128i*)dst + 0, xmm1);
128fe8aea9eSmrg	xmm_save_128((__m128i*)dst + 1, xmm2);
129fe8aea9eSmrg	xmm_save_128((__m128i*)dst + 2, xmm3);
130fe8aea9eSmrg	xmm_save_128((__m128i*)dst + 3, xmm4);
131fe8aea9eSmrg}
132fe8aea9eSmrg
133fe8aea9eSmrgstatic force_inline void
134fe8aea9eSmrgto_sse32(uint8_t *dst, const uint8_t *src)
135fe8aea9eSmrg{
136fe8aea9eSmrg	__m128i xmm1, xmm2;
137fe8aea9eSmrg
138fe8aea9eSmrg	xmm1 = xmm_load_128u((const __m128i*)src + 0);
139fe8aea9eSmrg	xmm2 = xmm_load_128u((const __m128i*)src + 1);
140fe8aea9eSmrg
141fe8aea9eSmrg	xmm_save_128((__m128i*)dst + 0, xmm1);
142fe8aea9eSmrg	xmm_save_128((__m128i*)dst + 1, xmm2);
143fe8aea9eSmrg}
144fe8aea9eSmrg
145fe8aea9eSmrgstatic force_inline void
146fe8aea9eSmrgto_sse16(uint8_t *dst, const uint8_t *src)
147fe8aea9eSmrg{
148fe8aea9eSmrg	xmm_save_128((__m128i*)dst, xmm_load_128u((const __m128i*)src));
149fe8aea9eSmrg}
150fe8aea9eSmrg
/*
 * Copy len (> 0) bytes from src to dst, using 16-byte aligned SSE2 stores
 * for the bulk of the transfer.
 *
 * dst may start at any alignment: a prologue of 1/2/4/8-byte copies moves
 * it up to the next 16-byte boundary, 64-byte blocks are then copied with
 * to_sse64(), and the remainder is finished with progressively smaller
 * pieces.
 *
 * src is never aligned by this routine, so the small fixed-size pieces are
 * copied with constant-size memcpy() calls (typically lowered by the
 * compiler to a single move) instead of dereferencing casted integer
 * pointers.  That avoids misaligned / type-punned loads from src and
 * avoids casting away its const qualifier.
 */
static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len)
{
	assert(len);
	if ((uintptr_t)dst & 15) {
		/* The whole copy ends before the next 16-byte boundary. */
		if (len <= 16 - ((uintptr_t)dst & 15)) {
			memcpy(dst, src, len);
			return;
		}

		/*
		 * len exceeds the distance to the boundary, so every step
		 * below has enough bytes left (checked by the asserts).
		 */
		if ((uintptr_t)dst & 1) {
			assert(len >= 1);
			*dst++ = *src++;
			len--;
		}
		if ((uintptr_t)dst & 2) {
			assert(((uintptr_t)dst & 1) == 0);
			assert(len >= 2);
			memcpy(dst, src, 2);
			dst += 2;
			src += 2;
			len -= 2;
		}
		if ((uintptr_t)dst & 4) {
			assert(((uintptr_t)dst & 3) == 0);
			assert(len >= 4);
			memcpy(dst, src, 4);
			dst += 4;
			src += 4;
			len -= 4;
		}
		if ((uintptr_t)dst & 8) {
			assert(((uintptr_t)dst & 7) == 0);
			assert(len >= 8);
			memcpy(dst, src, 8);
			dst += 8;
			src += 8;
			len -= 8;
		}
	}

	/* dst is now 16-byte aligned: copy the bulk in 64-byte blocks. */
	assert(((uintptr_t)dst & 15) == 0);
	while (len >= 64) {
		to_sse64(dst, src);
		dst += 64;
		src += 64;
		len -= 64;
	}
	if (len == 0)
		return;

	/* Fewer than 64 bytes remain; each bit of len selects one piece. */
	if (len & 32) {
		to_sse32(dst, src);
		dst += 32;
		src += 32;
	}
	if (len & 16) {
		to_sse16(dst, src);
		dst += 16;
		src += 16;
	}
	if (len & 8) {
		memcpy(dst, src, 8);
		dst += 8;
		src += 8;
	}
	if (len & 4) {
		memcpy(dst, src, 4);
		dst += 4;
		src += 4;
	}
	memcpy(dst, src, len & 3);
}
223fe8aea9eSmrg
/*
 * SSE2 copy of a width x height pixel rectangle from a linear source into
 * a tiled destination (X-tiling without swizzling, going by the function
 * name).
 *
 * Layout used for dst, as encoded by the constants below: tiles are 4096
 * bytes, holding 8 rows of 512 bytes each; horizontally adjacent tiles are
 * tile_size bytes apart and each band of 8 rows starts
 * dst_stride * tile_height bytes after the previous one.
 *
 * src/src_stride/src_x/src_y describe the linear source, dst/dst_stride/
 * dst_x/dst_y the tiled destination; x, y, width and height are in pixels
 * and strides in bytes.
 *
 * The row length in bytes is kept in a full-width unsigned (byte_width)
 * rather than being written back into the 16-bit width parameter, which
 * would wrap modulo 65536 for wide rows (e.g. 16384 pixels at 32bpp).
 */
static void
memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
				   int32_t src_stride, int32_t dst_stride,
				   int16_t src_x, int16_t src_y,
				   int16_t dst_x, int16_t dst_y,
				   uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	/* Bytes per row of the rectangle, computed without 16-bit wrap. */
	const unsigned byte_width = width * cpp;

	unsigned offset_x, length_x;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (src_x | src_y)
		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
	assert(src_stride >= (int32_t)byte_width);

	/*
	 * If the rectangle starts part-way into a tile, the first piece of
	 * every row covers only the rest of that tile row.
	 */
	if (dst_x & tile_mask) {
		offset_x = (dst_x & tile_mask) * cpp;
		length_x = min(tile_width - offset_x, byte_width);
	} else {
		offset_x = 0;
		length_x = 0;
	}
	dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size;

	while (height--) {
		unsigned w = byte_width;
		const uint8_t *src_row = src;
		uint8_t *tile_row = dst;

		src = (const uint8_t *)src + src_stride;

		/* Locate the 512-byte row inside the tile band for dst_y. */
		tile_row += dst_y / tile_height * dst_stride * tile_height;
		tile_row += (dst_y & (tile_height-1)) * tile_width;
		dst_y++;

		if (length_x) {
			/* Leading partial tile row. */
			to_memcpy(tile_row + offset_x, src_row, length_x);

			tile_row += tile_size;
			src_row = (const uint8_t *)src_row + length_x;
			w -= length_x;
		}
		while (w >= tile_width) {
			/* Whole tile rows. */
			assert(((uintptr_t)tile_row & (tile_width - 1)) == 0);
			to_sse128xN(assume_aligned(tile_row, tile_width),
				    src_row, tile_width);
			tile_row += tile_size;
			src_row = (const uint8_t *)src_row + tile_width;
			w -= tile_width;
		}
		if (w) {
			/* Trailing partial tile row. */
			assert(((uintptr_t)tile_row & (tile_width - 1)) == 0);
			to_memcpy(assume_aligned(tile_row, tile_width),
				  src_row, w);
		}
	}
}
291fe8aea9eSmrg
292fe8aea9eSmrgstatic force_inline void
293fe8aea9eSmrgfrom_sse128xNu(uint8_t *dst, const uint8_t *src, int bytes)
294fe8aea9eSmrg{
295fe8aea9eSmrg	int i;
296fe8aea9eSmrg
297fe8aea9eSmrg	assert(((uintptr_t)src & 15) == 0);
298fe8aea9eSmrg
299fe8aea9eSmrg	for (i = 0; i < bytes / 128; i++) {
300fe8aea9eSmrg		__m128i xmm0, xmm1, xmm2, xmm3;
301fe8aea9eSmrg		__m128i xmm4, xmm5, xmm6, xmm7;
302fe8aea9eSmrg
303fe8aea9eSmrg		xmm0 = xmm_load_128((const __m128i*)src + 0);
304fe8aea9eSmrg		xmm1 = xmm_load_128((const __m128i*)src + 1);
305fe8aea9eSmrg		xmm2 = xmm_load_128((const __m128i*)src + 2);
306fe8aea9eSmrg		xmm3 = xmm_load_128((const __m128i*)src + 3);
307fe8aea9eSmrg		xmm4 = xmm_load_128((const __m128i*)src + 4);
308fe8aea9eSmrg		xmm5 = xmm_load_128((const __m128i*)src + 5);
309fe8aea9eSmrg		xmm6 = xmm_load_128((const __m128i*)src + 6);
310fe8aea9eSmrg		xmm7 = xmm_load_128((const __m128i*)src + 7);
311fe8aea9eSmrg
312fe8aea9eSmrg		xmm_save_128u((__m128i*)dst + 0, xmm0);
313fe8aea9eSmrg		xmm_save_128u((__m128i*)dst + 1, xmm1);
314fe8aea9eSmrg		xmm_save_128u((__m128i*)dst + 2, xmm2);
315fe8aea9eSmrg		xmm_save_128u((__m128i*)dst + 3, xmm3);
316fe8aea9eSmrg		xmm_save_128u((__m128i*)dst + 4, xmm4);
317fe8aea9eSmrg		xmm_save_128u((__m128i*)dst + 5, xmm5);
318fe8aea9eSmrg		xmm_save_128u((__m128i*)dst + 6, xmm6);
319fe8aea9eSmrg		xmm_save_128u((__m128i*)dst + 7, xmm7);
320fe8aea9eSmrg
321fe8aea9eSmrg		dst += 128;
322fe8aea9eSmrg		src += 128;
323fe8aea9eSmrg	}
324fe8aea9eSmrg}
325fe8aea9eSmrg
326fe8aea9eSmrgstatic force_inline void
327fe8aea9eSmrgfrom_sse128xNa(uint8_t *dst, const uint8_t *src, int bytes)
328fe8aea9eSmrg{
329fe8aea9eSmrg	int i;
330fe8aea9eSmrg
331fe8aea9eSmrg	assert(((uintptr_t)dst & 15) == 0);
332fe8aea9eSmrg	assert(((uintptr_t)src & 15) == 0);
333fe8aea9eSmrg
334fe8aea9eSmrg	for (i = 0; i < bytes / 128; i++) {
335fe8aea9eSmrg		__m128i xmm0, xmm1, xmm2, xmm3;
336fe8aea9eSmrg		__m128i xmm4, xmm5, xmm6, xmm7;
337fe8aea9eSmrg
338fe8aea9eSmrg		xmm0 = xmm_load_128((const __m128i*)src + 0);
339fe8aea9eSmrg		xmm1 = xmm_load_128((const __m128i*)src + 1);
340fe8aea9eSmrg		xmm2 = xmm_load_128((const __m128i*)src + 2);
341fe8aea9eSmrg		xmm3 = xmm_load_128((const __m128i*)src + 3);
342fe8aea9eSmrg		xmm4 = xmm_load_128((const __m128i*)src + 4);
343fe8aea9eSmrg		xmm5 = xmm_load_128((const __m128i*)src + 5);
344fe8aea9eSmrg		xmm6 = xmm_load_128((const __m128i*)src + 6);
345fe8aea9eSmrg		xmm7 = xmm_load_128((const __m128i*)src + 7);
346fe8aea9eSmrg
347fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 0, xmm0);
348fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 1, xmm1);
349fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 2, xmm2);
350fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 3, xmm3);
351fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 4, xmm4);
352fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 5, xmm5);
353fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 6, xmm6);
354fe8aea9eSmrg		xmm_save_128((__m128i*)dst + 7, xmm7);
355fe8aea9eSmrg
356fe8aea9eSmrg		dst += 128;
357fe8aea9eSmrg		src += 128;
358fe8aea9eSmrg	}
359fe8aea9eSmrg}
360fe8aea9eSmrg
361fe8aea9eSmrgstatic force_inline void
362fe8aea9eSmrgfrom_sse64u(uint8_t *dst, const uint8_t *src)
363fe8aea9eSmrg{
364fe8aea9eSmrg	__m128i xmm1, xmm2, xmm3, xmm4;
365fe8aea9eSmrg
366fe8aea9eSmrg	assert(((uintptr_t)src & 15) == 0);
367fe8aea9eSmrg
368fe8aea9eSmrg	xmm1 = xmm_load_128((const __m128i*)src + 0);
369fe8aea9eSmrg	xmm2 = xmm_load_128((const __m128i*)src + 1);
370fe8aea9eSmrg	xmm3 = xmm_load_128((const __m128i*)src + 2);
371fe8aea9eSmrg	xmm4 = xmm_load_128((const __m128i*)src + 3);
372fe8aea9eSmrg
373fe8aea9eSmrg	xmm_save_128u((__m128i*)dst + 0, xmm1);
374fe8aea9eSmrg	xmm_save_128u((__m128i*)dst + 1, xmm2);
375fe8aea9eSmrg	xmm_save_128u((__m128i*)dst + 2, xmm3);
376fe8aea9eSmrg	xmm_save_128u((__m128i*)dst + 3, xmm4);
377fe8aea9eSmrg}
378fe8aea9eSmrg
379fe8aea9eSmrgstatic force_inline void
380fe8aea9eSmrgfrom_sse64a(uint8_t *dst, const uint8_t *src)
381fe8aea9eSmrg{
382fe8aea9eSmrg	__m128i xmm1, xmm2, xmm3, xmm4;
383fe8aea9eSmrg
384fe8aea9eSmrg	assert(((uintptr_t)dst & 15) == 0);
385fe8aea9eSmrg	assert(((uintptr_t)src & 15) == 0);
386fe8aea9eSmrg
387fe8aea9eSmrg	xmm1 = xmm_load_128((const __m128i*)src + 0);
388fe8aea9eSmrg	xmm2 = xmm_load_128((const __m128i*)src + 1);
389fe8aea9eSmrg	xmm3 = xmm_load_128((const __m128i*)src + 2);
390fe8aea9eSmrg	xmm4 = xmm_load_128((const __m128i*)src + 3);
391fe8aea9eSmrg
392fe8aea9eSmrg	xmm_save_128((__m128i*)dst + 0, xmm1);
393fe8aea9eSmrg	xmm_save_128((__m128i*)dst + 1, xmm2);
394fe8aea9eSmrg	xmm_save_128((__m128i*)dst + 2, xmm3);
395fe8aea9eSmrg	xmm_save_128((__m128i*)dst + 3, xmm4);
396fe8aea9eSmrg}
397fe8aea9eSmrg
398fe8aea9eSmrgstatic force_inline void
399fe8aea9eSmrgfrom_sse32u(uint8_t *dst, const uint8_t *src)
400fe8aea9eSmrg{
401fe8aea9eSmrg	__m128i xmm1, xmm2;
402fe8aea9eSmrg
403fe8aea9eSmrg	xmm1 = xmm_load_128((const __m128i*)src + 0);
404fe8aea9eSmrg	xmm2 = xmm_load_128((const __m128i*)src + 1);
405fe8aea9eSmrg
406fe8aea9eSmrg	xmm_save_128u((__m128i*)dst + 0, xmm1);
407fe8aea9eSmrg	xmm_save_128u((__m128i*)dst + 1, xmm2);
408fe8aea9eSmrg}
409fe8aea9eSmrg
410fe8aea9eSmrgstatic force_inline void
411fe8aea9eSmrgfrom_sse32a(uint8_t *dst, const uint8_t *src)
412fe8aea9eSmrg{
413fe8aea9eSmrg	__m128i xmm1, xmm2;
414fe8aea9eSmrg
415fe8aea9eSmrg	assert(((uintptr_t)dst & 15) == 0);
416fe8aea9eSmrg	assert(((uintptr_t)src & 15) == 0);
417fe8aea9eSmrg
418fe8aea9eSmrg	xmm1 = xmm_load_128((const __m128i*)src + 0);
419fe8aea9eSmrg	xmm2 = xmm_load_128((const __m128i*)src + 1);
420fe8aea9eSmrg
421fe8aea9eSmrg	xmm_save_128((__m128i*)dst + 0, xmm1);
422fe8aea9eSmrg	xmm_save_128((__m128i*)dst + 1, xmm2);
423fe8aea9eSmrg}
424fe8aea9eSmrg
425fe8aea9eSmrgstatic force_inline void
426fe8aea9eSmrgfrom_sse16u(uint8_t *dst, const uint8_t *src)
427fe8aea9eSmrg{
428fe8aea9eSmrg	assert(((uintptr_t)src & 15) == 0);
429fe8aea9eSmrg
430fe8aea9eSmrg	xmm_save_128u((__m128i*)dst, xmm_load_128((const __m128i*)src));
431fe8aea9eSmrg}
432fe8aea9eSmrg
433fe8aea9eSmrgstatic force_inline void
434fe8aea9eSmrgfrom_sse16a(uint8_t *dst, const uint8_t *src)
435fe8aea9eSmrg{
436fe8aea9eSmrg	assert(((uintptr_t)dst & 15) == 0);
437fe8aea9eSmrg	assert(((uintptr_t)src & 15) == 0);
438fe8aea9eSmrg
439fe8aea9eSmrg	xmm_save_128((__m128i*)dst, xmm_load_128((const __m128i*)src));
440fe8aea9eSmrg}
441fe8aea9eSmrg
/*
 * SSE2 copy of a width x height pixel rectangle from a tiled source
 * (X-tiling without swizzling, going by the function name) into a linear
 * destination.
 *
 * Layout used for src, as encoded by the constants below: tiles are 4096
 * bytes, holding 8 rows of 512 bytes each; horizontally adjacent tiles are
 * tile_size bytes apart and each band of 8 rows starts
 * src_stride * tile_height bytes after the previous one.
 *
 * Within a row, dst is advanced as pieces are written, except for the
 * final sub-16-byte tail.  dst_stride is therefore pre-adjusted so that
 * adding it at the end of a row lands on the start of the next row.
 *
 * The row length in bytes is kept in a full-width unsigned (byte_width)
 * rather than being written back into the 16-bit width parameter, which
 * would wrap modulo 65536 for wide rows (e.g. 16384 pixels at 32bpp).
 */
static void
memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
				     int32_t src_stride, int32_t dst_stride,
				     int16_t src_x, int16_t src_y,
				     int16_t dst_x, int16_t dst_y,
				     uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	/* Bytes per row of the rectangle, computed without 16-bit wrap. */
	const unsigned byte_width = width * cpp;

	unsigned length_x, offset_x;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (dst_x | dst_y)
		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
	assert(dst_stride >= (int32_t)byte_width);
	if (src_x & tile_mask) {
		/* The row starts part-way into a source tile. */
		offset_x = (src_x & tile_mask) * cpp;
		length_x = min(tile_width - offset_x, byte_width);
		/* dst advances by everything but the sub-16-byte tail. */
		dst_stride -= (int32_t)byte_width;
		dst_stride += (int32_t)((byte_width - length_x) & 15);
	} else {
		offset_x = 0;
		length_x = 0;
		dst_stride -= (int32_t)(byte_width & ~15u);
	}
	assert(dst_stride >= 0);
	src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size;

	while (height--) {
		unsigned w = byte_width;
		const uint8_t *tile_row = src;

		/* Locate the 512-byte row inside the tile band for src_y. */
		tile_row += src_y / tile_height * src_stride * tile_height;
		tile_row += (src_y & (tile_height-1)) * tile_width;
		src_y++;

		if (offset_x) {
			/* Leading partial tile row. */
			memcpy(dst, tile_row + offset_x, length_x);
			tile_row += tile_size;
			dst = (uint8_t *)dst + length_x;
			w -= length_x;
		}

		/* Pick unaligned or aligned stores from dst's alignment. */
		if ((uintptr_t)dst & 15) {
			while (w >= tile_width) {
				from_sse128xNu(dst,
					       assume_aligned(tile_row, tile_width),
					       tile_width);
				tile_row += tile_size;
				dst = (uint8_t *)dst + tile_width;
				w -= tile_width;
			}
			while (w >= 64) {
				from_sse64u(dst, tile_row);
				tile_row += 64;
				dst = (uint8_t *)dst + 64;
				w -= 64;
			}
			if (w & 32) {
				from_sse32u(dst, tile_row);
				tile_row += 32;
				dst = (uint8_t *)dst + 32;
			}
			if (w & 16) {
				from_sse16u(dst, tile_row);
				tile_row += 16;
				dst = (uint8_t *)dst + 16;
			}
			/* Tail: dst is deliberately not advanced here. */
			memcpy(dst, assume_aligned(tile_row, 16), w & 15);
		} else {
			while (w >= tile_width) {
				from_sse128xNa(assume_aligned(dst, 16),
					       assume_aligned(tile_row, tile_width),
					       tile_width);
				tile_row += tile_size;
				dst = (uint8_t *)dst + tile_width;
				w -= tile_width;
			}
			while (w >= 64) {
				from_sse64a(dst, tile_row);
				tile_row += 64;
				dst = (uint8_t *)dst + 64;
				w -= 64;
			}
			if (w & 32) {
				from_sse32a(dst, tile_row);
				tile_row += 32;
				dst = (uint8_t *)dst + 32;
			}
			if (w & 16) {
				from_sse16a(dst, tile_row);
				tile_row += 16;
				dst = (uint8_t *)dst + 16;
			}
			/* Tail: dst is deliberately not advanced here. */
			memcpy(assume_aligned(dst, 16),
			       assume_aligned(tile_row, 16),
			       w & 15);
		}
		dst = (uint8_t *)dst + dst_stride;
	}
}
553fe8aea9eSmrg
/*
 * SSE2 copy of a width x height pixel rectangle between two tiled buffers
 * (X-tiling without swizzling, going by the function name).
 *
 * Layout used for both buffers, as encoded by the constants below: tiles
 * are 4096 bytes, holding 8 rows of 512 bytes each; horizontally adjacent
 * tiles are tile_size bytes apart and each band of 8 rows starts
 * stride * tile_height bytes after the previous one.
 *
 * src_x and dst_x must have the same offset within a tile (asserted), so
 * every row splits into the same leading / whole-tile / trailing pieces on
 * both sides.
 *
 * The row length in bytes is kept in a full-width unsigned (byte_width)
 * rather than being written back into the 16-bit width parameter, which
 * would wrap modulo 65536 for wide rows (e.g. 16384 pixels at 32bpp).
 */
static void
memcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
					int32_t src_stride, int32_t dst_stride,
					int16_t src_x, int16_t src_y,
					int16_t dst_x, int16_t dst_y,
					uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	/* Bytes per row of the rectangle, computed without 16-bit wrap. */
	const unsigned byte_width = width * cpp;

	unsigned ox, lx;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	/* From here on the strides step over a whole band of tile rows. */
	dst_stride *= tile_height;
	src_stride *= tile_height;

	assert((dst_x & tile_mask) == (src_x & tile_mask));
	if (dst_x & tile_mask) {
		/* Byte offset into the tile row and length of the leading piece. */
		ox = (dst_x & tile_mask) * cpp;
		lx = min(tile_width - ox, byte_width);
		assert(lx != 0);
	} else {
		ox = 0;
		lx = 0;
	}

	if (dst_x)
		dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size;
	if (src_x)
		src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size;

	while (height--) {
		const uint8_t *src_row;
		uint8_t *dst_row;
		unsigned w = byte_width;

		dst_row = dst;
		dst_row += dst_y / tile_height * dst_stride;
		dst_row += (dst_y & (tile_height-1)) * tile_width;
		dst_y++;

		src_row = src;
		src_row += src_y / tile_height * src_stride;
		src_row += (src_y & (tile_height-1)) * tile_width;
		src_y++;

		if (lx) {
			/* Leading partial tile row. */
			to_memcpy(dst_row + ox, src_row + ox, lx);
			dst_row += tile_size;
			src_row += tile_size;
			w -= lx;
		}
		while (w >= tile_width) {
			/* Whole tile rows. */
			assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
			assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
			to_sse128xN(assume_aligned(dst_row, tile_width),
				    assume_aligned(src_row, tile_width),
				    tile_width);
			dst_row += tile_size;
			src_row += tile_size;
			w -= tile_width;
		}
		if (w) {
			/* Trailing partial tile row. */
			assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
			assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
			to_memcpy(assume_aligned(dst_row, tile_width),
				  assume_aligned(src_row, tile_width),
				  w);
		}
	}
}
633fe8aea9eSmrg
/*
 * Restore the compiler options saved by the push_options at the start of
 * the SSE2 section, so the sse2 target and Ofast settings do not apply to
 * the generic code below.
 */
#pragma GCC pop_options
63503b705cfSriastradh#endif
63603b705cfSriastradh
63703b705cfSriastradhfast void
63803b705cfSriastradhmemcpy_blt(const void *src, void *dst, int bpp,
63903b705cfSriastradh	   int32_t src_stride, int32_t dst_stride,
64003b705cfSriastradh	   int16_t src_x, int16_t src_y,
64103b705cfSriastradh	   int16_t dst_x, int16_t dst_y,
64203b705cfSriastradh	   uint16_t width, uint16_t height)
64303b705cfSriastradh{
64403b705cfSriastradh	const uint8_t *src_bytes;
64503b705cfSriastradh	uint8_t *dst_bytes;
64603b705cfSriastradh	int byte_width;
64703b705cfSriastradh
64803b705cfSriastradh	assert(src);
64903b705cfSriastradh	assert(dst);
65003b705cfSriastradh	assert(width && height);
65103b705cfSriastradh	assert(bpp >= 8);
65203b705cfSriastradh	assert(width*bpp <= 8*src_stride);
65303b705cfSriastradh	assert(width*bpp <= 8*dst_stride);
65403b705cfSriastradh
65503b705cfSriastradh	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
65603b705cfSriastradh	     __FUNCTION__, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
65703b705cfSriastradh
65803b705cfSriastradh	bpp /= 8;
65903b705cfSriastradh
66003b705cfSriastradh	src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp;
66103b705cfSriastradh	dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp;
66203b705cfSriastradh
66303b705cfSriastradh	byte_width = width * bpp;
66403b705cfSriastradh	if (byte_width == src_stride && byte_width == dst_stride) {
66503b705cfSriastradh		byte_width *= height;
66603b705cfSriastradh		height = 1;
66703b705cfSriastradh	}
66803b705cfSriastradh
66903b705cfSriastradh	switch (byte_width) {
67003b705cfSriastradh	case 1:
67103b705cfSriastradh		do {
67203b705cfSriastradh			*dst_bytes = *src_bytes;
67303b705cfSriastradh			src_bytes += src_stride;
67403b705cfSriastradh			dst_bytes += dst_stride;
67503b705cfSriastradh		} while (--height);
67603b705cfSriastradh		break;
67703b705cfSriastradh
67803b705cfSriastradh	case 2:
67903b705cfSriastradh		do {
68003b705cfSriastradh			*(uint16_t *)dst_bytes = *(const uint16_t *)src_bytes;
68103b705cfSriastradh			src_bytes += src_stride;
68203b705cfSriastradh			dst_bytes += dst_stride;
68303b705cfSriastradh		} while (--height);
68403b705cfSriastradh		break;
68503b705cfSriastradh
68603b705cfSriastradh	case 4:
68703b705cfSriastradh		do {
68803b705cfSriastradh			*(uint32_t *)dst_bytes = *(const uint32_t *)src_bytes;
68903b705cfSriastradh			src_bytes += src_stride;
69003b705cfSriastradh			dst_bytes += dst_stride;
69103b705cfSriastradh		} while (--height);
69203b705cfSriastradh		break;
69303b705cfSriastradh
69403b705cfSriastradh	case 8:
69503b705cfSriastradh		do {
69603b705cfSriastradh			*(uint64_t *)dst_bytes = *(const uint64_t *)src_bytes;
69703b705cfSriastradh			src_bytes += src_stride;
69803b705cfSriastradh			dst_bytes += dst_stride;
69903b705cfSriastradh		} while (--height);
70003b705cfSriastradh		break;
70103b705cfSriastradh	case 16:
70203b705cfSriastradh		do {
70303b705cfSriastradh			((uint64_t *)dst_bytes)[0] = ((const uint64_t *)src_bytes)[0];
70403b705cfSriastradh			((uint64_t *)dst_bytes)[1] = ((const uint64_t *)src_bytes)[1];
70503b705cfSriastradh			src_bytes += src_stride;
70603b705cfSriastradh			dst_bytes += dst_stride;
70703b705cfSriastradh		} while (--height);
70803b705cfSriastradh		break;
70903b705cfSriastradh
71003b705cfSriastradh	default:
71103b705cfSriastradh		do {
71203b705cfSriastradh			memcpy(dst_bytes, src_bytes, byte_width);
71303b705cfSriastradh			src_bytes += src_stride;
71403b705cfSriastradh			dst_bytes += dst_stride;
71503b705cfSriastradh		} while (--height);
71603b705cfSriastradh		break;
71703b705cfSriastradh	}
71803b705cfSriastradh}
71903b705cfSriastradh
72003b705cfSriastradhstatic fast_memcpy void
72103b705cfSriastradhmemcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
72203b705cfSriastradh			     int32_t src_stride, int32_t dst_stride,
72303b705cfSriastradh			     int16_t src_x, int16_t src_y,
72403b705cfSriastradh			     int16_t dst_x, int16_t dst_y,
72503b705cfSriastradh			     uint16_t width, uint16_t height)
72603b705cfSriastradh{
72703b705cfSriastradh	const unsigned tile_width = 512;
72803b705cfSriastradh	const unsigned tile_height = 8;
72903b705cfSriastradh	const unsigned tile_size = 4096;
73003b705cfSriastradh
73103b705cfSriastradh	const unsigned cpp = bpp / 8;
73242542f5fSchristos	const unsigned tile_pixels = tile_width / cpp;
73342542f5fSchristos	const unsigned tile_shift = ffs(tile_pixels) - 1;
73442542f5fSchristos	const unsigned tile_mask = tile_pixels - 1;
73503b705cfSriastradh
73603b705cfSriastradh	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
73703b705cfSriastradh	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
73842542f5fSchristos	assert(src != dst);
73942542f5fSchristos
74042542f5fSchristos	if (src_x | src_y)
74142542f5fSchristos		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
74242542f5fSchristos	assert(src_stride >= width * cpp);
74342542f5fSchristos	src_stride -= width * cpp;
74442542f5fSchristos
74542542f5fSchristos	while (height--) {
74642542f5fSchristos		unsigned w = width * cpp;
74742542f5fSchristos		uint8_t *tile_row = dst;
74842542f5fSchristos
74942542f5fSchristos		tile_row += dst_y / tile_height * dst_stride * tile_height;
75042542f5fSchristos		tile_row += (dst_y & (tile_height-1)) * tile_width;
75142542f5fSchristos		if (dst_x) {
75242542f5fSchristos			tile_row += (dst_x >> tile_shift) * tile_size;
75342542f5fSchristos			if (dst_x & tile_mask) {
75442542f5fSchristos				const unsigned x = (dst_x & tile_mask) * cpp;
75542542f5fSchristos				const unsigned len = min(tile_width - x, w);
756fe8aea9eSmrg				memcpy(assume_misaligned(tile_row + x, tile_width, x),
757fe8aea9eSmrg				       src, len);
75842542f5fSchristos
75942542f5fSchristos				tile_row += tile_size;
76042542f5fSchristos				src = (const uint8_t *)src + len;
76142542f5fSchristos				w -= len;
76242542f5fSchristos			}
76303b705cfSriastradh		}
76442542f5fSchristos		while (w >= tile_width) {
765fe8aea9eSmrg			memcpy(assume_aligned(tile_row, tile_width),
766fe8aea9eSmrg			       src, tile_width);
76742542f5fSchristos			tile_row += tile_size;
76842542f5fSchristos			src = (const uint8_t *)src + tile_width;
76942542f5fSchristos			w -= tile_width;
77003b705cfSriastradh		}
771fe8aea9eSmrg		memcpy(assume_aligned(tile_row, tile_width), src, w);
77242542f5fSchristos		src = (const uint8_t *)src + src_stride + w;
77342542f5fSchristos		dst_y++;
77403b705cfSriastradh	}
77503b705cfSriastradh}
77603b705cfSriastradh
77703b705cfSriastradhstatic fast_memcpy void
77803b705cfSriastradhmemcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
77903b705cfSriastradh			       int32_t src_stride, int32_t dst_stride,
78003b705cfSriastradh			       int16_t src_x, int16_t src_y,
78103b705cfSriastradh			       int16_t dst_x, int16_t dst_y,
78203b705cfSriastradh			       uint16_t width, uint16_t height)
78303b705cfSriastradh{
78403b705cfSriastradh	const unsigned tile_width = 512;
78503b705cfSriastradh	const unsigned tile_height = 8;
78603b705cfSriastradh	const unsigned tile_size = 4096;
78703b705cfSriastradh
78803b705cfSriastradh	const unsigned cpp = bpp / 8;
78942542f5fSchristos	const unsigned tile_pixels = tile_width / cpp;
79042542f5fSchristos	const unsigned tile_shift = ffs(tile_pixels) - 1;
79142542f5fSchristos	const unsigned tile_mask = tile_pixels - 1;
79203b705cfSriastradh
79303b705cfSriastradh	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
79403b705cfSriastradh	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
79542542f5fSchristos	assert(src != dst);
79642542f5fSchristos
79742542f5fSchristos	if (dst_x | dst_y)
79842542f5fSchristos		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
79942542f5fSchristos	assert(dst_stride >= width * cpp);
80042542f5fSchristos	dst_stride -= width * cpp;
80142542f5fSchristos
80242542f5fSchristos	while (height--) {
80342542f5fSchristos		unsigned w = width * cpp;
80442542f5fSchristos		const uint8_t *tile_row = src;
80542542f5fSchristos
80642542f5fSchristos		tile_row += src_y / tile_height * src_stride * tile_height;
80742542f5fSchristos		tile_row += (src_y & (tile_height-1)) * tile_width;
80842542f5fSchristos		if (src_x) {
80942542f5fSchristos			tile_row += (src_x >> tile_shift) * tile_size;
81042542f5fSchristos			if (src_x & tile_mask) {
81142542f5fSchristos				const unsigned x = (src_x & tile_mask) * cpp;
81242542f5fSchristos				const unsigned len = min(tile_width - x, w);
813fe8aea9eSmrg				memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len);
81442542f5fSchristos
81542542f5fSchristos				tile_row += tile_size;
81642542f5fSchristos				dst = (uint8_t *)dst + len;
81742542f5fSchristos				w -= len;
81842542f5fSchristos			}
81903b705cfSriastradh		}
82042542f5fSchristos		while (w >= tile_width) {
821fe8aea9eSmrg			memcpy(dst,
822fe8aea9eSmrg			       assume_aligned(tile_row, tile_width),
823fe8aea9eSmrg			       tile_width);
82403b705cfSriastradh
82542542f5fSchristos			tile_row += tile_size;
82642542f5fSchristos			dst = (uint8_t *)dst + tile_width;
82742542f5fSchristos			w -= tile_width;
82803b705cfSriastradh		}
829fe8aea9eSmrg		memcpy(dst, assume_aligned(tile_row, tile_width), w);
83042542f5fSchristos		dst = (uint8_t *)dst + dst_stride + w;
83142542f5fSchristos		src_y++;
83203b705cfSriastradh	}
83303b705cfSriastradh}
83403b705cfSriastradh
835fe8aea9eSmrgstatic fast_memcpy void
836fe8aea9eSmrgmemcpy_between_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
837fe8aea9eSmrg				  int32_t src_stride, int32_t dst_stride,
838fe8aea9eSmrg				  int16_t src_x, int16_t src_y,
839fe8aea9eSmrg				  int16_t dst_x, int16_t dst_y,
840fe8aea9eSmrg				  uint16_t width, uint16_t height)
84103b705cfSriastradh{
84203b705cfSriastradh	const unsigned tile_width = 512;
84303b705cfSriastradh	const unsigned tile_height = 8;
84403b705cfSriastradh	const unsigned tile_size = 4096;
84503b705cfSriastradh
84603b705cfSriastradh	const unsigned cpp = bpp / 8;
847fe8aea9eSmrg	const unsigned tile_pixels = tile_width / cpp;
848fe8aea9eSmrg	const unsigned tile_shift = ffs(tile_pixels) - 1;
849fe8aea9eSmrg	const unsigned tile_mask = tile_pixels - 1;
85003b705cfSriastradh
85103b705cfSriastradh	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
85203b705cfSriastradh	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
853fe8aea9eSmrg	assert(src != dst);
854fe8aea9eSmrg	assert((dst_x & tile_mask) == (src_x & tile_mask));
85503b705cfSriastradh
856fe8aea9eSmrg	while (height--) {
857fe8aea9eSmrg		unsigned w = width * cpp;
858fe8aea9eSmrg		uint8_t *dst_row = dst;
859fe8aea9eSmrg		const uint8_t *src_row = src;
86003b705cfSriastradh
861fe8aea9eSmrg		dst_row += dst_y / tile_height * dst_stride * tile_height;
862fe8aea9eSmrg		dst_row += (dst_y & (tile_height-1)) * tile_width;
863fe8aea9eSmrg		if (dst_x)
864fe8aea9eSmrg			dst_row += (dst_x >> tile_shift) * tile_size;
865fe8aea9eSmrg		dst_y++;
86603b705cfSriastradh
867fe8aea9eSmrg		src_row += src_y / tile_height * src_stride * tile_height;
868fe8aea9eSmrg		src_row += (src_y & (tile_height-1)) * tile_width;
869fe8aea9eSmrg		if (src_x)
870fe8aea9eSmrg			src_row += (src_x >> tile_shift) * tile_size;
871fe8aea9eSmrg		src_y++;
87203b705cfSriastradh
873fe8aea9eSmrg		if (dst_x & tile_mask) {
874fe8aea9eSmrg			const unsigned x = (dst_x & tile_mask) * cpp;
875fe8aea9eSmrg			const unsigned len = min(tile_width - x, w);
87603b705cfSriastradh
877fe8aea9eSmrg			memcpy(assume_misaligned(dst_row + x, tile_width, x),
878fe8aea9eSmrg			       assume_misaligned(src_row + x, tile_width, x),
879fe8aea9eSmrg			       len);
88003b705cfSriastradh
881fe8aea9eSmrg			dst_row += tile_size;
882fe8aea9eSmrg			src_row += tile_size;
883fe8aea9eSmrg			w -= len;
88403b705cfSriastradh		}
88503b705cfSriastradh
886fe8aea9eSmrg		while (w >= tile_width) {
887fe8aea9eSmrg			memcpy(assume_aligned(dst_row, tile_width),
888fe8aea9eSmrg			       assume_aligned(src_row, tile_width),
889fe8aea9eSmrg			       tile_width);
890fe8aea9eSmrg			dst_row += tile_size;
891fe8aea9eSmrg			src_row += tile_size;
892fe8aea9eSmrg			w -= tile_width;
89303b705cfSriastradh		}
894fe8aea9eSmrg		memcpy(assume_aligned(dst_row, tile_width),
895fe8aea9eSmrg		       assume_aligned(src_row, tile_width),
896fe8aea9eSmrg		       w);
89703b705cfSriastradh	}
89803b705cfSriastradh}
89903b705cfSriastradh
/*
 * memcpy_to_tiled_x(swizzle) expands to the definition of
 * memcpy_to_tiled_x__<swizzle>(), which copies a width x height pixel
 * rectangle from a linear buffer into an X-tiled surface whose byte
 * offsets are remapped through the macro named by 'swizzle'.
 *
 * The destination is addressed as 4096-byte tiles of 8 rows x 512 bytes.
 * Each row is written in spans that never cross a 64-byte boundary of the
 * (unswizzled) tiled offset: an optional head up to the next 64-byte
 * boundary, full 64-byte spans, and an optional tail.  The offset of each
 * span is passed through swizzle() before being added to dst.
 */
#define memcpy_to_tiled_x(swizzle) \
fast_memcpy static void \
memcpy_to_tiled_x__##swizzle (const void *src, void *dst, int bpp, \
			      int32_t src_stride, int32_t dst_stride, \
			      int16_t src_x, int16_t src_y, \
			      int16_t dst_x, int16_t dst_y, \
			      uint16_t width, uint16_t height) \
{ \
	const unsigned tile_width = 512; \
	const unsigned tile_height = 8; \
	const unsigned tile_size = 4096; \
	const unsigned cpp = bpp / 8; \
	const unsigned stride_tiles = dst_stride / tile_width; \
	const unsigned swizzle_pixels = 64 / cpp; \
	const unsigned tile_shift = ffs(tile_width / cpp) - 1; \
	const unsigned tile_mask = (1 << tile_shift) - 1; \
	unsigned bytes_left, row; \
	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", \
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); \
	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; \
	for (row = 0; row < height; ++row) { \
		const uint8_t *in = (const uint8_t *)src + src_stride * row; \
		const uint32_t ty = row + dst_y; \
		/* Offset of this scanline within the first tile of its band. */ \
		const uint32_t row_base = \
			(ty / tile_height * stride_tiles * tile_size + \
			 (ty & (tile_height-1)) * tile_width); \
		uint32_t px = dst_x; \
		uint32_t off; \
		bytes_left = width * cpp; \
		if (px & (swizzle_pixels - 1)) { \
			/* Head: up to the next 64-byte boundary. */ \
			const uint32_t bound = ALIGN(px + 1, swizzle_pixels); \
			const uint32_t head = min(dst_x + width, bound) - px; \
			off = row_base + \
				(px >> tile_shift) * tile_size + \
				(px & tile_mask) * cpp; \
			memcpy((char *)dst + swizzle(off), in, head * cpp); \
			in += head * cpp; \
			bytes_left -= head * cpp; \
			px += head; \
		} \
		/* Body: whole 64-byte spans. */ \
		for (; bytes_left >= 64; bytes_left -= 64) { \
			off = row_base + \
				(px >> tile_shift) * tile_size + \
				(px & tile_mask) * cpp; \
			memcpy(assume_aligned((char *)dst+swizzle(off),64), \
			       in, 64); \
			in += 64; \
			px += swizzle_pixels; \
		} \
		if (bytes_left) { \
			/* Tail: what remains of the row. */ \
			off = row_base + \
				(px >> tile_shift) * tile_size + \
				(px & tile_mask) * cpp; \
			memcpy(assume_aligned((char *)dst + swizzle(off), 64), in, bytes_left); \
		} \
	} \
}
96003b705cfSriastradh
/*
 * memcpy_from_tiled_x(swizzle) expands to the definition of
 * memcpy_from_tiled_x__<swizzle>(), which copies a width x height pixel
 * rectangle out of an X-tiled surface, whose byte offsets are remapped
 * through the macro named by 'swizzle', into a linear buffer.
 *
 * The source is addressed as 4096-byte tiles of 8 rows x 512 bytes.
 * Each row is read in spans that never cross a 64-byte boundary of the
 * (unswizzled) tiled offset: an optional head up to the next 64-byte
 * boundary, full 64-byte spans, and an optional tail.  The offset of each
 * span is passed through swizzle() before being added to src.
 */
#define memcpy_from_tiled_x(swizzle) \
fast_memcpy static void \
memcpy_from_tiled_x__##swizzle (const void *src, void *dst, int bpp, \
				int32_t src_stride, int32_t dst_stride, \
				int16_t src_x, int16_t src_y, \
				int16_t dst_x, int16_t dst_y, \
				uint16_t width, uint16_t height) \
{ \
	const unsigned tile_width = 512; \
	const unsigned tile_height = 8; \
	const unsigned tile_size = 4096; \
	const unsigned cpp = bpp / 8; \
	const unsigned stride_tiles = src_stride / tile_width; \
	const unsigned swizzle_pixels = 64 / cpp; \
	const unsigned tile_shift = ffs(tile_width / cpp) - 1; \
	const unsigned tile_mask = (1 << tile_shift) - 1; \
	unsigned bytes_left, row; \
	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", \
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); \
	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; \
	for (row = 0; row < height; ++row) { \
		uint8_t *out = (uint8_t *)dst + dst_stride * row; \
		const uint32_t ty = row + src_y; \
		/* Offset of this scanline within the first tile of its band. */ \
		const uint32_t row_base = \
			(ty / tile_height * stride_tiles * tile_size + \
			 (ty & (tile_height-1)) * tile_width); \
		uint32_t px = src_x; \
		uint32_t off; \
		bytes_left = width * cpp; \
		if (px & (swizzle_pixels - 1)) { \
			/* Head: up to the next 64-byte boundary. */ \
			const uint32_t bound = ALIGN(px + 1, swizzle_pixels); \
			const uint32_t head = min(src_x + width, bound) - px; \
			off = row_base + \
				(px >> tile_shift) * tile_size + \
				(px & tile_mask) * cpp; \
			memcpy(out, (const char *)src + swizzle(off), head * cpp); \
			out += head * cpp; \
			bytes_left -= head * cpp; \
			px += head; \
		} \
		/* Body: whole 64-byte spans. */ \
		for (; bytes_left >= 64; bytes_left -= 64) { \
			off = row_base + \
				(px >> tile_shift) * tile_size + \
				(px & tile_mask) * cpp; \
			memcpy(out, assume_aligned((const char *)src + swizzle(off), 64), 64); \
			out += 64; \
			px += swizzle_pixels; \
		} \
		if (bytes_left) { \
			/* Tail: what remains of the row. */ \
			off = row_base + \
				(px >> tile_shift) * tile_size + \
				(px & tile_mask) * cpp; \
			memcpy(out, assume_aligned((const char *)src + swizzle(off), 64), bytes_left); \
		} \
	} \
}
102003b705cfSriastradh
1021fe8aea9eSmrg#define swizzle_9(X) ((X) ^ (((X) >> 3) & 64))
1022fe8aea9eSmrgmemcpy_to_tiled_x(swizzle_9)
1023fe8aea9eSmrgmemcpy_from_tiled_x(swizzle_9)
1024fe8aea9eSmrg#undef swizzle_9
102503b705cfSriastradh
1026fe8aea9eSmrg#define swizzle_9_10(X) ((X) ^ ((((X) ^ ((X) >> 1)) >> 3) & 64))
1027fe8aea9eSmrgmemcpy_to_tiled_x(swizzle_9_10)
1028fe8aea9eSmrgmemcpy_from_tiled_x(swizzle_9_10)
1029fe8aea9eSmrg#undef swizzle_9_10
103003b705cfSriastradh
1031fe8aea9eSmrg#define swizzle_9_11(X) ((X) ^ ((((X) ^ ((X) >> 2)) >> 3) & 64))
1032fe8aea9eSmrgmemcpy_to_tiled_x(swizzle_9_11)
1033fe8aea9eSmrgmemcpy_from_tiled_x(swizzle_9_11)
1034fe8aea9eSmrg#undef swizzle_9_11
103503b705cfSriastradh
1036fe8aea9eSmrg#define swizzle_9_10_11(X) ((X) ^ ((((X) ^ ((X) >> 1) ^ ((X) >> 2)) >> 3) & 64))
1037fe8aea9eSmrgmemcpy_to_tiled_x(swizzle_9_10_11)
1038fe8aea9eSmrgmemcpy_from_tiled_x(swizzle_9_10_11)
1039fe8aea9eSmrg#undef swizzle_9_10_11
104003b705cfSriastradh
1041fe8aea9eSmrgstatic fast_memcpy void
1042fe8aea9eSmrgmemcpy_to_tiled_x__gen2(const void *src, void *dst, int bpp,
1043fe8aea9eSmrg			int32_t src_stride, int32_t dst_stride,
1044fe8aea9eSmrg			int16_t src_x, int16_t src_y,
1045fe8aea9eSmrg			int16_t dst_x, int16_t dst_y,
1046fe8aea9eSmrg			uint16_t width, uint16_t height)
104703b705cfSriastradh{
1048fe8aea9eSmrg	const unsigned tile_width = 128;
1049fe8aea9eSmrg	const unsigned tile_height = 16;
1050fe8aea9eSmrg	const unsigned tile_size = 2048;
105103b705cfSriastradh
105203b705cfSriastradh	const unsigned cpp = bpp / 8;
1053fe8aea9eSmrg	const unsigned tile_pixels = tile_width / cpp;
1054fe8aea9eSmrg	const unsigned tile_shift = ffs(tile_pixels) - 1;
1055fe8aea9eSmrg	const unsigned tile_mask = tile_pixels - 1;
105603b705cfSriastradh
105703b705cfSriastradh	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
105803b705cfSriastradh	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
1059fe8aea9eSmrg	assert(src != dst);
106003b705cfSriastradh
1061fe8aea9eSmrg	if (src_x | src_y)
1062fe8aea9eSmrg		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
1063fe8aea9eSmrg	assert(src_stride >= width * cpp);
1064fe8aea9eSmrg	src_stride -= width * cpp;
106503b705cfSriastradh
1066fe8aea9eSmrg	while (height--) {
1067fe8aea9eSmrg		unsigned w = width * cpp;
1068fe8aea9eSmrg		uint8_t *tile_row = dst;
106903b705cfSriastradh
1070fe8aea9eSmrg		tile_row += dst_y / tile_height * dst_stride * tile_height;
1071fe8aea9eSmrg		tile_row += (dst_y & (tile_height-1)) * tile_width;
1072fe8aea9eSmrg		if (dst_x) {
1073fe8aea9eSmrg			tile_row += (dst_x >> tile_shift) * tile_size;
1074fe8aea9eSmrg			if (dst_x & tile_mask) {
1075fe8aea9eSmrg				const unsigned x = (dst_x & tile_mask) * cpp;
1076fe8aea9eSmrg				const unsigned len = min(tile_width - x, w);
1077fe8aea9eSmrg				memcpy(assume_misaligned(tile_row + x, tile_width, x), src, len);
107803b705cfSriastradh
1079fe8aea9eSmrg				tile_row += tile_size;
1080fe8aea9eSmrg				src = (const uint8_t *)src + len;
1081fe8aea9eSmrg				w -= len;
1082fe8aea9eSmrg			}
108303b705cfSriastradh		}
1084fe8aea9eSmrg		while (w >= tile_width) {
1085fe8aea9eSmrg			memcpy(assume_aligned(tile_row, tile_width),
1086fe8aea9eSmrg			       src, tile_width);
108703b705cfSriastradh
1088fe8aea9eSmrg			tile_row += tile_size;
1089fe8aea9eSmrg			src = (const uint8_t *)src + tile_width;
1090fe8aea9eSmrg			w -= tile_width;
109103b705cfSriastradh		}
1092fe8aea9eSmrg		memcpy(assume_aligned(tile_row, tile_width), src, w);
1093fe8aea9eSmrg		src = (const uint8_t *)src + src_stride + w;
1094fe8aea9eSmrg		dst_y++;
109503b705cfSriastradh	}
109603b705cfSriastradh}
109703b705cfSriastradh
1098fe8aea9eSmrgstatic fast_memcpy void
1099fe8aea9eSmrgmemcpy_from_tiled_x__gen2(const void *src, void *dst, int bpp,
1100fe8aea9eSmrg			  int32_t src_stride, int32_t dst_stride,
1101fe8aea9eSmrg			  int16_t src_x, int16_t src_y,
1102fe8aea9eSmrg			  int16_t dst_x, int16_t dst_y,
1103fe8aea9eSmrg			  uint16_t width, uint16_t height)
110403b705cfSriastradh{
1105fe8aea9eSmrg	const unsigned tile_width = 128;
1106fe8aea9eSmrg	const unsigned tile_height = 16;
1107fe8aea9eSmrg	const unsigned tile_size = 2048;
110803b705cfSriastradh
110903b705cfSriastradh	const unsigned cpp = bpp / 8;
1110fe8aea9eSmrg	const unsigned tile_pixels = tile_width / cpp;
1111fe8aea9eSmrg	const unsigned tile_shift = ffs(tile_pixels) - 1;
1112fe8aea9eSmrg	const unsigned tile_mask = tile_pixels - 1;
111303b705cfSriastradh
111403b705cfSriastradh	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
111503b705cfSriastradh	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
1116fe8aea9eSmrg	assert(src != dst);
111703b705cfSriastradh
1118fe8aea9eSmrg	if (dst_x | dst_y)
1119fe8aea9eSmrg		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
1120fe8aea9eSmrg	assert(dst_stride >= width * cpp);
1121fe8aea9eSmrg	dst_stride -= width * cpp;
1122fe8aea9eSmrg
1123fe8aea9eSmrg	while (height--) {
1124fe8aea9eSmrg		unsigned w = width * cpp;
1125fe8aea9eSmrg		const uint8_t *tile_row = src;
112603b705cfSriastradh
1127fe8aea9eSmrg		tile_row += src_y / tile_height * src_stride * tile_height;
1128fe8aea9eSmrg		tile_row += (src_y & (tile_height-1)) * tile_width;
1129fe8aea9eSmrg		if (src_x) {
1130fe8aea9eSmrg			tile_row += (src_x >> tile_shift) * tile_size;
1131fe8aea9eSmrg			if (src_x & tile_mask) {
1132fe8aea9eSmrg				const unsigned x = (src_x & tile_mask) * cpp;
1133fe8aea9eSmrg				const unsigned len = min(tile_width - x, w);
1134fe8aea9eSmrg				memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len);
113503b705cfSriastradh
1136fe8aea9eSmrg				tile_row += tile_size;
1137fe8aea9eSmrg				dst = (uint8_t *)dst + len;
1138fe8aea9eSmrg				w -= len;
1139fe8aea9eSmrg			}
114003b705cfSriastradh		}
1141fe8aea9eSmrg		while (w >= tile_width) {
1142fe8aea9eSmrg			memcpy(dst,
1143fe8aea9eSmrg			       assume_aligned(tile_row, tile_width),
1144fe8aea9eSmrg			       tile_width);
1145fe8aea9eSmrg
1146fe8aea9eSmrg			tile_row += tile_size;
1147fe8aea9eSmrg			dst = (uint8_t *)dst + tile_width;
1148fe8aea9eSmrg			w -= tile_width;
114903b705cfSriastradh		}
1150fe8aea9eSmrg		memcpy(dst, assume_aligned(tile_row, tile_width), w);
1151fe8aea9eSmrg		dst = (uint8_t *)dst + dst_stride + w;
1152fe8aea9eSmrg		src_y++;
115303b705cfSriastradh	}
115403b705cfSriastradh}
115503b705cfSriastradh
1156fe8aea9eSmrgvoid choose_memcpy_tiled_x(struct kgem *kgem, int swizzling, unsigned cpu)
115703b705cfSriastradh{
1158fe8aea9eSmrg	if (kgem->gen < 030) {
1159fe8aea9eSmrg		if (swizzling == I915_BIT_6_SWIZZLE_NONE) {
1160fe8aea9eSmrg			DBG(("%s: gen2, no swizzling\n", __FUNCTION__));
1161fe8aea9eSmrg			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__gen2;
1162fe8aea9eSmrg			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__gen2;
1163fe8aea9eSmrg		} else
1164fe8aea9eSmrg			DBG(("%s: no detiling with swizzle functions for gen2\n", __FUNCTION__));
1165fe8aea9eSmrg		return;
1166fe8aea9eSmrg	}
1167fe8aea9eSmrg
116803b705cfSriastradh	switch (swizzling) {
116903b705cfSriastradh	default:
117003b705cfSriastradh		DBG(("%s: unknown swizzling, %d\n", __FUNCTION__, swizzling));
117103b705cfSriastradh		break;
117203b705cfSriastradh	case I915_BIT_6_SWIZZLE_NONE:
117303b705cfSriastradh		DBG(("%s: no swizzling\n", __FUNCTION__));
1174fe8aea9eSmrg#if defined(sse2)
1175fe8aea9eSmrg		if (cpu & SSE2) {
1176fe8aea9eSmrg			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0__sse2;
1177fe8aea9eSmrg			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0__sse2;
1178fe8aea9eSmrg			kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0__sse2;
1179fe8aea9eSmrg		} else
1180fe8aea9eSmrg#endif
1181fe8aea9eSmrg	       	{
1182fe8aea9eSmrg			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0;
1183fe8aea9eSmrg			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0;
1184fe8aea9eSmrg			kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0;
1185fe8aea9eSmrg		}
118603b705cfSriastradh		break;
118703b705cfSriastradh	case I915_BIT_6_SWIZZLE_9:
118803b705cfSriastradh		DBG(("%s: 6^9 swizzling\n", __FUNCTION__));
118903b705cfSriastradh		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9;
119003b705cfSriastradh		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9;
119103b705cfSriastradh		break;
119203b705cfSriastradh	case I915_BIT_6_SWIZZLE_9_10:
119303b705cfSriastradh		DBG(("%s: 6^9^10 swizzling\n", __FUNCTION__));
119403b705cfSriastradh		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10;
119503b705cfSriastradh		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_10;
119603b705cfSriastradh		break;
119703b705cfSriastradh	case I915_BIT_6_SWIZZLE_9_11:
119803b705cfSriastradh		DBG(("%s: 6^9^11 swizzling\n", __FUNCTION__));
119903b705cfSriastradh		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_11;
120003b705cfSriastradh		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_11;
120103b705cfSriastradh		break;
1202fe8aea9eSmrg	case I915_BIT_6_SWIZZLE_9_10_11:
1203fe8aea9eSmrg		DBG(("%s: 6^9^10^11 swizzling\n", __FUNCTION__));
1204fe8aea9eSmrg		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10_11;
1205fe8aea9eSmrg		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_10_11;
1206fe8aea9eSmrg		break;
120703b705cfSriastradh	}
120803b705cfSriastradh}
120903b705cfSriastradh
121003b705cfSriastradhvoid
121103b705cfSriastradhmemmove_box(const void *src, void *dst,
121203b705cfSriastradh	    int bpp, int32_t stride,
121303b705cfSriastradh	    const BoxRec *box,
121403b705cfSriastradh	    int dx, int dy)
121503b705cfSriastradh{
121642542f5fSchristos#define FORCE_MEMMOVE 0
121703b705cfSriastradh	union {
121803b705cfSriastradh		uint8_t u8;
121903b705cfSriastradh		uint16_t u16;
122003b705cfSriastradh		uint32_t u32;
122103b705cfSriastradh		uint64_t u64;
122203b705cfSriastradh	} tmp;
122303b705cfSriastradh	const uint8_t *src_bytes;
122403b705cfSriastradh	uint8_t *dst_bytes;
122503b705cfSriastradh	int width, height;
122603b705cfSriastradh
122703b705cfSriastradh	assert(src);
122803b705cfSriastradh	assert(dst);
122942542f5fSchristos	assert(src != dst);
123003b705cfSriastradh	assert(bpp >= 8);
123103b705cfSriastradh	assert(box->x2 > box->x1);
123203b705cfSriastradh	assert(box->y2 > box->y1);
123303b705cfSriastradh
123403b705cfSriastradh	DBG(("%s: box=(%d, %d), (%d, %d), pitch=%d, bpp=%d, dx=%d, dy=%d\n",
123503b705cfSriastradh	     __FUNCTION__,
123603b705cfSriastradh	     box->x1, box->y1, box->x2, box->y2,
123703b705cfSriastradh	     stride, bpp, dx, dy));
123803b705cfSriastradh
123903b705cfSriastradh	bpp /= 8;
124003b705cfSriastradh	width = box->y1 * stride + box->x1 * bpp;
124103b705cfSriastradh	src_bytes = (const uint8_t *)src + width;
124203b705cfSriastradh	dst_bytes = (uint8_t *)dst + width;
124342542f5fSchristos	assert(dst_bytes != src_bytes);
124403b705cfSriastradh
124503b705cfSriastradh	width = (box->x2 - box->x1) * bpp;
124603b705cfSriastradh	height = (box->y2 - box->y1);
124742542f5fSchristos	assert(width <= stride);
124803b705cfSriastradh	if (width == stride) {
124903b705cfSriastradh		width *= height;
125003b705cfSriastradh		height = 1;
125103b705cfSriastradh	}
125203b705cfSriastradh
125303b705cfSriastradh	if (dy >= 0) {
125403b705cfSriastradh		switch (width) {
125503b705cfSriastradh		case 1:
125603b705cfSriastradh			do {
125703b705cfSriastradh				*dst_bytes = tmp.u8 = *src_bytes;
125803b705cfSriastradh				src_bytes += stride;
125903b705cfSriastradh				dst_bytes += stride;
126003b705cfSriastradh			} while (--height);
126103b705cfSriastradh			break;
126203b705cfSriastradh
126303b705cfSriastradh		case 2:
126403b705cfSriastradh			do {
126503b705cfSriastradh				*(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes;
126603b705cfSriastradh				src_bytes += stride;
126703b705cfSriastradh				dst_bytes += stride;
126803b705cfSriastradh			} while (--height);
126903b705cfSriastradh			break;
127003b705cfSriastradh
127103b705cfSriastradh		case 4:
127203b705cfSriastradh			do {
127303b705cfSriastradh				*(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes;
127403b705cfSriastradh				src_bytes += stride;
127503b705cfSriastradh				dst_bytes += stride;
127603b705cfSriastradh			} while (--height);
127703b705cfSriastradh			break;
127803b705cfSriastradh
127903b705cfSriastradh		case 8:
128003b705cfSriastradh			do {
128103b705cfSriastradh				*(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes;
128203b705cfSriastradh				src_bytes += stride;
128303b705cfSriastradh				dst_bytes += stride;
128403b705cfSriastradh			} while (--height);
128503b705cfSriastradh			break;
128603b705cfSriastradh
128703b705cfSriastradh		default:
128842542f5fSchristos			if (FORCE_MEMMOVE ||
128942542f5fSchristos			    (dst_bytes < src_bytes + width &&
129042542f5fSchristos			     src_bytes < dst_bytes + width)) {
129103b705cfSriastradh				do {
129203b705cfSriastradh					memmove(dst_bytes, src_bytes, width);
129303b705cfSriastradh					src_bytes += stride;
129403b705cfSriastradh					dst_bytes += stride;
129503b705cfSriastradh				} while (--height);
129603b705cfSriastradh			} else {
129703b705cfSriastradh				do {
129803b705cfSriastradh					memcpy(dst_bytes, src_bytes, width);
129903b705cfSriastradh					src_bytes += stride;
130003b705cfSriastradh					dst_bytes += stride;
130103b705cfSriastradh				} while (--height);
130203b705cfSriastradh			}
130303b705cfSriastradh			break;
130403b705cfSriastradh		}
130503b705cfSriastradh	} else {
130603b705cfSriastradh		src_bytes += (height-1) * stride;
130703b705cfSriastradh		dst_bytes += (height-1) * stride;
130803b705cfSriastradh
130903b705cfSriastradh		switch (width) {
131003b705cfSriastradh		case 1:
131103b705cfSriastradh			do {
131203b705cfSriastradh				*dst_bytes = tmp.u8 = *src_bytes;
131303b705cfSriastradh				src_bytes -= stride;
131403b705cfSriastradh				dst_bytes -= stride;
131503b705cfSriastradh			} while (--height);
131603b705cfSriastradh			break;
131703b705cfSriastradh
131803b705cfSriastradh		case 2:
131903b705cfSriastradh			do {
132003b705cfSriastradh				*(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes;
132103b705cfSriastradh				src_bytes -= stride;
132203b705cfSriastradh				dst_bytes -= stride;
132303b705cfSriastradh			} while (--height);
132403b705cfSriastradh			break;
132503b705cfSriastradh
132603b705cfSriastradh		case 4:
132703b705cfSriastradh			do {
132803b705cfSriastradh				*(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes;
132903b705cfSriastradh				src_bytes -= stride;
133003b705cfSriastradh				dst_bytes -= stride;
133103b705cfSriastradh			} while (--height);
133203b705cfSriastradh			break;
133303b705cfSriastradh
133403b705cfSriastradh		case 8:
133503b705cfSriastradh			do {
133603b705cfSriastradh				*(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes;
133703b705cfSriastradh				src_bytes -= stride;
133803b705cfSriastradh				dst_bytes -= stride;
133903b705cfSriastradh			} while (--height);
134003b705cfSriastradh			break;
134103b705cfSriastradh
134203b705cfSriastradh		default:
134342542f5fSchristos			if (FORCE_MEMMOVE ||
134442542f5fSchristos			    (dst_bytes < src_bytes + width &&
134542542f5fSchristos			     src_bytes < dst_bytes + width)) {
134603b705cfSriastradh				do {
134703b705cfSriastradh					memmove(dst_bytes, src_bytes, width);
134803b705cfSriastradh					src_bytes -= stride;
134903b705cfSriastradh					dst_bytes -= stride;
135003b705cfSriastradh				} while (--height);
135103b705cfSriastradh			} else {
135203b705cfSriastradh				do {
135303b705cfSriastradh					memcpy(dst_bytes, src_bytes, width);
135403b705cfSriastradh					src_bytes -= stride;
135503b705cfSriastradh					dst_bytes -= stride;
135603b705cfSriastradh				} while (--height);
135703b705cfSriastradh			}
135803b705cfSriastradh			break;
135903b705cfSriastradh		}
136003b705cfSriastradh	}
136103b705cfSriastradh}
136203b705cfSriastradh
136303b705cfSriastradhvoid
136403b705cfSriastradhmemcpy_xor(const void *src, void *dst, int bpp,
136503b705cfSriastradh	   int32_t src_stride, int32_t dst_stride,
136603b705cfSriastradh	   int16_t src_x, int16_t src_y,
136703b705cfSriastradh	   int16_t dst_x, int16_t dst_y,
136803b705cfSriastradh	   uint16_t width, uint16_t height,
136903b705cfSriastradh	   uint32_t and, uint32_t or)
137003b705cfSriastradh{
137103b705cfSriastradh	const uint8_t *src_bytes;
137203b705cfSriastradh	uint8_t *dst_bytes;
137342542f5fSchristos	int i, w;
137403b705cfSriastradh
137503b705cfSriastradh	assert(width && height);
137603b705cfSriastradh	assert(bpp >= 8);
137703b705cfSriastradh	assert(width*bpp <= 8*src_stride);
137803b705cfSriastradh	assert(width*bpp <= 8*dst_stride);
137903b705cfSriastradh
138003b705cfSriastradh	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d, bpp=%d, and=%x, xor=%x\n",
138103b705cfSriastradh	     __FUNCTION__,
138203b705cfSriastradh	     src_x, src_y, dst_x, dst_y,
138303b705cfSriastradh	     width, height,
138403b705cfSriastradh	     src_stride, dst_stride,
138503b705cfSriastradh	     bpp, and, or));
138603b705cfSriastradh
138703b705cfSriastradh	bpp /= 8;
138803b705cfSriastradh	src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp;
138903b705cfSriastradh	dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp;
139003b705cfSriastradh
139103b705cfSriastradh	if (and == 0xffffffff) {
139203b705cfSriastradh		switch (bpp) {
139303b705cfSriastradh		case 1:
139403b705cfSriastradh			if (width & 1) {
139503b705cfSriastradh				do {
139603b705cfSriastradh					for (i = 0; i < width; i++)
139703b705cfSriastradh						dst_bytes[i] = src_bytes[i] | or;
139803b705cfSriastradh
139903b705cfSriastradh					src_bytes += src_stride;
140003b705cfSriastradh					dst_bytes += dst_stride;
140103b705cfSriastradh				} while (--height);
140203b705cfSriastradh				break;
140303b705cfSriastradh			} else {
140403b705cfSriastradh				width /= 2;
140503b705cfSriastradh				or |= or << 8;
140603b705cfSriastradh			}
140703b705cfSriastradh		case 2:
140803b705cfSriastradh			if (width & 1) {
140903b705cfSriastradh				do {
141003b705cfSriastradh					uint16_t *d = (uint16_t *)dst_bytes;
141103b705cfSriastradh					const uint16_t *s = (const uint16_t *)src_bytes;
141203b705cfSriastradh
141303b705cfSriastradh					for (i = 0; i < width; i++)
141403b705cfSriastradh						d[i] = s[i] | or;
141503b705cfSriastradh
141603b705cfSriastradh					src_bytes += src_stride;
141703b705cfSriastradh					dst_bytes += dst_stride;
141803b705cfSriastradh				} while (--height);
141903b705cfSriastradh				break;
142003b705cfSriastradh			} else {
142103b705cfSriastradh				width /= 2;
142203b705cfSriastradh				or |= or << 16;
142303b705cfSriastradh			}
142403b705cfSriastradh		case 4:
142542542f5fSchristos			w = width;
142642542f5fSchristos			if (w * 4 == dst_stride && dst_stride == src_stride) {
142742542f5fSchristos				w *= height;
142803b705cfSriastradh				height = 1;
142903b705cfSriastradh			}
143003b705cfSriastradh
1431fe8aea9eSmrg#if defined(sse2) && __x86_64__
143203b705cfSriastradh			if (have_sse2()) {
143303b705cfSriastradh				do {
143403b705cfSriastradh					uint32_t *d = (uint32_t *)dst_bytes;
143503b705cfSriastradh					const uint32_t *s = (const uint32_t *)src_bytes;
143603b705cfSriastradh					__m128i mask = xmm_create_mask_32(or);
143703b705cfSriastradh
143842542f5fSchristos					i = w;
143903b705cfSriastradh					while (i && (uintptr_t)d & 15) {
144003b705cfSriastradh						*d++ = *s++ | or;
144103b705cfSriastradh						i--;
144203b705cfSriastradh					}
144303b705cfSriastradh
144403b705cfSriastradh					while (i >= 16) {
144503b705cfSriastradh						__m128i xmm1, xmm2, xmm3, xmm4;
144603b705cfSriastradh
144703b705cfSriastradh						xmm1 = xmm_load_128u((const __m128i*)s + 0);
144803b705cfSriastradh						xmm2 = xmm_load_128u((const __m128i*)s + 1);
144903b705cfSriastradh						xmm3 = xmm_load_128u((const __m128i*)s + 2);
145003b705cfSriastradh						xmm4 = xmm_load_128u((const __m128i*)s + 3);
145103b705cfSriastradh
145203b705cfSriastradh						xmm_save_128((__m128i*)d + 0,
145303b705cfSriastradh							     _mm_or_si128(xmm1, mask));
145403b705cfSriastradh						xmm_save_128((__m128i*)d + 1,
145503b705cfSriastradh							     _mm_or_si128(xmm2, mask));
145603b705cfSriastradh						xmm_save_128((__m128i*)d + 2,
145703b705cfSriastradh							     _mm_or_si128(xmm3, mask));
145803b705cfSriastradh						xmm_save_128((__m128i*)d + 3,
145903b705cfSriastradh							     _mm_or_si128(xmm4, mask));
146003b705cfSriastradh
146103b705cfSriastradh						d += 16;
146203b705cfSriastradh						s += 16;
146303b705cfSriastradh						i -= 16;
146403b705cfSriastradh					}
146503b705cfSriastradh
146603b705cfSriastradh					if (i & 8) {
146703b705cfSriastradh						__m128i xmm1, xmm2;
146803b705cfSriastradh
146903b705cfSriastradh						xmm1 = xmm_load_128u((const __m128i*)s + 0);
147003b705cfSriastradh						xmm2 = xmm_load_128u((const __m128i*)s + 1);
147103b705cfSriastradh
147203b705cfSriastradh						xmm_save_128((__m128i*)d + 0,
147303b705cfSriastradh							     _mm_or_si128(xmm1, mask));
147403b705cfSriastradh						xmm_save_128((__m128i*)d + 1,
147503b705cfSriastradh							     _mm_or_si128(xmm2, mask));
147603b705cfSriastradh						d += 8;
147703b705cfSriastradh						s += 8;
147803b705cfSriastradh						i -= 8;
147903b705cfSriastradh					}
148003b705cfSriastradh
148103b705cfSriastradh					if (i & 4) {
148203b705cfSriastradh						xmm_save_128((__m128i*)d,
148303b705cfSriastradh							     _mm_or_si128(xmm_load_128u((const __m128i*)s),
148403b705cfSriastradh									  mask));
148503b705cfSriastradh
148603b705cfSriastradh						d += 4;
148703b705cfSriastradh						s += 4;
148803b705cfSriastradh						i -= 4;
148903b705cfSriastradh					}
149003b705cfSriastradh
149103b705cfSriastradh					while (i) {
149203b705cfSriastradh						*d++ = *s++ | or;
149303b705cfSriastradh						i--;
149403b705cfSriastradh					}
149503b705cfSriastradh
149603b705cfSriastradh					src_bytes += src_stride;
149703b705cfSriastradh					dst_bytes += dst_stride;
149803b705cfSriastradh				} while (--height);
149903b705cfSriastradh			} else
150003b705cfSriastradh#else
150103b705cfSriastradh				do {
150203b705cfSriastradh					uint32_t *d = (uint32_t *)dst_bytes;
150303b705cfSriastradh					uint32_t *s = (uint32_t *)src_bytes;
150403b705cfSriastradh
150542542f5fSchristos					for (i = 0; i < w; i++)
150603b705cfSriastradh						d[i] = s[i] | or;
150703b705cfSriastradh
150803b705cfSriastradh					src_bytes += src_stride;
150903b705cfSriastradh					dst_bytes += dst_stride;
151003b705cfSriastradh				} while (--height);
151103b705cfSriastradh#endif
151203b705cfSriastradh			break;
151303b705cfSriastradh		}
151403b705cfSriastradh	} else {
151503b705cfSriastradh		switch (bpp) {
151603b705cfSriastradh		case 1:
151703b705cfSriastradh			do {
151803b705cfSriastradh				for (i = 0; i < width; i++)
151903b705cfSriastradh					dst_bytes[i] = (src_bytes[i] & and) | or;
152003b705cfSriastradh
152103b705cfSriastradh				src_bytes += src_stride;
152203b705cfSriastradh				dst_bytes += dst_stride;
152303b705cfSriastradh			} while (--height);
152403b705cfSriastradh			break;
152503b705cfSriastradh
152603b705cfSriastradh		case 2:
152703b705cfSriastradh			do {
152803b705cfSriastradh				uint16_t *d = (uint16_t *)dst_bytes;
152903b705cfSriastradh				const uint16_t *s = (const uint16_t *)src_bytes;
153003b705cfSriastradh
153103b705cfSriastradh				for (i = 0; i < width; i++)
153203b705cfSriastradh					d[i] = (s[i] & and) | or;
153303b705cfSriastradh
153403b705cfSriastradh				src_bytes += src_stride;
153503b705cfSriastradh				dst_bytes += dst_stride;
153603b705cfSriastradh			} while (--height);
153703b705cfSriastradh			break;
153803b705cfSriastradh
153903b705cfSriastradh		case 4:
154003b705cfSriastradh			do {
154103b705cfSriastradh				uint32_t *d = (uint32_t *)dst_bytes;
154203b705cfSriastradh				const uint32_t *s = (const uint32_t *)src_bytes;
154303b705cfSriastradh
154403b705cfSriastradh				for (i = 0; i < width; i++)
154503b705cfSriastradh					d[i] = (s[i] & and) | or;
154603b705cfSriastradh
154703b705cfSriastradh				src_bytes += src_stride;
154803b705cfSriastradh				dst_bytes += dst_stride;
154903b705cfSriastradh			} while (--height);
155003b705cfSriastradh			break;
155103b705cfSriastradh		}
155203b705cfSriastradh	}
155303b705cfSriastradh}
1554fe8aea9eSmrg
1555fe8aea9eSmrg#define BILINEAR_INTERPOLATION_BITS 4
1556fe8aea9eSmrgstatic inline int
1557fe8aea9eSmrgbilinear_weight(pixman_fixed_t x)
1558fe8aea9eSmrg{
1559fe8aea9eSmrg	return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
1560fe8aea9eSmrg		((1 << BILINEAR_INTERPOLATION_BITS) - 1);
1561fe8aea9eSmrg}
1562fe8aea9eSmrg
1563fe8aea9eSmrg#if BILINEAR_INTERPOLATION_BITS <= 4
1564fe8aea9eSmrg/* Inspired by Filter_32_opaque from Skia */
1565fe8aea9eSmrgstatic inline uint32_t
1566fe8aea9eSmrgbilinear_interpolation(uint32_t tl, uint32_t tr,
1567fe8aea9eSmrg		       uint32_t bl, uint32_t br,
1568fe8aea9eSmrg		       int distx, int disty)
1569fe8aea9eSmrg{
1570fe8aea9eSmrg	int distxy, distxiy, distixy, distixiy;
1571fe8aea9eSmrg	uint32_t lo, hi;
1572fe8aea9eSmrg
1573fe8aea9eSmrg	distx <<= (4 - BILINEAR_INTERPOLATION_BITS);
1574fe8aea9eSmrg	disty <<= (4 - BILINEAR_INTERPOLATION_BITS);
1575fe8aea9eSmrg
1576fe8aea9eSmrg	distxy = distx * disty;
1577fe8aea9eSmrg	distxiy = (distx << 4) - distxy;	/* distx * (16 - disty) */
1578fe8aea9eSmrg	distixy = (disty << 4) - distxy;	/* disty * (16 - distx) */
1579fe8aea9eSmrg	distixiy =
1580fe8aea9eSmrg		16 * 16 - (disty << 4) -
1581fe8aea9eSmrg		(distx << 4) + distxy; /* (16 - distx) * (16 - disty) */
1582fe8aea9eSmrg
1583fe8aea9eSmrg	lo = (tl & 0xff00ff) * distixiy;
1584fe8aea9eSmrg	hi = ((tl >> 8) & 0xff00ff) * distixiy;
1585fe8aea9eSmrg
1586fe8aea9eSmrg	lo += (tr & 0xff00ff) * distxiy;
1587fe8aea9eSmrg	hi += ((tr >> 8) & 0xff00ff) * distxiy;
1588fe8aea9eSmrg
1589fe8aea9eSmrg	lo += (bl & 0xff00ff) * distixy;
1590fe8aea9eSmrg	hi += ((bl >> 8) & 0xff00ff) * distixy;
1591fe8aea9eSmrg
1592fe8aea9eSmrg	lo += (br & 0xff00ff) * distxy;
1593fe8aea9eSmrg	hi += ((br >> 8) & 0xff00ff) * distxy;
1594fe8aea9eSmrg
1595fe8aea9eSmrg	return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff);
1596fe8aea9eSmrg}
1597fe8aea9eSmrg#elif SIZEOF_LONG > 4
1598fe8aea9eSmrgstatic inline uint32_t
1599fe8aea9eSmrgbilinear_interpolation(uint32_t tl, uint32_t tr,
1600fe8aea9eSmrg		       uint32_t bl, uint32_t br,
1601fe8aea9eSmrg		       int distx, int disty)
1602fe8aea9eSmrg{
1603fe8aea9eSmrg	uint64_t distxy, distxiy, distixy, distixiy;
1604fe8aea9eSmrg	uint64_t tl64, tr64, bl64, br64;
1605fe8aea9eSmrg	uint64_t f, r;
1606fe8aea9eSmrg
1607fe8aea9eSmrg	distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
1608fe8aea9eSmrg	disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
1609fe8aea9eSmrg
1610fe8aea9eSmrg	distxy = distx * disty;
1611fe8aea9eSmrg	distxiy = distx * (256 - disty);
1612fe8aea9eSmrg	distixy = (256 - distx) * disty;
1613fe8aea9eSmrg	distixiy = (256 - distx) * (256 - disty);
1614fe8aea9eSmrg
1615fe8aea9eSmrg	/* Alpha and Blue */
1616fe8aea9eSmrg	tl64 = tl & 0xff0000ff;
1617fe8aea9eSmrg	tr64 = tr & 0xff0000ff;
1618fe8aea9eSmrg	bl64 = bl & 0xff0000ff;
1619fe8aea9eSmrg	br64 = br & 0xff0000ff;
1620fe8aea9eSmrg
1621fe8aea9eSmrg	f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
1622fe8aea9eSmrg	r = f & 0x0000ff0000ff0000ull;
1623fe8aea9eSmrg
1624fe8aea9eSmrg	/* Red and Green */
1625fe8aea9eSmrg	tl64 = tl;
1626fe8aea9eSmrg	tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
1627fe8aea9eSmrg
1628fe8aea9eSmrg	tr64 = tr;
1629fe8aea9eSmrg	tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
1630fe8aea9eSmrg
1631fe8aea9eSmrg	bl64 = bl;
1632fe8aea9eSmrg	bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
1633fe8aea9eSmrg
1634fe8aea9eSmrg	br64 = br;
1635fe8aea9eSmrg	br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
1636fe8aea9eSmrg
1637fe8aea9eSmrg	f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
1638fe8aea9eSmrg	r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
1639fe8aea9eSmrg
1640fe8aea9eSmrg	return (uint32_t)(r >> 16);
1641fe8aea9eSmrg}
1642fe8aea9eSmrg#else
1643fe8aea9eSmrgstatic inline uint32_t
1644fe8aea9eSmrgbilinear_interpolation(uint32_t tl, uint32_t tr,
1645fe8aea9eSmrg		       uint32_t bl, uint32_t br,
1646fe8aea9eSmrg		       int distx, int disty)
1647fe8aea9eSmrg{
1648fe8aea9eSmrg	int distxy, distxiy, distixy, distixiy;
1649fe8aea9eSmrg	uint32_t f, r;
1650fe8aea9eSmrg
1651fe8aea9eSmrg	distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
1652fe8aea9eSmrg	disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
1653fe8aea9eSmrg
1654fe8aea9eSmrg	distxy = distx * disty;
1655fe8aea9eSmrg	distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
1656fe8aea9eSmrg	distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
1657fe8aea9eSmrg	distixiy =
1658fe8aea9eSmrg		256 * 256 - (disty << 8) -
1659fe8aea9eSmrg		(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
1660fe8aea9eSmrg
1661fe8aea9eSmrg	/* Blue */
1662fe8aea9eSmrg	r = ((tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy +
1663fe8aea9eSmrg	     (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy);
1664fe8aea9eSmrg
1665fe8aea9eSmrg	/* Green */
1666fe8aea9eSmrg	f = ((tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy +
1667fe8aea9eSmrg	     (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy);
1668fe8aea9eSmrg	r |= f & 0xff000000;
1669fe8aea9eSmrg
1670fe8aea9eSmrg	tl >>= 16;
1671fe8aea9eSmrg	tr >>= 16;
1672fe8aea9eSmrg	bl >>= 16;
1673fe8aea9eSmrg	br >>= 16;
1674fe8aea9eSmrg	r >>= 16;
1675fe8aea9eSmrg
1676fe8aea9eSmrg	/* Red */
1677fe8aea9eSmrg	f = ((tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy +
1678fe8aea9eSmrg	     (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy);
1679fe8aea9eSmrg	r |= f & 0x00ff0000;
1680fe8aea9eSmrg
1681fe8aea9eSmrg	/* Alpha */
1682fe8aea9eSmrg	f = ((tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy +
1683fe8aea9eSmrg	     (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy);
1684fe8aea9eSmrg	r |= f & 0xff000000;
1685fe8aea9eSmrg
1686fe8aea9eSmrg	return r;
1687fe8aea9eSmrg}
1688fe8aea9eSmrg#endif
1689fe8aea9eSmrg
/*
 * Load the x'th 32-bit pixel of the row that starts at p.
 * p is read through a uint32_t pointer, so it must be suitably aligned
 * for a 32-bit access.
 */
static inline uint32_t convert_pixel(const uint8_t *p, int x)
{
	const uint32_t *pixels = (const uint32_t *)p;

	return pixels[x];
}
1694fe8aea9eSmrg
1695fe8aea9eSmrgfast void
1696fe8aea9eSmrgaffine_blt(const void *src, void *dst, int bpp,
1697fe8aea9eSmrg	   int16_t src_x, int16_t src_y,
1698fe8aea9eSmrg	   int16_t src_width, int16_t src_height,
1699fe8aea9eSmrg	   int32_t src_stride,
1700fe8aea9eSmrg	   int16_t dst_x, int16_t dst_y,
1701fe8aea9eSmrg	   uint16_t dst_width, uint16_t dst_height,
1702fe8aea9eSmrg	   int32_t dst_stride,
1703fe8aea9eSmrg	   const struct pixman_f_transform *t)
1704fe8aea9eSmrg{
1705fe8aea9eSmrg	static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
1706fe8aea9eSmrg	const pixman_fixed_t ux = pixman_double_to_fixed(t->m[0][0]);
1707fe8aea9eSmrg	const pixman_fixed_t uy = pixman_double_to_fixed(t->m[1][0]);
1708fe8aea9eSmrg	int i, j;
1709fe8aea9eSmrg
1710fe8aea9eSmrg	assert(bpp == 32);
1711fe8aea9eSmrg
1712fe8aea9eSmrg	for (j = 0; j < dst_height; j++) {
1713fe8aea9eSmrg		pixman_fixed_t x, y;
1714fe8aea9eSmrg		struct pixman_f_vector v;
1715fe8aea9eSmrg		uint32_t *b;
1716fe8aea9eSmrg
1717fe8aea9eSmrg		/* reference point is the center of the pixel */
1718fe8aea9eSmrg		v.v[0] = dst_x + 0.5;
1719fe8aea9eSmrg		v.v[1] = dst_y + j + 0.5;
1720fe8aea9eSmrg		v.v[2] = 1.0;
1721fe8aea9eSmrg
1722fe8aea9eSmrg		pixman_f_transform_point_3d(t, &v);
1723fe8aea9eSmrg
1724fe8aea9eSmrg		x = pixman_double_to_fixed(v.v[0]);
1725fe8aea9eSmrg		x += pixman_int_to_fixed(src_x - dst_x);
1726fe8aea9eSmrg		y = pixman_double_to_fixed(v.v[1]);
1727fe8aea9eSmrg		y +=  pixman_int_to_fixed(src_y - dst_y);
1728fe8aea9eSmrg
1729fe8aea9eSmrg		b = (uint32_t*)((uint8_t *)dst + (dst_y + j) * dst_stride + dst_x * bpp / 8);
1730fe8aea9eSmrg		for (i = 0; i < dst_width; i++) {
1731fe8aea9eSmrg			const uint8_t *row1;
1732fe8aea9eSmrg			const uint8_t *row2;
1733fe8aea9eSmrg			int x1, y1, x2, y2;
1734fe8aea9eSmrg			uint32_t tl, tr, bl, br;
1735fe8aea9eSmrg			int32_t fx, fy;
1736fe8aea9eSmrg
1737fe8aea9eSmrg			x1 = x - pixman_fixed_1/2;
1738fe8aea9eSmrg			y1 = y - pixman_fixed_1/2;
1739fe8aea9eSmrg
1740fe8aea9eSmrg			fx = bilinear_weight(x1);
1741fe8aea9eSmrg			fy = bilinear_weight(y1);
1742fe8aea9eSmrg
1743fe8aea9eSmrg			x1 = pixman_fixed_to_int(x1);
1744fe8aea9eSmrg			x2 = x1 + 1;
1745fe8aea9eSmrg			y1 = pixman_fixed_to_int(y1);
1746fe8aea9eSmrg			y2 = y1 + 1;
1747fe8aea9eSmrg
1748fe8aea9eSmrg			if (x1 >= src_width  || x2 < 0 ||
1749fe8aea9eSmrg			    y1 >= src_height || y2 < 0) {
1750fe8aea9eSmrg				b[i] = 0;
1751fe8aea9eSmrg				goto next;
1752fe8aea9eSmrg			}
1753fe8aea9eSmrg
1754fe8aea9eSmrg			if (y2 == 0) {
1755fe8aea9eSmrg				row1 = zero;
1756fe8aea9eSmrg			} else {
1757fe8aea9eSmrg				row1 = (uint8_t *)src + src_stride * y1;
1758fe8aea9eSmrg				row1 += bpp / 8 * x1;
1759fe8aea9eSmrg			}
1760fe8aea9eSmrg
1761fe8aea9eSmrg			if (y1 == src_height - 1) {
1762fe8aea9eSmrg				row2 = zero;
1763fe8aea9eSmrg			} else {
1764fe8aea9eSmrg				row2 = (uint8_t *)src + src_stride * y2;
1765fe8aea9eSmrg				row2 += bpp / 8 * x1;
1766fe8aea9eSmrg			}
1767fe8aea9eSmrg
1768fe8aea9eSmrg			if (x2 == 0) {
1769fe8aea9eSmrg				tl = 0;
1770fe8aea9eSmrg				bl = 0;
1771fe8aea9eSmrg			} else {
1772fe8aea9eSmrg				tl = convert_pixel(row1, 0);
1773fe8aea9eSmrg				bl = convert_pixel(row2, 0);
1774fe8aea9eSmrg			}
1775fe8aea9eSmrg
1776fe8aea9eSmrg			if (x1 == src_width - 1) {
1777fe8aea9eSmrg				tr = 0;
1778fe8aea9eSmrg				br = 0;
1779fe8aea9eSmrg			} else {
1780fe8aea9eSmrg				tr = convert_pixel(row1, 1);
1781fe8aea9eSmrg				br = convert_pixel(row2, 1);
1782fe8aea9eSmrg			}
1783fe8aea9eSmrg
1784fe8aea9eSmrg			b[i] = bilinear_interpolation(tl, tr, bl, br, fx, fy);
1785fe8aea9eSmrg
1786fe8aea9eSmrgnext:
1787fe8aea9eSmrg			x += ux;
1788fe8aea9eSmrg			y += uy;
1789fe8aea9eSmrg		}
1790fe8aea9eSmrg	}
1791fe8aea9eSmrg}
1792