/*
 * Copyright (c) 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Chris Wilson <chris@chris-wilson.co.uk>
 *
 */
27428d7b3dSmrg
28428d7b3dSmrg#ifdef HAVE_CONFIG_H
29428d7b3dSmrg#include "config.h"
30428d7b3dSmrg#endif
31428d7b3dSmrg
32428d7b3dSmrg#include "sna.h"
33428d7b3dSmrg
34428d7b3dSmrg#if __x86_64__
35428d7b3dSmrg#define USE_SSE2 1
36428d7b3dSmrg#endif
37428d7b3dSmrg
38428d7b3dSmrg#if USE_SSE2
39428d7b3dSmrg#include <xmmintrin.h>
40428d7b3dSmrg
41428d7b3dSmrg#if __x86_64__
42428d7b3dSmrg#define have_sse2() 1
43428d7b3dSmrg#else
/* CPU capability flags produced by detect_cpu_features(). */
enum {
	MMX            = 1 << 0,
	MMX_EXTENSIONS = 1 << 1,
	SSE            = (1 << 2) | (1 << 1),	/* 0x6: shares the MMX_EXTENSIONS bit */
	SSE2           = 1 << 3,
	CMOV           = 1 << 4
};
51428d7b3dSmrg
52428d7b3dSmrg#ifdef __GNUC__
53428d7b3dSmrgstatic unsigned int
54428d7b3dSmrgdetect_cpu_features(void)
55428d7b3dSmrg{
56428d7b3dSmrg	unsigned int features;
57428d7b3dSmrg	unsigned int result = 0;
58428d7b3dSmrg
59428d7b3dSmrg	char vendor[13];
60428d7b3dSmrg	vendor[0] = 0;
61428d7b3dSmrg	vendor[12] = 0;
62428d7b3dSmrg
63428d7b3dSmrg	asm (
64428d7b3dSmrg	     "pushf\n"
65428d7b3dSmrg	     "pop %%eax\n"
66428d7b3dSmrg	     "mov %%eax, %%ecx\n"
67428d7b3dSmrg	     "xor $0x00200000, %%eax\n"
68428d7b3dSmrg	     "push %%eax\n"
69428d7b3dSmrg	     "popf\n"
70428d7b3dSmrg	     "pushf\n"
71428d7b3dSmrg	     "pop %%eax\n"
72428d7b3dSmrg	     "mov $0x0, %%edx\n"
73428d7b3dSmrg	     "xor %%ecx, %%eax\n"
74428d7b3dSmrg	     "jz 1f\n"
75428d7b3dSmrg
76428d7b3dSmrg	     "mov $0x00000000, %%eax\n"
77428d7b3dSmrg	     "push %%ebx\n"
78428d7b3dSmrg	     "cpuid\n"
79428d7b3dSmrg	     "mov %%ebx, %%eax\n"
80428d7b3dSmrg	     "pop %%ebx\n"
81428d7b3dSmrg	     "mov %%eax, %1\n"
82428d7b3dSmrg	     "mov %%edx, %2\n"
83428d7b3dSmrg	     "mov %%ecx, %3\n"
84428d7b3dSmrg	     "mov $0x00000001, %%eax\n"
85428d7b3dSmrg	     "push %%ebx\n"
86428d7b3dSmrg	     "cpuid\n"
87428d7b3dSmrg	     "pop %%ebx\n"
88428d7b3dSmrg	     "1:\n"
89428d7b3dSmrg	     "mov %%edx, %0\n"
90428d7b3dSmrg	     : "=r" (result), "=m" (vendor[0]), "=m" (vendor[4]), "=m" (vendor[8])
91428d7b3dSmrg	     :: "%eax", "%ecx", "%edx");
92428d7b3dSmrg
93428d7b3dSmrg	features = 0;
94428d7b3dSmrg	if (result) {
95428d7b3dSmrg		/* result now contains the standard feature bits */
96428d7b3dSmrg		if (result & (1 << 15))
97428d7b3dSmrg			features |= CMOV;
98428d7b3dSmrg		if (result & (1 << 23))
99428d7b3dSmrg			features |= MMX;
100428d7b3dSmrg		if (result & (1 << 25))
101428d7b3dSmrg			features |= SSE;
102428d7b3dSmrg		if (result & (1 << 26))
103428d7b3dSmrg			features |= SSE2;
104428d7b3dSmrg	}
105428d7b3dSmrg	return features;
106428d7b3dSmrg}
107428d7b3dSmrg#else
/* Fallback when __GNUC__ is not defined: no probe is done, no features reported. */
static unsigned int
detect_cpu_features(void)
{
	return 0;
}
109428d7b3dSmrg#endif
110428d7b3dSmrg
111428d7b3dSmrgstatic bool have_sse2(void)
112428d7b3dSmrg{
113428d7b3dSmrg	static int sse2_present = -1;
114428d7b3dSmrg
115428d7b3dSmrg	if (sse2_present == -1)
116428d7b3dSmrg		sse2_present = detect_cpu_features() & SSE2;
117428d7b3dSmrg
118428d7b3dSmrg	return sse2_present;
119428d7b3dSmrg}
120428d7b3dSmrg#endif
121428d7b3dSmrg
/* Build a 128-bit value whose four 32-bit lanes all hold @mask. */
static inline __m128i
xmm_create_mask_32(uint32_t mask)
{
	return _mm_set1_epi32(mask);
}
127428d7b3dSmrg
/* Fetch 128 bits from @src using the unaligned-load intrinsic. */
static inline __m128i
xmm_load_128u(const __m128i *src)
{
	const __m128i value = _mm_loadu_si128(src);

	return value;
}
133428d7b3dSmrg
/*
 * Store the 128-bit @data at @dst.  This is the aligned store: like
 * _mm_store_si128(), it expects @dst to be 16-byte aligned.
 */
static inline void
xmm_save_128(__m128i *dst, __m128i data)
{
	*dst = data;
}
139428d7b3dSmrg#endif
140428d7b3dSmrg
141428d7b3dSmrgfast void
142428d7b3dSmrgmemcpy_blt(const void *src, void *dst, int bpp,
143428d7b3dSmrg	   int32_t src_stride, int32_t dst_stride,
144428d7b3dSmrg	   int16_t src_x, int16_t src_y,
145428d7b3dSmrg	   int16_t dst_x, int16_t dst_y,
146428d7b3dSmrg	   uint16_t width, uint16_t height)
147428d7b3dSmrg{
148428d7b3dSmrg	const uint8_t *src_bytes;
149428d7b3dSmrg	uint8_t *dst_bytes;
150428d7b3dSmrg	int byte_width;
151428d7b3dSmrg
152428d7b3dSmrg	assert(src);
153428d7b3dSmrg	assert(dst);
154428d7b3dSmrg	assert(width && height);
155428d7b3dSmrg	assert(bpp >= 8);
156428d7b3dSmrg	assert(width*bpp <= 8*src_stride);
157428d7b3dSmrg	assert(width*bpp <= 8*dst_stride);
158428d7b3dSmrg
159428d7b3dSmrg	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
160428d7b3dSmrg	     __FUNCTION__, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
161428d7b3dSmrg
162428d7b3dSmrg	bpp /= 8;
163428d7b3dSmrg
164428d7b3dSmrg	src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp;
165428d7b3dSmrg	dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp;
166428d7b3dSmrg
167428d7b3dSmrg	byte_width = width * bpp;
168428d7b3dSmrg	if (byte_width == src_stride && byte_width == dst_stride) {
169428d7b3dSmrg		byte_width *= height;
170428d7b3dSmrg		height = 1;
171428d7b3dSmrg	}
172428d7b3dSmrg
173428d7b3dSmrg	switch (byte_width) {
174428d7b3dSmrg	case 1:
175428d7b3dSmrg		do {
176428d7b3dSmrg			*dst_bytes = *src_bytes;
177428d7b3dSmrg			src_bytes += src_stride;
178428d7b3dSmrg			dst_bytes += dst_stride;
179428d7b3dSmrg		} while (--height);
180428d7b3dSmrg		break;
181428d7b3dSmrg
182428d7b3dSmrg	case 2:
183428d7b3dSmrg		do {
184428d7b3dSmrg			*(uint16_t *)dst_bytes = *(const uint16_t *)src_bytes;
185428d7b3dSmrg			src_bytes += src_stride;
186428d7b3dSmrg			dst_bytes += dst_stride;
187428d7b3dSmrg		} while (--height);
188428d7b3dSmrg		break;
189428d7b3dSmrg
190428d7b3dSmrg	case 4:
191428d7b3dSmrg		do {
192428d7b3dSmrg			*(uint32_t *)dst_bytes = *(const uint32_t *)src_bytes;
193428d7b3dSmrg			src_bytes += src_stride;
194428d7b3dSmrg			dst_bytes += dst_stride;
195428d7b3dSmrg		} while (--height);
196428d7b3dSmrg		break;
197428d7b3dSmrg
198428d7b3dSmrg	case 8:
199428d7b3dSmrg		do {
200428d7b3dSmrg			*(uint64_t *)dst_bytes = *(const uint64_t *)src_bytes;
201428d7b3dSmrg			src_bytes += src_stride;
202428d7b3dSmrg			dst_bytes += dst_stride;
203428d7b3dSmrg		} while (--height);
204428d7b3dSmrg		break;
205428d7b3dSmrg	case 16:
206428d7b3dSmrg		do {
207428d7b3dSmrg			((uint64_t *)dst_bytes)[0] = ((const uint64_t *)src_bytes)[0];
208428d7b3dSmrg			((uint64_t *)dst_bytes)[1] = ((const uint64_t *)src_bytes)[1];
209428d7b3dSmrg			src_bytes += src_stride;
210428d7b3dSmrg			dst_bytes += dst_stride;
211428d7b3dSmrg		} while (--height);
212428d7b3dSmrg		break;
213428d7b3dSmrg
214428d7b3dSmrg	default:
215428d7b3dSmrg		do {
216428d7b3dSmrg			memcpy(dst_bytes, src_bytes, byte_width);
217428d7b3dSmrg			src_bytes += src_stride;
218428d7b3dSmrg			dst_bytes += dst_stride;
219428d7b3dSmrg		} while (--height);
220428d7b3dSmrg		break;
221428d7b3dSmrg	}
222428d7b3dSmrg}
223428d7b3dSmrg
224428d7b3dSmrgstatic fast_memcpy void
225428d7b3dSmrgmemcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
226428d7b3dSmrg			     int32_t src_stride, int32_t dst_stride,
227428d7b3dSmrg			     int16_t src_x, int16_t src_y,
228428d7b3dSmrg			     int16_t dst_x, int16_t dst_y,
229428d7b3dSmrg			     uint16_t width, uint16_t height)
230428d7b3dSmrg{
231428d7b3dSmrg	const unsigned tile_width = 512;
232428d7b3dSmrg	const unsigned tile_height = 8;
233428d7b3dSmrg	const unsigned tile_size = 4096;
234428d7b3dSmrg
235428d7b3dSmrg	const unsigned cpp = bpp / 8;
236428d7b3dSmrg	const unsigned tile_pixels = tile_width / cpp;
237428d7b3dSmrg	const unsigned tile_shift = ffs(tile_pixels) - 1;
238428d7b3dSmrg	const unsigned tile_mask = tile_pixels - 1;
239428d7b3dSmrg
240428d7b3dSmrg	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
241428d7b3dSmrg	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
242428d7b3dSmrg	assert(src != dst);
243428d7b3dSmrg
244428d7b3dSmrg	if (src_x | src_y)
245428d7b3dSmrg		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
246428d7b3dSmrg	assert(src_stride >= width * cpp);
247428d7b3dSmrg	src_stride -= width * cpp;
248428d7b3dSmrg
249428d7b3dSmrg	while (height--) {
250428d7b3dSmrg		unsigned w = width * cpp;
251428d7b3dSmrg		uint8_t *tile_row = dst;
252428d7b3dSmrg
253428d7b3dSmrg		tile_row += dst_y / tile_height * dst_stride * tile_height;
254428d7b3dSmrg		tile_row += (dst_y & (tile_height-1)) * tile_width;
255428d7b3dSmrg		if (dst_x) {
256428d7b3dSmrg			tile_row += (dst_x >> tile_shift) * tile_size;
257428d7b3dSmrg			if (dst_x & tile_mask) {
258428d7b3dSmrg				const unsigned x = (dst_x & tile_mask) * cpp;
259428d7b3dSmrg				const unsigned len = min(tile_width - x, w);
260428d7b3dSmrg				memcpy(tile_row + x, src, len);
261428d7b3dSmrg
262428d7b3dSmrg				tile_row += tile_size;
263428d7b3dSmrg				src = (const uint8_t *)src + len;
264428d7b3dSmrg				w -= len;
265428d7b3dSmrg			}
266428d7b3dSmrg		}
267428d7b3dSmrg		while (w >= tile_width) {
268428d7b3dSmrg			memcpy(tile_row, src, tile_width);
269428d7b3dSmrg
270428d7b3dSmrg			tile_row += tile_size;
271428d7b3dSmrg			src = (const uint8_t *)src + tile_width;
272428d7b3dSmrg			w -= tile_width;
273428d7b3dSmrg		}
274428d7b3dSmrg		memcpy(tile_row, src, w);
275428d7b3dSmrg		src = (const uint8_t *)src + src_stride + w;
276428d7b3dSmrg		dst_y++;
277428d7b3dSmrg	}
278428d7b3dSmrg}
279428d7b3dSmrg
280428d7b3dSmrgstatic fast_memcpy void
281428d7b3dSmrgmemcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
282428d7b3dSmrg			       int32_t src_stride, int32_t dst_stride,
283428d7b3dSmrg			       int16_t src_x, int16_t src_y,
284428d7b3dSmrg			       int16_t dst_x, int16_t dst_y,
285428d7b3dSmrg			       uint16_t width, uint16_t height)
286428d7b3dSmrg{
287428d7b3dSmrg	const unsigned tile_width = 512;
288428d7b3dSmrg	const unsigned tile_height = 8;
289428d7b3dSmrg	const unsigned tile_size = 4096;
290428d7b3dSmrg
291428d7b3dSmrg	const unsigned cpp = bpp / 8;
292428d7b3dSmrg	const unsigned tile_pixels = tile_width / cpp;
293428d7b3dSmrg	const unsigned tile_shift = ffs(tile_pixels) - 1;
294428d7b3dSmrg	const unsigned tile_mask = tile_pixels - 1;
295428d7b3dSmrg
296428d7b3dSmrg	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
297428d7b3dSmrg	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
298428d7b3dSmrg	assert(src != dst);
299428d7b3dSmrg
300428d7b3dSmrg	if (dst_x | dst_y)
301428d7b3dSmrg		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
302428d7b3dSmrg	assert(dst_stride >= width * cpp);
303428d7b3dSmrg	dst_stride -= width * cpp;
304428d7b3dSmrg
305428d7b3dSmrg	while (height--) {
306428d7b3dSmrg		unsigned w = width * cpp;
307428d7b3dSmrg		const uint8_t *tile_row = src;
308428d7b3dSmrg
309428d7b3dSmrg		tile_row += src_y / tile_height * src_stride * tile_height;
310428d7b3dSmrg		tile_row += (src_y & (tile_height-1)) * tile_width;
311428d7b3dSmrg		if (src_x) {
312428d7b3dSmrg			tile_row += (src_x >> tile_shift) * tile_size;
313428d7b3dSmrg			if (src_x & tile_mask) {
314428d7b3dSmrg				const unsigned x = (src_x & tile_mask) * cpp;
315428d7b3dSmrg				const unsigned len = min(tile_width - x, w);
316428d7b3dSmrg				memcpy(dst, tile_row + x, len);
317428d7b3dSmrg
318428d7b3dSmrg				tile_row += tile_size;
319428d7b3dSmrg				dst = (uint8_t *)dst + len;
320428d7b3dSmrg				w -= len;
321428d7b3dSmrg			}
322428d7b3dSmrg		}
323428d7b3dSmrg		while (w >= tile_width) {
324428d7b3dSmrg			memcpy(dst, tile_row, tile_width);
325428d7b3dSmrg
326428d7b3dSmrg			tile_row += tile_size;
327428d7b3dSmrg			dst = (uint8_t *)dst + tile_width;
328428d7b3dSmrg			w -= tile_width;
329428d7b3dSmrg		}
330428d7b3dSmrg		memcpy(dst, tile_row, w);
331428d7b3dSmrg		dst = (uint8_t *)dst + dst_stride + w;
332428d7b3dSmrg		src_y++;
333428d7b3dSmrg	}
334428d7b3dSmrg}
335428d7b3dSmrg
336428d7b3dSmrgfast_memcpy static void
337428d7b3dSmrgmemcpy_to_tiled_x__swizzle_9(const void *src, void *dst, int bpp,
338428d7b3dSmrg			     int32_t src_stride, int32_t dst_stride,
339428d7b3dSmrg			     int16_t src_x, int16_t src_y,
340428d7b3dSmrg			     int16_t dst_x, int16_t dst_y,
341428d7b3dSmrg			     uint16_t width, uint16_t height)
342428d7b3dSmrg{
343428d7b3dSmrg	const unsigned tile_width = 512;
344428d7b3dSmrg	const unsigned tile_height = 8;
345428d7b3dSmrg	const unsigned tile_size = 4096;
346428d7b3dSmrg
347428d7b3dSmrg	const unsigned cpp = bpp / 8;
348428d7b3dSmrg	const unsigned stride_tiles = dst_stride / tile_width;
349428d7b3dSmrg	const unsigned swizzle_pixels = 64 / cpp;
350428d7b3dSmrg	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
351428d7b3dSmrg	const unsigned tile_mask = (1 << tile_pixels) - 1;
352428d7b3dSmrg
353428d7b3dSmrg	unsigned x, y;
354428d7b3dSmrg
355428d7b3dSmrg	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
356428d7b3dSmrg	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
357428d7b3dSmrg
358428d7b3dSmrg	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
359428d7b3dSmrg
360428d7b3dSmrg	for (y = 0; y < height; ++y) {
361428d7b3dSmrg		const uint32_t dy = y + dst_y;
362428d7b3dSmrg		const uint32_t tile_row =
363428d7b3dSmrg			(dy / tile_height * stride_tiles * tile_size +
364428d7b3dSmrg			 (dy & (tile_height-1)) * tile_width);
365428d7b3dSmrg		const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
366428d7b3dSmrg		uint32_t dx = dst_x, offset;
367428d7b3dSmrg
368428d7b3dSmrg		x = width * cpp;
369428d7b3dSmrg		if (dx & (swizzle_pixels - 1)) {
370428d7b3dSmrg			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
371428d7b3dSmrg			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
372428d7b3dSmrg			offset = tile_row +
373428d7b3dSmrg				(dx >> tile_pixels) * tile_size +
374428d7b3dSmrg				(dx & tile_mask) * cpp;
375428d7b3dSmrg			offset ^= (offset >> 3) & 64;
376428d7b3dSmrg
377428d7b3dSmrg			memcpy((char *)dst + offset, src_row, length * cpp);
378428d7b3dSmrg
379428d7b3dSmrg			src_row += length * cpp;
380428d7b3dSmrg			x -= length * cpp;
381428d7b3dSmrg			dx += length;
382428d7b3dSmrg		}
383428d7b3dSmrg		while (x >= 64) {
384428d7b3dSmrg			offset = tile_row +
385428d7b3dSmrg				(dx >> tile_pixels) * tile_size +
386428d7b3dSmrg				(dx & tile_mask) * cpp;
387428d7b3dSmrg			offset ^= (offset >> 3) & 64;
388428d7b3dSmrg
389428d7b3dSmrg			memcpy((char *)dst + offset, src_row, 64);
390428d7b3dSmrg
391428d7b3dSmrg			src_row += 64;
392428d7b3dSmrg			x -= 64;
393428d7b3dSmrg			dx += swizzle_pixels;
394428d7b3dSmrg		}
395428d7b3dSmrg		if (x) {
396428d7b3dSmrg			offset = tile_row +
397428d7b3dSmrg				(dx >> tile_pixels) * tile_size +
398428d7b3dSmrg				(dx & tile_mask) * cpp;
399428d7b3dSmrg			offset ^= (offset >> 3) & 64;
400428d7b3dSmrg			memcpy((char *)dst + offset, src_row, x);
401428d7b3dSmrg		}
402428d7b3dSmrg	}
403428d7b3dSmrg}
404428d7b3dSmrg
405428d7b3dSmrgfast_memcpy static void
406428d7b3dSmrgmemcpy_from_tiled_x__swizzle_9(const void *src, void *dst, int bpp,
407428d7b3dSmrg			       int32_t src_stride, int32_t dst_stride,
408428d7b3dSmrg			       int16_t src_x, int16_t src_y,
409428d7b3dSmrg			       int16_t dst_x, int16_t dst_y,
410428d7b3dSmrg			       uint16_t width, uint16_t height)
411428d7b3dSmrg{
412428d7b3dSmrg	const unsigned tile_width = 512;
413428d7b3dSmrg	const unsigned tile_height = 8;
414428d7b3dSmrg	const unsigned tile_size = 4096;
415428d7b3dSmrg
416428d7b3dSmrg	const unsigned cpp = bpp / 8;
417428d7b3dSmrg	const unsigned stride_tiles = src_stride / tile_width;
418428d7b3dSmrg	const unsigned swizzle_pixels = 64 / cpp;
419428d7b3dSmrg	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
420428d7b3dSmrg	const unsigned tile_mask = (1 << tile_pixels) - 1;
421428d7b3dSmrg
422428d7b3dSmrg	unsigned x, y;
423428d7b3dSmrg
424428d7b3dSmrg	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
425428d7b3dSmrg	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
426428d7b3dSmrg
427428d7b3dSmrg	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
428428d7b3dSmrg
429428d7b3dSmrg	for (y = 0; y < height; ++y) {
430428d7b3dSmrg		const uint32_t sy = y + src_y;
431428d7b3dSmrg		const uint32_t tile_row =
432428d7b3dSmrg			(sy / tile_height * stride_tiles * tile_size +
433428d7b3dSmrg			 (sy & (tile_height-1)) * tile_width);
434428d7b3dSmrg		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y;
435428d7b3dSmrg		uint32_t sx = src_x, offset;
436428d7b3dSmrg
437428d7b3dSmrg		x = width * cpp;
438428d7b3dSmrg		if (sx & (swizzle_pixels - 1)) {
439428d7b3dSmrg			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels);
440428d7b3dSmrg			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx;
441428d7b3dSmrg			offset = tile_row +
442428d7b3dSmrg				(sx >> tile_pixels) * tile_size +
443428d7b3dSmrg				(sx & tile_mask) * cpp;
444428d7b3dSmrg			offset ^= (offset >> 3) & 64;
445428d7b3dSmrg
446428d7b3dSmrg			memcpy(dst_row, (const char *)src + offset, length * cpp);
447428d7b3dSmrg
448428d7b3dSmrg			dst_row += length * cpp;
449428d7b3dSmrg			x -= length * cpp;
450428d7b3dSmrg			sx += length;
451428d7b3dSmrg		}
452428d7b3dSmrg		while (x >= 64) {
453428d7b3dSmrg			offset = tile_row +
454428d7b3dSmrg				(sx >> tile_pixels) * tile_size +
455428d7b3dSmrg				(sx & tile_mask) * cpp;
456428d7b3dSmrg			offset ^= (offset >> 3) & 64;
457428d7b3dSmrg
458428d7b3dSmrg			memcpy(dst_row, (const char *)src + offset, 64);
459428d7b3dSmrg
460428d7b3dSmrg			dst_row += 64;
461428d7b3dSmrg			x -= 64;
462428d7b3dSmrg			sx += swizzle_pixels;
463428d7b3dSmrg		}
464428d7b3dSmrg		if (x) {
465428d7b3dSmrg			offset = tile_row +
466428d7b3dSmrg				(sx >> tile_pixels) * tile_size +
467428d7b3dSmrg				(sx & tile_mask) * cpp;
468428d7b3dSmrg			offset ^= (offset >> 3) & 64;
469428d7b3dSmrg			memcpy(dst_row, (const char *)src + offset, x);
470428d7b3dSmrg		}
471428d7b3dSmrg	}
472428d7b3dSmrg}
473428d7b3dSmrg
474428d7b3dSmrgfast_memcpy static void
475428d7b3dSmrgmemcpy_to_tiled_x__swizzle_9_10(const void *src, void *dst, int bpp,
476428d7b3dSmrg				int32_t src_stride, int32_t dst_stride,
477428d7b3dSmrg				int16_t src_x, int16_t src_y,
478428d7b3dSmrg				int16_t dst_x, int16_t dst_y,
479428d7b3dSmrg				uint16_t width, uint16_t height)
480428d7b3dSmrg{
481428d7b3dSmrg	const unsigned tile_width = 512;
482428d7b3dSmrg	const unsigned tile_height = 8;
483428d7b3dSmrg	const unsigned tile_size = 4096;
484428d7b3dSmrg
485428d7b3dSmrg	const unsigned cpp = bpp / 8;
486428d7b3dSmrg	const unsigned stride_tiles = dst_stride / tile_width;
487428d7b3dSmrg	const unsigned swizzle_pixels = 64 / cpp;
488428d7b3dSmrg	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
489428d7b3dSmrg	const unsigned tile_mask = (1 << tile_pixels) - 1;
490428d7b3dSmrg
491428d7b3dSmrg	unsigned x, y;
492428d7b3dSmrg
493428d7b3dSmrg	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
494428d7b3dSmrg	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
495428d7b3dSmrg
496428d7b3dSmrg	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
497428d7b3dSmrg
498428d7b3dSmrg	for (y = 0; y < height; ++y) {
499428d7b3dSmrg		const uint32_t dy = y + dst_y;
500428d7b3dSmrg		const uint32_t tile_row =
501428d7b3dSmrg			(dy / tile_height * stride_tiles * tile_size +
502428d7b3dSmrg			 (dy & (tile_height-1)) * tile_width);
503428d7b3dSmrg		const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
504428d7b3dSmrg		uint32_t dx = dst_x, offset;
505428d7b3dSmrg
506428d7b3dSmrg		x = width * cpp;
507428d7b3dSmrg		if (dx & (swizzle_pixels - 1)) {
508428d7b3dSmrg			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
509428d7b3dSmrg			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
510428d7b3dSmrg			offset = tile_row +
511428d7b3dSmrg				(dx >> tile_pixels) * tile_size +
512428d7b3dSmrg				(dx & tile_mask) * cpp;
513428d7b3dSmrg			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
514428d7b3dSmrg
515428d7b3dSmrg			memcpy((char *)dst + offset, src_row, length * cpp);
516428d7b3dSmrg
517428d7b3dSmrg			src_row += length * cpp;
518428d7b3dSmrg			x -= length * cpp;
519428d7b3dSmrg			dx += length;
520428d7b3dSmrg		}
521428d7b3dSmrg		while (x >= 64) {
522428d7b3dSmrg			offset = tile_row +
523428d7b3dSmrg				(dx >> tile_pixels) * tile_size +
524428d7b3dSmrg				(dx & tile_mask) * cpp;
525428d7b3dSmrg			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
526428d7b3dSmrg
527428d7b3dSmrg			memcpy((char *)dst + offset, src_row, 64);
528428d7b3dSmrg
529428d7b3dSmrg			src_row += 64;
530428d7b3dSmrg			x -= 64;
531428d7b3dSmrg			dx += swizzle_pixels;
532428d7b3dSmrg		}
533428d7b3dSmrg		if (x) {
534428d7b3dSmrg			offset = tile_row +
535428d7b3dSmrg				(dx >> tile_pixels) * tile_size +
536428d7b3dSmrg				(dx & tile_mask) * cpp;
537428d7b3dSmrg			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
538428d7b3dSmrg			memcpy((char *)dst + offset, src_row, x);
539428d7b3dSmrg		}
540428d7b3dSmrg	}
541428d7b3dSmrg}
542428d7b3dSmrg
543428d7b3dSmrgfast_memcpy static void
544428d7b3dSmrgmemcpy_from_tiled_x__swizzle_9_10(const void *src, void *dst, int bpp,
545428d7b3dSmrg				  int32_t src_stride, int32_t dst_stride,
546428d7b3dSmrg				  int16_t src_x, int16_t src_y,
547428d7b3dSmrg				  int16_t dst_x, int16_t dst_y,
548428d7b3dSmrg				  uint16_t width, uint16_t height)
549428d7b3dSmrg{
550428d7b3dSmrg	const unsigned tile_width = 512;
551428d7b3dSmrg	const unsigned tile_height = 8;
552428d7b3dSmrg	const unsigned tile_size = 4096;
553428d7b3dSmrg
554428d7b3dSmrg	const unsigned cpp = bpp / 8;
555428d7b3dSmrg	const unsigned stride_tiles = src_stride / tile_width;
556428d7b3dSmrg	const unsigned swizzle_pixels = 64 / cpp;
557428d7b3dSmrg	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
558428d7b3dSmrg	const unsigned tile_mask = (1 << tile_pixels) - 1;
559428d7b3dSmrg
560428d7b3dSmrg	unsigned x, y;
561428d7b3dSmrg
562428d7b3dSmrg	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
563428d7b3dSmrg	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
564428d7b3dSmrg
565428d7b3dSmrg	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
566428d7b3dSmrg
567428d7b3dSmrg	for (y = 0; y < height; ++y) {
568428d7b3dSmrg		const uint32_t sy = y + src_y;
569428d7b3dSmrg		const uint32_t tile_row =
570428d7b3dSmrg			(sy / tile_height * stride_tiles * tile_size +
571428d7b3dSmrg			 (sy & (tile_height-1)) * tile_width);
572428d7b3dSmrg		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y;
573428d7b3dSmrg		uint32_t sx = src_x, offset;
574428d7b3dSmrg
575428d7b3dSmrg		x = width * cpp;
576428d7b3dSmrg		if (sx & (swizzle_pixels - 1)) {
577428d7b3dSmrg			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels);
578428d7b3dSmrg			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx;
579428d7b3dSmrg			offset = tile_row +
580428d7b3dSmrg				(sx >> tile_pixels) * tile_size +
581428d7b3dSmrg				(sx & tile_mask) * cpp;
582428d7b3dSmrg			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
583428d7b3dSmrg
584428d7b3dSmrg			memcpy(dst_row, (const char *)src + offset, length * cpp);
585428d7b3dSmrg
586428d7b3dSmrg			dst_row += length * cpp;
587428d7b3dSmrg			x -= length * cpp;
588428d7b3dSmrg			sx += length;
589428d7b3dSmrg		}
590428d7b3dSmrg		while (x >= 64) {
591428d7b3dSmrg			offset = tile_row +
592428d7b3dSmrg				(sx >> tile_pixels) * tile_size +
593428d7b3dSmrg				(sx & tile_mask) * cpp;
594428d7b3dSmrg			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
595428d7b3dSmrg
596428d7b3dSmrg			memcpy(dst_row, (const char *)src + offset, 64);
597428d7b3dSmrg
598428d7b3dSmrg			dst_row += 64;
599428d7b3dSmrg			x -= 64;
600428d7b3dSmrg			sx += swizzle_pixels;
601428d7b3dSmrg		}
602428d7b3dSmrg		if (x) {
603428d7b3dSmrg			offset = tile_row +
604428d7b3dSmrg				(sx >> tile_pixels) * tile_size +
605428d7b3dSmrg				(sx & tile_mask) * cpp;
606428d7b3dSmrg			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
607428d7b3dSmrg			memcpy(dst_row, (const char *)src + offset, x);
608428d7b3dSmrg		}
609428d7b3dSmrg	}
610428d7b3dSmrg}
611428d7b3dSmrg
612428d7b3dSmrgfast_memcpy static void
613428d7b3dSmrgmemcpy_to_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp,
614428d7b3dSmrg				int32_t src_stride, int32_t dst_stride,
615428d7b3dSmrg				int16_t src_x, int16_t src_y,
616428d7b3dSmrg				int16_t dst_x, int16_t dst_y,
617428d7b3dSmrg				uint16_t width, uint16_t height)
618428d7b3dSmrg{
619428d7b3dSmrg	const unsigned tile_width = 512;
620428d7b3dSmrg	const unsigned tile_height = 8;
621428d7b3dSmrg	const unsigned tile_size = 4096;
622428d7b3dSmrg
623428d7b3dSmrg	const unsigned cpp = bpp / 8;
624428d7b3dSmrg	const unsigned stride_tiles = dst_stride / tile_width;
625428d7b3dSmrg	const unsigned swizzle_pixels = 64 / cpp;
626428d7b3dSmrg	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
627428d7b3dSmrg	const unsigned tile_mask = (1 << tile_pixels) - 1;
628428d7b3dSmrg
629428d7b3dSmrg	unsigned x, y;
630428d7b3dSmrg
631428d7b3dSmrg	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
632428d7b3dSmrg	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
633428d7b3dSmrg
634428d7b3dSmrg	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
635428d7b3dSmrg
636428d7b3dSmrg	for (y = 0; y < height; ++y) {
637428d7b3dSmrg		const uint32_t dy = y + dst_y;
638428d7b3dSmrg		const uint32_t tile_row =
639428d7b3dSmrg			(dy / tile_height * stride_tiles * tile_size +
640428d7b3dSmrg			 (dy & (tile_height-1)) * tile_width);
641428d7b3dSmrg		const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
642428d7b3dSmrg		uint32_t dx = dst_x, offset;
643428d7b3dSmrg
644428d7b3dSmrg		x = width * cpp;
645428d7b3dSmrg		if (dx & (swizzle_pixels - 1)) {
646428d7b3dSmrg			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
647428d7b3dSmrg			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
648428d7b3dSmrg			offset = tile_row +
649428d7b3dSmrg				(dx >> tile_pixels) * tile_size +
650428d7b3dSmrg				(dx & tile_mask) * cpp;
651428d7b3dSmrg			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
652428d7b3dSmrg			memcpy((char *)dst + offset, src_row, length * cpp);
653428d7b3dSmrg
654428d7b3dSmrg			src_row += length * cpp;
655428d7b3dSmrg			x -= length * cpp;
656428d7b3dSmrg			dx += length;
657428d7b3dSmrg		}
658428d7b3dSmrg		while (x >= 64) {
659428d7b3dSmrg			offset = tile_row +
660428d7b3dSmrg				(dx >> tile_pixels) * tile_size +
661428d7b3dSmrg				(dx & tile_mask) * cpp;
662428d7b3dSmrg			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
663428d7b3dSmrg
664428d7b3dSmrg			memcpy((char *)dst + offset, src_row, 64);
665428d7b3dSmrg
666428d7b3dSmrg			src_row += 64;
667428d7b3dSmrg			x -= 64;
668428d7b3dSmrg			dx += swizzle_pixels;
669428d7b3dSmrg		}
670428d7b3dSmrg		if (x) {
671428d7b3dSmrg			offset = tile_row +
672428d7b3dSmrg				(dx >> tile_pixels) * tile_size +
673428d7b3dSmrg				(dx & tile_mask) * cpp;
674428d7b3dSmrg			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
675428d7b3dSmrg			memcpy((char *)dst + offset, src_row, x);
676428d7b3dSmrg		}
677428d7b3dSmrg	}
678428d7b3dSmrg}
679428d7b3dSmrg
680428d7b3dSmrgfast_memcpy static void
681428d7b3dSmrgmemcpy_from_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp,
682428d7b3dSmrg				  int32_t src_stride, int32_t dst_stride,
683428d7b3dSmrg				  int16_t src_x, int16_t src_y,
684428d7b3dSmrg				  int16_t dst_x, int16_t dst_y,
685428d7b3dSmrg				  uint16_t width, uint16_t height)
686428d7b3dSmrg{
687428d7b3dSmrg	const unsigned tile_width = 512;
688428d7b3dSmrg	const unsigned tile_height = 8;
689428d7b3dSmrg	const unsigned tile_size = 4096;
690428d7b3dSmrg
691428d7b3dSmrg	const unsigned cpp = bpp / 8;
692428d7b3dSmrg	const unsigned stride_tiles = src_stride / tile_width;
693428d7b3dSmrg	const unsigned swizzle_pixels = 64 / cpp;
694428d7b3dSmrg	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
695428d7b3dSmrg	const unsigned tile_mask = (1 << tile_pixels) - 1;
696428d7b3dSmrg
697428d7b3dSmrg	unsigned x, y;
698428d7b3dSmrg
699428d7b3dSmrg	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
700428d7b3dSmrg	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
701428d7b3dSmrg
702428d7b3dSmrg	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
703428d7b3dSmrg
704428d7b3dSmrg	for (y = 0; y < height; ++y) {
705428d7b3dSmrg		const uint32_t sy = y + src_y;
706428d7b3dSmrg		const uint32_t tile_row =
707428d7b3dSmrg			(sy / tile_height * stride_tiles * tile_size +
708428d7b3dSmrg			 (sy & (tile_height-1)) * tile_width);
709428d7b3dSmrg		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y;
710428d7b3dSmrg		uint32_t sx = src_x, offset;
711428d7b3dSmrg
712428d7b3dSmrg		x = width * cpp;
713428d7b3dSmrg		if (sx & (swizzle_pixels - 1)) {
714428d7b3dSmrg			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels);
715428d7b3dSmrg			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx;
716428d7b3dSmrg			offset = tile_row +
717428d7b3dSmrg				(sx >> tile_pixels) * tile_size +
718428d7b3dSmrg				(sx & tile_mask) * cpp;
719428d7b3dSmrg			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
720428d7b3dSmrg			memcpy(dst_row, (const char *)src + offset, length * cpp);
721428d7b3dSmrg
722428d7b3dSmrg			dst_row += length * cpp;
723428d7b3dSmrg			x -= length * cpp;
724428d7b3dSmrg			sx += length;
725428d7b3dSmrg		}
726428d7b3dSmrg		while (x >= 64) {
727428d7b3dSmrg			offset = tile_row +
728428d7b3dSmrg				(sx >> tile_pixels) * tile_size +
729428d7b3dSmrg				(sx & tile_mask) * cpp;
730428d7b3dSmrg			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
731428d7b3dSmrg
732428d7b3dSmrg			memcpy(dst_row, (const char *)src + offset, 64);
733428d7b3dSmrg
734428d7b3dSmrg			dst_row += 64;
735428d7b3dSmrg			x -= 64;
736428d7b3dSmrg			sx += swizzle_pixels;
737428d7b3dSmrg		}
738428d7b3dSmrg		if (x) {
739428d7b3dSmrg			offset = tile_row +
740428d7b3dSmrg				(sx >> tile_pixels) * tile_size +
741428d7b3dSmrg				(sx & tile_mask) * cpp;
742428d7b3dSmrg			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
743428d7b3dSmrg			memcpy(dst_row, (const char *)src + offset, x);
744428d7b3dSmrg		}
745428d7b3dSmrg	}
746428d7b3dSmrg}
747428d7b3dSmrg
748428d7b3dSmrgvoid choose_memcpy_tiled_x(struct kgem *kgem, int swizzling)
749428d7b3dSmrg{
750428d7b3dSmrg	switch (swizzling) {
751428d7b3dSmrg	default:
752428d7b3dSmrg		DBG(("%s: unknown swizzling, %d\n", __FUNCTION__, swizzling));
753428d7b3dSmrg		break;
754428d7b3dSmrg	case I915_BIT_6_SWIZZLE_NONE:
755428d7b3dSmrg		DBG(("%s: no swizzling\n", __FUNCTION__));
756428d7b3dSmrg		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0;
757428d7b3dSmrg		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0;
758428d7b3dSmrg		break;
759428d7b3dSmrg	case I915_BIT_6_SWIZZLE_9:
760428d7b3dSmrg		DBG(("%s: 6^9 swizzling\n", __FUNCTION__));
761428d7b3dSmrg		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9;
762428d7b3dSmrg		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9;
763428d7b3dSmrg		break;
764428d7b3dSmrg	case I915_BIT_6_SWIZZLE_9_10:
765428d7b3dSmrg		DBG(("%s: 6^9^10 swizzling\n", __FUNCTION__));
766428d7b3dSmrg		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10;
767428d7b3dSmrg		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_10;
768428d7b3dSmrg		break;
769428d7b3dSmrg	case I915_BIT_6_SWIZZLE_9_11:
770428d7b3dSmrg		DBG(("%s: 6^9^11 swizzling\n", __FUNCTION__));
771428d7b3dSmrg		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_11;
772428d7b3dSmrg		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_11;
773428d7b3dSmrg		break;
774428d7b3dSmrg	}
775428d7b3dSmrg}
776428d7b3dSmrg
777428d7b3dSmrgvoid
778428d7b3dSmrgmemmove_box(const void *src, void *dst,
779428d7b3dSmrg	    int bpp, int32_t stride,
780428d7b3dSmrg	    const BoxRec *box,
781428d7b3dSmrg	    int dx, int dy)
782428d7b3dSmrg{
783428d7b3dSmrg#define FORCE_MEMMOVE 0
784428d7b3dSmrg	union {
785428d7b3dSmrg		uint8_t u8;
786428d7b3dSmrg		uint16_t u16;
787428d7b3dSmrg		uint32_t u32;
788428d7b3dSmrg		uint64_t u64;
789428d7b3dSmrg	} tmp;
790428d7b3dSmrg	const uint8_t *src_bytes;
791428d7b3dSmrg	uint8_t *dst_bytes;
792428d7b3dSmrg	int width, height;
793428d7b3dSmrg
794428d7b3dSmrg	assert(src);
795428d7b3dSmrg	assert(dst);
796428d7b3dSmrg	assert(src != dst);
797428d7b3dSmrg	assert(bpp >= 8);
798428d7b3dSmrg	assert(box->x2 > box->x1);
799428d7b3dSmrg	assert(box->y2 > box->y1);
800428d7b3dSmrg
801428d7b3dSmrg	DBG(("%s: box=(%d, %d), (%d, %d), pitch=%d, bpp=%d, dx=%d, dy=%d\n",
802428d7b3dSmrg	     __FUNCTION__,
803428d7b3dSmrg	     box->x1, box->y1, box->x2, box->y2,
804428d7b3dSmrg	     stride, bpp, dx, dy));
805428d7b3dSmrg
806428d7b3dSmrg	bpp /= 8;
807428d7b3dSmrg	width = box->y1 * stride + box->x1 * bpp;
808428d7b3dSmrg	src_bytes = (const uint8_t *)src + width;
809428d7b3dSmrg	dst_bytes = (uint8_t *)dst + width;
810428d7b3dSmrg	assert(dst_bytes != src_bytes);
811428d7b3dSmrg
812428d7b3dSmrg	width = (box->x2 - box->x1) * bpp;
813428d7b3dSmrg	height = (box->y2 - box->y1);
814428d7b3dSmrg	assert(width <= stride);
815428d7b3dSmrg	if (width == stride) {
816428d7b3dSmrg		width *= height;
817428d7b3dSmrg		height = 1;
818428d7b3dSmrg	}
819428d7b3dSmrg
820428d7b3dSmrg	if (dy >= 0) {
821428d7b3dSmrg		switch (width) {
822428d7b3dSmrg		case 1:
823428d7b3dSmrg			do {
824428d7b3dSmrg				*dst_bytes = tmp.u8 = *src_bytes;
825428d7b3dSmrg				src_bytes += stride;
826428d7b3dSmrg				dst_bytes += stride;
827428d7b3dSmrg			} while (--height);
828428d7b3dSmrg			break;
829428d7b3dSmrg
830428d7b3dSmrg		case 2:
831428d7b3dSmrg			do {
832428d7b3dSmrg				*(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes;
833428d7b3dSmrg				src_bytes += stride;
834428d7b3dSmrg				dst_bytes += stride;
835428d7b3dSmrg			} while (--height);
836428d7b3dSmrg			break;
837428d7b3dSmrg
838428d7b3dSmrg		case 4:
839428d7b3dSmrg			do {
840428d7b3dSmrg				*(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes;
841428d7b3dSmrg				src_bytes += stride;
842428d7b3dSmrg				dst_bytes += stride;
843428d7b3dSmrg			} while (--height);
844428d7b3dSmrg			break;
845428d7b3dSmrg
846428d7b3dSmrg		case 8:
847428d7b3dSmrg			do {
848428d7b3dSmrg				*(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes;
849428d7b3dSmrg				src_bytes += stride;
850428d7b3dSmrg				dst_bytes += stride;
851428d7b3dSmrg			} while (--height);
852428d7b3dSmrg			break;
853428d7b3dSmrg
854428d7b3dSmrg		default:
855428d7b3dSmrg			if (FORCE_MEMMOVE ||
856428d7b3dSmrg			    (dst_bytes < src_bytes + width &&
857428d7b3dSmrg			     src_bytes < dst_bytes + width)) {
858428d7b3dSmrg				do {
859428d7b3dSmrg					memmove(dst_bytes, src_bytes, width);
860428d7b3dSmrg					src_bytes += stride;
861428d7b3dSmrg					dst_bytes += stride;
862428d7b3dSmrg				} while (--height);
863428d7b3dSmrg			} else {
864428d7b3dSmrg				do {
865428d7b3dSmrg					memcpy(dst_bytes, src_bytes, width);
866428d7b3dSmrg					src_bytes += stride;
867428d7b3dSmrg					dst_bytes += stride;
868428d7b3dSmrg				} while (--height);
869428d7b3dSmrg			}
870428d7b3dSmrg			break;
871428d7b3dSmrg		}
872428d7b3dSmrg	} else {
873428d7b3dSmrg		src_bytes += (height-1) * stride;
874428d7b3dSmrg		dst_bytes += (height-1) * stride;
875428d7b3dSmrg
876428d7b3dSmrg		switch (width) {
877428d7b3dSmrg		case 1:
878428d7b3dSmrg			do {
879428d7b3dSmrg				*dst_bytes = tmp.u8 = *src_bytes;
880428d7b3dSmrg				src_bytes -= stride;
881428d7b3dSmrg				dst_bytes -= stride;
882428d7b3dSmrg			} while (--height);
883428d7b3dSmrg			break;
884428d7b3dSmrg
885428d7b3dSmrg		case 2:
886428d7b3dSmrg			do {
887428d7b3dSmrg				*(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes;
888428d7b3dSmrg				src_bytes -= stride;
889428d7b3dSmrg				dst_bytes -= stride;
890428d7b3dSmrg			} while (--height);
891428d7b3dSmrg			break;
892428d7b3dSmrg
893428d7b3dSmrg		case 4:
894428d7b3dSmrg			do {
895428d7b3dSmrg				*(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes;
896428d7b3dSmrg				src_bytes -= stride;
897428d7b3dSmrg				dst_bytes -= stride;
898428d7b3dSmrg			} while (--height);
899428d7b3dSmrg			break;
900428d7b3dSmrg
901428d7b3dSmrg		case 8:
902428d7b3dSmrg			do {
903428d7b3dSmrg				*(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes;
904428d7b3dSmrg				src_bytes -= stride;
905428d7b3dSmrg				dst_bytes -= stride;
906428d7b3dSmrg			} while (--height);
907428d7b3dSmrg			break;
908428d7b3dSmrg
909428d7b3dSmrg		default:
910428d7b3dSmrg			if (FORCE_MEMMOVE ||
911428d7b3dSmrg			    (dst_bytes < src_bytes + width &&
912428d7b3dSmrg			     src_bytes < dst_bytes + width)) {
913428d7b3dSmrg				do {
914428d7b3dSmrg					memmove(dst_bytes, src_bytes, width);
915428d7b3dSmrg					src_bytes -= stride;
916428d7b3dSmrg					dst_bytes -= stride;
917428d7b3dSmrg				} while (--height);
918428d7b3dSmrg			} else {
919428d7b3dSmrg				do {
920428d7b3dSmrg					memcpy(dst_bytes, src_bytes, width);
921428d7b3dSmrg					src_bytes -= stride;
922428d7b3dSmrg					dst_bytes -= stride;
923428d7b3dSmrg				} while (--height);
924428d7b3dSmrg			}
925428d7b3dSmrg			break;
926428d7b3dSmrg		}
927428d7b3dSmrg	}
928428d7b3dSmrg}
929428d7b3dSmrg
/*
 * Copy a @width x @height rectangle of pixels from @src to @dst, storing
 * (pixel & @and) | @or for every pixel. Despite the name no XOR is
 * performed.
 *
 * @src, @dst:               base pointers of the two images.
 * @bpp:                     bits per pixel, at least 8. Only 8, 16 and 32
 *                           are handled; any other value copies nothing.
 * @src_stride, @dst_stride: bytes per row of each image.
 * @src_x, @src_y:           top-left pixel of the rectangle in @src.
 * @dst_x, @dst_y:           top-left pixel of the rectangle in @dst.
 * @width, @height:          size of the rectangle in pixels, both non-zero.
 * @and, @or:                masks applied to each pixel.
 *
 * When @and is 0xffffffff the AND is skipped. In that mode 8bpp and 16bpp
 * rows with an even pixel count are processed as rows of wider words, with
 * @or replicated into the upper bits, and finally share the 32-bit path.
 */
void
memcpy_xor(const void *src, void *dst, int bpp,
	   int32_t src_stride, int32_t dst_stride,
	   int16_t src_x, int16_t src_y,
	   int16_t dst_x, int16_t dst_y,
	   uint16_t width, uint16_t height,
	   uint32_t and, uint32_t or)
{
	const uint8_t *src_bytes;
	uint8_t *dst_bytes;
	int i, w;

	assert(width && height);
	assert(bpp >= 8);
	assert(width*bpp <= 8*src_stride);
	assert(width*bpp <= 8*dst_stride);

	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d, bpp=%d, and=%x, xor=%x\n",
	     __FUNCTION__,
	     src_x, src_y, dst_x, dst_y,
	     width, height,
	     src_stride, dst_stride,
	     bpp, and, or));

	bpp /= 8;
	src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp;
	dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp;

	if (and == 0xffffffff) {
		switch (bpp) {
		case 1:
			if (width & 1) {
				do {
					for (i = 0; i < width; i++)
						dst_bytes[i] = src_bytes[i] | or;

					src_bytes += src_stride;
					dst_bytes += dst_stride;
				} while (--height);
				break;
			} else {
				/* Even count: pair the bytes up into 16-bit words. */
				width /= 2;
				or |= or << 8;
			}
			/* fall through */
		case 2:
			if (width & 1) {
				do {
					uint16_t *d = (uint16_t *)dst_bytes;
					const uint16_t *s = (const uint16_t *)src_bytes;

					for (i = 0; i < width; i++)
						d[i] = s[i] | or;

					src_bytes += src_stride;
					dst_bytes += dst_stride;
				} while (--height);
				break;
			} else {
				/* Even count: pair the 16-bit words up into 32-bit words. */
				width /= 2;
				or |= or << 16;
			}
			/* fall through */
		case 4:
			w = width;
			/* Rows that fill both pitches are contiguous: do them in one pass. */
			if (w * 4 == dst_stride && dst_stride == src_stride) {
				w *= height;
				height = 1;
			}

#if USE_SSE2
			if (have_sse2()) {
				/* The mask depends only on @or, so build it once. */
				__m128i mask = xmm_create_mask_32(or);

				do {
					uint32_t *d = (uint32_t *)dst_bytes;
					const uint32_t *s = (const uint32_t *)src_bytes;

					/*
					 * Scalar head until the destination is
					 * 16-byte aligned; the xmm_save_128()
					 * stores below presumably need that.
					 */
					i = w;
					while (i && ((uintptr_t)d & 15)) {
						*d++ = *s++ | or;
						i--;
					}

					while (i >= 16) {
						__m128i xmm1, xmm2, xmm3, xmm4;

						xmm1 = xmm_load_128u((const __m128i*)s + 0);
						xmm2 = xmm_load_128u((const __m128i*)s + 1);
						xmm3 = xmm_load_128u((const __m128i*)s + 2);
						xmm4 = xmm_load_128u((const __m128i*)s + 3);

						xmm_save_128((__m128i*)d + 0,
							     _mm_or_si128(xmm1, mask));
						xmm_save_128((__m128i*)d + 1,
							     _mm_or_si128(xmm2, mask));
						xmm_save_128((__m128i*)d + 2,
							     _mm_or_si128(xmm3, mask));
						xmm_save_128((__m128i*)d + 3,
							     _mm_or_si128(xmm4, mask));

						d += 16;
						s += 16;
						i -= 16;
					}

					if (i & 8) {
						__m128i xmm1, xmm2;

						xmm1 = xmm_load_128u((const __m128i*)s + 0);
						xmm2 = xmm_load_128u((const __m128i*)s + 1);

						xmm_save_128((__m128i*)d + 0,
							     _mm_or_si128(xmm1, mask));
						xmm_save_128((__m128i*)d + 1,
							     _mm_or_si128(xmm2, mask));
						d += 8;
						s += 8;
						i -= 8;
					}

					if (i & 4) {
						xmm_save_128((__m128i*)d,
							     _mm_or_si128(xmm_load_128u((const __m128i*)s),
									  mask));

						d += 4;
						s += 4;
						i -= 4;
					}

					/* Scalar tail for the last 0-3 words. */
					while (i) {
						*d++ = *s++ | or;
						i--;
					}

					src_bytes += src_stride;
					dst_bytes += dst_stride;
				} while (--height);
			} else
#endif
			{
				/*
				 * Scalar path. It must be compiled even when
				 * USE_SSE2 is set, otherwise nothing is copied
				 * when have_sse2() reports no SSE2 at run time.
				 */
				do {
					uint32_t *d = (uint32_t *)dst_bytes;
					const uint32_t *s = (const uint32_t *)src_bytes;

					for (i = 0; i < w; i++)
						d[i] = s[i] | or;

					src_bytes += src_stride;
					dst_bytes += dst_stride;
				} while (--height);
			}
			break;
		}
	} else {
		/* General case: apply both masks, one pixel at a time. */
		switch (bpp) {
		case 1:
			do {
				for (i = 0; i < width; i++)
					dst_bytes[i] = (src_bytes[i] & and) | or;

				src_bytes += src_stride;
				dst_bytes += dst_stride;
			} while (--height);
			break;

		case 2:
			do {
				uint16_t *d = (uint16_t *)dst_bytes;
				const uint16_t *s = (const uint16_t *)src_bytes;

				for (i = 0; i < width; i++)
					d[i] = (s[i] & and) | or;

				src_bytes += src_stride;
				dst_bytes += dst_stride;
			} while (--height);
			break;

		case 4:
			do {
				uint32_t *d = (uint32_t *)dst_bytes;
				const uint32_t *s = (const uint32_t *)src_bytes;

				for (i = 0; i < width; i++)
					d[i] = (s[i] & and) | or;

				src_bytes += src_stride;
				dst_bytes += dst_stride;
			} while (--height);
			break;
		}
	}
}
1121