/* $NetBSD: cg14_render.c,v 1.19 2023/01/11 09:23:57 macallan Exp $ */
/*
 * Copyright (c) 2013 Michael Lorenz
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <sys/types.h>

/* all drivers need this */
#include "xf86.h"
#include "xf86_OSproc.h"
#include "compiler.h"

#include "cg14.h"

/*#define SX_SINGLE*/
/*#define SX_RENDER_DEBUG*/
/*#define SX_RENDER_VERBOSE*/
/*#define SX_ADD_SOFTWARE*/
/*#define SX_RENDER_TRACE*/

#ifdef SX_RENDER_TRACE
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DONE xf86Msg(X_ERROR, "<%s\n", __func__);
#else
#define ENTER
#define DONE
#endif

#ifdef SX_RENDER_DEBUG
#define DPRINTF xf86Msg
#else
#define DPRINTF while (0) xf86Msg
#endif

#ifdef SX_RENDER_VERBOSE
char c[8] = " .,:+*oX";
#endif

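/*
 * The composite helpers below feed the SX coprocessor's instruction queue:
 * sxm() issues a memory instruction (a load/store relative to a framebuffer
 * address), sxi() a register-to-register one.  The *Solid() routines rely
 * on the caller (the driver's PrepareComposite hook) having left 0xff in
 * queued register R8 and the solid source colour in R[9:11].
 *
 * Per channel, the over-with-solid-source operation done here is roughly
 * the following (illustrative sketch only - the real work happens in SX
 * registers, with the SR8 multiply variants providing the >> 8 as a
 * stand-in for an exact divide by 255):
 *
 *	static inline uint32_t
 *	over_channel(uint32_t s, uint32_t d, uint32_t a)
 *	{
 *		return ((s * a) >> 8) + ((d * (255 - a)) >> 8);
 *	}
 */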
void CG14Comp_Over32Solid(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x, i;

	ENTER;

	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			sxm(SX_LDUQ0, mskx, 12, 3);
			/* fetch destination pixels */
			sxm(SX_LDUQ0, dstx, 60, 3);
			/* duplicate them for all channels */
			sxi(SX_ORS, 0, 12, 13, 2);
			sxi(SX_ORS, 0, 16, 17, 2);
			sxi(SX_ORS, 0, 20, 21, 2);
			sxi(SX_ORS, 0, 24, 25, 2);
			/* generate inverted alpha */
			sxi(SX_XORS, 12, 8, 28, 15);
			/* multiply source */
			sxi(SX_MUL16X16SR8, 8, 12, 44, 3);
			sxi(SX_MUL16X16SR8, 8, 16, 48, 3);
			sxi(SX_MUL16X16SR8, 8, 20, 52, 3);
			sxi(SX_MUL16X16SR8, 8, 24, 56, 3);
			/* multiply dest */
			sxi(SX_MUL16X16SR8, 28, 60, 76, 15);
			/* add up */
			sxi(SX_ADDV, 44, 76, 92, 15);
			/* write back */
			if (rest < 4) {
				sxm(SX_STUQ0C, dstx, 92, rest - 1);
			} else {
				sxm(SX_STUQ0C, dstx, 92, 3);
			}
			dstx += 16;
			mskx += 16;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint32_t *)(p->fb + mskx);
			m = m >> 24;
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				sxm(SX_STUQ0, dstx, 8, 0);
			} else {
				/* fetch alpha value, stick it into scam */
				/* mask is in R[12:15] */
				/*write_sx_io(p, mskx,
				    SX_LDUQ0(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				sxm(SX_LDUQ0, dstx, 20, 0);
				sxi(SX_ORV, 12, 0, R_SCAM, 0);
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				sxi(SX_SAXP16X16SR8, 9, 0, 17, 2);

				/* invert SCAM */
				sxi(SX_XORV, 12, 8, R_SCAM, 0);
#ifdef SX_RENDER_DEBUG
				sxi(SX_XORV, 12, 8, 13, 0);
#endif
				/* dst * (1 - alpha) + R[17:19] */
				sxi(SX_SAXP16X16SR8, 21, 17, 25, 2);
				sxm(SX_STUQ0C, dstx, 24, 0);
			}
			dstx += 4;
			mskx += 4;
		}
#endif /* SX_SINGLE */
		dst += dstpitch;
		msk += srcpitch;
	}
}

void CG14Comp_Over8Solid(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x, i;
#ifdef SX_RENDER_VERBOSE
	char buffer[256];
#endif
	ENTER;

	DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
	    read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
	    *(uint32_t *)(p->fb + p->srcoff));
	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			sxm(SX_LDB, mskx, 12, 3);
			/* fetch destination pixels */
			sxm(SX_LDUQ0, dstx, 60, 3);
			/* duplicate them for all channels */
			sxi(SX_ORS, 0, 13, 16, 3);
			sxi(SX_ORS, 0, 14, 20, 3);
			sxi(SX_ORS, 0, 15, 24, 3);
			sxi(SX_ORS, 0, 12, 13, 2);
			/* generate inverted alpha */
			sxi(SX_XORS, 12, 8, 28, 15);
			/* multiply source */
			sxi(SX_MUL16X16SR8, 8, 12, 44, 3);
			sxi(SX_MUL16X16SR8, 8, 16, 48, 3);
			sxi(SX_MUL16X16SR8, 8, 20, 52, 3);
			sxi(SX_MUL16X16SR8, 8, 24, 56, 3);
			/* multiply dest */
			sxi(SX_MUL16X16SR8, 28, 60, 76, 15);
			/* add up */
			sxi(SX_ADDV, 44, 76, 92, 15);
			/* write back */
			if (rest < 4) {
				sxm(SX_STUQ0C, dstx, 92, rest - 1);
			} else {
				sxm(SX_STUQ0C, dstx, 92, 3);
			}
			dstx += 16;
			mskx += 4;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint8_t *)(p->fb + mskx);
#ifdef SX_RENDER_VERBOSE
			buffer[x] = c[m >> 5];
#endif
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				sxm(SX_STUQ0, dstx, 8, 0);
			} else {
				/* fetch alpha value, stick it into scam */
				/* mask is in R[12:15] */
				/*write_sx_io(p, mskx & ~7,
				    SX_LDB(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				sxm(SX_LDUQ0, dstx, 20, 0);
				sxi(SX_ORV, 12, 0, R_SCAM, 0);
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				sxi(SX_SAXP16X16SR8, 9, 0, 17, 2);

				/* invert SCAM */
				sxi(SX_XORV, 12, 8, R_SCAM, 0);
#ifdef SX_RENDER_DEBUG
				sxi(SX_XORV, 12, 8, 13, 0);
#endif
				/* dst * (1 - alpha) + R[17:19] */
				sxi(SX_SAXP16X16SR8, 21, 17, 25, 2);
				sxm(SX_STUQ0C, dstx, 24, 0);
			}
			dstx += 4;
			mskx += 1;
		}
#endif /* SX_SINGLE */
#ifdef SX_RENDER_VERBOSE
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		dst += dstpitch;
		msk += srcpitch;
	}
	DONE;
}

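/*
 * PictOpAdd on 32bit pixels: source bytes are added onto destination
 * bytes, up to 8 pixels per pass.  For the intended per-byte result see
 * the SX_ADD_SOFTWARE reference loop in CG14Comp_Add8() below; in plain C
 * it amounts to (sketch only, with s/d pointing at the raw pixel bytes):
 *
 *	for (i = 0; i < width * 4; i++)
 *		d[i] = min(255, s[i] + d[i]);
 */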
void CG14Comp_Add32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx;
	int full, part, x;

	ENTER;
	full = width >> 3;	/* chunks of 8 */
	part = width & 7;	/* leftovers */
	/* we do this up to 8 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
		for (x = 0; x < full; x++) {
			sxm(SX_LDUQ0, srcx, 8, 31);
			sxm(SX_LDUQ0, dstx, 40, 31);
			sxi(SX_ADDV, 8, 40, 72, 15);
			sxi(SX_ADDV, 24, 56, 88, 15);
			sxm(SX_STUQ0, dstx, 72, 31);
			srcx += 128;
			dstx += 128;
		}

		if (part > 0) {
			/* do leftovers */
			sxm(SX_LDUQ0, srcx, 8, part - 1);
			sxm(SX_LDUQ0, dstx, 40, part - 1);
			if (part & 16) {
				sxi(SX_ADDV, 8, 40, 72, 15);
				sxi(SX_ADDV, 24, 56, 88, part - 17);
			} else {
				sxi(SX_ADDV, 8, 40, 72, part - 1);
			}
			sxm(SX_STUQ0, dstx, 72, part - 1);
		}

		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}

void CG14Comp_Add8(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx, srcoff, dstoff;
	int pre, full, part, x;
	uint8_t *d;
#ifdef SX_RENDER_VERBOSE
	char buffer[256];
#endif
	ENTER;

	srcoff = src & 7;
	src &= ~7;
	dstoff = dst & 7;
	dst &= ~7;
	full = width >> 5;	/* chunks of 32 */
	part = width & 31;	/* leftovers */

#ifdef SX_RENDER_DEBUG
	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
	    width, height, full, part);
#endif
	/* we do this up to 32 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
#ifdef SX_ADD_SOFTWARE
		uint8_t *s = (uint8_t *)(p->fb + srcx + srcoff);
		d = (uint8_t *)(p->fb + dstx + dstoff);
		for (x = 0; x < width; x++) {
			d[x] = min(255, s[x] + d[x]);
		}
#else
		for (x = 0; x < full; x++) {
			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, 31, dstoff));
			sxi(SX_ADDV, 8, 40, 72, 15);
			sxi(SX_ADDV, 24, 56, 88, 15);
			write_sx_io(p, dstx, SX_STBC(72, 31, dstoff));
			srcx += 32;
			dstx += 32;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, part - 1, dstoff));
			if (part > 16) {
				sxi(SX_ADDV, 8, 40, 72, 15);
				sxi(SX_ADDV, 24, 56, 88, part - 17);
			} else {
				sxi(SX_ADDV, 8, 40, 72, part - 1);
			}
			write_sx_io(p, dstx, SX_STBC(72, part - 1, dstoff));
		}
#endif
#ifdef SX_RENDER_VERBOSE
		d = (uint8_t *)(p->fb + src + srcoff);
		for (x = 0; x < width; x++) {
			buffer[x] = c[d[x]>>5];
		}
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}

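/*
 * Adds an 8bit alpha map into the alpha channel of a 32bit destination:
 * the mask bytes come in via LDB while LDUC0/STUC0C pick out just the
 * alpha byte of each destination pixel (hence dstx advancing by 128 bytes
 * per 32 pixels).  As a sketch, assuming the alpha byte is the first byte
 * of each destination pixel:
 *
 *	for (x = 0; x < width; x++)
 *		d[x * 4] = min(255, s[x] + d[x * 4]);
 */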
void CG14Comp_Add8_32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx, srcoff, dstoff;
	int pre, full, part, x;
	uint8_t *d;
#ifdef SX_RENDER_VERBOSE
	char buffer[256];
#endif
	ENTER;

	srcoff = src & 7;
	src &= ~7;
	dstoff = dst & 7;
	dst &= ~7;
	full = width >> 5;	/* chunks of 32 */
	part = width & 31;	/* leftovers */

#ifdef SX_RENDER_DEBUG
	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
	    width, height, full, part);
#endif
	/* we do this up to 32 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
		for (x = 0; x < full; x++) {
			/* load source bytes */
			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
			/* load alpha from destination */
			write_sx_io(p, dstx, SX_LDUC0(40, 31, dstoff));
			sxi(SX_ADDV, 8, 40, 72, 15);
			sxi(SX_ADDV, 24, 56, 88, 15);
			/* write clamped values back into dest alpha */
			write_sx_io(p, dstx, SX_STUC0C(72, 31, dstoff));
			srcx += 32;
			dstx += 128;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
			write_sx_io(p, dstx, SX_LDUC0(40, part - 1, dstoff));
			if (part > 16) {
				sxi(SX_ADDV, 8, 40, 72, 15);
				sxi(SX_ADDV, 24, 56, 88, part - 17);
			} else {
				sxi(SX_ADDV, 8, 40, 72, part - 1);
			}
			write_sx_io(p, dstx, SX_STUC0C(72, part - 1, dstoff));
		}
#ifdef SX_RENDER_VERBOSE
		d = (uint8_t *)(p->fb + src + srcoff);
		for (x = 0; x < width; x++) {
			buffer[x] = c[d[x]>>5];
		}
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}

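/*
 * PictOpOver with a 32bit ARGB source carrying its own (premultiplied)
 * alpha.  'flip' asks for a R<->B swap on the fly, done with the strided
 * GATHER/SCATTER pair that exchanges the 2nd and 4th byte register of
 * every source pixel.  Per channel this computes, roughly (sketch only):
 *
 *	dst = src + ((dst * (255 - src_alpha)) >> 8);
 */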
void CG14Comp_Over32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx, m;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			sxm(SX_LDUQ0, srcx, 12, num - 1);
			if (flip) {
				sxi(SX_GATHER, 13, 4, 40, num - 1);
				sxi(SX_GATHER, 15, 4, 44, num - 1);
				sxi(SX_SCATTER, 40, 4, 15, num - 1);
				sxi(SX_SCATTER, 44, 4, 13, num - 1);
			}
			/* fetch dst pixels */
			sxm(SX_LDUQ0, dstx, 44, num - 1);
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* write inverted alpha into SCAM */
				sxi(SX_XORS, 12 + ii, 8, R_SCAM, 0);
				/* dst * (1 - alpha) + src */
				sxi(SX_SAXP16X16SR8, 44 + ii, 12 + ii, 76 + ii, 3);
			}
			sxm(SX_STUQ0C, dstx, 76, num - 1);
			srcx += 16;
			dstx += 16;
		}
		src += srcpitch;
		dst += dstpitch;
	}
}

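/*
 * Same as CG14Comp_Over32() but with an additional 8bit alpha mask:
 * the mask byte goes into SCAM first so the source gets scaled by it,
 * then the inverted mask scales the destination.  Roughly, per channel
 * (sketch only, m being the mask byte):
 *
 *	dst = ((src * m) >> 8) + ((dst * (255 - m)) >> 8);
 */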
void CG14Comp_Over32Mask(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx, m;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			sxm(SX_LDUQ0, srcx, 12, num - 1);
			if (flip) {
				sxi(SX_GATHER, 13, 4, 40, num - 1);
				sxi(SX_GATHER, 15, 4, 44, num - 1);
				sxi(SX_SCATTER, 40, 4, 15, num - 1);
				sxi(SX_SCATTER, 44, 4, 13, num - 1);
			}
			/* fetch mask */
			sxm(SX_LDB, mskx, 28, num - 1);
			/* fetch dst pixels */
			sxm(SX_LDUQ0, dstx, 44, num - 1);
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* mask alpha to SCAM */
				sxi(SX_ORS, 28 + i, 0, R_SCAM, 0);
				/* src * alpha */
				sxi(SX_SAXP16X16SR8, 12 + ii, 0, 60 + ii, 3);
				/* write inverted alpha into SCAM */
				sxi(SX_XORS, 28 + i, 8, R_SCAM, 0);
				/* dst * (1 - alpha) + R[60:] */
				sxi(SX_SAXP16X16SR8, 44 + ii, 60 + ii, 76 + ii, 3);
			}
			sxm(SX_STUQ0C, dstx, 76, num - 1);
			srcx += 16;
			mskx += 4;
			dstx += 16;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}

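/*
 * Like CG14Comp_Over32Mask() but for a source picture without a real
 * alpha channel.  R[8:11] are preloaded with 0xff so the strided SCATTER
 * below can force the alpha register of every fetched source quad to
 * fully opaque before blending.
 */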
void CG14Comp_Over32Mask_noalpha(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx, m;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	write_sx_reg(p, SX_QUEUED(9), 0xff);
	sxi(SX_ORS, 8, 0, 10, 1);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			sxm(SX_LDUQ0, srcx, 12, num - 1);
			if (flip) {
				sxi(SX_GATHER, 13, 4, 40, num - 1);
				sxi(SX_GATHER, 15, 4, 44, num - 1);
				sxi(SX_SCATTER, 40, 4, 15, num - 1);
				sxi(SX_SCATTER, 44, 4, 13, num - 1);
			}
			/* fetch mask */
			sxm(SX_LDB, mskx, 28, num - 1);
			/* fetch dst pixels */
			sxm(SX_LDUQ0, dstx, 44, num - 1);
			/* set src alpha to 0xff */
			sxi(SX_SCATTER, 8, 4, 12, num - 1);
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* mask alpha to SCAM */
				sxi(SX_ORS, 28 + i, 0, R_SCAM, 0);
				/* src * alpha */
				sxi(SX_SAXP16X16SR8, 12 + ii, 0, 60 + ii, 3);
				/* write inverted alpha into SCAM */
				sxi(SX_XORS, 28 + i, 8, R_SCAM, 0);
				/* dst * (1 - alpha) + R[60:] */
				sxi(SX_SAXP16X16SR8, 44 + ii, 60 + ii, 76 + ii, 3);
			}
			sxm(SX_STUQ0C, dstx, 76, num - 1);
			srcx += 16;
			mskx += 4;
			dstx += 16;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}

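/*
 * Variant of the above for a 32bit (ARGB) mask picture: the mask is
 * fetched with LDUQ0 like a pixel, and only its alpha register (the
 * first of each quad, hence the 28 + ii indexing) is used to scale
 * source and destination.
 */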
void CG14Comp_Over32Mask32_noalpha(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx, m;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	write_sx_reg(p, SX_QUEUED(9), 0xff);
	sxi(SX_ORS, 8, 0, 10, 1);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			sxm(SX_LDUQ0, srcx, 12, num - 1);
			if (flip) {
				sxi(SX_GATHER, 13, 4, 40, num - 1);
				sxi(SX_GATHER, 15, 4, 44, num - 1);
				sxi(SX_SCATTER, 40, 4, 15, num - 1);
				sxi(SX_SCATTER, 44, 4, 13, num - 1);
			}
			/* fetch mask */
			sxm(SX_LDUQ0, mskx, 28, num - 1);
			/* fetch dst pixels */
			sxm(SX_LDUQ0, dstx, 44, num - 1);
			/* set src alpha to 0xff */
			sxi(SX_SCATTER, 8, 4, 12, num - 1);
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* mask alpha to SCAM */
				sxi(SX_ORS, 28 + ii, 0, R_SCAM, 0);
				/* src * alpha */
				sxi(SX_SAXP16X16SR8, 12 + ii, 0, 60 + ii, 3);
				/* write inverted alpha into SCAM */
				sxi(SX_XORS, 28 + ii, 8, R_SCAM, 0);
				/* dst * (1 - alpha) + R[60:] */
				sxi(SX_SAXP16X16SR8, 44 + ii, 60 + ii, 76 + ii, 3);
			}
			sxm(SX_STUQ0C, dstx, 76, num - 1);
			srcx += 16;
			mskx += 16;
			dstx += 16;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}