cg14_render.c revision 72fd264f
1/* $NetBSD: cg14_render.c,v 1.14 2021/12/24 04:41:40 macallan Exp $ */
2/*
3 * Copyright (c) 2013 Michael Lorenz
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 *    - Redistributions of source code must retain the above copyright
11 *      notice, this list of conditions and the following disclaimer.
12 *    - Redistributions in binary form must reproduce the above
13 *      copyright notice, this list of conditions and the following
14 *      disclaimer in the documentation and/or other materials provided
15 *      with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 */
31
32#ifdef HAVE_CONFIG_H
33#include "config.h"
34#endif
35
36#include <sys/types.h>
37
38/* all driver need this */
39#include "xf86.h"
40#include "xf86_OSproc.h"
41#include "compiler.h"
42
43#include "cg14.h"
44
/* Build-time feature/debug knobs, all normally disabled: */
/*#define SX_SINGLE*/
/*#define SX_RENDER_DEBUG*/
/*#define SX_ADD_SOFTWARE*/

#ifdef SX_RENDER_DEBUG
/* Log every function entry and enable verbose debug output. */
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DPRINTF xf86Msg
#else
#define ENTER
/* Swallow debug output; "while (0)" keeps call sites syntactically valid. */
#define DPRINTF while (0) xf86Msg
#endif

/*
 * 8-step brightness ramp used by the SX_DEBUG ASCII dumps (indexed
 * with value >> 5).  The literal has exactly 8 characters, so the
 * terminating NUL is dropped - legal in C since the array is only
 * ever indexed, never used as a string.
 */
char c[8] = " .,:+*oX";
58
59
60void CG14Comp_Over32Solid(Cg14Ptr p,
61                   uint32_t src, uint32_t srcpitch,
62                   uint32_t dst, uint32_t dstpitch,
63                   int width, int height)
64{
65	uint32_t msk = src, mskx, dstx, m;
66	int line, x, i;
67
68	ENTER;
69
70	for (line = 0; line < height; line++) {
71		mskx = msk;
72		dstx = dst;
73#ifndef SX_SINGLE
74		int rest;
75		for (x = 0; x < width; x += 4) {
76			rest = width - x;
77			/* fetch 4 mask values */
78			sxm(SX_LDUQ0, mskx, 12, 3);
79			/* fetch destination pixels */
80			sxm(SX_LDUQ0, dstx, 60, 3);
81			/* duplicate them for all channels */
82			sxi(SX_ORS(0, 12, 13, 2));
83			sxi(SX_ORS(0, 16, 17, 2));
84			sxi(SX_ORS(0, 20, 21, 2));
85			sxi(SX_ORS(0, 24, 25, 2));
86			/* generate inverted alpha */
87			sxi(SX_XORS(12, 8, 28, 15));
88			/* multiply source */
89			sxi(SX_MUL16X16SR8(8, 12, 44, 3));
90			sxi(SX_MUL16X16SR8(8, 16, 48, 3));
91			sxi(SX_MUL16X16SR8(8, 20, 52, 3));
92			sxi(SX_MUL16X16SR8(8, 24, 56, 3));
93			/* multiply dest */
94			sxi(SX_MUL16X16SR8(28, 60, 76, 15));
95			/* add up */
96			sxi(SX_ADDV(44, 76, 92, 15));
97			/* write back */
98			if (rest < 4) {
99				sxm(SX_STUQ0C, dstx, 92, rest - 1);
100			} else {
101				sxm(SX_STUQ0C, dstx, 92, 3);
102			}
103			dstx += 16;
104			mskx += 16;
105		}
106#else /* SX_SINGLE */
107		for (x = 0; x < width; x++) {
108			m = *(volatile uint32_t *)(p->fb + mskx);
109			m = m >> 24;
110			if (m == 0) {
111				/* nothing to do - all transparent */
112			} else if (m == 0xff) {
113				/* all opaque */
114				sxm(SX_STUQ0, dstx, 8, 0);
115			} else {
116				/* fetch alpha value, stick it into scam */
117				/* mask is in R[12:15] */
118				/*write_sx_io(p, mskx,
119				    SX_LDUQ0(12, 0, mskx & 7));*/
120				write_sx_reg(p, SX_QUEUED(12), m);
121				/* fetch dst pixel */
122				sxm(SX_LDUQ0, dstx, 20, 0);
123				sxi(SX_ORV(12, 0, R_SCAM, 0));
124				/*
125				 * src * alpha + R0
126				 * R[9:11] * SCAM + R0 -> R[17:19]
127				 */
128				sxi(SX_SAXP16X16SR8(9, 0, 17, 2));
129
130				/* invert SCAM */
131				sxi(SX_XORV(12, 8, R_SCAM, 0));
132#ifdef SX_DEBUG
133				sxi(SX_XORV(12, 8, 13, 0));
134#endif
135				/* dst * (1 - alpha) + R[13:15] */
136				sxi(SX_SAXP16X16SR8(21, 17, 25, 2));
137				sxm(SX_STUQ0C, dstx, 24, 0);
138			}
139			dstx += 4;
140			mskx += 4;
141		}
142#endif /* SX_SINGLE */
143		dst += dstpitch;
144		msk += srcpitch;
145	}
146}
147
148void CG14Comp_Over8Solid(Cg14Ptr p,
149                   uint32_t src, uint32_t srcpitch,
150                   uint32_t dst, uint32_t dstpitch,
151                   int width, int height)
152{
153	uint32_t msk = src, mskx, dstx, m;
154	int line, x, i;
155#ifdef SX_DEBUG
156	char buffer[256];
157#endif
158	ENTER;
159
160	DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
161	    read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
162	    *(uint32_t *)(p->fb + p->srcoff));
163	for (line = 0; line < height; line++) {
164		mskx = msk;
165		dstx = dst;
166#ifndef SX_SINGLE
167		int rest;
168		for (x = 0; x < width; x += 4) {
169			rest = width - x;
170			/* fetch 4 mask values */
171			sxm(SX_LDB, mskx, 12, 3);
172			/* fetch destination pixels */
173			sxm(SX_LDUQ0, dstx, 60, 3);
174			/* duplicate them for all channels */
175			sxi(SX_ORS(0, 13, 16, 3));
176			sxi(SX_ORS(0, 14, 20, 3));
177			sxi(SX_ORS(0, 15, 24, 3));
178			sxi(SX_ORS(0, 12, 13, 2));
179			/* generate inverted alpha */
180			sxi(SX_XORS(12, 8, 28, 15));
181			/* multiply source */
182			sxi(SX_MUL16X16SR8(8, 12, 44, 3));
183			sxi(SX_MUL16X16SR8(8, 16, 48, 3));
184			sxi(SX_MUL16X16SR8(8, 20, 52, 3));
185			sxi(SX_MUL16X16SR8(8, 24, 56, 3));
186			/* multiply dest */
187			sxi(SX_MUL16X16SR8(28, 60, 76, 15));
188			/* add up */
189			sxi(SX_ADDV(44, 76, 92, 15));
190			/* write back */
191			if (rest < 4) {
192				sxm(SX_STUQ0C, dstx, 92, rest - 1);
193			} else {
194				sxm(SX_STUQ0C, dstx, 92, 3);
195			}
196			dstx += 16;
197			mskx += 4;
198		}
199#else /* SX_SINGLE */
200		for (x = 0; x < width; x++) {
201			m = *(volatile uint8_t *)(p->fb + mskx);
202#ifdef SX_DEBUG
203			buffer[x] = c[m >> 5];
204#endif
205			if (m == 0) {
206				/* nothing to do - all transparent */
207			} else if (m == 0xff) {
208				/* all opaque */
209				sxm(SX_STUQ0, dstx, 8, 0);
210			} else {
211				/* fetch alpha value, stick it into scam */
212				/* mask is in R[12:15] */
213				/*write_sx_io(p, mskx & ~7,
214				    SX_LDB(12, 0, mskx & 7));*/
215				write_sx_reg(p, SX_QUEUED(12), m);
216				/* fetch dst pixel */
217				sxm(SX_LDUQ0, dstx, 20, 0);
218				sxi(SX_ORV(12, 0, R_SCAM, 0));
219				/*
220				 * src * alpha + R0
221				 * R[9:11] * SCAM + R0 -> R[17:19]
222				 */
223				sxi(SX_SAXP16X16SR8(9, 0, 17, 2));
224
225				/* invert SCAM */
226				sxi(SX_XORV(12, 8, R_SCAM, 0));
227#ifdef SX_DEBUG
228				sxi(SX_XORV(12, 8, 13, 0));
229#endif
230				/* dst * (1 - alpha) + R[13:15] */
231				sxi(SX_SAXP16X16SR8(21, 17, 25, 2));
232				sxm(SX_STUQ0C, dstx, 24, 0);
233			}
234			dstx += 4;
235			mskx += 1;
236		}
237#endif /* SX_SINGLE */
238#ifdef SX_DEBUG
239		buffer[x] = 0;
240		xf86Msg(X_ERROR, "%s\n", buffer);
241#endif
242		dst += dstpitch;
243		msk += srcpitch;
244	}
245}
246
247void CG14Comp_Add32(Cg14Ptr p,
248                   uint32_t src, uint32_t srcpitch,
249                   uint32_t dst, uint32_t dstpitch,
250                   int width, int height)
251{
252	int line;
253	uint32_t srcx, dstx;
254	int full, part, x;
255
256	ENTER;
257	full = width >> 3;	/* chunks of 8 */
258	part = width & 7;	/* leftovers */
259	/* we do this up to 8 pixels at a time */
260	for (line = 0; line < height; line++) {
261		srcx = src;
262		dstx = dst;
263		for (x = 0; x < full; x++) {
264			sxm(SX_LDUQ0, srcx, 8, 31);
265			sxm(SX_LDUQ0, dstx, 40, 31);
266			sxi(SX_ADDV(8, 40, 72, 15));
267			sxi(SX_ADDV(24, 56, 88, 15));
268			sxm(SX_STUQ0, dstx, 72, 31);
269			srcx += 128;
270			dstx += 128;
271		}
272
273		/* do leftovers */
274		sxm(SX_LDUQ0, srcx, 8, part - 1);
275		sxm(SX_LDUQ0, dstx, 40, part - 1);
276		if (part & 16) {
277			sxi(SX_ADDV(8, 40, 72, 15));
278			sxi(SX_ADDV(24, 56, 88, part - 17));
279		} else {
280			sxi(SX_ADDV(8, 40, 72, part - 1));
281		}
282		sxm(SX_STUQ0, dstx, 72, part - 1);
283
284		/* next line */
285		src += srcpitch;
286		dst += dstpitch;
287	}
288}
289
290void CG14Comp_Add8(Cg14Ptr p,
291                   uint32_t src, uint32_t srcpitch,
292                   uint32_t dst, uint32_t dstpitch,
293                   int width, int height)
294{
295	int line;
296	uint32_t srcx, dstx, srcoff, dstoff;
297	int pre, full, part, x;
298	uint8_t *d;
299	char buffer[256];
300	ENTER;
301
302	srcoff = src & 7;
303	src &= ~7;
304	dstoff = dst & 7;
305	dst &= ~7;
306	full = width >> 5;	/* chunks of 32 */
307	part = width & 31;	/* leftovers */
308
309#ifdef SX_DEBUG
310	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
311	    width, height, full, part);
312#endif
313	/* we do this up to 32 pixels at a time */
314	for (line = 0; line < height; line++) {
315		srcx = src;
316		dstx = dst;
317#ifdef SX_ADD_SOFTWARE
318		uint8_t *s = (uint8_t *)(p->fb + srcx + srcoff);
319		d = (uint8_t *)(p->fb + dstx + dstoff);
320		for (x = 0; x < width; x++) {
321			d[x] = min(255, s[x] + d[x]);
322		}
323#else
324		for (x = 0; x < full; x++) {
325			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
326			write_sx_io(p, dstx, SX_LDB(40, 31, dstoff));
327			sxi(SX_ADDV(8, 40, 72, 15));
328			sxi(SX_ADDV(24, 56, 88, 15));
329			write_sx_io(p, dstx, SX_STBC(72, 31, dstoff));
330			srcx += 32;
331			dstx += 32;
332		}
333
334		if (part > 0) {
335			/* do leftovers */
336			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
337			write_sx_io(p, dstx, SX_LDB(40, part - 1, dstoff));
338			if (part > 16) {
339				sxi(SX_ADDV(8, 40, 72, 15));
340				sxi(SX_ADDV(24, 56, 88, part - 17));
341			} else {
342				sxi(SX_ADDV(8, 40, 72, part - 1));
343			}
344			write_sx_io(p, dstx, SX_STBC(72, part - 1, dstoff));
345		}
346#endif
347#ifdef SX_DEBUG
348		d = (uint8_t *)(p->fb + src + srcoff);
349		for (x = 0; x < width; x++) {
350			buffer[x] = c[d[x]>>5];
351		}
352		buffer[x] = 0;
353		xf86Msg(X_ERROR, "%s\n", buffer);
354#endif
355		/* next line */
356		src += srcpitch;
357		dst += dstpitch;
358	}
359}
360
361void CG14Comp_Add8_32(Cg14Ptr p,
362                   uint32_t src, uint32_t srcpitch,
363                   uint32_t dst, uint32_t dstpitch,
364                   int width, int height)
365{
366	int line;
367	uint32_t srcx, dstx, srcoff, dstoff;
368	int pre, full, part, x;
369	uint8_t *d;
370	char buffer[256];
371	ENTER;
372
373	srcoff = src & 7;
374	src &= ~7;
375	dstoff = dst & 7;
376	dst &= ~7;
377	full = width >> 5;	/* chunks of 32 */
378	part = width & 31;	/* leftovers */
379
380#ifdef SX_DEBUG
381	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
382	    width, height, full, part);
383#endif
384	/* we do this up to 32 pixels at a time */
385	for (line = 0; line < height; line++) {
386		srcx = src;
387		dstx = dst;
388		for (x = 0; x < full; x++) {
389			/* load source bytes */
390			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
391			/* load alpha from destination */
392			write_sx_io(p, dstx, SX_LDUC0(40, 31, dstoff));
393			sxi(SX_ADDV(8, 40, 72, 15));
394			sxi(SX_ADDV(24, 56, 88, 15));
395			/* write clamped values back into dest alpha */
396			write_sx_io(p, dstx, SX_STUC0C(72, 31, dstoff));
397			srcx += 32;
398			dstx += 128;
399		}
400
401		if (part > 0) {
402			/* do leftovers */
403			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
404			write_sx_io(p, dstx, SX_LDUC0(40, part - 1, dstoff));
405			if (part > 16) {
406				sxi(SX_ADDV(8, 40, 72, 15));
407				sxi(SX_ADDV(24, 56, 88, part - 17));
408			} else {
409				sxi(SX_ADDV(8, 40, 72, part - 1));
410			}
411			write_sx_io(p, dstx, SX_STUC0C(72, part - 1, dstoff));
412		}
413#ifdef SX_DEBUG
414		d = (uint8_t *)(p->fb + src + srcoff);
415		for (x = 0; x < width; x++) {
416			buffer[x] = c[d[x]>>5];
417		}
418		buffer[x] = 0;
419		xf86Msg(X_ERROR, "%s\n", buffer);
420#endif
421		/* next line */
422		src += srcpitch;
423		dst += dstpitch;
424	}
425}
426
427void CG14Comp_Over32(Cg14Ptr p,
428                   uint32_t src, uint32_t srcpitch,
429                   uint32_t dst, uint32_t dstpitch,
430                   int width, int height, int flip)
431{
432	uint32_t srcx, dstx, mskx, m;
433	int line, x, i, num;
434
435	ENTER;
436
437	write_sx_reg(p, SX_QUEUED(8), 0xff);
438	for (line = 0; line < height; line++) {
439		srcx = src;
440		dstx = dst;
441
442		for (x = 0; x < width; x += 4) {
443			/* we do up to 4 pixels at a time */
444			num = min(4, width - x);
445			if (num <= 0) {
446				xf86Msg(X_ERROR, "wtf?!\n");
447				continue;
448			}
449			/* fetch source pixels */
450			sxm(SX_LDUQ0, srcx, 12, num - 1);
451			if (flip) {
452				sxi(SX_GATHER(13, 4, 40, num - 1));
453				sxi(SX_GATHER(15, 4, 44, num - 1));
454				sxi(SX_SCATTER(40, 4, 15, num - 1));
455				sxi(SX_SCATTER(44, 4, 13, num - 1));
456			}
457			/* fetch dst pixels */
458			sxm(SX_LDUQ0, dstx, 44, num - 1);
459			/* now process up to 4 pixels */
460			for (i = 0; i < num; i++) {
461				int ii = i << 2;
462				/* write inverted alpha into SCAM */
463				sxi(SX_XORS(12 + ii, 8, R_SCAM, 0));
464				/* dst * (1 - alpha) + src */
465				sxi(SX_SAXP16X16SR8(44 + ii, 12 + ii, 76 + ii, 3));
466			}
467			sxm(SX_STUQ0C, dstx, 76, num - 1);
468			srcx += 16;
469			dstx += 16;
470		}
471		src += srcpitch;
472		dst += dstpitch;
473	}
474}
475
476void CG14Comp_Over32Mask(Cg14Ptr p,
477                   uint32_t src, uint32_t srcpitch,
478                   uint32_t msk, uint32_t mskpitch,
479                   uint32_t dst, uint32_t dstpitch,
480                   int width, int height, int flip)
481{
482	uint32_t srcx, dstx, mskx, m;
483	int line, x, i, num;
484
485	ENTER;
486
487	write_sx_reg(p, SX_QUEUED(8), 0xff);
488	for (line = 0; line < height; line++) {
489		srcx = src;
490		mskx = msk;
491		dstx = dst;
492
493		for (x = 0; x < width; x += 4) {
494			/* we do up to 4 pixels at a time */
495			num = min(4, width - x);
496			if (num <= 0) {
497				xf86Msg(X_ERROR, "wtf?!\n");
498				continue;
499			}
500			/* fetch source pixels */
501			sxm(SX_LDUQ0, srcx, 12, num - 1);
502			if (flip) {
503				sxi(SX_GATHER(13, 4, 40, num - 1));
504				sxi(SX_GATHER(15, 4, 44, num - 1));
505				sxi(SX_SCATTER(40, 4, 15, num - 1));
506				sxi(SX_SCATTER(44, 4, 13, num - 1));
507			}
508			/* fetch mask */
509			sxm(SX_LDB, mskx, 28, num - 1);
510			/* fetch dst pixels */
511			sxm(SX_LDUQ0, dstx, 44, num - 1);
512			/* now process up to 4 pixels */
513			for (i = 0; i < num; i++) {
514				int ii = i << 2;
515				/* mask alpha to SCAM */
516				sxi(SX_ORS(28 + i, 0, R_SCAM, 0));
517				/* src * alpha */
518				sxi(SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
519				/* write inverted alpha into SCAM */
520				sxi(SX_XORS(28 + i, 8, R_SCAM, 0));
521				/* dst * (1 - alpha) + R[60:] */
522				sxi(SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
523			}
524			sxm(SX_STUQ0C, dstx, 76, num - 1);
525			srcx += 16;
526			mskx += 4;
527			dstx += 16;
528		}
529		src += srcpitch;
530		msk += mskpitch;
531		dst += dstpitch;
532	}
533}
534
535void CG14Comp_Over32Mask_noalpha(Cg14Ptr p,
536                   uint32_t src, uint32_t srcpitch,
537                   uint32_t msk, uint32_t mskpitch,
538                   uint32_t dst, uint32_t dstpitch,
539                   int width, int height, int flip)
540{
541	uint32_t srcx, dstx, mskx, m;
542	int line, x, i, num;
543
544	ENTER;
545
546	write_sx_reg(p, SX_QUEUED(8), 0xff);
547	write_sx_reg(p, SX_QUEUED(9), 0xff);
548	sxi(SX_ORS(8, 0, 10, 1));
549	for (line = 0; line < height; line++) {
550		srcx = src;
551		mskx = msk;
552		dstx = dst;
553
554		for (x = 0; x < width; x += 4) {
555			/* we do up to 4 pixels at a time */
556			num = min(4, width - x);
557			if (num <= 0) {
558				xf86Msg(X_ERROR, "wtf?!\n");
559				continue;
560			}
561			/* fetch source pixels */
562			sxm(SX_LDUQ0, srcx, 12, num - 1);
563			if (flip) {
564				sxi(SX_GATHER(13, 4, 40, num - 1));
565				sxi(SX_GATHER(15, 4, 44, num - 1));
566				sxi(SX_SCATTER(40, 4, 15, num - 1));
567				sxi(SX_SCATTER(44, 4, 13, num - 1));
568			}
569			/* fetch mask */
570			sxm(SX_LDB, mskx, 28, num - 1);
571			/* fetch dst pixels */
572			sxm(SX_LDUQ0, dstx, 44, num - 1);
573			/* set src alpha to 0xff */
574			sxi(SX_SCATTER(8, 4, 12, num - 1));
575			/* now process up to 4 pixels */
576			for (i = 0; i < num; i++) {
577				int ii = i << 2;
578				/* mask alpha to SCAM */
579				sxi(SX_ORS(28 + i, 0, R_SCAM, 0));
580				/* src * alpha */
581				sxi(SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
582				/* write inverted alpha into SCAM */
583				sxi(SX_XORS(28 + i, 8, R_SCAM, 0));
584				/* dst * (1 - alpha) + R[60:] */
585				sxi(SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
586			}
587			sxm(SX_STUQ0C, dstx, 76, num - 1);
588			srcx += 16;
589			mskx += 4;
590			dstx += 16;
591		}
592		src += srcpitch;
593		msk += mskpitch;
594		dst += dstpitch;
595	}
596}
597
598void CG14Comp_Over32Mask32_noalpha(Cg14Ptr p,
599                   uint32_t src, uint32_t srcpitch,
600                   uint32_t msk, uint32_t mskpitch,
601                   uint32_t dst, uint32_t dstpitch,
602                   int width, int height, int flip)
603{
604	uint32_t srcx, dstx, mskx, m;
605	int line, x, i, num;
606
607	ENTER;
608
609	write_sx_reg(p, SX_QUEUED(8), 0xff);
610	write_sx_reg(p, SX_QUEUED(9), 0xff);
611	sxi(SX_ORS(8, 0, 10, 1));
612	for (line = 0; line < height; line++) {
613		srcx = src;
614		mskx = msk;
615		dstx = dst;
616
617		for (x = 0; x < width; x += 4) {
618			/* we do up to 4 pixels at a time */
619			num = min(4, width - x);
620			if (num <= 0) {
621				xf86Msg(X_ERROR, "wtf?!\n");
622				continue;
623			}
624			/* fetch source pixels */
625			sxm(SX_LDUQ0, srcx, 12, num - 1);
626			if (flip) {
627				sxi(SX_GATHER(13, 4, 40, num - 1));
628				sxi(SX_GATHER(15, 4, 44, num - 1));
629				sxi(SX_SCATTER(40, 4, 15, num - 1));
630				sxi(SX_SCATTER(44, 4, 13, num - 1));
631			}
632			/* fetch mask */
633			sxm(SX_LDUQ0, mskx, 28, num - 1);
634			/* fetch dst pixels */
635			sxm(SX_LDUQ0, dstx, 44, num - 1);
636			/* set src alpha to 0xff */
637			sxi(SX_SCATTER(8, 4, 12, num - 1));
638			/* now process up to 4 pixels */
639			for (i = 0; i < num; i++) {
640				int ii = i << 2;
641				/* mask alpha to SCAM */
642				sxi(SX_ORS(28 + ii, 0, R_SCAM, 0));
643				/* src * alpha */
644				sxi(SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
645				/* write inverted alpha into SCAM */
646				sxi(SX_XORS(28 + ii, 8, R_SCAM, 0));
647				/* dst * (1 - alpha) + R[60:] */
648				sxi(SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
649			}
650			sxm(SX_STUQ0C, dstx, 76, num - 1);
651			srcx += 16;
652			mskx += 16;
653			dstx += 16;
654		}
655		src += srcpitch;
656		msk += mskpitch;
657		dst += dstpitch;
658	}
659}
660