/* $NetBSD: cg14_render.c,v 1.13 2019/07/24 16:07:59 macallan Exp $ */
/*
 * Copyright (c) 2013 Michael Lorenz
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <sys/types.h>

/* all drivers need this */
#include "xf86.h"
#include "xf86_OSproc.h"
#include "compiler.h"

#include "cg14.h"

/*#define SX_SINGLE*/
/*#define SX_RENDER_DEBUG*/
/*#define SX_ADD_SOFTWARE*/

#ifdef SX_RENDER_DEBUG
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DPRINTF xf86Msg
#else
#define ENTER
#define DPRINTF while (0) xf86Msg
#endif

/*
 * intensity ramp used to dump masks as ASCII art in SX_DEBUG output;
 * only ever indexed (by a 3-bit value), so no terminating NUL is needed
 */
char c[8] = " .,:+*oX";


void CG14Comp_Over32Solid(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x;

	ENTER;

	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			write_sx_io(p, mskx, SX_LDUQ0(12, 3, mskx & 7));
			/* fetch destination pixels */
			write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
			/* duplicate them for all channels */
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 16, 17, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 20, 21, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 24, 25, 2));
			/* generate inverted alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORS(12, 8, 28, 15));
			/* multiply source */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 12, 44, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 16, 48, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 20, 52, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 24, 56, 3));
			/* multiply dest */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(28, 60, 76, 15));
			/* add up */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(44, 76, 92, 15));
			/* write back */
			if (rest < 4) {
				write_sx_io(p, dstx, SX_STUQ0C(92, rest - 1, dstx & 7));
			} else {
				write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
			}
			dstx += 16;
			mskx += 16;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint32_t *)(p->fb + mskx);
			m = m >> 24;
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				write_sx_io(p, dstx, SX_STUQ0(8, 0, dstx & 7));
			} else {
				/* fetch alpha value, stick it into SCAM */
				/* mask is in R[12:15] */
				/*write_sx_io(p, mskx,
				    SX_LDUQ0(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORV(12, 0, R_SCAM, 0));
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(9, 0, 17, 2));

				/* invert SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, R_SCAM, 0));
#ifdef SX_DEBUG
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, 13, 0));
#endif
				/* dst * (1 - alpha) + R[17:19] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(21, 17, 25, 2));
				write_sx_io(p, dstx,
				    SX_STUQ0C(24, 0, dstx & 7));
			}
			dstx += 4;
			mskx += 4;
		}
#endif /* SX_SINGLE */
		dst += dstpitch;
		msk += srcpitch;
	}
}
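
/*
 * For reference: the loop above implements the Render OVER operator with a
 * solid, premultiplied source preloaded into R[8:11] (with R8 holding 0xff),
 * modulated by a per-pixel mask alpha; the A8 variant below differs only in
 * how the mask is fetched. The *SR8 instructions approximate x / 255 as
 * x >> 8, so a minimal plain-C sketch of the per-channel math, under those
 * assumptions, looks like this (illustrative only, not used by the driver):
 */
static inline uint8_t
cg14_over_solid_channel(uint8_t src, uint8_t dst, uint8_t m)
{
	/* src * m + dst * (255 - m), both scaled by >> 8 like MUL16X16SR8 */
	return (uint8_t)(((src * m) >> 8) + ((dst * (m ^ 0xff)) >> 8));
}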

void CG14Comp_Over8Solid(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x;
#ifdef SX_DEBUG
	char buffer[256];
#endif
	ENTER;

	DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
	    read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
	    *(uint32_t *)(p->fb + p->srcoff));
	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			write_sx_io(p, mskx, SX_LDB(12, 3, mskx & 7));
			/* fetch destination pixels */
			write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
			/* duplicate them for all channels */
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 13, 16, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 14, 20, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 15, 24, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
			/* generate inverted alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORS(12, 8, 28, 15));
			/* multiply source */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 12, 44, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 16, 48, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 20, 52, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 24, 56, 3));
			/* multiply dest */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(28, 60, 76, 15));
			/* add up */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(44, 76, 92, 15));
			/* write back */
			if (rest < 4) {
				write_sx_io(p, dstx, SX_STUQ0C(92, rest - 1, dstx & 7));
			} else {
				write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
			}
			dstx += 16;
			mskx += 4;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint8_t *)(p->fb + mskx);
#ifdef SX_DEBUG
			buffer[x] = c[m >> 5];
#endif
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				write_sx_io(p, dstx, SX_STUQ0(8, 0, dstx & 7));
			} else {
				/* fetch alpha value, stick it into SCAM */
				/* mask is in R[12:15] */
				/*write_sx_io(p, mskx & ~7,
				    SX_LDB(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORV(12, 0, R_SCAM, 0));
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(9, 0, 17, 2));

				/* invert SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, R_SCAM, 0));
#ifdef SX_DEBUG
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, 13, 0));
#endif
				/* dst * (1 - alpha) + R[17:19] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(21, 17, 25, 2));
				write_sx_io(p, dstx,
				    SX_STUQ0C(24, 0, dstx & 7));
			}
			dstx += 4;
			mskx += 1;
		}
#endif /* SX_SINGLE */
#ifdef SX_DEBUG
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		dst += dstpitch;
		msk += srcpitch;
	}
}

void CG14Comp_Add32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx;
	int full, part, x;

	ENTER;
	full = width >> 3;	/* chunks of 8 */
	part = width & 7;	/* leftovers */
	/* we do this up to 8 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
		for (x = 0; x < full; x++) {
			write_sx_io(p, srcx, SX_LDUQ0(8, 31, srcx & 7));
			write_sx_io(p, dstx, SX_LDUQ0(40, 31, dstx & 7));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			write_sx_io(p, dstx, SX_STUQ0(72, 31, dstx & 7));
			srcx += 128;
			dstx += 128;
		}

		/* do leftovers */
		if (part > 0) {
			write_sx_io(p, srcx, SX_LDUQ0(8, part - 1, srcx & 7));
			write_sx_io(p, dstx, SX_LDUQ0(40, part - 1, dstx & 7));
			if (part > 16) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, part - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STUQ0(72, part - 1, dstx & 7));
		}

		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}
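
/*
 * For reference: PictOpAdd is a per-byte saturating add; in this file the
 * clamped store variants (STBC, STUC0C, STUQ0C) provide the saturation in
 * hardware. A minimal plain-C sketch of the per-channel operation
 * (illustrative only, not used by the driver):
 */
static inline uint8_t
cg14_add_channel(uint8_t s, uint8_t d)
{
	unsigned int sum = s + d;

	/* clamp to 255, as the clamped SX stores do */
	return (uint8_t)(sum > 255 ? 255 : sum);
}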

void CG14Comp_Add8(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx, srcoff, dstoff;
	int full, part, x;
	uint8_t *d;
	char buffer[256];
	ENTER;

	srcoff = src & 7;
	src &= ~7;
	dstoff = dst & 7;
	dst &= ~7;
	full = width >> 5;	/* chunks of 32 */
	part = width & 31;	/* leftovers */

#ifdef SX_DEBUG
	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
	    width, height, full, part);
#endif
	/* we do this up to 32 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
#ifdef SX_ADD_SOFTWARE
		uint8_t *s = (uint8_t *)(p->fb + srcx + srcoff);
		d = (uint8_t *)(p->fb + dstx + dstoff);
		for (x = 0; x < width; x++) {
			d[x] = min(255, s[x] + d[x]);
		}
#else
		for (x = 0; x < full; x++) {
			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, 31, dstoff));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			write_sx_io(p, dstx, SX_STBC(72, 31, dstoff));
			srcx += 32;
			dstx += 32;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, part - 1, dstoff));
			if (part > 16) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, part - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STBC(72, part - 1, dstoff));
		}
#endif
#ifdef SX_DEBUG
		d = (uint8_t *)(p->fb + src + srcoff);
		for (x = 0; x < width; x++) {
			buffer[x] = c[d[x]>>5];
		}
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}

void CG14Comp_Add8_32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx, srcoff, dstoff;
	int full, part, x;
	uint8_t *d;
	char buffer[256];
	ENTER;

	srcoff = src & 7;
	src &= ~7;
	dstoff = dst & 7;
	dst &= ~7;
	full = width >> 5;	/* chunks of 32 */
	part = width & 31;	/* leftovers */

#ifdef SX_DEBUG
	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
	    width, height, full, part);
#endif
	/* we do this up to 32 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
		for (x = 0; x < full; x++) {
			/* load source bytes */
			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
			/* load alpha from destination */
			write_sx_io(p, dstx, SX_LDUC0(40, 31, dstoff));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			/* write clamped values back into dest alpha */
			write_sx_io(p, dstx, SX_STUC0C(72, 31, dstoff));
			srcx += 32;
			dstx += 128;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
			write_sx_io(p, dstx, SX_LDUC0(40, part - 1, dstoff));
			if (part > 16) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, part - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STUC0C(72, part - 1, dstoff));
		}
#ifdef SX_DEBUG
		d = (uint8_t *)(p->fb + src + srcoff);
		for (x = 0; x < width; x++) {
			buffer[x] = c[d[x]>>5];
		}
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}
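
/*
 * For reference: the function above adds an A8 source into just the alpha
 * component of each 32-bit destination pixel; LDUC0/STUC0C access one
 * component per pixel and the clamped store saturates. A plain-C sketch of
 * the per-pixel effect, assuming ARGB pixels with alpha in the most
 * significant byte (illustrative only, not used by the driver):
 */
static inline uint32_t
cg14_add8_into_alpha(uint8_t s, uint32_t dst)
{
	unsigned int a = (dst >> 24) + s;	/* add into the alpha byte */

	if (a > 255)
		a = 255;			/* saturate like STUC0C */
	return (dst & 0x00ffffffU) | (a << 24);
}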

void CG14Comp_Over32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			write_sx_io(p, srcx, SX_LDUQ0(12, num - 1, srcx & 7));
			if (flip) {
				/* swap the 2nd and 4th channel of each pixel */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(13, 4, 40, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(15, 4, 44, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(40, 4, 15, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(44, 4, 13, num - 1));
			}
			/* fetch dst pixels */
			write_sx_io(p, dstx, SX_LDUQ0(44, num - 1, dstx & 7));
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* write inverted alpha into SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORS(12 + ii, 8, R_SCAM, 0));
				/* dst * (1 - alpha) + src */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(44 + ii, 12 + ii, 76 + ii, 3));
			}
			write_sx_io(p, dstx,
			    SX_STUQ0C(76, num - 1, dstx & 7));
			srcx += 16;
			dstx += 16;
		}
		src += srcpitch;
		dst += dstpitch;
	}
}
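
/*
 * For reference: the GATHER/SCATTER pairs in the flip paths exchange the
 * 2nd and 4th component of every unpacked pixel, which for ARGB data
 * amounts to swapping red and blue (ARGB <-> ABGR). A plain-C sketch of
 * the same swap on a packed pixel, assuming the first component sits in
 * the most significant byte (illustrative only, not used by the driver):
 */
static inline uint32_t
cg14_flip_rb(uint32_t px)
{
	/* keep bytes 0 and 2, exchange bytes 1 and 3 */
	return (px & 0xff00ff00U) |
	    ((px & 0x00ff0000U) >> 16) |
	    ((px & 0x000000ffU) << 16);
}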

void CG14Comp_Over32Mask(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			write_sx_io(p, srcx, SX_LDUQ0(12, num - 1, srcx & 7));
			if (flip) {
				/* swap the 2nd and 4th channel of each pixel */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(13, 4, 40, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(15, 4, 44, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(40, 4, 15, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(44, 4, 13, num - 1));
			}
			/* fetch mask */
			write_sx_io(p, mskx, SX_LDB(28, num - 1, mskx & 7));
			/* fetch dst pixels */
			write_sx_io(p, dstx, SX_LDUQ0(44, num - 1, dstx & 7));
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* mask alpha to SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(28 + i, 0, R_SCAM, 0));
				/* src * alpha */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
				/* write inverted alpha into SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORS(28 + i, 8, R_SCAM, 0));
				/* dst * (1 - alpha) + R[60:] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
			}
			write_sx_io(p, dstx,
			    SX_STUQ0C(76, num - 1, dstx & 7));
			srcx += 16;
			mskx += 4;
			dstx += 16;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}

void CG14Comp_Over32Mask_noalpha(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx;
	int line, x, i, num;

	ENTER;

	/* fill R[8:11] with 0xff, used below to force source alpha opaque */
	write_sx_reg(p, SX_QUEUED(8), 0xff);
	write_sx_reg(p, SX_QUEUED(9), 0xff);
	write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(8, 0, 10, 1));
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			write_sx_io(p, srcx, SX_LDUQ0(12, num - 1, srcx & 7));
			if (flip) {
				/* swap the 2nd and 4th channel of each pixel */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(13, 4, 40, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(15, 4, 44, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(40, 4, 15, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(44, 4, 13, num - 1));
			}
			/* fetch mask */
			write_sx_io(p, mskx, SX_LDB(28, num - 1, mskx & 7));
			/* fetch dst pixels */
			write_sx_io(p, dstx, SX_LDUQ0(44, num - 1, dstx & 7));
			/* set src alpha to 0xff */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SCATTER(8, 4, 12, num - 1));
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* mask alpha to SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(28 + i, 0, R_SCAM, 0));
				/* src * alpha */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
				/* write inverted alpha into SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORS(28 + i, 8, R_SCAM, 0));
				/* dst * (1 - alpha) + R[60:] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
			}
			write_sx_io(p, dstx,
			    SX_STUQ0C(76, num - 1, dstx & 7));
			srcx += 16;
			mskx += 4;
			dstx += 16;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}
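
/*
 * For reference: SX_SCATTER(8, 4, 12, ...) above stores 0xff (from R[8:11])
 * into every 4th register, i.e. the alpha slot of each unpacked source
 * pixel, so an xRGB source without an alpha channel is treated as fully
 * opaque. A plain-C sketch of the same fixup on a packed pixel, assuming
 * alpha in the most significant byte (illustrative only, not used by the
 * driver):
 */
static inline uint32_t
cg14_force_opaque(uint32_t px)
{
	return px | 0xff000000U;
}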

void CG14Comp_Over32Mask32_noalpha(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx;
	int line, x, i, num;

	ENTER;

	/* fill R[8:11] with 0xff, used below to force source alpha opaque */
	write_sx_reg(p, SX_QUEUED(8), 0xff);
	write_sx_reg(p, SX_QUEUED(9), 0xff);
	write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(8, 0, 10, 1));
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			write_sx_io(p, srcx, SX_LDUQ0(12, num - 1, srcx & 7));
			if (flip) {
				/* swap the 2nd and 4th channel of each pixel */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(13, 4, 40, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(15, 4, 44, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(40, 4, 15, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(44, 4, 13, num - 1));
			}
			/* fetch mask */
			write_sx_io(p, mskx, SX_LDUQ0(28, num - 1, mskx & 7));
			/* fetch dst pixels */
			write_sx_io(p, dstx, SX_LDUQ0(44, num - 1, dstx & 7));
			/* set src alpha to 0xff */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SCATTER(8, 4, 12, num - 1));
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* mask alpha to SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(28 + ii, 0, R_SCAM, 0));
				/* src * alpha */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
				/* write inverted alpha into SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORS(28 + ii, 8, R_SCAM, 0));
				/* dst * (1 - alpha) + R[60:] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
			}
			write_sx_io(p, dstx,
			    SX_STUQ0C(76, num - 1, dstx & 7));
			srcx += 16;
			mskx += 16;
			dstx += 16;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}