/* $NetBSD: cg14_render.c,v 1.12 2017/12/08 22:49:37 macallan Exp $ */
/*
 * Copyright (c) 2013 Michael Lorenz
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <sys/types.h>

/* all drivers need this */
#include "xf86.h"
#include "xf86_OSproc.h"
#include "compiler.h"

#include "cg14.h"
#include <sparc/sxreg.h>

/*#define SX_SINGLE*/
/*#define SX_RENDER_DEBUG*/
/*#define SX_ADD_SOFTWARE*/

#ifdef SX_RENDER_DEBUG
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DPRINTF xf86Msg
#else
#define ENTER
#define DPRINTF while (0) xf86Msg
#endif
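
/* intensity ramp used by the ASCII debug dumps in SX_DEBUG builds */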
char c[8] = " .,:+*oX";


void CG14Comp_Over32Solid(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x;

	ENTER;

	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			write_sx_io(p, mskx, SX_LDUQ0(12, 3, mskx & 7));
			/* fetch destination pixels */
			write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
			/* duplicate them for all channels */
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 16, 17, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 20, 21, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 24, 25, 2));
			/* generate inverted alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORS(12, 8, 28, 15));
			/* multiply source */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 12, 44, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 16, 48, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 20, 52, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 24, 56, 3));
			/* multiply dest */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(28, 60, 76, 15));
			/* add up */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(44, 76, 92, 15));
			/* write back */
			if (rest < 4) {
				write_sx_io(p, dstx, SX_STUQ0C(92, rest - 1, dstx & 7));
			} else {
				write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
			}
			dstx += 16;
			mskx += 16;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint32_t *)(p->fb + mskx);
			m = m >> 24;
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				write_sx_io(p, dstx, SX_STUQ0(8, 0, dstx & 7));
			} else {
				/* fetch alpha value, stick it into scam */
				/* mask is in R[12:15] */
				/*write_sx_io(p, mskx,
				    SX_LDUQ0(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORV(12, 0, R_SCAM, 0));
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(9, 0, 17, 2));

				/* invert SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, R_SCAM, 0));
#ifdef SX_DEBUG
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, 13, 0));
#endif
				/* dst * (1 - alpha) + R[17:19] -> R[25:27] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(21, 17, 25, 2));
				write_sx_io(p, dstx,
				    SX_STUQ0C(24, 0, dstx & 7));
			}
			dstx += 4;
			mskx += 4;
		}
#endif /* SX_SINGLE */
		dst += dstpitch;
		msk += srcpitch;
	}
}
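
#if 0
/*
 * For illustration only, never compiled in: a plain C sketch of the
 * blend the SX sequences above compute.  The helper name is made up.
 * The solid source colour is preloaded into R[9:11] with R8 = 0xff as
 * its alpha (src[] below stands for R[8:11]); the src/srcpitch
 * arguments actually describe the mask, whose top byte per 32-bit
 * word supplies the per-pixel alpha.  Like MUL16X16SR8/SAXP16X16SR8
 * this divides by 256 rather than 255.  CG14Comp_Over8Solid() below
 * performs the same operation with an 8-bit-per-pixel mask.
 */
static void
over32solid_sw(const uint8_t src[4], const uint32_t *msk, uint8_t *dst,
    int npixels)
{
	int i, ch;

	for (i = 0; i < npixels; i++) {
		unsigned m = msk[i] >> 24;	/* per-pixel mask alpha */

		for (ch = 0; ch < 4; ch++) {
			unsigned s = src[ch];
			unsigned d = dst[i * 4 + ch];

			/* out = src * m + dst * (1 - m) */
			dst[i * 4 + ch] =
			    ((s * m) >> 8) + ((d * (m ^ 0xff)) >> 8);
		}
	}
}
#endif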

void CG14Comp_Over8Solid(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x;
#ifdef SX_DEBUG
	char buffer[256];
#endif
	ENTER;

	DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
	    read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
	    *(uint32_t *)(p->fb + p->srcoff));
	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			write_sx_io(p, mskx, SX_LDB(12, 3, mskx & 7));
			/* fetch destination pixels */
			write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
			/* duplicate them for all channels */
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 13, 16, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 14, 20, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 15, 24, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
			/* generate inverted alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORS(12, 8, 28, 15));
			/* multiply source */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 12, 44, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 16, 48, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 20, 52, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 24, 56, 3));
			/* multiply dest */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(28, 60, 76, 15));
			/* add up */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(44, 76, 92, 15));
			/* write back */
			if (rest < 4) {
				write_sx_io(p, dstx, SX_STUQ0C(92, rest - 1, dstx & 7));
			} else {
				write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
			}
			dstx += 16;
			mskx += 4;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint8_t *)(p->fb + mskx);
#ifdef SX_DEBUG
			buffer[x] = c[m >> 5];
#endif
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				write_sx_io(p, dstx, SX_STUQ0(8, 0, dstx & 7));
			} else {
				/* fetch alpha value, stick it into scam */
				/* mask is in R[12:15] */
				/*write_sx_io(p, mskx & ~7,
				    SX_LDB(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORV(12, 0, R_SCAM, 0));
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(9, 0, 17, 2));

				/* invert SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, R_SCAM, 0));
#ifdef SX_DEBUG
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, 13, 0));
#endif
				/* dst * (1 - alpha) + R[17:19] -> R[25:27] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(21, 17, 25, 2));
				write_sx_io(p, dstx,
				    SX_STUQ0C(24, 0, dstx & 7));
			}
			dstx += 4;
			mskx += 1;
		}
#endif /* SX_SINGLE */
#ifdef SX_DEBUG
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		dst += dstpitch;
		msk += srcpitch;
	}
}

void CG14Comp_Add32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx;
	int full, part, x;

	ENTER;
	full = width >> 3;	/* chunks of 8 */
	part = width & 7;	/* leftovers */
	/* we do this up to 8 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
		for (x = 0; x < full; x++) {
			write_sx_io(p, srcx, SX_LDUQ0(8, 31, srcx & 7));
			write_sx_io(p, dstx, SX_LDUQ0(40, 31, dstx & 7));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			write_sx_io(p, dstx, SX_STUQ0(72, 31, dstx & 7));
			srcx += 128;
			dstx += 128;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDUQ0(8, part - 1, srcx & 7));
			write_sx_io(p, dstx, SX_LDUQ0(40, part - 1, dstx & 7));
			if (part > 16) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, part - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STUQ0(72, part - 1, dstx & 7));
		}

		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}
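
#if 0
/*
 * For illustration only, never compiled in: the plain C equivalent of
 * the ADDV sequence above, modeled on the SX_ADD_SOFTWARE fallback in
 * CG14Comp_Add8() below and assuming the same per-byte saturation.
 * The helper name is made up; one line of width 32-bit pixels is
 * width * 4 bytes.
 */
static void
add32_sw(const uint8_t *s, uint8_t *d, int width)
{
	int i;

	for (i = 0; i < width * 4; i++)
		d[i] = min(255, s[i] + d[i]);
}
#endif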

void CG14Comp_Add8(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx, srcoff, dstoff;
	int full, part, x;
	uint8_t *d;
	char buffer[256];
	ENTER;

	srcoff = src & 7;
	src &= ~7;
	dstoff = dst & 7;
	dst &= ~7;
	full = width >> 5;	/* chunks of 32 */
	part = width & 31;	/* leftovers */

#ifdef SX_DEBUG
	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
	    width, height, full, part);
#endif
	/* we do this up to 32 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
#ifdef SX_ADD_SOFTWARE
		uint8_t *s = (uint8_t *)(p->fb + srcx + srcoff);
		d = (uint8_t *)(p->fb + dstx + dstoff);
		for (x = 0; x < width; x++) {
			d[x] = min(255, s[x] + d[x]);
		}
#else
		for (x = 0; x < full; x++) {
			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, 31, dstoff));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			write_sx_io(p, dstx, SX_STBC(72, 31, dstoff));
			srcx += 32;
			dstx += 32;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, part - 1, dstoff));
			if (part > 16) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, part - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STBC(72, part - 1, dstoff));
		}
#endif
#ifdef SX_DEBUG
		d = (uint8_t *)(p->fb + src + srcoff);
		for (x = 0; x < width; x++) {
			buffer[x] = c[d[x]>>5];
		}
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}

void CG14Comp_Add8_32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx, srcoff, dstoff;
	int full, part, x;
#ifdef SX_DEBUG
	uint8_t *d;
	char buffer[256];
#endif
	ENTER;

	srcoff = src & 7;
	src &= ~7;
	dstoff = dst & 7;
	dst &= ~7;
	full = width >> 5;	/* chunks of 32 */
	part = width & 31;	/* leftovers */

#ifdef SX_DEBUG
	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
	    width, height, full, part);
#endif
	/* we do this up to 32 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
		for (x = 0; x < full; x++) {
			/* load source bytes */
			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
			/* load alpha from destination */
			write_sx_io(p, dstx, SX_LDUC0(40, 31, dstoff));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			/* write clamped values back into dest alpha */
			write_sx_io(p, dstx, SX_STUC0C(72, 31, dstoff));
			srcx += 32;
			dstx += 128;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
			write_sx_io(p, dstx, SX_LDUC0(40, part - 1, dstoff));
			if (part > 16) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, part - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STUC0C(72, part - 1, dstoff));
		}
#ifdef SX_DEBUG
		d = (uint8_t *)(p->fb + src + srcoff);
		for (x = 0; x < width; x++) {
			buffer[x] = c[d[x]>>5];
		}
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}
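
#if 0
/*
 * For illustration only, never compiled in: CG14Comp_Add8_32() above
 * adds an 8-bit-per-pixel source into one byte of each 32-bit
 * destination pixel - the component addressed by SX_LDUC0/SX_STUC0C,
 * per its comments the destination alpha - clamping at 255.  The
 * helper name and the alpha byte position are assumptions.
 */
static void
add8_32_sw(const uint8_t *s, uint8_t *d, int width)
{
	int i;

	/* d points at the alpha byte of the first destination pixel */
	for (i = 0; i < width; i++)
		d[i * 4] = min(255, s[i] + d[i * 4]);
}
#endif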

void CG14Comp_Over32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			write_sx_io(p, srcx, SX_LDUQ0(12, num - 1, srcx & 7));
			if (flip) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(13, 4, 40, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(15, 4, 44, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(40, 4, 15, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(44, 4, 13, num - 1));
			}
			/* fetch dst pixels */
			write_sx_io(p, dstx, SX_LDUQ0(44, num - 1, dstx & 7));
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* write inverted alpha into SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORS(12 + ii, 8, R_SCAM, 0));
				/* dst * (1 - alpha) + src */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(44 + ii, 12 + ii, 76 + ii, 3));
			}
			write_sx_io(p, dstx,
			    SX_STUQ0C(76, num - 1, dstx & 7));
			srcx += 16;
			dstx += 16;
		}
		src += srcpitch;
		dst += dstpitch;
	}
}
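
#if 0
/*
 * For illustration only, never compiled in: a plain C sketch of the
 * per-pixel OVER above, with a made-up helper name.  The source
 * carries its own alpha in component 0 of each unpacked quad (hence
 * R8 = 0xff for the inversion), sources are taken as premultiplied,
 * and "flip" swaps components 1 and 3 the way the GATHER/SCATTER
 * pairs do.  It divides by 256 like SAXP16X16SR8 and clamps like the
 * STUQ0C store.  CG14Comp_Over32Mask() below is the same blend with
 * the alpha taken from a separate 8-bit mask instead:
 * dst = (src * m >> 8) + (dst * (255 - m) >> 8) per channel.
 */
static void
over32_sw(uint8_t *src, uint8_t *dst, int npixels, int flip)
{
	int i, ch;

	for (i = 0; i < npixels; i++) {
		uint8_t *s = src + i * 4, *d = dst + i * 4;
		unsigned ia;

		if (flip) {
			uint8_t t = s[1];
			s[1] = s[3];
			s[3] = t;
		}
		ia = s[0] ^ 0xff;	/* inverted source alpha */
		for (ch = 0; ch < 4; ch++)
			d[ch] = min(255, ((d[ch] * ia) >> 8) + s[ch]);
	}
}
#endif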

void CG14Comp_Over32Mask(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			write_sx_io(p, srcx, SX_LDUQ0(12, num - 1, srcx & 7));
			if (flip) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(13, 4, 40, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(15, 4, 44, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(40, 4, 15, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(44, 4, 13, num - 1));
			}
			/* fetch mask */
			write_sx_io(p, mskx, SX_LDB(28, num - 1, mskx & 7));
			/* fetch dst pixels */
			write_sx_io(p, dstx, SX_LDUQ0(44, num - 1, dstx & 7));
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* mask alpha to SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(28 + i, 0, R_SCAM, 0));
				/* src * alpha */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
				/* write inverted alpha into SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORS(28 + i, 8, R_SCAM, 0));
				/* dst * (1 - alpha) + R[60:] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
			}
			write_sx_io(p, dstx,
			    SX_STUQ0C(76, num - 1, dstx & 7));
			srcx += 16;
			mskx += 4;
			dstx += 16;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}
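
/*
 * Like CG14Comp_Over32Mask() above, but the source's own alpha channel
 * is ignored: R[8:11] are preloaded with 0xff and scattered into
 * component 0 of the unpacked source quads, forcing the source fully
 * opaque before the mask is applied.
 */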
void CG14Comp_Over32Mask_noalpha(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	write_sx_reg(p, SX_QUEUED(9), 0xff);
	write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(8, 0, 10, 1));
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			write_sx_io(p, srcx, SX_LDUQ0(12, num - 1, srcx & 7));
			if (flip) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(13, 4, 40, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(15, 4, 44, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(40, 4, 15, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(44, 4, 13, num - 1));
			}
			/* fetch mask */
			write_sx_io(p, mskx, SX_LDB(28, num - 1, mskx & 7));
			/* fetch dst pixels */
			write_sx_io(p, dstx, SX_LDUQ0(44, num - 1, dstx & 7));
			/* set src alpha to 0xff */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SCATTER(8, 4, 12, num - 1));
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* mask alpha to SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(28 + i, 0, R_SCAM, 0));
				/* src * alpha */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
				/* write inverted alpha into SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORS(28 + i, 8, R_SCAM, 0));
				/* dst * (1 - alpha) + R[60:] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
			}
			write_sx_io(p, dstx,
			    SX_STUQ0C(76, num - 1, dstx & 7));
			srcx += 16;
			mskx += 4;
			dstx += 16;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}
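
/*
 * Same as CG14Comp_Over32Mask_noalpha() above, except that the mask is
 * a 32-bit image: mask pixels are unpacked with SX_LDUQ0 and component
 * 0 of each quad (register 28 + 4 * i) supplies the per-pixel alpha.
 */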
void CG14Comp_Over32Mask32_noalpha(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	write_sx_reg(p, SX_QUEUED(9), 0xff);
	write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(8, 0, 10, 1));
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			write_sx_io(p, srcx, SX_LDUQ0(12, num - 1, srcx & 7));
			if (flip) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(13, 4, 40, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_GATHER(15, 4, 44, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(40, 4, 15, num - 1));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SCATTER(44, 4, 13, num - 1));
			}
			/* fetch mask */
			write_sx_io(p, mskx, SX_LDUQ0(28, num - 1, mskx & 7));
			/* fetch dst pixels */
			write_sx_io(p, dstx, SX_LDUQ0(44, num - 1, dstx & 7));
			/* set src alpha to 0xff */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SCATTER(8, 4, 12, num - 1));
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* mask alpha to SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(28 + ii, 0, R_SCAM, 0));
				/* src * alpha */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
				/* write inverted alpha into SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORS(28 + ii, 8, R_SCAM, 0));
				/* dst * (1 - alpha) + R[60:] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
			}
			write_sx_io(p, dstx,
			    SX_STUQ0C(76, num - 1, dstx & 7));
			srcx += 16;
			mskx += 16;
			dstx += 16;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}