/* $NetBSD: cg14_render.c,v 1.10 2017/10/30 22:09:54 macallan Exp $ */
/*
 * Copyright (c) 2013 Michael Lorenz
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <sys/types.h>

/* all drivers need this */
#include "xf86.h"
#include "xf86_OSproc.h"
#include "compiler.h"

#include "cg14.h"
#include <sparc/sxreg.h>

/*#define SX_SINGLE*/
/*#define SX_RENDER_DEBUG*/
/*#define SX_ADD_SOFTWARE*/
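
/*
 * Compile-time switches, all off by default:
 * SX_SINGLE       - use the one-pixel-at-a-time code paths below instead
 *                   of processing four pixels per SX pass
 * SX_RENDER_DEBUG - make ENTER / DPRINTF log via xf86Msg()
 * SX_ADD_SOFTWARE - perform CG14Comp_Add8() on the CPU instead of the SX
 */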

#ifdef SX_RENDER_DEBUG
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DPRINTF xf86Msg
#else
#define ENTER
#define DPRINTF while (0) xf86Msg
#endif

/* debug output: 8-step intensity ramp, indexed by pixel value >> 5 */
char c[8] = " .,:+*oX";

void CG14Comp_Over32Solid(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x;

	ENTER;

	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			write_sx_io(p, mskx, SX_LDUQ0(12, 3, mskx & 7));
			/* fetch destination pixels */
			write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
			/* duplicate each mask alpha for all channels */
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 16, 17, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 20, 21, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 24, 25, 2));
			/* generate inverted alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORS(12, 8, 28, 15));
			/* multiply source */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 12, 44, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 16, 48, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 20, 52, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 24, 56, 3));
			/* multiply dest */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(28, 60, 76, 15));
			/* add up */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(44, 76, 92, 15));
			/* write back */
			if (rest < 4) {
				write_sx_io(p, dstx,
				    SX_STUQ0C(92, rest - 1, dstx & 7));
			} else {
				write_sx_io(p, dstx,
				    SX_STUQ0C(92, 3, dstx & 7));
			}
			dstx += 16;
			mskx += 16;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint32_t *)(p->fb + mskx);
			m = m >> 24;
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				write_sx_io(p, dstx, SX_STUQ0(8, 0, dstx & 7));
			} else {
				/* fetch alpha value, stick it into SCAM */
				/* mask is in R[12:15] */
				/*write_sx_io(p, mskx,
				    SX_LDUQ0(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORV(12, 0, R_SCAM, 0));
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(9, 0, 17, 2));

				/* invert SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, R_SCAM, 0));
#ifdef SX_DEBUG
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, 13, 0));
#endif
				/* dst * (1 - alpha) + R[17:19] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(21, 17, 25, 2));
				write_sx_io(p, dstx,
				    SX_STUQ0C(24, 0, dstx & 7));
			}
			dstx += 4;
			mskx += 4;
		}
#endif /* SX_SINGLE */
		dst += dstpitch;
		msk += srcpitch;
	}
}
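
/*
 * For reference, a plain-C sketch of the per-pixel math the SX pipeline
 * above implements for a solid ARGB source under an ARGB mask:
 * dst = src * mask_alpha + dst * (1 - mask_alpha), with the same >> 8
 * truncation as MUL16X16SR8 and clamping as the SX clamped stores.
 * Illustrative only, not part of the driver, and kept out of the build.
 */
#if 0
static void
over32solid_ref(uint32_t srcpix, const uint32_t *msk, uint32_t *dst, int n)
{
	int i, b;

	for (i = 0; i < n; i++) {
		uint32_t a = msk[i] >> 24;	/* alpha byte of ARGB mask */
		uint32_t na = a ^ 0xff;		/* inverted alpha */
		uint32_t out = 0;

		for (b = 0; b < 32; b += 8) {
			uint32_t s = (srcpix >> b) & 0xff;
			uint32_t d = (dst[i] >> b) & 0xff;
			uint32_t v = ((s * a) >> 8) + ((d * na) >> 8);

			out |= (v > 0xff ? 0xff : v) << b;	/* clamp */
		}
		dst[i] = out;
	}
}
#endif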

void CG14Comp_Over8Solid(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x;
#ifdef SX_DEBUG
	char buffer[256];
#endif
	ENTER;

	DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
	    read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
	    *(uint32_t *)(p->fb + p->srcoff));
	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			write_sx_io(p, mskx, SX_LDB(12, 3, mskx & 7));
			/* fetch destination pixels */
			write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
			/* duplicate each mask value for all channels */
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 13, 16, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 14, 20, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 15, 24, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
			/* generate inverted alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORS(12, 8, 28, 15));
			/* multiply source */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 12, 44, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 16, 48, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 20, 52, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 24, 56, 3));
			/* multiply dest */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(28, 60, 76, 15));
			/* add up */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(44, 76, 92, 15));
			/* write back */
			if (rest < 4) {
				write_sx_io(p, dstx,
				    SX_STUQ0C(92, rest - 1, dstx & 7));
			} else {
				write_sx_io(p, dstx,
				    SX_STUQ0C(92, 3, dstx & 7));
			}
			dstx += 16;
			mskx += 4;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint8_t *)(p->fb + mskx);
#ifdef SX_DEBUG
			buffer[x] = c[m >> 5];
#endif
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				write_sx_io(p, dstx, SX_STUQ0(8, 0, dstx & 7));
			} else {
				/* fetch alpha value, stick it into SCAM */
				/* mask is in R[12:15] */
				/*write_sx_io(p, mskx & ~7,
				    SX_LDB(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORV(12, 0, R_SCAM, 0));
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(9, 0, 17, 2));

				/* invert SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, R_SCAM, 0));
#ifdef SX_DEBUG
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, 13, 0));
#endif
				/* dst * (1 - alpha) + R[17:19] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(21, 17, 25, 2));
				write_sx_io(p, dstx,
				    SX_STUQ0C(24, 0, dstx & 7));
			}
			dstx += 4;
			mskx += 1;
		}
#endif /* SX_SINGLE */
#ifdef SX_DEBUG
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		dst += dstpitch;
		msk += srcpitch;
	}
}

void CG14Comp_Add32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx;
	int full, part, x;

	ENTER;
	full = width >> 3;	/* chunks of 8 */
	part = width & 7;	/* leftovers */
	/* we do this up to 8 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
		for (x = 0; x < full; x++) {
			write_sx_io(p, srcx, SX_LDUQ0(8, 31, srcx & 7));
			write_sx_io(p, dstx, SX_LDUQ0(40, 31, dstx & 7));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			write_sx_io(p, dstx, SX_STUQ0(72, 31, dstx & 7));
			srcx += 128;
			dstx += 128;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDUQ0(8, part - 1, srcx & 7));
			write_sx_io(p, dstx, SX_LDUQ0(40, part - 1, dstx & 7));
			if (part > 16) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, part - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STUQ0(72, part - 1, dstx & 7));
		}

		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}
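
/*
 * For reference: CG14Comp_Add32 implements a per-channel add of source
 * into destination. This plain-C sketch saturates each byte at 255, as
 * the X Render ADD operator specifies. Illustrative only, not built.
 */
#if 0
static void
add32_ref(const uint32_t *src, uint32_t *dst, int n)
{
	int i, b;

	for (i = 0; i < n; i++) {
		uint32_t out = 0;

		for (b = 0; b < 32; b += 8) {
			uint32_t v = ((src[i] >> b) & 0xff) +
			    ((dst[i] >> b) & 0xff);

			out |= (v > 0xff ? 0xff : v) << b;	/* clamp */
		}
		dst[i] = out;
	}
}
#endif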

void CG14Comp_Add8(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx, srcoff, dstoff;
	int full, part, x;
	uint8_t *d;
#ifdef SX_DEBUG
	char buffer[256];
#endif
	ENTER;

	srcoff = src & 7;
	src &= ~7;
	dstoff = dst & 7;
	dst &= ~7;
	full = width >> 5;	/* chunks of 32 */
	part = width & 31;	/* leftovers */

#ifdef SX_DEBUG
	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
	    width, height, full, part);
#endif
	/* we do this up to 32 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
#ifdef SX_ADD_SOFTWARE
		uint8_t *s = (uint8_t *)(p->fb + srcx + srcoff);
		d = (uint8_t *)(p->fb + dstx + dstoff);
		for (x = 0; x < width; x++) {
			d[x] = min(255, s[x] + d[x]);
		}
#else
		for (x = 0; x < full; x++) {
			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, 31, dstoff));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			write_sx_io(p, dstx, SX_STBC(72, 31, dstoff));
			srcx += 32;
			dstx += 32;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, part - 1, dstoff));
			if (part > 16) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, part - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STBC(72, part - 1, dstoff));
		}
#endif
#ifdef SX_DEBUG
		d = (uint8_t *)(p->fb + src + srcoff);
		for (x = 0; x < width; x++) {
			buffer[x] = c[d[x] >> 5];
		}
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}

void CG14Comp_Add8_32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx, srcoff, dstoff;
	int full, part, x;
#ifdef SX_DEBUG
	uint8_t *d;
	char buffer[256];
#endif
	ENTER;

	srcoff = src & 7;
	src &= ~7;
	dstoff = dst & 7;
	dst &= ~7;
	full = width >> 5;	/* chunks of 32 */
	part = width & 31;	/* leftovers */

#ifdef SX_DEBUG
	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
	    width, height, full, part);
#endif
	/* we do this up to 32 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
		for (x = 0; x < full; x++) {
			/* load source bytes */
			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
			/* load alpha from destination */
			write_sx_io(p, dstx, SX_LDUC0(40, 31, dstoff));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			/* write clamped values back into dest alpha */
			write_sx_io(p, dstx, SX_STUC0C(72, 31, dstoff));
			srcx += 32;
			dstx += 128;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
			write_sx_io(p, dstx, SX_LDUC0(40, part - 1, dstoff));
			if (part > 16) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, part - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STUC0C(72, part - 1, dstoff));
		}
#ifdef SX_DEBUG
		d = (uint8_t *)(p->fb + src + srcoff);
		for (x = 0; x < width; x++) {
			buffer[x] = c[d[x] >> 5];
		}
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}

void CG14Comp_Over32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t srcx, dstx;
	int line, x;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;

		for (x = 0; x < width; x++) {
			/* fetch source pixel */
			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
			/* fetch dst pixel */
			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
			/* src is premultiplied with alpha */
			/* write inverted alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORV(12, 8, R_SCAM, 0));
			/* dst * (1 - alpha) + R[12:15] */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(20, 12, 24, 3));
			write_sx_io(p, dstx,
			    SX_STUQ0C(24, 0, dstx & 7));
			dstx += 4;
			srcx += 4;
		}
		dst += dstpitch;
		src += srcpitch;
	}
}
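
/*
 * For reference, the operation above in plain C: OVER with a source that
 * is already premultiplied by its alpha, i.e.
 * dst = src + dst * (1 - src_alpha), truncating >> 8 like SAXP16X16SR8
 * and clamping like STUQ0C. Illustrative only, not built.
 */
#if 0
static void
over32_ref(const uint32_t *src, uint32_t *dst, int n)
{
	int i, b;

	for (i = 0; i < n; i++) {
		uint32_t na = (src[i] >> 24) ^ 0xff;	/* inverted alpha */
		uint32_t out = 0;

		for (b = 0; b < 32; b += 8) {
			uint32_t s = (src[i] >> b) & 0xff;
			uint32_t d = (dst[i] >> b) & 0xff;
			uint32_t v = s + ((d * na) >> 8);

			out |= (v > 0xff ? 0xff : v) << b;	/* clamp */
		}
		dst[i] = out;
	}
}
#endif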

void CG14Comp_Over32Mask(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t srcx, dstx, mskx;
	int line, x;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x++) {
			/* fetch source pixel */
			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
			/* fetch mask */
			write_sx_io(p, mskx & (~7), SX_LDB(9, 0, mskx & 7));
			/* fetch dst pixel */
			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
			/* stick mask alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(9, 0, R_SCAM, 0));
			/* apply mask */
			/* src is premultiplied with alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(12, 0, 16, 3));
			/* write inverted alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORV(16, 8, R_SCAM, 0));
			/* dst * (1 - alpha) + R[16:19] */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(20, 16, 24, 3));
			write_sx_io(p, dstx,
			    SX_STUQ0C(24, 0, dstx & 7));
			srcx += 4;
			mskx += 1;
			dstx += 4;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}
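
/*
 * For reference, the masked variant in plain C: the premultiplied source
 * is first scaled by the A8 mask (IN), then composited over the
 * destination as above. Illustrative only, not built.
 */
#if 0
static void
over32mask_ref(const uint32_t *src, const uint8_t *msk, uint32_t *dst, int n)
{
	int i, b;

	for (i = 0; i < n; i++) {
		uint32_t m = msk[i];
		/* source alpha after masking, inverted */
		uint32_t na = (((src[i] >> 24) * m) >> 8) ^ 0xff;
		uint32_t out = 0;

		for (b = 0; b < 32; b += 8) {
			uint32_t s = (((src[i] >> b) & 0xff) * m) >> 8;
			uint32_t d = (dst[i] >> b) & 0xff;
			uint32_t v = s + ((d * na) >> 8);

			out |= (v > 0xff ? 0xff : v) << b;	/* clamp */
		}
		dst[i] = out;
	}
}
#endif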

void CG14Comp_Over32Mask_noalpha(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t srcx, dstx, mskx;
	int line, x;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x++) {
			/* fetch source pixel */
			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
			/* set src alpha to 0xff */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(8, 0, 12, 0));
			/* fetch mask */
			write_sx_io(p, mskx & (~7), SX_LDB(9, 0, mskx & 7));
			/* fetch dst pixel */
			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
			/* write alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(9, 0, R_SCAM, 0));
			/* src * alpha + R0 */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(12, 0, 16, 3));
			/* write inverted alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORV(9, 8, R_SCAM, 0));
			/* dst * (1 - alpha) + R[16:19] */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(20, 16, 24, 3));
			write_sx_io(p, dstx,
			    SX_STUQ0C(24, 0, dstx & 7));
			srcx += 4;
			mskx += 1;
			dstx += 4;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}

void CG14Comp_Over32Mask32_noalpha(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t srcx, dstx, mskx;
	int line, x;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x++) {
			/* fetch source pixel */
			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
			/* fetch mask */
			write_sx_io(p, mskx, SX_LDUQ0(16, 0, mskx & 7));
			/* fetch dst pixel */
			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
			/* set src alpha to 0xff */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(8, 0, 12, 0));
			/* mask alpha to SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(16, 0, R_SCAM, 0));
			/* src * alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(12, 0, 24, 3));
			/* write inverted alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORS(16, 8, R_SCAM, 0));
			/* dst * (1 - alpha) + R[24:27] */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(20, 24, 28, 3));
			write_sx_io(p, dstx,
			    SX_STUQ0C(28, 0, dstx & 7));
			srcx += 4;
			mskx += 4;
			dstx += 4;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}