/* $NetBSD: cg14_render.c,v 1.11 2017/12/07 19:23:22 macallan Exp $ */
/*
 * Copyright (c) 2013 Michael Lorenz
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <sys/types.h>

/* all drivers need this */
#include "xf86.h"
#include "xf86_OSproc.h"
#include "compiler.h"

#include "cg14.h"
#include <sparc/sxreg.h>

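/*
 * All of the composite helpers below drive the SX render engine:
 * write_sx_io() issues a memory-referencing SX instruction against a
 * framebuffer offset, while write_sx_reg(p, SX_INSTRUCTIONS, ...) queues
 * an ALU instruction operating on the SX register file.  See
 * <sparc/sxreg.h> for the instruction encodings.
 */
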
/*#define SX_SINGLE*/
/*#define SX_RENDER_DEBUG*/
/*#define SX_ADD_SOFTWARE*/

#ifdef SX_RENDER_DEBUG
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DPRINTF xf86Msg
#else
#define ENTER
#define DPRINTF while (0) xf86Msg
#endif

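/* 8-level intensity ramp used by the ASCII mask dumps in the SX_DEBUG paths */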
char c[8] = " .,:+*oX";

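/*
 * Solid colour OVER a 32-bit destination through an a8r8g8b8 mask.
 * src/srcpitch describe the mask picture here; the caller is expected to
 * have queued the solid source pixel in R[8:11] (alpha in R8, colour in
 * R[9:11]).
 */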
void CG14Comp_Over32Solid(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x;

	ENTER;

	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			write_sx_io(p, mskx, SX_LDUQ0(12, 3, mskx & 7));
			/* fetch destination pixels */
			write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
			/* duplicate them for all channels */
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 16, 17, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 20, 21, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 24, 25, 2));
			/* generate inverted alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORS(12, 8, 28, 15));
			/* multiply source */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 12, 44, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 16, 48, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 20, 52, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 24, 56, 3));
			/* multiply dest */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(28, 60, 76, 15));
			/* add up */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(44, 76, 92, 15));
			/* write back */
			if (rest < 4) {
				write_sx_io(p, dstx, SX_STUQ0C(92, rest - 1, dstx & 7));
			} else {
				write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
			}
			dstx += 16;
			mskx += 16;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint32_t *)(p->fb + mskx);
			m = m >> 24;
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				write_sx_io(p, dstx, SX_STUQ0(8, 0, dstx & 7));
			} else {
				/* fetch alpha value, stick it into scam */
				/* mask is in R[12:15] */
				/*write_sx_io(p, mskx,
				    SX_LDUQ0(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORV(12, 0, R_SCAM, 0));
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(9, 0, 17, 2));

				/* invert SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, R_SCAM, 0));
#ifdef SX_DEBUG
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, 13, 0));
#endif
				/* dst * (1 - alpha) + R[17:19] -> R[25:27] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(21, 17, 25, 2));
				write_sx_io(p, dstx,
				    SX_STUQ0C(24, 0, dstx & 7));
			}
			dstx += 4;
			mskx += 4;
		}
#endif /* SX_SINGLE */
		dst += dstpitch;
		msk += srcpitch;
	}
}
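
/*
 * Solid colour OVER a 32-bit destination through an a8 mask; the same
 * scheme as CG14Comp_Over32Solid above, with the solid source again
 * expected in R[8:11].
 */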
void CG14Comp_Over8Solid(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x;
#ifdef SX_DEBUG
	char buffer[256];
#endif
	ENTER;

	DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
	    read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
	    *(uint32_t *)(p->fb + p->srcoff));
	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			write_sx_io(p, mskx, SX_LDB(12, 3, mskx & 7));
			/* fetch destination pixels */
			write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
			/* duplicate them for all channels */
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 13, 16, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 14, 20, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 15, 24, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
			/* generate inverted alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORS(12, 8, 28, 15));
			/* multiply source */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 12, 44, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 16, 48, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 20, 52, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 24, 56, 3));
			/* multiply dest */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(28, 60, 76, 15));
			/* add up */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(44, 76, 92, 15));
			/* write back */
			if (rest < 4) {
				write_sx_io(p, dstx, SX_STUQ0C(92, rest - 1, dstx & 7));
			} else {
				write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
			}
			dstx += 16;
			mskx += 4;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint8_t *)(p->fb + mskx);
#ifdef SX_DEBUG
			buffer[x] = c[m >> 5];
#endif
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				write_sx_io(p, dstx, SX_STUQ0(8, 0, dstx & 7));
			} else {
				/* fetch alpha value, stick it into scam */
				/* mask is in R[12:15] */
				/*write_sx_io(p, mskx & ~7,
				    SX_LDB(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORV(12, 0, R_SCAM, 0));
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(9, 0, 17, 2));

				/* invert SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, R_SCAM, 0));
#ifdef SX_DEBUG
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, 13, 0));
#endif
				/* dst * (1 - alpha) + R[17:19] -> R[25:27] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(21, 17, 25, 2));
				write_sx_io(p, dstx,
				    SX_STUQ0C(24, 0, dstx & 7));
			}
			dstx += 4;
			mskx += 1;
		}
#endif /* SX_SINGLE */
#ifdef SX_DEBUG
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		dst += dstpitch;
		msk += srcpitch;
	}
}
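
/*
 * PictOpAdd on a 32-bit destination: per-byte saturating add of source
 * onto destination pixels.
 */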
void CG14Comp_Add32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx;
	int full, part, x;

	ENTER;
	full = width >> 3;	/* chunks of 8 */
	part = width & 7;	/* leftovers */
	/* we do this up to 8 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
		for (x = 0; x < full; x++) {
			/* 8 pixels unpack into 32 SX registers each */
			write_sx_io(p, srcx, SX_LDUQ0(8, 7, srcx & 7));
			write_sx_io(p, dstx, SX_LDUQ0(40, 7, dstx & 7));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			/* write clamped values back */
			write_sx_io(p, dstx, SX_STUQ0C(72, 7, dstx & 7));
			srcx += 32;
			dstx += 32;
		}

		if (part > 0) {
			/* do leftovers - part pixels, part * 4 components */
			write_sx_io(p, srcx, SX_LDUQ0(8, part - 1, srcx & 7));
			write_sx_io(p, dstx, SX_LDUQ0(40, part - 1, dstx & 7));
			if (part > 4) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, (part << 2) - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, (part << 2) - 1));
			}
			write_sx_io(p, dstx, SX_STUQ0C(72, part - 1, dstx & 7));
		}

		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}
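
/*
 * PictOpAdd on an 8-bit destination, e.g. accumulating glyphs into an a8
 * picture; saturating byte add, up to 32 pixels per pass.
 */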
void CG14Comp_Add8(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx, srcoff, dstoff;
	int full, part, x;
	uint8_t *d;
	char buffer[256];
	ENTER;

	srcoff = src & 7;
	src &= ~7;
	dstoff = dst & 7;
	dst &= ~7;
	full = width >> 5;	/* chunks of 32 */
	part = width & 31;	/* leftovers */

#ifdef SX_DEBUG
	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
	    width, height, full, part);
#endif
	/* we do this up to 32 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
#ifdef SX_ADD_SOFTWARE
		uint8_t *s = (uint8_t *)(p->fb + srcx + srcoff);
		d = (uint8_t *)(p->fb + dstx + dstoff);
		for (x = 0; x < width; x++) {
			d[x] = min(255, s[x] + d[x]);
		}
#else
		for (x = 0; x < full; x++) {
			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, 31, dstoff));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			write_sx_io(p, dstx, SX_STBC(72, 31, dstoff));
			srcx += 32;
			dstx += 32;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, part - 1, dstoff));
			if (part > 16) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, part - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STBC(72, part - 1, dstoff));
		}
#endif
#ifdef SX_DEBUG
		d = (uint8_t *)(p->fb + src + srcoff);
		for (x = 0; x < width; x++) {
			buffer[x] = c[d[x]>>5];
		}
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}
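
/*
 * PictOpAdd of an a8 source into the alpha channel of a 32-bit
 * destination: each source byte is added to the alpha byte of the
 * corresponding destination pixel.
 */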
void CG14Comp_Add8_32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx, srcoff, dstoff;
	int full, part, x;
	uint8_t *d;
	char buffer[256];
	ENTER;

	srcoff = src & 7;
	src &= ~7;
	dstoff = dst & 7;
	dst &= ~7;
	full = width >> 5;	/* chunks of 32 */
	part = width & 31;	/* leftovers */

#ifdef SX_DEBUG
	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
	    width, height, full, part);
#endif
	/* we do this up to 32 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
		for (x = 0; x < full; x++) {
			/* load source bytes */
			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
			/* load alpha from destination */
			write_sx_io(p, dstx, SX_LDUC0(40, 31, dstoff));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			/* write clamped values back into dest alpha */
			write_sx_io(p, dstx, SX_STUC0C(72, 31, dstoff));
			srcx += 32;
			dstx += 128;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
			write_sx_io(p, dstx, SX_LDUC0(40, part - 1, dstoff));
			if (part > 16) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, part - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STUC0C(72, part - 1, dstoff));
		}
#ifdef SX_DEBUG
		d = (uint8_t *)(p->fb + src + srcoff);
		for (x = 0; x < width; x++) {
			buffer[x] = c[d[x]>>5];
		}
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}
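
/*
 * Premultiplied a8r8g8b8 source OVER a 32-bit destination, one pixel per
 * pass; 'flip' swaps the R and B channels on the way through.
 */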
void CG14Comp_Over32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx;
	int line, x;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;

		for (x = 0; x < width; x++) {
			/* fetch source pixel */
			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
			if (flip) {
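				/* swap R13 and R15 (R <-> B) via R40 */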
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(13, 0, 40, 0));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(15, 0, 13, 0));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(40, 0, 15, 0));
			}
			/* fetch dst pixel */
			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
			/* src is premultiplied with alpha */
			/* write inverted alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORV(12, 8, R_SCAM, 0));
			/* dst * (1 - alpha) + R[12:15] -> R[24:27] */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(20, 12, 24, 3));
			write_sx_io(p, dstx,
			    SX_STUQ0C(24, 0, dstx & 7));
			dstx += 4;
			srcx += 4;
		}
		dst += dstpitch;
		src += srcpitch;
	}
}
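
/*
 * Premultiplied a8r8g8b8 source OVER a 32-bit destination with an a8
 * mask: the source is scaled by the mask alpha before blending.
 */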
void CG14Comp_Over32Mask(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx;
	int line, x;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x++) {
			/* fetch source pixel */
			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
			if (flip) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(13, 0, 40, 0));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(15, 0, 13, 0));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(40, 0, 15, 0));
			}
			/* fetch mask */
			write_sx_io(p, mskx & (~7), SX_LDB(9, 0, mskx & 7));
			/* fetch dst pixel */
			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
			/* stick mask alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(9, 0, R_SCAM, 0));
			/* apply mask */
			/* src is premultiplied with alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(12, 0, 16, 3));
			/* write inverted alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORV(16, 8, R_SCAM, 0));
			/* dst * (1 - alpha) + R[16:19] -> R[24:27] */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(20, 16, 24, 3));
			write_sx_io(p, dstx,
			    SX_STUQ0C(24, 0, dstx & 7));
			srcx += 4;
			mskx += 1;
			dstx += 4;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}
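
/*
 * x8r8g8b8 source (no alpha channel) OVER a 32-bit destination with an
 * a8 mask: source alpha is forced to 0xff, so the mask alone drives the
 * blend.
 */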
void CG14Comp_Over32Mask_noalpha(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx;
	int line, x;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x++) {
			/* fetch source pixel */
			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
			if (flip) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(13, 0, 40, 0));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(15, 0, 13, 0));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(40, 0, 15, 0));
			}
			/* set src alpha to 0xff */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(8, 0, 12, 0));
			/* fetch mask */
			write_sx_io(p, mskx & (~7), SX_LDB(9, 0, mskx & 7));
			/* fetch dst pixel */
			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
			/* write alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(9, 0, R_SCAM, 0));
			/* src * alpha + R0 */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(12, 0, 16, 3));
			/* write inverted alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORV(9, 8, R_SCAM, 0));
			/* dst * (1 - alpha) + R[16:19] -> R[24:27] */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(20, 16, 24, 3));
			write_sx_io(p, dstx,
			    SX_STUQ0C(24, 0, dstx & 7));
			srcx += 4;
			mskx += 1;
			dstx += 4;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}
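
/*
 * Like CG14Comp_Over32Mask_noalpha, but with an a8r8g8b8 mask: the mask
 * pixel is loaded unpacked and its alpha channel scales the opaque
 * source.
 */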
void CG14Comp_Over32Mask32_noalpha(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx;
	int line, x;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x++) {
			/* fetch source pixel */
			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
			if (flip) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(13, 0, 40, 0));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(15, 0, 13, 0));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORS(40, 0, 15, 0));
			}
			/* fetch mask */
			write_sx_io(p, mskx, SX_LDUQ0(16, 0, mskx & 7));
			/* fetch dst pixel */
			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
			/* set src alpha to 0xff */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(8, 0, 12, 0));
			/* mask alpha to SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(16, 0, R_SCAM, 0));
			/* src * alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(12, 0, 24, 3));
			/* write inverted alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORS(16, 8, R_SCAM, 0));
			/* dst * (1 - alpha) + R[24:27] -> R[28:31] */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(20, 24, 28, 3));
			write_sx_io(p, dstx,
			    SX_STUQ0C(28, 0, dstx & 7));
			srcx += 4;
			mskx += 4;
			dstx += 4;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}