cg14_render.c revision ad6af7a7
/* $NetBSD: cg14_render.c,v 1.16 2022/05/11 17:13:04 macallan Exp $ */
/*
 * Copyright (c) 2013 Michael Lorenz
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
31a3a2ba44Smacallan
32c88c16f8Smacallan#ifdef HAVE_CONFIG_H
33c88c16f8Smacallan#include "config.h"
34c88c16f8Smacallan#endif
35c88c16f8Smacallan
36a3a2ba44Smacallan#include <sys/types.h>
37a3a2ba44Smacallan
/* all drivers need this */
39a3a2ba44Smacallan#include "xf86.h"
40a3a2ba44Smacallan#include "xf86_OSproc.h"
41a3a2ba44Smacallan#include "compiler.h"
42a3a2ba44Smacallan
43a3a2ba44Smacallan#include "cg14.h"
44a3a2ba44Smacallan
/*
 * Compile-time switches, all disabled by default:
 *
 *  SX_SINGLE       - CG14Comp_Over32Solid() and CG14Comp_Over8Solid()
 *                    handle one pixel per iteration instead of four
 *  SX_RENDER_DEBUG - log function entry and dump mask / source rows
 *                    through xf86Msg()
 *  SX_ADD_SOFTWARE - CG14Comp_Add8() adds with the CPU instead of
 *                    issuing SX instructions
 */
/*#define SX_SINGLE*/
/*#define SX_RENDER_DEBUG*/
/*#define SX_ADD_SOFTWARE*/

#ifdef SX_RENDER_DEBUG
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DPRINTF xf86Msg

/*
 * Eight glyphs used by the debug dumps; callers index it with the top
 * three bits of an 8 bit value (value >> 5).  Exactly eight characters,
 * deliberately without a terminating NUL.  Only read in this file, so it
 * is kept static and const rather than exported as a global named 'c'.
 */
static const char c[8] = " .,:+*oX";
#else
#define ENTER
/* swallow the call and its arguments when debugging is off */
#define DPRINTF while (0) xf86Msg
#endif
6078cb1511Smacallan
6178cb1511Smacallanvoid CG14Comp_Over32Solid(Cg14Ptr p,
6278cb1511Smacallan                   uint32_t src, uint32_t srcpitch,
6378cb1511Smacallan                   uint32_t dst, uint32_t dstpitch,
6478cb1511Smacallan                   int width, int height)
6578cb1511Smacallan{
6678cb1511Smacallan	uint32_t msk = src, mskx, dstx, m;
6778cb1511Smacallan	int line, x, i;
6878cb1511Smacallan
6978cb1511Smacallan	ENTER;
70f7cb851fSmacallan
7178cb1511Smacallan	for (line = 0; line < height; line++) {
7278cb1511Smacallan		mskx = msk;
7378cb1511Smacallan		dstx = dst;
74f221549cSmacallan#ifndef SX_SINGLE
75f221549cSmacallan		int rest;
76f221549cSmacallan		for (x = 0; x < width; x += 4) {
77f221549cSmacallan			rest = width - x;
78f221549cSmacallan			/* fetch 4 mask values */
7972fd264fSmacallan			sxm(SX_LDUQ0, mskx, 12, 3);
80f221549cSmacallan			/* fetch destination pixels */
8172fd264fSmacallan			sxm(SX_LDUQ0, dstx, 60, 3);
82f221549cSmacallan			/* duplicate them for all channels */
83230e26c7Smacallan			sxi(SX_ORS, 0, 12, 13, 2);
84230e26c7Smacallan			sxi(SX_ORS, 0, 16, 17, 2);
85230e26c7Smacallan			sxi(SX_ORS, 0, 20, 21, 2);
86230e26c7Smacallan			sxi(SX_ORS, 0, 24, 25, 2);
87f221549cSmacallan			/* generate inverted alpha */
88230e26c7Smacallan			sxi(SX_XORS, 12, 8, 28, 15);
89f221549cSmacallan			/* multiply source */
90230e26c7Smacallan			sxi(SX_MUL16X16SR8, 8, 12, 44, 3);
91230e26c7Smacallan			sxi(SX_MUL16X16SR8, 8, 16, 48, 3);
92230e26c7Smacallan			sxi(SX_MUL16X16SR8, 8, 20, 52, 3);
93230e26c7Smacallan			sxi(SX_MUL16X16SR8, 8, 24, 56, 3);
94f221549cSmacallan			/* multiply dest */
95230e26c7Smacallan			sxi(SX_MUL16X16SR8, 28, 60, 76, 15);
96f221549cSmacallan			/* add up */
97230e26c7Smacallan			sxi(SX_ADDV, 44, 76, 92, 15);
98f221549cSmacallan			/* write back */
99f221549cSmacallan			if (rest < 4) {
10072fd264fSmacallan				sxm(SX_STUQ0C, dstx, 92, rest - 1);
101f221549cSmacallan			} else {
10272fd264fSmacallan				sxm(SX_STUQ0C, dstx, 92, 3);
103f221549cSmacallan			}
104f221549cSmacallan			dstx += 16;
105f221549cSmacallan			mskx += 16;
106f221549cSmacallan		}
107f221549cSmacallan#else /* SX_SINGLE */
10878cb1511Smacallan		for (x = 0; x < width; x++) {
10978cb1511Smacallan			m = *(volatile uint32_t *)(p->fb + mskx);
11078cb1511Smacallan			m = m >> 24;
11178cb1511Smacallan			if (m == 0) {
11278cb1511Smacallan				/* nothing to do - all transparent */
11378cb1511Smacallan			} else if (m == 0xff) {
11478cb1511Smacallan				/* all opaque */
11572fd264fSmacallan				sxm(SX_STUQ0, dstx, 8, 0);
11678cb1511Smacallan			} else {
11778cb1511Smacallan				/* fetch alpha value, stick it into scam */
11878cb1511Smacallan				/* mask is in R[12:15] */
11978cb1511Smacallan				/*write_sx_io(p, mskx,
12078cb1511Smacallan				    SX_LDUQ0(12, 0, mskx & 7));*/
12178cb1511Smacallan				write_sx_reg(p, SX_QUEUED(12), m);
12278cb1511Smacallan				/* fetch dst pixel */
12372fd264fSmacallan				sxm(SX_LDUQ0, dstx, 20, 0);
124230e26c7Smacallan				sxi(SX_ORV, 12, 0, R_SCAM, 0);
12578cb1511Smacallan				/*
12678cb1511Smacallan				 * src * alpha + R0
12778cb1511Smacallan				 * R[9:11] * SCAM + R0 -> R[17:19]
12878cb1511Smacallan				 */
129230e26c7Smacallan				sxi(SX_SAXP16X16SR8, 9, 0, 17, 2);
13078cb1511Smacallan
13178cb1511Smacallan				/* invert SCAM */
132230e26c7Smacallan				sxi(SX_XORV, 12, 8, R_SCAM, 0);
133ad6af7a7Smacallan#ifdef SX_RENDER_DEBUG
134230e26c7Smacallan				sxi(SX_XORV, 12, 8, 13, 0);
13578cb1511Smacallan#endif
13678cb1511Smacallan				/* dst * (1 - alpha) + R[13:15] */
137230e26c7Smacallan				sxi(SX_SAXP16X16SR8, 21, 17, 25, 2);
13872fd264fSmacallan				sxm(SX_STUQ0C, dstx, 24, 0);
13978cb1511Smacallan			}
14078cb1511Smacallan			dstx += 4;
14178cb1511Smacallan			mskx += 4;
14278cb1511Smacallan		}
143f221549cSmacallan#endif /* SX_SINGLE */
144f221549cSmacallan		dst += dstpitch;
145f221549cSmacallan		msk += srcpitch;
146f221549cSmacallan	}
147f221549cSmacallan}
148f221549cSmacallan
149f221549cSmacallanvoid CG14Comp_Over8Solid(Cg14Ptr p,
150f221549cSmacallan                   uint32_t src, uint32_t srcpitch,
151f221549cSmacallan                   uint32_t dst, uint32_t dstpitch,
152f221549cSmacallan                   int width, int height)
153f221549cSmacallan{
154f221549cSmacallan	uint32_t msk = src, mskx, dstx, m;
155f221549cSmacallan	int line, x, i;
156ad6af7a7Smacallan#ifdef SX_RENDER_DEBUG
157f221549cSmacallan	char buffer[256];
158f221549cSmacallan#endif
159f221549cSmacallan	ENTER;
160f221549cSmacallan
161f221549cSmacallan	DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
162f221549cSmacallan	    read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
163f221549cSmacallan	    *(uint32_t *)(p->fb + p->srcoff));
164f221549cSmacallan	for (line = 0; line < height; line++) {
165f221549cSmacallan		mskx = msk;
166f221549cSmacallan		dstx = dst;
167f221549cSmacallan#ifndef SX_SINGLE
168f221549cSmacallan		int rest;
16978cb1511Smacallan		for (x = 0; x < width; x += 4) {
170f221549cSmacallan			rest = width - x;
17178cb1511Smacallan			/* fetch 4 mask values */
17272fd264fSmacallan			sxm(SX_LDB, mskx, 12, 3);
17378cb1511Smacallan			/* fetch destination pixels */
17472fd264fSmacallan			sxm(SX_LDUQ0, dstx, 60, 3);
17578cb1511Smacallan			/* duplicate them for all channels */
176230e26c7Smacallan			sxi(SX_ORS, 0, 13, 16, 3);
177230e26c7Smacallan			sxi(SX_ORS, 0, 14, 20, 3);
178230e26c7Smacallan			sxi(SX_ORS, 0, 15, 24, 3);
179230e26c7Smacallan			sxi(SX_ORS, 0, 12, 13, 2);
18078cb1511Smacallan			/* generate inverted alpha */
181230e26c7Smacallan			sxi(SX_XORS, 12, 8, 28, 15);
18278cb1511Smacallan			/* multiply source */
183230e26c7Smacallan			sxi(SX_MUL16X16SR8, 8, 12, 44, 3);
184230e26c7Smacallan			sxi(SX_MUL16X16SR8, 8, 16, 48, 3);
185230e26c7Smacallan			sxi(SX_MUL16X16SR8, 8, 20, 52, 3);
186230e26c7Smacallan			sxi(SX_MUL16X16SR8, 8, 24, 56, 3);
18778cb1511Smacallan			/* multiply dest */
188230e26c7Smacallan			sxi(SX_MUL16X16SR8, 28, 60, 76, 15);
18978cb1511Smacallan			/* add up */
190230e26c7Smacallan			sxi(SX_ADDV, 44, 76, 92, 15);
19178cb1511Smacallan			/* write back */
192f221549cSmacallan			if (rest < 4) {
19372fd264fSmacallan				sxm(SX_STUQ0C, dstx, 92, rest - 1);
194f221549cSmacallan			} else {
19572fd264fSmacallan				sxm(SX_STUQ0C, dstx, 92, 3);
196f221549cSmacallan			}
19778cb1511Smacallan			dstx += 16;
198f221549cSmacallan			mskx += 4;
19978cb1511Smacallan		}
200f221549cSmacallan#else /* SX_SINGLE */
201a3a2ba44Smacallan		for (x = 0; x < width; x++) {
202a3a2ba44Smacallan			m = *(volatile uint8_t *)(p->fb + mskx);
203ad6af7a7Smacallan#ifdef SX_RENDER_DEBUG
204a3a2ba44Smacallan			buffer[x] = c[m >> 5];
205a3a2ba44Smacallan#endif
206a3a2ba44Smacallan			if (m == 0) {
207a3a2ba44Smacallan				/* nothing to do - all transparent */
208a3a2ba44Smacallan			} else if (m == 0xff) {
209a3a2ba44Smacallan				/* all opaque */
21072fd264fSmacallan				sxm(SX_STUQ0, dstx, 8, 0);
211a3a2ba44Smacallan			} else {
212a3a2ba44Smacallan				/* fetch alpha value, stick it into scam */
213a3a2ba44Smacallan				/* mask is in R[12:15] */
214a3a2ba44Smacallan				/*write_sx_io(p, mskx & ~7,
215a3a2ba44Smacallan				    SX_LDB(12, 0, mskx & 7));*/
216a3a2ba44Smacallan				write_sx_reg(p, SX_QUEUED(12), m);
217a3a2ba44Smacallan				/* fetch dst pixel */
21872fd264fSmacallan				sxm(SX_LDUQ0, dstx, 20, 0);
219230e26c7Smacallan				sxi(SX_ORV, 12, 0, R_SCAM, 0);
220a3a2ba44Smacallan				/*
221a3a2ba44Smacallan				 * src * alpha + R0
222a3a2ba44Smacallan				 * R[9:11] * SCAM + R0 -> R[17:19]
223a3a2ba44Smacallan				 */
224230e26c7Smacallan				sxi(SX_SAXP16X16SR8, 9, 0, 17, 2);
225a3a2ba44Smacallan
226a3a2ba44Smacallan				/* invert SCAM */
227230e26c7Smacallan				sxi(SX_XORV, 12, 8, R_SCAM, 0);
228ad6af7a7Smacallan#ifdef SX_RENDER_DEBUG
229230e26c7Smacallan				sxi(SX_XORV, 12, 8, 13, 0);
230a3a2ba44Smacallan#endif
231a3a2ba44Smacallan				/* dst * (1 - alpha) + R[13:15] */
232230e26c7Smacallan				sxi(SX_SAXP16X16SR8, 21, 17, 25, 2);
23372fd264fSmacallan				sxm(SX_STUQ0C, dstx, 24, 0);
234a3a2ba44Smacallan			}
235a3a2ba44Smacallan			dstx += 4;
236a3a2ba44Smacallan			mskx += 1;
237a3a2ba44Smacallan		}
238f221549cSmacallan#endif /* SX_SINGLE */
239ad6af7a7Smacallan#ifdef SX_RENDER_DEBUG
240a3a2ba44Smacallan		buffer[x] = 0;
241a3a2ba44Smacallan		xf86Msg(X_ERROR, "%s\n", buffer);
242a3a2ba44Smacallan#endif
243a3a2ba44Smacallan		dst += dstpitch;
244a3a2ba44Smacallan		msk += srcpitch;
245a3a2ba44Smacallan	}
246a3a2ba44Smacallan}
247a3a2ba44Smacallan
248a3a2ba44Smacallanvoid CG14Comp_Add32(Cg14Ptr p,
249a3a2ba44Smacallan                   uint32_t src, uint32_t srcpitch,
250a3a2ba44Smacallan                   uint32_t dst, uint32_t dstpitch,
251a3a2ba44Smacallan                   int width, int height)
252a3a2ba44Smacallan{
253a3a2ba44Smacallan	int line;
254a3a2ba44Smacallan	uint32_t srcx, dstx;
255a3a2ba44Smacallan	int full, part, x;
256a3a2ba44Smacallan
257a3a2ba44Smacallan	ENTER;
258a3a2ba44Smacallan	full = width >> 3;	/* chunks of 8 */
259a3a2ba44Smacallan	part = width & 7;	/* leftovers */
260a3a2ba44Smacallan	/* we do this up to 8 pixels at a time */
261a3a2ba44Smacallan	for (line = 0; line < height; line++) {
262a3a2ba44Smacallan		srcx = src;
263a3a2ba44Smacallan		dstx = dst;
264a3a2ba44Smacallan		for (x = 0; x < full; x++) {
26572fd264fSmacallan			sxm(SX_LDUQ0, srcx, 8, 31);
26672fd264fSmacallan			sxm(SX_LDUQ0, dstx, 40, 31);
267230e26c7Smacallan			sxi(SX_ADDV, 8, 40, 72, 15);
268230e26c7Smacallan			sxi(SX_ADDV, 24, 56, 88, 15);
26972fd264fSmacallan			sxm(SX_STUQ0, dstx, 72, 31);
270a3a2ba44Smacallan			srcx += 128;
271a3a2ba44Smacallan			dstx += 128;
272a3a2ba44Smacallan		}
273a3a2ba44Smacallan
274a3a2ba44Smacallan		/* do leftovers */
27572fd264fSmacallan		sxm(SX_LDUQ0, srcx, 8, part - 1);
27672fd264fSmacallan		sxm(SX_LDUQ0, dstx, 40, part - 1);
277a3a2ba44Smacallan		if (part & 16) {
278230e26c7Smacallan			sxi(SX_ADDV, 8, 40, 72, 15);
279230e26c7Smacallan			sxi(SX_ADDV, 24, 56, 88, part - 17);
280a3a2ba44Smacallan		} else {
281230e26c7Smacallan			sxi(SX_ADDV, 8, 40, 72, part - 1);
282a3a2ba44Smacallan		}
28372fd264fSmacallan		sxm(SX_STUQ0, dstx, 72, part - 1);
284a3a2ba44Smacallan
285a3a2ba44Smacallan		/* next line */
286a3a2ba44Smacallan		src += srcpitch;
287a3a2ba44Smacallan		dst += dstpitch;
288a3a2ba44Smacallan	}
289a3a2ba44Smacallan}
290a3a2ba44Smacallan
291a3a2ba44Smacallanvoid CG14Comp_Add8(Cg14Ptr p,
292a3a2ba44Smacallan                   uint32_t src, uint32_t srcpitch,
293a3a2ba44Smacallan                   uint32_t dst, uint32_t dstpitch,
294a3a2ba44Smacallan                   int width, int height)
295a3a2ba44Smacallan{
296a3a2ba44Smacallan	int line;
297a3a2ba44Smacallan	uint32_t srcx, dstx, srcoff, dstoff;
298a3a2ba44Smacallan	int pre, full, part, x;
299a3a2ba44Smacallan	uint8_t *d;
300a3a2ba44Smacallan	char buffer[256];
301a3a2ba44Smacallan	ENTER;
302a3a2ba44Smacallan
303a3a2ba44Smacallan	srcoff = src & 7;
304a3a2ba44Smacallan	src &= ~7;
305a3a2ba44Smacallan	dstoff = dst & 7;
306a3a2ba44Smacallan	dst &= ~7;
307a3a2ba44Smacallan	full = width >> 5;	/* chunks of 32 */
308a3a2ba44Smacallan	part = width & 31;	/* leftovers */
309a3a2ba44Smacallan
310ad6af7a7Smacallan#ifdef SX_RENDER_DEBUG
311a3a2ba44Smacallan	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
312a3a2ba44Smacallan	    width, height, full, part);
313a3a2ba44Smacallan#endif
314a3a2ba44Smacallan	/* we do this up to 32 pixels at a time */
315a3a2ba44Smacallan	for (line = 0; line < height; line++) {
316a3a2ba44Smacallan		srcx = src;
317a3a2ba44Smacallan		dstx = dst;
318a3a2ba44Smacallan#ifdef SX_ADD_SOFTWARE
319a3a2ba44Smacallan		uint8_t *s = (uint8_t *)(p->fb + srcx + srcoff);
320a3a2ba44Smacallan		d = (uint8_t *)(p->fb + dstx + dstoff);
321a3a2ba44Smacallan		for (x = 0; x < width; x++) {
322a3a2ba44Smacallan			d[x] = min(255, s[x] + d[x]);
323a3a2ba44Smacallan		}
324a3a2ba44Smacallan#else
325a3a2ba44Smacallan		for (x = 0; x < full; x++) {
326a3a2ba44Smacallan			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
327a3a2ba44Smacallan			write_sx_io(p, dstx, SX_LDB(40, 31, dstoff));
328230e26c7Smacallan			sxi(SX_ADDV, 8, 40, 72, 15);
329230e26c7Smacallan			sxi(SX_ADDV, 24, 56, 88, 15);
330a3a2ba44Smacallan			write_sx_io(p, dstx, SX_STBC(72, 31, dstoff));
331a3a2ba44Smacallan			srcx += 32;
332a3a2ba44Smacallan			dstx += 32;
333a3a2ba44Smacallan		}
334a3a2ba44Smacallan
335a3a2ba44Smacallan		if (part > 0) {
336a3a2ba44Smacallan			/* do leftovers */
337a3a2ba44Smacallan			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
338a3a2ba44Smacallan			write_sx_io(p, dstx, SX_LDB(40, part - 1, dstoff));
339a3a2ba44Smacallan			if (part > 16) {
340230e26c7Smacallan				sxi(SX_ADDV, 8, 40, 72, 15);
341230e26c7Smacallan				sxi(SX_ADDV, 24, 56, 88, part - 17);
342a3a2ba44Smacallan			} else {
343230e26c7Smacallan				sxi(SX_ADDV, 8, 40, 72, part - 1);
344a3a2ba44Smacallan			}
345a3a2ba44Smacallan			write_sx_io(p, dstx, SX_STBC(72, part - 1, dstoff));
346a3a2ba44Smacallan		}
347a3a2ba44Smacallan#endif
348ad6af7a7Smacallan#ifdef SX_RENDER_DEBUG
349d71cb32dSmacallan		d = (uint8_t *)(p->fb + src + srcoff);
350d71cb32dSmacallan		for (x = 0; x < width; x++) {
351d71cb32dSmacallan			buffer[x] = c[d[x]>>5];
352d71cb32dSmacallan		}
353d71cb32dSmacallan		buffer[x] = 0;
354d71cb32dSmacallan		xf86Msg(X_ERROR, "%s\n", buffer);
355d71cb32dSmacallan#endif
356d71cb32dSmacallan		/* next line */
357d71cb32dSmacallan		src += srcpitch;
358d71cb32dSmacallan		dst += dstpitch;
359d71cb32dSmacallan	}
360d71cb32dSmacallan}
361d71cb32dSmacallan
362d71cb32dSmacallanvoid CG14Comp_Add8_32(Cg14Ptr p,
363d71cb32dSmacallan                   uint32_t src, uint32_t srcpitch,
364d71cb32dSmacallan                   uint32_t dst, uint32_t dstpitch,
365d71cb32dSmacallan                   int width, int height)
366d71cb32dSmacallan{
367d71cb32dSmacallan	int line;
368d71cb32dSmacallan	uint32_t srcx, dstx, srcoff, dstoff;
369d71cb32dSmacallan	int pre, full, part, x;
370d71cb32dSmacallan	uint8_t *d;
371d71cb32dSmacallan	char buffer[256];
372d71cb32dSmacallan	ENTER;
373d71cb32dSmacallan
374d71cb32dSmacallan	srcoff = src & 7;
375d71cb32dSmacallan	src &= ~7;
376d71cb32dSmacallan	dstoff = dst & 7;
377d71cb32dSmacallan	dst &= ~7;
378d71cb32dSmacallan	full = width >> 5;	/* chunks of 32 */
379d71cb32dSmacallan	part = width & 31;	/* leftovers */
380d71cb32dSmacallan
381ad6af7a7Smacallan#ifdef SX__RENDER_DEBUG
382d71cb32dSmacallan	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
383d71cb32dSmacallan	    width, height, full, part);
384d71cb32dSmacallan#endif
385d71cb32dSmacallan	/* we do this up to 32 pixels at a time */
386d71cb32dSmacallan	for (line = 0; line < height; line++) {
387d71cb32dSmacallan		srcx = src;
388d71cb32dSmacallan		dstx = dst;
389d71cb32dSmacallan		for (x = 0; x < full; x++) {
390d71cb32dSmacallan			/* load source bytes */
391d71cb32dSmacallan			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
392d71cb32dSmacallan			/* load alpha from destination */
393d71cb32dSmacallan			write_sx_io(p, dstx, SX_LDUC0(40, 31, dstoff));
394230e26c7Smacallan			sxi(SX_ADDV, 8, 40, 72, 15);
395230e26c7Smacallan			sxi(SX_ADDV, 24, 56, 88, 15);
396d71cb32dSmacallan			/* write clamped values back into dest alpha */
397d71cb32dSmacallan			write_sx_io(p, dstx, SX_STUC0C(72, 31, dstoff));
398d71cb32dSmacallan			srcx += 32;
399d71cb32dSmacallan			dstx += 128;
400d71cb32dSmacallan		}
401d71cb32dSmacallan
402d71cb32dSmacallan		if (part > 0) {
403d71cb32dSmacallan			/* do leftovers */
404d71cb32dSmacallan			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
405d71cb32dSmacallan			write_sx_io(p, dstx, SX_LDUC0(40, part - 1, dstoff));
406d71cb32dSmacallan			if (part > 16) {
407230e26c7Smacallan				sxi(SX_ADDV, 8, 40, 72, 15);
408230e26c7Smacallan				sxi(SX_ADDV, 24, 56, 88, part - 17);
409d71cb32dSmacallan			} else {
410230e26c7Smacallan				sxi(SX_ADDV, 8, 40, 72, part - 1);
411d71cb32dSmacallan			}
412d71cb32dSmacallan			write_sx_io(p, dstx, SX_STUC0C(72, part - 1, dstoff));
413d71cb32dSmacallan		}
414ad6af7a7Smacallan#ifdef SX_RENDER_DEBUG
415a3a2ba44Smacallan		d = (uint8_t *)(p->fb + src + srcoff);
416a3a2ba44Smacallan		for (x = 0; x < width; x++) {
417a3a2ba44Smacallan			buffer[x] = c[d[x]>>5];
418a3a2ba44Smacallan		}
419a3a2ba44Smacallan		buffer[x] = 0;
420a3a2ba44Smacallan		xf86Msg(X_ERROR, "%s\n", buffer);
421a3a2ba44Smacallan#endif
422a3a2ba44Smacallan		/* next line */
423a3a2ba44Smacallan		src += srcpitch;
424a3a2ba44Smacallan		dst += dstpitch;
425a3a2ba44Smacallan	}
426a3a2ba44Smacallan}
427a3a2ba44Smacallan
428a3a2ba44Smacallanvoid CG14Comp_Over32(Cg14Ptr p,
429a3a2ba44Smacallan                   uint32_t src, uint32_t srcpitch,
430a3a2ba44Smacallan                   uint32_t dst, uint32_t dstpitch,
431e311bbeeSmacallan                   int width, int height, int flip)
432a3a2ba44Smacallan{
43378d1a11bSmacallan	uint32_t srcx, dstx, mskx, m;
43478d1a11bSmacallan	int line, x, i, num;
435a3a2ba44Smacallan
436a3a2ba44Smacallan	ENTER;
437a3a2ba44Smacallan
438a3a2ba44Smacallan	write_sx_reg(p, SX_QUEUED(8), 0xff);
439a3a2ba44Smacallan	for (line = 0; line < height; line++) {
440a3a2ba44Smacallan		srcx = src;
441a3a2ba44Smacallan		dstx = dst;
442a3a2ba44Smacallan
44378d1a11bSmacallan		for (x = 0; x < width; x += 4) {
44478d1a11bSmacallan			/* we do up to 4 pixels at a time */
44578d1a11bSmacallan			num = min(4, width - x);
44678d1a11bSmacallan			if (num <= 0) {
44778d1a11bSmacallan				xf86Msg(X_ERROR, "wtf?!\n");
44878d1a11bSmacallan				continue;
44978d1a11bSmacallan			}
45078d1a11bSmacallan			/* fetch source pixels */
45172fd264fSmacallan			sxm(SX_LDUQ0, srcx, 12, num - 1);
452e311bbeeSmacallan			if (flip) {
453230e26c7Smacallan				sxi(SX_GATHER, 13, 4, 40, num - 1);
454230e26c7Smacallan				sxi(SX_GATHER, 15, 4, 44, num - 1);
455230e26c7Smacallan				sxi(SX_SCATTER, 40, 4, 15, num - 1);
456230e26c7Smacallan				sxi(SX_SCATTER, 44, 4, 13, num - 1);
45778d1a11bSmacallan			}
45878d1a11bSmacallan			/* fetch dst pixels */
45972fd264fSmacallan			sxm(SX_LDUQ0, dstx, 44, num - 1);
46078d1a11bSmacallan			/* now process up to 4 pixels */
46178d1a11bSmacallan			for (i = 0; i < num; i++) {
46278d1a11bSmacallan				int ii = i << 2;
46378d1a11bSmacallan				/* write inverted alpha into SCAM */
464230e26c7Smacallan				sxi(SX_XORS, 12 + ii, 8, R_SCAM, 0);
46578d1a11bSmacallan				/* dst * (1 - alpha) + src */
466230e26c7Smacallan				sxi(SX_SAXP16X16SR8, 44 + ii, 12 + ii, 76 + ii, 3);
467e311bbeeSmacallan			}
46872fd264fSmacallan			sxm(SX_STUQ0C, dstx, 76, num - 1);
46978d1a11bSmacallan			srcx += 16;
47078d1a11bSmacallan			dstx += 16;
471a3a2ba44Smacallan		}
472a3a2ba44Smacallan		src += srcpitch;
47378d1a11bSmacallan		dst += dstpitch;
474a3a2ba44Smacallan	}
475a3a2ba44Smacallan}
476a3a2ba44Smacallan
477a3a2ba44Smacallanvoid CG14Comp_Over32Mask(Cg14Ptr p,
478a3a2ba44Smacallan                   uint32_t src, uint32_t srcpitch,
479a3a2ba44Smacallan                   uint32_t msk, uint32_t mskpitch,
480a3a2ba44Smacallan                   uint32_t dst, uint32_t dstpitch,
481e311bbeeSmacallan                   int width, int height, int flip)
482a3a2ba44Smacallan{
483a3a2ba44Smacallan	uint32_t srcx, dstx, mskx, m;
48478d1a11bSmacallan	int line, x, i, num;
485a3a2ba44Smacallan
486a3a2ba44Smacallan	ENTER;
487a3a2ba44Smacallan
488a3a2ba44Smacallan	write_sx_reg(p, SX_QUEUED(8), 0xff);
489a3a2ba44Smacallan	for (line = 0; line < height; line++) {
490a3a2ba44Smacallan		srcx = src;
491a3a2ba44Smacallan		mskx = msk;
492a3a2ba44Smacallan		dstx = dst;
493a3a2ba44Smacallan
49478d1a11bSmacallan		for (x = 0; x < width; x += 4) {
49578d1a11bSmacallan			/* we do up to 4 pixels at a time */
49678d1a11bSmacallan			num = min(4, width - x);
49778d1a11bSmacallan			if (num <= 0) {
49878d1a11bSmacallan				xf86Msg(X_ERROR, "wtf?!\n");
49978d1a11bSmacallan				continue;
50078d1a11bSmacallan			}
50178d1a11bSmacallan			/* fetch source pixels */
50272fd264fSmacallan			sxm(SX_LDUQ0, srcx, 12, num - 1);
503e311bbeeSmacallan			if (flip) {
504230e26c7Smacallan				sxi(SX_GATHER, 13, 4, 40, num - 1);
505230e26c7Smacallan				sxi(SX_GATHER, 15, 4, 44, num - 1);
506230e26c7Smacallan				sxi(SX_SCATTER, 40, 4, 15, num - 1);
507230e26c7Smacallan				sxi(SX_SCATTER, 44, 4, 13, num - 1);
508e311bbeeSmacallan			}
509a3a2ba44Smacallan			/* fetch mask */
51072fd264fSmacallan			sxm(SX_LDB, mskx, 28, num - 1);
51178d1a11bSmacallan			/* fetch dst pixels */
51272fd264fSmacallan			sxm(SX_LDUQ0, dstx, 44, num - 1);
51378d1a11bSmacallan			/* now process up to 4 pixels */
51478d1a11bSmacallan			for (i = 0; i < num; i++) {
51578d1a11bSmacallan				int ii = i << 2;
51678d1a11bSmacallan				/* mask alpha to SCAM */
517230e26c7Smacallan				sxi(SX_ORS, 28 + i, 0, R_SCAM, 0);
51878d1a11bSmacallan				/* src * alpha */
519230e26c7Smacallan				sxi(SX_SAXP16X16SR8, 12 + ii, 0, 60 + ii, 3);
52078d1a11bSmacallan				/* write inverted alpha into SCAM */
521230e26c7Smacallan				sxi(SX_XORS, 28 + i, 8, R_SCAM, 0);
52278d1a11bSmacallan				/* dst * (1 - alpha) + R[60:] */
523230e26c7Smacallan				sxi(SX_SAXP16X16SR8, 44 + ii, 60 + ii, 76 + ii, 3);
52478d1a11bSmacallan			}
52572fd264fSmacallan			sxm(SX_STUQ0C, dstx, 76, num - 1);
52678d1a11bSmacallan			srcx += 16;
52778d1a11bSmacallan			mskx += 4;
52878d1a11bSmacallan			dstx += 16;
529a3a2ba44Smacallan		}
530a3a2ba44Smacallan		src += srcpitch;
531a3a2ba44Smacallan		msk += mskpitch;
532a3a2ba44Smacallan		dst += dstpitch;
533a3a2ba44Smacallan	}
534a3a2ba44Smacallan}
5356bdc2ffdSmacallan
5366bdc2ffdSmacallanvoid CG14Comp_Over32Mask_noalpha(Cg14Ptr p,
5376bdc2ffdSmacallan                   uint32_t src, uint32_t srcpitch,
5386bdc2ffdSmacallan                   uint32_t msk, uint32_t mskpitch,
5396bdc2ffdSmacallan                   uint32_t dst, uint32_t dstpitch,
540e311bbeeSmacallan                   int width, int height, int flip)
5416bdc2ffdSmacallan{
5426bdc2ffdSmacallan	uint32_t srcx, dstx, mskx, m;
54378d1a11bSmacallan	int line, x, i, num;
5446bdc2ffdSmacallan
5456bdc2ffdSmacallan	ENTER;
5466bdc2ffdSmacallan
5476bdc2ffdSmacallan	write_sx_reg(p, SX_QUEUED(8), 0xff);
54878d1a11bSmacallan	write_sx_reg(p, SX_QUEUED(9), 0xff);
549230e26c7Smacallan	sxi(SX_ORS, 8, 0, 10, 1);
5506bdc2ffdSmacallan	for (line = 0; line < height; line++) {
5516bdc2ffdSmacallan		srcx = src;
5526bdc2ffdSmacallan		mskx = msk;
5536bdc2ffdSmacallan		dstx = dst;
5546bdc2ffdSmacallan
55578d1a11bSmacallan		for (x = 0; x < width; x += 4) {
55678d1a11bSmacallan			/* we do up to 4 pixels at a time */
55778d1a11bSmacallan			num = min(4, width - x);
55878d1a11bSmacallan			if (num <= 0) {
55978d1a11bSmacallan				xf86Msg(X_ERROR, "wtf?!\n");
56078d1a11bSmacallan				continue;
56178d1a11bSmacallan			}
56278d1a11bSmacallan			/* fetch source pixels */
56372fd264fSmacallan			sxm(SX_LDUQ0, srcx, 12, num - 1);
564e311bbeeSmacallan			if (flip) {
565230e26c7Smacallan				sxi(SX_GATHER, 13, 4, 40, num - 1);
566230e26c7Smacallan				sxi(SX_GATHER, 15, 4, 44, num - 1);
567230e26c7Smacallan				sxi(SX_SCATTER, 40, 4, 15, num - 1);
568230e26c7Smacallan				sxi(SX_SCATTER, 44, 4, 13, num - 1);
569e311bbeeSmacallan			}
5706bdc2ffdSmacallan			/* fetch mask */
57172fd264fSmacallan			sxm(SX_LDB, mskx, 28, num - 1);
57278d1a11bSmacallan			/* fetch dst pixels */
57372fd264fSmacallan			sxm(SX_LDUQ0, dstx, 44, num - 1);
57478d1a11bSmacallan			/* set src alpha to 0xff */
575230e26c7Smacallan			sxi(SX_SCATTER, 8, 4, 12, num - 1);
57678d1a11bSmacallan			/* now process up to 4 pixels */
57778d1a11bSmacallan			for (i = 0; i < num; i++) {
57878d1a11bSmacallan				int ii = i << 2;
57978d1a11bSmacallan				/* mask alpha to SCAM */
580230e26c7Smacallan				sxi(SX_ORS, 28 + i, 0, R_SCAM, 0);
58178d1a11bSmacallan				/* src * alpha */
582230e26c7Smacallan				sxi(SX_SAXP16X16SR8, 12 + ii, 0, 60 + ii, 3);
58378d1a11bSmacallan				/* write inverted alpha into SCAM */
584230e26c7Smacallan				sxi(SX_XORS, 28 + i, 8, R_SCAM, 0);
58578d1a11bSmacallan				/* dst * (1 - alpha) + R[60:] */
586230e26c7Smacallan				sxi(SX_SAXP16X16SR8, 44 + ii, 60 + ii, 76 + ii, 3);
58778d1a11bSmacallan			}
58872fd264fSmacallan			sxm(SX_STUQ0C, dstx, 76, num - 1);
58978d1a11bSmacallan			srcx += 16;
59078d1a11bSmacallan			mskx += 4;
59178d1a11bSmacallan			dstx += 16;
5926bdc2ffdSmacallan		}
5936bdc2ffdSmacallan		src += srcpitch;
5946bdc2ffdSmacallan		msk += mskpitch;
5956bdc2ffdSmacallan		dst += dstpitch;
5966bdc2ffdSmacallan	}
5976bdc2ffdSmacallan}
598fa158432Smacallan
599fa158432Smacallanvoid CG14Comp_Over32Mask32_noalpha(Cg14Ptr p,
600fa158432Smacallan                   uint32_t src, uint32_t srcpitch,
601fa158432Smacallan                   uint32_t msk, uint32_t mskpitch,
602fa158432Smacallan                   uint32_t dst, uint32_t dstpitch,
603e311bbeeSmacallan                   int width, int height, int flip)
604fa158432Smacallan{
605fa158432Smacallan	uint32_t srcx, dstx, mskx, m;
60678d1a11bSmacallan	int line, x, i, num;
607fa158432Smacallan
608fa158432Smacallan	ENTER;
609fa158432Smacallan
610fa158432Smacallan	write_sx_reg(p, SX_QUEUED(8), 0xff);
61178d1a11bSmacallan	write_sx_reg(p, SX_QUEUED(9), 0xff);
612230e26c7Smacallan	sxi(SX_ORS, 8, 0, 10, 1);
613fa158432Smacallan	for (line = 0; line < height; line++) {
614fa158432Smacallan		srcx = src;
615fa158432Smacallan		mskx = msk;
616fa158432Smacallan		dstx = dst;
617fa158432Smacallan
61878d1a11bSmacallan		for (x = 0; x < width; x += 4) {
61978d1a11bSmacallan			/* we do up to 4 pixels at a time */
62078d1a11bSmacallan			num = min(4, width - x);
62178d1a11bSmacallan			if (num <= 0) {
62278d1a11bSmacallan				xf86Msg(X_ERROR, "wtf?!\n");
62378d1a11bSmacallan				continue;
62478d1a11bSmacallan			}
62578d1a11bSmacallan			/* fetch source pixels */
62672fd264fSmacallan			sxm(SX_LDUQ0, srcx, 12, num - 1);
627e311bbeeSmacallan			if (flip) {
628230e26c7Smacallan				sxi(SX_GATHER, 13, 4, 40, num - 1);
629230e26c7Smacallan				sxi(SX_GATHER, 15, 4, 44, num - 1);
630230e26c7Smacallan				sxi(SX_SCATTER, 40, 4, 15, num - 1);
631230e26c7Smacallan				sxi(SX_SCATTER, 44, 4, 13, num - 1);
632e311bbeeSmacallan			}
633fa158432Smacallan			/* fetch mask */
63472fd264fSmacallan			sxm(SX_LDUQ0, mskx, 28, num - 1);
63578d1a11bSmacallan			/* fetch dst pixels */
63672fd264fSmacallan			sxm(SX_LDUQ0, dstx, 44, num - 1);
63778d1a11bSmacallan			/* set src alpha to 0xff */
638230e26c7Smacallan			sxi(SX_SCATTER, 8, 4, 12, num - 1);
63978d1a11bSmacallan			/* now process up to 4 pixels */
64078d1a11bSmacallan			for (i = 0; i < num; i++) {
64178d1a11bSmacallan				int ii = i << 2;
64278d1a11bSmacallan				/* mask alpha to SCAM */
643230e26c7Smacallan				sxi(SX_ORS, 28 + ii, 0, R_SCAM, 0);
64478d1a11bSmacallan				/* src * alpha */
645230e26c7Smacallan				sxi(SX_SAXP16X16SR8, 12 + ii, 0, 60 + ii, 3);
64678d1a11bSmacallan				/* write inverted alpha into SCAM */
647230e26c7Smacallan				sxi(SX_XORS, 28 + ii, 8, R_SCAM, 0);
64878d1a11bSmacallan				/* dst * (1 - alpha) + R[60:] */
649230e26c7Smacallan				sxi(SX_SAXP16X16SR8, 44 + ii, 60 + ii, 76 + ii, 3);
65078d1a11bSmacallan			}
65172fd264fSmacallan			sxm(SX_STUQ0C, dstx, 76, num - 1);
65278d1a11bSmacallan			srcx += 16;
65378d1a11bSmacallan			mskx += 16;
65478d1a11bSmacallan			dstx += 16;
655fa158432Smacallan		}
656fa158432Smacallan		src += srcpitch;
657fa158432Smacallan		msk += mskpitch;
658fa158432Smacallan		dst += dstpitch;
659fa158432Smacallan	}
660fa158432Smacallan}
661