cg14_render.c revision f221549c
/* $NetBSD: cg14_render.c,v 1.9 2016/09/16 22:07:25 macallan Exp $ */
/*
 * Copyright (c) 2013 Michael Lorenz
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <sys/types.h>

/* all drivers need this */
#include "xf86.h"
#include "xf86_OSproc.h"
#include "compiler.h"

#include "cg14.h"
#include <sparc/sxreg.h>

/*#define SX_SINGLE*/
/*#define SX_RENDER_DEBUG*/
/*#define SX_ADD_SOFTWARE*/

#ifdef SX_RENDER_DEBUG
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DPRINTF xf86Msg
#else
#define ENTER
#define DPRINTF while (0) xf86Msg
#endif

/* 8-step intensity ramp for the SX_DEBUG ASCII dumps; indexed only, so
 * no terminating NUL is needed */
char c[8] = " .,:+*oX";

void CG14Comp_Over32Solid(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x, i;

	ENTER;

	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			write_sx_io(p, mskx, SX_LDUQ0(12, 3, mskx & 7));
			/* fetch destination pixels */
			write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
			/* duplicate them for all channels */
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 16, 17, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 20, 21, 2));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 24, 25, 2));
			/* generate inverted alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORS(12, 8, 28, 15));
			/* multiply source */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 12, 44, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 16, 48, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 20, 52, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 24, 56, 3));
			/* multiply dest */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(28, 60, 76, 15));
			/* add up */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(44, 76, 92, 15));
			/* write back */
			if (rest < 4) {
				write_sx_io(p, dstx, SX_STUQ0C(92, rest - 1, dstx & 7));
			} else {
				write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
			}
			dstx += 16;
			mskx += 16;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint32_t *)(p->fb + mskx);
			m = m >> 24;
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				write_sx_io(p, dstx, SX_STUQ0(8, 0, dstx & 7));
			} else {
				/* fetch alpha value, stick it into SCAM */
				/* mask is in R[12:15] */
				/*write_sx_io(p, mskx,
				    SX_LDUQ0(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORV(12, 0, R_SCAM, 0));
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(9, 0, 17, 2));

				/* invert SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, R_SCAM, 0));
#ifdef SX_DEBUG
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, 13, 0));
#endif
				/* dst * (1 - alpha) + R[17:19] -> R[25:27] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(21, 17, 25, 2));
				write_sx_io(p, dstx,
				    SX_STUQ0C(24, 0, dstx & 7));
			}
			dstx += 4;
			mskx += 4;
		}
#endif /* SX_SINGLE */
		dst += dstpitch;
		msk += srcpitch;
	}
}
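
/*
 * For reference, a minimal software sketch of the blend the SX programs
 * in CG14Comp_Over32Solid above and CG14Comp_Over8Solid below compute:
 * the solid, premultiplied source colour is assumed to sit in R[8:11]
 * (set up elsewhere in the driver), and each destination channel becomes
 * (src * m + dst * (255 - m)) >> 8 for mask alpha m, with the SR8 shift
 * standing in for a divide by 255.  Kept under #if 0 - the helper is
 * illustrative only, not driver API.
 */
#if 0
static inline uint8_t
over_solid_channel(uint8_t s, uint8_t d, uint8_t m)
{
	/* src weighted by mask, dst by inverted mask, then summed */
	return (uint8_t)(((unsigned)s * m + (unsigned)d * (255 - m)) >> 8);
}
#endif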

void CG14Comp_Over8Solid(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x, i;
#ifdef SX_DEBUG
	char buffer[256];
#endif
	ENTER;

	DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
	    read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
	    *(uint32_t *)(p->fb + p->srcoff));
	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			write_sx_io(p, mskx, SX_LDB(12, 3, mskx & 7));
			/* fetch destination pixels */
			write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
			/* duplicate them for all channels */
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 13, 16, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 14, 20, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 15, 24, 3));
			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
			/* generate inverted alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORS(12, 8, 28, 15));
			/* multiply source */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 12, 44, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 16, 48, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 20, 52, 3));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(8, 24, 56, 3));
			/* multiply dest */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_MUL16X16SR8(28, 60, 76, 15));
			/* add up */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(44, 76, 92, 15));
			/* write back */
			if (rest < 4) {
				write_sx_io(p, dstx, SX_STUQ0C(92, rest - 1, dstx & 7));
			} else {
				write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
			}
			dstx += 16;
			mskx += 4;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint8_t *)(p->fb + mskx);
#ifdef SX_DEBUG
			buffer[x] = c[m >> 5];
#endif
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				write_sx_io(p, dstx, SX_STUQ0(8, 0, dstx & 7));
			} else {
				/* fetch alpha value, stick it into SCAM */
				/* mask is in R12 */
				/*write_sx_io(p, mskx & ~7,
				    SX_LDB(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ORV(12, 0, R_SCAM, 0));
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(9, 0, 17, 2));

				/* invert SCAM */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, R_SCAM, 0));
#ifdef SX_DEBUG
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_XORV(12, 8, 13, 0));
#endif
				/* dst * (1 - alpha) + R[17:19] -> R[25:27] */
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_SAXP16X16SR8(21, 17, 25, 2));
				write_sx_io(p, dstx,
				    SX_STUQ0C(24, 0, dstx & 7));
			}
			dstx += 4;
			mskx += 1;
		}
#endif /* SX_SINGLE */
#ifdef SX_DEBUG
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		dst += dstpitch;
		msk += srcpitch;
	}
}

void CG14Comp_Add32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx;
	int full, part, x;

	ENTER;
	full = width >> 3;	/* chunks of 8 */
	part = width & 7;	/* leftovers */
	/* we do this up to 8 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
		for (x = 0; x < full; x++) {
			write_sx_io(p, srcx, SX_LDUQ0(8, 31, srcx & 7));
			write_sx_io(p, dstx, SX_LDUQ0(40, 31, dstx & 7));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			write_sx_io(p, dstx, SX_STUQ0(72, 31, dstx & 7));
			srcx += 128;
			dstx += 128;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDUQ0(8, part - 1, srcx & 7));
			write_sx_io(p, dstx, SX_LDUQ0(40, part - 1, dstx & 7));
			if (part > 16) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, part - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STUQ0(72, part - 1, dstx & 7));
		}

		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}
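
/*
 * What CG14Comp_Add32 does, as a software loop written in the style of
 * the SX_ADD_SOFTWARE fallback in CG14Comp_Add8 below.  Per-byte
 * saturation is assumed to match that fallback; the function is
 * illustrative only and not built.
 */
#if 0
static void
add32_sw(Cg14Ptr p, uint32_t src, uint32_t srcpitch,
    uint32_t dst, uint32_t dstpitch, int width, int height)
{
	int line, x;

	for (line = 0; line < height; line++) {
		uint8_t *s = (uint8_t *)(p->fb + src);
		uint8_t *d = (uint8_t *)(p->fb + dst);

		/* four byte-sized channels per 32bit pixel */
		for (x = 0; x < width * 4; x++)
			d[x] = min(255, s[x] + d[x]);
		src += srcpitch;
		dst += dstpitch;
	}
}
#endif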

void CG14Comp_Add8(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	int line;
	uint32_t srcx, dstx, srcoff, dstoff;
	int pre, full, part, x;
	uint8_t *d;
#ifdef SX_DEBUG
	char buffer[256];
#endif
	ENTER;

	srcoff = src & 7;
	src &= ~7;
	dstoff = dst & 7;
	dst &= ~7;
	full = width >> 5;	/* chunks of 32 */
	part = width & 31;	/* leftovers */

#ifdef SX_DEBUG
	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
	    width, height, full, part);
#endif
	/* we do this up to 32 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
#ifdef SX_ADD_SOFTWARE
		uint8_t *s = (uint8_t *)(p->fb + srcx + srcoff);
		d = (uint8_t *)(p->fb + dstx + dstoff);
		for (x = 0; x < width; x++) {
			d[x] = min(255, s[x] + d[x]);
		}
#else
		for (x = 0; x < full; x++) {
			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, 31, dstoff));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(8, 40, 72, 15));
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ADDV(24, 56, 88, 15));
			write_sx_io(p, dstx, SX_STBC(72, 31, dstoff));
			srcx += 32;
			dstx += 32;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, part - 1, dstoff));
			if (part > 16) {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, 15));
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(24, 56, 88, part - 17));
			} else {
				write_sx_reg(p, SX_INSTRUCTIONS,
				    SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STBC(72, part - 1, dstoff));
		}
#endif
#ifdef SX_DEBUG
		d = (uint8_t *)(p->fb + src + srcoff);
		for (x = 0; x < width; x++) {
			buffer[x] = c[d[x]>>5];
		}
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}

void CG14Comp_Over32(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t srcx, dstx, m;
	int line, x, i;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;

		for (x = 0; x < width; x++) {
			/* fetch source pixel */
			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
			/* fetch dst pixel */
			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
			/* src is premultiplied with alpha */
			/* write inverted alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORV(12, 8, R_SCAM, 0));
			/* dst * (1 - alpha) + src from R[12:15] -> R[24:27] */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(20, 12, 24, 3));
			write_sx_io(p, dstx,
			    SX_STUQ0C(24, 0, dstx & 7));
			dstx += 4;
			srcx += 4;
		}
		dst += dstpitch;
		src += srcpitch;
	}
}
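
/*
 * Software equivalent of CG14Comp_Over32, for reference: with a
 * premultiplied source, OVER reduces per channel to
 * dst = src + dst * (255 - alpha) scaled back down, which the SX code
 * approximates with the SR8 shift.  ARGB order with alpha in the top
 * byte is assumed here; sketch only, not built.
 */
#if 0
static inline uint32_t
over32_sw(uint32_t sp, uint32_t dp)
{
	unsigned a = sp >> 24;	/* alpha assumed in the top byte */
	uint32_t res = 0;
	int i;

	for (i = 0; i < 32; i += 8) {
		unsigned s = (sp >> i) & 0xff, d = (dp >> i) & 0xff;

		res |= (s + ((d * (255 - a)) >> 8)) << i;
	}
	return res;
}
#endif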

void CG14Comp_Over32Mask(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t srcx, dstx, mskx, m;
	int line, x, i;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x++) {
			/* fetch source pixel */
			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
			/* fetch mask */
			write_sx_io(p, mskx & (~7), SX_LDB(9, 0, mskx & 7));
			/* fetch dst pixel */
			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
			/* stick mask alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(9, 0, R_SCAM, 0));
			/* apply mask */
			/* src is premultiplied with alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(12, 0, 16, 3));
			/* write inverted alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORV(16, 8, R_SCAM, 0));
			/* dst * (1 - alpha) + masked src from R[16:19] -> R[24:27] */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(20, 16, 24, 3));
			write_sx_io(p, dstx,
			    SX_STUQ0C(24, 0, dstx & 7));
			srcx += 4;
			mskx += 1;
			dstx += 4;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}

void CG14Comp_Over32Mask_noalpha(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t srcx, dstx, mskx, m;
	int line, x, i;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x++) {
			/* fetch source pixel */
			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
			/* set src alpha to 0xff */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(8, 0, 12, 0));
			/* fetch mask */
			write_sx_io(p, mskx & (~7), SX_LDB(9, 0, mskx & 7));
			/* fetch dst pixel */
			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
			/* write alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(9, 0, R_SCAM, 0));
			/* src * alpha + R0 -> R[16:19] */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(12, 0, 16, 3));
			/* write inverted alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORV(9, 8, R_SCAM, 0));
			/* dst * (1 - alpha) + R[16:19] -> R[24:27] */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(20, 16, 24, 3));
			write_sx_io(p, dstx,
			    SX_STUQ0C(24, 0, dstx & 7));
			srcx += 4;
			mskx += 1;
			dstx += 4;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}

void CG14Comp_Over32Mask32_noalpha(Cg14Ptr p,
                   uint32_t src, uint32_t srcpitch,
                   uint32_t msk, uint32_t mskpitch,
                   uint32_t dst, uint32_t dstpitch,
                   int width, int height)
{
	uint32_t srcx, dstx, mskx, m;
	int line, x, i;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x++) {
			/* fetch source pixel */
			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
			/* fetch mask */
			write_sx_io(p, mskx, SX_LDUQ0(16, 0, mskx & 7));
			/* fetch dst pixel */
			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
			/* set src alpha to 0xff */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(8, 0, 12, 0));
			/* mask alpha to SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_ORS(16, 0, R_SCAM, 0));
			/* src * alpha */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(12, 0, 24, 3));
			/* write inverted alpha into SCAM */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_XORS(16, 8, R_SCAM, 0));
			/* dst * (1 - alpha) + R[24:27] -> R[28:31] */
			write_sx_reg(p, SX_INSTRUCTIONS,
			    SX_SAXP16X16SR8(20, 24, 28, 3));
			write_sx_io(p, dstx,
			    SX_STUQ0C(28, 0, dstx & 7));
			srcx += 4;
			mskx += 4;
			dstx += 4;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}
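
/*
 * Reference sketch for the masked variants above, covering the _noalpha
 * case: the source alpha is treated as 0xff, so each channel becomes
 * (src * m + dst * (255 - m)) >> 8 with m taken from the A8 mask (the
 * Mask32 variant takes m from the first mask channel instead, and plain
 * CG14Comp_Over32Mask additionally weights the premultiplied source).
 * Framebuffer layout as in the SX paths is assumed; not built.
 */
#if 0
static void
over32mask_noalpha_sw(Cg14Ptr p, uint32_t src, uint32_t srcpitch,
    uint32_t msk, uint32_t mskpitch, uint32_t dst, uint32_t dstpitch,
    int width, int height)
{
	int line, x, i;

	for (line = 0; line < height; line++) {
		uint8_t *s = (uint8_t *)(p->fb + src);
		uint8_t *m = (uint8_t *)(p->fb + msk);
		uint8_t *d = (uint8_t *)(p->fb + dst);

		for (x = 0; x < width; x++) {
			unsigned ma = m[x];

			/* blend all four channels of one 32bit pixel */
			for (i = 0; i < 4; i++) {
				unsigned sv = s[x * 4 + i];
				unsigned dv = d[x * 4 + i];

				d[x * 4 + i] =
				    (sv * ma + dv * (255 - ma)) >> 8;
			}
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}
#endif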