cg14_accel.c revision 4261fa58
1/* $NetBSD: cg14_accel.c,v 1.1 2013/06/19 13:26:01 macallan Exp $ */
2/*
3 * Copyright (c) 2013 Michael Lorenz
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 *    - Redistributions of source code must retain the above copyright
11 *      notice, this list of conditions and the following disclaimer.
12 *    - Redistributions in binary form must reproduce the above
13 *      copyright notice, this list of conditions and the following
14 *      disclaimer in the documentation and/or other materials provided
15 *      with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 */
31
32#include <sys/types.h>
33
34/* all driver need this */
35#include "xf86.h"
36#include "xf86_OSproc.h"
37#include "compiler.h"
38
39#include "cg14.h"
40#include <sparc/sxreg.h>
41
/*
 * Build-time switches used further down in this file:
 *   SX_SINGLE       - the Over compositing routines handle one pixel per
 *                     iteration; without it they use the four-pixel paths
 *   SX_DEBUG        - turns ENTER and DPRINTF into real xf86Msg() calls
 *   SX_ADD_SOFTWARE - CG14Comp_Add8 adds through the CPU instead of SX
 */
#define SX_SINGLE
/*#define SX_DEBUG*/
/*#define SX_ADD_SOFTWARE*/

#ifdef SX_DEBUG
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DPRINTF xf86Msg
#else
#define ENTER
#define DPRINTF while (0) xf86Msg
#endif

/*
 * Number of elements in an array object (must not be used on pointers).
 * The argument is parenthesised so that any expression can be passed.
 */
#define arraysize(ary)        (sizeof(ary) / sizeof((ary)[0]))
55
56/* 0xcc is SX's GXcopy equivalent */
57uint32_t sx_rop[] = { 0x00, 0x88, 0x44, 0xcc, 0x22, 0xaa, 0x66, 0xee,
58		      0x11, 0x99, 0x55, 0xdd, 0x33, 0xbb, 0x77, 0xff};
59
60int src_formats[] = {PICT_a8r8g8b8, PICT_x8r8g8b8,
61		     PICT_a8b8g8r8, PICT_x8b8g8r8, PICT_a8};
62int tex_formats[] = {PICT_a8r8g8b8, PICT_a8b8g8r8, PICT_a8};
63
64char c[8] = " .,:+*oX";
65
66/* write an SX register */
67static inline void
68write_sx_reg(Cg14Ptr p, int reg, uint32_t val)
69{
70	*(volatile uint32_t *)(p->sxreg + reg) = val;
71}
72
73/* read an SX register */
74static inline uint32_t
75read_sx_reg(Cg14Ptr p, int reg)
76{
77	return *(volatile uint32_t *)(p->sxreg + reg);
78}
79
80/* write a memory referencing instruction */
81static inline void
82write_sx_io(Cg14Ptr p, int reg, uint32_t val)
83{
84	*(volatile uint32_t *)(p->sxio + reg) = val;
85}
86
87static inline void
88CG14Wait(Cg14Ptr p)
89{
90	/* we just wait until the instruction queue is empty */
91	while ((read_sx_reg(p, SX_CONTROL_STATUS) & SX_MT) != 0) {};
92}
93
94static void
95CG14WaitMarker(ScreenPtr pScreen, int Marker)
96{
97	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
98	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
99
100	CG14Wait(p);
101}
102
103static Bool
104CG14PrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap,
105		int xdir, int ydir, int alu, Pixel planemask)
106{
107	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
108	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
109
110	ENTER;
111	DPRINTF(X_ERROR, "bits per pixel: %d\n",
112	    pSrcPixmap->drawable.bitsPerPixel);
113
114	if (planemask != p->last_mask) {
115		CG14Wait(p);
116		write_sx_reg(p, SX_PLANEMASK, planemask);
117		p->last_mask = planemask;
118	}
119	alu = sx_rop[alu];
120	if (alu != p->last_rop) {
121		CG14Wait(p);
122		write_sx_reg(p, SX_ROP_CONTROL, alu);
123		p->last_rop = alu;
124	}
125	p->srcpitch = exaGetPixmapPitch(pSrcPixmap);
126	p->srcoff = exaGetPixmapOffset(pSrcPixmap);
127	p->xdir = xdir;
128	p->ydir = ydir;
129	return TRUE;
130}
131
132static void
133CG14Copy(PixmapPtr pDstPixmap,
134         int srcX, int srcY, int dstX, int dstY, int w, int h)
135{
136	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
137	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
138	int dstpitch, dstoff, srcpitch, srcoff;
139	int srcstart, dststart, xinc, srcinc, dstinc;
140	int line, count, s, d, num;
141
142	ENTER;
143	dstpitch = exaGetPixmapPitch(pDstPixmap);
144	dstoff = exaGetPixmapOffset(pDstPixmap);
145	srcpitch = p->srcpitch;
146	srcoff = p->srcoff;
147	/*
148	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
149	 * actually wrote anything and only sync if it did
150	 */
151	srcstart = (srcX << 2) + (srcpitch * srcY) + srcoff;
152	dststart = (dstX << 2) + (dstpitch * dstY) + dstoff;
153
154	/*
155	 * we always copy up to 32 pixels at a time so direction doesn't
156	 * matter if w<=32
157	 */
158	if (w > 32) {
159		if (p->xdir < 0) {
160			srcstart += (w - 32) << 2;
161			dststart += (w - 32) << 2;
162			xinc = -128;
163		} else
164			xinc = 128;
165	} else
166		xinc = 128;
167	if (p->ydir < 0) {
168		srcstart += (h - 1) * srcpitch;
169		dststart += (h - 1) * dstpitch;
170		srcinc = -srcpitch;
171		dstinc = -dstpitch;
172	} else {
173		srcinc = srcpitch;
174		dstinc = dstpitch;
175	}
176	if (p->last_rop == 0xcc) {
177		/* plain old copy */
178		if ( xinc > 0) {
179			/* going left to right */
180			for (line = 0; line < h; line++) {
181				count = 0;
182				s = srcstart;
183				d = dststart;
184				while ( count < w) {
185					num = min(32, w - count);
186					write_sx_io(p, s,
187					    SX_LD(10, num - 1, s & 7));
188					write_sx_io(p, d,
189					    SX_STM(10, num - 1, d & 7));
190					s += xinc;
191					d += xinc;
192					count += 32;
193				}
194				srcstart += srcinc;
195				dststart += dstinc;
196			}
197		} else {
198			/* going right to left */
199			int i, chunks = (w >> 5);
200			for (line = 0; line < h; line++) {
201				s = srcstart;
202				d = dststart;
203				count = w;
204				for (i = 0; i < chunks; i++) {
205					write_sx_io(p, s,
206					    SX_LD(10, 31, s & 7));
207					write_sx_io(p, d,
208					    SX_STM(10, 31, d & 7));
209					s -= 128;
210					d -= 128;
211					count -= 32;
212				}
213				/* leftovers, if any */
214				if (count > 0) {
215					s += (32 - count) << 2;
216					d += (32 - count) << 2;
217					write_sx_io(p, s,
218					    SX_LD(10, count - 1, s & 7));
219					write_sx_io(p, d,
220					    SX_STM(10, count - 1, d & 7));
221				}
222				srcstart += srcinc;
223				dststart += dstinc;
224			}
225		}
226	} else {
227		/* ROPs needed */
228		if ( xinc > 0) {
229			/* going left to right */
230			for (line = 0; line < h; line++) {
231				count = 0;
232				s = srcstart;
233				d = dststart;
234				while ( count < w) {
235					num = min(32, w - count);
236					write_sx_io(p, s,
237					    SX_LD(10, num - 1, s & 7));
238					write_sx_io(p, d,
239					    SX_LD(42, num - 1, d & 7));
240					if (num > 16) {
241						write_sx_reg(p, SX_INSTRUCTIONS,
242					    	 SX_ROP(10, 42, 74, 15));
243						write_sx_reg(p, SX_INSTRUCTIONS,
244					    	 SX_ROP(26, 58, 90, num - 17));
245					} else {
246						write_sx_reg(p, SX_INSTRUCTIONS,
247					    	 SX_ROP(10, 42, 74, num - 1));
248					}
249					write_sx_io(p, d,
250					    SX_STM(74, num - 1, d & 7));
251					s += xinc;
252					d += xinc;
253					count += 32;
254				}
255				srcstart += srcinc;
256				dststart += dstinc;
257			}
258		} else {
259			/* going right to left */
260			int i, chunks = (w >> 5);
261			for (line = 0; line < h; line++) {
262				s = srcstart;
263				d = dststart;
264				count = w;
265				for (i = 0; i < chunks; i++) {
266					write_sx_io(p, s, SX_LD(10, 31, s & 7));
267					write_sx_io(p, d, SX_LD(42, 31, d & 7));
268					write_sx_reg(p, SX_INSTRUCTIONS,
269				    	    SX_ROP(10, 42, 74, 15));
270					write_sx_reg(p, SX_INSTRUCTIONS,
271				    	    SX_ROP(26, 58, 90, 15));
272					write_sx_io(p, d,
273					    SX_STM(74, 31, d & 7));
274					s -= 128;
275					d -= 128;
276					count -= 32;
277				}
278				/* leftovers, if any */
279				if (count > 0) {
280					s += (32 - count) << 2;
281					d += (32 - count) << 2;
282					write_sx_io(p, s,
283					    SX_LD(10, count - 1, s & 7));
284					write_sx_io(p, d,
285					    SX_LD(42, count - 1, d & 7));
286					if (count > 16) {
287						write_sx_reg(p, SX_INSTRUCTIONS,
288					    	    SX_ROP(10, 42, 74, 15));
289						write_sx_reg(p, SX_INSTRUCTIONS,
290					    	 SX_ROP(26, 58, 90, count - 17));
291					} else {
292						write_sx_reg(p, SX_INSTRUCTIONS,
293					    	 SX_ROP(10, 42, 74, count - 1));
294					}
295
296					write_sx_io(p, d,
297					    SX_STM(74, count - 1, d & 7));
298				}
299				srcstart += srcinc;
300				dststart += dstinc;
301			}
302		}
303	}
304	exaMarkSync(pDstPixmap->drawable.pScreen);
305}
306
307static void
308CG14DoneCopy(PixmapPtr pDstPixmap)
309{
310}
311
312static Bool
313CG14PrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
314{
315	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
316	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
317
318	ENTER;
319	DPRINTF(X_ERROR, "bits per pixel: %d\n", pPixmap->drawable.bitsPerPixel);
320	write_sx_reg(p, SX_QUEUED(8), fg);
321	write_sx_reg(p, SX_QUEUED(9), fg);
322	if (planemask != p->last_mask) {
323		CG14Wait(p);
324		write_sx_reg(p, SX_PLANEMASK, planemask);
325		p->last_mask = planemask;
326	}
327	alu = sx_rop[alu];
328	if (alu != p->last_rop) {
329		CG14Wait(p);
330		write_sx_reg(p, SX_ROP_CONTROL, alu);
331		p->last_rop = alu;
332	}
333	DPRINTF(X_ERROR, "%s: %x\n", __func__, alu);
334	return TRUE;
335}
336
337static void
338CG14Solid32(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
339{
340	int line, x, num;
341	uint32_t ptr;
342
343	ENTER;
344	if (p->last_rop == 0xcc) {
345		/* simple fill */
346		for (line = 0; line < h; line++) {
347			x = 0;
348			while (x < w) {
349				ptr = start + (x << 2);
350				num = min(32, w - x);
351				write_sx_io(p, ptr,
352				    SX_STS(8, num - 1, ptr & 7));
353				x += 32;
354			}
355			start += pitch;
356		}
357	} else if (p->last_rop == 0xaa) {
358		/* nothing to do here */
359		return;
360	} else {
361		/* alright, let's do actual ROP stuff */
362
363		/* first repeat the fill colour into 16 registers */
364		write_sx_reg(p, SX_INSTRUCTIONS,
365		    SX_SELECT_S(8, 8, 10, 15));
366
367		for (line = 0; line < h; line++) {
368			x = 0;
369			while (x < w) {
370				ptr = start + (x << 2);
371				num = min(32, w - x);
372				/* now suck fb data into registers */
373				write_sx_io(p, ptr,
374				    SX_LD(42, num - 1, ptr & 7));
375				/*
376				 * ROP them with the fill data we left in 10
377				 * non-memory ops can only have counts up to 16
378				 */
379				if (num <= 16) {
380					write_sx_reg(p, SX_INSTRUCTIONS,
381					    SX_ROP(10, 42, 74, num - 1));
382				} else {
383					write_sx_reg(p, SX_INSTRUCTIONS,
384					    SX_ROP(10, 42, 74, 15));
385					write_sx_reg(p, SX_INSTRUCTIONS,
386					    SX_ROP(10, 58, 90, num - 17));
387				}
388				/* and write the result back into memory */
389				write_sx_io(p, ptr,
390				    SX_ST(74, num - 1, ptr & 7));
391				x += 32;
392			}
393			start += pitch;
394		}
395	}
396}
397
398static void
399CG14Solid8(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
400{
401	int line, x, num, off;
402	uint32_t ptr;
403
404	ENTER;
405	off = start & 7;
406	start &= ~7;
407
408	if (p->last_rop == 0xcc) {
409		/* simple fill */
410		for (line = 0; line < h; line++) {
411			x = 0;
412			while (x < w) {
413				ptr = start + x;
414				num = min(32, w - x);
415				write_sx_io(p, ptr,
416				    SX_STBS(8, num - 1, off));
417				x += 32;
418			}
419			start += pitch;
420		}
421	} else if (p->last_rop == 0xaa) {
422		/* nothing to do here */
423		return;
424	} else {
425		/* alright, let's do actual ROP stuff */
426
427		/* first repeat the fill colour into 16 registers */
428		write_sx_reg(p, SX_INSTRUCTIONS,
429		    SX_SELECT_S(8, 8, 10, 15));
430
431		for (line = 0; line < h; line++) {
432			x = 0;
433			while (x < w) {
434				ptr = start + x;
435				num = min(32, w - x);
436				/* now suck fb data into registers */
437				write_sx_io(p, ptr,
438				    SX_LDB(42, num - 1, off));
439				/*
440				 * ROP them with the fill data we left in 10
441				 * non-memory ops can only have counts up to 16
442				 */
443				if (num <= 16) {
444					write_sx_reg(p, SX_INSTRUCTIONS,
445					    SX_ROP(10, 42, 74, num - 1));
446				} else {
447					write_sx_reg(p, SX_INSTRUCTIONS,
448					    SX_ROP(10, 42, 74, 15));
449					write_sx_reg(p, SX_INSTRUCTIONS,
450					    SX_ROP(10, 58, 90, num - 17));
451				}
452				/* and write the result back into memory */
453				write_sx_io(p, ptr,
454				    SX_STB(74, num - 1, off));
455				x += 32;
456			}
457			start += pitch;
458		}
459	}
460}
461
462static void
463CG14Solid(PixmapPtr pPixmap, int x1, int y1, int x2, int y2)
464{
465	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
466	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
467	int w = x2 - x1, h = y2 - y1, dstoff, dstpitch;
468	int start, depth;
469
470	ENTER;
471	dstpitch = exaGetPixmapPitch(pPixmap);
472	dstoff = exaGetPixmapOffset(pPixmap);
473
474	depth = pPixmap->drawable.bitsPerPixel;
475	switch (depth) {
476		case 32:
477			start = dstoff + (y1 * dstpitch) + (x1 << 2);
478			CG14Solid32(p, start, dstpitch, w, h);
479			break;
480		case 8:
481			start = dstoff + (y1 * dstpitch) + x1;
482			CG14Solid8(p, start, dstpitch, w, h);
483			break;
484	}
485
486	DPRINTF(X_ERROR, "Solid %d %d %d %d, %d %d -> %d\n", x1, y1, x2, y2,
487	    dstpitch, dstoff, start);
488	DPRINTF(X_ERROR, "%x %x %x\n", p->last_rop,
489	    read_sx_reg(p, SX_QUEUED(8)), read_sx_reg(p, SX_QUEUED(9)));
490	exaMarkSync(pPixmap->drawable.pScreen);
491}
492
493/*
494 * Memcpy-based UTS.
495 */
496static Bool
497CG14UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
498    char *src, int src_pitch)
499{
500	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
501	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
502	char  *dst        = p->fb + exaGetPixmapOffset(pDst);
503	int    dst_pitch  = exaGetPixmapPitch(pDst);
504
505	int bpp    = pDst->drawable.bitsPerPixel;
506	int cpp    = (bpp + 7) >> 3;
507	int wBytes = w * cpp;
508
509	ENTER;
510	dst += (x * cpp) + (y * dst_pitch);
511
512	CG14Wait(p);
513
514	while (h--) {
515		memcpy(dst, src, wBytes);
516		src += src_pitch;
517		dst += dst_pitch;
518	}
519	__asm("stbar;");
520	return TRUE;
521}
522
523/*
524 * Memcpy-based DFS.
525 */
526static Bool
527CG14DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
528    char *dst, int dst_pitch)
529{
530	ScrnInfoPtr pScrn = xf86Screens[pSrc->drawable.pScreen->myNum];
531	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
532	char  *src        = p->fb + exaGetPixmapOffset(pSrc);
533	int    src_pitch  = exaGetPixmapPitch(pSrc);
534
535	ENTER;
536	int bpp    = pSrc->drawable.bitsPerPixel;
537	int cpp    = (bpp + 7) >> 3;
538	int wBytes = w * cpp;
539
540	src += (x * cpp) + (y * src_pitch);
541
542	CG14Wait(p);
543
544	while (h--) {
545		memcpy(dst, src, wBytes);
546		src += src_pitch;
547		dst += dst_pitch;
548	}
549
550	return TRUE;
551}
552
553Bool
554CG14CheckComposite(int op, PicturePtr pSrcPicture,
555                           PicturePtr pMaskPicture,
556                           PicturePtr pDstPicture)
557{
558	int i, ok = FALSE;
559
560	ENTER;
561
562	/*
563	 * SX is in theory capable of accelerating pretty much all Xrender ops,
564	 * even coordinate transformation and gradients. Support will be added
565	 * over time and likely have to spill over into its own source file.
566	 */
567
568	if ((op != PictOpOver) && (op != PictOpAdd)) {
569		xf86Msg(X_ERROR, "%s: rejecting %d\n", __func__, op);
570		return FALSE;
571	}
572	i = 0;
573	while ((i < arraysize(src_formats)) && (!ok)) {
574		ok =  (pSrcPicture->format == src_formats[i]);
575		i++;
576	}
577
578	if (!ok) {
579		xf86Msg(X_ERROR, "%s: unsupported src format %x\n",
580		    __func__, pSrcPicture->format);
581		return FALSE;
582	}
583
584	DPRINTF(X_ERROR, "src is %x %d %d\n", pSrcPicture->format,
585	    pSrcPicture->pDrawable->width, pSrcPicture->pDrawable->height);
586
587	if (pMaskPicture != NULL) {
588		DPRINTF(X_ERROR, "mask is %x %d %d\n", pMaskPicture->format,
589		    pMaskPicture->pDrawable->width,
590		    pMaskPicture->pDrawable->height);
591	}
592	return TRUE;
593}
594
595Bool
596CG14PrepareComposite(int op, PicturePtr pSrcPicture,
597                             PicturePtr pMaskPicture,
598                             PicturePtr pDstPicture,
599                             PixmapPtr  pSrc,
600                             PixmapPtr  pMask,
601                             PixmapPtr  pDst)
602{
603	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
604	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
605
606	ENTER;
607
608	if (pSrcPicture->pSourcePict != NULL) {
609		if (pSrcPicture->pSourcePict->type == SourcePictTypeSolidFill) {
610			p->fillcolour =
611			    pSrcPicture->pSourcePict->solidFill.color;
612			DPRINTF(X_ERROR, "%s: solid src %08x\n",
613			    __func__, p->fillcolour);
614		}
615	}
616	if ((pMaskPicture != NULL) && (pMaskPicture->pSourcePict != NULL)) {
617		if (pMaskPicture->pSourcePict->type ==
618		    SourcePictTypeSolidFill) {
619			p->fillcolour =
620			   pMaskPicture->pSourcePict->solidFill.color;
621			DPRINTF(X_ERROR, "%s: solid mask %08x\n",
622			    __func__, p->fillcolour);
623		}
624	}
625	if (pMaskPicture != NULL) {
626		p->mskoff = exaGetPixmapOffset(pMask);
627		p->mskpitch = exaGetPixmapPitch(pMask);
628		p->mskformat = pMaskPicture->format;
629	}
630	p->srcoff = exaGetPixmapOffset(pSrc);
631	p->srcpitch = exaGetPixmapPitch(pSrc);
632	p->srcformat = pSrcPicture->format;
633	p->dstformat = pDstPicture->format;
634	p->op = op;
635#ifdef SX_DEBUG
636	DPRINTF(X_ERROR, "%x %x -> %x\n", p->srcoff, p->mskoff,
637	    *(uint32_t *)(p->fb + p->srcoff));
638#endif
639	return TRUE;
640}
641
642void CG14Comp_Over32(Cg14Ptr p,
643                   uint32_t src, uint32_t srcpitch,
644                   uint32_t dst, uint32_t dstpitch,
645                   int width, int height)
646{
647	uint32_t msk = src, mskx, dstx, m;
648	int line, x, i;
649
650	ENTER;
651	/* first get the source colour */
652	write_sx_io(p, p->srcoff, SX_LDUQ0(8, 0, p->srcoff & 7));
653	write_sx_reg(p, SX_QUEUED(8), 0xff);
654	for (line = 0; line < height; line++) {
655		mskx = msk;
656		dstx = dst;
657#ifdef SX_SINGLE
658
659		for (x = 0; x < width; x++) {
660			m = *(volatile uint32_t *)(p->fb + mskx);
661			m = m >> 24;
662			if (m == 0) {
663				/* nothing to do - all transparent */
664			} else if (m == 0xff) {
665				/* all opaque */
666				write_sx_io(p, dstx, SX_STUQ0(8, 0, dstx & 7));
667			} else {
668				/* fetch alpha value, stick it into scam */
669				/* mask is in R[12:15] */
670				/*write_sx_io(p, mskx,
671				    SX_LDUQ0(12, 0, mskx & 7));*/
672				write_sx_reg(p, SX_QUEUED(12), m);
673				/* fetch dst pixel */
674				write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
675				write_sx_reg(p, SX_INSTRUCTIONS,
676				    SX_ORV(12, 0, R_SCAM, 0));
677				/*
678				 * src * alpha + R0
679				 * R[9:11] * SCAM + R0 -> R[17:19]
680				 */
681				write_sx_reg(p, SX_INSTRUCTIONS,
682				    SX_SAXP16X16SR8(9, 0, 17, 2));
683
684				/* invert SCAM */
685				write_sx_reg(p, SX_INSTRUCTIONS,
686				    SX_XORV(12, 8, R_SCAM, 0));
687#ifdef SX_DEBUG
688				write_sx_reg(p, SX_INSTRUCTIONS,
689				    SX_XORV(12, 8, 13, 0));
690#endif
691				/* dst * (1 - alpha) + R[13:15] */
692				write_sx_reg(p, SX_INSTRUCTIONS,
693				    SX_SAXP16X16SR8(21, 17, 25, 2));
694				write_sx_io(p, dstx,
695				    SX_STUQ0C(24, 0, dstx & 7));
696			}
697			dstx += 4;
698			mskx += 4;
699		}
700#else
701		for (x = 0; x < width; x += 4) {
702			/* fetch 4 mask values */
703			write_sx_io(p, mskx, SX_LDUQ0(12, 3, mskx & 7));
704			/* fetch destination pixels */
705			write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
706			/* duplicate them for all channels */
707			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
708			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 16, 17, 2));
709			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 20, 21, 2));
710			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 24, 25, 2));
711			/* generate inverted alpha */
712			write_sx_reg(p, SX_INSTRUCTIONS,
713			    SX_XORS(12, 8, 28, 15));
714			/* multiply source */
715			write_sx_reg(p, SX_INSTRUCTIONS,
716			    SX_MUL16X16SR8(8, 12, 44, 3));
717			write_sx_reg(p, SX_INSTRUCTIONS,
718			    SX_MUL16X16SR8(8, 16, 48, 3));
719			write_sx_reg(p, SX_INSTRUCTIONS,
720			    SX_MUL16X16SR8(8, 20, 52, 3));
721			write_sx_reg(p, SX_INSTRUCTIONS,
722			    SX_MUL16X16SR8(8, 24, 56, 3));
723			/* multiply dest */
724			write_sx_reg(p, SX_INSTRUCTIONS,
725			    SX_MUL16X16SR8(28, 60, 76, 15));
726			/* add up */
727			write_sx_reg(p, SX_INSTRUCTIONS,
728			    SX_ADDV(44, 76, 92, 15));
729			/* write back */
730			write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
731			dstx += 16;
732			mskx += 16;
733		}
734#endif
735		dst += dstpitch;
736		msk += srcpitch;
737	}
738}
739
740void CG14Comp_Over8(Cg14Ptr p,
741                   uint32_t src, uint32_t srcpitch,
742                   uint32_t dst, uint32_t dstpitch,
743                   int width, int height)
744{
745	uint32_t msk = src, mskx, dstx, m;
746	int line, x, i;
747#ifdef SX_DEBUG
748	char buffer[256];
749#endif
750	ENTER;
751
752	/* first get the source colour */
753	write_sx_io(p, p->srcoff, SX_LDUQ0(8, 0, p->srcoff & 7));
754	write_sx_reg(p, SX_QUEUED(8), 0xff);
755	DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
756	    read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
757	    *(uint32_t *)(p->fb + p->srcoff));
758	for (line = 0; line < height; line++) {
759		mskx = msk;
760		dstx = dst;
761#ifdef SX_SINGLE
762
763		for (x = 0; x < width; x++) {
764			m = *(volatile uint8_t *)(p->fb + mskx);
765#ifdef SX_DEBUG
766			buffer[x] = c[m >> 5];
767#endif
768			if (m == 0) {
769				/* nothing to do - all transparent */
770			} else if (m == 0xff) {
771				/* all opaque */
772				write_sx_io(p, dstx, SX_STUQ0(8, 0, dstx & 7));
773			} else {
774				/* fetch alpha value, stick it into scam */
775				/* mask is in R[12:15] */
776				/*write_sx_io(p, mskx & ~7,
777				    SX_LDB(12, 0, mskx & 7));*/
778				write_sx_reg(p, SX_QUEUED(12), m);
779				/* fetch dst pixel */
780				write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
781				write_sx_reg(p, SX_INSTRUCTIONS,
782				    SX_ORV(12, 0, R_SCAM, 0));
783				/*
784				 * src * alpha + R0
785				 * R[9:11] * SCAM + R0 -> R[17:19]
786				 */
787				write_sx_reg(p, SX_INSTRUCTIONS,
788				    SX_SAXP16X16SR8(9, 0, 17, 2));
789
790				/* invert SCAM */
791				write_sx_reg(p, SX_INSTRUCTIONS,
792				    SX_XORV(12, 8, R_SCAM, 0));
793#ifdef SX_DEBUG
794				write_sx_reg(p, SX_INSTRUCTIONS,
795				    SX_XORV(12, 8, 13, 0));
796#endif
797				/* dst * (1 - alpha) + R[13:15] */
798				write_sx_reg(p, SX_INSTRUCTIONS,
799				    SX_SAXP16X16SR8(21, 17, 25, 2));
800				write_sx_io(p, dstx,
801				    SX_STUQ0C(24, 0, dstx & 7));
802			}
803			dstx += 4;
804			mskx += 1;
805		}
806#ifdef SX_DEBUG
807		buffer[x] = 0;
808		xf86Msg(X_ERROR, "%s\n", buffer);
809#endif
810#else
811		for (x = 0; x < width; x += 4) {
812			/* fetch 4 mask values */
813			write_sx_io(p, mskx, SX_LDB(12, 3, mskx & 7));
814			/* fetch destination pixels */
815			write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
816			/* duplicate them for all channels */
817			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
818			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 16, 17, 2));
819			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 20, 21, 2));
820			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 24, 25, 2));
821			/* generate inverted alpha */
822			write_sx_reg(p, SX_INSTRUCTIONS,
823			    SX_XORS(12, 8, 28, 15));
824			/* multiply source */
825			write_sx_reg(p, SX_INSTRUCTIONS,
826			    SX_MUL16X16SR8(8, 12, 44, 3));
827			write_sx_reg(p, SX_INSTRUCTIONS,
828			    SX_MUL16X16SR8(8, 16, 48, 3));
829			write_sx_reg(p, SX_INSTRUCTIONS,
830			    SX_MUL16X16SR8(8, 20, 52, 3));
831			write_sx_reg(p, SX_INSTRUCTIONS,
832			    SX_MUL16X16SR8(8, 24, 56, 3));
833			/* multiply dest */
834			write_sx_reg(p, SX_INSTRUCTIONS,
835			    SX_MUL16X16SR8(28, 60, 76, 15));
836			/* add up */
837			write_sx_reg(p, SX_INSTRUCTIONS,
838			    SX_ADDV(44, 76, 92, 15));
839			/* write back */
840			write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
841			dstx += 16;
842			mskx += 4;
843		}
844#endif
845		dst += dstpitch;
846		msk += srcpitch;
847	}
848}
849
850void CG14Comp_Add32(Cg14Ptr p,
851                   uint32_t src, uint32_t srcpitch,
852                   uint32_t dst, uint32_t dstpitch,
853                   int width, int height)
854{
855	int line;
856	uint32_t srcx, dstx;
857	int full, part, x;
858
859	ENTER;
860	full = width >> 3;	/* chunks of 8 */
861	part = width & 7;	/* leftovers */
862	/* we do this up to 8 pixels at a time */
863	for (line = 0; line < height; line++) {
864		srcx = src;
865		dstx = dst;
866		for (x = 0; x < full; x++) {
867			write_sx_io(p, srcx, SX_LDUQ0(8, 31, srcx & 7));
868			write_sx_io(p, dstx, SX_LDUQ0(40, 31, dstx & 7));
869			write_sx_reg(p, SX_INSTRUCTIONS,
870			    SX_ADDV(8, 40, 72, 15));
871			write_sx_reg(p, SX_INSTRUCTIONS,
872			    SX_ADDV(24, 56, 88, 15));
873			write_sx_io(p, dstx, SX_STUQ0(72, 31, dstx & 7));
874			srcx += 128;
875			dstx += 128;
876		}
877
878		/* do leftovers */
879		write_sx_io(p, srcx, SX_LDUQ0(8, part - 1, srcx & 7));
880		write_sx_io(p, dstx, SX_LDUQ0(40, part - 1, dstx & 7));
881		if (part & 16) {
882			write_sx_reg(p, SX_INSTRUCTIONS,
883			    SX_ADDV(8, 40, 72, 15));
884			write_sx_reg(p, SX_INSTRUCTIONS,
885			    SX_ADDV(24, 56, 88, part - 17));
886		} else {
887			write_sx_reg(p, SX_INSTRUCTIONS,
888			    SX_ADDV(8, 40, 72, part - 1));
889		}
890		write_sx_io(p, dstx, SX_STUQ0(72, part - 1, dstx & 7));
891
892		/* next line */
893		src += srcpitch;
894		dst += dstpitch;
895	}
896}
897
898void CG14Comp_Add8(Cg14Ptr p,
899                   uint32_t src, uint32_t srcpitch,
900                   uint32_t dst, uint32_t dstpitch,
901                   int width, int height)
902{
903	int line;
904	uint32_t srcx, dstx, srcoff, dstoff;
905	int pre, full, part, x;
906	uint8_t *d;
907	char buffer[256];
908	ENTER;
909
910	srcoff = src & 7;
911	src &= ~7;
912	dstoff = dst & 7;
913	dst &= ~7;
914	full = width >> 5;	/* chunks of 32 */
915	part = width & 31;	/* leftovers */
916
917#ifdef SX_DEBUG
918	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
919	    width, height, full, part);
920#endif
921	/* we do this up to 32 pixels at a time */
922	for (line = 0; line < height; line++) {
923		srcx = src;
924		dstx = dst;
925#ifdef SX_ADD_SOFTWARE
926		uint8_t *s = (uint8_t *)(p->fb + srcx + srcoff);
927		d = (uint8_t *)(p->fb + dstx + dstoff);
928		for (x = 0; x < width; x++) {
929			d[x] = min(255, s[x] + d[x]);
930		}
931#else
932		for (x = 0; x < full; x++) {
933			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
934			write_sx_io(p, dstx, SX_LDB(40, 31, dstoff));
935			write_sx_reg(p, SX_INSTRUCTIONS,
936			    SX_ADDV(8, 40, 72, 15));
937			write_sx_reg(p, SX_INSTRUCTIONS,
938			    SX_ADDV(24, 56, 88, 15));
939			write_sx_io(p, dstx, SX_STBC(72, 31, dstoff));
940			srcx += 32;
941			dstx += 32;
942		}
943
944		if (part > 0) {
945			/* do leftovers */
946			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
947			write_sx_io(p, dstx, SX_LDB(40, part - 1, dstoff));
948			if (part > 16) {
949				write_sx_reg(p, SX_INSTRUCTIONS,
950				    SX_ADDV(8, 40, 72, 15));
951				write_sx_reg(p, SX_INSTRUCTIONS,
952				    SX_ADDV(24, 56, 88, part - 17));
953			} else {
954				write_sx_reg(p, SX_INSTRUCTIONS,
955				    SX_ADDV(8, 40, 72, part - 1));
956			}
957			write_sx_io(p, dstx, SX_STBC(72, part - 1, dstoff));
958		}
959#endif
960#ifdef SX_DEBUG
961		d = (uint8_t *)(p->fb + src + srcoff);
962		for (x = 0; x < width; x++) {
963			buffer[x] = c[d[x]>>5];
964		}
965		buffer[x] = 0;
966		xf86Msg(X_ERROR, "%s\n", buffer);
967#endif
968		/* next line */
969		src += srcpitch;
970		dst += dstpitch;
971	}
972}
973
974void
975CG14Composite(PixmapPtr pDst, int srcX, int srcY,
976                              int maskX, int maskY,
977                              int dstX, int dstY,
978                              int width, int height)
979{
980	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
981	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
982	uint32_t dstoff, dstpitch;
983	uint32_t dst, msk, src;
984
985	ENTER;
986	dstoff = exaGetPixmapOffset(pDst);
987	dstpitch = exaGetPixmapPitch(pDst);
988
989	switch (p->op) {
990		case PictOpOver:
991			dst = dstoff + (dstY * dstpitch) + (dstX << 2);
992			DPRINTF(X_ERROR, "Over %08x %08x, %d %d\n",
993			    p->mskformat, p->dstformat, srcX, srcY);
994			switch (p->mskformat) {
995				case PICT_a8:
996					msk = p->mskoff +
997					    (maskY * p->mskpitch) + maskX;
998					CG14Comp_Over8(p, msk, p->mskpitch,
999					    dst, dstpitch, width, height);
1000					break;
1001				case PICT_a8r8g8b8:
1002				case PICT_a8b8g8r8:
1003					msk = p->mskoff +
1004					    (maskY * p->mskpitch) +
1005					    (maskX << 2);
1006					CG14Comp_Over32(p, msk, p->mskpitch,
1007					    dst, dstpitch, width, height);
1008					break;
1009				default:
1010					xf86Msg(X_ERROR,
1011					    "unsupported mask format\n");
1012			}
1013			break;
1014		case PictOpAdd:
1015			DPRINTF(X_ERROR, "Add %08x %08x\n",
1016			    p->srcformat, p->dstformat);
1017			switch (p->srcformat) {
1018				case PICT_a8:
1019					src = p->srcoff +
1020					    (srcY * p->srcpitch) + srcX;
1021					dst = dstoff + (dstY * dstpitch) + dstX;
1022					CG14Comp_Add8(p, src, p->srcpitch,
1023					    dst, dstpitch, width, height);
1024					break;
1025				case PICT_a8r8g8b8:
1026				case PICT_x8r8g8b8:
1027					src = p->srcoff +
1028					    (srcY * p->srcpitch) + (srcX << 2);
1029					dst = dstoff + (dstY * dstpitch) +
1030					    (dstX << 2);
1031					CG14Comp_Add32(p, src, p->srcpitch,
1032					    dst, dstpitch, width, height);
1033					break;
1034				default:
1035					xf86Msg(X_ERROR,
1036					    "unsupported src format\n");
1037			}
1038			break;
1039		default:
1040			xf86Msg(X_ERROR, "unsupported op %d\n", p->op);
1041	}
1042	exaMarkSync(pDst->drawable.pScreen);
1043}
1044
1045
1046
1047Bool
1048CG14InitAccel(ScreenPtr pScreen)
1049{
1050	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
1051	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1052	ExaDriverPtr pExa;
1053
1054	pExa = exaDriverAlloc();
1055	if (!pExa)
1056		return FALSE;
1057
1058	p->pExa = pExa;
1059
1060	pExa->exa_major = EXA_VERSION_MAJOR;
1061	pExa->exa_minor = EXA_VERSION_MINOR;
1062
1063	pExa->memoryBase = p->fb;
1064	pExa->memorySize = p->memsize;
1065	pExa->offScreenBase = p->width * p->height * 4;
1066
1067	/*
1068	 * SX memory instructions are written to 64bit aligned addresses with
1069	 * a 3 bit displacement. Make sure the displacement remains constant
1070	 * within one column
1071	 */
1072
1073	pExa->pixmapOffsetAlign = 8;
1074	pExa->pixmapPitchAlign = 8;
1075
1076	pExa->flags = EXA_OFFSCREEN_PIXMAPS |
1077		      /*EXA_SUPPORTS_OFFSCREEN_OVERLAPS |*/
1078		      EXA_MIXED_PIXMAPS;
1079
1080	/*
1081	 * these limits are bogus
1082	 * SX doesn't deal with coordinates at all, so there is no limit but
1083	 * we have to put something here
1084	 */
1085	pExa->maxX = 4096;
1086	pExa->maxY = 4096;
1087
1088	pExa->WaitMarker = CG14WaitMarker;
1089
1090	pExa->PrepareSolid = CG14PrepareSolid;
1091	pExa->Solid = CG14Solid;
1092	pExa->DoneSolid = CG14DoneCopy;
1093	pExa->PrepareCopy = CG14PrepareCopy;
1094	pExa->Copy = CG14Copy;
1095	pExa->DoneCopy = CG14DoneCopy;
1096	if (p->use_xrender) {
1097		pExa->CheckComposite = CG14CheckComposite;
1098		pExa->PrepareComposite = CG14PrepareComposite;
1099		pExa->Composite = CG14Composite;
1100		pExa->DoneComposite = CG14DoneCopy;
1101	}
1102
1103	/* EXA hits more optimized paths when it does not have to fallback
1104	 * because of missing UTS/DFS, hook memcpy-based UTS/DFS.
1105	 */
1106	pExa->UploadToScreen = CG14UploadToScreen;
1107	pExa->DownloadFromScreen = CG14DownloadFromScreen;
1108
1109	/* do some hardware init */
1110	write_sx_reg(p, SX_PLANEMASK, 0xffffffff);
1111	p->last_mask = 0xffffffff;
1112	write_sx_reg(p, SX_ROP_CONTROL, 0xcc);
1113	p->last_rop = 0xcc;
1114	return exaDriverInit(pScreen, pExa);
1115}
1116