cg14_accel.c revision a3a2ba44
1/* $NetBSD: cg14_accel.c,v 1.2 2013/06/25 12:26:57 macallan Exp $ */
2/*
3 * Copyright (c) 2013 Michael Lorenz
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 *    - Redistributions of source code must retain the above copyright
11 *      notice, this list of conditions and the following disclaimer.
12 *    - Redistributions in binary form must reproduce the above
13 *      copyright notice, this list of conditions and the following
14 *      disclaimer in the documentation and/or other materials provided
15 *      with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 */
31
32#include <sys/types.h>
33
/* all drivers need this */
35#include "xf86.h"
36#include "xf86_OSproc.h"
37#include "compiler.h"
38
39#include "cg14.h"
40#include <sparc/sxreg.h>
41
/*
 * Build-time switches:
 *   SX_SINGLE       - selects the one-pixel-at-a-time loop in
 *                     CG14Comp_Over32Solid()
 *   SX_DEBUG        - enables ENTER / DPRINTF tracing
 *   SX_ADD_SOFTWARE - not referenced in this file
 */
#define SX_SINGLE
/*#define SX_DEBUG*/
/*#define SX_ADD_SOFTWARE*/

/*
 * Tracing helpers. ENTER is always written as "ENTER;" by its users, so
 * the macro itself carries no trailing semicolon. Without SX_DEBUG,
 * DPRINTF(...) expands to "while (0) xf86Msg(...)" so its arguments are
 * still compiled but never evaluated.
 */
#ifdef SX_DEBUG
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__)
#define DPRINTF xf86Msg
#else
#define ENTER
#define DPRINTF while (0) xf86Msg
#endif

/* element count of a real array (not of a pointer) */
#define arraysize(ary)        (sizeof(ary) / sizeof((ary)[0]))
55
/*
 * SX raster-op control values, indexed by the X11 GC function code
 * (GXclear = 0 ... GXset = 15). 0xcc is SX's GXcopy equivalent; 0xaa
 * (GXnoop) is treated as "nothing to do" by the solid fill code.
 */
uint32_t sx_rop[] = {
	0x00,	/* GXclear */
	0x88,	/* GXand */
	0x44,	/* GXandReverse */
	0xcc,	/* GXcopy */
	0x22,	/* GXandInverted */
	0xaa,	/* GXnoop */
	0x66,	/* GXxor */
	0xee,	/* GXor */
	0x11,	/* GXnor */
	0x99,	/* GXequiv */
	0x55,	/* GXinvert */
	0xdd,	/* GXorReverse */
	0x33,	/* GXcopyInverted */
	0xbb,	/* GXorInverted */
	0x77,	/* GXnand */
	0xff	/* GXset */
};
59
60int src_formats[] = {PICT_a8r8g8b8, PICT_x8r8g8b8,
61		     PICT_a8b8g8r8, PICT_x8b8g8r8, PICT_a8};
62int tex_formats[] = {PICT_a8r8g8b8, PICT_a8b8g8r8, PICT_a8};
63
64static inline void
65CG14Wait(Cg14Ptr p)
66{
67	/* we just wait until the instruction queue is empty */
68	while ((read_sx_reg(p, SX_CONTROL_STATUS) & SX_MT) != 0) {};
69}
70
71static void
72CG14WaitMarker(ScreenPtr pScreen, int Marker)
73{
74	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
75	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
76
77	CG14Wait(p);
78}
79
80static Bool
81CG14PrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap,
82		int xdir, int ydir, int alu, Pixel planemask)
83{
84	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
85	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
86
87	ENTER;
88	DPRINTF(X_ERROR, "bits per pixel: %d\n",
89	    pSrcPixmap->drawable.bitsPerPixel);
90
91	if (planemask != p->last_mask) {
92		CG14Wait(p);
93		write_sx_reg(p, SX_PLANEMASK, planemask);
94		p->last_mask = planemask;
95	}
96	alu = sx_rop[alu];
97	if (alu != p->last_rop) {
98		CG14Wait(p);
99		write_sx_reg(p, SX_ROP_CONTROL, alu);
100		p->last_rop = alu;
101	}
102	p->srcpitch = exaGetPixmapPitch(pSrcPixmap);
103	p->srcoff = exaGetPixmapOffset(pSrcPixmap);
104	p->xdir = xdir;
105	p->ydir = ydir;
106	return TRUE;
107}
108
109static void
110CG14Copy(PixmapPtr pDstPixmap,
111         int srcX, int srcY, int dstX, int dstY, int w, int h)
112{
113	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
114	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
115	int dstpitch, dstoff, srcpitch, srcoff;
116	int srcstart, dststart, xinc, srcinc, dstinc;
117	int line, count, s, d, num;
118
119	ENTER;
120	dstpitch = exaGetPixmapPitch(pDstPixmap);
121	dstoff = exaGetPixmapOffset(pDstPixmap);
122	srcpitch = p->srcpitch;
123	srcoff = p->srcoff;
124	/*
125	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
126	 * actually wrote anything and only sync if it did
127	 */
128	srcstart = (srcX << 2) + (srcpitch * srcY) + srcoff;
129	dststart = (dstX << 2) + (dstpitch * dstY) + dstoff;
130
131	/*
132	 * we always copy up to 32 pixels at a time so direction doesn't
133	 * matter if w<=32
134	 */
135	if (w > 32) {
136		if (p->xdir < 0) {
137			srcstart += (w - 32) << 2;
138			dststart += (w - 32) << 2;
139			xinc = -128;
140		} else
141			xinc = 128;
142	} else
143		xinc = 128;
144	if (p->ydir < 0) {
145		srcstart += (h - 1) * srcpitch;
146		dststart += (h - 1) * dstpitch;
147		srcinc = -srcpitch;
148		dstinc = -dstpitch;
149	} else {
150		srcinc = srcpitch;
151		dstinc = dstpitch;
152	}
153	if (p->last_rop == 0xcc) {
154		/* plain old copy */
155		if ( xinc > 0) {
156			/* going left to right */
157			for (line = 0; line < h; line++) {
158				count = 0;
159				s = srcstart;
160				d = dststart;
161				while ( count < w) {
162					num = min(32, w - count);
163					write_sx_io(p, s,
164					    SX_LD(10, num - 1, s & 7));
165					write_sx_io(p, d,
166					    SX_STM(10, num - 1, d & 7));
167					s += xinc;
168					d += xinc;
169					count += 32;
170				}
171				srcstart += srcinc;
172				dststart += dstinc;
173			}
174		} else {
175			/* going right to left */
176			int i, chunks = (w >> 5);
177			for (line = 0; line < h; line++) {
178				s = srcstart;
179				d = dststart;
180				count = w;
181				for (i = 0; i < chunks; i++) {
182					write_sx_io(p, s,
183					    SX_LD(10, 31, s & 7));
184					write_sx_io(p, d,
185					    SX_STM(10, 31, d & 7));
186					s -= 128;
187					d -= 128;
188					count -= 32;
189				}
190				/* leftovers, if any */
191				if (count > 0) {
192					s += (32 - count) << 2;
193					d += (32 - count) << 2;
194					write_sx_io(p, s,
195					    SX_LD(10, count - 1, s & 7));
196					write_sx_io(p, d,
197					    SX_STM(10, count - 1, d & 7));
198				}
199				srcstart += srcinc;
200				dststart += dstinc;
201			}
202		}
203	} else {
204		/* ROPs needed */
205		if ( xinc > 0) {
206			/* going left to right */
207			for (line = 0; line < h; line++) {
208				count = 0;
209				s = srcstart;
210				d = dststart;
211				while ( count < w) {
212					num = min(32, w - count);
213					write_sx_io(p, s,
214					    SX_LD(10, num - 1, s & 7));
215					write_sx_io(p, d,
216					    SX_LD(42, num - 1, d & 7));
217					if (num > 16) {
218						write_sx_reg(p, SX_INSTRUCTIONS,
219					    	 SX_ROP(10, 42, 74, 15));
220						write_sx_reg(p, SX_INSTRUCTIONS,
221					    	 SX_ROP(26, 58, 90, num - 17));
222					} else {
223						write_sx_reg(p, SX_INSTRUCTIONS,
224					    	 SX_ROP(10, 42, 74, num - 1));
225					}
226					write_sx_io(p, d,
227					    SX_STM(74, num - 1, d & 7));
228					s += xinc;
229					d += xinc;
230					count += 32;
231				}
232				srcstart += srcinc;
233				dststart += dstinc;
234			}
235		} else {
236			/* going right to left */
237			int i, chunks = (w >> 5);
238			for (line = 0; line < h; line++) {
239				s = srcstart;
240				d = dststart;
241				count = w;
242				for (i = 0; i < chunks; i++) {
243					write_sx_io(p, s, SX_LD(10, 31, s & 7));
244					write_sx_io(p, d, SX_LD(42, 31, d & 7));
245					write_sx_reg(p, SX_INSTRUCTIONS,
246				    	    SX_ROP(10, 42, 74, 15));
247					write_sx_reg(p, SX_INSTRUCTIONS,
248				    	    SX_ROP(26, 58, 90, 15));
249					write_sx_io(p, d,
250					    SX_STM(74, 31, d & 7));
251					s -= 128;
252					d -= 128;
253					count -= 32;
254				}
255				/* leftovers, if any */
256				if (count > 0) {
257					s += (32 - count) << 2;
258					d += (32 - count) << 2;
259					write_sx_io(p, s,
260					    SX_LD(10, count - 1, s & 7));
261					write_sx_io(p, d,
262					    SX_LD(42, count - 1, d & 7));
263					if (count > 16) {
264						write_sx_reg(p, SX_INSTRUCTIONS,
265					    	    SX_ROP(10, 42, 74, 15));
266						write_sx_reg(p, SX_INSTRUCTIONS,
267					    	 SX_ROP(26, 58, 90, count - 17));
268					} else {
269						write_sx_reg(p, SX_INSTRUCTIONS,
270					    	 SX_ROP(10, 42, 74, count - 1));
271					}
272
273					write_sx_io(p, d,
274					    SX_STM(74, count - 1, d & 7));
275				}
276				srcstart += srcinc;
277				dststart += dstinc;
278			}
279		}
280	}
281	exaMarkSync(pDstPixmap->drawable.pScreen);
282}
283
284static void
285CG14DoneCopy(PixmapPtr pDstPixmap)
286{
287}
288
289static Bool
290CG14PrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
291{
292	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
293	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
294
295	ENTER;
296	DPRINTF(X_ERROR, "bits per pixel: %d\n", pPixmap->drawable.bitsPerPixel);
297	write_sx_reg(p, SX_QUEUED(8), fg);
298	write_sx_reg(p, SX_QUEUED(9), fg);
299	if (planemask != p->last_mask) {
300		CG14Wait(p);
301		write_sx_reg(p, SX_PLANEMASK, planemask);
302		p->last_mask = planemask;
303	}
304	alu = sx_rop[alu];
305	if (alu != p->last_rop) {
306		CG14Wait(p);
307		write_sx_reg(p, SX_ROP_CONTROL, alu);
308		p->last_rop = alu;
309	}
310	DPRINTF(X_ERROR, "%s: %x\n", __func__, alu);
311	return TRUE;
312}
313
314static void
315CG14Solid32(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
316{
317	int line, x, num;
318	uint32_t ptr;
319
320	ENTER;
321	if (p->last_rop == 0xcc) {
322		/* simple fill */
323		for (line = 0; line < h; line++) {
324			x = 0;
325			while (x < w) {
326				ptr = start + (x << 2);
327				num = min(32, w - x);
328				write_sx_io(p, ptr,
329				    SX_STS(8, num - 1, ptr & 7));
330				x += 32;
331			}
332			start += pitch;
333		}
334	} else if (p->last_rop == 0xaa) {
335		/* nothing to do here */
336		return;
337	} else {
338		/* alright, let's do actual ROP stuff */
339
340		/* first repeat the fill colour into 16 registers */
341		write_sx_reg(p, SX_INSTRUCTIONS,
342		    SX_SELECT_S(8, 8, 10, 15));
343
344		for (line = 0; line < h; line++) {
345			x = 0;
346			while (x < w) {
347				ptr = start + (x << 2);
348				num = min(32, w - x);
349				/* now suck fb data into registers */
350				write_sx_io(p, ptr,
351				    SX_LD(42, num - 1, ptr & 7));
352				/*
353				 * ROP them with the fill data we left in 10
354				 * non-memory ops can only have counts up to 16
355				 */
356				if (num <= 16) {
357					write_sx_reg(p, SX_INSTRUCTIONS,
358					    SX_ROP(10, 42, 74, num - 1));
359				} else {
360					write_sx_reg(p, SX_INSTRUCTIONS,
361					    SX_ROP(10, 42, 74, 15));
362					write_sx_reg(p, SX_INSTRUCTIONS,
363					    SX_ROP(10, 58, 90, num - 17));
364				}
365				/* and write the result back into memory */
366				write_sx_io(p, ptr,
367				    SX_ST(74, num - 1, ptr & 7));
368				x += 32;
369			}
370			start += pitch;
371		}
372	}
373}
374
375static void
376CG14Solid8(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
377{
378	int line, x, num, off;
379	uint32_t ptr;
380
381	ENTER;
382	off = start & 7;
383	start &= ~7;
384
385	if (p->last_rop == 0xcc) {
386		/* simple fill */
387		for (line = 0; line < h; line++) {
388			x = 0;
389			while (x < w) {
390				ptr = start + x;
391				num = min(32, w - x);
392				write_sx_io(p, ptr,
393				    SX_STBS(8, num - 1, off));
394				x += 32;
395			}
396			start += pitch;
397		}
398	} else if (p->last_rop == 0xaa) {
399		/* nothing to do here */
400		return;
401	} else {
402		/* alright, let's do actual ROP stuff */
403
404		/* first repeat the fill colour into 16 registers */
405		write_sx_reg(p, SX_INSTRUCTIONS,
406		    SX_SELECT_S(8, 8, 10, 15));
407
408		for (line = 0; line < h; line++) {
409			x = 0;
410			while (x < w) {
411				ptr = start + x;
412				num = min(32, w - x);
413				/* now suck fb data into registers */
414				write_sx_io(p, ptr,
415				    SX_LDB(42, num - 1, off));
416				/*
417				 * ROP them with the fill data we left in 10
418				 * non-memory ops can only have counts up to 16
419				 */
420				if (num <= 16) {
421					write_sx_reg(p, SX_INSTRUCTIONS,
422					    SX_ROP(10, 42, 74, num - 1));
423				} else {
424					write_sx_reg(p, SX_INSTRUCTIONS,
425					    SX_ROP(10, 42, 74, 15));
426					write_sx_reg(p, SX_INSTRUCTIONS,
427					    SX_ROP(10, 58, 90, num - 17));
428				}
429				/* and write the result back into memory */
430				write_sx_io(p, ptr,
431				    SX_STB(74, num - 1, off));
432				x += 32;
433			}
434			start += pitch;
435		}
436	}
437}
438
439static void
440CG14Solid(PixmapPtr pPixmap, int x1, int y1, int x2, int y2)
441{
442	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
443	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
444	int w = x2 - x1, h = y2 - y1, dstoff, dstpitch;
445	int start, depth;
446
447	ENTER;
448	dstpitch = exaGetPixmapPitch(pPixmap);
449	dstoff = exaGetPixmapOffset(pPixmap);
450
451	depth = pPixmap->drawable.bitsPerPixel;
452	switch (depth) {
453		case 32:
454			start = dstoff + (y1 * dstpitch) + (x1 << 2);
455			CG14Solid32(p, start, dstpitch, w, h);
456			break;
457		case 8:
458			start = dstoff + (y1 * dstpitch) + x1;
459			CG14Solid8(p, start, dstpitch, w, h);
460			break;
461	}
462
463	DPRINTF(X_ERROR, "Solid %d %d %d %d, %d %d -> %d\n", x1, y1, x2, y2,
464	    dstpitch, dstoff, start);
465	DPRINTF(X_ERROR, "%x %x %x\n", p->last_rop,
466	    read_sx_reg(p, SX_QUEUED(8)), read_sx_reg(p, SX_QUEUED(9)));
467	exaMarkSync(pPixmap->drawable.pScreen);
468}
469
470/*
471 * Memcpy-based UTS.
472 */
473static Bool
474CG14UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
475    char *src, int src_pitch)
476{
477	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
478	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
479	char  *dst        = p->fb + exaGetPixmapOffset(pDst);
480	int    dst_pitch  = exaGetPixmapPitch(pDst);
481
482	int bpp    = pDst->drawable.bitsPerPixel;
483	int cpp    = (bpp + 7) >> 3;
484	int wBytes = w * cpp;
485
486	ENTER;
487	dst += (x * cpp) + (y * dst_pitch);
488
489	CG14Wait(p);
490
491	while (h--) {
492		memcpy(dst, src, wBytes);
493		src += src_pitch;
494		dst += dst_pitch;
495	}
496	__asm("stbar;");
497	return TRUE;
498}
499
500/*
501 * Memcpy-based DFS.
502 */
503static Bool
504CG14DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
505    char *dst, int dst_pitch)
506{
507	ScrnInfoPtr pScrn = xf86Screens[pSrc->drawable.pScreen->myNum];
508	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
509	char  *src        = p->fb + exaGetPixmapOffset(pSrc);
510	int    src_pitch  = exaGetPixmapPitch(pSrc);
511
512	ENTER;
513	int bpp    = pSrc->drawable.bitsPerPixel;
514	int cpp    = (bpp + 7) >> 3;
515	int wBytes = w * cpp;
516
517	src += (x * cpp) + (y * src_pitch);
518
519	CG14Wait(p);
520
521	while (h--) {
522		memcpy(dst, src, wBytes);
523		src += src_pitch;
524		dst += dst_pitch;
525	}
526
527	return TRUE;
528}
529
530Bool
531CG14CheckComposite(int op, PicturePtr pSrcPicture,
532                           PicturePtr pMaskPicture,
533                           PicturePtr pDstPicture)
534{
535	int i, ok = FALSE;
536
537	ENTER;
538
539	/*
540	 * SX is in theory capable of accelerating pretty much all Xrender ops,
541	 * even coordinate transformation and gradients. Support will be added
542	 * over time and likely have to spill over into its own source file.
543	 */
544
545	if ((op != PictOpOver) && (op != PictOpAdd) && (op != PictOpSrc)) {
546		xf86Msg(X_ERROR, "%s: rejecting %d\n", __func__, op);
547		return FALSE;
548	}
549	i = 0;
550	while ((i < arraysize(src_formats)) && (!ok)) {
551		ok =  (pSrcPicture->format == src_formats[i]);
552		i++;
553	}
554
555	if (!ok) {
556		xf86Msg(X_ERROR, "%s: unsupported src format %x\n",
557		    __func__, pSrcPicture->format);
558		return FALSE;
559	}
560
561	DPRINTF(X_ERROR, "src is %x, %d: %d %d\n", pSrcPicture->format, op,
562	    pSrcPicture->pDrawable->width, pSrcPicture->pDrawable->height);
563
564	if (pMaskPicture != NULL) {
565		DPRINTF(X_ERROR, "mask is %x %d %d\n", pMaskPicture->format,
566		    pMaskPicture->pDrawable->width,
567		    pMaskPicture->pDrawable->height);
568	}
569	return TRUE;
570}
571
572Bool
573CG14PrepareComposite(int op, PicturePtr pSrcPicture,
574                             PicturePtr pMaskPicture,
575                             PicturePtr pDstPicture,
576                             PixmapPtr  pSrc,
577                             PixmapPtr  pMask,
578                             PixmapPtr  pDst)
579{
580	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
581	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
582
583	ENTER;
584
585	if (pSrcPicture->format == PICT_a1) {
586		xf86Msg(X_ERROR, "src mono, dst %x, op %d\n", pDstPicture->format, op);
587		if (pMaskPicture != NULL) {
588			xf86Msg(X_ERROR, "msk %x\n", pMaskPicture->format);
589		}
590	}
591	if (pSrcPicture->pSourcePict != NULL) {
592		if (pSrcPicture->pSourcePict->type == SourcePictTypeSolidFill) {
593			p->fillcolour =
594			    pSrcPicture->pSourcePict->solidFill.color;
595			xf86Msg(X_ERROR, "%s: solid src %08x\n",
596			    __func__, p->fillcolour);
597		}
598	}
599	if ((pMaskPicture != NULL) && (pMaskPicture->pSourcePict != NULL)) {
600		if (pMaskPicture->pSourcePict->type ==
601		    SourcePictTypeSolidFill) {
602			p->fillcolour =
603			   pMaskPicture->pSourcePict->solidFill.color;
604			xf86Msg(X_ERROR, "%s: solid mask %08x\n",
605			    __func__, p->fillcolour);
606		}
607	}
608	if (pMaskPicture != NULL) {
609		p->mskoff = exaGetPixmapOffset(pMask);
610		p->mskpitch = exaGetPixmapPitch(pMask);
611		p->mskformat = pMaskPicture->format;
612	} else {
613		p->mskoff = 0;
614		p->mskpitch = 0;
615		p->mskformat = 0;
616	}
617	p->source_is_solid =
618	   ((pSrc->drawable.width == 1) && (pSrc->drawable.height == 1));
619	p->srcoff = exaGetPixmapOffset(pSrc);
620	p->srcpitch = exaGetPixmapPitch(pSrc);
621	p->srcformat = pSrcPicture->format;
622	p->dstformat = pDstPicture->format;
623	p->op = op;
624	if (op == PictOpSrc) {
625		CG14PrepareCopy(pSrc, pDst, 1, 1, GXcopy, 0xffffffff);
626	}
627#ifdef SX_DEBUG
628	DPRINTF(X_ERROR, "%x %x -> %x\n", p->srcoff, p->mskoff,
629	    *(uint32_t *)(p->fb + p->srcoff));
630#endif
631	return TRUE;
632}
633
634void CG14Comp_Over32Solid(Cg14Ptr p,
635                   uint32_t src, uint32_t srcpitch,
636                   uint32_t dst, uint32_t dstpitch,
637                   int width, int height)
638{
639	uint32_t msk = src, mskx, dstx, m;
640	int line, x, i;
641
642	ENTER;
643	/* first get the source colour */
644	write_sx_io(p, p->srcoff, SX_LDUQ0(8, 0, p->srcoff & 7));
645	write_sx_reg(p, SX_QUEUED(8), 0xff);
646	for (line = 0; line < height; line++) {
647		mskx = msk;
648		dstx = dst;
649#ifdef SX_SINGLE
650
651		for (x = 0; x < width; x++) {
652			m = *(volatile uint32_t *)(p->fb + mskx);
653			m = m >> 24;
654			if (m == 0) {
655				/* nothing to do - all transparent */
656			} else if (m == 0xff) {
657				/* all opaque */
658				write_sx_io(p, dstx, SX_STUQ0(8, 0, dstx & 7));
659			} else {
660				/* fetch alpha value, stick it into scam */
661				/* mask is in R[12:15] */
662				/*write_sx_io(p, mskx,
663				    SX_LDUQ0(12, 0, mskx & 7));*/
664				write_sx_reg(p, SX_QUEUED(12), m);
665				/* fetch dst pixel */
666				write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
667				write_sx_reg(p, SX_INSTRUCTIONS,
668				    SX_ORV(12, 0, R_SCAM, 0));
669				/*
670				 * src * alpha + R0
671				 * R[9:11] * SCAM + R0 -> R[17:19]
672				 */
673				write_sx_reg(p, SX_INSTRUCTIONS,
674				    SX_SAXP16X16SR8(9, 0, 17, 2));
675
676				/* invert SCAM */
677				write_sx_reg(p, SX_INSTRUCTIONS,
678				    SX_XORV(12, 8, R_SCAM, 0));
679#ifdef SX_DEBUG
680				write_sx_reg(p, SX_INSTRUCTIONS,
681				    SX_XORV(12, 8, 13, 0));
682#endif
683				/* dst * (1 - alpha) + R[13:15] */
684				write_sx_reg(p, SX_INSTRUCTIONS,
685				    SX_SAXP16X16SR8(21, 17, 25, 2));
686				write_sx_io(p, dstx,
687				    SX_STUQ0C(24, 0, dstx & 7));
688			}
689			dstx += 4;
690			mskx += 4;
691		}
692#else
693		for (x = 0; x < width; x += 4) {
694			/* fetch 4 mask values */
695			write_sx_io(p, mskx, SX_LDUQ0(12, 3, mskx & 7));
696			/* fetch destination pixels */
697			write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
698			/* duplicate them for all channels */
699			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
700			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 16, 17, 2));
701			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 20, 21, 2));
702			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 24, 25, 2));
703			/* generate inverted alpha */
704			write_sx_reg(p, SX_INSTRUCTIONS,
705			    SX_XORS(12, 8, 28, 15));
706			/* multiply source */
707			write_sx_reg(p, SX_INSTRUCTIONS,
708			    SX_MUL16X16SR8(8, 12, 44, 3));
709			write_sx_reg(p, SX_INSTRUCTIONS,
710			    SX_MUL16X16SR8(8, 16, 48, 3));
711			write_sx_reg(p, SX_INSTRUCTIONS,
712			    SX_MUL16X16SR8(8, 20, 52, 3));
713			write_sx_reg(p, SX_INSTRUCTIONS,
714			    SX_MUL16X16SR8(8, 24, 56, 3));
715			/* multiply dest */
716			write_sx_reg(p, SX_INSTRUCTIONS,
717			    SX_MUL16X16SR8(28, 60, 76, 15));
718			/* add up */
719			write_sx_reg(p, SX_INSTRUCTIONS,
720			    SX_ADDV(44, 76, 92, 15));
721			/* write back */
722			write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
723			dstx += 16;
724			mskx += 16;
725		}
726#endif
727		dst += dstpitch;
728		msk += srcpitch;
729	}
730}
731
732void
733CG14Composite(PixmapPtr pDst, int srcX, int srcY,
734                              int maskX, int maskY,
735                              int dstX, int dstY,
736                              int width, int height)
737{
738	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
739	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
740	uint32_t dstoff, dstpitch;
741	uint32_t dst, msk, src;
742
743	ENTER;
744	dstoff = exaGetPixmapOffset(pDst);
745	dstpitch = exaGetPixmapPitch(pDst);
746
747	switch (p->op) {
748		case PictOpOver:
749			dst = dstoff + (dstY * dstpitch) + (dstX << 2);
750			DPRINTF(X_ERROR, "Over %08x %08x, %d %d\n",
751			    p->mskformat, p->dstformat, srcX, srcY);
752			if (p->source_is_solid) {
753				switch (p->mskformat) {
754					case PICT_a8:
755						msk = p->mskoff +
756						    (maskY * p->mskpitch) +
757						    maskX;
758						CG14Comp_Over8Solid(p,
759						    msk, p->mskpitch,
760						    dst, dstpitch,
761						    width, height);
762						break;
763					case PICT_a8r8g8b8:
764					case PICT_a8b8g8r8:
765						msk = p->mskoff +
766						    (maskY * p->mskpitch) +
767						    (maskX << 2);
768						CG14Comp_Over32Solid(p,
769						    msk, p->mskpitch,
770						    dst, dstpitch,
771						    width, height);
772						break;
773					default:
774						xf86Msg(X_ERROR,
775						    "unsupported mask format\n");
776				}
777			} else {
778				DPRINTF(X_ERROR, "non-solid over with msk %x\n", p->mskformat);
779				switch (p->srcformat) {
780					case PICT_a8r8g8b8:
781					case PICT_a8b8g8r8:
782						src = p->srcoff +
783						    (srcY * p->srcpitch) +
784						    (srcX << 2);
785						dst = dstoff +
786						    (dstY * dstpitch) +
787						    (dstX << 2);
788						if (p->mskformat == PICT_a8) {
789							msk = p->mskoff +
790							    (maskY * p->mskpitch) +
791							    maskX;
792							CG14Comp_Over32Mask(p,
793							    src, p->srcpitch,
794							    msk, p->mskpitch,
795							    dst, dstpitch,
796							    width, height);
797						} else {
798							CG14Comp_Over32(p,
799							    src, p->srcpitch,
800							    dst, dstpitch,
801							    width, height);
802						}
803						break;
804					case PICT_x8r8g8b8:
805					case PICT_x8b8g8r8:
806						xf86Msg(X_ERROR, "alpha better be separate\n");
807						break;
808					default:
809						xf86Msg(X_ERROR, "%s: format %x in non-solid Over op\n",
810						    __func__, p->srcformat);
811				}
812			}
813			break;
814		case PictOpAdd:
815			DPRINTF(X_ERROR, "Add %08x %08x\n",
816			    p->srcformat, p->dstformat);
817			switch (p->srcformat) {
818				case PICT_a8:
819					src = p->srcoff +
820					    (srcY * p->srcpitch) + srcX;
821					dst = dstoff + (dstY * dstpitch) + dstX;
822					CG14Comp_Add8(p, src, p->srcpitch,
823					    dst, dstpitch, width, height);
824					break;
825				case PICT_a8r8g8b8:
826				case PICT_x8r8g8b8:
827					src = p->srcoff +
828					    (srcY * p->srcpitch) + (srcX << 2);
829					dst = dstoff + (dstY * dstpitch) +
830					    (dstX << 2);
831					CG14Comp_Add32(p, src, p->srcpitch,
832					    dst, dstpitch, width, height);
833					break;
834				default:
835					xf86Msg(X_ERROR,
836					    "unsupported src format\n");
837			}
838			break;
839		case PictOpSrc:
840			DPRINTF(X_ERROR, "Src %08x %08x\n",
841			    p->srcformat, p->dstformat);
842			CG14Copy(pDst, srcX, srcY, dstX, dstY, width, height);
843			break;
844		default:
845			xf86Msg(X_ERROR, "unsupported op %d\n", p->op);
846	}
847	exaMarkSync(pDst->drawable.pScreen);
848}
849
850
851
852Bool
853CG14InitAccel(ScreenPtr pScreen)
854{
855	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
856	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
857	ExaDriverPtr pExa;
858
859	pExa = exaDriverAlloc();
860	if (!pExa)
861		return FALSE;
862
863	p->pExa = pExa;
864
865	pExa->exa_major = EXA_VERSION_MAJOR;
866	pExa->exa_minor = EXA_VERSION_MINOR;
867
868	pExa->memoryBase = p->fb;
869	pExa->memorySize = p->memsize;
870	pExa->offScreenBase = p->width * p->height * 4;
871
872	/*
873	 * SX memory instructions are written to 64bit aligned addresses with
874	 * a 3 bit displacement. Make sure the displacement remains constant
875	 * within one column
876	 */
877
878	pExa->pixmapOffsetAlign = 8;
879	pExa->pixmapPitchAlign = 8;
880
881	pExa->flags = EXA_OFFSCREEN_PIXMAPS |
882		      /*EXA_SUPPORTS_OFFSCREEN_OVERLAPS |*/
883		      EXA_MIXED_PIXMAPS;
884
885	/*
886	 * these limits are bogus
887	 * SX doesn't deal with coordinates at all, so there is no limit but
888	 * we have to put something here
889	 */
890	pExa->maxX = 4096;
891	pExa->maxY = 4096;
892
893	pExa->WaitMarker = CG14WaitMarker;
894
895	pExa->PrepareSolid = CG14PrepareSolid;
896	pExa->Solid = CG14Solid;
897	pExa->DoneSolid = CG14DoneCopy;
898	pExa->PrepareCopy = CG14PrepareCopy;
899	pExa->Copy = CG14Copy;
900	pExa->DoneCopy = CG14DoneCopy;
901	if (p->use_xrender) {
902		pExa->CheckComposite = CG14CheckComposite;
903		pExa->PrepareComposite = CG14PrepareComposite;
904		pExa->Composite = CG14Composite;
905		pExa->DoneComposite = CG14DoneCopy;
906	}
907
908	/* EXA hits more optimized paths when it does not have to fallback
909	 * because of missing UTS/DFS, hook memcpy-based UTS/DFS.
910	 */
911	pExa->UploadToScreen = CG14UploadToScreen;
912	pExa->DownloadFromScreen = CG14DownloadFromScreen;
913
914	/* do some hardware init */
915	write_sx_reg(p, SX_PLANEMASK, 0xffffffff);
916	p->last_mask = 0xffffffff;
917	write_sx_reg(p, SX_ROP_CONTROL, 0xcc);
918	p->last_rop = 0xcc;
919	return exaDriverInit(pScreen, pExa);
920}
921