cg14_accel.c revision 76a85281
1/* $NetBSD: cg14_accel.c,v 1.24 2021/12/10 19:42:07 macallan Exp $ */
2/*
3 * Copyright (c) 2013 Michael Lorenz
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 *    - Redistributions of source code must retain the above copyright
11 *      notice, this list of conditions and the following disclaimer.
12 *    - Redistributions in binary form must reproduce the above
13 *      copyright notice, this list of conditions and the following
14 *      disclaimer in the documentation and/or other materials provided
15 *      with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 */
31
32#ifdef HAVE_CONFIG_H
33#include "config.h"
34#endif
35
36#include <sys/types.h>
37
/* all drivers need this */
39#include "xf86.h"
40#include "xf86_OSproc.h"
41#include "compiler.h"
42
43#include "cg14.h"
44
/*
 * Uncomment to get a trace of every accelerated entry point and verbose
 * parameter dumps in the X server log.
 */
//#define SX_DEBUG

/*
 * ENTER logs the name of the current function, DPRINTF is an xf86Msg()
 * that only produces output in debug builds.
 * Note that ENTER carries its own trailing semicolon and that the non-debug
 * DPRINTF expands to a dangling 'while (0)', so both are only safe when
 * used as complete statements.
 */
#ifdef SX_DEBUG
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DPRINTF xf86Msg
#else
#define ENTER
#define DPRINTF while (0) xf86Msg
#endif

/* number of elements in a true array ( not a pointer ) */
#define arraysize(ary)	(sizeof(ary) / sizeof((ary)[0]))
56
/*
 * Maps an X11 GC function ( used as index ) to the value programmed into
 * SX_ROP_CONTROL. 0xcc is SX's GXcopy equivalent, 0xaa leaves the
 * destination untouched.
 */
uint32_t sx_rop[] = {
	0x00,	/* GXclear */
	0x88,	/* GXand */
	0x44,	/* GXandReverse */
	0xcc,	/* GXcopy */
	0x22,	/* GXandInverted */
	0xaa,	/* GXnoop */
	0x66,	/* GXxor */
	0xee,	/* GXor */
	0x11,	/* GXnor */
	0x99,	/* GXequiv */
	0x55,	/* GXinvert */
	0xdd,	/* GXorReverse */
	0x33,	/* GXcopyInverted */
	0xbb,	/* GXorInverted */
	0x77,	/* GXnand */
	0xff	/* GXset */
};
60
61int src_formats[] = {PICT_a8r8g8b8, PICT_x8r8g8b8,
62		     PICT_a8b8g8r8, PICT_x8b8g8r8, PICT_a8};
63int tex_formats[] = {PICT_a8r8g8b8, PICT_a8b8g8r8, PICT_a8};
64
65static void CG14Copy32(PixmapPtr, int, int, int, int, int, int);
66static void CG14Copy8(PixmapPtr, int, int, int, int, int, int);
67
68static inline void
69CG14Wait(Cg14Ptr p)
70{
71	int bail = 10000000;
72	/* we wait for the busy bit to clear */
73	while (((read_sx_reg(p, SX_CONTROL_STATUS) & SX_BZ) != 0) &&
74	       (bail > 0)) {
75		bail--;
76	};
77	if (bail == 0) {
78		xf86Msg(X_ERROR, "SX wait for idle timed out %08x %08x\n",
79		    read_sx_reg(p, SX_CONTROL_STATUS),
80		    read_sx_reg(p, SX_ERROR));
81	}
82}
83
84static void
85CG14WaitMarker(ScreenPtr pScreen, int Marker)
86{
87	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
88	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
89
90	CG14Wait(p);
91}
92
93static Bool
94CG14PrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap,
95		int xdir, int ydir, int alu, Pixel planemask)
96{
97	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
98	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
99
100	ENTER;
101	DPRINTF(X_ERROR, "%s bpp %d rop %x\n", __func__,
102	    pSrcPixmap->drawable.bitsPerPixel, alu);
103
104	if (planemask != p->last_mask) {
105		CG14Wait(p);
106		write_sx_reg(p, SX_PLANEMASK, planemask);
107		p->last_mask = planemask;
108	}
109	alu = sx_rop[alu];
110	if (alu != p->last_rop) {
111		CG14Wait(p);
112		write_sx_reg(p, SX_ROP_CONTROL, alu);
113		p->last_rop = alu;
114	}
115	switch (pSrcPixmap->drawable.bitsPerPixel)  {
116		case 8:
117			p->pExa->Copy = CG14Copy8;
118			break;
119		case 32:
120			p->pExa->Copy = CG14Copy32;
121			break;
122		default:
123			xf86Msg(X_ERROR, "%s depth %d\n", __func__,
124			    pSrcPixmap->drawable.bitsPerPixel);
125	}
126	p->srcpitch = exaGetPixmapPitch(pSrcPixmap);
127	p->srcoff = exaGetPixmapOffset(pSrcPixmap);
128	p->xdir = xdir;
129	p->ydir = ydir;
130	return TRUE;
131}
132
133static void
134CG14Copy32(PixmapPtr pDstPixmap,
135         int srcX, int srcY, int dstX, int dstY, int w, int h)
136{
137	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
138	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
139	int dstpitch, dstoff, srcpitch, srcoff;
140	int srcstart, dststart, xinc, srcinc, dstinc;
141	int line, count, s, d, num;
142
143	ENTER;
144	dstpitch = exaGetPixmapPitch(pDstPixmap);
145	dstoff = exaGetPixmapOffset(pDstPixmap);
146	srcpitch = p->srcpitch;
147	srcoff = p->srcoff;
148	/*
149	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
150	 * actually wrote anything and only sync if it did
151	 */
152	srcstart = (srcX << 2) + (srcpitch * srcY) + srcoff;
153	dststart = (dstX << 2) + (dstpitch * dstY) + dstoff;
154
155	/*
156	 * we always copy up to 32 pixels at a time so direction doesn't
157	 * matter if w<=32
158	 */
159	if (w > 32) {
160		if (p->xdir < 0) {
161			srcstart += (w - 32) << 2;
162			dststart += (w - 32) << 2;
163			xinc = -128;
164		} else
165			xinc = 128;
166	} else
167		xinc = 128;
168	if (p->ydir < 0) {
169		srcstart += (h - 1) * srcpitch;
170		dststart += (h - 1) * dstpitch;
171		srcinc = -srcpitch;
172		dstinc = -dstpitch;
173	} else {
174		srcinc = srcpitch;
175		dstinc = dstpitch;
176	}
177	if (p->last_rop == 0xcc) {
178		/* plain old copy */
179		if ( xinc > 0) {
180			/* going left to right */
181			for (line = 0; line < h; line++) {
182				count = 0;
183				s = srcstart;
184				d = dststart;
185				while ( count < w) {
186					num = min(32, w - count);
187					write_sx_io(p, s,
188					    SX_LD(10, num - 1, s & 7));
189					write_sx_io(p, d,
190					    SX_STM(10, num - 1, d & 7));
191					s += xinc;
192					d += xinc;
193					count += 32;
194				}
195				srcstart += srcinc;
196				dststart += dstinc;
197			}
198		} else {
199			/* going right to left */
200			int i, chunks = (w >> 5);
201			for (line = 0; line < h; line++) {
202				s = srcstart;
203				d = dststart;
204				count = w;
205				for (i = 0; i < chunks; i++) {
206					write_sx_io(p, s,
207					    SX_LD(10, 31, s & 7));
208					write_sx_io(p, d,
209					    SX_STM(10, 31, d & 7));
210					s -= 128;
211					d -= 128;
212					count -= 32;
213				}
214				/* leftovers, if any */
215				if (count > 0) {
216					s += (32 - count) << 2;
217					d += (32 - count) << 2;
218					write_sx_io(p, s,
219					    SX_LD(10, count - 1, s & 7));
220					write_sx_io(p, d,
221					    SX_STM(10, count - 1, d & 7));
222				}
223				srcstart += srcinc;
224				dststart += dstinc;
225			}
226		}
227	} else {
228		/* ROPs needed */
229		if ( xinc > 0) {
230			/* going left to right */
231			for (line = 0; line < h; line++) {
232				count = 0;
233				s = srcstart;
234				d = dststart;
235				while ( count < w) {
236					num = min(32, w - count);
237					write_sx_io(p, s,
238					    SX_LD(10, num - 1, s & 7));
239					write_sx_io(p, d,
240					    SX_LD(42, num - 1, d & 7));
241					if (num > 16) {
242						write_sx_reg(p, SX_INSTRUCTIONS,
243					    	 SX_ROP(10, 42, 74, 15));
244						write_sx_reg(p, SX_INSTRUCTIONS,
245					    	 SX_ROP(26, 58, 90, num - 17));
246					} else {
247						write_sx_reg(p, SX_INSTRUCTIONS,
248					    	 SX_ROP(10, 42, 74, num - 1));
249					}
250					write_sx_io(p, d,
251					    SX_STM(74, num - 1, d & 7));
252					s += xinc;
253					d += xinc;
254					count += 32;
255				}
256				srcstart += srcinc;
257				dststart += dstinc;
258			}
259		} else {
260			/* going right to left */
261			int i, chunks = (w >> 5);
262			for (line = 0; line < h; line++) {
263				s = srcstart;
264				d = dststart;
265				count = w;
266				for (i = 0; i < chunks; i++) {
267					write_sx_io(p, s, SX_LD(10, 31, s & 7));
268					write_sx_io(p, d, SX_LD(42, 31, d & 7));
269					write_sx_reg(p, SX_INSTRUCTIONS,
270				    	    SX_ROP(10, 42, 74, 15));
271					write_sx_reg(p, SX_INSTRUCTIONS,
272				    	    SX_ROP(26, 58, 90, 15));
273					write_sx_io(p, d,
274					    SX_STM(74, 31, d & 7));
275					s -= 128;
276					d -= 128;
277					count -= 32;
278				}
279				/* leftovers, if any */
280				if (count > 0) {
281					s += (32 - count) << 2;
282					d += (32 - count) << 2;
283					write_sx_io(p, s,
284					    SX_LD(10, count - 1, s & 7));
285					write_sx_io(p, d,
286					    SX_LD(42, count - 1, d & 7));
287					if (count > 16) {
288						write_sx_reg(p, SX_INSTRUCTIONS,
289					    	    SX_ROP(10, 42, 74, 15));
290						write_sx_reg(p, SX_INSTRUCTIONS,
291					    	 SX_ROP(26, 58, 90, count - 17));
292					} else {
293						write_sx_reg(p, SX_INSTRUCTIONS,
294					    	 SX_ROP(10, 42, 74, count - 1));
295					}
296
297					write_sx_io(p, d,
298					    SX_STM(74, count - 1, d & 7));
299				}
300				srcstart += srcinc;
301				dststart += dstinc;
302			}
303		}
304	}
305	exaMarkSync(pDstPixmap->drawable.pScreen);
306}
307
308/*
309 * copy with same alignment, left to right, no ROP
310 */
311static void
312CG14Copy8_aligned_norop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
313{
314	int saddr, daddr, pre, cnt, wrds;
315
316	ENTER;
317
318	pre = srcstart & 3;
319	if (pre != 0) pre = 4 - pre;
320	pre = min(pre, w);
321
322	while (h > 0) {
323		saddr = srcstart;
324		daddr = dststart;
325		cnt = w;
326		if (pre > 0) {
327			write_sx_io(p, saddr & ~7, SX_LDB(8, pre - 1, saddr & 7));
328			write_sx_io(p, daddr & ~7, SX_STB(8, pre - 1, daddr & 7));
329			saddr += pre;
330			daddr += pre;
331			cnt -= pre;
332			if (cnt == 0) goto next;
333		}
334		while (cnt > 3) {
335			wrds = min(32, cnt >> 2);
336			write_sx_io(p, saddr & ~7, SX_LD(8, wrds - 1, saddr & 7));
337			write_sx_io(p, daddr & ~7, SX_ST(8, wrds - 1, daddr & 7));
338			saddr += wrds << 2;
339			daddr += wrds << 2;
340			cnt -= wrds << 2;
341		}
342		if (cnt > 0) {
343			write_sx_io(p, saddr & ~7, SX_LDB(8, cnt - 1, saddr & 7));
344			write_sx_io(p, daddr & ~7, SX_STB(8, cnt - 1, daddr & 7));
345		}
346next:
347		srcstart += srcpitch;
348		dststart += dstpitch;
349		h--;
350	}
351}
352
353/*
354 * copy with same alignment, left to right, ROP
355 */
356static void
357CG14Copy8_aligned_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
358{
359	int saddr, daddr, pre, cnt, wrds;
360
361	ENTER;
362
363	pre = srcstart & 3;
364	if (pre != 0) pre = 4 - pre;
365	pre = min(pre, w);
366
367	while (h > 0) {
368		saddr = srcstart;
369		daddr = dststart;
370		cnt = w;
371		if (pre > 0) {
372			write_sx_io(p, saddr & ~7, SX_LDB(8, pre - 1, saddr & 7));
373			write_sx_io(p, daddr & ~7, SX_LDB(40, pre - 1, daddr & 7));
374			write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 40, 72, pre - 1));
375			write_sx_io(p, daddr & ~7, SX_STB(72, pre - 1, daddr & 7));
376			saddr += pre;
377			daddr += pre;
378			cnt -= pre;
379			if (cnt == 0) goto next;
380		}
381		while (cnt > 3) {
382			wrds = min(32, cnt >> 2);
383			write_sx_io(p, saddr & ~7, SX_LD(8, wrds - 1, saddr & 7));
384			write_sx_io(p, daddr & ~7, SX_LD(40, wrds - 1, daddr & 7));
385			if (cnt > 16) {
386				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 40, 72, 15));
387				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 56, 88, wrds - 17));
388			} else
389				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 40, 72, wrds - 1));
390			write_sx_io(p, daddr & ~7, SX_ST(72, wrds - 1, daddr & 7));
391			saddr += wrds << 2;
392			daddr += wrds << 2;
393			cnt -= wrds << 2;
394		}
395		if (cnt > 0) {
396			write_sx_io(p, saddr & ~7, SX_LDB(8, cnt - 1, saddr & 7));
397			write_sx_io(p, daddr & ~7, SX_LDB(40, cnt - 1, daddr & 7));
398			write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 40, 72, cnt - 1));
399			write_sx_io(p, daddr & ~7, SX_STB(72, cnt - 1, daddr & 7));
400		}
401next:
402		srcstart += srcpitch;
403		dststart += dstpitch;
404		h--;
405	}
406}
407
408/* up to 124 pixels so direction doesn't matter, unaligned, ROP */
409static void
410CG14Copy8_short_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
411{
412	int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
413	int ssreg;
414#ifdef DEBUG
415	int taddr = 4 + dstpitch * 50;
416#endif
417	uint32_t lmask, rmask;
418	ENTER;
419
420	pre = dststart & 3;
421	lmask = 0xffffffff >> pre;
422	spre = srcstart & 3;
423	/*
424	 * make sure we count all the words needed to cover the destination
425	 * line, covering potential partials on both ends
426	 */
427	wrds = (w + pre + 3) >> 2;
428	swrds = (w + spre + 3) >> 2;
429
430	if (spre < pre) {
431		dist = 32 - (pre - spre) * 8;
432		sreg = 9;
433	} else {
434		dist = (spre - pre) * 8;
435		sreg = 8;
436	}
437
438	/*
439	 * mask out trailing pixels to avoid partial writes
440	 */
441	post = (dststart + w) & 3;
442	if (post != 0) {
443		rmask = ~(0xffffffff >> (post * 8));
444		write_sx_reg(p, SX_QUEUED(7), rmask);
445		write_sx_reg(p, SX_QUEUED(6), ~rmask);
446	}
447
448	DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
449	    w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);
450
451	/* mask out the leading pixels in dst by using a mask and ROP */
452	if (pre != 0) {
453		write_sx_reg(p, SX_ROP_CONTROL, (p->last_rop & 0xf0) | 0xa);
454		write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
455	}
456
457	saddr = srcstart & ~3;
458	daddr = dststart & ~3;
459
460	while (h > 0) {
461		write_sx_io(p, daddr & ~7, SX_LD(80, wrds - 1, daddr & 7));
462		write_sx_io(p, saddr & ~7, SX_LD(sreg, swrds - 1, saddr & 7));
463		if (wrds > 15) {
464			if (dist != 0) {
465				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, 15));
466				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(24, dist, 56, wrds - 16));
467				/* shifted source pixels are now at register 40+ */
468				ssreg = 40;
469			} else ssreg = 8;
470			if (pre != 0) {
471				/* mask out leading junk */
472				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
473				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, 8, 0));
474				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
475				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg + 1, 81, 9, 14));
476			} else {
477				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, 8, 15));
478			}
479			write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg + 16, 96, 24, wrds - 16));
480		} else {
481			if (dist != 0) {
482				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, wrds));
483				ssreg = 40;
484			} else ssreg = 8;
485			if (pre != 0) {
486				/* mask out leading junk */
487				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
488				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, 8, 0));
489				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
490				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg + 1, 81, 9, wrds));
491			} else {
492				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, 8, wrds));
493			}
494		}
495		if (post != 0) {
496			/*
497			 * if the last word to be written out is a partial we
498			 * mask out the leftovers and replace them with
499			 * background pixels
500			 * we could pull the same ROP * mask trick as we do on
501			 * the left end but it's less annoying this way and
502			 * the instruction count is the same
503			 */
504			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(7 + wrds, 7, 5, 0));
505			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(79 + wrds, 6, 4, 0));
506			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(5, 4, 7 + wrds, 0));
507		}
508#ifdef DEBUG
509		write_sx_io(p, taddr & ~7, SX_ST(40, wrds - 1, taddr & 7));
510		taddr += dstpitch;
511#endif
512		write_sx_io(p, daddr & ~7, SX_ST(8, wrds - 1, daddr & 7));
513		saddr += srcpitch;
514		daddr += dstpitch;
515		h--;
516	}
517}
518
519/* up to 124 pixels so direction doesn't matter, unaligned, straight copy */
520static void
521CG14Copy8_short_norop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
522{
523	int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
524	int ssreg;
525#ifdef DEBUG
526	int taddr = 4 + dstpitch * 50;
527#endif
528	uint32_t lmask, rmask;
529	ENTER;
530
531	pre = dststart & 3;
532	lmask = 0xffffffff >> pre;
533	spre = srcstart & 3;
534	/*
535	 * make sure we count all the words needed to cover the destination
536	 * line, covering potential partials on both ends
537	 */
538	wrds = (w + pre + 3) >> 2;
539	swrds = (w + spre + 3) >> 2;
540
541	if (spre < pre) {
542		dist = 32 - (pre - spre) * 8;
543		sreg = 9;
544	} else {
545		dist = (spre - pre) * 8;
546		sreg = 8;
547	}
548
549	/*
550	 * mask out trailing pixels to avoid partial writes
551	 */
552	post = (dststart + w) & 3;
553	if (post != 0) {
554		rmask = ~(0xffffffff >> (post * 8));
555		write_sx_reg(p, SX_QUEUED(7), rmask);
556		write_sx_reg(p, SX_QUEUED(6), ~rmask);
557	}
558
559	DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
560	    w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);
561
562	/* mask out the leading pixels in dst by using a mask and ROP */
563	if (pre != 0) {
564		write_sx_reg(p, SX_ROP_CONTROL, 0xca);
565		write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
566	}
567
568	saddr = srcstart & ~3;
569	daddr = dststart & ~3;
570
571	while (h > 0) {
572		write_sx_io(p, saddr & ~7, SX_LD(sreg, swrds - 1, saddr & 7));
573		if (wrds > 15) {
574			if (dist != 0) {
575				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, 15));
576				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(24, dist, 56, wrds - 16));
577				/* shifted source pixels are now at register 40+ */
578				ssreg = 40;
579			} else ssreg = 8;
580			if (pre != 0) {
581				/* read only the first word */
582				write_sx_io(p, daddr & ~7, SX_LD(80, 0, daddr & 7));
583				/* mask out leading junk */
584				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, ssreg, 0));
585			}
586		} else {
587			if (dist != 0) {
588				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, wrds));
589				ssreg = 40;
590			} else ssreg = 8;
591			if (pre != 0) {
592				/* read only the first word */
593				write_sx_io(p, daddr & ~7, SX_LD(80, 0, daddr & 7));
594				/* mask out leading junk */
595				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, ssreg, 0));
596			}
597		}
598		if (post != 0) {
599			int laddr = daddr + ((wrds - 1) << 2);
600			/*
601			 * if the last word to be written out is a partial we
602			 * mask out the leftovers and replace them with
603			 * background pixels
604			 * we could pull the same ROP * mask trick as we do on
605			 * the left end but it's less annoying this way and
606			 * the instruction count is the same
607			 */
608			write_sx_io(p, laddr & ~7, SX_LD(81, 0, laddr & 7));
609			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(ssreg + wrds - 1, 7, 5, 0));
610			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(81, 6, 4, 0));
611			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(5, 4, ssreg + wrds - 1, 0));
612		}
613#ifdef DEBUG
614		write_sx_io(p, taddr & ~7, SX_ST(40, wrds - 1, taddr & 7));
615		taddr += dstpitch;
616#endif
617		write_sx_io(p, daddr & ~7, SX_ST(ssreg, wrds - 1, daddr & 7));
618		saddr += srcpitch;
619		daddr += dstpitch;
620		h--;
621	}
622}
623
624static void
625CG14Copy8(PixmapPtr pDstPixmap,
626         int srcX, int srcY, int dstX, int dstY, int w, int h)
627{
628	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
629	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
630	int dstpitch, dstoff, srcpitch, srcoff;
631	int srcstart, dststart, xinc, srcinc, dstinc;
632	int line, count, s, d, num;
633
634	ENTER;
635	dstpitch = exaGetPixmapPitch(pDstPixmap);
636	dstoff = exaGetPixmapOffset(pDstPixmap);
637	srcpitch = p->srcpitch;
638	srcoff = p->srcoff;
639	/*
640	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
641	 * actually wrote anything and only sync if it did
642	 */
643	srcstart = srcX + (srcpitch * srcY) + srcoff;
644	dststart = dstX + (dstpitch * dstY) + dstoff;
645
646	if (p->ydir < 0) {
647		srcstart += (h - 1) * srcpitch;
648		dststart += (h - 1) * dstpitch;
649		srcinc = -srcpitch;
650		dstinc = -dstpitch;
651	} else {
652		srcinc = srcpitch;
653		dstinc = dstpitch;
654	}
655
656	/*
657	 * this copies up to 124 pixels wide in one go, so horizontal
658	 * direction / overlap don't matter
659	 * uses all 32bit accesses and funnel shifter for unaligned copies
660	 */
661	if ((w < 125) && (w > 8)) {
662		switch (p->last_rop) {
663			case 0xcc:
664				CG14Copy8_short_norop(p, srcstart, dststart, w, h, srcinc, dstinc);
665				break;
666			default:
667				CG14Copy8_short_rop(p, srcstart, dststart, w, h, srcinc, dstinc);
668		}
669		return;
670	}
671
672	/*
673	 * only invert x direction if absolutely necessary, it's a pain to
674	 * go backwards on SX so avoid as much as possible
675	 */
676	if ((p->xdir < 0) && (srcoff == dstoff) && (srcY == dstY)) {
677		srcstart += (w - 32);
678		dststart += (w - 32);
679		xinc = -32;
680	} else
681		xinc = 32;
682
683	/*
684	 * for aligned copies we can go all 32bit and avoid VRAM reads in the
685	 * most common case
686	 */
687	if (((srcstart & 3) == (dststart & 3)) && (xinc > 0)) {
688		switch (p->last_rop) {
689			case 0xcc:
690				CG14Copy8_aligned_norop(p, srcstart, dststart, w, h, srcinc, dstinc);
691				break;
692			default:
693				CG14Copy8_aligned_rop(p, srcstart, dststart, w, h, srcinc, dstinc);
694		}
695		return;
696	}
697
698	if (p->last_rop == 0xcc) {
699		/* plain old copy */
700		if ( xinc > 0) {
701			/* going left to right */
702			for (line = 0; line < h; line++) {
703				count = 0;
704				s = srcstart;
705				d = dststart;
706				while ( count < w) {
707					num = min(32, w - count);
708					write_sx_io(p, s,
709					    SX_LDB(10, num - 1, s & 7));
710					write_sx_io(p, d,
711					    SX_STBM(10, num - 1, d & 7));
712					s += xinc;
713					d += xinc;
714					count += 32;
715				}
716				srcstart += srcinc;
717				dststart += dstinc;
718			}
719		} else {
720			/* going right to left */
721			int i, chunks = (w >> 5);
722			for (line = 0; line < h; line++) {
723				s = srcstart;
724				d = dststart;
725				count = w;
726				for (i = 0; i < chunks; i++) {
727					write_sx_io(p, s,
728					    SX_LDB(10, 31, s & 7));
729					write_sx_io(p, d,
730					    SX_STBM(10, 31, d & 7));
731					s -= 32;
732					d -= 32;
733					count -= 32;
734				}
735				/* leftovers, if any */
736				if (count > 0) {
737					s += (32 - count);
738					d += (32 - count);
739					write_sx_io(p, s,
740					    SX_LDB(10, count - 1, s & 7));
741					write_sx_io(p, d,
742					    SX_STBM(10, count - 1, d & 7));
743				}
744				srcstart += srcinc;
745				dststart += dstinc;
746			}
747		}
748	} else {
749		/* ROPs needed */
750		if ( xinc > 0) {
751			/* going left to right */
752			for (line = 0; line < h; line++) {
753				count = 0;
754				s = srcstart;
755				d = dststart;
756				while ( count < w) {
757					num = min(32, w - count);
758					write_sx_io(p, s,
759					    SX_LDB(10, num - 1, s & 7));
760					write_sx_io(p, d,
761					    SX_LDB(42, num - 1, d & 7));
762					if (num > 16) {
763						write_sx_reg(p, SX_INSTRUCTIONS,
764					    	 SX_ROP(10, 42, 74, 15));
765						write_sx_reg(p, SX_INSTRUCTIONS,
766					    	 SX_ROP(26, 58, 90, num - 17));
767					} else {
768						write_sx_reg(p, SX_INSTRUCTIONS,
769					    	 SX_ROP(10, 42, 74, num - 1));
770					}
771					write_sx_io(p, d,
772					    SX_STBM(74, num - 1, d & 7));
773					s += xinc;
774					d += xinc;
775					count += 32;
776				}
777				srcstart += srcinc;
778				dststart += dstinc;
779			}
780		} else {
781			/* going right to left */
782			int i, chunks = (w >> 5);
783			for (line = 0; line < h; line++) {
784				s = srcstart;
785				d = dststart;
786				count = w;
787				for (i = 0; i < chunks; i++) {
788					write_sx_io(p, s, SX_LDB(10, 31, s & 7));
789					write_sx_io(p, d, SX_LDB(42, 31, d & 7));
790					write_sx_reg(p, SX_INSTRUCTIONS,
791				    	    SX_ROP(10, 42, 74, 15));
792					write_sx_reg(p, SX_INSTRUCTIONS,
793				    	    SX_ROP(26, 58, 90, 15));
794					write_sx_io(p, d,
795					    SX_STBM(74, 31, d & 7));
796					s -= 128;
797					d -= 128;
798					count -= 32;
799				}
800				/* leftovers, if any */
801				if (count > 0) {
802					s += (32 - count);
803					d += (32 - count);
804					write_sx_io(p, s,
805					    SX_LDB(10, count - 1, s & 7));
806					write_sx_io(p, d,
807					    SX_LDB(42, count - 1, d & 7));
808					if (count > 16) {
809						write_sx_reg(p, SX_INSTRUCTIONS,
810					    	    SX_ROP(10, 42, 74, 15));
811						write_sx_reg(p, SX_INSTRUCTIONS,
812					    	 SX_ROP(26, 58, 90, count - 17));
813					} else {
814						write_sx_reg(p, SX_INSTRUCTIONS,
815					    	 SX_ROP(10, 42, 74, count - 1));
816					}
817
818					write_sx_io(p, d,
819					    SX_STBM(74, count - 1, d & 7));
820				}
821				srcstart += srcinc;
822				dststart += dstinc;
823			}
824		}
825	}
826	exaMarkSync(pDstPixmap->drawable.pScreen);
827}
828
829static void
830CG14DoneCopy(PixmapPtr pDstPixmap)
831{
832}
833
834static Bool
835CG14PrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
836{
837	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
838	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
839
840	ENTER;
841	DPRINTF(X_ERROR, "bits per pixel: %d %08lx\n",
842	    pPixmap->drawable.bitsPerPixel, fg);
843
844	/*
845	 * GXset and GXclear are really just specual cases of GXcopy with
846	 * fixed fill colour
847	 */
848	switch (alu) {
849		case GXclear:
850			alu = GXcopy;
851			fg = 0;
852			break;
853		case GXset:
854			alu = GXcopy;
855			fg = 0xffffffff;
856			break;
857	}
858	/* repeat the colour in every sub byte if we're in 8 bit */
859	if (pPixmap->drawable.bitsPerPixel == 8) {
860		fg |= fg << 8;
861		fg |= fg << 16;
862	}
863	write_sx_reg(p, SX_QUEUED(8), fg);
864	write_sx_reg(p, SX_QUEUED(9), fg);
865	if (planemask != p->last_mask) {
866		CG14Wait(p);
867		write_sx_reg(p, SX_PLANEMASK, planemask);
868		p->last_mask = planemask;
869	}
870	alu = sx_rop[alu];
871	if (alu != p->last_rop) {
872		CG14Wait(p);
873		write_sx_reg(p, SX_ROP_CONTROL, alu);
874		p->last_rop = alu;
875	}
876
877	DPRINTF(X_ERROR, "%s: %x\n", __func__, alu);
878	return TRUE;
879}
880
881static void
882CG14Solid32(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
883{
884	int line, x, num;
885	uint32_t ptr;
886
887	ENTER;
888	if (p->last_rop == 0xcc) {
889		/* simple fill */
890		for (line = 0; line < h; line++) {
891			x = 0;
892			while (x < w) {
893				ptr = start + (x << 2);
894				num = min(32, w - x);
895				write_sx_io(p, ptr,
896				    SX_STS(8, num - 1, ptr & 7));
897				x += 32;
898			}
899			start += pitch;
900		}
901	} else if (p->last_rop == 0xaa) {
902		/* nothing to do here */
903		return;
904	} else {
905		/* alright, let's do actual ROP stuff */
906
907		/* first repeat the fill colour into 16 registers */
908		write_sx_reg(p, SX_INSTRUCTIONS,
909		    SX_SELECT_S(8, 8, 10, 15));
910
911		for (line = 0; line < h; line++) {
912			x = 0;
913			while (x < w) {
914				ptr = start + (x << 2);
915				num = min(32, w - x);
916				/* now suck fb data into registers */
917				write_sx_io(p, ptr,
918				    SX_LD(42, num - 1, ptr & 7));
919				/*
920				 * ROP them with the fill data we left in 10
921				 * non-memory ops can only have counts up to 16
922				 */
923				if (num <= 16) {
924					write_sx_reg(p, SX_INSTRUCTIONS,
925					    SX_ROP(10, 42, 74, num - 1));
926				} else {
927					write_sx_reg(p, SX_INSTRUCTIONS,
928					    SX_ROP(10, 42, 74, 15));
929					write_sx_reg(p, SX_INSTRUCTIONS,
930					    SX_ROP(10, 58, 90, num - 17));
931				}
932				/* and write the result back into memory */
933				write_sx_io(p, ptr,
934				    SX_ST(74, num - 1, ptr & 7));
935				x += 32;
936			}
937			start += pitch;
938		}
939	}
940}
941
942static void
943CG14Solid8(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
944{
945	int line, num, pre, cnt;
946	uint32_t ptr;
947
948	ENTER;
949	pre = start & 3;
950	if (pre != 0) pre = 4 - pre;
951
952	if (p->last_rop == 0xcc) {
953		/* simple fill */
954		for (line = 0; line < h; line++) {
955			ptr = start;
956			cnt = w;
957			pre = min(pre, cnt);
958			if (pre) {
959				write_sx_io(p, ptr & ~7, SX_STBS(8, pre - 1, ptr & 7));
960				ptr += pre;
961				cnt -= pre;
962				if (cnt == 0) goto next;
963			}
964			/* now do the aligned pixels in 32bit chunks */
965			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
966			while(cnt > 3) {
967				num = min(32, cnt >> 2);
968				write_sx_io(p, ptr & ~7, SX_STS(8, num - 1, ptr & 7));
969				ptr += num << 2;
970				cnt -= num << 2;
971			}
972			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
973			if (cnt > 0) {
974				write_sx_io(p, ptr & ~7, SX_STBS(8, cnt - 1, ptr & 7));
975			}
976			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
977next:
978			start += pitch;
979		}
980	} else if (p->last_rop == 0xaa) {
981		/* nothing to do here */
982		return;
983	} else {
984		/* alright, let's do actual ROP stuff */
985
986		/* first repeat the fill colour into 16 registers */
987		write_sx_reg(p, SX_INSTRUCTIONS,
988		    SX_SELECT_S(8, 8, 10, 15));
989
990		for (line = 0; line < h; line++) {
991			ptr = start;
992			cnt = w;
993			pre = min(pre, cnt);
994			if (pre) {
995				write_sx_io(p, ptr & ~7, SX_LDB(26, pre - 1, ptr & 7));
996				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(10, 26, 42, pre - 1));
997				write_sx_io(p, ptr & ~7, SX_STB(42, pre - 1, ptr & 7));
998				ptr += pre;
999				cnt -= pre;
1000				if (cnt == 0) goto next2;
1001			}
1002			/* now do the aligned pixels in 32bit chunks */
1003			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
1004			while(cnt > 3) {
1005				num = min(32, cnt >> 2);
1006				write_sx_io(p, ptr & ~7, SX_LD(26, num - 1, ptr & 7));
1007				if (num <= 16) {
1008					write_sx_reg(p, SX_INSTRUCTIONS,
1009					    SX_ROP(10, 26, 58, num - 1));
1010				} else {
1011					write_sx_reg(p, SX_INSTRUCTIONS,
1012					    SX_ROP(10, 26, 58, 15));
1013					write_sx_reg(p, SX_INSTRUCTIONS,
1014					    SX_ROP(10, 42, 74, num - 17));
1015				}
1016				write_sx_io(p, ptr & ~7, SX_ST(58, num - 1, ptr & 7));
1017				ptr += num << 2;
1018				cnt -= num << 2;
1019			}
1020			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
1021			if (cnt > 0) {
1022				write_sx_io(p, ptr & ~7, SX_LDB(26, cnt - 1, ptr & 7));
1023				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(10, 26, 42, cnt - 1));
1024				write_sx_io(p, ptr & ~7, SX_STB(42, cnt - 1, ptr & 7));
1025			}
1026			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
1027next2:
1028			start += pitch;
1029		}
1030	}
1031}
1032
1033static void
1034CG14Solid(PixmapPtr pPixmap, int x1, int y1, int x2, int y2)
1035{
1036	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
1037	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1038	int w = x2 - x1, h = y2 - y1, dstoff, dstpitch;
1039	int start, depth;
1040
1041	ENTER;
1042	dstpitch = exaGetPixmapPitch(pPixmap);
1043	dstoff = exaGetPixmapOffset(pPixmap);
1044
1045	depth = pPixmap->drawable.bitsPerPixel;
1046	switch (depth) {
1047		case 32:
1048			start = dstoff + (y1 * dstpitch) + (x1 << 2);
1049			CG14Solid32(p, start, dstpitch, w, h);
1050			break;
1051		case 8:
1052			start = dstoff + (y1 * dstpitch) + x1;
1053			CG14Solid8(p, start, dstpitch, w, h);
1054			break;
1055	}
1056
1057	DPRINTF(X_ERROR, "Solid %d %d %d %d, %d %d -> %d\n", x1, y1, x2, y2,
1058	    dstpitch, dstoff, start);
1059	DPRINTF(X_ERROR, "%x %x %x\n", p->last_rop,
1060	    read_sx_reg(p, SX_QUEUED(8)), read_sx_reg(p, SX_QUEUED(9)));
1061	exaMarkSync(pPixmap->drawable.pScreen);
1062}
1063
1064/*
1065 * Memcpy-based UTS.
1066 */
1067static Bool
1068CG14UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
1069    char *src, int src_pitch)
1070{
1071	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
1072	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1073	char  *dst        = p->fb + exaGetPixmapOffset(pDst);
1074	int    dst_pitch  = exaGetPixmapPitch(pDst);
1075
1076	int bpp    = pDst->drawable.bitsPerPixel;
1077	int cpp    = (bpp + 7) >> 3;
1078	int wBytes = w * cpp;
1079
1080	ENTER;
1081	DPRINTF(X_ERROR, "%s depth %d\n", __func__, bpp);
1082	dst += (x * cpp) + (y * dst_pitch);
1083
1084	CG14Wait(p);
1085
1086	while (h--) {
1087		memcpy(dst, src, wBytes);
1088		src += src_pitch;
1089		dst += dst_pitch;
1090	}
1091	__asm("stbar;");
1092	return TRUE;
1093}
1094
1095/*
1096 * Memcpy-based DFS.
1097 */
1098static Bool
1099CG14DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
1100    char *dst, int dst_pitch)
1101{
1102	ScrnInfoPtr pScrn = xf86Screens[pSrc->drawable.pScreen->myNum];
1103	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1104	char  *src        = p->fb + exaGetPixmapOffset(pSrc);
1105	int    src_pitch  = exaGetPixmapPitch(pSrc);
1106
1107	ENTER;
1108	int bpp    = pSrc->drawable.bitsPerPixel;
1109	int cpp    = (bpp + 7) >> 3;
1110	int wBytes = w * cpp;
1111
1112	src += (x * cpp) + (y * src_pitch);
1113
1114	CG14Wait(p);
1115
1116	while (h--) {
1117		memcpy(dst, src, wBytes);
1118		src += src_pitch;
1119		dst += dst_pitch;
1120	}
1121
1122	return TRUE;
1123}
1124
1125Bool
1126CG14CheckComposite(int op, PicturePtr pSrcPicture,
1127                           PicturePtr pMaskPicture,
1128                           PicturePtr pDstPicture)
1129{
1130	int i, ok = FALSE;
1131
1132	ENTER;
1133
1134	/*
1135	 * SX is in theory capable of accelerating pretty much all Xrender ops,
1136	 * even coordinate transformation and gradients. Support will be added
1137	 * over time and likely have to spill over into its own source file.
1138	 */
1139
1140	if ((op != PictOpOver) && (op != PictOpAdd) && (op != PictOpSrc)) {
1141		DPRINTF(X_ERROR, "%s: rejecting %d\n", __func__, op);
1142		return FALSE;
1143	}
1144
1145	if (pSrcPicture != NULL) {
1146		i = 0;
1147		while ((i < arraysize(src_formats)) && (!ok)) {
1148			ok =  (pSrcPicture->format == src_formats[i]);
1149			i++;
1150		}
1151
1152		if (!ok) {
1153			DPRINTF(X_ERROR, "%s: unsupported src format %x\n",
1154			    __func__, pSrcPicture->format);
1155			return FALSE;
1156		}
1157		DPRINTF(X_ERROR, "src is %x, %d\n", pSrcPicture->format, op);
1158	}
1159
1160	if (pDstPicture != NULL) {
1161		i = 0;
1162		ok = FALSE;
1163		while ((i < arraysize(src_formats)) && (!ok)) {
1164			ok =  (pDstPicture->format == src_formats[i]);
1165			i++;
1166		}
1167
1168		if (!ok) {
1169			DPRINTF(X_ERROR, "%s: unsupported dst format %x\n",
1170			    __func__, pDstPicture->format);
1171			return FALSE;
1172		}
1173		DPRINTF(X_ERROR, "dst is %x, %d\n", pDstPicture->format, op);
1174	}
1175
1176	if (pMaskPicture != NULL) {
1177		DPRINTF(X_ERROR, "mask is %x %d %d\n", pMaskPicture->format,
1178		    pMaskPicture->pDrawable->width,
1179		    pMaskPicture->pDrawable->height);
1180	}
1181	return TRUE;
1182}
1183
1184Bool
1185CG14PrepareComposite(int op, PicturePtr pSrcPicture,
1186                             PicturePtr pMaskPicture,
1187                             PicturePtr pDstPicture,
1188                             PixmapPtr  pSrc,
1189                             PixmapPtr  pMask,
1190                             PixmapPtr  pDst)
1191{
1192	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
1193	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1194
1195	ENTER;
1196
1197	p->no_source_pixmap = FALSE;
1198	p->source_is_solid = FALSE;
1199
1200	if (pSrcPicture->format == PICT_a1) {
1201		xf86Msg(X_ERROR, "src mono, dst %x, op %d\n",
1202		    pDstPicture->format, op);
1203		if (pMaskPicture != NULL) {
1204			xf86Msg(X_ERROR, "msk %x\n", pMaskPicture->format);
1205		}
1206	}
1207	if (pSrcPicture->pSourcePict != NULL) {
1208		if (pSrcPicture->pSourcePict->type == SourcePictTypeSolidFill) {
1209			p->fillcolour =
1210			    pSrcPicture->pSourcePict->solidFill.color;
1211			DPRINTF(X_ERROR, "%s: solid src %08x\n",
1212			    __func__, p->fillcolour);
1213			p->no_source_pixmap = TRUE;
1214			p->source_is_solid = TRUE;
1215		}
1216	}
1217	if ((pMaskPicture != NULL) && (pMaskPicture->pSourcePict != NULL)) {
1218		if (pMaskPicture->pSourcePict->type ==
1219		    SourcePictTypeSolidFill) {
1220			p->fillcolour =
1221			   pMaskPicture->pSourcePict->solidFill.color;
1222			xf86Msg(X_ERROR, "%s: solid mask %08x\n",
1223			    __func__, p->fillcolour);
1224		}
1225	}
1226	if (pMaskPicture != NULL) {
1227		p->mskoff = exaGetPixmapOffset(pMask);
1228		p->mskpitch = exaGetPixmapPitch(pMask);
1229		p->mskformat = pMaskPicture->format;
1230	} else {
1231		p->mskoff = 0;
1232		p->mskpitch = 0;
1233		p->mskformat = 0;
1234	}
1235	if (pSrc != NULL) {
1236		p->source_is_solid =
1237		   ((pSrc->drawable.width == 1) && (pSrc->drawable.height == 1));
1238		p->srcoff = exaGetPixmapOffset(pSrc);
1239		p->srcpitch = exaGetPixmapPitch(pSrc);
1240		if (p->source_is_solid) {
1241			p->fillcolour = *(uint32_t *)(p->fb + p->srcoff);
1242		}
1243	}
1244	p->srcformat = pSrcPicture->format;
1245	p->dstformat = pDstPicture->format;
1246
1247	if (p->source_is_solid) {
1248		uint32_t temp;
1249
1250		/* stuff source colour into SX registers, swap as needed */
1251		temp = p->fillcolour;
1252		switch (p->srcformat) {
1253			case PICT_a8r8g8b8:
1254			case PICT_x8r8g8b8:
1255				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
1256				temp = temp >> 8;
1257				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
1258				temp = temp >> 8;
1259				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
1260				break;
1261			case PICT_a8b8g8r8:
1262			case PICT_x8b8g8r8:
1263				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
1264				temp = temp >> 8;
1265				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
1266				temp = temp >> 8;
1267				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
1268				break;
1269		}
1270		write_sx_reg(p, SX_QUEUED(8), 0xff);
1271	}
1272	p->op = op;
1273	if (op == PictOpSrc) {
1274		CG14PrepareCopy(pSrc, pDst, 1, 1, GXcopy, 0xffffffff);
1275	}
1276#ifdef SX_DEBUG
1277	DPRINTF(X_ERROR, "%x %x -> %x\n", p->srcoff, p->mskoff,
1278	    *(uint32_t *)(p->fb + p->srcoff));
1279#endif
1280	return TRUE;
1281}
1282
1283void
1284CG14Composite(PixmapPtr pDst, int srcX, int srcY,
1285                              int maskX, int maskY,
1286                              int dstX, int dstY,
1287                              int width, int height)
1288{
1289	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
1290	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1291	uint32_t dstoff, dstpitch;
1292	uint32_t dst, msk, src;
1293	int flip = 0;
1294
1295	ENTER;
1296	dstoff = exaGetPixmapOffset(pDst);
1297	dstpitch = exaGetPixmapPitch(pDst);
1298
1299	flip = (PICT_FORMAT_TYPE(p->srcformat) !=
1300		PICT_FORMAT_TYPE(p->dstformat));
1301
1302	switch (p->op) {
1303		case PictOpOver:
1304			dst = dstoff + (dstY * dstpitch) + (dstX << 2);
1305			DPRINTF(X_ERROR, "Over %08x %08x, %d %d\n",
1306			    p->mskformat, p->dstformat, srcX, srcY);
1307			if (p->source_is_solid) {
1308				switch (p->mskformat) {
1309					case PICT_a8:
1310						msk = p->mskoff +
1311						    (maskY * p->mskpitch) +
1312						    maskX;
1313						CG14Comp_Over8Solid(p,
1314						    msk, p->mskpitch,
1315						    dst, dstpitch,
1316						    width, height);
1317						break;
1318					case PICT_a8r8g8b8:
1319					case PICT_a8b8g8r8:
1320						msk = p->mskoff +
1321						    (maskY * p->mskpitch) +
1322						    (maskX << 2);
1323						CG14Comp_Over32Solid(p,
1324						    msk, p->mskpitch,
1325						    dst, dstpitch,
1326						    width, height);
1327						break;
1328					default:
1329						xf86Msg(X_ERROR,
1330						  "unsupported mask format %08x\n", p->mskformat);
1331				}
1332			} else {
1333				DPRINTF(X_ERROR, "non-solid over with msk %x\n",
1334				    p->mskformat);
1335				switch (p->srcformat) {
1336					case PICT_a8r8g8b8:
1337					case PICT_a8b8g8r8:
1338						src = p->srcoff +
1339						    (srcY * p->srcpitch) +
1340						    (srcX << 2);
1341						dst = dstoff +
1342						    (dstY * dstpitch) +
1343						    (dstX << 2);
1344						if (p->mskformat == PICT_a8) {
1345							msk = p->mskoff +
1346							    (maskY * p->mskpitch) +
1347							    maskX;
1348							CG14Comp_Over32Mask(p,
1349							    src, p->srcpitch,
1350							    msk, p->mskpitch,
1351							    dst, dstpitch,
1352							    width, height, flip);
1353						} else {
1354							CG14Comp_Over32(p,
1355							    src, p->srcpitch,
1356							    dst, dstpitch,
1357							    width, height, flip);
1358						}
1359						break;
1360					case PICT_x8r8g8b8:
1361					case PICT_x8b8g8r8:
1362						src = p->srcoff +
1363						    (srcY * p->srcpitch) +
1364						    (srcX << 2);
1365						dst = dstoff +
1366						    (dstY * dstpitch) +
1367						    (dstX << 2);
1368						if (p->mskformat == PICT_a8) {
1369							msk = p->mskoff +
1370							    (maskY * p->mskpitch) +
1371							    maskX;
1372							CG14Comp_Over32Mask_noalpha(p,
1373							    src, p->srcpitch,
1374							    msk, p->mskpitch,
1375							    dst, dstpitch,
1376							    width, height, flip);
1377						} else if ((p->mskformat == PICT_a8r8g8b8) ||
1378							   (p->mskformat == PICT_a8b8g8r8)) {
1379							msk = p->mskoff +
1380							    (maskY * p->mskpitch) +
1381							    (maskX << 2);
1382							CG14Comp_Over32Mask32_noalpha(p,
1383							    src, p->srcpitch,
1384							    msk, p->mskpitch,
1385							    dst, dstpitch,
1386							    width, height, flip);
1387						} else {
1388							xf86Msg(X_ERROR, "no src alpha, mask is %x\n", p->mskformat);
1389						}
1390						break;
1391					default:
1392						xf86Msg(X_ERROR, "%s: format %x in non-solid Over op\n",
1393						    __func__, p->srcformat);
1394				}
1395			}
1396			break;
1397		case PictOpAdd:
1398			DPRINTF(X_ERROR, "Add %08x %08x\n",
1399			    p->srcformat, p->dstformat);
1400			switch (p->srcformat) {
1401				case PICT_a8:
1402					src = p->srcoff +
1403					    (srcY * p->srcpitch) + srcX;
1404					if (p->dstformat == PICT_a8) {
1405						dst = dstoff +
1406						      (dstY * dstpitch) + dstX;
1407						CG14Comp_Add8(p,
1408						    src, p->srcpitch,
1409						    dst, dstpitch,
1410						    width, height);
1411					} else {
1412						dst = dstoff +
1413						      (dstY * dstpitch) +
1414						      (dstX << 2);
1415						CG14Comp_Add8_32(p,
1416						    src, p->srcpitch,
1417						    dst, dstpitch,
1418						    width, height);
1419					}
1420					break;
1421				case PICT_a8r8g8b8:
1422				case PICT_x8r8g8b8:
1423					src = p->srcoff +
1424					    (srcY * p->srcpitch) + (srcX << 2);
1425					dst = dstoff + (dstY * dstpitch) +
1426					    (dstX << 2);
1427					CG14Comp_Add32(p, src, p->srcpitch,
1428					    dst, dstpitch, width, height);
1429					break;
1430				default:
1431					xf86Msg(X_ERROR,
1432					    "unsupported src format\n");
1433			}
1434			break;
1435		case PictOpSrc:
1436			DPRINTF(X_ERROR, "Src %08x %08x\n",
1437			    p->srcformat, p->dstformat);
1438			if (p->mskformat != 0)
1439				xf86Msg(X_ERROR, "Src mask %08x\n", p->mskformat);
1440			if (p->srcformat == PICT_a8) {
1441				CG14Copy8(pDst, srcX, srcY, dstX, dstY, width, height);
1442			} else {
1443				/* convert between RGB and BGR? */
1444				CG14Copy32(pDst, srcX, srcY, dstX, dstY, width, height);
1445			}
1446			break;
1447		default:
1448			xf86Msg(X_ERROR, "unsupported op %d\n", p->op);
1449	}
1450	exaMarkSync(pDst->drawable.pScreen);
1451}
1452
1453
1454
1455Bool
1456CG14InitAccel(ScreenPtr pScreen)
1457{
1458	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
1459	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1460	ExaDriverPtr pExa;
1461
1462	pExa = exaDriverAlloc();
1463	if (!pExa)
1464		return FALSE;
1465
1466	p->pExa = pExa;
1467
1468	pExa->exa_major = EXA_VERSION_MAJOR;
1469	pExa->exa_minor = EXA_VERSION_MINOR;
1470
1471	pExa->memoryBase = p->fb;
1472	pExa->memorySize = p->memsize;
1473	pExa->offScreenBase = p->width * p->height * (pScrn->depth >> 3);
1474
1475	/*
1476	 * SX memory instructions are written to 64bit aligned addresses with
1477	 * a 3 bit displacement. Make sure the displacement remains constant
1478	 * within one column
1479	 */
1480
1481	pExa->pixmapOffsetAlign = 8;
1482	pExa->pixmapPitchAlign = 8;
1483
1484	pExa->flags = EXA_OFFSCREEN_PIXMAPS
1485		      | EXA_SUPPORTS_OFFSCREEN_OVERLAPS
1486		      /*| EXA_MIXED_PIXMAPS*/;
1487
1488	/*
1489	 * these limits are bogus
1490	 * SX doesn't deal with coordinates at all, so there is no limit but
1491	 * we have to put something here
1492	 */
1493	pExa->maxX = 4096;
1494	pExa->maxY = 4096;
1495
1496	pExa->WaitMarker = CG14WaitMarker;
1497
1498	pExa->PrepareSolid = CG14PrepareSolid;
1499	pExa->Solid = CG14Solid;
1500	pExa->DoneSolid = CG14DoneCopy;
1501	pExa->PrepareCopy = CG14PrepareCopy;
1502	pExa->Copy = CG14Copy32;
1503	pExa->DoneCopy = CG14DoneCopy;
1504	if (p->use_xrender) {
1505		pExa->CheckComposite = CG14CheckComposite;
1506		pExa->PrepareComposite = CG14PrepareComposite;
1507		pExa->Composite = CG14Composite;
1508		pExa->DoneComposite = CG14DoneCopy;
1509	}
1510
1511	/* EXA hits more optimized paths when it does not have to fallback
1512	 * because of missing UTS/DFS, hook memcpy-based UTS/DFS.
1513	 */
1514	pExa->UploadToScreen = CG14UploadToScreen;
1515	pExa->DownloadFromScreen = CG14DownloadFromScreen;
1516
1517	p->queuecount = 0;
1518	/* do some hardware init */
1519	write_sx_reg(p, SX_PLANEMASK, 0xffffffff);
1520	p->last_mask = 0xffffffff;
1521	write_sx_reg(p, SX_ROP_CONTROL, 0xcc);
1522	p->last_rop = 0xcc;
1523	return exaDriverInit(pScreen, pExa);
1524}
1525