cg14_accel.c revision c1537409
1/* $NetBSD: cg14_accel.c,v 1.26 2021/12/19 04:50:27 macallan Exp $ */
2/*
3 * Copyright (c) 2013 Michael Lorenz
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 *    - Redistributions of source code must retain the above copyright
11 *      notice, this list of conditions and the following disclaimer.
12 *    - Redistributions in binary form must reproduce the above
13 *      copyright notice, this list of conditions and the following
14 *      disclaimer in the documentation and/or other materials provided
15 *      with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 */
31
32#ifdef HAVE_CONFIG_H
33#include "config.h"
34#endif
35
36#include <sys/types.h>
37
38/* all driver need this */
39#include "xf86.h"
40#include "xf86_OSproc.h"
41#include "compiler.h"
42
43#include "cg14.h"
44
45//#define SX_DEBUG
46
47#ifdef SX_DEBUG
48#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
49#define DPRINTF xf86Msg
50#else
51#define ENTER
52#define DPRINTF while (0) xf86Msg
53#endif
54
55#define arraysize(ary)        (sizeof(ary) / sizeof(ary[0]))
56
57/* 0xcc is SX's GXcopy equivalent */
58uint32_t sx_rop[] = { 0x00, 0x88, 0x44, 0xcc, 0x22, 0xaa, 0x66, 0xee,
59		      0x11, 0x99, 0x55, 0xdd, 0x33, 0xbb, 0x77, 0xff};
60
61int src_formats[] = {PICT_a8r8g8b8, PICT_x8r8g8b8,
62		     PICT_a8b8g8r8, PICT_x8b8g8r8, PICT_a8};
63int tex_formats[] = {PICT_a8r8g8b8, PICT_a8b8g8r8, PICT_a8};
64
65static void CG14Copy32(PixmapPtr, int, int, int, int, int, int);
66static void CG14Copy8(PixmapPtr, int, int, int, int, int, int);
67
68static inline void
69CG14Wait(Cg14Ptr p)
70{
71	int bail = 10000000;
72	/* we wait for the busy bit to clear */
73	while (((read_sx_reg(p, SX_CONTROL_STATUS) & SX_BZ) != 0) &&
74	       (bail > 0)) {
75		bail--;
76	};
77	if (bail == 0) {
78		xf86Msg(X_ERROR, "SX wait for idle timed out %08x %08x\n",
79		    read_sx_reg(p, SX_CONTROL_STATUS),
80		    read_sx_reg(p, SX_ERROR));
81	}
82}
83
84static void
85CG14WaitMarker(ScreenPtr pScreen, int Marker)
86{
87	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
88	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
89
90	CG14Wait(p);
91}
92
93static Bool
94CG14PrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap,
95		int xdir, int ydir, int alu, Pixel planemask)
96{
97	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
98	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
99
100	ENTER;
101	DPRINTF(X_ERROR, "%s bpp %d rop %x\n", __func__,
102	    pSrcPixmap->drawable.bitsPerPixel, alu);
103
104	if (planemask != p->last_mask) {
105		CG14Wait(p);
106		write_sx_reg(p, SX_PLANEMASK, planemask);
107		p->last_mask = planemask;
108	}
109	alu = sx_rop[alu];
110	if (alu != p->last_rop) {
111		CG14Wait(p);
112		write_sx_reg(p, SX_ROP_CONTROL, alu);
113		p->last_rop = alu;
114	}
115	switch (pSrcPixmap->drawable.bitsPerPixel)  {
116		case 8:
117			p->pExa->Copy = CG14Copy8;
118			break;
119		case 32:
120			p->pExa->Copy = CG14Copy32;
121			break;
122		default:
123			xf86Msg(X_ERROR, "%s depth %d\n", __func__,
124			    pSrcPixmap->drawable.bitsPerPixel);
125	}
126	p->srcpitch = exaGetPixmapPitch(pSrcPixmap);
127	p->srcoff = exaGetPixmapOffset(pSrcPixmap);
128	p->xdir = xdir;
129	p->ydir = ydir;
130	return TRUE;
131}
132
133static void
134CG14Copy32(PixmapPtr pDstPixmap,
135         int srcX, int srcY, int dstX, int dstY, int w, int h)
136{
137	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
138	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
139	int dstpitch, dstoff, srcpitch, srcoff;
140	int srcstart, dststart, xinc, srcinc, dstinc;
141	int line, count, s, d, num;
142
143	ENTER;
144	dstpitch = exaGetPixmapPitch(pDstPixmap);
145	dstoff = exaGetPixmapOffset(pDstPixmap);
146	srcpitch = p->srcpitch;
147	srcoff = p->srcoff;
148	/*
149	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
150	 * actually wrote anything and only sync if it did
151	 */
152	srcstart = (srcX << 2) + (srcpitch * srcY) + srcoff;
153	dststart = (dstX << 2) + (dstpitch * dstY) + dstoff;
154
155	/*
156	 * we always copy up to 32 pixels at a time so direction doesn't
157	 * matter if w<=32
158	 */
159	if (w > 32) {
160		if (p->xdir < 0) {
161			srcstart += (w - 32) << 2;
162			dststart += (w - 32) << 2;
163			xinc = -128;
164		} else
165			xinc = 128;
166	} else
167		xinc = 128;
168	if (p->ydir < 0) {
169		srcstart += (h - 1) * srcpitch;
170		dststart += (h - 1) * dstpitch;
171		srcinc = -srcpitch;
172		dstinc = -dstpitch;
173	} else {
174		srcinc = srcpitch;
175		dstinc = dstpitch;
176	}
177	if (p->last_rop == 0xcc) {
178		/* plain old copy */
179		if ( xinc > 0) {
180			/* going left to right */
181			for (line = 0; line < h; line++) {
182				count = 0;
183				s = srcstart;
184				d = dststart;
185				while ( count < w) {
186					num = min(32, w - count);
187					write_sx_io(p, s,
188					    SX_LD(10, num - 1, s & 7));
189					write_sx_io(p, d,
190					    SX_STM(10, num - 1, d & 7));
191					s += xinc;
192					d += xinc;
193					count += 32;
194				}
195				srcstart += srcinc;
196				dststart += dstinc;
197			}
198		} else {
199			/* going right to left */
200			int i, chunks = (w >> 5);
201			for (line = 0; line < h; line++) {
202				s = srcstart;
203				d = dststart;
204				count = w;
205				for (i = 0; i < chunks; i++) {
206					write_sx_io(p, s,
207					    SX_LD(10, 31, s & 7));
208					write_sx_io(p, d,
209					    SX_STM(10, 31, d & 7));
210					s -= 128;
211					d -= 128;
212					count -= 32;
213				}
214				/* leftovers, if any */
215				if (count > 0) {
216					s += (32 - count) << 2;
217					d += (32 - count) << 2;
218					write_sx_io(p, s,
219					    SX_LD(10, count - 1, s & 7));
220					write_sx_io(p, d,
221					    SX_STM(10, count - 1, d & 7));
222				}
223				srcstart += srcinc;
224				dststart += dstinc;
225			}
226		}
227	} else {
228		/* ROPs needed */
229		if ( xinc > 0) {
230			/* going left to right */
231			for (line = 0; line < h; line++) {
232				count = 0;
233				s = srcstart;
234				d = dststart;
235				while ( count < w) {
236					num = min(32, w - count);
237					write_sx_io(p, s,
238					    SX_LD(10, num - 1, s & 7));
239					write_sx_io(p, d,
240					    SX_LD(42, num - 1, d & 7));
241					if (num > 16) {
242						write_sx_reg(p, SX_INSTRUCTIONS,
243					    	 SX_ROP(10, 42, 74, 15));
244						write_sx_reg(p, SX_INSTRUCTIONS,
245					    	 SX_ROP(26, 58, 90, num - 17));
246					} else {
247						write_sx_reg(p, SX_INSTRUCTIONS,
248					    	 SX_ROP(10, 42, 74, num - 1));
249					}
250					write_sx_io(p, d,
251					    SX_STM(74, num - 1, d & 7));
252					s += xinc;
253					d += xinc;
254					count += 32;
255				}
256				srcstart += srcinc;
257				dststart += dstinc;
258			}
259		} else {
260			/* going right to left */
261			int i, chunks = (w >> 5);
262			for (line = 0; line < h; line++) {
263				s = srcstart;
264				d = dststart;
265				count = w;
266				for (i = 0; i < chunks; i++) {
267					write_sx_io(p, s, SX_LD(10, 31, s & 7));
268					write_sx_io(p, d, SX_LD(42, 31, d & 7));
269					write_sx_reg(p, SX_INSTRUCTIONS,
270				    	    SX_ROP(10, 42, 74, 15));
271					write_sx_reg(p, SX_INSTRUCTIONS,
272				    	    SX_ROP(26, 58, 90, 15));
273					write_sx_io(p, d,
274					    SX_STM(74, 31, d & 7));
275					s -= 128;
276					d -= 128;
277					count -= 32;
278				}
279				/* leftovers, if any */
280				if (count > 0) {
281					s += (32 - count) << 2;
282					d += (32 - count) << 2;
283					write_sx_io(p, s,
284					    SX_LD(10, count - 1, s & 7));
285					write_sx_io(p, d,
286					    SX_LD(42, count - 1, d & 7));
287					if (count > 16) {
288						write_sx_reg(p, SX_INSTRUCTIONS,
289					    	    SX_ROP(10, 42, 74, 15));
290						write_sx_reg(p, SX_INSTRUCTIONS,
291					    	 SX_ROP(26, 58, 90, count - 17));
292					} else {
293						write_sx_reg(p, SX_INSTRUCTIONS,
294					    	 SX_ROP(10, 42, 74, count - 1));
295					}
296
297					write_sx_io(p, d,
298					    SX_STM(74, count - 1, d & 7));
299				}
300				srcstart += srcinc;
301				dststart += dstinc;
302			}
303		}
304	}
305	exaMarkSync(pDstPixmap->drawable.pScreen);
306}
307
308/*
309 * copy with same alignment, left to right, no ROP
310 */
311static void
312CG14Copy8_aligned_norop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
313{
314	int saddr, daddr, pre, cnt, wrds;
315
316	ENTER;
317
318	pre = srcstart & 3;
319	if (pre != 0) pre = 4 - pre;
320	pre = min(pre, w);
321
322	while (h > 0) {
323		saddr = srcstart;
324		daddr = dststart;
325		cnt = w;
326		if (pre > 0) {
327			write_sx_io(p, saddr & ~7, SX_LDB(8, pre - 1, saddr & 7));
328			write_sx_io(p, daddr & ~7, SX_STB(8, pre - 1, daddr & 7));
329			saddr += pre;
330			daddr += pre;
331			cnt -= pre;
332			if (cnt == 0) goto next;
333		}
334		while (cnt > 3) {
335			wrds = min(32, cnt >> 2);
336			write_sx_io(p, saddr & ~7, SX_LD(8, wrds - 1, saddr & 7));
337			write_sx_io(p, daddr & ~7, SX_ST(8, wrds - 1, daddr & 7));
338			saddr += wrds << 2;
339			daddr += wrds << 2;
340			cnt -= wrds << 2;
341		}
342		if (cnt > 0) {
343			write_sx_io(p, saddr & ~7, SX_LDB(8, cnt - 1, saddr & 7));
344			write_sx_io(p, daddr & ~7, SX_STB(8, cnt - 1, daddr & 7));
345		}
346next:
347		srcstart += srcpitch;
348		dststart += dstpitch;
349		h--;
350	}
351}
352
353/*
354 * copy with same alignment, left to right, ROP
355 */
356static void
357CG14Copy8_aligned_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
358{
359	int saddr, daddr, pre, cnt, wrds;
360
361	ENTER;
362
363	pre = srcstart & 3;
364	if (pre != 0) pre = 4 - pre;
365	pre = min(pre, w);
366
367	while (h > 0) {
368		saddr = srcstart;
369		daddr = dststart;
370		cnt = w;
371		if (pre > 0) {
372			write_sx_io(p, saddr & ~7, SX_LDB(8, pre - 1, saddr & 7));
373			write_sx_io(p, daddr & ~7, SX_LDB(40, pre - 1, daddr & 7));
374			write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 40, 72, pre - 1));
375			write_sx_io(p, daddr & ~7, SX_STB(72, pre - 1, daddr & 7));
376			saddr += pre;
377			daddr += pre;
378			cnt -= pre;
379			if (cnt == 0) goto next;
380		}
381		while (cnt > 3) {
382			wrds = min(32, cnt >> 2);
383			write_sx_io(p, saddr & ~7, SX_LD(8, wrds - 1, saddr & 7));
384			write_sx_io(p, daddr & ~7, SX_LD(40, wrds - 1, daddr & 7));
385			if (cnt > 16) {
386				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 40, 72, 15));
387				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 56, 88, wrds - 17));
388			} else
389				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 40, 72, wrds - 1));
390			write_sx_io(p, daddr & ~7, SX_ST(72, wrds - 1, daddr & 7));
391			saddr += wrds << 2;
392			daddr += wrds << 2;
393			cnt -= wrds << 2;
394		}
395		if (cnt > 0) {
396			write_sx_io(p, saddr & ~7, SX_LDB(8, cnt - 1, saddr & 7));
397			write_sx_io(p, daddr & ~7, SX_LDB(40, cnt - 1, daddr & 7));
398			write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 40, 72, cnt - 1));
399			write_sx_io(p, daddr & ~7, SX_STB(72, cnt - 1, daddr & 7));
400		}
401next:
402		srcstart += srcpitch;
403		dststart += dstpitch;
404		h--;
405	}
406}
407
408/* up to 124 pixels so direction doesn't matter, unaligned, ROP */
409static void
410CG14Copy8_short_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
411{
412	int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
413	int ssreg;
414#ifdef DEBUG
415	int taddr = 4 + dstpitch * 50;
416#endif
417	uint32_t lmask, rmask;
418	ENTER;
419
420	pre = dststart & 3;
421	lmask = 0xffffffff >> pre;
422	spre = srcstart & 3;
423	/*
424	 * make sure we count all the words needed to cover the destination
425	 * line, covering potential partials on both ends
426	 */
427	wrds = (w + pre + 3) >> 2;
428	swrds = (w + spre + 3) >> 2;
429
430	if (spre < pre) {
431		dist = 32 - (pre - spre) * 8;
432		sreg = 9;
433	} else {
434		dist = (spre - pre) * 8;
435		sreg = 8;
436	}
437
438	/*
439	 * mask out trailing pixels to avoid partial writes
440	 */
441	post = (dststart + w) & 3;
442	if (post != 0) {
443		rmask = ~(0xffffffff >> (post * 8));
444		write_sx_reg(p, SX_QUEUED(7), rmask);
445		write_sx_reg(p, SX_QUEUED(6), ~rmask);
446	}
447
448	DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
449	    w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);
450
451	/* mask out the leading pixels in dst by using a mask and ROP */
452	if (pre != 0) {
453		CG14Wait(p);
454		write_sx_reg(p, SX_ROP_CONTROL, (p->last_rop & 0xf0) | 0xa);
455		write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
456	}
457
458	saddr = srcstart & ~3;
459	daddr = dststart & ~3;
460
461	while (h > 0) {
462		write_sx_io(p, daddr & ~7, SX_LD(80, wrds - 1, daddr & 7));
463		write_sx_io(p, saddr & ~7, SX_LD(sreg, swrds - 1, saddr & 7));
464		if (wrds > 15) {
465			if (dist != 0) {
466				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, 15));
467				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(24, dist, 56, wrds - 16));
468				/* shifted source pixels are now at register 40+ */
469				ssreg = 40;
470			} else ssreg = 8;
471			if (pre != 0) {
472				/* mask out leading junk */
473				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
474				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, 8, 0));
475				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
476				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg + 1, 81, 9, 14));
477			} else {
478				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, 8, 15));
479			}
480			write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg + 16, 96, 24, wrds - 16));
481		} else {
482			if (dist != 0) {
483				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, wrds));
484				ssreg = 40;
485			} else ssreg = 8;
486			if (pre != 0) {
487				/* mask out leading junk */
488				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
489				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, 8, 0));
490				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
491				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg + 1, 81, 9, wrds));
492			} else {
493				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, 8, wrds));
494			}
495		}
496		if (post != 0) {
497			/*
498			 * if the last word to be written out is a partial we
499			 * mask out the leftovers and replace them with
500			 * background pixels
501			 * we could pull the same ROP * mask trick as we do on
502			 * the left end but it's less annoying this way and
503			 * the instruction count is the same
504			 */
505			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(7 + wrds, 7, 5, 0));
506			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(79 + wrds, 6, 4, 0));
507			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(5, 4, 7 + wrds, 0));
508		}
509#ifdef DEBUG
510		write_sx_io(p, taddr & ~7, SX_ST(40, wrds - 1, taddr & 7));
511		taddr += dstpitch;
512#endif
513		write_sx_io(p, daddr & ~7, SX_ST(8, wrds - 1, daddr & 7));
514		saddr += srcpitch;
515		daddr += dstpitch;
516		h--;
517	}
518}
519
520/* up to 124 pixels so direction doesn't matter, unaligned, straight copy */
521static void
522CG14Copy8_short_norop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
523{
524	int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
525	int ssreg;
526#ifdef DEBUG
527	int taddr = 4 + dstpitch * 50;
528#endif
529	uint32_t lmask, rmask;
530	ENTER;
531
532	pre = dststart & 3;
533	lmask = 0xffffffff >> pre;
534	spre = srcstart & 3;
535	/*
536	 * make sure we count all the words needed to cover the destination
537	 * line, covering potential partials on both ends
538	 */
539	wrds = (w + pre + 3) >> 2;
540	swrds = (w + spre + 3) >> 2;
541
542	if (spre < pre) {
543		dist = 32 - (pre - spre) * 8;
544		sreg = 9;
545	} else {
546		dist = (spre - pre) * 8;
547		sreg = 8;
548	}
549
550	/*
551	 * mask out trailing pixels to avoid partial writes
552	 */
553	post = (dststart + w) & 3;
554	if (post != 0) {
555		rmask = ~(0xffffffff >> (post * 8));
556		write_sx_reg(p, SX_QUEUED(7), rmask);
557		write_sx_reg(p, SX_QUEUED(6), ~rmask);
558	}
559
560	DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
561	    w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);
562
563	/* mask out the leading pixels in dst by using a mask and ROP */
564	if (pre != 0) {
565		CG14Wait(p);
566		write_sx_reg(p, SX_ROP_CONTROL, 0xca);
567		write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
568	}
569
570	saddr = srcstart & ~3;
571	daddr = dststart & ~3;
572
573	while (h > 0) {
574		write_sx_io(p, saddr & ~7, SX_LD(sreg, swrds - 1, saddr & 7));
575		if (wrds > 15) {
576			if (dist != 0) {
577				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, 15));
578				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(24, dist, 56, wrds - 16));
579				/* shifted source pixels are now at register 40+ */
580				ssreg = 40;
581			} else ssreg = 8;
582			if (pre != 0) {
583				/* read only the first word */
584				write_sx_io(p, daddr & ~7, SX_LD(80, 0, daddr & 7));
585				/* mask out leading junk */
586				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, ssreg, 0));
587			}
588		} else {
589			if (dist != 0) {
590				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, wrds));
591				ssreg = 40;
592			} else ssreg = 8;
593			if (pre != 0) {
594				/* read only the first word */
595				write_sx_io(p, daddr & ~7, SX_LD(80, 0, daddr & 7));
596				/* mask out leading junk */
597				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, ssreg, 0));
598			}
599		}
600		if (post != 0) {
601			int laddr = daddr + ((wrds - 1) << 2);
602			/*
603			 * if the last word to be written out is a partial we
604			 * mask out the leftovers and replace them with
605			 * background pixels
606			 * we could pull the same ROP * mask trick as we do on
607			 * the left end but it's less annoying this way and
608			 * the instruction count is the same
609			 */
610			write_sx_io(p, laddr & ~7, SX_LD(81, 0, laddr & 7));
611			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(ssreg + wrds - 1, 7, 5, 0));
612			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(81, 6, 4, 0));
613			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(5, 4, ssreg + wrds - 1, 0));
614		}
615#ifdef DEBUG
616		write_sx_io(p, taddr & ~7, SX_ST(40, wrds - 1, taddr & 7));
617		taddr += dstpitch;
618#endif
619		write_sx_io(p, daddr & ~7, SX_ST(ssreg, wrds - 1, daddr & 7));
620		saddr += srcpitch;
621		daddr += dstpitch;
622		h--;
623	}
624}
625
626static void
627CG14Copy8(PixmapPtr pDstPixmap,
628         int srcX, int srcY, int dstX, int dstY, int w, int h)
629{
630	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
631	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
632	int dstpitch, dstoff, srcpitch, srcoff;
633	int srcstart, dststart, xinc, srcinc, dstinc;
634	int line, count, s, d, num;
635
636	ENTER;
637	dstpitch = exaGetPixmapPitch(pDstPixmap);
638	dstoff = exaGetPixmapOffset(pDstPixmap);
639	srcpitch = p->srcpitch;
640	srcoff = p->srcoff;
641	/*
642	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
643	 * actually wrote anything and only sync if it did
644	 */
645	srcstart = srcX + (srcpitch * srcY) + srcoff;
646	dststart = dstX + (dstpitch * dstY) + dstoff;
647
648	if (p->ydir < 0) {
649		srcstart += (h - 1) * srcpitch;
650		dststart += (h - 1) * dstpitch;
651		srcinc = -srcpitch;
652		dstinc = -dstpitch;
653	} else {
654		srcinc = srcpitch;
655		dstinc = dstpitch;
656	}
657
658	/*
659	 * this copies up to 124 pixels wide in one go, so horizontal
660	 * direction / overlap don't matter
661	 * uses all 32bit accesses and funnel shifter for unaligned copies
662	 */
663	if ((w < 125) && (w > 8)) {
664		switch (p->last_rop) {
665			case 0xcc:
666				CG14Copy8_short_norop(p, srcstart, dststart, w, h, srcinc, dstinc);
667				break;
668			default:
669				CG14Copy8_short_rop(p, srcstart, dststart, w, h, srcinc, dstinc);
670		}
671		return;
672	}
673
674	/*
675	 * only invert x direction if absolutely necessary, it's a pain to
676	 * go backwards on SX so avoid as much as possible
677	 */
678	if ((p->xdir < 0) && (srcoff == dstoff) && (srcY == dstY)) {
679		xinc = -32;
680	} else
681		xinc = 32;
682
683	/*
684	 * for aligned copies we can go all 32bit and avoid VRAM reads in the
685	 * most common case
686	 */
687	if (((srcstart & 3) == (dststart & 3)) && (xinc > 0)) {
688		switch (p->last_rop) {
689			case 0xcc:
690				CG14Copy8_aligned_norop(p, srcstart, dststart, w, h, srcinc, dstinc);
691				break;
692			default:
693				CG14Copy8_aligned_rop(p, srcstart, dststart, w, h, srcinc, dstinc);
694		}
695		return;
696	}
697
698	/*
699	 * if we make it here we either have something large and unaligned,
700	 * something we need to do right to left, or something tiny.
701	 * we handle the non-tiny cases by breaking them down into chunks that
702	 * Copy8_short_*() can handle, making sure the destinations are 32bit
703	 * aligned whenever possible
704	 * since we copy by block, not by line we need to go backwards even if
705	 * we don't copy within the same line
706	 */
707	if (w > 8) {
708		int next, wi, end = dststart + w;
709		DPRINTF(X_ERROR, "%s %08x %08x %d\n", __func__, srcstart, dststart, w);
710		if ((p->xdir < 0) && (srcoff == dstoff)) {
711			srcstart += w;
712			next = max((end - 120) & ~3, dststart);
713			wi = end - next;
714			srcstart -= wi;
715			while (wi > 0) {
716				DPRINTF(X_ERROR, "%s RL %08x %08x %d\n", __func__, srcstart, next, wi);
717				if (p->last_rop == 0xcc) {
718					CG14Copy8_short_norop(p, srcstart, next, wi, h, srcinc, dstinc);
719				} else
720					CG14Copy8_short_rop(p, srcstart, next, wi, h, srcinc, dstinc);
721				end = next;
722				/*
723				 * avoid extremely narrow copies so I don't
724				 * have to deal with dangling start and end
725				 * pixels in the same word
726				 */
727				if ((end - dststart) < 140) {
728					next = max((end - 80) & ~3, dststart);
729				} else {
730					next = max((end - 120) & ~3, dststart);
731				}
732				wi = end - next;
733				srcstart -= wi;
734			}
735		} else {
736			next = min(end, (dststart + 124) & ~3);
737			wi = next - dststart;
738			while (wi > 0) {
739				DPRINTF(X_ERROR, "%s LR %08x %08x %d\n", __func__, srcstart, next, wi);
740				if (p->last_rop == 0xcc) {
741					CG14Copy8_short_norop(p, srcstart, dststart, wi, h, srcinc, dstinc);
742				} else
743					CG14Copy8_short_rop(p, srcstart, dststart, wi, h, srcinc, dstinc);
744				srcstart += wi;
745				dststart = next;
746				if ((end - dststart) < 140) {
747					next = min(end, (dststart + 84) & ~3);
748				} else {
749					next = min(end, (dststart + 124) & ~3);
750				}
751				wi = next - dststart;
752			}
753		}
754		return;
755	}
756	if (xinc < 0) {
757		srcstart += (w - 32);
758		dststart += (w - 32);
759	}
760
761	DPRINTF(X_ERROR, "%s fallback to byte-wise %d %d\n", __func__, w, h);
762	if (p->last_rop == 0xcc) {
763		/* plain old copy */
764		if ( xinc > 0) {
765			/* going left to right */
766			for (line = 0; line < h; line++) {
767				count = 0;
768				s = srcstart;
769				d = dststart;
770				while ( count < w) {
771					num = min(32, w - count);
772					write_sx_io(p, s,
773					    SX_LDB(10, num - 1, s & 7));
774					write_sx_io(p, d,
775					    SX_STBM(10, num - 1, d & 7));
776					s += xinc;
777					d += xinc;
778					count += 32;
779				}
780				srcstart += srcinc;
781				dststart += dstinc;
782			}
783		} else {
784			/* going right to left */
785			int i, chunks = (w >> 5);
786			for (line = 0; line < h; line++) {
787				s = srcstart;
788				d = dststart;
789				count = w;
790				for (i = 0; i < chunks; i++) {
791					write_sx_io(p, s,
792					    SX_LDB(10, 31, s & 7));
793					write_sx_io(p, d,
794					    SX_STBM(10, 31, d & 7));
795					s -= 32;
796					d -= 32;
797					count -= 32;
798				}
799				/* leftovers, if any */
800				if (count > 0) {
801					s += (32 - count);
802					d += (32 - count);
803					write_sx_io(p, s,
804					    SX_LDB(10, count - 1, s & 7));
805					write_sx_io(p, d,
806					    SX_STBM(10, count - 1, d & 7));
807				}
808				srcstart += srcinc;
809				dststart += dstinc;
810			}
811		}
812	} else {
813		/* ROPs needed */
814		if ( xinc > 0) {
815			/* going left to right */
816			for (line = 0; line < h; line++) {
817				count = 0;
818				s = srcstart;
819				d = dststart;
820				while ( count < w) {
821					num = min(32, w - count);
822					write_sx_io(p, s,
823					    SX_LDB(10, num - 1, s & 7));
824					write_sx_io(p, d,
825					    SX_LDB(42, num - 1, d & 7));
826					if (num > 16) {
827						write_sx_reg(p, SX_INSTRUCTIONS,
828					    	 SX_ROP(10, 42, 74, 15));
829						write_sx_reg(p, SX_INSTRUCTIONS,
830					    	 SX_ROP(26, 58, 90, num - 17));
831					} else {
832						write_sx_reg(p, SX_INSTRUCTIONS,
833					    	 SX_ROP(10, 42, 74, num - 1));
834					}
835					write_sx_io(p, d,
836					    SX_STBM(74, num - 1, d & 7));
837					s += xinc;
838					d += xinc;
839					count += 32;
840				}
841				srcstart += srcinc;
842				dststart += dstinc;
843			}
844		} else {
845			/* going right to left */
846			int i, chunks = (w >> 5);
847			for (line = 0; line < h; line++) {
848				s = srcstart;
849				d = dststart;
850				count = w;
851				for (i = 0; i < chunks; i++) {
852					write_sx_io(p, s, SX_LDB(10, 31, s & 7));
853					write_sx_io(p, d, SX_LDB(42, 31, d & 7));
854					write_sx_reg(p, SX_INSTRUCTIONS,
855				    	    SX_ROP(10, 42, 74, 15));
856					write_sx_reg(p, SX_INSTRUCTIONS,
857				    	    SX_ROP(26, 58, 90, 15));
858					write_sx_io(p, d,
859					    SX_STBM(74, 31, d & 7));
860					s -= 128;
861					d -= 128;
862					count -= 32;
863				}
864				/* leftovers, if any */
865				if (count > 0) {
866					s += (32 - count);
867					d += (32 - count);
868					write_sx_io(p, s,
869					    SX_LDB(10, count - 1, s & 7));
870					write_sx_io(p, d,
871					    SX_LDB(42, count - 1, d & 7));
872					if (count > 16) {
873						write_sx_reg(p, SX_INSTRUCTIONS,
874					    	    SX_ROP(10, 42, 74, 15));
875						write_sx_reg(p, SX_INSTRUCTIONS,
876					    	 SX_ROP(26, 58, 90, count - 17));
877					} else {
878						write_sx_reg(p, SX_INSTRUCTIONS,
879					    	 SX_ROP(10, 42, 74, count - 1));
880					}
881
882					write_sx_io(p, d,
883					    SX_STBM(74, count - 1, d & 7));
884				}
885				srcstart += srcinc;
886				dststart += dstinc;
887			}
888		}
889	}
890	exaMarkSync(pDstPixmap->drawable.pScreen);
891}
892
893static void
894CG14DoneCopy(PixmapPtr pDstPixmap)
895{
896}
897
898static Bool
899CG14PrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
900{
901	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
902	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
903
904	ENTER;
905	DPRINTF(X_ERROR, "bits per pixel: %d %08lx\n",
906	    pPixmap->drawable.bitsPerPixel, fg);
907
908	/*
909	 * GXset and GXclear are really just specual cases of GXcopy with
910	 * fixed fill colour
911	 */
912	switch (alu) {
913		case GXclear:
914			alu = GXcopy;
915			fg = 0;
916			break;
917		case GXset:
918			alu = GXcopy;
919			fg = 0xffffffff;
920			break;
921	}
922	/* repeat the colour in every sub byte if we're in 8 bit */
923	if (pPixmap->drawable.bitsPerPixel == 8) {
924		fg |= fg << 8;
925		fg |= fg << 16;
926	}
927	write_sx_reg(p, SX_QUEUED(8), fg);
928	write_sx_reg(p, SX_QUEUED(9), fg);
929	if (planemask != p->last_mask) {
930		CG14Wait(p);
931		write_sx_reg(p, SX_PLANEMASK, planemask);
932		p->last_mask = planemask;
933	}
934	alu = sx_rop[alu];
935	if (alu != p->last_rop) {
936		CG14Wait(p);
937		write_sx_reg(p, SX_ROP_CONTROL, alu);
938		p->last_rop = alu;
939	}
940
941	DPRINTF(X_ERROR, "%s: %x\n", __func__, alu);
942	return TRUE;
943}
944
945static void
946CG14Solid32(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
947{
948	int line, x, num;
949	uint32_t ptr;
950
951	ENTER;
952	if (p->last_rop == 0xcc) {
953		/* simple fill */
954		for (line = 0; line < h; line++) {
955			x = 0;
956			while (x < w) {
957				ptr = start + (x << 2);
958				num = min(32, w - x);
959				write_sx_io(p, ptr,
960				    SX_STS(8, num - 1, ptr & 7));
961				x += 32;
962			}
963			start += pitch;
964		}
965	} else if (p->last_rop == 0xaa) {
966		/* nothing to do here */
967		return;
968	} else {
969		/* alright, let's do actual ROP stuff */
970
971		/* first repeat the fill colour into 16 registers */
972		write_sx_reg(p, SX_INSTRUCTIONS,
973		    SX_SELECT_S(8, 8, 10, 15));
974
975		for (line = 0; line < h; line++) {
976			x = 0;
977			while (x < w) {
978				ptr = start + (x << 2);
979				num = min(32, w - x);
980				/* now suck fb data into registers */
981				write_sx_io(p, ptr,
982				    SX_LD(42, num - 1, ptr & 7));
983				/*
984				 * ROP them with the fill data we left in 10
985				 * non-memory ops can only have counts up to 16
986				 */
987				if (num <= 16) {
988					write_sx_reg(p, SX_INSTRUCTIONS,
989					    SX_ROP(10, 42, 74, num - 1));
990				} else {
991					write_sx_reg(p, SX_INSTRUCTIONS,
992					    SX_ROP(10, 42, 74, 15));
993					write_sx_reg(p, SX_INSTRUCTIONS,
994					    SX_ROP(10, 58, 90, num - 17));
995				}
996				/* and write the result back into memory */
997				write_sx_io(p, ptr,
998				    SX_ST(74, num - 1, ptr & 7));
999				x += 32;
1000			}
1001			start += pitch;
1002		}
1003	}
1004}
1005
1006static void
1007CG14Solid8(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
1008{
1009	int line, num, pre, cnt;
1010	uint32_t ptr;
1011
1012	ENTER;
1013	pre = start & 3;
1014	if (pre != 0) pre = 4 - pre;
1015
1016	if (p->last_rop == 0xcc) {
1017		/* simple fill */
1018		for (line = 0; line < h; line++) {
1019			ptr = start;
1020			cnt = w;
1021			pre = min(pre, cnt);
1022			if (pre) {
1023				write_sx_io(p, ptr & ~7, SX_STBS(8, pre - 1, ptr & 7));
1024				ptr += pre;
1025				cnt -= pre;
1026				if (cnt == 0) goto next;
1027			}
1028			/* now do the aligned pixels in 32bit chunks */
1029			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
1030			while(cnt > 3) {
1031				num = min(32, cnt >> 2);
1032				write_sx_io(p, ptr & ~7, SX_STS(8, num - 1, ptr & 7));
1033				ptr += num << 2;
1034				cnt -= num << 2;
1035			}
1036			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
1037			if (cnt > 0) {
1038				write_sx_io(p, ptr & ~7, SX_STBS(8, cnt - 1, ptr & 7));
1039			}
1040			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
1041next:
1042			start += pitch;
1043		}
1044	} else if (p->last_rop == 0xaa) {
1045		/* nothing to do here */
1046		return;
1047	} else {
1048		/* alright, let's do actual ROP stuff */
1049
1050		/* first repeat the fill colour into 16 registers */
1051		write_sx_reg(p, SX_INSTRUCTIONS,
1052		    SX_SELECT_S(8, 8, 10, 15));
1053
1054		for (line = 0; line < h; line++) {
1055			ptr = start;
1056			cnt = w;
1057			pre = min(pre, cnt);
1058			if (pre) {
1059				write_sx_io(p, ptr & ~7, SX_LDB(26, pre - 1, ptr & 7));
1060				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(10, 26, 42, pre - 1));
1061				write_sx_io(p, ptr & ~7, SX_STB(42, pre - 1, ptr & 7));
1062				ptr += pre;
1063				cnt -= pre;
1064				if (cnt == 0) goto next2;
1065			}
1066			/* now do the aligned pixels in 32bit chunks */
1067			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
1068			while(cnt > 3) {
1069				num = min(32, cnt >> 2);
1070				write_sx_io(p, ptr & ~7, SX_LD(26, num - 1, ptr & 7));
1071				if (num <= 16) {
1072					write_sx_reg(p, SX_INSTRUCTIONS,
1073					    SX_ROP(10, 26, 58, num - 1));
1074				} else {
1075					write_sx_reg(p, SX_INSTRUCTIONS,
1076					    SX_ROP(10, 26, 58, 15));
1077					write_sx_reg(p, SX_INSTRUCTIONS,
1078					    SX_ROP(10, 42, 74, num - 17));
1079				}
1080				write_sx_io(p, ptr & ~7, SX_ST(58, num - 1, ptr & 7));
1081				ptr += num << 2;
1082				cnt -= num << 2;
1083			}
1084			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
1085			if (cnt > 0) {
1086				write_sx_io(p, ptr & ~7, SX_LDB(26, cnt - 1, ptr & 7));
1087				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(10, 26, 42, cnt - 1));
1088				write_sx_io(p, ptr & ~7, SX_STB(42, cnt - 1, ptr & 7));
1089			}
1090			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
1091next2:
1092			start += pitch;
1093		}
1094	}
1095}
1096
1097static void
1098CG14Solid(PixmapPtr pPixmap, int x1, int y1, int x2, int y2)
1099{
1100	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
1101	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1102	int w = x2 - x1, h = y2 - y1, dstoff, dstpitch;
1103	int start, depth;
1104
1105	ENTER;
1106	dstpitch = exaGetPixmapPitch(pPixmap);
1107	dstoff = exaGetPixmapOffset(pPixmap);
1108
1109	depth = pPixmap->drawable.bitsPerPixel;
1110	switch (depth) {
1111		case 32:
1112			start = dstoff + (y1 * dstpitch) + (x1 << 2);
1113			CG14Solid32(p, start, dstpitch, w, h);
1114			break;
1115		case 8:
1116			start = dstoff + (y1 * dstpitch) + x1;
1117			CG14Solid8(p, start, dstpitch, w, h);
1118			break;
1119	}
1120
1121	DPRINTF(X_ERROR, "Solid %d %d %d %d, %d %d -> %d\n", x1, y1, x2, y2,
1122	    dstpitch, dstoff, start);
1123	DPRINTF(X_ERROR, "%x %x %x\n", p->last_rop,
1124	    read_sx_reg(p, SX_QUEUED(8)), read_sx_reg(p, SX_QUEUED(9)));
1125	exaMarkSync(pPixmap->drawable.pScreen);
1126}
1127
1128/*
1129 * Memcpy-based UTS.
1130 */
1131static Bool
1132CG14UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
1133    char *src, int src_pitch)
1134{
1135	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
1136	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1137	char  *dst        = p->fb + exaGetPixmapOffset(pDst);
1138	int    dst_pitch  = exaGetPixmapPitch(pDst);
1139
1140	int bpp    = pDst->drawable.bitsPerPixel;
1141	int cpp    = (bpp + 7) >> 3;
1142	int wBytes = w * cpp;
1143
1144	ENTER;
1145	DPRINTF(X_ERROR, "%s depth %d\n", __func__, bpp);
1146	dst += (x * cpp) + (y * dst_pitch);
1147
1148	CG14Wait(p);
1149
1150	while (h--) {
1151		memcpy(dst, src, wBytes);
1152		src += src_pitch;
1153		dst += dst_pitch;
1154	}
1155	__asm("stbar;");
1156	return TRUE;
1157}
1158
1159/*
1160 * Memcpy-based DFS.
1161 */
1162static Bool
1163CG14DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
1164    char *dst, int dst_pitch)
1165{
1166	ScrnInfoPtr pScrn = xf86Screens[pSrc->drawable.pScreen->myNum];
1167	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1168	char  *src        = p->fb + exaGetPixmapOffset(pSrc);
1169	int    src_pitch  = exaGetPixmapPitch(pSrc);
1170
1171	ENTER;
1172	int bpp    = pSrc->drawable.bitsPerPixel;
1173	int cpp    = (bpp + 7) >> 3;
1174	int wBytes = w * cpp;
1175
1176	src += (x * cpp) + (y * src_pitch);
1177
1178	CG14Wait(p);
1179
1180	while (h--) {
1181		memcpy(dst, src, wBytes);
1182		src += src_pitch;
1183		dst += dst_pitch;
1184	}
1185
1186	return TRUE;
1187}
1188
1189Bool
1190CG14CheckComposite(int op, PicturePtr pSrcPicture,
1191                           PicturePtr pMaskPicture,
1192                           PicturePtr pDstPicture)
1193{
1194	int i, ok = FALSE;
1195
1196	ENTER;
1197
1198	/*
1199	 * SX is in theory capable of accelerating pretty much all Xrender ops,
1200	 * even coordinate transformation and gradients. Support will be added
1201	 * over time and likely have to spill over into its own source file.
1202	 */
1203
1204	if ((op != PictOpOver) && (op != PictOpAdd) && (op != PictOpSrc)) {
1205		DPRINTF(X_ERROR, "%s: rejecting %d\n", __func__, op);
1206		return FALSE;
1207	}
1208
1209	if (pSrcPicture != NULL) {
1210		i = 0;
1211		while ((i < arraysize(src_formats)) && (!ok)) {
1212			ok =  (pSrcPicture->format == src_formats[i]);
1213			i++;
1214		}
1215
1216		if (!ok) {
1217			DPRINTF(X_ERROR, "%s: unsupported src format %x\n",
1218			    __func__, pSrcPicture->format);
1219			return FALSE;
1220		}
1221		DPRINTF(X_ERROR, "src is %x, %d\n", pSrcPicture->format, op);
1222	}
1223
1224	if (pDstPicture != NULL) {
1225		i = 0;
1226		ok = FALSE;
1227		while ((i < arraysize(src_formats)) && (!ok)) {
1228			ok =  (pDstPicture->format == src_formats[i]);
1229			i++;
1230		}
1231
1232		if (!ok) {
1233			DPRINTF(X_ERROR, "%s: unsupported dst format %x\n",
1234			    __func__, pDstPicture->format);
1235			return FALSE;
1236		}
1237		DPRINTF(X_ERROR, "dst is %x, %d\n", pDstPicture->format, op);
1238	}
1239
1240	if (pMaskPicture != NULL) {
1241		DPRINTF(X_ERROR, "mask is %x %d %d\n", pMaskPicture->format,
1242		    pMaskPicture->pDrawable->width,
1243		    pMaskPicture->pDrawable->height);
1244	}
1245	return TRUE;
1246}
1247
1248Bool
1249CG14PrepareComposite(int op, PicturePtr pSrcPicture,
1250                             PicturePtr pMaskPicture,
1251                             PicturePtr pDstPicture,
1252                             PixmapPtr  pSrc,
1253                             PixmapPtr  pMask,
1254                             PixmapPtr  pDst)
1255{
1256	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
1257	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1258
1259	ENTER;
1260
1261	p->no_source_pixmap = FALSE;
1262	p->source_is_solid = FALSE;
1263
1264	if (pSrcPicture->format == PICT_a1) {
1265		xf86Msg(X_ERROR, "src mono, dst %x, op %d\n",
1266		    pDstPicture->format, op);
1267		if (pMaskPicture != NULL) {
1268			xf86Msg(X_ERROR, "msk %x\n", pMaskPicture->format);
1269		}
1270	}
1271	if (pSrcPicture->pSourcePict != NULL) {
1272		if (pSrcPicture->pSourcePict->type == SourcePictTypeSolidFill) {
1273			p->fillcolour =
1274			    pSrcPicture->pSourcePict->solidFill.color;
1275			DPRINTF(X_ERROR, "%s: solid src %08x\n",
1276			    __func__, p->fillcolour);
1277			p->no_source_pixmap = TRUE;
1278			p->source_is_solid = TRUE;
1279		}
1280	}
1281	if ((pMaskPicture != NULL) && (pMaskPicture->pSourcePict != NULL)) {
1282		if (pMaskPicture->pSourcePict->type ==
1283		    SourcePictTypeSolidFill) {
1284			p->fillcolour =
1285			   pMaskPicture->pSourcePict->solidFill.color;
1286			xf86Msg(X_ERROR, "%s: solid mask %08x\n",
1287			    __func__, p->fillcolour);
1288		}
1289	}
1290	if (pMaskPicture != NULL) {
1291		p->mskoff = exaGetPixmapOffset(pMask);
1292		p->mskpitch = exaGetPixmapPitch(pMask);
1293		p->mskformat = pMaskPicture->format;
1294	} else {
1295		p->mskoff = 0;
1296		p->mskpitch = 0;
1297		p->mskformat = 0;
1298	}
1299	if (pSrc != NULL) {
1300		p->source_is_solid =
1301		   ((pSrc->drawable.width == 1) && (pSrc->drawable.height == 1));
1302		p->srcoff = exaGetPixmapOffset(pSrc);
1303		p->srcpitch = exaGetPixmapPitch(pSrc);
1304		if (p->source_is_solid) {
1305			p->fillcolour = *(uint32_t *)(p->fb + p->srcoff);
1306		}
1307	}
1308	p->srcformat = pSrcPicture->format;
1309	p->dstformat = pDstPicture->format;
1310
1311	if (p->source_is_solid) {
1312		uint32_t temp;
1313
1314		/* stuff source colour into SX registers, swap as needed */
1315		temp = p->fillcolour;
1316		switch (p->srcformat) {
1317			case PICT_a8r8g8b8:
1318			case PICT_x8r8g8b8:
1319				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
1320				temp = temp >> 8;
1321				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
1322				temp = temp >> 8;
1323				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
1324				break;
1325			case PICT_a8b8g8r8:
1326			case PICT_x8b8g8r8:
1327				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
1328				temp = temp >> 8;
1329				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
1330				temp = temp >> 8;
1331				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
1332				break;
1333		}
1334		write_sx_reg(p, SX_QUEUED(8), 0xff);
1335	}
1336	p->op = op;
1337	if (op == PictOpSrc) {
1338		CG14PrepareCopy(pSrc, pDst, 1, 1, GXcopy, 0xffffffff);
1339	}
1340#ifdef SX_DEBUG
1341	DPRINTF(X_ERROR, "%x %x -> %x\n", p->srcoff, p->mskoff,
1342	    *(uint32_t *)(p->fb + p->srcoff));
1343#endif
1344	return TRUE;
1345}
1346
1347void
1348CG14Composite(PixmapPtr pDst, int srcX, int srcY,
1349                              int maskX, int maskY,
1350                              int dstX, int dstY,
1351                              int width, int height)
1352{
1353	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
1354	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1355	uint32_t dstoff, dstpitch;
1356	uint32_t dst, msk, src;
1357	int flip = 0;
1358
1359	ENTER;
1360	dstoff = exaGetPixmapOffset(pDst);
1361	dstpitch = exaGetPixmapPitch(pDst);
1362
1363	flip = (PICT_FORMAT_TYPE(p->srcformat) !=
1364		PICT_FORMAT_TYPE(p->dstformat));
1365
1366	switch (p->op) {
1367		case PictOpOver:
1368			dst = dstoff + (dstY * dstpitch) + (dstX << 2);
1369			DPRINTF(X_ERROR, "Over %08x %08x, %d %d\n",
1370			    p->mskformat, p->dstformat, srcX, srcY);
1371			if (p->source_is_solid) {
1372				switch (p->mskformat) {
1373					case PICT_a8:
1374						msk = p->mskoff +
1375						    (maskY * p->mskpitch) +
1376						    maskX;
1377						CG14Comp_Over8Solid(p,
1378						    msk, p->mskpitch,
1379						    dst, dstpitch,
1380						    width, height);
1381						break;
1382					case PICT_a8r8g8b8:
1383					case PICT_a8b8g8r8:
1384						msk = p->mskoff +
1385						    (maskY * p->mskpitch) +
1386						    (maskX << 2);
1387						CG14Comp_Over32Solid(p,
1388						    msk, p->mskpitch,
1389						    dst, dstpitch,
1390						    width, height);
1391						break;
1392					default:
1393						xf86Msg(X_ERROR,
1394						  "unsupported mask format %08x\n", p->mskformat);
1395				}
1396			} else {
1397				DPRINTF(X_ERROR, "non-solid over with msk %x\n",
1398				    p->mskformat);
1399				switch (p->srcformat) {
1400					case PICT_a8r8g8b8:
1401					case PICT_a8b8g8r8:
1402						src = p->srcoff +
1403						    (srcY * p->srcpitch) +
1404						    (srcX << 2);
1405						dst = dstoff +
1406						    (dstY * dstpitch) +
1407						    (dstX << 2);
1408						if (p->mskformat == PICT_a8) {
1409							msk = p->mskoff +
1410							    (maskY * p->mskpitch) +
1411							    maskX;
1412							CG14Comp_Over32Mask(p,
1413							    src, p->srcpitch,
1414							    msk, p->mskpitch,
1415							    dst, dstpitch,
1416							    width, height, flip);
1417						} else {
1418							CG14Comp_Over32(p,
1419							    src, p->srcpitch,
1420							    dst, dstpitch,
1421							    width, height, flip);
1422						}
1423						break;
1424					case PICT_x8r8g8b8:
1425					case PICT_x8b8g8r8:
1426						src = p->srcoff +
1427						    (srcY * p->srcpitch) +
1428						    (srcX << 2);
1429						dst = dstoff +
1430						    (dstY * dstpitch) +
1431						    (dstX << 2);
1432						if (p->mskformat == PICT_a8) {
1433							msk = p->mskoff +
1434							    (maskY * p->mskpitch) +
1435							    maskX;
1436							CG14Comp_Over32Mask_noalpha(p,
1437							    src, p->srcpitch,
1438							    msk, p->mskpitch,
1439							    dst, dstpitch,
1440							    width, height, flip);
1441						} else if ((p->mskformat == PICT_a8r8g8b8) ||
1442							   (p->mskformat == PICT_a8b8g8r8)) {
1443							msk = p->mskoff +
1444							    (maskY * p->mskpitch) +
1445							    (maskX << 2);
1446							CG14Comp_Over32Mask32_noalpha(p,
1447							    src, p->srcpitch,
1448							    msk, p->mskpitch,
1449							    dst, dstpitch,
1450							    width, height, flip);
1451						} else {
1452							xf86Msg(X_ERROR, "no src alpha, mask is %x\n", p->mskformat);
1453						}
1454						break;
1455					default:
1456						xf86Msg(X_ERROR, "%s: format %x in non-solid Over op\n",
1457						    __func__, p->srcformat);
1458				}
1459			}
1460			break;
1461		case PictOpAdd:
1462			DPRINTF(X_ERROR, "Add %08x %08x\n",
1463			    p->srcformat, p->dstformat);
1464			switch (p->srcformat) {
1465				case PICT_a8:
1466					src = p->srcoff +
1467					    (srcY * p->srcpitch) + srcX;
1468					if (p->dstformat == PICT_a8) {
1469						dst = dstoff +
1470						      (dstY * dstpitch) + dstX;
1471						CG14Comp_Add8(p,
1472						    src, p->srcpitch,
1473						    dst, dstpitch,
1474						    width, height);
1475					} else {
1476						dst = dstoff +
1477						      (dstY * dstpitch) +
1478						      (dstX << 2);
1479						CG14Comp_Add8_32(p,
1480						    src, p->srcpitch,
1481						    dst, dstpitch,
1482						    width, height);
1483					}
1484					break;
1485				case PICT_a8r8g8b8:
1486				case PICT_x8r8g8b8:
1487					src = p->srcoff +
1488					    (srcY * p->srcpitch) + (srcX << 2);
1489					dst = dstoff + (dstY * dstpitch) +
1490					    (dstX << 2);
1491					CG14Comp_Add32(p, src, p->srcpitch,
1492					    dst, dstpitch, width, height);
1493					break;
1494				default:
1495					xf86Msg(X_ERROR,
1496					    "unsupported src format\n");
1497			}
1498			break;
1499		case PictOpSrc:
1500			DPRINTF(X_ERROR, "Src %08x %08x\n",
1501			    p->srcformat, p->dstformat);
1502			if (p->mskformat != 0)
1503				xf86Msg(X_ERROR, "Src mask %08x\n", p->mskformat);
1504			if (p->srcformat == PICT_a8) {
1505				CG14Copy8(pDst, srcX, srcY, dstX, dstY, width, height);
1506			} else {
1507				/* convert between RGB and BGR? */
1508				CG14Copy32(pDst, srcX, srcY, dstX, dstY, width, height);
1509			}
1510			break;
1511		default:
1512			xf86Msg(X_ERROR, "unsupported op %d\n", p->op);
1513	}
1514	exaMarkSync(pDst->drawable.pScreen);
1515}
1516
1517
1518
1519Bool
1520CG14InitAccel(ScreenPtr pScreen)
1521{
1522	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
1523	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1524	ExaDriverPtr pExa;
1525
1526	pExa = exaDriverAlloc();
1527	if (!pExa)
1528		return FALSE;
1529
1530	p->pExa = pExa;
1531
1532	pExa->exa_major = EXA_VERSION_MAJOR;
1533	pExa->exa_minor = EXA_VERSION_MINOR;
1534
1535	pExa->memoryBase = p->fb;
1536	pExa->memorySize = p->memsize;
1537	pExa->offScreenBase = p->width * p->height * (pScrn->depth >> 3);
1538
1539	/*
1540	 * SX memory instructions are written to 64bit aligned addresses with
1541	 * a 3 bit displacement. Make sure the displacement remains constant
1542	 * within one column
1543	 */
1544
1545	pExa->pixmapOffsetAlign = 8;
1546	pExa->pixmapPitchAlign = 8;
1547
1548	pExa->flags = EXA_OFFSCREEN_PIXMAPS
1549		      | EXA_SUPPORTS_OFFSCREEN_OVERLAPS
1550		      /*| EXA_MIXED_PIXMAPS*/;
1551
1552	/*
1553	 * these limits are bogus
1554	 * SX doesn't deal with coordinates at all, so there is no limit but
1555	 * we have to put something here
1556	 */
1557	pExa->maxX = 4096;
1558	pExa->maxY = 4096;
1559
1560	pExa->WaitMarker = CG14WaitMarker;
1561
1562	pExa->PrepareSolid = CG14PrepareSolid;
1563	pExa->Solid = CG14Solid;
1564	pExa->DoneSolid = CG14DoneCopy;
1565	pExa->PrepareCopy = CG14PrepareCopy;
1566	pExa->Copy = CG14Copy32;
1567	pExa->DoneCopy = CG14DoneCopy;
1568	if (p->use_xrender) {
1569		pExa->CheckComposite = CG14CheckComposite;
1570		pExa->PrepareComposite = CG14PrepareComposite;
1571		pExa->Composite = CG14Composite;
1572		pExa->DoneComposite = CG14DoneCopy;
1573	}
1574
1575	/* EXA hits more optimized paths when it does not have to fallback
1576	 * because of missing UTS/DFS, hook memcpy-based UTS/DFS.
1577	 */
1578	pExa->UploadToScreen = CG14UploadToScreen;
1579	pExa->DownloadFromScreen = CG14DownloadFromScreen;
1580
1581	p->queuecount = 0;
1582	/* do some hardware init */
1583	write_sx_reg(p, SX_PLANEMASK, 0xffffffff);
1584	p->last_mask = 0xffffffff;
1585	write_sx_reg(p, SX_ROP_CONTROL, 0xcc);
1586	p->last_rop = 0xcc;
1587	return exaDriverInit(pScreen, pExa);
1588}
1589