cg14_accel.c revision 72fd264f
1/* $NetBSD: cg14_accel.c,v 1.27 2021/12/24 04:41:40 macallan Exp $ */
2/*
3 * Copyright (c) 2013 Michael Lorenz
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 *    - Redistributions of source code must retain the above copyright
11 *      notice, this list of conditions and the following disclaimer.
12 *    - Redistributions in binary form must reproduce the above
13 *      copyright notice, this list of conditions and the following
14 *      disclaimer in the documentation and/or other materials provided
15 *      with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 */
31
32#ifdef HAVE_CONFIG_H
33#include "config.h"
34#endif
35
36#include <sys/types.h>
37
/* all drivers need this */
39#include "xf86.h"
40#include "xf86_OSproc.h"
41#include "compiler.h"
42
43#include "cg14.h"
44
45//#define SX_DEBUG
46
47#ifdef SX_DEBUG
48#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
49#define DPRINTF xf86Msg
50#else
51#define ENTER
52#define DPRINTF while (0) xf86Msg
53#endif
54
55#define arraysize(ary)        (sizeof(ary) / sizeof(ary[0]))
56
57/* 0xcc is SX's GXcopy equivalent */
58uint32_t sx_rop[] = { 0x00, 0x88, 0x44, 0xcc, 0x22, 0xaa, 0x66, 0xee,
59		      0x11, 0x99, 0x55, 0xdd, 0x33, 0xbb, 0x77, 0xff};
60
61int src_formats[] = {PICT_a8r8g8b8, PICT_x8r8g8b8,
62		     PICT_a8b8g8r8, PICT_x8b8g8r8, PICT_a8};
63int tex_formats[] = {PICT_a8r8g8b8, PICT_a8b8g8r8, PICT_a8};
64
65static void CG14Copy32(PixmapPtr, int, int, int, int, int, int);
66static void CG14Copy8(PixmapPtr, int, int, int, int, int, int);
67
68static inline void
69CG14Wait(Cg14Ptr p)
70{
71	int bail = 10000000;
72	/* we wait for the busy bit to clear */
73	while (((read_sx_reg(p, SX_CONTROL_STATUS) & SX_BZ) != 0) &&
74	       (bail > 0)) {
75		bail--;
76	};
77	if (bail == 0) {
78		xf86Msg(X_ERROR, "SX wait for idle timed out %08x %08x\n",
79		    read_sx_reg(p, SX_CONTROL_STATUS),
80		    read_sx_reg(p, SX_ERROR));
81	}
82}
83
84static void
85CG14WaitMarker(ScreenPtr pScreen, int Marker)
86{
87	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
88	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
89
90	CG14Wait(p);
91}
92
93static Bool
94CG14PrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap,
95		int xdir, int ydir, int alu, Pixel planemask)
96{
97	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
98	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
99
100	ENTER;
101	DPRINTF(X_ERROR, "%s bpp %d rop %x\n", __func__,
102	    pSrcPixmap->drawable.bitsPerPixel, alu);
103
104	if (planemask != p->last_mask) {
105		CG14Wait(p);
106		write_sx_reg(p, SX_PLANEMASK, planemask);
107		p->last_mask = planemask;
108	}
109	alu = sx_rop[alu];
110	if (alu != p->last_rop) {
111		CG14Wait(p);
112		write_sx_reg(p, SX_ROP_CONTROL, alu);
113		p->last_rop = alu;
114	}
115	switch (pSrcPixmap->drawable.bitsPerPixel)  {
116		case 8:
117			p->pExa->Copy = CG14Copy8;
118			break;
119		case 32:
120			p->pExa->Copy = CG14Copy32;
121			break;
122		default:
123			xf86Msg(X_ERROR, "%s depth %d\n", __func__,
124			    pSrcPixmap->drawable.bitsPerPixel);
125	}
126	p->srcpitch = exaGetPixmapPitch(pSrcPixmap);
127	p->srcoff = exaGetPixmapOffset(pSrcPixmap);
128	p->xdir = xdir;
129	p->ydir = ydir;
130	return TRUE;
131}
132
133static void
134CG14Copy32(PixmapPtr pDstPixmap,
135         int srcX, int srcY, int dstX, int dstY, int w, int h)
136{
137	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
138	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
139	int dstpitch, dstoff, srcpitch, srcoff;
140	int srcstart, dststart, xinc, srcinc, dstinc;
141	int line, count, s, d, num;
142
143	ENTER;
144	dstpitch = exaGetPixmapPitch(pDstPixmap);
145	dstoff = exaGetPixmapOffset(pDstPixmap);
146	srcpitch = p->srcpitch;
147	srcoff = p->srcoff;
148	/*
149	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
150	 * actually wrote anything and only sync if it did
151	 */
152	srcstart = (srcX << 2) + (srcpitch * srcY) + srcoff;
153	dststart = (dstX << 2) + (dstpitch * dstY) + dstoff;
154
155	/*
156	 * we always copy up to 32 pixels at a time so direction doesn't
157	 * matter if w<=32
158	 */
159	if (w > 32) {
160		if (p->xdir < 0) {
161			srcstart += (w - 32) << 2;
162			dststart += (w - 32) << 2;
163			xinc = -128;
164		} else
165			xinc = 128;
166	} else
167		xinc = 128;
168	if (p->ydir < 0) {
169		srcstart += (h - 1) * srcpitch;
170		dststart += (h - 1) * dstpitch;
171		srcinc = -srcpitch;
172		dstinc = -dstpitch;
173	} else {
174		srcinc = srcpitch;
175		dstinc = dstpitch;
176	}
177	if (p->last_rop == 0xcc) {
178		/* plain old copy */
179		if ( xinc > 0) {
180			/* going left to right */
181			for (line = 0; line < h; line++) {
182				count = 0;
183				s = srcstart;
184				d = dststart;
185				while ( count < w) {
186					num = min(32, w - count);
187					sxm(SX_LD, s, 10, num - 1);
188					sxm(SX_STM, d, 10, num - 1);
189					s += xinc;
190					d += xinc;
191					count += 32;
192				}
193				srcstart += srcinc;
194				dststart += dstinc;
195			}
196		} else {
197			/* going right to left */
198			int i, chunks = (w >> 5);
199			for (line = 0; line < h; line++) {
200				s = srcstart;
201				d = dststart;
202				count = w;
203				for (i = 0; i < chunks; i++) {
204					sxm(SX_LD, s, 10, 31);
205					sxm(SX_STM, d, 10, 31);
206					s -= 128;
207					d -= 128;
208					count -= 32;
209				}
210				/* leftovers, if any */
211				if (count > 0) {
212					s += (32 - count) << 2;
213					d += (32 - count) << 2;
214					sxm(SX_LD, s, 10, count - 1);
215					sxm(SX_STM, d, 10, count - 1);
216				}
217				srcstart += srcinc;
218				dststart += dstinc;
219			}
220		}
221	} else {
222		/* ROPs needed */
223		if ( xinc > 0) {
224			/* going left to right */
225			for (line = 0; line < h; line++) {
226				count = 0;
227				s = srcstart;
228				d = dststart;
229				while ( count < w) {
230					num = min(32, w - count);
231					sxm(SX_LD, s, 10, num - 1);
232					sxm(SX_LD, d, 42, num - 1);
233					if (num > 16) {
234						sxi(SX_ROP(10, 42, 74, 15));
235						sxi(SX_ROP(26, 58, 90, num - 17));
236					} else {
237						sxi(SX_ROP(10, 42, 74, num - 1));
238					}
239					sxm(SX_STM, d, 74, num - 1);
240					s += xinc;
241					d += xinc;
242					count += 32;
243				}
244				srcstart += srcinc;
245				dststart += dstinc;
246			}
247		} else {
248			/* going right to left */
249			int i, chunks = (w >> 5);
250			for (line = 0; line < h; line++) {
251				s = srcstart;
252				d = dststart;
253				count = w;
254				for (i = 0; i < chunks; i++) {
255					sxm(SX_LD, s, 10, 31);
256					sxm(SX_LD, d, 42, 31);
257					sxi(SX_ROP(10, 42, 74, 15));
258					sxi(SX_ROP(26, 58, 90, 15));
259					sxm(SX_STM, d, 74, 31);
260					s -= 128;
261					d -= 128;
262					count -= 32;
263				}
264				/* leftovers, if any */
265				if (count > 0) {
266					s += (32 - count) << 2;
267					d += (32 - count) << 2;
268					sxm(SX_LD, s, 10, count - 1);
269					sxm(SX_LD, d, 42, count - 1);
270					if (count > 16) {
271						sxi(SX_ROP(10, 42, 74, 15));
272						sxi(SX_ROP(26, 58, 90, count - 17));
273					} else {
274						sxi(SX_ROP(10, 42, 74, count - 1));
275					}
276					sxm(SX_STM, d, 74, count - 1);
277				}
278				srcstart += srcinc;
279				dststart += dstinc;
280			}
281		}
282	}
283	exaMarkSync(pDstPixmap->drawable.pScreen);
284}
285
286/*
287 * copy with same alignment, left to right, no ROP
288 */
289static void
290CG14Copy8_aligned_norop(Cg14Ptr p, int srcstart, int dststart, int w, int h,
291    int srcpitch, int dstpitch)
292{
293	int saddr, daddr, pre, cnt, wrds;
294
295	ENTER;
296
297	pre = srcstart & 3;
298	if (pre != 0) pre = 4 - pre;
299	pre = min(pre, w);
300
301	while (h > 0) {
302		saddr = srcstart;
303		daddr = dststart;
304		cnt = w;
305		if (pre > 0) {
306			sxm(SX_LDB, saddr, 8, pre - 1);
307			sxm(SX_STB, daddr, 8, pre - 1);
308			saddr += pre;
309			daddr += pre;
310			cnt -= pre;
311			if (cnt == 0) goto next;
312		}
313		while (cnt > 3) {
314			wrds = min(32, cnt >> 2);
315			sxm(SX_LD, saddr, 8, wrds - 1);
316			sxm(SX_ST, daddr, 8, wrds - 1);
317			saddr += wrds << 2;
318			daddr += wrds << 2;
319			cnt -= wrds << 2;
320		}
321		if (cnt > 0) {
322			sxm(SX_LDB, saddr, 8, cnt - 1);
323			sxm(SX_STB, daddr, 8, cnt - 1);
324		}
325next:
326		srcstart += srcpitch;
327		dststart += dstpitch;
328		h--;
329	}
330}
331
332/*
333 * copy with same alignment, left to right, ROP
334 */
335static void
336CG14Copy8_aligned_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h,
337    int srcpitch, int dstpitch)
338{
339	int saddr, daddr, pre, cnt, wrds;
340
341	ENTER;
342
343	pre = srcstart & 3;
344	if (pre != 0) pre = 4 - pre;
345	pre = min(pre, w);
346
347	while (h > 0) {
348		saddr = srcstart;
349		daddr = dststart;
350		cnt = w;
351		if (pre > 0) {
352			sxm(SX_LDB, saddr, 8, pre - 1);
353			sxm(SX_LDB, daddr, 40, pre - 1);
354			sxi(SX_ROP(8, 40, 72, pre - 1));
355			sxm(SX_STB, daddr, 72, pre - 1);
356			saddr += pre;
357			daddr += pre;
358			cnt -= pre;
359			if (cnt == 0) goto next;
360		}
361		while (cnt > 3) {
362			wrds = min(32, cnt >> 2);
363			sxm(SX_LD, saddr, 8, wrds - 1);
364			sxm(SX_LD, daddr, 40, wrds - 1);
365			if (cnt > 16) {
366				sxi(SX_ROP(8, 40, 72, 15));
367				sxi(SX_ROP(8, 56, 88, wrds - 17));
368			} else
369				sxi(SX_ROP(8, 40, 72, wrds - 1));
370			sxm(SX_ST, daddr, 72, wrds - 1);
371			saddr += wrds << 2;
372			daddr += wrds << 2;
373			cnt -= wrds << 2;
374		}
375		if (cnt > 0) {
376			sxm(SX_LDB, saddr, 8, cnt - 1);
377			sxm(SX_LDB, daddr, 40, cnt - 1);
378			sxi(SX_ROP(8, 40, 72, cnt - 1));
379			sxm(SX_STB, daddr, 72, cnt - 1);
380		}
381next:
382		srcstart += srcpitch;
383		dststart += dstpitch;
384		h--;
385	}
386}
387
388/* up to 124 pixels so direction doesn't matter, unaligned, ROP */
389static void
390CG14Copy8_short_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
391{
392	int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
393	int ssreg;
394#ifdef DEBUG
395	int taddr = 4 + dstpitch * 50;
396#endif
397	uint32_t lmask, rmask;
398	ENTER;
399
400	pre = dststart & 3;
401	lmask = 0xffffffff >> pre;
402	spre = srcstart & 3;
403	/*
404	 * make sure we count all the words needed to cover the destination
405	 * line, covering potential partials on both ends
406	 */
407	wrds = (w + pre + 3) >> 2;
408	swrds = (w + spre + 3) >> 2;
409
410	if (spre < pre) {
411		dist = 32 - (pre - spre) * 8;
412		sreg = 9;
413	} else {
414		dist = (spre - pre) * 8;
415		sreg = 8;
416	}
417
418	/*
419	 * mask out trailing pixels to avoid partial writes
420	 */
421	post = (dststart + w) & 3;
422	if (post != 0) {
423		rmask = ~(0xffffffff >> (post * 8));
424		write_sx_reg(p, SX_QUEUED(7), rmask);
425		write_sx_reg(p, SX_QUEUED(6), ~rmask);
426	}
427
428	DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
429	    w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);
430
431	/* mask out the leading pixels in dst by using a mask and ROP */
432	if (pre != 0) {
433		CG14Wait(p);
434		write_sx_reg(p, SX_ROP_CONTROL, (p->last_rop & 0xf0) | 0xa);
435		write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
436	}
437
438	saddr = srcstart & ~3;
439	daddr = dststart & ~3;
440
441	while (h > 0) {
442		sxm(SX_LD, daddr, 80, wrds - 1);
443		sxm(SX_LD, saddr, sreg, swrds - 1);
444		if (wrds > 15) {
445			if (dist != 0) {
446				sxi(SX_FUNNEL_I(8, dist, 40, 15));
447				sxi(SX_FUNNEL_I(24, dist, 56, wrds - 16));
448				/* shifted source pixels are now at register 40+ */
449				ssreg = 40;
450			} else ssreg = 8;
451			if (pre != 0) {
452				/* mask out leading junk */
453				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
454				sxi(SX_ROPB(ssreg, 80, 8, 0));
455				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
456				sxi(SX_ROPB(ssreg + 1, 81, 9, 14));
457			} else {
458				sxi(SX_ROPB(ssreg, 80, 8, 15));
459			}
460			sxi(SX_ROPB(ssreg + 16, 96, 24, wrds - 16));
461		} else {
462			if (dist != 0) {
463				sxi(SX_FUNNEL_I(8, dist, 40, wrds));
464				ssreg = 40;
465			} else ssreg = 8;
466			if (pre != 0) {
467				/* mask out leading junk */
468				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
469				sxi(SX_ROPB(ssreg, 80, 8, 0));
470				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
471				sxi(SX_ROPB(ssreg + 1, 81, 9, wrds));
472			} else {
473				sxi(SX_ROPB(ssreg, 80, 8, wrds));
474			}
475		}
476		if (post != 0) {
477			/*
478			 * if the last word to be written out is a partial we
479			 * mask out the leftovers and replace them with
480			 * background pixels
481			 * we could pull the same ROP * mask trick as we do on
482			 * the left end but it's less annoying this way and
483			 * the instruction count is the same
484			 */
485			sxi(SX_ANDS(7 + wrds, 7, 5, 0));
486			sxi(SX_ANDS(79 + wrds, 6, 4, 0));
487			sxi(SX_ORS(5, 4, 7 + wrds, 0));
488		}
489#ifdef DEBUG
490		sxm(SX_ST, taddr, 40, wrds - 1);
491		taddr += dstpitch;
492#endif
493		sxm(SX_ST, daddr, 8, wrds - 1);
494		saddr += srcpitch;
495		daddr += dstpitch;
496		h--;
497	}
498}
499
500/* up to 124 pixels so direction doesn't matter, unaligned, straight copy */
501static void
502CG14Copy8_short_norop(Cg14Ptr p, int srcstart, int dststart, int w, int h,
503    int srcpitch, int dstpitch)
504{
505	int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
506	int ssreg;
507#ifdef DEBUG
508	int taddr = 4 + dstpitch * 50;
509#endif
510	uint32_t lmask, rmask;
511	ENTER;
512
513	pre = dststart & 3;
514	lmask = 0xffffffff >> pre;
515	spre = srcstart & 3;
516	/*
517	 * make sure we count all the words needed to cover the destination
518	 * line, covering potential partials on both ends
519	 */
520	wrds = (w + pre + 3) >> 2;
521	swrds = (w + spre + 3) >> 2;
522
523	if (spre < pre) {
524		dist = 32 - (pre - spre) * 8;
525		sreg = 9;
526	} else {
527		dist = (spre - pre) * 8;
528		sreg = 8;
529	}
530
531	/*
532	 * mask out trailing pixels to avoid partial writes
533	 */
534	post = (dststart + w) & 3;
535	if (post != 0) {
536		rmask = ~(0xffffffff >> (post * 8));
537		write_sx_reg(p, SX_QUEUED(7), rmask);
538		write_sx_reg(p, SX_QUEUED(6), ~rmask);
539	}
540
541	DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
542	    w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);
543
544	/* mask out the leading pixels in dst by using a mask and ROP */
545	if (pre != 0) {
546		CG14Wait(p);
547		write_sx_reg(p, SX_ROP_CONTROL, 0xca);
548		write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
549	}
550
551	saddr = srcstart & ~3;
552	daddr = dststart & ~3;
553
554	while (h > 0) {
555		sxm(SX_LD, saddr, sreg, swrds - 1);
556		if (wrds > 15) {
557			if (dist != 0) {
558				sxi(SX_FUNNEL_I(8, dist, 40, 15));
559				sxi(SX_FUNNEL_I(24, dist, 56, wrds - 16));
560				/* shifted source pixels are now at reg 40+ */
561				ssreg = 40;
562			} else ssreg = 8;
563			if (pre != 0) {
564				/* read only the first word */
565				sxm(SX_LD, daddr, 80, 0);
566				/* mask out leading junk */
567				sxi(SX_ROPB(ssreg, 80, ssreg, 0));
568			}
569		} else {
570			if (dist != 0) {
571				sxi(SX_FUNNEL_I(8, dist, 40, wrds));
572				ssreg = 40;
573			} else ssreg = 8;
574			if (pre != 0) {
575				/* read only the first word */
576				sxm(SX_LD, daddr, 80, 0);
577				/* mask out leading junk */
578				sxi(SX_ROPB(ssreg, 80, ssreg, 0));
579			}
580		}
581		if (post != 0) {
582			int laddr = daddr + ((wrds - 1) << 2);
583			/*
584			 * if the last word to be written out is a partial we
585			 * mask out the leftovers and replace them with
586			 * background pixels
587			 * we could pull the same ROP * mask trick as we do on
588			 * the left end but it's less annoying this way and
589			 * the instruction count is the same
590			 */
591			sxm(SX_LD, laddr, 81, 0);
592			sxi(SX_ANDS(ssreg + wrds - 1, 7, 5, 0));
593			sxi(SX_ANDS(81, 6, 4, 0));
594			sxi(SX_ORS(5, 4, ssreg + wrds - 1, 0));
595		}
596#ifdef DEBUG
597		sxm(SX_ST, taddr, 40, wrds - 1);
598		taddr += dstpitch;
599#endif
600		sxm(SX_ST, daddr, ssreg, wrds - 1);
601		saddr += srcpitch;
602		daddr += dstpitch;
603		h--;
604	}
605}
606
607static void
608CG14Copy8(PixmapPtr pDstPixmap,
609         int srcX, int srcY, int dstX, int dstY, int w, int h)
610{
611	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
612	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
613	int dstpitch, dstoff, srcpitch, srcoff;
614	int srcstart, dststart, xinc, srcinc, dstinc;
615	int line, count, s, d, num;
616
617	ENTER;
618	dstpitch = exaGetPixmapPitch(pDstPixmap);
619	dstoff = exaGetPixmapOffset(pDstPixmap);
620	srcpitch = p->srcpitch;
621	srcoff = p->srcoff;
622	/*
623	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
624	 * actually wrote anything and only sync if it did
625	 */
626	srcstart = srcX + (srcpitch * srcY) + srcoff;
627	dststart = dstX + (dstpitch * dstY) + dstoff;
628
629	if (p->ydir < 0) {
630		srcstart += (h - 1) * srcpitch;
631		dststart += (h - 1) * dstpitch;
632		srcinc = -srcpitch;
633		dstinc = -dstpitch;
634	} else {
635		srcinc = srcpitch;
636		dstinc = dstpitch;
637	}
638
639	/*
640	 * this copies up to 124 pixels wide in one go, so horizontal
641	 * direction / overlap don't matter
642	 * uses all 32bit accesses and funnel shifter for unaligned copies
643	 */
644	if ((w < 125) && (w > 8)) {
645		switch (p->last_rop) {
646			case 0xcc:
647				CG14Copy8_short_norop(p,
648				    srcstart, dststart, w, h, srcinc, dstinc);
649				break;
650			default:
651				CG14Copy8_short_rop(p,
652				    srcstart, dststart, w, h, srcinc, dstinc);
653		}
654		return;
655	}
656
657	/*
658	 * only invert x direction if absolutely necessary, it's a pain to
659	 * go backwards on SX so avoid as much as possible
660	 */
661	if ((p->xdir < 0) && (srcoff == dstoff) && (srcY == dstY)) {
662		xinc = -32;
663	} else
664		xinc = 32;
665
666	/*
667	 * for aligned copies we can go all 32bit and avoid VRAM reads in the
668	 * most common case
669	 */
670	if (((srcstart & 3) == (dststart & 3)) && (xinc > 0)) {
671		switch (p->last_rop) {
672			case 0xcc:
673				CG14Copy8_aligned_norop(p,
674				    srcstart, dststart, w, h, srcinc, dstinc);
675				break;
676			default:
677				CG14Copy8_aligned_rop(p,
678				    srcstart, dststart, w, h, srcinc, dstinc);
679		}
680		return;
681	}
682
683	/*
684	 * if we make it here we either have something large and unaligned,
685	 * something we need to do right to left, or something tiny.
686	 * we handle the non-tiny cases by breaking them down into chunks that
687	 * Copy8_short_*() can handle, making sure the destinations are 32bit
688	 * aligned whenever possible
689	 * since we copy by block, not by line we need to go backwards even if
690	 * we don't copy within the same line
691	 */
692	if (w > 8) {
693		int next, wi, end = dststart + w;
694		DPRINTF(X_ERROR, "%s %08x %08x %d\n",
695		    __func__, srcstart, dststart, w);
696		if ((p->xdir < 0) && (srcoff == dstoff)) {
697			srcstart += w;
698			next = max((end - 120) & ~3, dststart);
699			wi = end - next;
700			srcstart -= wi;
701			while (wi > 0) {
702				DPRINTF(X_ERROR, "%s RL %08x %08x %d\n",
703				    __func__, srcstart, next, wi);
704				if (p->last_rop == 0xcc) {
705					CG14Copy8_short_norop(p, srcstart,
706					    next, wi, h, srcinc, dstinc);
707				} else
708					CG14Copy8_short_rop(p, srcstart,
709					    next, wi, h, srcinc, dstinc);
710				end = next;
711				/*
712				 * avoid extremely narrow copies so I don't
713				 * have to deal with dangling start and end
714				 * pixels in the same word
715				 */
716				if ((end - dststart) < 140) {
717					next = max((end - 80) & ~3, dststart);
718				} else {
719					next = max((end - 120) & ~3, dststart);
720				}
721				wi = end - next;
722				srcstart -= wi;
723			}
724		} else {
725			next = min(end, (dststart + 124) & ~3);
726			wi = next - dststart;
727			while (wi > 0) {
728				DPRINTF(X_ERROR, "%s LR %08x %08x %d\n",
729				    __func__, srcstart, next, wi);
730				if (p->last_rop == 0xcc) {
731					CG14Copy8_short_norop(p,
732					    srcstart, dststart, wi, h,
733					    srcinc, dstinc);
734				} else
735					CG14Copy8_short_rop(p,
736					    srcstart, dststart, wi, h,
737					    srcinc, dstinc);
738				srcstart += wi;
739				dststart = next;
740				if ((end - dststart) < 140) {
741					next = min(end, (dststart + 84) & ~3);
742				} else {
743					next = min(end, (dststart + 124) & ~3);
744				}
745				wi = next - dststart;
746			}
747		}
748		return;
749	}
750	if (xinc < 0) {
751		srcstart += (w - 32);
752		dststart += (w - 32);
753	}
754
755	DPRINTF(X_ERROR, "%s fallback to byte-wise %d %d\n", __func__, w, h);
756	if (p->last_rop == 0xcc) {
757		/* plain old copy */
758		if ( xinc > 0) {
759			/* going left to right */
760			for (line = 0; line < h; line++) {
761				count = 0;
762				s = srcstart;
763				d = dststart;
764				while ( count < w) {
765					num = min(32, w - count);
766					sxm(SX_LDB, s, 10, num - 1);
767					sxm(SX_STBM, d, 10, num - 1);
768					s += xinc;
769					d += xinc;
770					count += 32;
771				}
772				srcstart += srcinc;
773				dststart += dstinc;
774			}
775		} else {
776			/* going right to left */
777			int i, chunks = (w >> 5);
778			for (line = 0; line < h; line++) {
779				s = srcstart;
780				d = dststart;
781				count = w;
782				for (i = 0; i < chunks; i++) {
783					sxm(SX_LDB, s, 10, 31);
784					sxm(SX_STBM, d, 10, 31);
785					s -= 32;
786					d -= 32;
787					count -= 32;
788				}
789				/* leftovers, if any */
790				if (count > 0) {
791					s += (32 - count);
792					d += (32 - count);
793					sxm(SX_LDB, s, 10, count - 1);
794					sxm(SX_STBM, d, 10, count - 1);
795				}
796				srcstart += srcinc;
797				dststart += dstinc;
798			}
799		}
800	} else {
801		/* ROPs needed */
802		if ( xinc > 0) {
803			/* going left to right */
804			for (line = 0; line < h; line++) {
805				count = 0;
806				s = srcstart;
807				d = dststart;
808				while ( count < w) {
809					num = min(32, w - count);
810					sxm(SX_LDB, s, 10, num - 1);
811					sxm(SX_LDB, d, 42, num - 1);
812					if (num > 16) {
813						sxi(SX_ROP(10, 42, 74, 15));
814						sxi(SX_ROP(26, 58, 90, num - 17));
815					} else {
816						sxi(SX_ROP(10, 42, 74, num - 1));
817					}
818					sxm(SX_STBM, d, 74, num - 1);
819					s += xinc;
820					d += xinc;
821					count += 32;
822				}
823				srcstart += srcinc;
824				dststart += dstinc;
825			}
826		} else {
827			/* going right to left */
828			int i, chunks = (w >> 5);
829			for (line = 0; line < h; line++) {
830				s = srcstart;
831				d = dststart;
832				count = w;
833				for (i = 0; i < chunks; i++) {
834					sxm(SX_LDB, s, 10, 31);
835					sxm(SX_LDB, d, 42, 31);
836					sxi(SX_ROP(10, 42, 74, 15));
837					sxi(SX_ROP(26, 58, 90, 15));
838					sxm(SX_STBM, d, 74, 31);
839					s -= 128;
840					d -= 128;
841					count -= 32;
842				}
843				/* leftovers, if any */
844				if (count > 0) {
845					s += (32 - count);
846					d += (32 - count);
847					sxm(SX_LDB, s, 10, count - 1);
848					sxm(SX_LDB, d, 42, count - 1);
849					if (count > 16) {
850						sxi(SX_ROP(10, 42, 74, 15));
851						sxi(SX_ROP(26, 58, 90, count - 17));
852					} else {
853						sxi(SX_ROP(10, 42, 74, count - 1));
854					}
855					sxm(SX_STBM, d, 74, count - 1);
856				}
857				srcstart += srcinc;
858				dststart += dstinc;
859			}
860		}
861	}
862	exaMarkSync(pDstPixmap->drawable.pScreen);
863}
864
865static void
866CG14DoneCopy(PixmapPtr pDstPixmap)
867{
868}
869
870static Bool
871CG14PrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
872{
873	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
874	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
875
876	ENTER;
877	DPRINTF(X_ERROR, "bits per pixel: %d %08lx\n",
878	    pPixmap->drawable.bitsPerPixel, fg);
879
880	/*
881	 * GXset and GXclear are really just specual cases of GXcopy with
882	 * fixed fill colour
883	 */
884	switch (alu) {
885		case GXclear:
886			alu = GXcopy;
887			fg = 0;
888			break;
889		case GXset:
890			alu = GXcopy;
891			fg = 0xffffffff;
892			break;
893	}
894	/* repeat the colour in every sub byte if we're in 8 bit */
895	if (pPixmap->drawable.bitsPerPixel == 8) {
896		fg |= fg << 8;
897		fg |= fg << 16;
898	}
899	write_sx_reg(p, SX_QUEUED(8), fg);
900	write_sx_reg(p, SX_QUEUED(9), fg);
901	if (planemask != p->last_mask) {
902		CG14Wait(p);
903		write_sx_reg(p, SX_PLANEMASK, planemask);
904		p->last_mask = planemask;
905	}
906	alu = sx_rop[alu];
907	if (alu != p->last_rop) {
908		CG14Wait(p);
909		write_sx_reg(p, SX_ROP_CONTROL, alu);
910		p->last_rop = alu;
911	}
912
913	DPRINTF(X_ERROR, "%s: %x\n", __func__, alu);
914	return TRUE;
915}
916
917static void
918CG14Solid32(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
919{
920	int line, x, num;
921	uint32_t ptr;
922
923	ENTER;
924	if (p->last_rop == 0xcc) {
925		/* simple fill */
926		for (line = 0; line < h; line++) {
927			x = 0;
928			while (x < w) {
929				ptr = start + (x << 2);
930				num = min(32, w - x);
931				sxm(SX_STS, ptr, 8, num - 1);
932				x += 32;
933			}
934			start += pitch;
935		}
936	} else if (p->last_rop == 0xaa) {
937		/* nothing to do here */
938		return;
939	} else {
940		/* alright, let's do actual ROP stuff */
941
942		/* first repeat the fill colour into 16 registers */
943		sxi(SX_SELECT_S(8, 8, 10, 15));
944
945		for (line = 0; line < h; line++) {
946			x = 0;
947			while (x < w) {
948				ptr = start + (x << 2);
949				num = min(32, w - x);
950				/* now suck fb data into registers */
951				sxm(SX_LD, ptr, 42, num - 1);
952				/*
953				 * ROP them with the fill data we left in 10
954				 * non-memory ops can only have counts up to 16
955				 */
956				if (num <= 16) {
957					sxi(SX_ROP(10, 42, 74, num - 1));
958				} else {
959					sxi(SX_ROP(10, 42, 74, 15));
960					sxi(SX_ROP(10, 58, 90, num - 17));
961				}
962				/* and write the result back into memory */
963				sxm(SX_ST, ptr, 74, num - 1);
964				x += 32;
965			}
966			start += pitch;
967		}
968	}
969}
970
971static void
972CG14Solid8(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
973{
974	int line, num, pre, cnt;
975	uint32_t ptr;
976
977	ENTER;
978	pre = start & 3;
979	if (pre != 0) pre = 4 - pre;
980
981	if (p->last_rop == 0xcc) {
982		/* simple fill */
983		for (line = 0; line < h; line++) {
984			ptr = start;
985			cnt = w;
986			pre = min(pre, cnt);
987			if (pre) {
988				sxm(SX_STBS, ptr, 8, pre - 1);
989				ptr += pre;
990				cnt -= pre;
991				if (cnt == 0) goto next;
992			}
993			/* now do the aligned pixels in 32bit chunks */
994			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
995			while(cnt > 3) {
996				num = min(32, cnt >> 2);
997				sxm(SX_STS, ptr, 8, num - 1);
998				ptr += num << 2;
999				cnt -= num << 2;
1000			}
1001			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
1002			if (cnt > 0) {
1003				sxm(SX_STBS, ptr, 8, cnt - 1);
1004			}
1005			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
1006next:
1007			start += pitch;
1008		}
1009	} else if (p->last_rop == 0xaa) {
1010		/* nothing to do here */
1011		return;
1012	} else {
1013		/* alright, let's do actual ROP stuff */
1014
1015		/* first repeat the fill colour into 16 registers */
1016		sxi(SX_SELECT_S(8, 8, 10, 15));
1017
1018		for (line = 0; line < h; line++) {
1019			ptr = start;
1020			cnt = w;
1021			pre = min(pre, cnt);
1022			if (pre) {
1023				sxm(SX_LDB, ptr, 26, pre - 1);
1024				sxi(SX_ROP(10, 26, 42, pre - 1));
1025				sxm(SX_STB, ptr, 42, pre - 1);
1026				ptr += pre;
1027				cnt -= pre;
1028				if (cnt == 0) goto next2;
1029			}
1030			/* now do the aligned pixels in 32bit chunks */
1031			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
1032			while(cnt > 3) {
1033				num = min(32, cnt >> 2);
1034				sxm(SX_LD, ptr, 26, num - 1);
1035				if (num <= 16) {
1036					sxi(SX_ROP(10, 26, 58, num - 1));
1037				} else {
1038					sxi(SX_ROP(10, 26, 58, 15));
1039					sxi(SX_ROP(10, 42, 74, num - 17));
1040				}
1041				sxm(SX_ST, ptr, 58, num - 1);
1042				ptr += num << 2;
1043				cnt -= num << 2;
1044			}
1045			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
1046			if (cnt > 0) {
1047				sxm(SX_LDB, ptr, 26, cnt - 1);
1048				sxi(SX_ROP(10, 26, 42, cnt - 1));
1049				sxm(SX_STB, ptr, 42, cnt - 1);
1050			}
1051			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
1052next2:
1053			start += pitch;
1054		}
1055	}
1056}
1057
1058static void
1059CG14Solid(PixmapPtr pPixmap, int x1, int y1, int x2, int y2)
1060{
1061	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
1062	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1063	int w = x2 - x1, h = y2 - y1, dstoff, dstpitch;
1064	int start, depth;
1065
1066	ENTER;
1067	dstpitch = exaGetPixmapPitch(pPixmap);
1068	dstoff = exaGetPixmapOffset(pPixmap);
1069
1070	depth = pPixmap->drawable.bitsPerPixel;
1071	switch (depth) {
1072		case 32:
1073			start = dstoff + (y1 * dstpitch) + (x1 << 2);
1074			CG14Solid32(p, start, dstpitch, w, h);
1075			break;
1076		case 8:
1077			start = dstoff + (y1 * dstpitch) + x1;
1078			CG14Solid8(p, start, dstpitch, w, h);
1079			break;
1080	}
1081
1082	DPRINTF(X_ERROR, "Solid %d %d %d %d, %d %d -> %d\n", x1, y1, x2, y2,
1083	    dstpitch, dstoff, start);
1084	DPRINTF(X_ERROR, "%x %x %x\n", p->last_rop,
1085	    read_sx_reg(p, SX_QUEUED(8)), read_sx_reg(p, SX_QUEUED(9)));
1086	exaMarkSync(pPixmap->drawable.pScreen);
1087}
1088
1089/*
1090 * Memcpy-based UTS.
1091 */
1092static Bool
1093CG14UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
1094    char *src, int src_pitch)
1095{
1096	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
1097	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1098	char  *dst        = p->fb + exaGetPixmapOffset(pDst);
1099	int    dst_pitch  = exaGetPixmapPitch(pDst);
1100
1101	int bpp    = pDst->drawable.bitsPerPixel;
1102	int cpp    = (bpp + 7) >> 3;
1103	int wBytes = w * cpp;
1104
1105	ENTER;
1106	DPRINTF(X_ERROR, "%s depth %d\n", __func__, bpp);
1107	dst += (x * cpp) + (y * dst_pitch);
1108
1109	CG14Wait(p);
1110
1111	while (h--) {
1112		memcpy(dst, src, wBytes);
1113		src += src_pitch;
1114		dst += dst_pitch;
1115	}
1116	__asm("stbar;");
1117	return TRUE;
1118}
1119
1120/*
1121 * Memcpy-based DFS.
1122 */
1123static Bool
1124CG14DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
1125    char *dst, int dst_pitch)
1126{
1127	ScrnInfoPtr pScrn = xf86Screens[pSrc->drawable.pScreen->myNum];
1128	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1129	char  *src        = p->fb + exaGetPixmapOffset(pSrc);
1130	int    src_pitch  = exaGetPixmapPitch(pSrc);
1131
1132	ENTER;
1133	int bpp    = pSrc->drawable.bitsPerPixel;
1134	int cpp    = (bpp + 7) >> 3;
1135	int wBytes = w * cpp;
1136
1137	src += (x * cpp) + (y * src_pitch);
1138
1139	CG14Wait(p);
1140
1141	while (h--) {
1142		memcpy(dst, src, wBytes);
1143		src += src_pitch;
1144		dst += dst_pitch;
1145	}
1146
1147	return TRUE;
1148}
1149
1150Bool
1151CG14CheckComposite(int op, PicturePtr pSrcPicture,
1152                           PicturePtr pMaskPicture,
1153                           PicturePtr pDstPicture)
1154{
1155	int i, ok = FALSE;
1156
1157	ENTER;
1158
1159	/*
1160	 * SX is in theory capable of accelerating pretty much all Xrender ops,
1161	 * even coordinate transformation and gradients. Support will be added
1162	 * over time and likely have to spill over into its own source file.
1163	 */
1164
1165	if ((op != PictOpOver) && (op != PictOpAdd) && (op != PictOpSrc)) {
1166		DPRINTF(X_ERROR, "%s: rejecting %d\n", __func__, op);
1167		return FALSE;
1168	}
1169
1170	if (pSrcPicture != NULL) {
1171		i = 0;
1172		while ((i < arraysize(src_formats)) && (!ok)) {
1173			ok =  (pSrcPicture->format == src_formats[i]);
1174			i++;
1175		}
1176
1177		if (!ok) {
1178			DPRINTF(X_ERROR, "%s: unsupported src format %x\n",
1179			    __func__, pSrcPicture->format);
1180			return FALSE;
1181		}
1182		DPRINTF(X_ERROR, "src is %x, %d\n", pSrcPicture->format, op);
1183	}
1184
1185	if (pDstPicture != NULL) {
1186		i = 0;
1187		ok = FALSE;
1188		while ((i < arraysize(src_formats)) && (!ok)) {
1189			ok =  (pDstPicture->format == src_formats[i]);
1190			i++;
1191		}
1192
1193		if (!ok) {
1194			DPRINTF(X_ERROR, "%s: unsupported dst format %x\n",
1195			    __func__, pDstPicture->format);
1196			return FALSE;
1197		}
1198		DPRINTF(X_ERROR, "dst is %x, %d\n", pDstPicture->format, op);
1199	}
1200
1201	if (pMaskPicture != NULL) {
1202		DPRINTF(X_ERROR, "mask is %x %d %d\n", pMaskPicture->format,
1203		    pMaskPicture->pDrawable->width,
1204		    pMaskPicture->pDrawable->height);
1205	}
1206	return TRUE;
1207}
1208
1209Bool
1210CG14PrepareComposite(int op, PicturePtr pSrcPicture,
1211                             PicturePtr pMaskPicture,
1212                             PicturePtr pDstPicture,
1213                             PixmapPtr  pSrc,
1214                             PixmapPtr  pMask,
1215                             PixmapPtr  pDst)
1216{
1217	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
1218	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1219
1220	ENTER;
1221
1222	p->no_source_pixmap = FALSE;
1223	p->source_is_solid = FALSE;
1224
1225	if (pSrcPicture->format == PICT_a1) {
1226		xf86Msg(X_ERROR, "src mono, dst %x, op %d\n",
1227		    pDstPicture->format, op);
1228		if (pMaskPicture != NULL) {
1229			xf86Msg(X_ERROR, "msk %x\n", pMaskPicture->format);
1230		}
1231	}
1232	if (pSrcPicture->pSourcePict != NULL) {
1233		if (pSrcPicture->pSourcePict->type == SourcePictTypeSolidFill) {
1234			p->fillcolour =
1235			    pSrcPicture->pSourcePict->solidFill.color;
1236			DPRINTF(X_ERROR, "%s: solid src %08x\n",
1237			    __func__, p->fillcolour);
1238			p->no_source_pixmap = TRUE;
1239			p->source_is_solid = TRUE;
1240		}
1241	}
1242	if ((pMaskPicture != NULL) && (pMaskPicture->pSourcePict != NULL)) {
1243		if (pMaskPicture->pSourcePict->type ==
1244		    SourcePictTypeSolidFill) {
1245			p->fillcolour =
1246			   pMaskPicture->pSourcePict->solidFill.color;
1247			xf86Msg(X_ERROR, "%s: solid mask %08x\n",
1248			    __func__, p->fillcolour);
1249		}
1250	}
1251	if (pMaskPicture != NULL) {
1252		p->mskoff = exaGetPixmapOffset(pMask);
1253		p->mskpitch = exaGetPixmapPitch(pMask);
1254		p->mskformat = pMaskPicture->format;
1255	} else {
1256		p->mskoff = 0;
1257		p->mskpitch = 0;
1258		p->mskformat = 0;
1259	}
1260	if (pSrc != NULL) {
1261		p->source_is_solid =
1262		   ((pSrc->drawable.width == 1) && (pSrc->drawable.height == 1));
1263		p->srcoff = exaGetPixmapOffset(pSrc);
1264		p->srcpitch = exaGetPixmapPitch(pSrc);
1265		if (p->source_is_solid) {
1266			p->fillcolour = *(uint32_t *)(p->fb + p->srcoff);
1267		}
1268	}
1269	p->srcformat = pSrcPicture->format;
1270	p->dstformat = pDstPicture->format;
1271
1272	if (p->source_is_solid) {
1273		uint32_t temp;
1274
1275		/* stuff source colour into SX registers, swap as needed */
1276		temp = p->fillcolour;
1277		switch (p->srcformat) {
1278			case PICT_a8r8g8b8:
1279			case PICT_x8r8g8b8:
1280				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
1281				temp = temp >> 8;
1282				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
1283				temp = temp >> 8;
1284				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
1285				break;
1286			case PICT_a8b8g8r8:
1287			case PICT_x8b8g8r8:
1288				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
1289				temp = temp >> 8;
1290				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
1291				temp = temp >> 8;
1292				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
1293				break;
1294		}
1295		write_sx_reg(p, SX_QUEUED(8), 0xff);
1296	}
1297	p->op = op;
1298	if (op == PictOpSrc) {
1299		CG14PrepareCopy(pSrc, pDst, 1, 1, GXcopy, 0xffffffff);
1300	}
1301#ifdef SX_DEBUG
1302	DPRINTF(X_ERROR, "%x %x -> %x\n", p->srcoff, p->mskoff,
1303	    *(uint32_t *)(p->fb + p->srcoff));
1304#endif
1305	return TRUE;
1306}
1307
1308void
1309CG14Composite(PixmapPtr pDst, int srcX, int srcY,
1310                              int maskX, int maskY,
1311                              int dstX, int dstY,
1312                              int width, int height)
1313{
1314	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
1315	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1316	uint32_t dstoff, dstpitch;
1317	uint32_t dst, msk, src;
1318	int flip = 0;
1319
1320	ENTER;
1321	dstoff = exaGetPixmapOffset(pDst);
1322	dstpitch = exaGetPixmapPitch(pDst);
1323
1324	flip = (PICT_FORMAT_TYPE(p->srcformat) !=
1325		PICT_FORMAT_TYPE(p->dstformat));
1326
1327	switch (p->op) {
1328		case PictOpOver:
1329			dst = dstoff + (dstY * dstpitch) + (dstX << 2);
1330			DPRINTF(X_ERROR, "Over %08x %08x, %d %d\n",
1331			    p->mskformat, p->dstformat, srcX, srcY);
1332			if (p->source_is_solid) {
1333				switch (p->mskformat) {
1334					case PICT_a8:
1335						msk = p->mskoff +
1336						    (maskY * p->mskpitch) +
1337						    maskX;
1338						CG14Comp_Over8Solid(p,
1339						    msk, p->mskpitch,
1340						    dst, dstpitch,
1341						    width, height);
1342						break;
1343					case PICT_a8r8g8b8:
1344					case PICT_a8b8g8r8:
1345						msk = p->mskoff +
1346						    (maskY * p->mskpitch) +
1347						    (maskX << 2);
1348						CG14Comp_Over32Solid(p,
1349						    msk, p->mskpitch,
1350						    dst, dstpitch,
1351						    width, height);
1352						break;
1353					default:
1354						xf86Msg(X_ERROR,
1355						  "unsupported mask format %08x\n", p->mskformat);
1356				}
1357			} else {
1358				DPRINTF(X_ERROR, "non-solid over with msk %x\n",
1359				    p->mskformat);
1360				switch (p->srcformat) {
1361					case PICT_a8r8g8b8:
1362					case PICT_a8b8g8r8:
1363						src = p->srcoff +
1364						    (srcY * p->srcpitch) +
1365						    (srcX << 2);
1366						dst = dstoff +
1367						    (dstY * dstpitch) +
1368						    (dstX << 2);
1369						if (p->mskformat == PICT_a8) {
1370							msk = p->mskoff +
1371							    (maskY * p->mskpitch) +
1372							    maskX;
1373							CG14Comp_Over32Mask(p,
1374							    src, p->srcpitch,
1375							    msk, p->mskpitch,
1376							    dst, dstpitch,
1377							    width, height, flip);
1378						} else {
1379							CG14Comp_Over32(p,
1380							    src, p->srcpitch,
1381							    dst, dstpitch,
1382							    width, height, flip);
1383						}
1384						break;
1385					case PICT_x8r8g8b8:
1386					case PICT_x8b8g8r8:
1387						src = p->srcoff +
1388						    (srcY * p->srcpitch) +
1389						    (srcX << 2);
1390						dst = dstoff +
1391						    (dstY * dstpitch) +
1392						    (dstX << 2);
1393						if (p->mskformat == PICT_a8) {
1394							msk = p->mskoff +
1395							    (maskY * p->mskpitch) +
1396							    maskX;
1397							CG14Comp_Over32Mask_noalpha(p,
1398							    src, p->srcpitch,
1399							    msk, p->mskpitch,
1400							    dst, dstpitch,
1401							    width, height, flip);
1402						} else if ((p->mskformat == PICT_a8r8g8b8) ||
1403							   (p->mskformat == PICT_a8b8g8r8)) {
1404							msk = p->mskoff +
1405							    (maskY * p->mskpitch) +
1406							    (maskX << 2);
1407							CG14Comp_Over32Mask32_noalpha(p,
1408							    src, p->srcpitch,
1409							    msk, p->mskpitch,
1410							    dst, dstpitch,
1411							    width, height, flip);
1412						} else {
1413							xf86Msg(X_ERROR, "no src alpha, mask is %x\n", p->mskformat);
1414						}
1415						break;
1416					default:
1417						xf86Msg(X_ERROR, "%s: format %x in non-solid Over op\n",
1418						    __func__, p->srcformat);
1419				}
1420			}
1421			break;
1422		case PictOpAdd:
1423			DPRINTF(X_ERROR, "Add %08x %08x\n",
1424			    p->srcformat, p->dstformat);
1425			switch (p->srcformat) {
1426				case PICT_a8:
1427					src = p->srcoff +
1428					    (srcY * p->srcpitch) + srcX;
1429					if (p->dstformat == PICT_a8) {
1430						dst = dstoff +
1431						      (dstY * dstpitch) + dstX;
1432						CG14Comp_Add8(p,
1433						    src, p->srcpitch,
1434						    dst, dstpitch,
1435						    width, height);
1436					} else {
1437						dst = dstoff +
1438						      (dstY * dstpitch) +
1439						      (dstX << 2);
1440						CG14Comp_Add8_32(p,
1441						    src, p->srcpitch,
1442						    dst, dstpitch,
1443						    width, height);
1444					}
1445					break;
1446				case PICT_a8r8g8b8:
1447				case PICT_x8r8g8b8:
1448					src = p->srcoff +
1449					    (srcY * p->srcpitch) + (srcX << 2);
1450					dst = dstoff + (dstY * dstpitch) +
1451					    (dstX << 2);
1452					CG14Comp_Add32(p, src, p->srcpitch,
1453					    dst, dstpitch, width, height);
1454					break;
1455				default:
1456					xf86Msg(X_ERROR,
1457					    "unsupported src format\n");
1458			}
1459			break;
1460		case PictOpSrc:
1461			DPRINTF(X_ERROR, "Src %08x %08x\n",
1462			    p->srcformat, p->dstformat);
1463			if (p->mskformat != 0)
1464				xf86Msg(X_ERROR, "Src mask %08x\n", p->mskformat);
1465			if (p->srcformat == PICT_a8) {
1466				CG14Copy8(pDst, srcX, srcY, dstX, dstY, width, height);
1467			} else {
1468				/* convert between RGB and BGR? */
1469				CG14Copy32(pDst, srcX, srcY, dstX, dstY, width, height);
1470			}
1471			break;
1472		default:
1473			xf86Msg(X_ERROR, "unsupported op %d\n", p->op);
1474	}
1475	exaMarkSync(pDst->drawable.pScreen);
1476}
1477
1478
1479
1480Bool
1481CG14InitAccel(ScreenPtr pScreen)
1482{
1483	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
1484	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1485	ExaDriverPtr pExa;
1486
1487	pExa = exaDriverAlloc();
1488	if (!pExa)
1489		return FALSE;
1490
1491	p->pExa = pExa;
1492
1493	pExa->exa_major = EXA_VERSION_MAJOR;
1494	pExa->exa_minor = EXA_VERSION_MINOR;
1495
1496	pExa->memoryBase = p->fb;
1497	pExa->memorySize = p->memsize;
1498	pExa->offScreenBase = p->width * p->height * (pScrn->depth >> 3);
1499
1500	/*
1501	 * SX memory instructions are written to 64bit aligned addresses with
1502	 * a 3 bit displacement. Make sure the displacement remains constant
1503	 * within one column
1504	 */
1505
1506	pExa->pixmapOffsetAlign = 8;
1507	pExa->pixmapPitchAlign = 8;
1508
1509	pExa->flags = EXA_OFFSCREEN_PIXMAPS
1510		      | EXA_SUPPORTS_OFFSCREEN_OVERLAPS
1511		      /*| EXA_MIXED_PIXMAPS*/;
1512
1513	/*
1514	 * these limits are bogus
1515	 * SX doesn't deal with coordinates at all, so there is no limit but
1516	 * we have to put something here
1517	 */
1518	pExa->maxX = 4096;
1519	pExa->maxY = 4096;
1520
1521	pExa->WaitMarker = CG14WaitMarker;
1522
1523	pExa->PrepareSolid = CG14PrepareSolid;
1524	pExa->Solid = CG14Solid;
1525	pExa->DoneSolid = CG14DoneCopy;
1526	pExa->PrepareCopy = CG14PrepareCopy;
1527	pExa->Copy = CG14Copy32;
1528	pExa->DoneCopy = CG14DoneCopy;
1529	if (p->use_xrender) {
1530		pExa->CheckComposite = CG14CheckComposite;
1531		pExa->PrepareComposite = CG14PrepareComposite;
1532		pExa->Composite = CG14Composite;
1533		pExa->DoneComposite = CG14DoneCopy;
1534	}
1535
1536	/* EXA hits more optimized paths when it does not have to fallback
1537	 * because of missing UTS/DFS, hook memcpy-based UTS/DFS.
1538	 */
1539	pExa->UploadToScreen = CG14UploadToScreen;
1540	pExa->DownloadFromScreen = CG14DownloadFromScreen;
1541
1542	p->queuecount = 0;
1543	/* do some hardware init */
1544	write_sx_reg(p, SX_PLANEMASK, 0xffffffff);
1545	p->last_mask = 0xffffffff;
1546	write_sx_reg(p, SX_ROP_CONTROL, 0xcc);
1547	p->last_rop = 0xcc;
1548	return exaDriverInit(pScreen, pExa);
1549}
1550