cg14_accel.c revision b0f02aef
1/* $NetBSD: cg14_accel.c,v 1.31 2022/05/11 21:10:37 macallan Exp $ */
2/*
3 * Copyright (c) 2013 Michael Lorenz
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 *    - Redistributions of source code must retain the above copyright
11 *      notice, this list of conditions and the following disclaimer.
12 *    - Redistributions in binary form must reproduce the above
13 *      copyright notice, this list of conditions and the following
14 *      disclaimer in the documentation and/or other materials provided
15 *      with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 */
31
32#ifdef HAVE_CONFIG_H
33#include "config.h"
34#endif
35
36#include <sys/types.h>
37
/* all drivers need this */
39#include "xf86.h"
40#include "xf86_OSproc.h"
41#include "compiler.h"
42
43#include "cg14.h"
44
/* Uncomment to route DPRINTF() output to xf86Msg(). */
/*#define SX_DEBUG*/
/* Uncomment to log the function name at every ENTER. */
/*#define SX_TRACE*/

/* ENTER: logs the name of the current function when SX_TRACE is defined. */
#ifdef SX_TRACE
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#else
#define ENTER
#endif

/*
 * DPRINTF: xf86Msg() when SX_DEBUG is defined; otherwise the call is placed
 * behind "while (0)" so it is compiled but never executed.
 */
#ifdef SX_DEBUG
#define DPRINTF xf86Msg
#else
#define DPRINTF while (0) xf86Msg
#endif

/* number of elements in an array object (do not use on pointers) */
#define arraysize(ary)        (sizeof(ary) / sizeof(ary[0]))

/*
 * Translation table from the alu value handed to PrepareCopy/PrepareSolid
 * to the value written to SX_ROP_CONTROL.
 * 0xcc is SX's GXcopy equivalent
 */
uint32_t sx_rop[] = { 0x00, 0x88, 0x44, 0xcc, 0x22, 0xaa, 0x66, 0xee,
		      0x11, 0x99, 0x55, 0xdd, 0x33, 0xbb, 0x77, 0xff};

/*
 * Picture formats accepted by CG14CheckComposite() (it checks both the
 * source and the destination picture against this list).
 */
int src_formats[] = {PICT_a8r8g8b8, PICT_x8r8g8b8,
		     PICT_a8b8g8r8, PICT_x8b8g8r8, PICT_a8};
/* not referenced in this part of the file */
int tex_formats[] = {PICT_a8r8g8b8, PICT_a8b8g8r8, PICT_a8};

/* forward declarations, CG14PrepareCopy() installs these as the Copy hook */
static void CG14Copy32(PixmapPtr, int, int, int, int, int, int);
static void CG14Copy8(PixmapPtr, int, int, int, int, int, int);
72
73static inline void
74CG14Wait(Cg14Ptr p)
75{
76	int bail = 10000000;
77	/* we wait for the busy bit to clear */
78	while (((read_sx_reg(p, SX_CONTROL_STATUS) & SX_BZ) != 0) &&
79	       (bail > 0)) {
80		bail--;
81	};
82	if (bail == 0) {
83		xf86Msg(X_ERROR, "SX wait for idle timed out %08x %08x\n",
84		    read_sx_reg(p, SX_CONTROL_STATUS),
85		    read_sx_reg(p, SX_ERROR));
86	}
87}
88
89static void
90CG14WaitMarker(ScreenPtr pScreen, int Marker)
91{
92	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
93	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
94
95	CG14Wait(p);
96}
97
98static Bool
99CG14PrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap,
100		int xdir, int ydir, int alu, Pixel planemask)
101{
102	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
103	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
104
105	ENTER;
106	DPRINTF(X_ERROR, "%s bpp %d rop %x\n", __func__,
107	    pSrcPixmap->drawable.bitsPerPixel, alu);
108
109	if (planemask != p->last_mask) {
110		CG14Wait(p);
111		write_sx_reg(p, SX_PLANEMASK, planemask);
112		p->last_mask = planemask;
113	}
114	alu = sx_rop[alu];
115	if (alu != p->last_rop) {
116		CG14Wait(p);
117		write_sx_reg(p, SX_ROP_CONTROL, alu);
118		p->last_rop = alu;
119	}
120	switch (pSrcPixmap->drawable.bitsPerPixel)  {
121		case 8:
122			p->pExa->Copy = CG14Copy8;
123			break;
124		case 32:
125			p->pExa->Copy = CG14Copy32;
126			break;
127		default:
128			DPRINTF(X_ERROR, "%s depth %d\n", __func__,
129			    pSrcPixmap->drawable.bitsPerPixel);
130	}
131	p->srcpitch = exaGetPixmapPitch(pSrcPixmap);
132	p->srcoff = exaGetPixmapOffset(pSrcPixmap);
133	p->xdir = xdir;
134	p->ydir = ydir;
135	return TRUE;
136}
137
138static void
139CG14Copy32(PixmapPtr pDstPixmap,
140         int srcX, int srcY, int dstX, int dstY, int w, int h)
141{
142	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
143	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
144	int dstpitch, dstoff, srcpitch, srcoff;
145	int srcstart, dststart, xinc, srcinc, dstinc;
146	int line, count, s, d, num;
147
148	ENTER;
149	dstpitch = exaGetPixmapPitch(pDstPixmap);
150	dstoff = exaGetPixmapOffset(pDstPixmap);
151	srcpitch = p->srcpitch;
152	srcoff = p->srcoff;
153	/*
154	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
155	 * actually wrote anything and only sync if it did
156	 */
157	srcstart = (srcX << 2) + (srcpitch * srcY) + srcoff;
158	dststart = (dstX << 2) + (dstpitch * dstY) + dstoff;
159
160	/*
161	 * we always copy up to 32 pixels at a time so direction doesn't
162	 * matter if w<=32
163	 */
164	if (w > 32) {
165		if (p->xdir < 0) {
166			srcstart += (w - 32) << 2;
167			dststart += (w - 32) << 2;
168			xinc = -128;
169		} else
170			xinc = 128;
171	} else
172		xinc = 128;
173	if (p->ydir < 0) {
174		srcstart += (h - 1) * srcpitch;
175		dststart += (h - 1) * dstpitch;
176		srcinc = -srcpitch;
177		dstinc = -dstpitch;
178	} else {
179		srcinc = srcpitch;
180		dstinc = dstpitch;
181	}
182	if (p->last_rop == 0xcc) {
183		/* plain old copy */
184		if ( xinc > 0) {
185			/* going left to right */
186			for (line = 0; line < h; line++) {
187				count = 0;
188				s = srcstart;
189				d = dststart;
190				while ( count < w) {
191					num = min(32, w - count);
192					sxm(SX_LD, s, 10, num - 1);
193					sxm(SX_STM, d, 10, num - 1);
194					s += xinc;
195					d += xinc;
196					count += 32;
197				}
198				srcstart += srcinc;
199				dststart += dstinc;
200			}
201		} else {
202			/* going right to left */
203			int i, chunks = (w >> 5);
204			for (line = 0; line < h; line++) {
205				s = srcstart;
206				d = dststart;
207				count = w;
208				for (i = 0; i < chunks; i++) {
209					sxm(SX_LD, s, 10, 31);
210					sxm(SX_STM, d, 10, 31);
211					s -= 128;
212					d -= 128;
213					count -= 32;
214				}
215				/* leftovers, if any */
216				if (count > 0) {
217					s += (32 - count) << 2;
218					d += (32 - count) << 2;
219					sxm(SX_LD, s, 10, count - 1);
220					sxm(SX_STM, d, 10, count - 1);
221				}
222				srcstart += srcinc;
223				dststart += dstinc;
224			}
225		}
226	} else {
227		/* ROPs needed */
228		if ( xinc > 0) {
229			/* going left to right */
230			for (line = 0; line < h; line++) {
231				count = 0;
232				s = srcstart;
233				d = dststart;
234				while ( count < w) {
235					num = min(32, w - count);
236					sxm(SX_LD, s, 10, num - 1);
237					sxm(SX_LD, d, 42, num - 1);
238					if (num > 16) {
239						sxi(SX_ROP, 10, 42, 74, 15);
240						sxi(SX_ROP, 26, 58, 90, num - 17);
241					} else {
242						sxi(SX_ROP, 10, 42, 74, num - 1);
243					}
244					sxm(SX_STM, d, 74, num - 1);
245					s += xinc;
246					d += xinc;
247					count += 32;
248				}
249				srcstart += srcinc;
250				dststart += dstinc;
251			}
252		} else {
253			/* going right to left */
254			int i, chunks = (w >> 5);
255			for (line = 0; line < h; line++) {
256				s = srcstart;
257				d = dststart;
258				count = w;
259				for (i = 0; i < chunks; i++) {
260					sxm(SX_LD, s, 10, 31);
261					sxm(SX_LD, d, 42, 31);
262					sxi(SX_ROP, 10, 42, 74, 15);
263					sxi(SX_ROP, 26, 58, 90, 15);
264					sxm(SX_STM, d, 74, 31);
265					s -= 128;
266					d -= 128;
267					count -= 32;
268				}
269				/* leftovers, if any */
270				if (count > 0) {
271					s += (32 - count) << 2;
272					d += (32 - count) << 2;
273					sxm(SX_LD, s, 10, count - 1);
274					sxm(SX_LD, d, 42, count - 1);
275					if (count > 16) {
276						sxi(SX_ROP, 10, 42, 74, 15);
277						sxi(SX_ROP, 26, 58, 90, count - 17);
278					} else {
279						sxi(SX_ROP, 10, 42, 74, count - 1);
280					}
281					sxm(SX_STM, d, 74, count - 1);
282				}
283				srcstart += srcinc;
284				dststart += dstinc;
285			}
286		}
287	}
288	exaMarkSync(pDstPixmap->drawable.pScreen);
289}
290
291/*
292 * copy with same alignment, left to right, no ROP
293 */
294static void
295CG14Copy8_aligned_norop(Cg14Ptr p, int srcstart, int dststart, int w, int h,
296    int srcpitch, int dstpitch)
297{
298	int saddr, daddr, pre, cnt, wrds;
299
300	ENTER;
301
302	pre = srcstart & 3;
303	if (pre != 0) pre = 4 - pre;
304	pre = min(pre, w);
305
306	while (h > 0) {
307		saddr = srcstart;
308		daddr = dststart;
309		cnt = w;
310		if (pre > 0) {
311			sxm(SX_LDB, saddr, 8, pre - 1);
312			sxm(SX_STB, daddr, 8, pre - 1);
313			saddr += pre;
314			daddr += pre;
315			cnt -= pre;
316			if (cnt == 0) goto next;
317		}
318		while (cnt > 3) {
319			wrds = min(32, cnt >> 2);
320			sxm(SX_LD, saddr, 8, wrds - 1);
321			sxm(SX_ST, daddr, 8, wrds - 1);
322			saddr += wrds << 2;
323			daddr += wrds << 2;
324			cnt -= wrds << 2;
325		}
326		if (cnt > 0) {
327			sxm(SX_LDB, saddr, 8, cnt - 1);
328			sxm(SX_STB, daddr, 8, cnt - 1);
329		}
330next:
331		srcstart += srcpitch;
332		dststart += dstpitch;
333		h--;
334	}
335}
336
337/*
338 * copy with same alignment, left to right, ROP
339 */
340static void
341CG14Copy8_aligned_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h,
342    int srcpitch, int dstpitch)
343{
344	int saddr, daddr, pre, cnt, wrds;
345
346	ENTER;
347
348	pre = srcstart & 3;
349	if (pre != 0) pre = 4 - pre;
350	pre = min(pre, w);
351
352	while (h > 0) {
353		saddr = srcstart;
354		daddr = dststart;
355		cnt = w;
356		if (pre > 0) {
357			sxm(SX_LDB, saddr, 8, pre - 1);
358			sxm(SX_LDB, daddr, 40, pre - 1);
359			sxi(SX_ROP, 8, 40, 72, pre - 1);
360			sxm(SX_STB, daddr, 72, pre - 1);
361			saddr += pre;
362			daddr += pre;
363			cnt -= pre;
364			if (cnt == 0) goto next;
365		}
366		while (cnt > 3) {
367			wrds = min(32, cnt >> 2);
368			sxm(SX_LD, saddr, 8, wrds - 1);
369			sxm(SX_LD, daddr, 40, wrds - 1);
370			if (cnt > 16) {
371				sxi(SX_ROP, 8, 40, 72, 15);
372				sxi(SX_ROP, 8, 56, 88, wrds - 17);
373			} else
374				sxi(SX_ROP, 8, 40, 72, wrds - 1);
375			sxm(SX_ST, daddr, 72, wrds - 1);
376			saddr += wrds << 2;
377			daddr += wrds << 2;
378			cnt -= wrds << 2;
379		}
380		if (cnt > 0) {
381			sxm(SX_LDB, saddr, 8, cnt - 1);
382			sxm(SX_LDB, daddr, 40, cnt - 1);
383			sxi(SX_ROP, 8, 40, 72, cnt - 1);
384			sxm(SX_STB, daddr, 72, cnt - 1);
385		}
386next:
387		srcstart += srcpitch;
388		dststart += dstpitch;
389		h--;
390	}
391}
392
393/* up to 124 pixels so direction doesn't matter, unaligned, ROP */
394static void
395CG14Copy8_short_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
396{
397	int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
398	int ssreg;
399#ifdef DEBUG
400	int taddr = 4 + dstpitch * 50;
401#endif
402	uint32_t lmask, rmask;
403	ENTER;
404
405	pre = dststart & 3;
406	lmask = 0xffffffff >> pre;
407	spre = srcstart & 3;
408	/*
409	 * make sure we count all the words needed to cover the destination
410	 * line, covering potential partials on both ends
411	 */
412	wrds = (w + pre + 3) >> 2;
413	swrds = (w + spre + 3) >> 2;
414
415	if (spre < pre) {
416		dist = 32 - (pre - spre) * 8;
417		sreg = 9;
418	} else {
419		dist = (spre - pre) * 8;
420		sreg = 8;
421	}
422
423	/*
424	 * mask out trailing pixels to avoid partial writes
425	 */
426	post = (dststart + w) & 3;
427	if (post != 0) {
428		rmask = ~(0xffffffff >> (post * 8));
429		write_sx_reg(p, SX_QUEUED(7), rmask);
430		write_sx_reg(p, SX_QUEUED(6), ~rmask);
431	}
432
433	DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
434	    w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);
435
436	/* mask out the leading pixels in dst by using a mask and ROP */
437	if (pre != 0) {
438		CG14Wait(p);
439		write_sx_reg(p, SX_ROP_CONTROL, (p->last_rop & 0xf0) | 0xa);
440		write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
441	}
442
443	saddr = srcstart & ~3;
444	daddr = dststart & ~3;
445
446	while (h > 0) {
447		sxm(SX_LD, daddr, 80, wrds - 1);
448		sxm(SX_LD, saddr, sreg, swrds - 1);
449		if (wrds > 15) {
450			if (dist != 0) {
451				sxi(SX_FUNNEL_I, 8, dist, 40, 15);
452				sxi(SX_FUNNEL_I, 24, dist, 56, wrds - 16);
453				/* shifted source pixels are now at register 40+ */
454				ssreg = 40;
455			} else ssreg = 8;
456			if (pre != 0) {
457				/* mask out leading junk */
458				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
459				sxi(SX_ROPB, ssreg, 80, 8, 0);
460				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
461				sxi(SX_ROPB, ssreg + 1, 81, 9, 14);
462			} else {
463				sxi(SX_ROPB, ssreg, 80, 8, 15);
464			}
465			sxi(SX_ROPB, ssreg + 16, 96, 24, wrds - 16);
466		} else {
467			if (dist != 0) {
468				sxi(SX_FUNNEL_I, 8, dist, 40, wrds);
469				ssreg = 40;
470			} else ssreg = 8;
471			if (pre != 0) {
472				/* mask out leading junk */
473				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
474				sxi(SX_ROPB, ssreg, 80, 8, 0);
475				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
476				sxi(SX_ROPB, ssreg + 1, 81, 9, wrds);
477			} else {
478				sxi(SX_ROPB, ssreg, 80, 8, wrds);
479			}
480		}
481		if (post != 0) {
482			/*
483			 * if the last word to be written out is a partial we
484			 * mask out the leftovers and replace them with
485			 * background pixels
486			 * we could pull the same ROP * mask trick as we do on
487			 * the left end but it's less annoying this way and
488			 * the instruction count is the same
489			 */
490			sxi(SX_ANDS, 7 + wrds, 7, 5, 0);
491			sxi(SX_ANDS, 79 + wrds, 6, 4, 0);
492			sxi(SX_ORS, 5, 4, 7 + wrds, 0);
493		}
494#ifdef DEBUG
495		sxm(SX_ST, taddr, 40, wrds - 1);
496		taddr += dstpitch;
497#endif
498		sxm(SX_ST, daddr, 8, wrds - 1);
499		saddr += srcpitch;
500		daddr += dstpitch;
501		h--;
502	}
503}
504
505/* up to 124 pixels so direction doesn't matter, unaligned, straight copy */
506static void
507CG14Copy8_short_norop(Cg14Ptr p, int srcstart, int dststart, int w, int h,
508    int srcpitch, int dstpitch)
509{
510	int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
511	int ssreg;
512#ifdef DEBUG
513	int taddr = 4 + dstpitch * 50;
514#endif
515	uint32_t lmask, rmask;
516	ENTER;
517
518	pre = dststart & 3;
519	lmask = 0xffffffff >> pre;
520	spre = srcstart & 3;
521	/*
522	 * make sure we count all the words needed to cover the destination
523	 * line, covering potential partials on both ends
524	 */
525	wrds = (w + pre + 3) >> 2;
526	swrds = (w + spre + 3) >> 2;
527
528	if (spre < pre) {
529		dist = 32 - (pre - spre) * 8;
530		sreg = 9;
531	} else {
532		dist = (spre - pre) * 8;
533		sreg = 8;
534	}
535
536	/*
537	 * mask out trailing pixels to avoid partial writes
538	 */
539	post = (dststart + w) & 3;
540	if (post != 0) {
541		rmask = ~(0xffffffff >> (post * 8));
542		write_sx_reg(p, SX_QUEUED(7), rmask);
543		write_sx_reg(p, SX_QUEUED(6), ~rmask);
544	}
545
546	DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
547	    w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);
548
549	/* mask out the leading pixels in dst by using a mask and ROP */
550	if (pre != 0) {
551		CG14Wait(p);
552		write_sx_reg(p, SX_ROP_CONTROL, 0xca);
553		write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
554	}
555
556	saddr = srcstart & ~3;
557	daddr = dststart & ~3;
558
559	while (h > 0) {
560		sxm(SX_LD, saddr, sreg, swrds - 1);
561		if (wrds > 15) {
562			if (dist != 0) {
563				sxi(SX_FUNNEL_I, 8, dist, 40, 15);
564				sxi(SX_FUNNEL_I, 24, dist, 56, wrds - 16);
565				/* shifted source pixels are now at reg 40+ */
566				ssreg = 40;
567			} else ssreg = 8;
568			if (pre != 0) {
569				/* read only the first word */
570				sxm(SX_LD, daddr, 80, 0);
571				/* mask out leading junk */
572				sxi(SX_ROPB, ssreg, 80, ssreg, 0);
573			}
574		} else {
575			if (dist != 0) {
576				sxi(SX_FUNNEL_I, 8, dist, 40, wrds);
577				ssreg = 40;
578			} else ssreg = 8;
579			if (pre != 0) {
580				/* read only the first word */
581				sxm(SX_LD, daddr, 80, 0);
582				/* mask out leading junk */
583				sxi(SX_ROPB, ssreg, 80, ssreg, 0);
584			}
585		}
586		if (post != 0) {
587			int laddr = daddr + ((wrds - 1) << 2);
588			/*
589			 * if the last word to be written out is a partial we
590			 * mask out the leftovers and replace them with
591			 * background pixels
592			 * we could pull the same ROP * mask trick as we do on
593			 * the left end but it's less annoying this way and
594			 * the instruction count is the same
595			 */
596			sxm(SX_LD, laddr, 81, 0);
597			sxi(SX_ANDS, ssreg + wrds - 1, 7, 5, 0);
598			sxi(SX_ANDS, 81, 6, 4, 0);
599			sxi(SX_ORS, 5, 4, ssreg + wrds - 1, 0);
600		}
601#ifdef DEBUG
602		sxm(SX_ST, taddr, 40, wrds - 1);
603		taddr += dstpitch;
604#endif
605		sxm(SX_ST, daddr, ssreg, wrds - 1);
606		saddr += srcpitch;
607		daddr += dstpitch;
608		h--;
609	}
610}
611
612static void
613CG14Copy8(PixmapPtr pDstPixmap,
614         int srcX, int srcY, int dstX, int dstY, int w, int h)
615{
616	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
617	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
618	int dstpitch, dstoff, srcpitch, srcoff;
619	int srcstart, dststart, xinc, srcinc, dstinc;
620	int line, count, s, d, num;
621
622	ENTER;
623	dstpitch = exaGetPixmapPitch(pDstPixmap);
624	dstoff = exaGetPixmapOffset(pDstPixmap);
625	srcpitch = p->srcpitch;
626	srcoff = p->srcoff;
627	/*
628	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
629	 * actually wrote anything and only sync if it did
630	 */
631	srcstart = srcX + (srcpitch * srcY) + srcoff;
632	dststart = dstX + (dstpitch * dstY) + dstoff;
633
634	if (p->ydir < 0) {
635		srcstart += (h - 1) * srcpitch;
636		dststart += (h - 1) * dstpitch;
637		srcinc = -srcpitch;
638		dstinc = -dstpitch;
639	} else {
640		srcinc = srcpitch;
641		dstinc = dstpitch;
642	}
643
644	/*
645	 * this copies up to 124 pixels wide in one go, so horizontal
646	 * direction / overlap don't matter
647	 * uses all 32bit accesses and funnel shifter for unaligned copies
648	 */
649	if ((w < 125) && (w > 8)) {
650		switch (p->last_rop) {
651			case 0xcc:
652				CG14Copy8_short_norop(p,
653				    srcstart, dststart, w, h, srcinc, dstinc);
654				break;
655			default:
656				CG14Copy8_short_rop(p,
657				    srcstart, dststart, w, h, srcinc, dstinc);
658		}
659		return;
660	}
661
662	/*
663	 * only invert x direction if absolutely necessary, it's a pain to
664	 * go backwards on SX so avoid as much as possible
665	 */
666	if ((p->xdir < 0) && (srcoff == dstoff) && (srcY == dstY)) {
667		xinc = -32;
668	} else
669		xinc = 32;
670
671	/*
672	 * for aligned copies we can go all 32bit and avoid VRAM reads in the
673	 * most common case
674	 */
675	if (((srcstart & 3) == (dststart & 3)) && (xinc > 0)) {
676		switch (p->last_rop) {
677			case 0xcc:
678				CG14Copy8_aligned_norop(p,
679				    srcstart, dststart, w, h, srcinc, dstinc);
680				break;
681			default:
682				CG14Copy8_aligned_rop(p,
683				    srcstart, dststart, w, h, srcinc, dstinc);
684		}
685		return;
686	}
687
688	/*
689	 * if we make it here we either have something large and unaligned,
690	 * something we need to do right to left, or something tiny.
691	 * we handle the non-tiny cases by breaking them down into chunks that
692	 * Copy8_short_*() can handle, making sure the destinations are 32bit
693	 * aligned whenever possible
694	 * since we copy by block, not by line we need to go backwards even if
695	 * we don't copy within the same line
696	 */
697	if (w > 8) {
698		int next, wi, end = dststart + w;
699		DPRINTF(X_ERROR, "%s %08x %08x %d\n",
700		    __func__, srcstart, dststart, w);
701		if ((p->xdir < 0) && (srcoff == dstoff)) {
702			srcstart += w;
703			next = max((end - 120) & ~3, dststart);
704			wi = end - next;
705			srcstart -= wi;
706			while (wi > 0) {
707				DPRINTF(X_ERROR, "%s RL %08x %08x %d\n",
708				    __func__, srcstart, next, wi);
709				if (p->last_rop == 0xcc) {
710					CG14Copy8_short_norop(p, srcstart,
711					    next, wi, h, srcinc, dstinc);
712				} else
713					CG14Copy8_short_rop(p, srcstart,
714					    next, wi, h, srcinc, dstinc);
715				end = next;
716				/*
717				 * avoid extremely narrow copies so I don't
718				 * have to deal with dangling start and end
719				 * pixels in the same word
720				 */
721				if ((end - dststart) < 140) {
722					next = max((end - 80) & ~3, dststart);
723				} else {
724					next = max((end - 120) & ~3, dststart);
725				}
726				wi = end - next;
727				srcstart -= wi;
728			}
729		} else {
730			next = min(end, (dststart + 124) & ~3);
731			wi = next - dststart;
732			while (wi > 0) {
733				DPRINTF(X_ERROR, "%s LR %08x %08x %d\n",
734				    __func__, srcstart, next, wi);
735				if (p->last_rop == 0xcc) {
736					CG14Copy8_short_norop(p,
737					    srcstart, dststart, wi, h,
738					    srcinc, dstinc);
739				} else
740					CG14Copy8_short_rop(p,
741					    srcstart, dststart, wi, h,
742					    srcinc, dstinc);
743				srcstart += wi;
744				dststart = next;
745				if ((end - dststart) < 140) {
746					next = min(end, (dststart + 84) & ~3);
747				} else {
748					next = min(end, (dststart + 124) & ~3);
749				}
750				wi = next - dststart;
751			}
752		}
753		return;
754	}
755	if (xinc < 0) {
756		srcstart += (w - 32);
757		dststart += (w - 32);
758	}
759
760	DPRINTF(X_ERROR, "%s fallback to byte-wise %d %d\n", __func__, w, h);
761	if (p->last_rop == 0xcc) {
762		/* plain old copy */
763		if ( xinc > 0) {
764			/* going left to right */
765			for (line = 0; line < h; line++) {
766				count = 0;
767				s = srcstart;
768				d = dststart;
769				while ( count < w) {
770					num = min(32, w - count);
771					sxm(SX_LDB, s, 10, num - 1);
772					sxm(SX_STBM, d, 10, num - 1);
773					s += xinc;
774					d += xinc;
775					count += 32;
776				}
777				srcstart += srcinc;
778				dststart += dstinc;
779			}
780		} else {
781			/* going right to left */
782			int i, chunks = (w >> 5);
783			for (line = 0; line < h; line++) {
784				s = srcstart;
785				d = dststart;
786				count = w;
787				for (i = 0; i < chunks; i++) {
788					sxm(SX_LDB, s, 10, 31);
789					sxm(SX_STBM, d, 10, 31);
790					s -= 32;
791					d -= 32;
792					count -= 32;
793				}
794				/* leftovers, if any */
795				if (count > 0) {
796					s += (32 - count);
797					d += (32 - count);
798					sxm(SX_LDB, s, 10, count - 1);
799					sxm(SX_STBM, d, 10, count - 1);
800				}
801				srcstart += srcinc;
802				dststart += dstinc;
803			}
804		}
805	} else {
806		/* ROPs needed */
807		if ( xinc > 0) {
808			/* going left to right */
809			for (line = 0; line < h; line++) {
810				count = 0;
811				s = srcstart;
812				d = dststart;
813				while ( count < w) {
814					num = min(32, w - count);
815					sxm(SX_LDB, s, 10, num - 1);
816					sxm(SX_LDB, d, 42, num - 1);
817					if (num > 16) {
818						sxi(SX_ROP, 10, 42, 74, 15);
819						sxi(SX_ROP, 26, 58, 90, num - 17);
820					} else {
821						sxi(SX_ROP, 10, 42, 74, num - 1);
822					}
823					sxm(SX_STBM, d, 74, num - 1);
824					s += xinc;
825					d += xinc;
826					count += 32;
827				}
828				srcstart += srcinc;
829				dststart += dstinc;
830			}
831		} else {
832			/* going right to left */
833			int i, chunks = (w >> 5);
834			for (line = 0; line < h; line++) {
835				s = srcstart;
836				d = dststart;
837				count = w;
838				for (i = 0; i < chunks; i++) {
839					sxm(SX_LDB, s, 10, 31);
840					sxm(SX_LDB, d, 42, 31);
841					sxi(SX_ROP, 10, 42, 74, 15);
842					sxi(SX_ROP, 26, 58, 90, 15);
843					sxm(SX_STBM, d, 74, 31);
844					s -= 128;
845					d -= 128;
846					count -= 32;
847				}
848				/* leftovers, if any */
849				if (count > 0) {
850					s += (32 - count);
851					d += (32 - count);
852					sxm(SX_LDB, s, 10, count - 1);
853					sxm(SX_LDB, d, 42, count - 1);
854					if (count > 16) {
855						sxi(SX_ROP, 10, 42, 74, 15);
856						sxi(SX_ROP, 26, 58, 90, count - 17);
857					} else {
858						sxi(SX_ROP, 10, 42, 74, count - 1);
859					}
860					sxm(SX_STBM, d, 74, count - 1);
861				}
862				srcstart += srcinc;
863				dststart += dstinc;
864			}
865		}
866	}
867	exaMarkSync(pDstPixmap->drawable.pScreen);
868}
869
870static void
871CG14DoneCopy(PixmapPtr pDstPixmap)
872{
873}
874
875static Bool
876CG14PrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
877{
878	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
879	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
880
881	ENTER;
882	DPRINTF(X_ERROR, "bits per pixel: %d %08lx\n",
883	    pPixmap->drawable.bitsPerPixel, fg);
884
885	/*
886	 * GXset and GXclear are really just specual cases of GXcopy with
887	 * fixed fill colour
888	 */
889	switch (alu) {
890		case GXclear:
891			alu = GXcopy;
892			fg = 0;
893			break;
894		case GXset:
895			alu = GXcopy;
896			fg = 0xffffffff;
897			break;
898	}
899	/* repeat the colour in every sub byte if we're in 8 bit */
900	if (pPixmap->drawable.bitsPerPixel == 8) {
901		fg |= fg << 8;
902		fg |= fg << 16;
903	}
904	write_sx_reg(p, SX_QUEUED(8), fg);
905	write_sx_reg(p, SX_QUEUED(9), fg);
906	if (planemask != p->last_mask) {
907		CG14Wait(p);
908		write_sx_reg(p, SX_PLANEMASK, planemask);
909		p->last_mask = planemask;
910	}
911	alu = sx_rop[alu];
912	if (alu != p->last_rop) {
913		CG14Wait(p);
914		write_sx_reg(p, SX_ROP_CONTROL, alu);
915		p->last_rop = alu;
916	}
917
918	DPRINTF(X_ERROR, "%s: %x\n", __func__, alu);
919	return TRUE;
920}
921
922static void
923CG14Solid32(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
924{
925	int line, x, num;
926	uint32_t ptr;
927
928	ENTER;
929	if (p->last_rop == 0xcc) {
930		/* simple fill */
931		for (line = 0; line < h; line++) {
932			x = 0;
933			while (x < w) {
934				ptr = start + (x << 2);
935				num = min(32, w - x);
936				sxm(SX_STS, ptr, 8, num - 1);
937				x += 32;
938			}
939			start += pitch;
940		}
941	} else if (p->last_rop == 0xaa) {
942		/* nothing to do here */
943		return;
944	} else {
945		/* alright, let's do actual ROP stuff */
946
947		/* first repeat the fill colour into 16 registers */
948		sxi(SX_SELECT_S, 8, 8, 10, 15);
949
950		for (line = 0; line < h; line++) {
951			x = 0;
952			while (x < w) {
953				ptr = start + (x << 2);
954				num = min(32, w - x);
955				/* now suck fb data into registers */
956				sxm(SX_LD, ptr, 42, num - 1);
957				/*
958				 * ROP them with the fill data we left in 10
959				 * non-memory ops can only have counts up to 16
960				 */
961				if (num <= 16) {
962					sxi(SX_ROP, 10, 42, 74, num - 1);
963				} else {
964					sxi(SX_ROP, 10, 42, 74, 15);
965					sxi(SX_ROP, 10, 58, 90, num - 17);
966				}
967				/* and write the result back into memory */
968				sxm(SX_ST, ptr, 74, num - 1);
969				x += 32;
970			}
971			start += pitch;
972		}
973	}
974}
975
976static void
977CG14Solid8(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
978{
979	int line, num, pre, cnt;
980	uint32_t ptr;
981
982	ENTER;
983	pre = start & 3;
984	if (pre != 0) pre = 4 - pre;
985
986	if (p->last_rop == 0xcc) {
987		/* simple fill */
988		for (line = 0; line < h; line++) {
989			ptr = start;
990			cnt = w;
991			pre = min(pre, cnt);
992			if (pre) {
993				sxm(SX_STBS, ptr, 8, pre - 1);
994				ptr += pre;
995				cnt -= pre;
996				if (cnt == 0) goto next;
997			}
998			/* now do the aligned pixels in 32bit chunks */
999			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
1000			while(cnt > 3) {
1001				num = min(32, cnt >> 2);
1002				sxm(SX_STS, ptr, 8, num - 1);
1003				ptr += num << 2;
1004				cnt -= num << 2;
1005			}
1006			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
1007			if (cnt > 0) {
1008				sxm(SX_STBS, ptr, 8, cnt - 1);
1009			}
1010			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
1011next:
1012			start += pitch;
1013		}
1014	} else if (p->last_rop == 0xaa) {
1015		/* nothing to do here */
1016		return;
1017	} else {
1018		/* alright, let's do actual ROP stuff */
1019
1020		/* first repeat the fill colour into 16 registers */
1021		sxi(SX_SELECT_S, 8, 8, 10, 15);
1022
1023		for (line = 0; line < h; line++) {
1024			ptr = start;
1025			cnt = w;
1026			pre = min(pre, cnt);
1027			if (pre) {
1028				sxm(SX_LDB, ptr, 26, pre - 1);
1029				sxi(SX_ROP, 10, 26, 42, pre - 1);
1030				sxm(SX_STB, ptr, 42, pre - 1);
1031				ptr += pre;
1032				cnt -= pre;
1033				if (cnt == 0) goto next2;
1034			}
1035			/* now do the aligned pixels in 32bit chunks */
1036			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
1037			while(cnt > 3) {
1038				num = min(32, cnt >> 2);
1039				sxm(SX_LD, ptr, 26, num - 1);
1040				if (num <= 16) {
1041					sxi(SX_ROP, 10, 26, 58, num - 1);
1042				} else {
1043					sxi(SX_ROP, 10, 26, 58, 15);
1044					sxi(SX_ROP, 10, 42, 74, num - 17);
1045				}
1046				sxm(SX_ST, ptr, 58, num - 1);
1047				ptr += num << 2;
1048				cnt -= num << 2;
1049			}
1050			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
1051			if (cnt > 0) {
1052				sxm(SX_LDB, ptr, 26, cnt - 1);
1053				sxi(SX_ROP, 10, 26, 42, cnt - 1);
1054				sxm(SX_STB, ptr, 42, cnt - 1);
1055			}
1056			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
1057next2:
1058			start += pitch;
1059		}
1060	}
1061}
1062
1063static void
1064CG14Solid(PixmapPtr pPixmap, int x1, int y1, int x2, int y2)
1065{
1066	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
1067	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1068	int w = x2 - x1, h = y2 - y1, dstoff, dstpitch;
1069	int start, depth;
1070
1071	ENTER;
1072	dstpitch = exaGetPixmapPitch(pPixmap);
1073	dstoff = exaGetPixmapOffset(pPixmap);
1074
1075	depth = pPixmap->drawable.bitsPerPixel;
1076	switch (depth) {
1077		case 32:
1078			start = dstoff + (y1 * dstpitch) + (x1 << 2);
1079			CG14Solid32(p, start, dstpitch, w, h);
1080			break;
1081		case 8:
1082			start = dstoff + (y1 * dstpitch) + x1;
1083			CG14Solid8(p, start, dstpitch, w, h);
1084			break;
1085	}
1086
1087	DPRINTF(X_ERROR, "Solid %d %d %d %d, %d %d -> %d\n", x1, y1, x2, y2,
1088	    dstpitch, dstoff, start);
1089	DPRINTF(X_ERROR, "%x %x %x\n", p->last_rop,
1090	    read_sx_reg(p, SX_QUEUED(8)), read_sx_reg(p, SX_QUEUED(9)));
1091	exaMarkSync(pPixmap->drawable.pScreen);
1092}
1093
1094/*
1095 * Memcpy-based UTS.
1096 */
1097static Bool
1098CG14UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
1099    char *src, int src_pitch)
1100{
1101	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
1102	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1103	char  *dst        = p->fb + exaGetPixmapOffset(pDst);
1104	int    dst_pitch  = exaGetPixmapPitch(pDst);
1105
1106	int bpp    = pDst->drawable.bitsPerPixel;
1107	int cpp    = (bpp + 7) >> 3;
1108	int wBytes = w * cpp;
1109
1110	ENTER;
1111	DPRINTF(X_ERROR, "%s depth %d\n", __func__, bpp);
1112	dst += (x * cpp) + (y * dst_pitch);
1113
1114	CG14Wait(p);
1115
1116	while (h--) {
1117		memcpy(dst, src, wBytes);
1118		src += src_pitch;
1119		dst += dst_pitch;
1120	}
1121	__asm("stbar;");
1122	return TRUE;
1123}
1124
1125/*
1126 * Memcpy-based DFS.
1127 */
1128static Bool
1129CG14DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
1130    char *dst, int dst_pitch)
1131{
1132	ScrnInfoPtr pScrn = xf86Screens[pSrc->drawable.pScreen->myNum];
1133	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1134	char  *src        = p->fb + exaGetPixmapOffset(pSrc);
1135	int    src_pitch  = exaGetPixmapPitch(pSrc);
1136
1137	ENTER;
1138	int bpp    = pSrc->drawable.bitsPerPixel;
1139	int cpp    = (bpp + 7) >> 3;
1140	int wBytes = w * cpp;
1141
1142	src += (x * cpp) + (y * src_pitch);
1143
1144	CG14Wait(p);
1145
1146	while (h--) {
1147		memcpy(dst, src, wBytes);
1148		src += src_pitch;
1149		dst += dst_pitch;
1150	}
1151
1152	return TRUE;
1153}
1154
1155Bool
1156CG14CheckComposite(int op, PicturePtr pSrcPicture,
1157                           PicturePtr pMaskPicture,
1158                           PicturePtr pDstPicture)
1159{
1160	int i, ok = FALSE;
1161
1162	ENTER;
1163
1164	/*
1165	 * SX is in theory capable of accelerating pretty much all Xrender ops,
1166	 * even coordinate transformation and gradients. Support will be added
1167	 * over time and likely have to spill over into its own source file.
1168	 */
1169
1170	if ((op != PictOpOver) && (op != PictOpAdd)/* && (op != PictOpSrc)*/) {
1171		DPRINTF(X_ERROR, "%s: rejecting %d\n", __func__, op);
1172		return FALSE;
1173	}
1174
1175	if (pSrcPicture != NULL) {
1176		i = 0;
1177		while ((i < arraysize(src_formats)) && (!ok)) {
1178			ok =  (pSrcPicture->format == src_formats[i]);
1179			i++;
1180		}
1181
1182		if (!ok) {
1183			DPRINTF(X_ERROR, "%s: unsupported src format %x\n",
1184			    __func__, pSrcPicture->format);
1185			return FALSE;
1186		}
1187		DPRINTF(X_ERROR, "src is %x, %d\n", pSrcPicture->format, op);
1188	}
1189
1190	if (pDstPicture != NULL) {
1191		i = 0;
1192		ok = FALSE;
1193		while ((i < arraysize(src_formats)) && (!ok)) {
1194			ok =  (pDstPicture->format == src_formats[i]);
1195			i++;
1196		}
1197
1198		if (!ok) {
1199			DPRINTF(X_ERROR, "%s: unsupported dst format %x\n",
1200			    __func__, pDstPicture->format);
1201			return FALSE;
1202		}
1203		DPRINTF(X_ERROR, "dst is %x, %d\n", pDstPicture->format, op);
1204	}
1205
1206	if (pMaskPicture != NULL) {
1207		DPRINTF(X_ERROR, "mask is %x %d %d\n", pMaskPicture->format,
1208		    pMaskPicture->pDrawable->width,
1209		    pMaskPicture->pDrawable->height);
1210	}
1211	return TRUE;
1212}
1213
1214Bool
1215CG14PrepareComposite(int op, PicturePtr pSrcPicture,
1216                             PicturePtr pMaskPicture,
1217                             PicturePtr pDstPicture,
1218                             PixmapPtr  pSrc,
1219                             PixmapPtr  pMask,
1220                             PixmapPtr  pDst)
1221{
1222	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
1223	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1224
1225	ENTER;
1226
1227	p->no_source_pixmap = FALSE;
1228	p->source_is_solid = FALSE;
1229
1230	if (pSrcPicture->format == PICT_a1) {
1231		DPRINTF(X_ERROR, "src mono, dst %x, op %d\n",
1232		    pDstPicture->format, op);
1233		if (pMaskPicture != NULL) {
1234			DPRINTF(X_ERROR, "msk %x\n", pMaskPicture->format);
1235		}
1236	}
1237	if (pSrcPicture->pSourcePict != NULL) {
1238		if (pSrcPicture->pSourcePict->type == SourcePictTypeSolidFill) {
1239			p->fillcolour =
1240			    pSrcPicture->pSourcePict->solidFill.color;
1241			DPRINTF(X_ERROR, "%s: solid src %08x\n",
1242			    __func__, p->fillcolour);
1243			p->no_source_pixmap = TRUE;
1244			p->source_is_solid = TRUE;
1245		}
1246	}
1247	if ((pMaskPicture != NULL) && (pMaskPicture->pSourcePict != NULL)) {
1248		if (pMaskPicture->pSourcePict->type ==
1249		    SourcePictTypeSolidFill) {
1250			p->fillcolour =
1251			   pMaskPicture->pSourcePict->solidFill.color;
1252			DPRINTF(X_ERROR, "%s: solid mask %08x\n",
1253			    __func__, p->fillcolour);
1254		}
1255	}
1256	if (pMaskPicture != NULL) {
1257		p->mskoff = exaGetPixmapOffset(pMask);
1258		p->mskpitch = exaGetPixmapPitch(pMask);
1259		p->mskformat = pMaskPicture->format;
1260	} else {
1261		p->mskoff = 0;
1262		p->mskpitch = 0;
1263		p->mskformat = 0;
1264	}
1265	if (pSrc != NULL) {
1266		p->source_is_solid =
1267		   ((pSrc->drawable.width == 1) && (pSrc->drawable.height == 1));
1268		p->srcoff = exaGetPixmapOffset(pSrc);
1269		p->srcpitch = exaGetPixmapPitch(pSrc);
1270		if (p->source_is_solid) {
1271			p->fillcolour = *(uint32_t *)(p->fb + p->srcoff);
1272		}
1273	}
1274	p->srcformat = pSrcPicture->format;
1275	p->dstformat = pDstPicture->format;
1276
1277	if (p->source_is_solid) {
1278		uint32_t temp;
1279
1280		/* stuff source colour into SX registers, swap as needed */
1281		temp = p->fillcolour;
1282		DPRINTF(X_ERROR, "solid %08x\n", temp);
1283		switch (p->srcformat) {
1284			case PICT_a8r8g8b8:
1285			case PICT_x8r8g8b8:
1286				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
1287				temp = temp >> 8;
1288				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
1289				temp = temp >> 8;
1290				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
1291				break;
1292			case PICT_a8b8g8r8:
1293			case PICT_x8b8g8r8:
1294				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
1295				temp = temp >> 8;
1296				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
1297				temp = temp >> 8;
1298				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
1299				break;
1300		}
1301		write_sx_reg(p, SX_QUEUED(8), 0xff);
1302	}
1303	p->op = op;
1304	if (op == PictOpSrc) {
1305		if (pSrc == NULL) {
1306			DPRINTF(X_ERROR, "src type %d\n", pSrcPicture->pSourcePict->type);
1307			return FALSE;
1308		}
1309		CG14PrepareCopy(pSrc, pDst, 1, 1, GXcopy, 0xffffffff);
1310	}
1311#ifdef SX_DEBUG
1312	DPRINTF(X_ERROR, "%x %x -> %x\n", p->srcoff, p->mskoff,
1313	    *(uint32_t *)(p->fb + p->srcoff));
1314#endif
1315	return TRUE;
1316}
1317
1318void
1319CG14Composite(PixmapPtr pDst, int srcX, int srcY,
1320                              int maskX, int maskY,
1321                              int dstX, int dstY,
1322                              int width, int height)
1323{
1324	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
1325	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1326	uint32_t dstoff, dstpitch;
1327	uint32_t dst, msk, src;
1328	int flip = 0;
1329
1330	ENTER;
1331	dstoff = exaGetPixmapOffset(pDst);
1332	dstpitch = exaGetPixmapPitch(pDst);
1333
1334	flip = (PICT_FORMAT_TYPE(p->srcformat) !=
1335		PICT_FORMAT_TYPE(p->dstformat));
1336
1337	switch (p->op) {
1338		case PictOpOver:
1339			dst = dstoff + (dstY * dstpitch) + (dstX << 2);
1340			DPRINTF(X_ERROR, "Over %08x %08x, %d %d\n",
1341			    p->mskformat, p->dstformat, srcX, srcY);
1342			if (p->source_is_solid) {
1343				switch (p->mskformat) {
1344					case PICT_a8:
1345						msk = p->mskoff +
1346						    (maskY * p->mskpitch) +
1347						    maskX;
1348						CG14Comp_Over8Solid(p,
1349						    msk, p->mskpitch,
1350						    dst, dstpitch,
1351						    width, height);
1352						break;
1353					case PICT_a8r8g8b8:
1354					case PICT_a8b8g8r8:
1355						msk = p->mskoff +
1356						    (maskY * p->mskpitch) +
1357						    (maskX << 2);
1358						CG14Comp_Over32Solid(p,
1359						    msk, p->mskpitch,
1360						    dst, dstpitch,
1361						    width, height);
1362						break;
1363					default:
1364						xf86Msg(X_ERROR,
1365						  "unsupported mask format %08x\n", p->mskformat);
1366				}
1367			} else {
1368				DPRINTF(X_ERROR, "non-solid over with msk %x\n",
1369				    p->mskformat);
1370				switch (p->srcformat) {
1371					case PICT_a8r8g8b8:
1372					case PICT_a8b8g8r8:
1373						src = p->srcoff +
1374						    (srcY * p->srcpitch) +
1375						    (srcX << 2);
1376						dst = dstoff +
1377						    (dstY * dstpitch) +
1378						    (dstX << 2);
1379						if (p->mskformat == PICT_a8) {
1380							msk = p->mskoff +
1381							    (maskY * p->mskpitch) +
1382							    maskX;
1383							CG14Comp_Over32Mask(p,
1384							    src, p->srcpitch,
1385							    msk, p->mskpitch,
1386							    dst, dstpitch,
1387							    width, height, flip);
1388						} else {
1389							CG14Comp_Over32(p,
1390							    src, p->srcpitch,
1391							    dst, dstpitch,
1392							    width, height, flip);
1393						}
1394						break;
1395					case PICT_x8r8g8b8:
1396					case PICT_x8b8g8r8:
1397						src = p->srcoff +
1398						    (srcY * p->srcpitch) +
1399						    (srcX << 2);
1400						dst = dstoff +
1401						    (dstY * dstpitch) +
1402						    (dstX << 2);
1403						if (p->mskformat == PICT_a8) {
1404							msk = p->mskoff +
1405							    (maskY * p->mskpitch) +
1406							    maskX;
1407							CG14Comp_Over32Mask_noalpha(p,
1408							    src, p->srcpitch,
1409							    msk, p->mskpitch,
1410							    dst, dstpitch,
1411							    width, height, flip);
1412						} else if ((p->mskformat == PICT_a8r8g8b8) ||
1413							   (p->mskformat == PICT_a8b8g8r8)) {
1414							msk = p->mskoff +
1415							    (maskY * p->mskpitch) +
1416							    (maskX << 2);
1417							CG14Comp_Over32Mask32_noalpha(p,
1418							    src, p->srcpitch,
1419							    msk, p->mskpitch,
1420							    dst, dstpitch,
1421							    width, height, flip);
1422						} else {
1423							xf86Msg(X_ERROR, "no src alpha, mask is %x\n", p->mskformat);
1424						}
1425						break;
1426					default:
1427						xf86Msg(X_ERROR, "%s: format %x in non-solid Over op\n",
1428						    __func__, p->srcformat);
1429				}
1430			}
1431			break;
1432		case PictOpAdd:
1433			DPRINTF(X_ERROR, "Add %08x %08x\n",
1434			    p->srcformat, p->dstformat);
1435			switch (p->srcformat) {
1436				case PICT_a8:
1437					src = p->srcoff +
1438					    (srcY * p->srcpitch) + srcX;
1439					if (p->dstformat == PICT_a8) {
1440						dst = dstoff +
1441						      (dstY * dstpitch) + dstX;
1442						CG14Comp_Add8(p,
1443						    src, p->srcpitch,
1444						    dst, dstpitch,
1445						    width, height);
1446					} else {
1447						dst = dstoff +
1448						      (dstY * dstpitch) +
1449						      (dstX << 2);
1450						CG14Comp_Add8_32(p,
1451						    src, p->srcpitch,
1452						    dst, dstpitch,
1453						    width, height);
1454					}
1455					break;
1456				case PICT_a8r8g8b8:
1457				case PICT_x8r8g8b8:
1458					src = p->srcoff +
1459					    (srcY * p->srcpitch) + (srcX << 2);
1460					dst = dstoff + (dstY * dstpitch) +
1461					    (dstX << 2);
1462					CG14Comp_Add32(p, src, p->srcpitch,
1463					    dst, dstpitch, width, height);
1464					break;
1465				default:
1466					xf86Msg(X_ERROR,
1467					    "unsupported src format\n");
1468			}
1469			break;
1470		case PictOpSrc:
1471			DPRINTF(X_ERROR, "Src %08x %08x\n",
1472			    p->srcformat, p->dstformat);
1473			if (p->mskformat != 0)
1474				xf86Msg(X_ERROR, "Src mask %08x\n", p->mskformat);
1475			if (p->srcformat == PICT_a8) {
1476				CG14Copy8(pDst, srcX, srcY, dstX, dstY, width, height);
1477			} else {
1478				/* convert between RGB and BGR? */
1479				CG14Copy32(pDst, srcX, srcY, dstX, dstY, width, height);
1480			}
1481			break;
1482		default:
1483			xf86Msg(X_ERROR, "unsupported op %d\n", p->op);
1484	}
1485	exaMarkSync(pDst->drawable.pScreen);
1486}
1487
1488
1489
1490Bool
1491CG14InitAccel(ScreenPtr pScreen)
1492{
1493	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
1494	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1495	ExaDriverPtr pExa;
1496
1497	pExa = exaDriverAlloc();
1498	if (!pExa)
1499		return FALSE;
1500
1501	p->pExa = pExa;
1502
1503	pExa->exa_major = EXA_VERSION_MAJOR;
1504	pExa->exa_minor = EXA_VERSION_MINOR;
1505
1506	pExa->memoryBase = p->fb;
1507	pExa->memorySize = p->memsize;
1508	pExa->offScreenBase = p->width * p->height * (pScrn->bitsPerPixel >> 3);
1509
1510	/*
1511	 * SX memory instructions are written to 64bit aligned addresses with
1512	 * a 3 bit displacement. Make sure the displacement remains constant
1513	 * within one column
1514	 */
1515
1516	pExa->pixmapOffsetAlign = 8;
1517	pExa->pixmapPitchAlign = 8;
1518
1519	pExa->flags = EXA_OFFSCREEN_PIXMAPS
1520		      | EXA_SUPPORTS_OFFSCREEN_OVERLAPS
1521		      /*| EXA_MIXED_PIXMAPS*/;
1522
1523	/*
1524	 * these limits are bogus
1525	 * SX doesn't deal with coordinates at all, so there is no limit but
1526	 * we have to put something here
1527	 */
1528	pExa->maxX = 4096;
1529	pExa->maxY = 4096;
1530
1531	pExa->WaitMarker = CG14WaitMarker;
1532
1533	pExa->PrepareSolid = CG14PrepareSolid;
1534	pExa->Solid = CG14Solid;
1535	pExa->DoneSolid = CG14DoneCopy;
1536	pExa->PrepareCopy = CG14PrepareCopy;
1537	pExa->Copy = CG14Copy32;
1538	pExa->DoneCopy = CG14DoneCopy;
1539	if (p->use_xrender) {
1540		pExa->CheckComposite = CG14CheckComposite;
1541		pExa->PrepareComposite = CG14PrepareComposite;
1542		pExa->Composite = CG14Composite;
1543		pExa->DoneComposite = CG14DoneCopy;
1544	}
1545
1546	/* EXA hits more optimized paths when it does not have to fallback
1547	 * because of missing UTS/DFS, hook memcpy-based UTS/DFS.
1548	 */
1549	pExa->UploadToScreen = CG14UploadToScreen;
1550	pExa->DownloadFromScreen = CG14DownloadFromScreen;
1551
1552	p->queuecount = 0;
1553	/* do some hardware init */
1554	write_sx_reg(p, SX_PLANEMASK, 0xffffffff);
1555	p->last_mask = 0xffffffff;
1556	write_sx_reg(p, SX_ROP_CONTROL, 0xcc);
1557	p->last_rop = 0xcc;
1558	return exaDriverInit(pScreen, pExa);
1559}
1560