cg14_accel.c revision dbf8597c
1/* $NetBSD: cg14_accel.c,v 1.18 2021/12/03 16:54:26 macallan Exp $ */
2/*
3 * Copyright (c) 2013 Michael Lorenz
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 *    - Redistributions of source code must retain the above copyright
11 *      notice, this list of conditions and the following disclaimer.
12 *    - Redistributions in binary form must reproduce the above
13 *      copyright notice, this list of conditions and the following
14 *      disclaimer in the documentation and/or other materials provided
15 *      with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 */
31
32#ifdef HAVE_CONFIG_H
33#include "config.h"
34#endif
35
36#include <sys/types.h>
37
/* all drivers need this */
39#include "xf86.h"
40#include "xf86_OSproc.h"
41#include "compiler.h"
42
43#include "cg14.h"
44
/* Uncomment to log entry into every acceleration hook and extra details. */
//#define SX_DEBUG

#ifdef SX_DEBUG
/* no trailing semicolon here: every use site is written as "ENTER;" */
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__)
#define DPRINTF xf86Msg
#else
#define ENTER
/* the call is never executed, so its arguments are never evaluated */
#define DPRINTF while (0) xf86Msg
#endif

/* element count of a real array; do not use on pointers */
#define arraysize(ary)        (sizeof(ary) / sizeof((ary)[0]))
56
/*
 * SX ROP control values, indexed by the X11 alu (GX*) code.
 * 0xcc is SX's GXcopy equivalent; 0xaa is treated as a no-op by the
 * solid fill code below.
 */
uint32_t sx_rop[] = {
	0x00, 0x88, 0x44, 0xcc,		/* alu  0 -  3 */
	0x22, 0xaa, 0x66, 0xee,		/* alu  4 -  7 */
	0x11, 0x99, 0x55, 0xdd,		/* alu  8 - 11 */
	0x33, 0xbb, 0x77, 0xff		/* alu 12 - 15 */
};
60
61int src_formats[] = {PICT_a8r8g8b8, PICT_x8r8g8b8,
62		     PICT_a8b8g8r8, PICT_x8b8g8r8, PICT_a8};
63int tex_formats[] = {PICT_a8r8g8b8, PICT_a8b8g8r8, PICT_a8};
64
65static void CG14Copy32(PixmapPtr, int, int, int, int, int, int);
66static void CG14Copy8(PixmapPtr, int, int, int, int, int, int);
67
68static inline void
69CG14Wait(Cg14Ptr p)
70{
71	int bail = 10000000;
72	/* we wait for the busy bit to clear */
73	while (((read_sx_reg(p, SX_CONTROL_STATUS) & SX_BZ) != 0) &&
74	       (bail > 0)) {
75		bail--;
76	};
77	if (bail == 0) {
78		xf86Msg(X_ERROR, "SX wait for idle timed out %08x %08x\n",
79		    read_sx_reg(p, SX_CONTROL_STATUS),
80		    read_sx_reg(p, SX_ERROR));
81	}
82}
83
84static void
85CG14WaitMarker(ScreenPtr pScreen, int Marker)
86{
87	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
88	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
89
90	CG14Wait(p);
91}
92
93static Bool
94CG14PrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap,
95		int xdir, int ydir, int alu, Pixel planemask)
96{
97	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
98	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
99
100	ENTER;
101	DPRINTF(X_ERROR, "bits per pixel: %d\n",
102	    pSrcPixmap->drawable.bitsPerPixel);
103
104	if (planemask != p->last_mask) {
105		CG14Wait(p);
106		write_sx_reg(p, SX_PLANEMASK, planemask);
107		p->last_mask = planemask;
108	}
109	alu = sx_rop[alu];
110	if (alu != p->last_rop) {
111		CG14Wait(p);
112		write_sx_reg(p, SX_ROP_CONTROL, alu);
113		p->last_rop = alu;
114	}
115	switch (pSrcPixmap->drawable.bitsPerPixel)  {
116		case 8:
117			p->pExa->Copy = CG14Copy8;
118			break;
119		case 32:
120			p->pExa->Copy = CG14Copy32;
121			break;
122		default:
123			xf86Msg(X_ERROR, "%s depth %d\n", __func__,
124			    pSrcPixmap->drawable.bitsPerPixel);
125	}
126	p->srcpitch = exaGetPixmapPitch(pSrcPixmap);
127	p->srcoff = exaGetPixmapOffset(pSrcPixmap);
128	p->xdir = xdir;
129	p->ydir = ydir;
130	return TRUE;
131}
132
133static void
134CG14Copy32(PixmapPtr pDstPixmap,
135         int srcX, int srcY, int dstX, int dstY, int w, int h)
136{
137	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
138	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
139	int dstpitch, dstoff, srcpitch, srcoff;
140	int srcstart, dststart, xinc, srcinc, dstinc;
141	int line, count, s, d, num;
142
143	ENTER;
144	dstpitch = exaGetPixmapPitch(pDstPixmap);
145	dstoff = exaGetPixmapOffset(pDstPixmap);
146	srcpitch = p->srcpitch;
147	srcoff = p->srcoff;
148	/*
149	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
150	 * actually wrote anything and only sync if it did
151	 */
152	srcstart = (srcX << 2) + (srcpitch * srcY) + srcoff;
153	dststart = (dstX << 2) + (dstpitch * dstY) + dstoff;
154
155	/*
156	 * we always copy up to 32 pixels at a time so direction doesn't
157	 * matter if w<=32
158	 */
159	if (w > 32) {
160		if (p->xdir < 0) {
161			srcstart += (w - 32) << 2;
162			dststart += (w - 32) << 2;
163			xinc = -128;
164		} else
165			xinc = 128;
166	} else
167		xinc = 128;
168	if (p->ydir < 0) {
169		srcstart += (h - 1) * srcpitch;
170		dststart += (h - 1) * dstpitch;
171		srcinc = -srcpitch;
172		dstinc = -dstpitch;
173	} else {
174		srcinc = srcpitch;
175		dstinc = dstpitch;
176	}
177	if (p->last_rop == 0xcc) {
178		/* plain old copy */
179		if ( xinc > 0) {
180			/* going left to right */
181			for (line = 0; line < h; line++) {
182				count = 0;
183				s = srcstart;
184				d = dststart;
185				while ( count < w) {
186					num = min(32, w - count);
187					write_sx_io(p, s,
188					    SX_LD(10, num - 1, s & 7));
189					write_sx_io(p, d,
190					    SX_STM(10, num - 1, d & 7));
191					s += xinc;
192					d += xinc;
193					count += 32;
194				}
195				srcstart += srcinc;
196				dststart += dstinc;
197			}
198		} else {
199			/* going right to left */
200			int i, chunks = (w >> 5);
201			for (line = 0; line < h; line++) {
202				s = srcstart;
203				d = dststart;
204				count = w;
205				for (i = 0; i < chunks; i++) {
206					write_sx_io(p, s,
207					    SX_LD(10, 31, s & 7));
208					write_sx_io(p, d,
209					    SX_STM(10, 31, d & 7));
210					s -= 128;
211					d -= 128;
212					count -= 32;
213				}
214				/* leftovers, if any */
215				if (count > 0) {
216					s += (32 - count) << 2;
217					d += (32 - count) << 2;
218					write_sx_io(p, s,
219					    SX_LD(10, count - 1, s & 7));
220					write_sx_io(p, d,
221					    SX_STM(10, count - 1, d & 7));
222				}
223				srcstart += srcinc;
224				dststart += dstinc;
225			}
226		}
227	} else {
228		/* ROPs needed */
229		if ( xinc > 0) {
230			/* going left to right */
231			for (line = 0; line < h; line++) {
232				count = 0;
233				s = srcstart;
234				d = dststart;
235				while ( count < w) {
236					num = min(32, w - count);
237					write_sx_io(p, s,
238					    SX_LD(10, num - 1, s & 7));
239					write_sx_io(p, d,
240					    SX_LD(42, num - 1, d & 7));
241					if (num > 16) {
242						write_sx_reg(p, SX_INSTRUCTIONS,
243					    	 SX_ROP(10, 42, 74, 15));
244						write_sx_reg(p, SX_INSTRUCTIONS,
245					    	 SX_ROP(26, 58, 90, num - 17));
246					} else {
247						write_sx_reg(p, SX_INSTRUCTIONS,
248					    	 SX_ROP(10, 42, 74, num - 1));
249					}
250					write_sx_io(p, d,
251					    SX_STM(74, num - 1, d & 7));
252					s += xinc;
253					d += xinc;
254					count += 32;
255				}
256				srcstart += srcinc;
257				dststart += dstinc;
258			}
259		} else {
260			/* going right to left */
261			int i, chunks = (w >> 5);
262			for (line = 0; line < h; line++) {
263				s = srcstart;
264				d = dststart;
265				count = w;
266				for (i = 0; i < chunks; i++) {
267					write_sx_io(p, s, SX_LD(10, 31, s & 7));
268					write_sx_io(p, d, SX_LD(42, 31, d & 7));
269					write_sx_reg(p, SX_INSTRUCTIONS,
270				    	    SX_ROP(10, 42, 74, 15));
271					write_sx_reg(p, SX_INSTRUCTIONS,
272				    	    SX_ROP(26, 58, 90, 15));
273					write_sx_io(p, d,
274					    SX_STM(74, 31, d & 7));
275					s -= 128;
276					d -= 128;
277					count -= 32;
278				}
279				/* leftovers, if any */
280				if (count > 0) {
281					s += (32 - count) << 2;
282					d += (32 - count) << 2;
283					write_sx_io(p, s,
284					    SX_LD(10, count - 1, s & 7));
285					write_sx_io(p, d,
286					    SX_LD(42, count - 1, d & 7));
287					if (count > 16) {
288						write_sx_reg(p, SX_INSTRUCTIONS,
289					    	    SX_ROP(10, 42, 74, 15));
290						write_sx_reg(p, SX_INSTRUCTIONS,
291					    	 SX_ROP(26, 58, 90, count - 17));
292					} else {
293						write_sx_reg(p, SX_INSTRUCTIONS,
294					    	 SX_ROP(10, 42, 74, count - 1));
295					}
296
297					write_sx_io(p, d,
298					    SX_STM(74, count - 1, d & 7));
299				}
300				srcstart += srcinc;
301				dststart += dstinc;
302			}
303		}
304	}
305	exaMarkSync(pDstPixmap->drawable.pScreen);
306}
307
308static void
309CG14Copy8(PixmapPtr pDstPixmap,
310         int srcX, int srcY, int dstX, int dstY, int w, int h)
311{
312	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
313	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
314	int dstpitch, dstoff, srcpitch, srcoff;
315	int srcstart, dststart, xinc, srcinc, dstinc;
316	int line, count, s, d, num;
317
318	ENTER;
319	dstpitch = exaGetPixmapPitch(pDstPixmap);
320	dstoff = exaGetPixmapOffset(pDstPixmap);
321	srcpitch = p->srcpitch;
322	srcoff = p->srcoff;
323	/*
324	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
325	 * actually wrote anything and only sync if it did
326	 */
327	srcstart = srcX + (srcpitch * srcY) + srcoff;
328	dststart = dstX + (dstpitch * dstY) + dstoff;
329
330	/*
331	 * we always copy up to 32 pixels at a time so direction doesn't
332	 * matter if w<=32
333	 */
334	if (w > 32) {
335		if (p->xdir < 0) {
336			srcstart += (w - 32);
337			dststart += (w - 32);
338			xinc = -32;
339		} else
340			xinc = 32;
341	} else
342		xinc = 32;
343	if (p->ydir < 0) {
344		srcstart += (h - 1) * srcpitch;
345		dststart += (h - 1) * dstpitch;
346		srcinc = -srcpitch;
347		dstinc = -dstpitch;
348	} else {
349		srcinc = srcpitch;
350		dstinc = dstpitch;
351	}
352	if (p->last_rop == 0xcc) {
353		/* plain old copy */
354		if ( xinc > 0) {
355			/* going left to right */
356			for (line = 0; line < h; line++) {
357				count = 0;
358				s = srcstart;
359				d = dststart;
360				while ( count < w) {
361					num = min(32, w - count);
362					write_sx_io(p, s,
363					    SX_LDB(10, num - 1, s & 7));
364					write_sx_io(p, d,
365					    SX_STBM(10, num - 1, d & 7));
366					s += xinc;
367					d += xinc;
368					count += 32;
369				}
370				srcstart += srcinc;
371				dststart += dstinc;
372			}
373		} else {
374			/* going right to left */
375			int i, chunks = (w >> 5);
376			for (line = 0; line < h; line++) {
377				s = srcstart;
378				d = dststart;
379				count = w;
380				for (i = 0; i < chunks; i++) {
381					write_sx_io(p, s,
382					    SX_LDB(10, 31, s & 7));
383					write_sx_io(p, d,
384					    SX_STBM(10, 31, d & 7));
385					s -= 32;
386					d -= 32;
387					count -= 32;
388				}
389				/* leftovers, if any */
390				if (count > 0) {
391					s += (32 - count);
392					d += (32 - count);
393					write_sx_io(p, s,
394					    SX_LDB(10, count - 1, s & 7));
395					write_sx_io(p, d,
396					    SX_STBM(10, count - 1, d & 7));
397				}
398				srcstart += srcinc;
399				dststart += dstinc;
400			}
401		}
402	} else {
403		/* ROPs needed */
404		if ( xinc > 0) {
405			/* going left to right */
406			for (line = 0; line < h; line++) {
407				count = 0;
408				s = srcstart;
409				d = dststart;
410				while ( count < w) {
411					num = min(32, w - count);
412					write_sx_io(p, s,
413					    SX_LDB(10, num - 1, s & 7));
414					write_sx_io(p, d,
415					    SX_LDB(42, num - 1, d & 7));
416					if (num > 16) {
417						write_sx_reg(p, SX_INSTRUCTIONS,
418					    	 SX_ROP(10, 42, 74, 15));
419						write_sx_reg(p, SX_INSTRUCTIONS,
420					    	 SX_ROP(26, 58, 90, num - 17));
421					} else {
422						write_sx_reg(p, SX_INSTRUCTIONS,
423					    	 SX_ROP(10, 42, 74, num - 1));
424					}
425					write_sx_io(p, d,
426					    SX_STBM(74, num - 1, d & 7));
427					s += xinc;
428					d += xinc;
429					count += 32;
430				}
431				srcstart += srcinc;
432				dststart += dstinc;
433			}
434		} else {
435			/* going right to left */
436			int i, chunks = (w >> 5);
437			for (line = 0; line < h; line++) {
438				s = srcstart;
439				d = dststart;
440				count = w;
441				for (i = 0; i < chunks; i++) {
442					write_sx_io(p, s, SX_LDB(10, 31, s & 7));
443					write_sx_io(p, d, SX_LDB(42, 31, d & 7));
444					write_sx_reg(p, SX_INSTRUCTIONS,
445				    	    SX_ROP(10, 42, 74, 15));
446					write_sx_reg(p, SX_INSTRUCTIONS,
447				    	    SX_ROP(26, 58, 90, 15));
448					write_sx_io(p, d,
449					    SX_STBM(74, 31, d & 7));
450					s -= 128;
451					d -= 128;
452					count -= 32;
453				}
454				/* leftovers, if any */
455				if (count > 0) {
456					s += (32 - count);
457					d += (32 - count);
458					write_sx_io(p, s,
459					    SX_LDB(10, count - 1, s & 7));
460					write_sx_io(p, d,
461					    SX_LDB(42, count - 1, d & 7));
462					if (count > 16) {
463						write_sx_reg(p, SX_INSTRUCTIONS,
464					    	    SX_ROP(10, 42, 74, 15));
465						write_sx_reg(p, SX_INSTRUCTIONS,
466					    	 SX_ROP(26, 58, 90, count - 17));
467					} else {
468						write_sx_reg(p, SX_INSTRUCTIONS,
469					    	 SX_ROP(10, 42, 74, count - 1));
470					}
471
472					write_sx_io(p, d,
473					    SX_STBM(74, count - 1, d & 7));
474				}
475				srcstart += srcinc;
476				dststart += dstinc;
477			}
478		}
479	}
480	exaMarkSync(pDstPixmap->drawable.pScreen);
481}
482
483static void
484CG14DoneCopy(PixmapPtr pDstPixmap)
485{
486}
487
488static Bool
489CG14PrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
490{
491	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
492	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
493
494	ENTER;
495	DPRINTF(X_ERROR, "bits per pixel: %d %08x\n",
496	    pPixmap->drawable.bitsPerPixel, fg);
497
498	/*
499	 * GXset and GXclear are really just specual cases of GXcopy with
500	 * fixed fill colour
501	 */
502	switch (alu) {
503		case GXclear:
504			alu = GXcopy;
505			fg = 0;
506			break;
507		case GXset:
508			alu = GXcopy;
509			fg = 0xffffffff;
510			break;
511	}
512	/* repeat the colour in every sub byte if we're in 8 bit */
513	if (pPixmap->drawable.bitsPerPixel == 8) {
514		fg |= fg << 8;
515		fg |= fg << 16;
516	}
517	write_sx_reg(p, SX_QUEUED(8), fg);
518	write_sx_reg(p, SX_QUEUED(9), fg);
519	if (planemask != p->last_mask) {
520		CG14Wait(p);
521		write_sx_reg(p, SX_PLANEMASK, planemask);
522		p->last_mask = planemask;
523	}
524	alu = sx_rop[alu];
525	if (alu != p->last_rop) {
526		CG14Wait(p);
527		write_sx_reg(p, SX_ROP_CONTROL, alu);
528		p->last_rop = alu;
529	}
530
531	DPRINTF(X_ERROR, "%s: %x\n", __func__, alu);
532	return TRUE;
533}
534
535static void
536CG14Solid32(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
537{
538	int line, x, num;
539	uint32_t ptr;
540
541	ENTER;
542	if (p->last_rop == 0xcc) {
543		/* simple fill */
544		for (line = 0; line < h; line++) {
545			x = 0;
546			while (x < w) {
547				ptr = start + (x << 2);
548				num = min(32, w - x);
549				write_sx_io(p, ptr,
550				    SX_STS(8, num - 1, ptr & 7));
551				x += 32;
552			}
553			start += pitch;
554		}
555	} else if (p->last_rop == 0xaa) {
556		/* nothing to do here */
557		return;
558	} else {
559		/* alright, let's do actual ROP stuff */
560
561		/* first repeat the fill colour into 16 registers */
562		write_sx_reg(p, SX_INSTRUCTIONS,
563		    SX_SELECT_S(8, 8, 10, 15));
564
565		for (line = 0; line < h; line++) {
566			x = 0;
567			while (x < w) {
568				ptr = start + (x << 2);
569				num = min(32, w - x);
570				/* now suck fb data into registers */
571				write_sx_io(p, ptr,
572				    SX_LD(42, num - 1, ptr & 7));
573				/*
574				 * ROP them with the fill data we left in 10
575				 * non-memory ops can only have counts up to 16
576				 */
577				if (num <= 16) {
578					write_sx_reg(p, SX_INSTRUCTIONS,
579					    SX_ROP(10, 42, 74, num - 1));
580				} else {
581					write_sx_reg(p, SX_INSTRUCTIONS,
582					    SX_ROP(10, 42, 74, 15));
583					write_sx_reg(p, SX_INSTRUCTIONS,
584					    SX_ROP(10, 58, 90, num - 17));
585				}
586				/* and write the result back into memory */
587				write_sx_io(p, ptr,
588				    SX_ST(74, num - 1, ptr & 7));
589				x += 32;
590			}
591			start += pitch;
592		}
593	}
594}
595
596static void
597CG14Solid8(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
598{
599	int line, num, pre, cnt;
600	uint32_t ptr;
601
602	ENTER;
603	pre = start & 3;
604	if (pre != 0) pre = 4 - pre;
605
606	if (p->last_rop == 0xcc) {
607		/* simple fill */
608		for (line = 0; line < h; line++) {
609			ptr = start;
610			cnt = w;
611			pre = min(pre, cnt);
612			if (pre) {
613				write_sx_io(p, ptr & ~7, SX_STBS(8, pre - 1, ptr & 7));
614				ptr += pre;
615				cnt -= pre;
616				if (cnt == 0) goto next;
617			}
618			/* now do the aligned pixels in 32bit chunks */
619			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
620			while(cnt > 3) {
621				num = min(32, cnt >> 2);
622				write_sx_io(p, ptr & ~7, SX_STS(8, num - 1, ptr & 7));
623				ptr += num << 2;
624				cnt -= num << 2;
625			}
626			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
627			if (cnt > 0) {
628				write_sx_io(p, ptr & ~7, SX_STBS(8, cnt - 1, ptr & 7));
629			}
630			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
631next:
632			start += pitch;
633		}
634	} else if (p->last_rop == 0xaa) {
635		/* nothing to do here */
636		return;
637	} else {
638		/* alright, let's do actual ROP stuff */
639
640		/* first repeat the fill colour into 16 registers */
641		write_sx_reg(p, SX_INSTRUCTIONS,
642		    SX_SELECT_S(8, 8, 10, 15));
643
644		for (line = 0; line < h; line++) {
645			ptr = start;
646			cnt = w;
647			pre = min(pre, cnt);
648			if (pre) {
649				write_sx_io(p, ptr & ~7, SX_LDB(26, pre - 1, ptr & 7));
650				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(10, 26, 42, pre - 1));
651				write_sx_io(p, ptr & ~7, SX_STB(42, pre - 1, ptr & 7));
652				ptr += pre;
653				cnt -= pre;
654				if (cnt == 0) goto next2;
655			}
656			/* now do the aligned pixels in 32bit chunks */
657			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
658			while(cnt > 3) {
659				num = min(32, cnt >> 2);
660				write_sx_io(p, ptr & ~7, SX_LD(26, num - 1, ptr & 7));
661				if (num <= 16) {
662					write_sx_reg(p, SX_INSTRUCTIONS,
663					    SX_ROP(10, 26, 58, num - 1));
664				} else {
665					write_sx_reg(p, SX_INSTRUCTIONS,
666					    SX_ROP(10, 26, 58, 15));
667					write_sx_reg(p, SX_INSTRUCTIONS,
668					    SX_ROP(10, 42, 74, num - 17));
669				}
670				write_sx_io(p, ptr & ~7, SX_ST(58, num - 1, ptr & 7));
671				ptr += num << 2;
672				cnt -= num << 2;
673			}
674			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
675			if (cnt > 0) {
676				write_sx_io(p, ptr & ~7, SX_LDB(26, cnt - 1, ptr & 7));
677				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(10, 26, 42, cnt - 1));
678				write_sx_io(p, ptr & ~7, SX_STB(42, cnt - 1, ptr & 7));
679			}
680			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
681next2:
682			start += pitch;
683		}
684	}
685}
686
687static void
688CG14Solid(PixmapPtr pPixmap, int x1, int y1, int x2, int y2)
689{
690	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
691	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
692	int w = x2 - x1, h = y2 - y1, dstoff, dstpitch;
693	int start, depth;
694
695	ENTER;
696	dstpitch = exaGetPixmapPitch(pPixmap);
697	dstoff = exaGetPixmapOffset(pPixmap);
698
699	depth = pPixmap->drawable.bitsPerPixel;
700	switch (depth) {
701		case 32:
702			start = dstoff + (y1 * dstpitch) + (x1 << 2);
703			CG14Solid32(p, start, dstpitch, w, h);
704			break;
705		case 8:
706			start = dstoff + (y1 * dstpitch) + x1;
707			CG14Solid8(p, start, dstpitch, w, h);
708			break;
709	}
710
711	DPRINTF(X_ERROR, "Solid %d %d %d %d, %d %d -> %d\n", x1, y1, x2, y2,
712	    dstpitch, dstoff, start);
713	DPRINTF(X_ERROR, "%x %x %x\n", p->last_rop,
714	    read_sx_reg(p, SX_QUEUED(8)), read_sx_reg(p, SX_QUEUED(9)));
715	exaMarkSync(pPixmap->drawable.pScreen);
716}
717
718/*
719 * Memcpy-based UTS.
720 */
721static Bool
722CG14UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
723    char *src, int src_pitch)
724{
725	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
726	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
727	char  *dst        = p->fb + exaGetPixmapOffset(pDst);
728	int    dst_pitch  = exaGetPixmapPitch(pDst);
729
730	int bpp    = pDst->drawable.bitsPerPixel;
731	int cpp    = (bpp + 7) >> 3;
732	int wBytes = w * cpp;
733
734	ENTER;
735	DPRINTF(X_ERROR, "%s depth %d\n", __func__, bpp);
736	dst += (x * cpp) + (y * dst_pitch);
737
738	CG14Wait(p);
739
740	while (h--) {
741		memcpy(dst, src, wBytes);
742		src += src_pitch;
743		dst += dst_pitch;
744	}
745	__asm("stbar;");
746	return TRUE;
747}
748
749/*
750 * Memcpy-based DFS.
751 */
752static Bool
753CG14DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
754    char *dst, int dst_pitch)
755{
756	ScrnInfoPtr pScrn = xf86Screens[pSrc->drawable.pScreen->myNum];
757	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
758	char  *src        = p->fb + exaGetPixmapOffset(pSrc);
759	int    src_pitch  = exaGetPixmapPitch(pSrc);
760
761	ENTER;
762	int bpp    = pSrc->drawable.bitsPerPixel;
763	int cpp    = (bpp + 7) >> 3;
764	int wBytes = w * cpp;
765
766	src += (x * cpp) + (y * src_pitch);
767
768	CG14Wait(p);
769
770	while (h--) {
771		memcpy(dst, src, wBytes);
772		src += src_pitch;
773		dst += dst_pitch;
774	}
775
776	return TRUE;
777}
778
779Bool
780CG14CheckComposite(int op, PicturePtr pSrcPicture,
781                           PicturePtr pMaskPicture,
782                           PicturePtr pDstPicture)
783{
784	int i, ok = FALSE;
785
786	ENTER;
787
788	/*
789	 * SX is in theory capable of accelerating pretty much all Xrender ops,
790	 * even coordinate transformation and gradients. Support will be added
791	 * over time and likely have to spill over into its own source file.
792	 */
793
794	if ((op != PictOpOver) && (op != PictOpAdd) && (op != PictOpSrc)) {
795		DPRINTF(X_ERROR, "%s: rejecting %d\n", __func__, op);
796		return FALSE;
797	}
798
799	if (pSrcPicture != NULL) {
800		i = 0;
801		while ((i < arraysize(src_formats)) && (!ok)) {
802			ok =  (pSrcPicture->format == src_formats[i]);
803			i++;
804		}
805
806		if (!ok) {
807			DPRINTF(X_ERROR, "%s: unsupported src format %x\n",
808			    __func__, pSrcPicture->format);
809			return FALSE;
810		}
811		DPRINTF(X_ERROR, "src is %x, %d\n", pSrcPicture->format, op);
812	}
813
814	if (pDstPicture != NULL) {
815		i = 0;
816		ok = FALSE;
817		while ((i < arraysize(src_formats)) && (!ok)) {
818			ok =  (pDstPicture->format == src_formats[i]);
819			i++;
820		}
821
822		if (!ok) {
823			DPRINTF(X_ERROR, "%s: unsupported dst format %x\n",
824			    __func__, pDstPicture->format);
825			return FALSE;
826		}
827		DPRINTF(X_ERROR, "dst is %x, %d\n", pDstPicture->format, op);
828	}
829
830	if (pMaskPicture != NULL) {
831		DPRINTF(X_ERROR, "mask is %x %d %d\n", pMaskPicture->format,
832		    pMaskPicture->pDrawable->width,
833		    pMaskPicture->pDrawable->height);
834	}
835	return TRUE;
836}
837
838Bool
839CG14PrepareComposite(int op, PicturePtr pSrcPicture,
840                             PicturePtr pMaskPicture,
841                             PicturePtr pDstPicture,
842                             PixmapPtr  pSrc,
843                             PixmapPtr  pMask,
844                             PixmapPtr  pDst)
845{
846	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
847	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
848
849	ENTER;
850
851	p->no_source_pixmap = FALSE;
852	p->source_is_solid = FALSE;
853
854	if (pSrcPicture->format == PICT_a1) {
855		xf86Msg(X_ERROR, "src mono, dst %x, op %d\n",
856		    pDstPicture->format, op);
857		if (pMaskPicture != NULL) {
858			xf86Msg(X_ERROR, "msk %x\n", pMaskPicture->format);
859		}
860	}
861	if (pSrcPicture->pSourcePict != NULL) {
862		if (pSrcPicture->pSourcePict->type == SourcePictTypeSolidFill) {
863			p->fillcolour =
864			    pSrcPicture->pSourcePict->solidFill.color;
865			DPRINTF(X_ERROR, "%s: solid src %08x\n",
866			    __func__, p->fillcolour);
867			p->no_source_pixmap = TRUE;
868			p->source_is_solid = TRUE;
869		}
870	}
871	if ((pMaskPicture != NULL) && (pMaskPicture->pSourcePict != NULL)) {
872		if (pMaskPicture->pSourcePict->type ==
873		    SourcePictTypeSolidFill) {
874			p->fillcolour =
875			   pMaskPicture->pSourcePict->solidFill.color;
876			xf86Msg(X_ERROR, "%s: solid mask %08x\n",
877			    __func__, p->fillcolour);
878		}
879	}
880	if (pMaskPicture != NULL) {
881		p->mskoff = exaGetPixmapOffset(pMask);
882		p->mskpitch = exaGetPixmapPitch(pMask);
883		p->mskformat = pMaskPicture->format;
884	} else {
885		p->mskoff = 0;
886		p->mskpitch = 0;
887		p->mskformat = 0;
888	}
889	if (pSrc != NULL) {
890		p->source_is_solid =
891		   ((pSrc->drawable.width == 1) && (pSrc->drawable.height == 1));
892		p->srcoff = exaGetPixmapOffset(pSrc);
893		p->srcpitch = exaGetPixmapPitch(pSrc);
894		if (p->source_is_solid) {
895			p->fillcolour = *(uint32_t *)(p->fb + p->srcoff);
896		}
897	}
898	p->srcformat = pSrcPicture->format;
899	p->dstformat = pDstPicture->format;
900
901	if (p->source_is_solid) {
902		uint32_t temp;
903
904		/* stuff source colour into SX registers, swap as needed */
905		temp = p->fillcolour;
906		switch (p->srcformat) {
907			case PICT_a8r8g8b8:
908			case PICT_x8r8g8b8:
909				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
910				temp = temp >> 8;
911				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
912				temp = temp >> 8;
913				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
914				break;
915			case PICT_a8b8g8r8:
916			case PICT_x8b8g8r8:
917				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
918				temp = temp >> 8;
919				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
920				temp = temp >> 8;
921				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
922				break;
923		}
924		write_sx_reg(p, SX_QUEUED(8), 0xff);
925	}
926	p->op = op;
927	if (op == PictOpSrc) {
928		CG14PrepareCopy(pSrc, pDst, 1, 1, GXcopy, 0xffffffff);
929	}
930#ifdef SX_DEBUG
931	DPRINTF(X_ERROR, "%x %x -> %x\n", p->srcoff, p->mskoff,
932	    *(uint32_t *)(p->fb + p->srcoff));
933#endif
934	return TRUE;
935}
936
937void
938CG14Composite(PixmapPtr pDst, int srcX, int srcY,
939                              int maskX, int maskY,
940                              int dstX, int dstY,
941                              int width, int height)
942{
943	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
944	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
945	uint32_t dstoff, dstpitch;
946	uint32_t dst, msk, src;
947	int flip = 0;
948
949	ENTER;
950	dstoff = exaGetPixmapOffset(pDst);
951	dstpitch = exaGetPixmapPitch(pDst);
952
953	flip = (PICT_FORMAT_TYPE(p->srcformat) !=
954		PICT_FORMAT_TYPE(p->dstformat));
955
956	switch (p->op) {
957		case PictOpOver:
958			dst = dstoff + (dstY * dstpitch) + (dstX << 2);
959			DPRINTF(X_ERROR, "Over %08x %08x, %d %d\n",
960			    p->mskformat, p->dstformat, srcX, srcY);
961			if (p->source_is_solid) {
962				switch (p->mskformat) {
963					case PICT_a8:
964						msk = p->mskoff +
965						    (maskY * p->mskpitch) +
966						    maskX;
967						CG14Comp_Over8Solid(p,
968						    msk, p->mskpitch,
969						    dst, dstpitch,
970						    width, height);
971						break;
972					case PICT_a8r8g8b8:
973					case PICT_a8b8g8r8:
974						msk = p->mskoff +
975						    (maskY * p->mskpitch) +
976						    (maskX << 2);
977						CG14Comp_Over32Solid(p,
978						    msk, p->mskpitch,
979						    dst, dstpitch,
980						    width, height);
981						break;
982					default:
983						xf86Msg(X_ERROR,
984						  "unsupported mask format %08x\n", p->mskformat);
985				}
986			} else {
987				DPRINTF(X_ERROR, "non-solid over with msk %x\n",
988				    p->mskformat);
989				switch (p->srcformat) {
990					case PICT_a8r8g8b8:
991					case PICT_a8b8g8r8:
992						src = p->srcoff +
993						    (srcY * p->srcpitch) +
994						    (srcX << 2);
995						dst = dstoff +
996						    (dstY * dstpitch) +
997						    (dstX << 2);
998						if (p->mskformat == PICT_a8) {
999							msk = p->mskoff +
1000							    (maskY * p->mskpitch) +
1001							    maskX;
1002							CG14Comp_Over32Mask(p,
1003							    src, p->srcpitch,
1004							    msk, p->mskpitch,
1005							    dst, dstpitch,
1006							    width, height, flip);
1007						} else {
1008							CG14Comp_Over32(p,
1009							    src, p->srcpitch,
1010							    dst, dstpitch,
1011							    width, height, flip);
1012						}
1013						break;
1014					case PICT_x8r8g8b8:
1015					case PICT_x8b8g8r8:
1016						src = p->srcoff +
1017						    (srcY * p->srcpitch) +
1018						    (srcX << 2);
1019						dst = dstoff +
1020						    (dstY * dstpitch) +
1021						    (dstX << 2);
1022						if (p->mskformat == PICT_a8) {
1023							msk = p->mskoff +
1024							    (maskY * p->mskpitch) +
1025							    maskX;
1026							CG14Comp_Over32Mask_noalpha(p,
1027							    src, p->srcpitch,
1028							    msk, p->mskpitch,
1029							    dst, dstpitch,
1030							    width, height, flip);
1031						} else if ((p->mskformat == PICT_a8r8g8b8) ||
1032							   (p->mskformat == PICT_a8b8g8r8)) {
1033							msk = p->mskoff +
1034							    (maskY * p->mskpitch) +
1035							    (maskX << 2);
1036							CG14Comp_Over32Mask32_noalpha(p,
1037							    src, p->srcpitch,
1038							    msk, p->mskpitch,
1039							    dst, dstpitch,
1040							    width, height, flip);
1041						} else {
1042							xf86Msg(X_ERROR, "no src alpha, mask is %x\n", p->mskformat);
1043						}
1044						break;
1045					default:
1046						xf86Msg(X_ERROR, "%s: format %x in non-solid Over op\n",
1047						    __func__, p->srcformat);
1048				}
1049			}
1050			break;
1051		case PictOpAdd:
1052			DPRINTF(X_ERROR, "Add %08x %08x\n",
1053			    p->srcformat, p->dstformat);
1054			switch (p->srcformat) {
1055				case PICT_a8:
1056					src = p->srcoff +
1057					    (srcY * p->srcpitch) + srcX;
1058					if (p->dstformat == PICT_a8) {
1059						dst = dstoff +
1060						      (dstY * dstpitch) + dstX;
1061						CG14Comp_Add8(p,
1062						    src, p->srcpitch,
1063						    dst, dstpitch,
1064						    width, height);
1065					} else {
1066						dst = dstoff +
1067						      (dstY * dstpitch) +
1068						      (dstX << 2);
1069						CG14Comp_Add8_32(p,
1070						    src, p->srcpitch,
1071						    dst, dstpitch,
1072						    width, height);
1073					}
1074					break;
1075				case PICT_a8r8g8b8:
1076				case PICT_x8r8g8b8:
1077					src = p->srcoff +
1078					    (srcY * p->srcpitch) + (srcX << 2);
1079					dst = dstoff + (dstY * dstpitch) +
1080					    (dstX << 2);
1081					CG14Comp_Add32(p, src, p->srcpitch,
1082					    dst, dstpitch, width, height);
1083					break;
1084				default:
1085					xf86Msg(X_ERROR,
1086					    "unsupported src format\n");
1087			}
1088			break;
1089		case PictOpSrc:
1090			DPRINTF(X_ERROR, "Src %08x %08x\n",
1091			    p->srcformat, p->dstformat);
1092			if (p->mskformat != 0)
1093				xf86Msg(X_ERROR, "Src mask %08x\n", p->mskformat);
1094			if (p->srcformat == PICT_a8) {
1095				CG14Copy8(pDst, srcX, srcY, dstX, dstY, width, height);
1096			} else {
1097				/* convert between RGB and BGR? */
1098				CG14Copy32(pDst, srcX, srcY, dstX, dstY, width, height);
1099			}
1100			break;
1101		default:
1102			xf86Msg(X_ERROR, "unsupported op %d\n", p->op);
1103	}
1104	exaMarkSync(pDst->drawable.pScreen);
1105}
1106
1107
1108
1109Bool
1110CG14InitAccel(ScreenPtr pScreen)
1111{
1112	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
1113	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1114	ExaDriverPtr pExa;
1115
1116	pExa = exaDriverAlloc();
1117	if (!pExa)
1118		return FALSE;
1119
1120	p->pExa = pExa;
1121
1122	pExa->exa_major = EXA_VERSION_MAJOR;
1123	pExa->exa_minor = EXA_VERSION_MINOR;
1124
1125	pExa->memoryBase = p->fb;
1126	pExa->memorySize = p->memsize;
1127	pExa->offScreenBase = p->width * p->height * (pScrn->depth >> 3);
1128
1129	/*
1130	 * SX memory instructions are written to 64bit aligned addresses with
1131	 * a 3 bit displacement. Make sure the displacement remains constant
1132	 * within one column
1133	 */
1134
1135	pExa->pixmapOffsetAlign = 8;
1136	pExa->pixmapPitchAlign = 8;
1137
1138	pExa->flags = EXA_OFFSCREEN_PIXMAPS
1139		      | EXA_SUPPORTS_OFFSCREEN_OVERLAPS
1140		      /*| EXA_MIXED_PIXMAPS*/;
1141
1142	/*
1143	 * these limits are bogus
1144	 * SX doesn't deal with coordinates at all, so there is no limit but
1145	 * we have to put something here
1146	 */
1147	pExa->maxX = 4096;
1148	pExa->maxY = 4096;
1149
1150	pExa->WaitMarker = CG14WaitMarker;
1151
1152	pExa->PrepareSolid = CG14PrepareSolid;
1153	pExa->Solid = CG14Solid;
1154	pExa->DoneSolid = CG14DoneCopy;
1155	pExa->PrepareCopy = CG14PrepareCopy;
1156	pExa->Copy = CG14Copy32;
1157	pExa->DoneCopy = CG14DoneCopy;
1158	if (p->use_xrender) {
1159		pExa->CheckComposite = CG14CheckComposite;
1160		pExa->PrepareComposite = CG14PrepareComposite;
1161		pExa->Composite = CG14Composite;
1162		pExa->DoneComposite = CG14DoneCopy;
1163	}
1164
1165	/* EXA hits more optimized paths when it does not have to fallback
1166	 * because of missing UTS/DFS, hook memcpy-based UTS/DFS.
1167	 */
1168	pExa->UploadToScreen = CG14UploadToScreen;
1169	pExa->DownloadFromScreen = CG14DownloadFromScreen;
1170
1171	p->queuecount = 0;
1172	/* do some hardware init */
1173	write_sx_reg(p, SX_PLANEMASK, 0xffffffff);
1174	p->last_mask = 0xffffffff;
1175	write_sx_reg(p, SX_ROP_CONTROL, 0xcc);
1176	p->last_rop = 0xcc;
1177	return exaDriverInit(pScreen, pExa);
1178}
1179