cg14_accel.c revision 76a85281
176a85281Smacallan/* $NetBSD: cg14_accel.c,v 1.24 2021/12/10 19:42:07 macallan Exp $ */
24261fa58Smacallan/*
34261fa58Smacallan * Copyright (c) 2013 Michael Lorenz
44261fa58Smacallan * All rights reserved.
54261fa58Smacallan *
64261fa58Smacallan * Redistribution and use in source and binary forms, with or without
74261fa58Smacallan * modification, are permitted provided that the following conditions
84261fa58Smacallan * are met:
94261fa58Smacallan *
104261fa58Smacallan *    - Redistributions of source code must retain the above copyright
114261fa58Smacallan *      notice, this list of conditions and the following disclaimer.
124261fa58Smacallan *    - Redistributions in binary form must reproduce the above
134261fa58Smacallan *      copyright notice, this list of conditions and the following
144261fa58Smacallan *      disclaimer in the documentation and/or other materials provided
154261fa58Smacallan *      with the distribution.
164261fa58Smacallan *
174261fa58Smacallan * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
184261fa58Smacallan * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
194261fa58Smacallan * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
204261fa58Smacallan * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
214261fa58Smacallan * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
224261fa58Smacallan * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
234261fa58Smacallan * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
244261fa58Smacallan * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
254261fa58Smacallan * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
264261fa58Smacallan * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
274261fa58Smacallan * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
284261fa58Smacallan * POSSIBILITY OF SUCH DAMAGE.
294261fa58Smacallan *
304261fa58Smacallan */
31c88c16f8Smacallan
32c88c16f8Smacallan#ifdef HAVE_CONFIG_H
33c88c16f8Smacallan#include "config.h"
34c88c16f8Smacallan#endif
35c88c16f8Smacallan
364261fa58Smacallan#include <sys/types.h>
374261fa58Smacallan
/* all drivers need this */
394261fa58Smacallan#include "xf86.h"
404261fa58Smacallan#include "xf86_OSproc.h"
414261fa58Smacallan#include "compiler.h"
424261fa58Smacallan
434261fa58Smacallan#include "cg14.h"
444261fa58Smacallan
/* Uncomment to get verbose tracing from the acceleration code. */
//#define SX_DEBUG

#ifdef SX_DEBUG
/* Logs "<function>>"; written to be used as the statement "ENTER;". */
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DPRINTF xf86Msg
#else
#define ENTER
/* The arguments are still compiled, but the call itself never executes. */
#define DPRINTF while (0) xf86Msg
#endif

/*
 * Number of elements in a real array (not meaningful for pointers).
 * The argument is parenthesized so that any array-valued expression can be
 * passed without operator-precedence surprises.
 */
#define arraysize(ary)        (sizeof(ary) / sizeof((ary)[0]))
564261fa58Smacallan
/*
 * SX ROP codes.  CG14PrepareCopy() indexes this table with its alu argument
 * (presumably the 16 X11 GX* raster operations, 0..15 - confirm against
 * X.h).  Entry 3, 0xcc, is SX's GXcopy equivalent.
 */
uint32_t sx_rop[] = {
	0x00, 0x88, 0x44, 0xcc,
	0x22, 0xaa, 0x66, 0xee,
	0x11, 0x99, 0x55, 0xdd,
	0x33, 0xbb, 0x77, 0xff
};
604261fa58Smacallan
614261fa58Smacallanint src_formats[] = {PICT_a8r8g8b8, PICT_x8r8g8b8,
624261fa58Smacallan		     PICT_a8b8g8r8, PICT_x8b8g8r8, PICT_a8};
634261fa58Smacallanint tex_formats[] = {PICT_a8r8g8b8, PICT_a8b8g8r8, PICT_a8};
644261fa58Smacallan
65f71acd79Smacallanstatic void CG14Copy32(PixmapPtr, int, int, int, int, int, int);
66f71acd79Smacallanstatic void CG14Copy8(PixmapPtr, int, int, int, int, int, int);
67f71acd79Smacallan
684261fa58Smacallanstatic inline void
694261fa58SmacallanCG14Wait(Cg14Ptr p)
704261fa58Smacallan{
71fc473876Smacallan	int bail = 10000000;
72fc473876Smacallan	/* we wait for the busy bit to clear */
73fc473876Smacallan	while (((read_sx_reg(p, SX_CONTROL_STATUS) & SX_BZ) != 0) &&
74fc473876Smacallan	       (bail > 0)) {
75fc473876Smacallan		bail--;
76fc473876Smacallan	};
77fc473876Smacallan	if (bail == 0) {
78fc473876Smacallan		xf86Msg(X_ERROR, "SX wait for idle timed out %08x %08x\n",
79fc473876Smacallan		    read_sx_reg(p, SX_CONTROL_STATUS),
80fc473876Smacallan		    read_sx_reg(p, SX_ERROR));
81fc473876Smacallan	}
824261fa58Smacallan}
834261fa58Smacallan
844261fa58Smacallanstatic void
854261fa58SmacallanCG14WaitMarker(ScreenPtr pScreen, int Marker)
864261fa58Smacallan{
874261fa58Smacallan	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
884261fa58Smacallan	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
894261fa58Smacallan
904261fa58Smacallan	CG14Wait(p);
914261fa58Smacallan}
924261fa58Smacallan
934261fa58Smacallanstatic Bool
944261fa58SmacallanCG14PrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap,
954261fa58Smacallan		int xdir, int ydir, int alu, Pixel planemask)
964261fa58Smacallan{
974261fa58Smacallan	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
984261fa58Smacallan	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
994261fa58Smacallan
1004261fa58Smacallan	ENTER;
1018c65af2dSmacallan	DPRINTF(X_ERROR, "%s bpp %d rop %x\n", __func__,
10281c68cf8Smacallan	    pSrcPixmap->drawable.bitsPerPixel, alu);
1034261fa58Smacallan
1044261fa58Smacallan	if (planemask != p->last_mask) {
1054261fa58Smacallan		CG14Wait(p);
1064261fa58Smacallan		write_sx_reg(p, SX_PLANEMASK, planemask);
1074261fa58Smacallan		p->last_mask = planemask;
1084261fa58Smacallan	}
1094261fa58Smacallan	alu = sx_rop[alu];
1104261fa58Smacallan	if (alu != p->last_rop) {
1114261fa58Smacallan		CG14Wait(p);
1124261fa58Smacallan		write_sx_reg(p, SX_ROP_CONTROL, alu);
1134261fa58Smacallan		p->last_rop = alu;
1144261fa58Smacallan	}
115f71acd79Smacallan	switch (pSrcPixmap->drawable.bitsPerPixel)  {
116f71acd79Smacallan		case 8:
117f71acd79Smacallan			p->pExa->Copy = CG14Copy8;
118f71acd79Smacallan			break;
119f71acd79Smacallan		case 32:
120f71acd79Smacallan			p->pExa->Copy = CG14Copy32;
121f71acd79Smacallan			break;
122f71acd79Smacallan		default:
123f71acd79Smacallan			xf86Msg(X_ERROR, "%s depth %d\n", __func__,
124f71acd79Smacallan			    pSrcPixmap->drawable.bitsPerPixel);
125f71acd79Smacallan	}
1264261fa58Smacallan	p->srcpitch = exaGetPixmapPitch(pSrcPixmap);
1274261fa58Smacallan	p->srcoff = exaGetPixmapOffset(pSrcPixmap);
1284261fa58Smacallan	p->xdir = xdir;
1294261fa58Smacallan	p->ydir = ydir;
1304261fa58Smacallan	return TRUE;
1314261fa58Smacallan}
1324261fa58Smacallan
1334261fa58Smacallanstatic void
134f71acd79SmacallanCG14Copy32(PixmapPtr pDstPixmap,
1354261fa58Smacallan         int srcX, int srcY, int dstX, int dstY, int w, int h)
1364261fa58Smacallan{
1374261fa58Smacallan	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
1384261fa58Smacallan	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
1394261fa58Smacallan	int dstpitch, dstoff, srcpitch, srcoff;
1404261fa58Smacallan	int srcstart, dststart, xinc, srcinc, dstinc;
1414261fa58Smacallan	int line, count, s, d, num;
1424261fa58Smacallan
1434261fa58Smacallan	ENTER;
1444261fa58Smacallan	dstpitch = exaGetPixmapPitch(pDstPixmap);
1454261fa58Smacallan	dstoff = exaGetPixmapOffset(pDstPixmap);
1464261fa58Smacallan	srcpitch = p->srcpitch;
1474261fa58Smacallan	srcoff = p->srcoff;
1484261fa58Smacallan	/*
1494261fa58Smacallan	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
1504261fa58Smacallan	 * actually wrote anything and only sync if it did
1514261fa58Smacallan	 */
1524261fa58Smacallan	srcstart = (srcX << 2) + (srcpitch * srcY) + srcoff;
1534261fa58Smacallan	dststart = (dstX << 2) + (dstpitch * dstY) + dstoff;
1544261fa58Smacallan
1554261fa58Smacallan	/*
1564261fa58Smacallan	 * we always copy up to 32 pixels at a time so direction doesn't
1574261fa58Smacallan	 * matter if w<=32
1584261fa58Smacallan	 */
1594261fa58Smacallan	if (w > 32) {
1604261fa58Smacallan		if (p->xdir < 0) {
1614261fa58Smacallan			srcstart += (w - 32) << 2;
1624261fa58Smacallan			dststart += (w - 32) << 2;
1634261fa58Smacallan			xinc = -128;
1644261fa58Smacallan		} else
1654261fa58Smacallan			xinc = 128;
1664261fa58Smacallan	} else
1674261fa58Smacallan		xinc = 128;
1684261fa58Smacallan	if (p->ydir < 0) {
1694261fa58Smacallan		srcstart += (h - 1) * srcpitch;
1704261fa58Smacallan		dststart += (h - 1) * dstpitch;
1714261fa58Smacallan		srcinc = -srcpitch;
1724261fa58Smacallan		dstinc = -dstpitch;
1734261fa58Smacallan	} else {
1744261fa58Smacallan		srcinc = srcpitch;
1754261fa58Smacallan		dstinc = dstpitch;
1764261fa58Smacallan	}
1774261fa58Smacallan	if (p->last_rop == 0xcc) {
1784261fa58Smacallan		/* plain old copy */
1794261fa58Smacallan		if ( xinc > 0) {
1804261fa58Smacallan			/* going left to right */
1814261fa58Smacallan			for (line = 0; line < h; line++) {
1824261fa58Smacallan				count = 0;
1834261fa58Smacallan				s = srcstart;
1844261fa58Smacallan				d = dststart;
1854261fa58Smacallan				while ( count < w) {
1864261fa58Smacallan					num = min(32, w - count);
1874261fa58Smacallan					write_sx_io(p, s,
1884261fa58Smacallan					    SX_LD(10, num - 1, s & 7));
1894261fa58Smacallan					write_sx_io(p, d,
1904261fa58Smacallan					    SX_STM(10, num - 1, d & 7));
1914261fa58Smacallan					s += xinc;
1924261fa58Smacallan					d += xinc;
1934261fa58Smacallan					count += 32;
1944261fa58Smacallan				}
1954261fa58Smacallan				srcstart += srcinc;
1964261fa58Smacallan				dststart += dstinc;
1974261fa58Smacallan			}
1984261fa58Smacallan		} else {
1994261fa58Smacallan			/* going right to left */
2004261fa58Smacallan			int i, chunks = (w >> 5);
2014261fa58Smacallan			for (line = 0; line < h; line++) {
2024261fa58Smacallan				s = srcstart;
2034261fa58Smacallan				d = dststart;
2044261fa58Smacallan				count = w;
2054261fa58Smacallan				for (i = 0; i < chunks; i++) {
2064261fa58Smacallan					write_sx_io(p, s,
2074261fa58Smacallan					    SX_LD(10, 31, s & 7));
2084261fa58Smacallan					write_sx_io(p, d,
2094261fa58Smacallan					    SX_STM(10, 31, d & 7));
2104261fa58Smacallan					s -= 128;
2114261fa58Smacallan					d -= 128;
2124261fa58Smacallan					count -= 32;
2134261fa58Smacallan				}
2144261fa58Smacallan				/* leftovers, if any */
2154261fa58Smacallan				if (count > 0) {
2164261fa58Smacallan					s += (32 - count) << 2;
2174261fa58Smacallan					d += (32 - count) << 2;
2184261fa58Smacallan					write_sx_io(p, s,
2194261fa58Smacallan					    SX_LD(10, count - 1, s & 7));
2204261fa58Smacallan					write_sx_io(p, d,
2214261fa58Smacallan					    SX_STM(10, count - 1, d & 7));
2224261fa58Smacallan				}
2234261fa58Smacallan				srcstart += srcinc;
2244261fa58Smacallan				dststart += dstinc;
2254261fa58Smacallan			}
2264261fa58Smacallan		}
2274261fa58Smacallan	} else {
2284261fa58Smacallan		/* ROPs needed */
2294261fa58Smacallan		if ( xinc > 0) {
2304261fa58Smacallan			/* going left to right */
2314261fa58Smacallan			for (line = 0; line < h; line++) {
2324261fa58Smacallan				count = 0;
2334261fa58Smacallan				s = srcstart;
2344261fa58Smacallan				d = dststart;
2354261fa58Smacallan				while ( count < w) {
2364261fa58Smacallan					num = min(32, w - count);
2374261fa58Smacallan					write_sx_io(p, s,
2384261fa58Smacallan					    SX_LD(10, num - 1, s & 7));
2394261fa58Smacallan					write_sx_io(p, d,
2404261fa58Smacallan					    SX_LD(42, num - 1, d & 7));
2414261fa58Smacallan					if (num > 16) {
2424261fa58Smacallan						write_sx_reg(p, SX_INSTRUCTIONS,
2434261fa58Smacallan					    	 SX_ROP(10, 42, 74, 15));
2444261fa58Smacallan						write_sx_reg(p, SX_INSTRUCTIONS,
2454261fa58Smacallan					    	 SX_ROP(26, 58, 90, num - 17));
2464261fa58Smacallan					} else {
2474261fa58Smacallan						write_sx_reg(p, SX_INSTRUCTIONS,
2484261fa58Smacallan					    	 SX_ROP(10, 42, 74, num - 1));
2494261fa58Smacallan					}
2504261fa58Smacallan					write_sx_io(p, d,
2514261fa58Smacallan					    SX_STM(74, num - 1, d & 7));
2524261fa58Smacallan					s += xinc;
2534261fa58Smacallan					d += xinc;
2544261fa58Smacallan					count += 32;
2554261fa58Smacallan				}
2564261fa58Smacallan				srcstart += srcinc;
2574261fa58Smacallan				dststart += dstinc;
2584261fa58Smacallan			}
2594261fa58Smacallan		} else {
2604261fa58Smacallan			/* going right to left */
2614261fa58Smacallan			int i, chunks = (w >> 5);
2624261fa58Smacallan			for (line = 0; line < h; line++) {
2634261fa58Smacallan				s = srcstart;
2644261fa58Smacallan				d = dststart;
2654261fa58Smacallan				count = w;
2664261fa58Smacallan				for (i = 0; i < chunks; i++) {
2674261fa58Smacallan					write_sx_io(p, s, SX_LD(10, 31, s & 7));
2684261fa58Smacallan					write_sx_io(p, d, SX_LD(42, 31, d & 7));
2694261fa58Smacallan					write_sx_reg(p, SX_INSTRUCTIONS,
2704261fa58Smacallan				    	    SX_ROP(10, 42, 74, 15));
2714261fa58Smacallan					write_sx_reg(p, SX_INSTRUCTIONS,
2724261fa58Smacallan				    	    SX_ROP(26, 58, 90, 15));
2734261fa58Smacallan					write_sx_io(p, d,
2744261fa58Smacallan					    SX_STM(74, 31, d & 7));
2754261fa58Smacallan					s -= 128;
2764261fa58Smacallan					d -= 128;
2774261fa58Smacallan					count -= 32;
2784261fa58Smacallan				}
2794261fa58Smacallan				/* leftovers, if any */
2804261fa58Smacallan				if (count > 0) {
2814261fa58Smacallan					s += (32 - count) << 2;
2824261fa58Smacallan					d += (32 - count) << 2;
2834261fa58Smacallan					write_sx_io(p, s,
2844261fa58Smacallan					    SX_LD(10, count - 1, s & 7));
2854261fa58Smacallan					write_sx_io(p, d,
2864261fa58Smacallan					    SX_LD(42, count - 1, d & 7));
2874261fa58Smacallan					if (count > 16) {
2884261fa58Smacallan						write_sx_reg(p, SX_INSTRUCTIONS,
2894261fa58Smacallan					    	    SX_ROP(10, 42, 74, 15));
2904261fa58Smacallan						write_sx_reg(p, SX_INSTRUCTIONS,
2914261fa58Smacallan					    	 SX_ROP(26, 58, 90, count - 17));
2924261fa58Smacallan					} else {
2934261fa58Smacallan						write_sx_reg(p, SX_INSTRUCTIONS,
2944261fa58Smacallan					    	 SX_ROP(10, 42, 74, count - 1));
2954261fa58Smacallan					}
2964261fa58Smacallan
2974261fa58Smacallan					write_sx_io(p, d,
2984261fa58Smacallan					    SX_STM(74, count - 1, d & 7));
2994261fa58Smacallan				}
3004261fa58Smacallan				srcstart += srcinc;
3014261fa58Smacallan				dststart += dstinc;
3024261fa58Smacallan			}
3034261fa58Smacallan		}
3044261fa58Smacallan	}
3054261fa58Smacallan	exaMarkSync(pDstPixmap->drawable.pScreen);
3064261fa58Smacallan}
3074261fa58Smacallan
30881c68cf8Smacallan/*
30981c68cf8Smacallan * copy with same alignment, left to right, no ROP
31081c68cf8Smacallan */
31181c68cf8Smacallanstatic void
31281c68cf8SmacallanCG14Copy8_aligned_norop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
31381c68cf8Smacallan{
31481c68cf8Smacallan	int saddr, daddr, pre, cnt, wrds;
31581c68cf8Smacallan
31681c68cf8Smacallan	ENTER;
31781c68cf8Smacallan
31881c68cf8Smacallan	pre = srcstart & 3;
31981c68cf8Smacallan	if (pre != 0) pre = 4 - pre;
32081c68cf8Smacallan	pre = min(pre, w);
32181c68cf8Smacallan
32281c68cf8Smacallan	while (h > 0) {
32381c68cf8Smacallan		saddr = srcstart;
32481c68cf8Smacallan		daddr = dststart;
32581c68cf8Smacallan		cnt = w;
32681c68cf8Smacallan		if (pre > 0) {
32781c68cf8Smacallan			write_sx_io(p, saddr & ~7, SX_LDB(8, pre - 1, saddr & 7));
32881c68cf8Smacallan			write_sx_io(p, daddr & ~7, SX_STB(8, pre - 1, daddr & 7));
32981c68cf8Smacallan			saddr += pre;
33081c68cf8Smacallan			daddr += pre;
33181c68cf8Smacallan			cnt -= pre;
33281c68cf8Smacallan			if (cnt == 0) goto next;
33381c68cf8Smacallan		}
33481c68cf8Smacallan		while (cnt > 3) {
33581c68cf8Smacallan			wrds = min(32, cnt >> 2);
33681c68cf8Smacallan			write_sx_io(p, saddr & ~7, SX_LD(8, wrds - 1, saddr & 7));
33781c68cf8Smacallan			write_sx_io(p, daddr & ~7, SX_ST(8, wrds - 1, daddr & 7));
33881c68cf8Smacallan			saddr += wrds << 2;
33981c68cf8Smacallan			daddr += wrds << 2;
34081c68cf8Smacallan			cnt -= wrds << 2;
34181c68cf8Smacallan		}
34281c68cf8Smacallan		if (cnt > 0) {
34381c68cf8Smacallan			write_sx_io(p, saddr & ~7, SX_LDB(8, cnt - 1, saddr & 7));
34481c68cf8Smacallan			write_sx_io(p, daddr & ~7, SX_STB(8, cnt - 1, daddr & 7));
34581c68cf8Smacallan		}
34681c68cf8Smacallannext:
34781c68cf8Smacallan		srcstart += srcpitch;
34881c68cf8Smacallan		dststart += dstpitch;
34981c68cf8Smacallan		h--;
35081c68cf8Smacallan	}
35181c68cf8Smacallan}
35281c68cf8Smacallan
35381c68cf8Smacallan/*
35481c68cf8Smacallan * copy with same alignment, left to right, ROP
35581c68cf8Smacallan */
35681c68cf8Smacallanstatic void
35781c68cf8SmacallanCG14Copy8_aligned_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
35881c68cf8Smacallan{
35981c68cf8Smacallan	int saddr, daddr, pre, cnt, wrds;
36081c68cf8Smacallan
36181c68cf8Smacallan	ENTER;
36281c68cf8Smacallan
36381c68cf8Smacallan	pre = srcstart & 3;
36481c68cf8Smacallan	if (pre != 0) pre = 4 - pre;
36581c68cf8Smacallan	pre = min(pre, w);
36681c68cf8Smacallan
36781c68cf8Smacallan	while (h > 0) {
36881c68cf8Smacallan		saddr = srcstart;
36981c68cf8Smacallan		daddr = dststart;
37081c68cf8Smacallan		cnt = w;
37181c68cf8Smacallan		if (pre > 0) {
37281c68cf8Smacallan			write_sx_io(p, saddr & ~7, SX_LDB(8, pre - 1, saddr & 7));
37381c68cf8Smacallan			write_sx_io(p, daddr & ~7, SX_LDB(40, pre - 1, daddr & 7));
37481c68cf8Smacallan			write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 40, 72, pre - 1));
37581c68cf8Smacallan			write_sx_io(p, daddr & ~7, SX_STB(72, pre - 1, daddr & 7));
37681c68cf8Smacallan			saddr += pre;
37781c68cf8Smacallan			daddr += pre;
37881c68cf8Smacallan			cnt -= pre;
37981c68cf8Smacallan			if (cnt == 0) goto next;
38081c68cf8Smacallan		}
38181c68cf8Smacallan		while (cnt > 3) {
38281c68cf8Smacallan			wrds = min(32, cnt >> 2);
38381c68cf8Smacallan			write_sx_io(p, saddr & ~7, SX_LD(8, wrds - 1, saddr & 7));
38481c68cf8Smacallan			write_sx_io(p, daddr & ~7, SX_LD(40, wrds - 1, daddr & 7));
38581c68cf8Smacallan			if (cnt > 16) {
38681c68cf8Smacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 40, 72, 15));
38781c68cf8Smacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 56, 88, wrds - 17));
38881c68cf8Smacallan			} else
38981c68cf8Smacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 40, 72, wrds - 1));
39081c68cf8Smacallan			write_sx_io(p, daddr & ~7, SX_ST(72, wrds - 1, daddr & 7));
39181c68cf8Smacallan			saddr += wrds << 2;
39281c68cf8Smacallan			daddr += wrds << 2;
39381c68cf8Smacallan			cnt -= wrds << 2;
39481c68cf8Smacallan		}
39581c68cf8Smacallan		if (cnt > 0) {
39681c68cf8Smacallan			write_sx_io(p, saddr & ~7, SX_LDB(8, cnt - 1, saddr & 7));
39781c68cf8Smacallan			write_sx_io(p, daddr & ~7, SX_LDB(40, cnt - 1, daddr & 7));
39881c68cf8Smacallan			write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(8, 40, 72, cnt - 1));
39981c68cf8Smacallan			write_sx_io(p, daddr & ~7, SX_STB(72, cnt - 1, daddr & 7));
40081c68cf8Smacallan		}
40181c68cf8Smacallannext:
40281c68cf8Smacallan		srcstart += srcpitch;
40381c68cf8Smacallan		dststart += dstpitch;
40481c68cf8Smacallan		h--;
40581c68cf8Smacallan	}
40681c68cf8Smacallan}
40781c68cf8Smacallan
408f787bc61Smacallan/* up to 124 pixels so direction doesn't matter, unaligned, ROP */
409f787bc61Smacallanstatic void
410f787bc61SmacallanCG14Copy8_short_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
411f787bc61Smacallan{
412f787bc61Smacallan	int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
4139d7fb28bSmacallan	int ssreg;
414f787bc61Smacallan#ifdef DEBUG
415f787bc61Smacallan	int taddr = 4 + dstpitch * 50;
416f787bc61Smacallan#endif
417f787bc61Smacallan	uint32_t lmask, rmask;
418f787bc61Smacallan	ENTER;
419f787bc61Smacallan
420f787bc61Smacallan	pre = dststart & 3;
421f787bc61Smacallan	lmask = 0xffffffff >> pre;
422f787bc61Smacallan	spre = srcstart & 3;
423f787bc61Smacallan	/*
424f787bc61Smacallan	 * make sure we count all the words needed to cover the destination
425f787bc61Smacallan	 * line, covering potential partials on both ends
426f787bc61Smacallan	 */
427f787bc61Smacallan	wrds = (w + pre + 3) >> 2;
428f787bc61Smacallan	swrds = (w + spre + 3) >> 2;
429f787bc61Smacallan
430f787bc61Smacallan	if (spre < pre) {
431f787bc61Smacallan		dist = 32 - (pre - spre) * 8;
432f787bc61Smacallan		sreg = 9;
433f787bc61Smacallan	} else {
434f787bc61Smacallan		dist = (spre - pre) * 8;
435f787bc61Smacallan		sreg = 8;
436f787bc61Smacallan	}
437f787bc61Smacallan
438f787bc61Smacallan	/*
439f787bc61Smacallan	 * mask out trailing pixels to avoid partial writes
440f787bc61Smacallan	 */
441f787bc61Smacallan	post = (dststart + w) & 3;
44276a85281Smacallan	if (post != 0) {
44376a85281Smacallan		rmask = ~(0xffffffff >> (post * 8));
44476a85281Smacallan		write_sx_reg(p, SX_QUEUED(7), rmask);
44576a85281Smacallan		write_sx_reg(p, SX_QUEUED(6), ~rmask);
44676a85281Smacallan	}
44776a85281Smacallan
448f787bc61Smacallan	DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
449f787bc61Smacallan	    w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);
450f787bc61Smacallan
451f787bc61Smacallan	/* mask out the leading pixels in dst by using a mask and ROP */
45276a85281Smacallan	if (pre != 0) {
45376a85281Smacallan		write_sx_reg(p, SX_ROP_CONTROL, (p->last_rop & 0xf0) | 0xa);
45476a85281Smacallan		write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
45576a85281Smacallan	}
456f787bc61Smacallan
457f787bc61Smacallan	saddr = srcstart & ~3;
458f787bc61Smacallan	daddr = dststart & ~3;
45976a85281Smacallan
460f787bc61Smacallan	while (h > 0) {
461f787bc61Smacallan		write_sx_io(p, daddr & ~7, SX_LD(80, wrds - 1, daddr & 7));
462f787bc61Smacallan		write_sx_io(p, saddr & ~7, SX_LD(sreg, swrds - 1, saddr & 7));
463f787bc61Smacallan		if (wrds > 15) {
4649d7fb28bSmacallan			if (dist != 0) {
4659d7fb28bSmacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, 15));
4669d7fb28bSmacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(24, dist, 56, wrds - 16));
4679d7fb28bSmacallan				/* shifted source pixels are now at register 40+ */
4689d7fb28bSmacallan				ssreg = 40;
4699d7fb28bSmacallan			} else ssreg = 8;
470f787bc61Smacallan			if (pre != 0) {
471f787bc61Smacallan				/* mask out leading junk */
472f787bc61Smacallan				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
4739d7fb28bSmacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, 8, 0));
474f787bc61Smacallan				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
4759d7fb28bSmacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg + 1, 81, 9, 14));
476f787bc61Smacallan			} else {
4779d7fb28bSmacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, 8, 15));
478f787bc61Smacallan			}
4799d7fb28bSmacallan			write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg + 16, 96, 24, wrds - 16));
480f787bc61Smacallan		} else {
4819d7fb28bSmacallan			if (dist != 0) {
4829d7fb28bSmacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, wrds));
4839d7fb28bSmacallan				ssreg = 40;
4849d7fb28bSmacallan			} else ssreg = 8;
485f787bc61Smacallan			if (pre != 0) {
486f787bc61Smacallan				/* mask out leading junk */
487f787bc61Smacallan				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
4889d7fb28bSmacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, 8, 0));
489f787bc61Smacallan				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
4909d7fb28bSmacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg + 1, 81, 9, wrds));
491f787bc61Smacallan			} else {
4929d7fb28bSmacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, 8, wrds));
493f787bc61Smacallan			}
494f787bc61Smacallan		}
495f787bc61Smacallan		if (post != 0) {
496f787bc61Smacallan			/*
497f787bc61Smacallan			 * if the last word to be written out is a partial we
498f787bc61Smacallan			 * mask out the leftovers and replace them with
499f787bc61Smacallan			 * background pixels
500f787bc61Smacallan			 * we could pull the same ROP * mask trick as we do on
501f787bc61Smacallan			 * the left end but it's less annoying this way and
502f787bc61Smacallan			 * the instruction count is the same
503f787bc61Smacallan			 */
504f787bc61Smacallan			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(7 + wrds, 7, 5, 0));
505f787bc61Smacallan			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(79 + wrds, 6, 4, 0));
506f787bc61Smacallan			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(5, 4, 7 + wrds, 0));
507f787bc61Smacallan		}
508f787bc61Smacallan#ifdef DEBUG
509f787bc61Smacallan		write_sx_io(p, taddr & ~7, SX_ST(40, wrds - 1, taddr & 7));
510f787bc61Smacallan		taddr += dstpitch;
511f787bc61Smacallan#endif
512f787bc61Smacallan		write_sx_io(p, daddr & ~7, SX_ST(8, wrds - 1, daddr & 7));
513f787bc61Smacallan		saddr += srcpitch;
514f787bc61Smacallan		daddr += dstpitch;
515f787bc61Smacallan		h--;
516f787bc61Smacallan	}
517f787bc61Smacallan}
518f787bc61Smacallan
51976a85281Smacallan/* up to 124 pixels so direction doesn't matter, unaligned, straight copy */
52076a85281Smacallanstatic void
52176a85281SmacallanCG14Copy8_short_norop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
52276a85281Smacallan{
52376a85281Smacallan	int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
52476a85281Smacallan	int ssreg;
52576a85281Smacallan#ifdef DEBUG
52676a85281Smacallan	int taddr = 4 + dstpitch * 50;
52776a85281Smacallan#endif
52876a85281Smacallan	uint32_t lmask, rmask;
52976a85281Smacallan	ENTER;
53076a85281Smacallan
53176a85281Smacallan	pre = dststart & 3;
53276a85281Smacallan	lmask = 0xffffffff >> pre;
53376a85281Smacallan	spre = srcstart & 3;
53476a85281Smacallan	/*
53576a85281Smacallan	 * make sure we count all the words needed to cover the destination
53676a85281Smacallan	 * line, covering potential partials on both ends
53776a85281Smacallan	 */
53876a85281Smacallan	wrds = (w + pre + 3) >> 2;
53976a85281Smacallan	swrds = (w + spre + 3) >> 2;
54076a85281Smacallan
54176a85281Smacallan	if (spre < pre) {
54276a85281Smacallan		dist = 32 - (pre - spre) * 8;
54376a85281Smacallan		sreg = 9;
54476a85281Smacallan	} else {
54576a85281Smacallan		dist = (spre - pre) * 8;
54676a85281Smacallan		sreg = 8;
54776a85281Smacallan	}
54876a85281Smacallan
54976a85281Smacallan	/*
55076a85281Smacallan	 * mask out trailing pixels to avoid partial writes
55176a85281Smacallan	 */
55276a85281Smacallan	post = (dststart + w) & 3;
55376a85281Smacallan	if (post != 0) {
55476a85281Smacallan		rmask = ~(0xffffffff >> (post * 8));
55576a85281Smacallan		write_sx_reg(p, SX_QUEUED(7), rmask);
55676a85281Smacallan		write_sx_reg(p, SX_QUEUED(6), ~rmask);
55776a85281Smacallan	}
55876a85281Smacallan
55976a85281Smacallan	DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
56076a85281Smacallan	    w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);
56176a85281Smacallan
56276a85281Smacallan	/* mask out the leading pixels in dst by using a mask and ROP */
56376a85281Smacallan	if (pre != 0) {
56476a85281Smacallan		write_sx_reg(p, SX_ROP_CONTROL, 0xca);
56576a85281Smacallan		write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
56676a85281Smacallan	}
56776a85281Smacallan
56876a85281Smacallan	saddr = srcstart & ~3;
56976a85281Smacallan	daddr = dststart & ~3;
57076a85281Smacallan
57176a85281Smacallan	while (h > 0) {
57276a85281Smacallan		write_sx_io(p, saddr & ~7, SX_LD(sreg, swrds - 1, saddr & 7));
57376a85281Smacallan		if (wrds > 15) {
57476a85281Smacallan			if (dist != 0) {
57576a85281Smacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, 15));
57676a85281Smacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(24, dist, 56, wrds - 16));
57776a85281Smacallan				/* shifted source pixels are now at register 40+ */
57876a85281Smacallan				ssreg = 40;
57976a85281Smacallan			} else ssreg = 8;
58076a85281Smacallan			if (pre != 0) {
58176a85281Smacallan				/* read only the first word */
58276a85281Smacallan				write_sx_io(p, daddr & ~7, SX_LD(80, 0, daddr & 7));
58376a85281Smacallan				/* mask out leading junk */
58476a85281Smacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, ssreg, 0));
58576a85281Smacallan			}
58676a85281Smacallan		} else {
58776a85281Smacallan			if (dist != 0) {
58876a85281Smacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, wrds));
58976a85281Smacallan				ssreg = 40;
59076a85281Smacallan			} else ssreg = 8;
59176a85281Smacallan			if (pre != 0) {
59276a85281Smacallan				/* read only the first word */
59376a85281Smacallan				write_sx_io(p, daddr & ~7, SX_LD(80, 0, daddr & 7));
59476a85281Smacallan				/* mask out leading junk */
59576a85281Smacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(ssreg, 80, ssreg, 0));
59676a85281Smacallan			}
59776a85281Smacallan		}
59876a85281Smacallan		if (post != 0) {
59976a85281Smacallan			int laddr = daddr + ((wrds - 1) << 2);
60076a85281Smacallan			/*
60176a85281Smacallan			 * if the last word to be written out is a partial we
60276a85281Smacallan			 * mask out the leftovers and replace them with
60376a85281Smacallan			 * background pixels
60476a85281Smacallan			 * we could pull the same ROP * mask trick as we do on
60576a85281Smacallan			 * the left end but it's less annoying this way and
60676a85281Smacallan			 * the instruction count is the same
60776a85281Smacallan			 */
60876a85281Smacallan			write_sx_io(p, laddr & ~7, SX_LD(81, 0, laddr & 7));
60976a85281Smacallan			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(ssreg + wrds - 1, 7, 5, 0));
61076a85281Smacallan			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(81, 6, 4, 0));
61176a85281Smacallan			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(5, 4, ssreg + wrds - 1, 0));
61276a85281Smacallan		}
61376a85281Smacallan#ifdef DEBUG
61476a85281Smacallan		write_sx_io(p, taddr & ~7, SX_ST(40, wrds - 1, taddr & 7));
61576a85281Smacallan		taddr += dstpitch;
61676a85281Smacallan#endif
61776a85281Smacallan		write_sx_io(p, daddr & ~7, SX_ST(ssreg, wrds - 1, daddr & 7));
61876a85281Smacallan		saddr += srcpitch;
61976a85281Smacallan		daddr += dstpitch;
62076a85281Smacallan		h--;
62176a85281Smacallan	}
62276a85281Smacallan}
62376a85281Smacallan
624f71acd79Smacallanstatic void
625f71acd79SmacallanCG14Copy8(PixmapPtr pDstPixmap,
626f71acd79Smacallan         int srcX, int srcY, int dstX, int dstY, int w, int h)
627f71acd79Smacallan{
628f71acd79Smacallan	ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
629f71acd79Smacallan	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
630f71acd79Smacallan	int dstpitch, dstoff, srcpitch, srcoff;
631f71acd79Smacallan	int srcstart, dststart, xinc, srcinc, dstinc;
632f71acd79Smacallan	int line, count, s, d, num;
633f71acd79Smacallan
634f71acd79Smacallan	ENTER;
635f71acd79Smacallan	dstpitch = exaGetPixmapPitch(pDstPixmap);
636f71acd79Smacallan	dstoff = exaGetPixmapOffset(pDstPixmap);
637f71acd79Smacallan	srcpitch = p->srcpitch;
638f71acd79Smacallan	srcoff = p->srcoff;
639f71acd79Smacallan	/*
640f71acd79Smacallan	 * should clear the WO bit in SX_CONTROL_STATUS, then check if SX
641f71acd79Smacallan	 * actually wrote anything and only sync if it did
642f71acd79Smacallan	 */
643f71acd79Smacallan	srcstart = srcX + (srcpitch * srcY) + srcoff;
644f71acd79Smacallan	dststart = dstX + (dstpitch * dstY) + dstoff;
645f71acd79Smacallan
646f71acd79Smacallan	if (p->ydir < 0) {
647f71acd79Smacallan		srcstart += (h - 1) * srcpitch;
648f71acd79Smacallan		dststart += (h - 1) * dstpitch;
649f71acd79Smacallan		srcinc = -srcpitch;
650f71acd79Smacallan		dstinc = -dstpitch;
651f71acd79Smacallan	} else {
652f71acd79Smacallan		srcinc = srcpitch;
653f71acd79Smacallan		dstinc = dstpitch;
654f71acd79Smacallan	}
655f787bc61Smacallan
656f787bc61Smacallan	/*
657f787bc61Smacallan	 * this copies up to 124 pixels wide in one go, so horizontal
658f787bc61Smacallan	 * direction / overlap don't matter
659f787bc61Smacallan	 * uses all 32bit accesses and funnel shifter for unaligned copies
660f787bc61Smacallan	 */
661f787bc61Smacallan	if ((w < 125) && (w > 8)) {
66276a85281Smacallan		switch (p->last_rop) {
66376a85281Smacallan			case 0xcc:
66476a85281Smacallan				CG14Copy8_short_norop(p, srcstart, dststart, w, h, srcinc, dstinc);
66576a85281Smacallan				break;
66676a85281Smacallan			default:
66776a85281Smacallan				CG14Copy8_short_rop(p, srcstart, dststart, w, h, srcinc, dstinc);
66876a85281Smacallan		}
669f787bc61Smacallan		return;
670f787bc61Smacallan	}
671f787bc61Smacallan
672f787bc61Smacallan	/*
673f787bc61Smacallan	 * only invert x direction if absolutely necessary, it's a pain to
674f787bc61Smacallan	 * go backwards on SX so avoid as much as possible
675f787bc61Smacallan	 */
676f787bc61Smacallan	if ((p->xdir < 0) && (srcoff == dstoff) && (srcY == dstY)) {
677f787bc61Smacallan		srcstart += (w - 32);
678f787bc61Smacallan		dststart += (w - 32);
679f787bc61Smacallan		xinc = -32;
680f787bc61Smacallan	} else
681f787bc61Smacallan		xinc = 32;
682f787bc61Smacallan
683f787bc61Smacallan	/*
684f787bc61Smacallan	 * for aligned copies we can go all 32bit and avoid VRAM reads in the
685f787bc61Smacallan	 * most common case
686f787bc61Smacallan	 */
68781c68cf8Smacallan	if (((srcstart & 3) == (dststart & 3)) && (xinc > 0)) {
68881c68cf8Smacallan		switch (p->last_rop) {
68981c68cf8Smacallan			case 0xcc:
69081c68cf8Smacallan				CG14Copy8_aligned_norop(p, srcstart, dststart, w, h, srcinc, dstinc);
69181c68cf8Smacallan				break;
69281c68cf8Smacallan			default:
69381c68cf8Smacallan				CG14Copy8_aligned_rop(p, srcstart, dststart, w, h, srcinc, dstinc);
69481c68cf8Smacallan		}
69581c68cf8Smacallan		return;
69681c68cf8Smacallan	}
697f787bc61Smacallan
698f71acd79Smacallan	if (p->last_rop == 0xcc) {
699f71acd79Smacallan		/* plain old copy */
700f71acd79Smacallan		if ( xinc > 0) {
701f71acd79Smacallan			/* going left to right */
702f71acd79Smacallan			for (line = 0; line < h; line++) {
703f71acd79Smacallan				count = 0;
704f71acd79Smacallan				s = srcstart;
705f71acd79Smacallan				d = dststart;
706f71acd79Smacallan				while ( count < w) {
707f71acd79Smacallan					num = min(32, w - count);
708f71acd79Smacallan					write_sx_io(p, s,
709f71acd79Smacallan					    SX_LDB(10, num - 1, s & 7));
710f71acd79Smacallan					write_sx_io(p, d,
711f71acd79Smacallan					    SX_STBM(10, num - 1, d & 7));
712f71acd79Smacallan					s += xinc;
713f71acd79Smacallan					d += xinc;
714f71acd79Smacallan					count += 32;
715f71acd79Smacallan				}
716f71acd79Smacallan				srcstart += srcinc;
717f71acd79Smacallan				dststart += dstinc;
718f71acd79Smacallan			}
719f71acd79Smacallan		} else {
720f71acd79Smacallan			/* going right to left */
721f71acd79Smacallan			int i, chunks = (w >> 5);
722f71acd79Smacallan			for (line = 0; line < h; line++) {
723f71acd79Smacallan				s = srcstart;
724f71acd79Smacallan				d = dststart;
725f71acd79Smacallan				count = w;
726f71acd79Smacallan				for (i = 0; i < chunks; i++) {
727f71acd79Smacallan					write_sx_io(p, s,
728f71acd79Smacallan					    SX_LDB(10, 31, s & 7));
729f71acd79Smacallan					write_sx_io(p, d,
730f71acd79Smacallan					    SX_STBM(10, 31, d & 7));
731f71acd79Smacallan					s -= 32;
732f71acd79Smacallan					d -= 32;
733f71acd79Smacallan					count -= 32;
734f71acd79Smacallan				}
735f71acd79Smacallan				/* leftovers, if any */
736f71acd79Smacallan				if (count > 0) {
737f71acd79Smacallan					s += (32 - count);
738f71acd79Smacallan					d += (32 - count);
739f71acd79Smacallan					write_sx_io(p, s,
740f71acd79Smacallan					    SX_LDB(10, count - 1, s & 7));
741f71acd79Smacallan					write_sx_io(p, d,
742f71acd79Smacallan					    SX_STBM(10, count - 1, d & 7));
743f71acd79Smacallan				}
744f71acd79Smacallan				srcstart += srcinc;
745f71acd79Smacallan				dststart += dstinc;
746f71acd79Smacallan			}
747f71acd79Smacallan		}
748f71acd79Smacallan	} else {
749f71acd79Smacallan		/* ROPs needed */
750f71acd79Smacallan		if ( xinc > 0) {
751f71acd79Smacallan			/* going left to right */
752f71acd79Smacallan			for (line = 0; line < h; line++) {
753f71acd79Smacallan				count = 0;
754f71acd79Smacallan				s = srcstart;
755f71acd79Smacallan				d = dststart;
756f71acd79Smacallan				while ( count < w) {
757f71acd79Smacallan					num = min(32, w - count);
758f71acd79Smacallan					write_sx_io(p, s,
759f71acd79Smacallan					    SX_LDB(10, num - 1, s & 7));
760f71acd79Smacallan					write_sx_io(p, d,
761f71acd79Smacallan					    SX_LDB(42, num - 1, d & 7));
762f71acd79Smacallan					if (num > 16) {
763f71acd79Smacallan						write_sx_reg(p, SX_INSTRUCTIONS,
764f71acd79Smacallan					    	 SX_ROP(10, 42, 74, 15));
765f71acd79Smacallan						write_sx_reg(p, SX_INSTRUCTIONS,
766f71acd79Smacallan					    	 SX_ROP(26, 58, 90, num - 17));
767f71acd79Smacallan					} else {
768f71acd79Smacallan						write_sx_reg(p, SX_INSTRUCTIONS,
769f71acd79Smacallan					    	 SX_ROP(10, 42, 74, num - 1));
770f71acd79Smacallan					}
771f71acd79Smacallan					write_sx_io(p, d,
772f71acd79Smacallan					    SX_STBM(74, num - 1, d & 7));
773f71acd79Smacallan					s += xinc;
774f71acd79Smacallan					d += xinc;
775f71acd79Smacallan					count += 32;
776f71acd79Smacallan				}
777f71acd79Smacallan				srcstart += srcinc;
778f71acd79Smacallan				dststart += dstinc;
779f71acd79Smacallan			}
780f71acd79Smacallan		} else {
781f71acd79Smacallan			/* going right to left */
782f71acd79Smacallan			int i, chunks = (w >> 5);
783f71acd79Smacallan			for (line = 0; line < h; line++) {
784f71acd79Smacallan				s = srcstart;
785f71acd79Smacallan				d = dststart;
786f71acd79Smacallan				count = w;
787f71acd79Smacallan				for (i = 0; i < chunks; i++) {
788f71acd79Smacallan					write_sx_io(p, s, SX_LDB(10, 31, s & 7));
789f71acd79Smacallan					write_sx_io(p, d, SX_LDB(42, 31, d & 7));
790f71acd79Smacallan					write_sx_reg(p, SX_INSTRUCTIONS,
791f71acd79Smacallan				    	    SX_ROP(10, 42, 74, 15));
792f71acd79Smacallan					write_sx_reg(p, SX_INSTRUCTIONS,
793f71acd79Smacallan				    	    SX_ROP(26, 58, 90, 15));
794f71acd79Smacallan					write_sx_io(p, d,
795f71acd79Smacallan					    SX_STBM(74, 31, d & 7));
796f71acd79Smacallan					s -= 128;
797f71acd79Smacallan					d -= 128;
798f71acd79Smacallan					count -= 32;
799f71acd79Smacallan				}
800f71acd79Smacallan				/* leftovers, if any */
801f71acd79Smacallan				if (count > 0) {
802f71acd79Smacallan					s += (32 - count);
803f71acd79Smacallan					d += (32 - count);
804f71acd79Smacallan					write_sx_io(p, s,
805f71acd79Smacallan					    SX_LDB(10, count - 1, s & 7));
806f71acd79Smacallan					write_sx_io(p, d,
807f71acd79Smacallan					    SX_LDB(42, count - 1, d & 7));
808f71acd79Smacallan					if (count > 16) {
809f71acd79Smacallan						write_sx_reg(p, SX_INSTRUCTIONS,
810f71acd79Smacallan					    	    SX_ROP(10, 42, 74, 15));
811f71acd79Smacallan						write_sx_reg(p, SX_INSTRUCTIONS,
812f71acd79Smacallan					    	 SX_ROP(26, 58, 90, count - 17));
813f71acd79Smacallan					} else {
814f71acd79Smacallan						write_sx_reg(p, SX_INSTRUCTIONS,
815f71acd79Smacallan					    	 SX_ROP(10, 42, 74, count - 1));
816f71acd79Smacallan					}
817f71acd79Smacallan
818f71acd79Smacallan					write_sx_io(p, d,
819f71acd79Smacallan					    SX_STBM(74, count - 1, d & 7));
820f71acd79Smacallan				}
821f71acd79Smacallan				srcstart += srcinc;
822f71acd79Smacallan				dststart += dstinc;
823f71acd79Smacallan			}
824f71acd79Smacallan		}
825f71acd79Smacallan	}
826f71acd79Smacallan	exaMarkSync(pDstPixmap->drawable.pScreen);
827f71acd79Smacallan}
828f71acd79Smacallan
8294261fa58Smacallanstatic void
8304261fa58SmacallanCG14DoneCopy(PixmapPtr pDstPixmap)
8314261fa58Smacallan{
8324261fa58Smacallan}
8334261fa58Smacallan
8344261fa58Smacallanstatic Bool
8354261fa58SmacallanCG14PrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
8364261fa58Smacallan{
8374261fa58Smacallan	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
8384261fa58Smacallan	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
8394261fa58Smacallan
8404261fa58Smacallan	ENTER;
841faf11d72Schristos	DPRINTF(X_ERROR, "bits per pixel: %d %08lx\n",
842b8ad197aSmacallan	    pPixmap->drawable.bitsPerPixel, fg);
843b8ad197aSmacallan
844dbf8597cSmacallan	/*
845dbf8597cSmacallan	 * GXset and GXclear are really just specual cases of GXcopy with
846dbf8597cSmacallan	 * fixed fill colour
847dbf8597cSmacallan	 */
848dbf8597cSmacallan	switch (alu) {
849dbf8597cSmacallan		case GXclear:
850dbf8597cSmacallan			alu = GXcopy;
851dbf8597cSmacallan			fg = 0;
852dbf8597cSmacallan			break;
853dbf8597cSmacallan		case GXset:
854dbf8597cSmacallan			alu = GXcopy;
855dbf8597cSmacallan			fg = 0xffffffff;
856dbf8597cSmacallan			break;
857dbf8597cSmacallan	}
858b8ad197aSmacallan	/* repeat the colour in every sub byte if we're in 8 bit */
859b8ad197aSmacallan	if (pPixmap->drawable.bitsPerPixel == 8) {
860b8ad197aSmacallan		fg |= fg << 8;
861b8ad197aSmacallan		fg |= fg << 16;
862b8ad197aSmacallan	}
8634261fa58Smacallan	write_sx_reg(p, SX_QUEUED(8), fg);
8644261fa58Smacallan	write_sx_reg(p, SX_QUEUED(9), fg);
8654261fa58Smacallan	if (planemask != p->last_mask) {
8664261fa58Smacallan		CG14Wait(p);
8674261fa58Smacallan		write_sx_reg(p, SX_PLANEMASK, planemask);
8684261fa58Smacallan		p->last_mask = planemask;
8694261fa58Smacallan	}
8704261fa58Smacallan	alu = sx_rop[alu];
8714261fa58Smacallan	if (alu != p->last_rop) {
8724261fa58Smacallan		CG14Wait(p);
8734261fa58Smacallan		write_sx_reg(p, SX_ROP_CONTROL, alu);
8744261fa58Smacallan		p->last_rop = alu;
8754261fa58Smacallan	}
876dbf8597cSmacallan
8774261fa58Smacallan	DPRINTF(X_ERROR, "%s: %x\n", __func__, alu);
8784261fa58Smacallan	return TRUE;
8794261fa58Smacallan}
8804261fa58Smacallan
8814261fa58Smacallanstatic void
8824261fa58SmacallanCG14Solid32(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
8834261fa58Smacallan{
8844261fa58Smacallan	int line, x, num;
8854261fa58Smacallan	uint32_t ptr;
8864261fa58Smacallan
8874261fa58Smacallan	ENTER;
8884261fa58Smacallan	if (p->last_rop == 0xcc) {
8894261fa58Smacallan		/* simple fill */
8904261fa58Smacallan		for (line = 0; line < h; line++) {
8914261fa58Smacallan			x = 0;
8924261fa58Smacallan			while (x < w) {
8934261fa58Smacallan				ptr = start + (x << 2);
8944261fa58Smacallan				num = min(32, w - x);
8954261fa58Smacallan				write_sx_io(p, ptr,
8964261fa58Smacallan				    SX_STS(8, num - 1, ptr & 7));
8974261fa58Smacallan				x += 32;
8984261fa58Smacallan			}
8994261fa58Smacallan			start += pitch;
9004261fa58Smacallan		}
9014261fa58Smacallan	} else if (p->last_rop == 0xaa) {
9024261fa58Smacallan		/* nothing to do here */
9034261fa58Smacallan		return;
9044261fa58Smacallan	} else {
9054261fa58Smacallan		/* alright, let's do actual ROP stuff */
9064261fa58Smacallan
9074261fa58Smacallan		/* first repeat the fill colour into 16 registers */
9084261fa58Smacallan		write_sx_reg(p, SX_INSTRUCTIONS,
9094261fa58Smacallan		    SX_SELECT_S(8, 8, 10, 15));
9104261fa58Smacallan
9114261fa58Smacallan		for (line = 0; line < h; line++) {
9124261fa58Smacallan			x = 0;
9134261fa58Smacallan			while (x < w) {
9144261fa58Smacallan				ptr = start + (x << 2);
9154261fa58Smacallan				num = min(32, w - x);
9164261fa58Smacallan				/* now suck fb data into registers */
9174261fa58Smacallan				write_sx_io(p, ptr,
9184261fa58Smacallan				    SX_LD(42, num - 1, ptr & 7));
9194261fa58Smacallan				/*
9204261fa58Smacallan				 * ROP them with the fill data we left in 10
9214261fa58Smacallan				 * non-memory ops can only have counts up to 16
9224261fa58Smacallan				 */
9234261fa58Smacallan				if (num <= 16) {
9244261fa58Smacallan					write_sx_reg(p, SX_INSTRUCTIONS,
9254261fa58Smacallan					    SX_ROP(10, 42, 74, num - 1));
9264261fa58Smacallan				} else {
9274261fa58Smacallan					write_sx_reg(p, SX_INSTRUCTIONS,
9284261fa58Smacallan					    SX_ROP(10, 42, 74, 15));
9294261fa58Smacallan					write_sx_reg(p, SX_INSTRUCTIONS,
9304261fa58Smacallan					    SX_ROP(10, 58, 90, num - 17));
9314261fa58Smacallan				}
9324261fa58Smacallan				/* and write the result back into memory */
9334261fa58Smacallan				write_sx_io(p, ptr,
9344261fa58Smacallan				    SX_ST(74, num - 1, ptr & 7));
9354261fa58Smacallan				x += 32;
9364261fa58Smacallan			}
9374261fa58Smacallan			start += pitch;
9384261fa58Smacallan		}
9394261fa58Smacallan	}
9404261fa58Smacallan}
9414261fa58Smacallan
9424261fa58Smacallanstatic void
9434261fa58SmacallanCG14Solid8(Cg14Ptr p, uint32_t start, uint32_t pitch, int w, int h)
9444261fa58Smacallan{
945dbf8597cSmacallan	int line, num, pre, cnt;
9464261fa58Smacallan	uint32_t ptr;
9474261fa58Smacallan
9484261fa58Smacallan	ENTER;
949b8ad197aSmacallan	pre = start & 3;
950b8ad197aSmacallan	if (pre != 0) pre = 4 - pre;
9514261fa58Smacallan
9524261fa58Smacallan	if (p->last_rop == 0xcc) {
9534261fa58Smacallan		/* simple fill */
9544261fa58Smacallan		for (line = 0; line < h; line++) {
955b8ad197aSmacallan			ptr = start;
956b8ad197aSmacallan			cnt = w;
957b46cab2aSmacallan			pre = min(pre, cnt);
958b8ad197aSmacallan			if (pre) {
959b8ad197aSmacallan				write_sx_io(p, ptr & ~7, SX_STBS(8, pre - 1, ptr & 7));
960b8ad197aSmacallan				ptr += pre;
961b8ad197aSmacallan				cnt -= pre;
962b46cab2aSmacallan				if (cnt == 0) goto next;
963b8ad197aSmacallan			}
964b8ad197aSmacallan			/* now do the aligned pixels in 32bit chunks */
965b8ad197aSmacallan			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
966b8ad197aSmacallan			while(cnt > 3) {
967b8ad197aSmacallan				num = min(32, cnt >> 2);
968b8ad197aSmacallan				write_sx_io(p, ptr & ~7, SX_STS(8, num - 1, ptr & 7));
969b8ad197aSmacallan				ptr += num << 2;
970b8ad197aSmacallan				cnt -= num << 2;
971b8ad197aSmacallan			}
972b8ad197aSmacallan			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
973b8ad197aSmacallan			if (cnt > 0) {
974b8ad197aSmacallan				write_sx_io(p, ptr & ~7, SX_STBS(8, cnt - 1, ptr & 7));
9754261fa58Smacallan			}
976b8ad197aSmacallan			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
977b46cab2aSmacallannext:
9784261fa58Smacallan			start += pitch;
9794261fa58Smacallan		}
9804261fa58Smacallan	} else if (p->last_rop == 0xaa) {
9814261fa58Smacallan		/* nothing to do here */
9824261fa58Smacallan		return;
9834261fa58Smacallan	} else {
9844261fa58Smacallan		/* alright, let's do actual ROP stuff */
9854261fa58Smacallan
9864261fa58Smacallan		/* first repeat the fill colour into 16 registers */
9874261fa58Smacallan		write_sx_reg(p, SX_INSTRUCTIONS,
9884261fa58Smacallan		    SX_SELECT_S(8, 8, 10, 15));
9894261fa58Smacallan
9904261fa58Smacallan		for (line = 0; line < h; line++) {
991dbf8597cSmacallan			ptr = start;
992dbf8597cSmacallan			cnt = w;
993dbf8597cSmacallan			pre = min(pre, cnt);
994dbf8597cSmacallan			if (pre) {
995dbf8597cSmacallan				write_sx_io(p, ptr & ~7, SX_LDB(26, pre - 1, ptr & 7));
996dbf8597cSmacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(10, 26, 42, pre - 1));
997dbf8597cSmacallan				write_sx_io(p, ptr & ~7, SX_STB(42, pre - 1, ptr & 7));
998dbf8597cSmacallan				ptr += pre;
999dbf8597cSmacallan				cnt -= pre;
1000dbf8597cSmacallan				if (cnt == 0) goto next2;
1001dbf8597cSmacallan			}
1002dbf8597cSmacallan			/* now do the aligned pixels in 32bit chunks */
1003dbf8597cSmacallan			if (ptr & 3) xf86Msg(X_ERROR, "%s %x\n", __func__, ptr);
1004dbf8597cSmacallan			while(cnt > 3) {
1005dbf8597cSmacallan				num = min(32, cnt >> 2);
1006dbf8597cSmacallan				write_sx_io(p, ptr & ~7, SX_LD(26, num - 1, ptr & 7));
10074261fa58Smacallan				if (num <= 16) {
10084261fa58Smacallan					write_sx_reg(p, SX_INSTRUCTIONS,
1009dbf8597cSmacallan					    SX_ROP(10, 26, 58, num - 1));
10104261fa58Smacallan				} else {
10114261fa58Smacallan					write_sx_reg(p, SX_INSTRUCTIONS,
1012dbf8597cSmacallan					    SX_ROP(10, 26, 58, 15));
10134261fa58Smacallan					write_sx_reg(p, SX_INSTRUCTIONS,
1014dbf8597cSmacallan					    SX_ROP(10, 42, 74, num - 17));
10154261fa58Smacallan				}
1016dbf8597cSmacallan				write_sx_io(p, ptr & ~7, SX_ST(58, num - 1, ptr & 7));
1017dbf8597cSmacallan				ptr += num << 2;
1018dbf8597cSmacallan				cnt -= num << 2;
10194261fa58Smacallan			}
1020dbf8597cSmacallan			if (cnt > 3) xf86Msg(X_ERROR, "%s cnt %d\n", __func__, cnt);
1021dbf8597cSmacallan			if (cnt > 0) {
1022dbf8597cSmacallan				write_sx_io(p, ptr & ~7, SX_LDB(26, cnt - 1, ptr & 7));
1023dbf8597cSmacallan				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROP(10, 26, 42, cnt - 1));
1024dbf8597cSmacallan				write_sx_io(p, ptr & ~7, SX_STB(42, cnt - 1, ptr & 7));
1025dbf8597cSmacallan			}
1026dbf8597cSmacallan			if ((ptr + cnt) != (start + w)) xf86Msg(X_ERROR, "%s %x vs %x\n", __func__, ptr + cnt, start + w);
1027dbf8597cSmacallannext2:
10284261fa58Smacallan			start += pitch;
10294261fa58Smacallan		}
10304261fa58Smacallan	}
10314261fa58Smacallan}
10324261fa58Smacallan
10334261fa58Smacallanstatic void
10344261fa58SmacallanCG14Solid(PixmapPtr pPixmap, int x1, int y1, int x2, int y2)
10354261fa58Smacallan{
10364261fa58Smacallan	ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
10374261fa58Smacallan	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
10384261fa58Smacallan	int w = x2 - x1, h = y2 - y1, dstoff, dstpitch;
10394261fa58Smacallan	int start, depth;
10404261fa58Smacallan
10414261fa58Smacallan	ENTER;
10424261fa58Smacallan	dstpitch = exaGetPixmapPitch(pPixmap);
10434261fa58Smacallan	dstoff = exaGetPixmapOffset(pPixmap);
10444261fa58Smacallan
10454261fa58Smacallan	depth = pPixmap->drawable.bitsPerPixel;
10464261fa58Smacallan	switch (depth) {
10474261fa58Smacallan		case 32:
10484261fa58Smacallan			start = dstoff + (y1 * dstpitch) + (x1 << 2);
10494261fa58Smacallan			CG14Solid32(p, start, dstpitch, w, h);
10504261fa58Smacallan			break;
10514261fa58Smacallan		case 8:
10524261fa58Smacallan			start = dstoff + (y1 * dstpitch) + x1;
10534261fa58Smacallan			CG14Solid8(p, start, dstpitch, w, h);
10544261fa58Smacallan			break;
10554261fa58Smacallan	}
10564261fa58Smacallan
10574261fa58Smacallan	DPRINTF(X_ERROR, "Solid %d %d %d %d, %d %d -> %d\n", x1, y1, x2, y2,
10584261fa58Smacallan	    dstpitch, dstoff, start);
10594261fa58Smacallan	DPRINTF(X_ERROR, "%x %x %x\n", p->last_rop,
10604261fa58Smacallan	    read_sx_reg(p, SX_QUEUED(8)), read_sx_reg(p, SX_QUEUED(9)));
10614261fa58Smacallan	exaMarkSync(pPixmap->drawable.pScreen);
10624261fa58Smacallan}
10634261fa58Smacallan
10644261fa58Smacallan/*
10654261fa58Smacallan * Memcpy-based UTS.
10664261fa58Smacallan */
10674261fa58Smacallanstatic Bool
10684261fa58SmacallanCG14UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
10694261fa58Smacallan    char *src, int src_pitch)
10704261fa58Smacallan{
10714261fa58Smacallan	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
10724261fa58Smacallan	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
10734261fa58Smacallan	char  *dst        = p->fb + exaGetPixmapOffset(pDst);
10744261fa58Smacallan	int    dst_pitch  = exaGetPixmapPitch(pDst);
10754261fa58Smacallan
10764261fa58Smacallan	int bpp    = pDst->drawable.bitsPerPixel;
10774261fa58Smacallan	int cpp    = (bpp + 7) >> 3;
10784261fa58Smacallan	int wBytes = w * cpp;
10794261fa58Smacallan
10804261fa58Smacallan	ENTER;
1081f71acd79Smacallan	DPRINTF(X_ERROR, "%s depth %d\n", __func__, bpp);
10824261fa58Smacallan	dst += (x * cpp) + (y * dst_pitch);
10834261fa58Smacallan
10844261fa58Smacallan	CG14Wait(p);
10854261fa58Smacallan
10864261fa58Smacallan	while (h--) {
10874261fa58Smacallan		memcpy(dst, src, wBytes);
10884261fa58Smacallan		src += src_pitch;
10894261fa58Smacallan		dst += dst_pitch;
10904261fa58Smacallan	}
10914261fa58Smacallan	__asm("stbar;");
10924261fa58Smacallan	return TRUE;
10934261fa58Smacallan}
10944261fa58Smacallan
10954261fa58Smacallan/*
10964261fa58Smacallan * Memcpy-based DFS.
10974261fa58Smacallan */
10984261fa58Smacallanstatic Bool
10994261fa58SmacallanCG14DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
11004261fa58Smacallan    char *dst, int dst_pitch)
11014261fa58Smacallan{
11024261fa58Smacallan	ScrnInfoPtr pScrn = xf86Screens[pSrc->drawable.pScreen->myNum];
11034261fa58Smacallan	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
11044261fa58Smacallan	char  *src        = p->fb + exaGetPixmapOffset(pSrc);
11054261fa58Smacallan	int    src_pitch  = exaGetPixmapPitch(pSrc);
11064261fa58Smacallan
11074261fa58Smacallan	ENTER;
11084261fa58Smacallan	int bpp    = pSrc->drawable.bitsPerPixel;
11094261fa58Smacallan	int cpp    = (bpp + 7) >> 3;
11104261fa58Smacallan	int wBytes = w * cpp;
11114261fa58Smacallan
11124261fa58Smacallan	src += (x * cpp) + (y * src_pitch);
11134261fa58Smacallan
11144261fa58Smacallan	CG14Wait(p);
11154261fa58Smacallan
11164261fa58Smacallan	while (h--) {
11174261fa58Smacallan		memcpy(dst, src, wBytes);
11184261fa58Smacallan		src += src_pitch;
11194261fa58Smacallan		dst += dst_pitch;
11204261fa58Smacallan	}
11214261fa58Smacallan
11224261fa58Smacallan	return TRUE;
11234261fa58Smacallan}
11244261fa58Smacallan
11254261fa58SmacallanBool
11264261fa58SmacallanCG14CheckComposite(int op, PicturePtr pSrcPicture,
11274261fa58Smacallan                           PicturePtr pMaskPicture,
11284261fa58Smacallan                           PicturePtr pDstPicture)
11294261fa58Smacallan{
11304261fa58Smacallan	int i, ok = FALSE;
11314261fa58Smacallan
11324261fa58Smacallan	ENTER;
11334261fa58Smacallan
11344261fa58Smacallan	/*
11354261fa58Smacallan	 * SX is in theory capable of accelerating pretty much all Xrender ops,
11364261fa58Smacallan	 * even coordinate transformation and gradients. Support will be added
11374261fa58Smacallan	 * over time and likely have to spill over into its own source file.
11384261fa58Smacallan	 */
11394261fa58Smacallan
1140a3a2ba44Smacallan	if ((op != PictOpOver) && (op != PictOpAdd) && (op != PictOpSrc)) {
1141fe97f391Smacallan		DPRINTF(X_ERROR, "%s: rejecting %d\n", __func__, op);
11424261fa58Smacallan		return FALSE;
11434261fa58Smacallan	}
11444261fa58Smacallan
11454bd47ccfSmacallan	if (pSrcPicture != NULL) {
11464bd47ccfSmacallan		i = 0;
11474bd47ccfSmacallan		while ((i < arraysize(src_formats)) && (!ok)) {
11484bd47ccfSmacallan			ok =  (pSrcPicture->format == src_formats[i]);
11494bd47ccfSmacallan			i++;
11504bd47ccfSmacallan		}
11514bd47ccfSmacallan
11524bd47ccfSmacallan		if (!ok) {
11534bd47ccfSmacallan			DPRINTF(X_ERROR, "%s: unsupported src format %x\n",
11544bd47ccfSmacallan			    __func__, pSrcPicture->format);
11554bd47ccfSmacallan			return FALSE;
11564bd47ccfSmacallan		}
11574bd47ccfSmacallan		DPRINTF(X_ERROR, "src is %x, %d\n", pSrcPicture->format, op);
11584261fa58Smacallan	}
11594261fa58Smacallan
11604bd47ccfSmacallan	if (pDstPicture != NULL) {
11614bd47ccfSmacallan		i = 0;
11624bd47ccfSmacallan		ok = FALSE;
11634bd47ccfSmacallan		while ((i < arraysize(src_formats)) && (!ok)) {
11644bd47ccfSmacallan			ok =  (pDstPicture->format == src_formats[i]);
11654bd47ccfSmacallan			i++;
11664bd47ccfSmacallan		}
11674bd47ccfSmacallan
11684bd47ccfSmacallan		if (!ok) {
11694bd47ccfSmacallan			DPRINTF(X_ERROR, "%s: unsupported dst format %x\n",
11704bd47ccfSmacallan			    __func__, pDstPicture->format);
11714bd47ccfSmacallan			return FALSE;
11724bd47ccfSmacallan		}
11734bd47ccfSmacallan		DPRINTF(X_ERROR, "dst is %x, %d\n", pDstPicture->format, op);
11744bd47ccfSmacallan	}
11754261fa58Smacallan
11764261fa58Smacallan	if (pMaskPicture != NULL) {
11774261fa58Smacallan		DPRINTF(X_ERROR, "mask is %x %d %d\n", pMaskPicture->format,
11784261fa58Smacallan		    pMaskPicture->pDrawable->width,
11794261fa58Smacallan		    pMaskPicture->pDrawable->height);
11804261fa58Smacallan	}
11814261fa58Smacallan	return TRUE;
11824261fa58Smacallan}
11834261fa58Smacallan
11844261fa58SmacallanBool
11854261fa58SmacallanCG14PrepareComposite(int op, PicturePtr pSrcPicture,
11864261fa58Smacallan                             PicturePtr pMaskPicture,
11874261fa58Smacallan                             PicturePtr pDstPicture,
11884261fa58Smacallan                             PixmapPtr  pSrc,
11894261fa58Smacallan                             PixmapPtr  pMask,
11904261fa58Smacallan                             PixmapPtr  pDst)
11914261fa58Smacallan{
11924261fa58Smacallan	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
11934261fa58Smacallan	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
11944261fa58Smacallan
11954261fa58Smacallan	ENTER;
11964261fa58Smacallan
1197f7cb851fSmacallan	p->no_source_pixmap = FALSE;
1198f7cb851fSmacallan	p->source_is_solid = FALSE;
1199f7cb851fSmacallan
1200a3a2ba44Smacallan	if (pSrcPicture->format == PICT_a1) {
12016bdc2ffdSmacallan		xf86Msg(X_ERROR, "src mono, dst %x, op %d\n",
12026bdc2ffdSmacallan		    pDstPicture->format, op);
1203a3a2ba44Smacallan		if (pMaskPicture != NULL) {
1204a3a2ba44Smacallan			xf86Msg(X_ERROR, "msk %x\n", pMaskPicture->format);
1205a3a2ba44Smacallan		}
1206f7cb851fSmacallan	}
12074261fa58Smacallan	if (pSrcPicture->pSourcePict != NULL) {
12084261fa58Smacallan		if (pSrcPicture->pSourcePict->type == SourcePictTypeSolidFill) {
12094261fa58Smacallan			p->fillcolour =
12104261fa58Smacallan			    pSrcPicture->pSourcePict->solidFill.color;
1211f7cb851fSmacallan			DPRINTF(X_ERROR, "%s: solid src %08x\n",
12124261fa58Smacallan			    __func__, p->fillcolour);
1213f7cb851fSmacallan			p->no_source_pixmap = TRUE;
1214f7cb851fSmacallan			p->source_is_solid = TRUE;
12154261fa58Smacallan		}
12164261fa58Smacallan	}
12174261fa58Smacallan	if ((pMaskPicture != NULL) && (pMaskPicture->pSourcePict != NULL)) {
12184261fa58Smacallan		if (pMaskPicture->pSourcePict->type ==
12194261fa58Smacallan		    SourcePictTypeSolidFill) {
12204261fa58Smacallan			p->fillcolour =
12214261fa58Smacallan			   pMaskPicture->pSourcePict->solidFill.color;
1222a3a2ba44Smacallan			xf86Msg(X_ERROR, "%s: solid mask %08x\n",
12234261fa58Smacallan			    __func__, p->fillcolour);
12244261fa58Smacallan		}
12254261fa58Smacallan	}
12264261fa58Smacallan	if (pMaskPicture != NULL) {
1227239808baSmacallan		p->mskoff = exaGetPixmapOffset(pMask);
12284261fa58Smacallan		p->mskpitch = exaGetPixmapPitch(pMask);
12294261fa58Smacallan		p->mskformat = pMaskPicture->format;
1230a3a2ba44Smacallan	} else {
1231239808baSmacallan		p->mskoff = 0;
1232a3a2ba44Smacallan		p->mskpitch = 0;
1233a3a2ba44Smacallan		p->mskformat = 0;
12344261fa58Smacallan	}
1235f7cb851fSmacallan	if (pSrc != NULL) {
1236f7cb851fSmacallan		p->source_is_solid =
1237f7cb851fSmacallan		   ((pSrc->drawable.width == 1) && (pSrc->drawable.height == 1));
1238f7cb851fSmacallan		p->srcoff = exaGetPixmapOffset(pSrc);
1239f7cb851fSmacallan		p->srcpitch = exaGetPixmapPitch(pSrc);
1240f7cb851fSmacallan		if (p->source_is_solid) {
1241f7cb851fSmacallan			p->fillcolour = *(uint32_t *)(p->fb + p->srcoff);
1242f7cb851fSmacallan		}
1243f7cb851fSmacallan	}
12444261fa58Smacallan	p->srcformat = pSrcPicture->format;
12454261fa58Smacallan	p->dstformat = pDstPicture->format;
1246f7cb851fSmacallan
1247f7cb851fSmacallan	if (p->source_is_solid) {
1248f7cb851fSmacallan		uint32_t temp;
1249f7cb851fSmacallan
1250f7cb851fSmacallan		/* stuff source colour into SX registers, swap as needed */
1251f7cb851fSmacallan		temp = p->fillcolour;
1252f7cb851fSmacallan		switch (p->srcformat) {
1253f7cb851fSmacallan			case PICT_a8r8g8b8:
1254f7cb851fSmacallan			case PICT_x8r8g8b8:
1255f7cb851fSmacallan				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
1256f7cb851fSmacallan				temp = temp >> 8;
1257f7cb851fSmacallan				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
1258f7cb851fSmacallan				temp = temp >> 8;
1259f7cb851fSmacallan				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
1260f7cb851fSmacallan				break;
1261f7cb851fSmacallan			case PICT_a8b8g8r8:
1262f7cb851fSmacallan			case PICT_x8b8g8r8:
1263f7cb851fSmacallan				write_sx_reg(p, SX_QUEUED(11), temp & 0xff);
1264f7cb851fSmacallan				temp = temp >> 8;
1265f7cb851fSmacallan				write_sx_reg(p, SX_QUEUED(10), temp & 0xff);
1266f7cb851fSmacallan				temp = temp >> 8;
1267f7cb851fSmacallan				write_sx_reg(p, SX_QUEUED(9), temp & 0xff);
1268f7cb851fSmacallan				break;
1269f7cb851fSmacallan		}
1270f7cb851fSmacallan		write_sx_reg(p, SX_QUEUED(8), 0xff);
1271f7cb851fSmacallan	}
12724261fa58Smacallan	p->op = op;
1273a3a2ba44Smacallan	if (op == PictOpSrc) {
1274a3a2ba44Smacallan		CG14PrepareCopy(pSrc, pDst, 1, 1, GXcopy, 0xffffffff);
1275a3a2ba44Smacallan	}
12764261fa58Smacallan#ifdef SX_DEBUG
12774261fa58Smacallan	DPRINTF(X_ERROR, "%x %x -> %x\n", p->srcoff, p->mskoff,
12784261fa58Smacallan	    *(uint32_t *)(p->fb + p->srcoff));
12794261fa58Smacallan#endif
12804261fa58Smacallan	return TRUE;
12814261fa58Smacallan}
12824261fa58Smacallan
12834261fa58Smacallanvoid
12844261fa58SmacallanCG14Composite(PixmapPtr pDst, int srcX, int srcY,
12854261fa58Smacallan                              int maskX, int maskY,
12864261fa58Smacallan                              int dstX, int dstY,
12874261fa58Smacallan                              int width, int height)
12884261fa58Smacallan{
12894261fa58Smacallan	ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
12904261fa58Smacallan	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
12914261fa58Smacallan	uint32_t dstoff, dstpitch;
12924261fa58Smacallan	uint32_t dst, msk, src;
1293e311bbeeSmacallan	int flip = 0;
12944261fa58Smacallan
12954261fa58Smacallan	ENTER;
12964261fa58Smacallan	dstoff = exaGetPixmapOffset(pDst);
12974261fa58Smacallan	dstpitch = exaGetPixmapPitch(pDst);
12984261fa58Smacallan
1299e311bbeeSmacallan	flip = (PICT_FORMAT_TYPE(p->srcformat) !=
1300e311bbeeSmacallan		PICT_FORMAT_TYPE(p->dstformat));
1301e311bbeeSmacallan
13024261fa58Smacallan	switch (p->op) {
13034261fa58Smacallan		case PictOpOver:
13044261fa58Smacallan			dst = dstoff + (dstY * dstpitch) + (dstX << 2);
13054261fa58Smacallan			DPRINTF(X_ERROR, "Over %08x %08x, %d %d\n",
13064261fa58Smacallan			    p->mskformat, p->dstformat, srcX, srcY);
1307a3a2ba44Smacallan			if (p->source_is_solid) {
1308a3a2ba44Smacallan				switch (p->mskformat) {
1309a3a2ba44Smacallan					case PICT_a8:
1310a3a2ba44Smacallan						msk = p->mskoff +
1311a3a2ba44Smacallan						    (maskY * p->mskpitch) +
1312a3a2ba44Smacallan						    maskX;
1313a3a2ba44Smacallan						CG14Comp_Over8Solid(p,
1314a3a2ba44Smacallan						    msk, p->mskpitch,
1315a3a2ba44Smacallan						    dst, dstpitch,
1316a3a2ba44Smacallan						    width, height);
1317a3a2ba44Smacallan						break;
1318a3a2ba44Smacallan					case PICT_a8r8g8b8:
1319a3a2ba44Smacallan					case PICT_a8b8g8r8:
1320a3a2ba44Smacallan						msk = p->mskoff +
1321a3a2ba44Smacallan						    (maskY * p->mskpitch) +
1322a3a2ba44Smacallan						    (maskX << 2);
1323a3a2ba44Smacallan						CG14Comp_Over32Solid(p,
1324a3a2ba44Smacallan						    msk, p->mskpitch,
1325a3a2ba44Smacallan						    dst, dstpitch,
1326a3a2ba44Smacallan						    width, height);
1327a3a2ba44Smacallan						break;
1328a3a2ba44Smacallan					default:
1329a3a2ba44Smacallan						xf86Msg(X_ERROR,
1330f71acd79Smacallan						  "unsupported mask format %08x\n", p->mskformat);
1331a3a2ba44Smacallan				}
1332a3a2ba44Smacallan			} else {
13336bdc2ffdSmacallan				DPRINTF(X_ERROR, "non-solid over with msk %x\n",
13346bdc2ffdSmacallan				    p->mskformat);
1335a3a2ba44Smacallan				switch (p->srcformat) {
1336a3a2ba44Smacallan					case PICT_a8r8g8b8:
1337a3a2ba44Smacallan					case PICT_a8b8g8r8:
1338a3a2ba44Smacallan						src = p->srcoff +
1339a3a2ba44Smacallan						    (srcY * p->srcpitch) +
1340a3a2ba44Smacallan						    (srcX << 2);
1341a3a2ba44Smacallan						dst = dstoff +
1342a3a2ba44Smacallan						    (dstY * dstpitch) +
1343a3a2ba44Smacallan						    (dstX << 2);
1344a3a2ba44Smacallan						if (p->mskformat == PICT_a8) {
1345a3a2ba44Smacallan							msk = p->mskoff +
1346a3a2ba44Smacallan							    (maskY * p->mskpitch) +
1347a3a2ba44Smacallan							    maskX;
1348a3a2ba44Smacallan							CG14Comp_Over32Mask(p,
1349a3a2ba44Smacallan							    src, p->srcpitch,
1350a3a2ba44Smacallan							    msk, p->mskpitch,
1351a3a2ba44Smacallan							    dst, dstpitch,
1352e311bbeeSmacallan							    width, height, flip);
1353a3a2ba44Smacallan						} else {
1354a3a2ba44Smacallan							CG14Comp_Over32(p,
1355a3a2ba44Smacallan							    src, p->srcpitch,
1356a3a2ba44Smacallan							    dst, dstpitch,
1357e311bbeeSmacallan							    width, height, flip);
1358a3a2ba44Smacallan						}
1359a3a2ba44Smacallan						break;
1360a3a2ba44Smacallan					case PICT_x8r8g8b8:
1361a3a2ba44Smacallan					case PICT_x8b8g8r8:
13626bdc2ffdSmacallan						src = p->srcoff +
13636bdc2ffdSmacallan						    (srcY * p->srcpitch) +
13646bdc2ffdSmacallan						    (srcX << 2);
13656bdc2ffdSmacallan						dst = dstoff +
13666bdc2ffdSmacallan						    (dstY * dstpitch) +
13676bdc2ffdSmacallan						    (dstX << 2);
13686bdc2ffdSmacallan						if (p->mskformat == PICT_a8) {
13696bdc2ffdSmacallan							msk = p->mskoff +
13706bdc2ffdSmacallan							    (maskY * p->mskpitch) +
13716bdc2ffdSmacallan							    maskX;
13726bdc2ffdSmacallan							CG14Comp_Over32Mask_noalpha(p,
13736bdc2ffdSmacallan							    src, p->srcpitch,
13746bdc2ffdSmacallan							    msk, p->mskpitch,
1375fa158432Smacallan							    dst, dstpitch,
1376e311bbeeSmacallan							    width, height, flip);
1377fa158432Smacallan						} else if ((p->mskformat == PICT_a8r8g8b8) ||
1378fa158432Smacallan							   (p->mskformat == PICT_a8b8g8r8)) {
1379fa158432Smacallan							msk = p->mskoff +
1380fa158432Smacallan							    (maskY * p->mskpitch) +
1381fa158432Smacallan							    (maskX << 2);
1382fa158432Smacallan							CG14Comp_Over32Mask32_noalpha(p,
1383fa158432Smacallan							    src, p->srcpitch,
1384fa158432Smacallan							    msk, p->mskpitch,
13856bdc2ffdSmacallan							    dst, dstpitch,
1386e311bbeeSmacallan							    width, height, flip);
13876bdc2ffdSmacallan						} else {
13886bdc2ffdSmacallan							xf86Msg(X_ERROR, "no src alpha, mask is %x\n", p->mskformat);
13896bdc2ffdSmacallan						}
1390a3a2ba44Smacallan						break;
1391a3a2ba44Smacallan					default:
1392a3a2ba44Smacallan						xf86Msg(X_ERROR, "%s: format %x in non-solid Over op\n",
1393a3a2ba44Smacallan						    __func__, p->srcformat);
1394a3a2ba44Smacallan				}
1395a3a2ba44Smacallan			}
13964261fa58Smacallan			break;
13974261fa58Smacallan		case PictOpAdd:
13984261fa58Smacallan			DPRINTF(X_ERROR, "Add %08x %08x\n",
13994261fa58Smacallan			    p->srcformat, p->dstformat);
14004261fa58Smacallan			switch (p->srcformat) {
14014261fa58Smacallan				case PICT_a8:
14024261fa58Smacallan					src = p->srcoff +
14034261fa58Smacallan					    (srcY * p->srcpitch) + srcX;
1404d71cb32dSmacallan					if (p->dstformat == PICT_a8) {
1405d71cb32dSmacallan						dst = dstoff +
1406d71cb32dSmacallan						      (dstY * dstpitch) + dstX;
1407d71cb32dSmacallan						CG14Comp_Add8(p,
1408d71cb32dSmacallan						    src, p->srcpitch,
1409d71cb32dSmacallan						    dst, dstpitch,
1410d71cb32dSmacallan						    width, height);
1411d71cb32dSmacallan					} else {
1412d71cb32dSmacallan						dst = dstoff +
1413d71cb32dSmacallan						      (dstY * dstpitch) +
1414d71cb32dSmacallan						      (dstX << 2);
1415d71cb32dSmacallan						CG14Comp_Add8_32(p,
1416d71cb32dSmacallan						    src, p->srcpitch,
1417d71cb32dSmacallan						    dst, dstpitch,
1418d71cb32dSmacallan						    width, height);
1419d71cb32dSmacallan					}
14204261fa58Smacallan					break;
14214261fa58Smacallan				case PICT_a8r8g8b8:
14224261fa58Smacallan				case PICT_x8r8g8b8:
14234261fa58Smacallan					src = p->srcoff +
14244261fa58Smacallan					    (srcY * p->srcpitch) + (srcX << 2);
14254261fa58Smacallan					dst = dstoff + (dstY * dstpitch) +
14264261fa58Smacallan					    (dstX << 2);
14274261fa58Smacallan					CG14Comp_Add32(p, src, p->srcpitch,
14284261fa58Smacallan					    dst, dstpitch, width, height);
14294261fa58Smacallan					break;
14304261fa58Smacallan				default:
14314261fa58Smacallan					xf86Msg(X_ERROR,
14324261fa58Smacallan					    "unsupported src format\n");
14334261fa58Smacallan			}
14344261fa58Smacallan			break;
1435a3a2ba44Smacallan		case PictOpSrc:
1436a3a2ba44Smacallan			DPRINTF(X_ERROR, "Src %08x %08x\n",
1437a3a2ba44Smacallan			    p->srcformat, p->dstformat);
1438239808baSmacallan			if (p->mskformat != 0)
1439239808baSmacallan				xf86Msg(X_ERROR, "Src mask %08x\n", p->mskformat);
1440f71acd79Smacallan			if (p->srcformat == PICT_a8) {
1441f71acd79Smacallan				CG14Copy8(pDst, srcX, srcY, dstX, dstY, width, height);
1442f71acd79Smacallan			} else {
1443f71acd79Smacallan				/* convert between RGB and BGR? */
1444f71acd79Smacallan				CG14Copy32(pDst, srcX, srcY, dstX, dstY, width, height);
1445f71acd79Smacallan			}
1446a3a2ba44Smacallan			break;
14474261fa58Smacallan		default:
14484261fa58Smacallan			xf86Msg(X_ERROR, "unsupported op %d\n", p->op);
14494261fa58Smacallan	}
14504261fa58Smacallan	exaMarkSync(pDst->drawable.pScreen);
14514261fa58Smacallan}
14524261fa58Smacallan
14534261fa58Smacallan
14544261fa58Smacallan
14554261fa58SmacallanBool
14564261fa58SmacallanCG14InitAccel(ScreenPtr pScreen)
14574261fa58Smacallan{
14584261fa58Smacallan	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
14594261fa58Smacallan	Cg14Ptr p = GET_CG14_FROM_SCRN(pScrn);
14604261fa58Smacallan	ExaDriverPtr pExa;
14614261fa58Smacallan
14624261fa58Smacallan	pExa = exaDriverAlloc();
14634261fa58Smacallan	if (!pExa)
14644261fa58Smacallan		return FALSE;
14654261fa58Smacallan
14664261fa58Smacallan	p->pExa = pExa;
14674261fa58Smacallan
14684261fa58Smacallan	pExa->exa_major = EXA_VERSION_MAJOR;
14694261fa58Smacallan	pExa->exa_minor = EXA_VERSION_MINOR;
14704261fa58Smacallan
14714261fa58Smacallan	pExa->memoryBase = p->fb;
14724261fa58Smacallan	pExa->memorySize = p->memsize;
1473b8ad197aSmacallan	pExa->offScreenBase = p->width * p->height * (pScrn->depth >> 3);
14744261fa58Smacallan
14754261fa58Smacallan	/*
14764261fa58Smacallan	 * SX memory instructions are written to 64bit aligned addresses with
14774261fa58Smacallan	 * a 3 bit displacement. Make sure the displacement remains constant
14784261fa58Smacallan	 * within one column
14794261fa58Smacallan	 */
14804261fa58Smacallan
14814261fa58Smacallan	pExa->pixmapOffsetAlign = 8;
14824261fa58Smacallan	pExa->pixmapPitchAlign = 8;
14834261fa58Smacallan
1484fe97f391Smacallan	pExa->flags = EXA_OFFSCREEN_PIXMAPS
1485f71acd79Smacallan		      | EXA_SUPPORTS_OFFSCREEN_OVERLAPS
1486f71acd79Smacallan		      /*| EXA_MIXED_PIXMAPS*/;
14874261fa58Smacallan
14884261fa58Smacallan	/*
14894261fa58Smacallan	 * these limits are bogus
14904261fa58Smacallan	 * SX doesn't deal with coordinates at all, so there is no limit but
14914261fa58Smacallan	 * we have to put something here
14924261fa58Smacallan	 */
14934261fa58Smacallan	pExa->maxX = 4096;
14944261fa58Smacallan	pExa->maxY = 4096;
14954261fa58Smacallan
14964261fa58Smacallan	pExa->WaitMarker = CG14WaitMarker;
14974261fa58Smacallan
14984261fa58Smacallan	pExa->PrepareSolid = CG14PrepareSolid;
14994261fa58Smacallan	pExa->Solid = CG14Solid;
15004261fa58Smacallan	pExa->DoneSolid = CG14DoneCopy;
15014261fa58Smacallan	pExa->PrepareCopy = CG14PrepareCopy;
1502f71acd79Smacallan	pExa->Copy = CG14Copy32;
15034261fa58Smacallan	pExa->DoneCopy = CG14DoneCopy;
15044261fa58Smacallan	if (p->use_xrender) {
15054261fa58Smacallan		pExa->CheckComposite = CG14CheckComposite;
15064261fa58Smacallan		pExa->PrepareComposite = CG14PrepareComposite;
15074261fa58Smacallan		pExa->Composite = CG14Composite;
15084261fa58Smacallan		pExa->DoneComposite = CG14DoneCopy;
15094261fa58Smacallan	}
15104261fa58Smacallan
15114261fa58Smacallan	/* EXA hits more optimized paths when it does not have to fallback
15124261fa58Smacallan	 * because of missing UTS/DFS, hook memcpy-based UTS/DFS.
15134261fa58Smacallan	 */
15144261fa58Smacallan	pExa->UploadToScreen = CG14UploadToScreen;
15154261fa58Smacallan	pExa->DownloadFromScreen = CG14DownloadFromScreen;
15164261fa58Smacallan
1517c2193d98Smacallan	p->queuecount = 0;
15184261fa58Smacallan	/* do some hardware init */
15194261fa58Smacallan	write_sx_reg(p, SX_PLANEMASK, 0xffffffff);
15204261fa58Smacallan	p->last_mask = 0xffffffff;
15214261fa58Smacallan	write_sx_reg(p, SX_ROP_CONTROL, 0xcc);
15224261fa58Smacallan	p->last_rop = 0xcc;
15234261fa58Smacallan	return exaDriverInit(pScreen, pExa);
15244261fa58Smacallan}
1525