1d983712dSmrg
2d983712dSmrg#ifdef HAVE_CONFIG_H
3d983712dSmrg#include "config.h"
4d983712dSmrg#endif
5d983712dSmrg
/*
 * If NO_OPTIMIZE is set, some optimizations are disabled.
 *
 * What the optimizations basically try to do is minimize the number of
 * writes to accelerator registers, since these are the ones that slow
 * down small operations a lot.
 */
13d983712dSmrg/* #define NO_OPTIMIZE */
14d983712dSmrg
15d983712dSmrg/*
16d983712dSmrg * if ET6K_TRANSPARENCY is set, ScreentoScreenCopy operations (and pattern
17d983712dSmrg * fills) will support transparency. But then the planemask support has to
18d983712dSmrg * be dropped. The default here is to support planemasks, because all Tseng
19d983712dSmrg * chips can do this. Only the ET6000 supports a transparency compare. The
20d983712dSmrg * code could be easily changed to support transparency on the ET6000 and
21d983712dSmrg * planemasks on the others, but that's only useful when transparency is
22d983712dSmrg * more important than planemasks.
23d983712dSmrg */
24d983712dSmrg#undef ET6K_TRANSPARENCY
25d983712dSmrg
26d983712dSmrg#include "tseng.h"
27d983712dSmrg#include "tseng_accel.h"
28d983712dSmrg
294b9470b1Smrg#ifdef HAVE_XAA_H
30d983712dSmrg#include "miline.h"
31d983712dSmrg
32d983712dSmrg/*
33d983712dSmrg * conversion from X ROPs to Microsoft ROPs.
34d983712dSmrg */
35d983712dSmrg
/*
 * X raster operation (GXclear .. GXset, used as the index) to the 8-bit
 * ROP code with the source operand encoded as 0xCC and the destination
 * as 0xAA.
 */
static int W32OpTable[16] =
{
    /* GXclear        : 0                   */ 0x00,
    /* GXand          : src AND dst         */ 0x88,
    /* GXandReverse   : src AND NOT dst     */ 0x44,
    /* GXcopy         : src                 */ 0xcc,
    /* GXandInverted  : NOT src AND dst     */ 0x22,
    /* GXnoop         : dst                 */ 0xaa,
    /* GXxor          : src XOR dst         */ 0x66,
    /* GXor           : src OR dst          */ 0xee,
    /* GXnor          : NOT src AND NOT dst */ 0x11,
    /* GXequiv        : NOT src XOR dst     */ 0x99,
    /* GXinvert       : NOT dst             */ 0x55,
    /* GXorReverse    : src OR NOT dst      */ 0xdd,
    /* GXcopyInverted : NOT src             */ 0x33,
    /* GXorInverted   : NOT src OR dst      */ 0xbb,
    /* GXnand         : NOT src OR NOT dst  */ 0x77,
    /* GXset          : 1                   */ 0xff
};
55d983712dSmrg
/*
 * Same X raster operations as W32OpTable, but every code has 0xA as its
 * low nibble: the high nibble carries the src/dst operation and the low
 * nibble is the "dst" (no-op) pattern.
 */
static int W32OpTable_planemask[16] =
{
    /* GXclear        : 0                   */ 0x0a,
    /* GXand          : src AND dst         */ 0x8a,
    /* GXandReverse   : src AND NOT dst     */ 0x4a,
    /* GXcopy         : src                 */ 0xca,
    /* GXandInverted  : NOT src AND dst     */ 0x2a,
    /* GXnoop         : dst                 */ 0xaa,
    /* GXxor          : src XOR dst         */ 0x6a,
    /* GXor           : src OR dst          */ 0xea,
    /* GXnor          : NOT src AND NOT dst */ 0x1a,
    /* GXequiv        : NOT src XOR dst     */ 0x9a,
    /* GXinvert       : NOT dst             */ 0x5a,
    /* GXorReverse    : src OR NOT dst      */ 0xda,
    /* GXcopyInverted : NOT src             */ 0x3a,
    /* GXorInverted   : NOT src OR dst      */ 0xba,
    /* GXnand         : NOT src OR NOT dst  */ 0x7a,
    /* GXset          : 1                   */ 0xfa
};
75d983712dSmrg
/*
 * X raster operation (used as the index) to the 8-bit ROP code with the
 * pattern operand encoded as 0xF0 and the destination as 0xAA.
 */
static int W32PatternOpTable[16] =
{
    /* GXclear        : 0                   */ 0x00,
    /* GXand          : pat AND dst         */ 0xa0,
    /* GXandReverse   : pat AND NOT dst     */ 0x50,
    /* GXcopy         : pat                 */ 0xf0,
    /* GXandInverted  : NOT pat AND dst     */ 0x0a,
    /* GXnoop         : dst                 */ 0xaa,
    /* GXxor          : pat XOR dst         */ 0x5a,
    /* GXor           : pat OR dst          */ 0xfa,
    /* GXnor          : NOT pat AND NOT dst */ 0x05,
    /* GXequiv        : NOT pat XOR dst     */ 0xa5,
    /* GXinvert       : NOT dst             */ 0x55,
    /* GXorReverse    : pat OR NOT dst      */ 0xf5,
    /* GXcopyInverted : NOT pat             */ 0x0f,
    /* GXorInverted   : NOT pat OR dst      */ 0xaf,
    /* GXnand         : NOT pat OR NOT dst  */ 0x5f,
    /* GXset          : 1                   */ 0xff
};
95d983712dSmrg
96d983712dSmrg
97d983712dSmrg
98d983712dSmrg/**********************************************************************/
99d983712dSmrg
100d983712dSmrgstatic void
101d983712dSmrgtseng_terminate_acl(TsengPtr pTseng)
102d983712dSmrg{
103d983712dSmrg    /* only terminate when needed */
104d983712dSmrg/*  if (*(volatile unsigned char *)ACL_ACCELERATOR_STATUS & 0x06) */
105d983712dSmrg    {
106d983712dSmrg	ACL_SUSPEND_TERMINATE(0x00);
107d983712dSmrg	/* suspend any running operation */
108d983712dSmrg	ACL_SUSPEND_TERMINATE(0x01);
109d983712dSmrg	WAIT_ACL;
110d983712dSmrg	ACL_SUSPEND_TERMINATE(0x00);
111d983712dSmrg	/* ... and now terminate it */
112d983712dSmrg	ACL_SUSPEND_TERMINATE(0x10);
113d983712dSmrg	WAIT_ACL;
114d983712dSmrg	ACL_SUSPEND_TERMINATE(0x00);
115d983712dSmrg    }
116d983712dSmrg}
117d983712dSmrg
1183cb82e98Smrgvoid
119d983712dSmrgtseng_recover_timeout(TsengPtr pTseng)
120d983712dSmrg{
121d983712dSmrg    if (pTseng->ChipType == ET4000) {
122d983712dSmrg	ErrorF("trying to unlock......................................\n");
123d983712dSmrg	MMIO_OUT32(pTseng->tsengCPU2ACLBase,0,0L); /* try unlocking the bus when CPU-to-accel gets stuck */
124d983712dSmrg
125d983712dSmrg        /* flush the accelerator pipeline */
126d983712dSmrg	ACL_SUSPEND_TERMINATE(0x00);
127d983712dSmrg	ACL_SUSPEND_TERMINATE(0x02);
128d983712dSmrg	ACL_SUSPEND_TERMINATE(0x00);
129d983712dSmrg    }
130d983712dSmrg}
131d983712dSmrg
132d983712dSmrgvoid
133d983712dSmrgtseng_init_acl(ScrnInfoPtr pScrn)
134d983712dSmrg{
135d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
136d983712dSmrg
137d983712dSmrg    PDEBUG("	tseng_init_acl\n");
138d983712dSmrg    /*
139d983712dSmrg     * prepare some shortcuts for faster access to memory mapped registers
140d983712dSmrg     */
141d983712dSmrg
142d983712dSmrg    pTseng->scratchMemBase = pTseng->FbBase + pTseng->AccelColorBufferOffset;
143d983712dSmrg    /*
144d983712dSmrg     * we won't be using tsengCPU2ACLBase in linear memory mode anyway, since
145d983712dSmrg     * using the MMU apertures restricts the amount of useable video memory
146d983712dSmrg     * to only 2MB, supposing we ONLY redirect MMU aperture 2 to the CPU.
147d983712dSmrg     * (see data book W32p, page 207)
148d983712dSmrg     */
149d983712dSmrg    pTseng->tsengCPU2ACLBase = pTseng->FbBase + 0x200000;	/* MMU aperture 2 */
150d983712dSmrg
151d983712dSmrg#ifdef DEBUG
152d983712dSmrg    ErrorF("MMioBase = 0x%x, scratchMemBase = 0x%x\n", pTseng->MMioBase, pTseng->scratchMemBase);
153d983712dSmrg#endif
154d983712dSmrg
155d983712dSmrg    /*
156d983712dSmrg     * prepare the accelerator for some real work
157d983712dSmrg     */
158d983712dSmrg
159d983712dSmrg    tseng_terminate_acl(pTseng);
160d983712dSmrg
161d983712dSmrg    ACL_INTERRUPT_STATUS(0xe);       /* clear interrupts */
162d983712dSmrg    ACL_INTERRUPT_MASK(0x04);	       /* disable interrupts, but enable deadlock exit */
163d983712dSmrg    ACL_INTERRUPT_STATUS(0x0);
164d983712dSmrg    ACL_ACCELERATOR_STATUS_SET(0x0);
165d983712dSmrg
166d983712dSmrg    if (pTseng->ChipType == ET6000) {
167d983712dSmrg	ACL_STEPPING_INHIBIT(0x0);   /* Undefined at power-on, let all maps (Src, Dst, Mix, Pat) step */
168d983712dSmrg	ACL_6K_CONFIG(0x00);	       /* maximum performance -- what did you think? */
169d983712dSmrg	ACL_POWER_CONTROL(0x01);     /* conserve power when ACL is idle */
170d983712dSmrg	ACL_MIX_CONTROL(0x33);
171d983712dSmrg	ACL_TRANSFER_DISABLE(0x00);  /* Undefined at power-on, enable all transfers */
172d983712dSmrg    } else {			       /* W32i/W32p */
173d983712dSmrg  	ACL_RELOAD_CONTROL(0x0);
174d983712dSmrg	ACL_SYNC_ENABLE(0x1);	       /* | 0x2 = 0WS ACL read. Yields up to 10% faster operation for small blits */
175d983712dSmrg	ACL_ROUTING_CONTROL(0x00);
176d983712dSmrg    }
177d983712dSmrg
178d983712dSmrg    /* Enable the W32p startup bit and set use an eight-bit pixel depth */
179d983712dSmrg    ACL_NQ_X_POSITION(0);
180d983712dSmrg    ACL_NQ_Y_POSITION(0);
181d983712dSmrg    ACL_PIXEL_DEPTH((pScrn->bitsPerPixel - 8) << 1);
182d983712dSmrg    /* writing destination address will start ACL */
183d983712dSmrg    ACL_OPERATION_STATE(0x10);
184d983712dSmrg
185d983712dSmrg    ACL_DESTINATION_Y_OFFSET(pScrn->displayWidth * pTseng->Bytesperpixel - 1);
186d983712dSmrg    ACL_XY_DIRECTION(0);
187d983712dSmrg
188d983712dSmrg    MMU_CONTROL(0x74);
189d983712dSmrg
190d983712dSmrg    if (pTseng->ChipType == ET4000) {
191d983712dSmrg	/*
192d983712dSmrg	 * Since the w32p revs C and D don't have any memory mapped when the
193d983712dSmrg	 * accelerator registers are used it is necessary to use the MMUs to
194d983712dSmrg	 * provide a semblance of linear memory. Fortunately on these chips
195d983712dSmrg	 * the MMU appertures are 1 megabyte each. So as long as we are
196d983712dSmrg	 * willing to only use 3 megs of video memory we can have some
197d983712dSmrg	 * acceleration. If we ever get the CPU-to-screen-color-expansion
198d983712dSmrg	 * stuff working then we will NOT need to sacrifice the extra 1MB
199d983712dSmrg	 * provided by MBP2, because we could do dynamic switching of the APT
200d983712dSmrg	 * bit in the MMU control register.
201d983712dSmrg	 *
202d983712dSmrg	 * On W32p rev c and d MBP2 is hardwired to 0x200000 when linear
203d983712dSmrg	 * memory mode is enabled. (On rev a it is programmable).
204d983712dSmrg	 *
205d983712dSmrg	 * W32p rev a and b have their first 2M mapped in the normal (non-MMU)
206d983712dSmrg	 * way, and MMU0 and MMU1, each 512 kb wide, can be used to access
207d983712dSmrg	 * another 1MB of memory. This totals to 3MB of mem. available in
208d983712dSmrg	 * linear memory when the accelerator is enabled.
209d983712dSmrg	 */
210d983712dSmrg	if ((pTseng->ChipRev == REV_A) || (pTseng->ChipRev == REV_B)) {
211d983712dSmrg	    MMIO_OUT32(pTseng->MMioBase, 0x00<<0, 0x200000L);
212d983712dSmrg	    MMIO_OUT32(pTseng->MMioBase, 0x04<<0, 0x280000L);
213d983712dSmrg	} else {		       /* rev C & D */
214d983712dSmrg	    MMIO_OUT32(pTseng->MMioBase, 0x00<<0, 0x0L);
215d983712dSmrg	    MMIO_OUT32 (pTseng->MMioBase, 0x04<<0, 0x100000L);
216d983712dSmrg	}
217d983712dSmrg    }
218d983712dSmrg}
219d983712dSmrg
220d983712dSmrg/*
221d983712dSmrg * ET4/6K acceleration interface -- color expansion primitives.
222d983712dSmrg *
223d983712dSmrg * Uses Harm Hanemaayer's generic acceleration interface (XAA).
224d983712dSmrg *
225d983712dSmrg * Author: Koen Gadeyne
226d983712dSmrg *
227d983712dSmrg * Much of the acceleration code is based on the XF86_W32 server code from
228d983712dSmrg * Glenn Lai.
229d983712dSmrg *
230d983712dSmrg *
231d983712dSmrg *     Color expansion capabilities of the Tseng chip families:
232d983712dSmrg *
233d983712dSmrg *     Chip     screen-to-screen   CPU-to-screen   Supported depths
234d983712dSmrg *
235d983712dSmrg *   ET4000W32/W32i   No               Yes             8bpp only
236d983712dSmrg *   ET4000W32p       Yes              Yes             8bpp only
237d983712dSmrg *   ET6000           Yes              No              8/16/24/32 bpp
238d983712dSmrg */
/*
 * Select screen-to-screen colour expansion. The ET6000 is programmed
 * through ACL_MIX_CONTROL, all other chips through ACL_ROUTING_CONTROL.
 * Expects a TsengPtr named pTseng in the enclosing scope.
 */
#define SET_FUNCTION_COLOREXPAND \
    if (pTseng->ChipType == ET6000) { \
        ACL_MIX_CONTROL(0x32); \
    } else { \
        ACL_ROUTING_CONTROL(0x08); \
    }

/* Select colour expansion with the mix data supplied by the CPU. */
#define SET_FUNCTION_COLOREXPAND_CPU \
    ACL_ROUTING_CONTROL(0x02);
247d983712dSmrg
248d983712dSmrg
249d983712dSmrgstatic void
250d983712dSmrgTsengSubsequentScanlineCPUToScreenColorExpandFill(ScrnInfoPtr pScrn,
251d983712dSmrg    int x, int y, int w, int h, int skipleft)
252d983712dSmrg{
253d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
254d983712dSmrg
255d983712dSmrg    if (pTseng->ChipType == ET4000) {
256d983712dSmrg	/* the accelerator needs DWORD padding, and "w" is in PIXELS... */
257d983712dSmrg	pTseng->acl_colexp_width_dwords = (MULBPP(pTseng, w) + 31) >> 5;
258d983712dSmrg	pTseng->acl_colexp_width_bytes = (MULBPP(pTseng, w) + 7) >> 3;
259d983712dSmrg    }
260d983712dSmrg
261d983712dSmrg    pTseng->acl_ColorExpandDst = FBADDR(pTseng, x, y);
262d983712dSmrg    pTseng->acl_skipleft = skipleft;
263d983712dSmrg
264d983712dSmrg    wait_acl_queue(pTseng);
265d983712dSmrg
266d983712dSmrg#if 0
267d983712dSmrg    ACL_MIX_Y_OFFSET(w - 1);
268d983712dSmrg
269d983712dSmrg    ErrorF(" W=%d", w);
270d983712dSmrg#endif
271d983712dSmrg    SET_XY(pTseng, w, 1);
272d983712dSmrg}
273d983712dSmrg
274d983712dSmrgstatic void
275d983712dSmrgTsengSubsequentColorExpandScanline(ScrnInfoPtr pScrn,
276d983712dSmrg    int bufno)
277d983712dSmrg{
278d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
279d983712dSmrg
280d983712dSmrg    wait_acl_queue(pTseng);
281d983712dSmrg
282d983712dSmrg    ACL_MIX_ADDRESS((pTseng->AccelColorExpandBufferOffsets[bufno] << 3) + pTseng->acl_skipleft);
283d983712dSmrg    START_ACL(pTseng, pTseng->acl_ColorExpandDst);
284d983712dSmrg
285d983712dSmrg    /* move to next scanline */
286d983712dSmrg    pTseng->acl_ColorExpandDst += pTseng->line_width;
287d983712dSmrg
288d983712dSmrg    /*
289d983712dSmrg     * If not using triple-buffering, we need to wait for the queued
290d983712dSmrg     * register set to be transferred to the working register set here,
291d983712dSmrg     * because otherwise an e.g. double-buffering mechanism could overwrite
292d983712dSmrg     * the buffer that's currently being worked with with new data too soon.
293d983712dSmrg     *
294d983712dSmrg     * WAIT_QUEUE; // not needed with triple-buffering
295d983712dSmrg     */
296d983712dSmrg}
297d983712dSmrg
298d983712dSmrg
299d983712dSmrg
300d983712dSmrg/*
301d983712dSmrg * We use this intermediate CPU-to-Screen color expansion because the one
302d983712dSmrg * provided by XAA seems to lock up the accelerator engine.
303d983712dSmrg *
304d983712dSmrg * One of the main differences between the XAA approach and this one is that
305d983712dSmrg * transfers are done per byte. I'm not sure if that is needed though.
306d983712dSmrg */
307d983712dSmrgstatic void
308d983712dSmrgTsengSubsequentColorExpandScanline_8bpp(ScrnInfoPtr pScrn, int bufno)
309d983712dSmrg{
310d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
311d983712dSmrg    pointer dest = pTseng->tsengCPU2ACLBase;
312d983712dSmrg    int i,j;
313d983712dSmrg    CARD8 *bufptr;
314d983712dSmrg
315d983712dSmrg    i = pTseng->acl_colexp_width_bytes;
316d983712dSmrg    bufptr = (CARD8 *) (pTseng->XAAScanlineColorExpandBuffers[bufno]);
317d983712dSmrg
318d983712dSmrg    wait_acl_queue(pTseng);
319d983712dSmrg    START_ACL (pTseng, pTseng->acl_ColorExpandDst);
320d983712dSmrg
321d983712dSmrg/*  *((LongP) (MMioBase + 0x08)) = (CARD32) pTseng->acl_ColorExpandDst;*/
322d983712dSmrg/*  MMIO_OUT32(tsengCPU2ACLBase,0, (CARD32)pTseng->acl_ColorExpandDst); */
323d983712dSmrg    j = 0;
324d983712dSmrg    /* Copy scanline data to accelerator MMU aperture byte by byte */
325d983712dSmrg    while (i--) {		       /* FIXME: we need to take care of PCI bursting and MMU overflow here! */
326d983712dSmrg	MMIO_OUT8(dest,j++, *bufptr++);
327d983712dSmrg    }
328d983712dSmrg
329d983712dSmrg    /* move to next scanline */
330d983712dSmrg    pTseng->acl_ColorExpandDst += pTseng->line_width;
331d983712dSmrg}
332d983712dSmrg
333d983712dSmrg/*
334d983712dSmrg * This function does direct memory-to-CPU bit doubling for color-expansion
335d983712dSmrg * at 16bpp on W32 chips. They can only do 8bpp color expansion, so we have
336d983712dSmrg * to expand the incoming data to 2bpp first.
337d983712dSmrg */
338d983712dSmrgstatic void
339d983712dSmrgTsengSubsequentColorExpandScanline_16bpp(ScrnInfoPtr pScrn, int bufno)
340d983712dSmrg{
341d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
342d983712dSmrg    pointer dest = pTseng->tsengCPU2ACLBase;
343d983712dSmrg    int i,j;
344d983712dSmrg    CARD8 *bufptr;
345d983712dSmrg    register CARD32 bits16;
346d983712dSmrg
347d983712dSmrg    i = pTseng->acl_colexp_width_dwords * 2;
348d983712dSmrg    bufptr = (CARD8 *) (pTseng->XAAScanlineColorExpandBuffers[bufno]);
349d983712dSmrg
350d983712dSmrg    wait_acl_queue(pTseng);
351d983712dSmrg    START_ACL(pTseng, pTseng->acl_ColorExpandDst);
352d983712dSmrg
353d983712dSmrg    j = 0;
354d983712dSmrg    while (i--) {
355d983712dSmrg	bits16 = pTseng->ColExpLUT[*bufptr++];
356d983712dSmrg	MMIO_OUT8(dest,j++,bits16 & 0xFF);
357d983712dSmrg	MMIO_OUT8(dest,j++,(bits16 >> 8) & 0xFF);
358d983712dSmrg    }
359d983712dSmrg
360d983712dSmrg    /* move to next scanline */
361d983712dSmrg    pTseng->acl_ColorExpandDst += pTseng->line_width;
362d983712dSmrg}
363d983712dSmrg
364d983712dSmrg/*
365d983712dSmrg * This function does direct memory-to-CPU bit doubling for color-expansion
366d983712dSmrg * at 24bpp on W32 chips. They can only do 8bpp color expansion, so we have
367d983712dSmrg * to expand the incoming data to 3bpp first.
368d983712dSmrg */
369d983712dSmrgstatic void
370d983712dSmrgTsengSubsequentColorExpandScanline_24bpp(ScrnInfoPtr pScrn, int bufno)
371d983712dSmrg{
372d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
373d983712dSmrg    pointer dest = pTseng->tsengCPU2ACLBase;
374d983712dSmrg    int i, k, j = -1;
375d983712dSmrg    CARD8 *bufptr;
376d983712dSmrg    register CARD32 bits24;
377d983712dSmrg
378d983712dSmrg    i = pTseng->acl_colexp_width_dwords * 4;
379d983712dSmrg    bufptr = (CARD8 *) (pTseng->XAAScanlineColorExpandBuffers[bufno]);
380d983712dSmrg
381d983712dSmrg    wait_acl_queue(pTseng);
382d983712dSmrg    START_ACL(pTseng, pTseng->acl_ColorExpandDst);
383d983712dSmrg
384d983712dSmrg    /* take 8 input bits, expand to 3 output bytes */
385d983712dSmrg    bits24 = pTseng->ColExpLUT[*bufptr++];
386d983712dSmrg    k = 0;
387d983712dSmrg    while (i--) {
388d983712dSmrg	if ((j++) == 2) {	       /* "i % 3" operation is much to expensive */
389d983712dSmrg	    j = 0;
390d983712dSmrg	    bits24 = pTseng->ColExpLUT[*bufptr++];
391d983712dSmrg	}
392d983712dSmrg	MMIO_OUT8(dest,k++,bits24 & 0xFF);
393d983712dSmrg	bits24 >>= 8;
394d983712dSmrg    }
395d983712dSmrg
396d983712dSmrg    /* move to next scanline */
397d983712dSmrg    pTseng->acl_ColorExpandDst += pTseng->line_width;
398d983712dSmrg}
399d983712dSmrg
400d983712dSmrg/*
401d983712dSmrg * This function does direct memory-to-CPU bit doubling for color-expansion
402d983712dSmrg * at 32bpp on W32 chips. They can only do 8bpp color expansion, so we have
403d983712dSmrg * to expand the incoming data to 4bpp first.
404d983712dSmrg */
405d983712dSmrgstatic void
406d983712dSmrgTsengSubsequentColorExpandScanline_32bpp(ScrnInfoPtr pScrn, int bufno)
407d983712dSmrg{
408d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
409d983712dSmrg    pointer dest = pTseng->tsengCPU2ACLBase;
410d983712dSmrg    int i,j;
411d983712dSmrg    CARD8 *bufptr;
412d983712dSmrg    register CARD32 bits32;
413d983712dSmrg
414d983712dSmrg    i = pTseng->acl_colexp_width_dwords;
415d983712dSmrg   /* amount of blocks of 8 bits to expand to 32 bits (=1 DWORD) */
416d983712dSmrg    bufptr = (CARD8 *) (pTseng->XAAScanlineColorExpandBuffers[bufno]);
417d983712dSmrg
418d983712dSmrg    wait_acl_queue(pTseng);
419d983712dSmrg    START_ACL(pTseng, pTseng->acl_ColorExpandDst);
420d983712dSmrg
421d983712dSmrg    j = 0;
422d983712dSmrg    while (i--) {
423d983712dSmrg	bits32 = pTseng->ColExpLUT[*bufptr++];
424d983712dSmrg	MMIO_OUT8(dest,j++,bits32 & 0xFF);
425d983712dSmrg	MMIO_OUT8(dest,j++,(bits32 >> 8) & 0xFF);
426d983712dSmrg	MMIO_OUT8(dest,j++,(bits32 >> 16) & 0xFF);
427d983712dSmrg	MMIO_OUT8(dest,j++,(bits32 >> 24) & 0xFF);
428d983712dSmrg    }
429d983712dSmrg
430d983712dSmrg    /* move to next scanline */
431d983712dSmrg    pTseng->acl_ColorExpandDst += pTseng->line_width;
432d983712dSmrg}
433d983712dSmrg
434d983712dSmrg/*
435d983712dSmrg * CPU-to-Screen color expansion.
436d983712dSmrg *   This is for ET4000 only (The ET6000 cannot do this)
437d983712dSmrg */
438d983712dSmrgstatic void
439d983712dSmrgTsengSetupForCPUToScreenColorExpandFill(ScrnInfoPtr pScrn,
440d983712dSmrg    int fg, int bg, int rop, unsigned int planemask)
441d983712dSmrg{
442d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
443d983712dSmrg
444d983712dSmrg/*  ErrorF("X"); */
445d983712dSmrg
446d983712dSmrg    PINGPONG(pTseng);
447d983712dSmrg
448d983712dSmrg    wait_acl_queue(pTseng);
449d983712dSmrg
450d983712dSmrg    SET_FG_ROP(rop);
451d983712dSmrg    SET_BG_ROP_TR(rop, bg);
452d983712dSmrg
453d983712dSmrg    SET_XYDIR(0);
454d983712dSmrg
455d983712dSmrg    SET_FG_BG_COLOR(pTseng, fg, bg);
456d983712dSmrg
457d983712dSmrg    SET_FUNCTION_COLOREXPAND_CPU;
458d983712dSmrg
459d983712dSmrg    /* assure correct alignment of MIX address (ACL needs same alignment here as in MMU aperture) */
460d983712dSmrg    ACL_MIX_ADDRESS(0);
461d983712dSmrg}
462d983712dSmrg
#ifdef TSENG_CPU_TO_SCREEN_COLOREXPAND
/*
 * Start a full CPU-to-screen colour expansion of w x h pixels at (x, y).
 *
 * TsengSubsequentCPUToScreenColorExpand() is potentially dangerous:
 *   Not writing enough data to the MMU aperture for CPU-to-screen color
 *   expansion will eventually cause a system deadlock!
 *
 * Note that CPUToScreenColorExpand operations _always_ require a
 * WAIT_INTERFACE before starting a new operation (this is empirical,
 * though)
 *
 * A non-zero skipleft is not supported; it is only reported and the
 * operation is started regardless.
 */
static void
TsengSubsequentCPUToScreenColorExpandFill(ScrnInfoPtr pScrn,
    int x, int y, int w, int h, int skipleft)
{
    TsengPtr pTseng = TsengPTR(pScrn);
    int destaddr = FBADDR(pTseng, x, y);

    if (skipleft != 0) {
        ErrorF("Can't do: Skipleft = %d\n", skipleft);
    }

    /* the interface wait is used here instead of wait_acl_queue() */
    ErrorF("=========WAIT     FIXME!\n");
    WAIT_INTERFACE;

    ACL_MIX_Y_OFFSET(w - 1);
    SET_XY(pTseng, w, h);
    START_ACL(pTseng, destaddr);
}
#endif
493d983712dSmrg
494d983712dSmrgstatic void
495d983712dSmrgTsengSetupForScreenToScreenColorExpandFill(ScrnInfoPtr pScrn,
496d983712dSmrg    int fg, int bg, int rop, unsigned int planemask)
497d983712dSmrg{
498d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
499d983712dSmrg
500d983712dSmrg/*  ErrorF("SSC "); */
501d983712dSmrg
502d983712dSmrg    PINGPONG(pTseng);
503d983712dSmrg
504d983712dSmrg    wait_acl_queue(pTseng);
505d983712dSmrg
506d983712dSmrg    SET_FG_ROP(rop);
507d983712dSmrg    SET_BG_ROP_TR(rop, bg);
508d983712dSmrg
509d983712dSmrg    SET_FG_BG_COLOR(pTseng, fg, bg);
510d983712dSmrg
511d983712dSmrg    SET_FUNCTION_COLOREXPAND;
512d983712dSmrg
513d983712dSmrg    SET_XYDIR(0);
514d983712dSmrg}
515d983712dSmrg
516d983712dSmrgstatic void
517d983712dSmrgTsengSubsequentScreenToScreenColorExpandFill(ScrnInfoPtr pScrn,
518d983712dSmrg    int x, int y, int w, int h, int srcx, int srcy, int skipleft)
519d983712dSmrg{
520d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
521d983712dSmrg    int destaddr = FBADDR(pTseng, x, y);
522d983712dSmrg
523d983712dSmrg/*    int srcaddr = FBADDR(pTseng, srcx, srcy); */
524d983712dSmrg
525d983712dSmrg    wait_acl_queue(pTseng);
526d983712dSmrg
527d983712dSmrg    SET_XY(pTseng, w, h);
528d983712dSmrg    ACL_MIX_ADDRESS(		       /* MIX address is in BITS */
529d983712dSmrg	(((srcy * pScrn->displayWidth) + srcx) * pScrn->bitsPerPixel) + skipleft);
530d983712dSmrg
531d983712dSmrg    ACL_MIX_Y_OFFSET(pTseng->line_width << 3);
532d983712dSmrg
533d983712dSmrg    START_ACL(pTseng, destaddr);
534d983712dSmrg}
535d983712dSmrg
536d983712dSmrg/*
537d983712dSmrg *
538d983712dSmrg */
539d983712dSmrgstatic Bool
540d983712dSmrgTsengXAAInit_Colexp(ScrnInfoPtr pScrn)
541d983712dSmrg{
542d983712dSmrg    int i, j, r;
543d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
544d983712dSmrg    XAAInfoRecPtr pXAAInfo = pTseng->AccelInfoRec;
545d983712dSmrg
546d983712dSmrg    PDEBUG("	TsengXAAInit_Colexp\n");
547d983712dSmrg
548d983712dSmrg#ifdef TODO
549d983712dSmrg    if (OFLG_ISSET(OPTION_XAA_NO_COL_EXP, &vga256InfoRec.options))
550d983712dSmrg	return;
551d983712dSmrg#endif
552d983712dSmrg
553d983712dSmrg    /* FIXME! disable accelerated color expansion for W32/W32i until it's fixed */
554d983712dSmrg/*  if (Is_W32 || Is_W32i) return; */
555d983712dSmrg
556d983712dSmrg    /*
557d983712dSmrg     * Screen-to-screen color expansion.
558d983712dSmrg     *
559d983712dSmrg     * Scanline-screen-to-screen color expansion is slower than
560d983712dSmrg     * CPU-to-screen color expansion.
561d983712dSmrg     */
562d983712dSmrg
563d983712dSmrg    pXAAInfo->ScreenToScreenColorExpandFillFlags =
564d983712dSmrg	BIT_ORDER_IN_BYTE_LSBFIRST |
565d983712dSmrg	SCANLINE_PAD_DWORD |
566d983712dSmrg	LEFT_EDGE_CLIPPING |
567d983712dSmrg	NO_PLANEMASK;
568d983712dSmrg
569d983712dSmrg#if 1
570d983712dSmrg    if ((pTseng->ChipType == ET6000) || (pScrn->bitsPerPixel == 8)) {
571d983712dSmrg	pXAAInfo->SetupForScreenToScreenColorExpandFill =
572d983712dSmrg	    TsengSetupForScreenToScreenColorExpandFill;
573d983712dSmrg	pXAAInfo->SubsequentScreenToScreenColorExpandFill =
574d983712dSmrg	    TsengSubsequentScreenToScreenColorExpandFill;
575d983712dSmrg    }
576d983712dSmrg#endif
577d983712dSmrg
578d983712dSmrg    /*
579d983712dSmrg     * Scanline CPU to screen color expansion for all W32 engines.
580d983712dSmrg     *
581d983712dSmrg     * real CPU-to-screen color expansion is extremely tricky, and only
582d983712dSmrg     * works for 8bpp anyway.
583d983712dSmrg     *
584d983712dSmrg     * This also allows us to do 16, 24 and 32 bpp color expansion by first
585d983712dSmrg     * doubling the bitmap pattern before color-expanding it, because W32s
586d983712dSmrg     * can only do 8bpp color expansion.
587d983712dSmrg     */
588d983712dSmrg
589d983712dSmrg    pXAAInfo->ScanlineCPUToScreenColorExpandFillFlags =
590d983712dSmrg	BIT_ORDER_IN_BYTE_LSBFIRST |
591d983712dSmrg	SCANLINE_PAD_DWORD |
592d983712dSmrg	NO_PLANEMASK;
593d983712dSmrg
594d983712dSmrg    if (pTseng->ChipType == ET4000) {
595d983712dSmrg	pTseng->XAAScanlineColorExpandBuffers[0] =
596d983712dSmrg	    xnfalloc(((pScrn->virtualX + 31)/32) * 4 * pTseng->Bytesperpixel);
597d983712dSmrg	if (pTseng->XAAScanlineColorExpandBuffers[0] == NULL) {
598d983712dSmrg	    xf86Msg(X_ERROR, "Could not malloc color expansion scanline buffer.\n");
599d983712dSmrg	    return FALSE;
600d983712dSmrg	}
601d983712dSmrg	pXAAInfo->NumScanlineColorExpandBuffers = 1;
602d983712dSmrg	pXAAInfo->ScanlineColorExpandBuffers = pTseng->XAAScanlineColorExpandBuffers;
603d983712dSmrg
604d983712dSmrg	pXAAInfo->SetupForScanlineCPUToScreenColorExpandFill =
605d983712dSmrg	    TsengSetupForCPUToScreenColorExpandFill;
606d983712dSmrg
607d983712dSmrg	pXAAInfo->SubsequentScanlineCPUToScreenColorExpandFill =
608d983712dSmrg	    TsengSubsequentScanlineCPUToScreenColorExpandFill;
609d983712dSmrg
610d983712dSmrg	switch (pScrn->bitsPerPixel) {
611d983712dSmrg	case 8:
612d983712dSmrg	    pXAAInfo->SubsequentColorExpandScanline =
613d983712dSmrg		TsengSubsequentColorExpandScanline_8bpp;
614d983712dSmrg	    break;
615d983712dSmrg	case 15:
616d983712dSmrg	case 16:
617d983712dSmrg	    pXAAInfo->SubsequentColorExpandScanline =
618d983712dSmrg		TsengSubsequentColorExpandScanline_16bpp;
619d983712dSmrg	    break;
620d983712dSmrg	case 24:
621d983712dSmrg	    pXAAInfo->SubsequentColorExpandScanline =
622d983712dSmrg		TsengSubsequentColorExpandScanline_24bpp;
623d983712dSmrg	    break;
624d983712dSmrg	case 32:
625d983712dSmrg	    pXAAInfo->SubsequentColorExpandScanline =
626d983712dSmrg		TsengSubsequentColorExpandScanline_32bpp;
627d983712dSmrg	    break;
628d983712dSmrg	}
629d983712dSmrg	/* create color expansion LUT (used for >8bpp only) */
630d983712dSmrg	pTseng->ColExpLUT = xnfalloc(sizeof(CARD32)*256);
631d983712dSmrg	if (pTseng->ColExpLUT == NULL) {
632d983712dSmrg	    xf86Msg(X_ERROR, "Could not malloc color expansion tables.\n");
633d983712dSmrg	    return FALSE;
634d983712dSmrg	}
635d983712dSmrg	for (i = 0; i < 256; i++) {
636d983712dSmrg	    r = 0;
637d983712dSmrg	    for (j = 7; j >= 0; j--) {
638d983712dSmrg		r <<= pTseng->Bytesperpixel;
639d983712dSmrg		if ((i >> j) & 1)
640d983712dSmrg		    r |= (1 << pTseng->Bytesperpixel) - 1;
641d983712dSmrg	    }
642d983712dSmrg	    pTseng->ColExpLUT[i] = r;
643d983712dSmrg	    /* ErrorF("0x%08X, ",r ); if ((i%8)==7) ErrorF("\n"); */
644d983712dSmrg	}
645d983712dSmrg    } else {
646d983712dSmrg	/*
647d983712dSmrg	 * Triple-buffering is needed to account for double-buffering of Tseng
648d983712dSmrg	 * acceleration registers.
649d983712dSmrg	 */
650d983712dSmrg	pXAAInfo->NumScanlineColorExpandBuffers = 3;
651d983712dSmrg	pXAAInfo->ScanlineColorExpandBuffers =
652d983712dSmrg	    pTseng->XAAColorExpandBuffers;
653d983712dSmrg	pXAAInfo->SetupForScanlineCPUToScreenColorExpandFill =
654d983712dSmrg	    TsengSetupForScreenToScreenColorExpandFill;
655d983712dSmrg	pXAAInfo->SubsequentScanlineCPUToScreenColorExpandFill =
656d983712dSmrg	    TsengSubsequentScanlineCPUToScreenColorExpandFill;
657d983712dSmrg	pXAAInfo->SubsequentColorExpandScanline =
658d983712dSmrg	    TsengSubsequentColorExpandScanline;
659d983712dSmrg
660d983712dSmrg	/* calculate memory addresses from video memory offsets */
661d983712dSmrg	for (i = 0; i < pXAAInfo->NumScanlineColorExpandBuffers; i++) {
662d983712dSmrg	    pTseng->XAAColorExpandBuffers[i] =
663d983712dSmrg		pTseng->FbBase + pTseng->AccelColorExpandBufferOffsets[i];
664d983712dSmrg	}
665d983712dSmrg
666d983712dSmrg	pXAAInfo->ScanlineColorExpandBuffers = pTseng->XAAColorExpandBuffers;
667d983712dSmrg    }
668d983712dSmrg
669d983712dSmrg#ifdef TSENG_CPU_TO_SCREEN_COLOREXPAND
670d983712dSmrg    /*
671d983712dSmrg     * CPU-to-screen color expansion doesn't seem to be reliable yet. The
672d983712dSmrg     * W32 needs the correct amount of data sent to it in this mode, or it
673d983712dSmrg     * hangs the machine until is does (?). Currently, the init code in this
674d983712dSmrg     * file or the XAA code that uses this does something wrong, so that
675d983712dSmrg     * occasionally we get accelerator timeouts, and after a few, complete
676d983712dSmrg     * system hangs.
677d983712dSmrg     *
678d983712dSmrg     * The W32 engine requires SCANLINE_NO_PAD, but that doesn't seem to
679d983712dSmrg     * work very well (accelerator hangs).
680d983712dSmrg     *
681d983712dSmrg     * What works is this: tell XAA that we have SCANLINE_PAD_DWORD, and then
682d983712dSmrg     * add the following code in TsengSubsequentCPUToScreenColorExpand():
683d983712dSmrg     *     w = (w + 31) & ~31; this code rounds the width up to the nearest
684d983712dSmrg     * multiple of 32, and together with SCANLINE_PAD_DWORD, this makes
685d983712dSmrg     * CPU-to-screen color expansion work. Of course, the display isn't
686d983712dSmrg     * correct (4 chars are "blanked out" when only one is written, for
687d983712dSmrg     * example). But this shows that the principle works. But the code
688d983712dSmrg     * doesn't...
689d983712dSmrg     *
690d983712dSmrg     * The same thing goes for PAD_BYTE: this also works (with the same
691d983712dSmrg     * problems as SCANLINE_PAD_DWORD, although less prominent)
692d983712dSmrg     */
693d983712dSmrg
694d983712dSmrg    pXAAInfo->CPUToScreenColorExpandFillFlags =
695d983712dSmrg	BIT_ORDER_IN_BYTE_LSBFIRST |
696d983712dSmrg	SCANLINE_PAD_DWORD |   /* no other choice */
697d983712dSmrg	CPU_TRANSFER_PAD_DWORD |
698d983712dSmrg	NO_PLANEMASK;
699d983712dSmrg
700d983712dSmrg    if (Is_W32_any && (pScrn->bitsPerPixel == 8)) {
701d983712dSmrg	pXAAInfo->SetupForCPUToScreenColorExpandFill =
702d983712dSmrg	    TsengSetupForCPUToScreenColorExpandFill;
703d983712dSmrg	pXAAInfo->SubsequentCPUToScreenColorExpandFill =
704d983712dSmrg	    TsengSubsequentCPUToScreenColorExpandFill;
705d983712dSmrg
706d983712dSmrg	/* we'll be using MMU aperture 2 */
707d983712dSmrg	pXAAInfo->ColorExpandBase = (CARD8 *)pTseng->tsengCPU2ACLBase;
708d983712dSmrg	/* ErrorF("tsengCPU2ACLBase = 0x%x\n", pTseng->tsengCPU2ACLBase); */
709d983712dSmrg	/* aperture size is 8kb in banked mode. Larger in linear mode, but 8kb is enough */
710d983712dSmrg	pXAAInfo->ColorExpandRange = 8192;
711d983712dSmrg    }
712d983712dSmrg#endif
713d983712dSmrg    return TRUE;
714d983712dSmrg}
715d983712dSmrg
716d983712dSmrg/*
717d983712dSmrg * ET4/6K acceleration interface.
718d983712dSmrg *
719d983712dSmrg * Uses Harm Hanemaayer's generic acceleration interface (XAA).
720d983712dSmrg *
721d983712dSmrg * Author: Koen Gadeyne
722d983712dSmrg *
723d983712dSmrg * Much of the acceleration code is based on the XF86_W32 server code from
724d983712dSmrg * Glenn Lai.
725d983712dSmrg *
726d983712dSmrg */
727d983712dSmrg
728d983712dSmrg/*
729d983712dSmrg * This is the implementation of the Sync() function.
730d983712dSmrg *
731d983712dSmrg * To avoid pipeline/cache/buffer flushing in the PCI subsystem and the VGA
732d983712dSmrg * controller, we might replace this read-intensive code with a dummy
733d983712dSmrg * accelerator operation that causes a hardware-blocking (wait-states) until
734d983712dSmrg * the running operation is done.
735d983712dSmrg */
736d983712dSmrgstatic void
737d983712dSmrgTsengSync(ScrnInfoPtr pScrn)
738d983712dSmrg{
739d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
740d983712dSmrg
741d983712dSmrg    WAIT_ACL;
742d983712dSmrg}
743d983712dSmrg
744d983712dSmrg/*
745d983712dSmrg * This is the implementation of the SetupForSolidFill function
746d983712dSmrg * that sets up the coprocessor for a subsequent batch for solid
747d983712dSmrg * rectangle fills.
748d983712dSmrg */
749d983712dSmrgstatic void
750d983712dSmrgTsengSetupForSolidFill(ScrnInfoPtr pScrn,
751d983712dSmrg    int color, int rop, unsigned int planemask)
752d983712dSmrg{
753d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
754d983712dSmrg
755d983712dSmrg    /*
756d983712dSmrg     * all registers are queued in the Tseng chips, except of course for the
757d983712dSmrg     * stuff we want to store in off-screen memory. So we have to use a
758d983712dSmrg     * ping-pong method for those if we want to avoid having to wait for the
759d983712dSmrg     * accelerator when we want to write to these.
760d983712dSmrg     */
761d983712dSmrg
762d983712dSmrg/*    ErrorF("S"); */
763d983712dSmrg
764d983712dSmrg    PINGPONG(pTseng);
765d983712dSmrg
766d983712dSmrg    wait_acl_queue(pTseng);
767d983712dSmrg
768d983712dSmrg    /*
769d983712dSmrg     * planemask emulation uses a modified "standard" FG ROP (see ET6000
770d983712dSmrg     * data book p 66 or W32p databook p 37: "Bit masking"). We only enable
771d983712dSmrg     * the planemask emulation when the planemask is not a no-op, because
772d983712dSmrg     * blitting speed would suffer.
773d983712dSmrg     */
774d983712dSmrg
775d983712dSmrg    if ((planemask & pTseng->planemask_mask) != pTseng->planemask_mask) {
776d983712dSmrg	SET_FG_ROP_PLANEMASK(rop);
777d983712dSmrg	SET_BG_COLOR(pTseng, planemask);
778d983712dSmrg    } else {
779d983712dSmrg	SET_FG_ROP(rop);
780d983712dSmrg    }
781d983712dSmrg    SET_FG_COLOR(pTseng, color);
782d983712dSmrg
783d983712dSmrg    SET_FUNCTION_BLT;
784d983712dSmrg}
785d983712dSmrg
786d983712dSmrg/*
787d983712dSmrg * This is the implementation of the SubsequentForSolidFillRect function
788d983712dSmrg * that sends commands to the coprocessor to fill a solid rectangle of
789d983712dSmrg * the specified location and size, with the parameters from the SetUp
790d983712dSmrg * call.
791d983712dSmrg *
792d983712dSmrg * Splitting it up between ET4000 and ET6000 avoids lots of chipset type
793d983712dSmrg * comparisons.
794d983712dSmrg */
795d983712dSmrgstatic void
796d983712dSmrgTsengW32pSubsequentSolidFillRect(ScrnInfoPtr pScrn,
797d983712dSmrg    int x, int y, int w, int h)
798d983712dSmrg{
799d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
800d983712dSmrg    int destaddr = FBADDR(pTseng, x, y);
801d983712dSmrg
802d983712dSmrg    wait_acl_queue(pTseng);
803d983712dSmrg
804d983712dSmrg    /*
805d983712dSmrg     * Restoring the ACL_SOURCE_ADDRESS here is needed as long as Bresenham
806d983712dSmrg     * lines are enabled for >8bpp. Or until XAA allows us to render
807d983712dSmrg     * horizontal lines using the same Bresenham code instead of re-routing
808d983712dSmrg     * them to FillRectSolid. For XDECREASING lines, the SubsequentBresenham
809d983712dSmrg     * code adjusts the ACL_SOURCE_ADDRESS to make sure XDECREASING lines
810d983712dSmrg     * are drawn with the correct colors. But if a batch of subsequent
811d983712dSmrg     * operations also holds a few horizontal lines, they will be routed to
812d983712dSmrg     * here without calling the SetupFor... code again, and the
813d983712dSmrg     * ACL_SOURCE_ADDRESS will be wrong.
814d983712dSmrg     */
815d983712dSmrg    ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengFg);
816d983712dSmrg
817d983712dSmrg    SET_XYDIR(0);   /* FIXME: not needed with separate setupforsolidline */
818d983712dSmrg
819d983712dSmrg    SET_XY_4(pTseng, w, h);
820d983712dSmrg    START_ACL(pTseng, destaddr);
821d983712dSmrg}
822d983712dSmrg
823d983712dSmrgstatic void
824d983712dSmrgTseng6KSubsequentSolidFillRect(ScrnInfoPtr pScrn,
825d983712dSmrg    int x, int y, int w, int h)
826d983712dSmrg{
827d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
828d983712dSmrg    int destaddr = FBADDR(pTseng, x, y);
829d983712dSmrg
830d983712dSmrg    wait_acl_queue(pTseng);
831d983712dSmrg
832d983712dSmrg    /* see comment in TsengW32pSubsequentFillRectSolid */
833d983712dSmrg    ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengFg);
834d983712dSmrg
835d983712dSmrg    /* if XYDIR is not reset here, drawing a hardware line in between
836d983712dSmrg     * blitting, with the same ROP, color, etc will not cause a call to
837d983712dSmrg     * SetupFor... (because linedrawing uses SetupForSolidFill() as its
838d983712dSmrg     * Setup() function), and thus the direction register will have been
839d983712dSmrg     * changed by the last LineDraw operation.
840d983712dSmrg     */
841d983712dSmrg    SET_XYDIR(0);
842d983712dSmrg
843d983712dSmrg    SET_XY_6(pTseng, w, h);
844d983712dSmrg    START_ACL_6(destaddr);
845d983712dSmrg}
846d983712dSmrg
847d983712dSmrg/*
848d983712dSmrg * This is the implementation of the SetupForScreenToScreenCopy function
849d983712dSmrg * that sets up the coprocessor for a subsequent batch of
850d983712dSmrg * screen-to-screen copies.
851d983712dSmrg */
852d983712dSmrg
853d983712dSmrgstatic __inline__ void
854d983712dSmrgTseng_setup_screencopy(TsengPtr pTseng,
855d983712dSmrg    int rop, unsigned int planemask,
856d983712dSmrg    int trans_color, int blit_dir)
857d983712dSmrg{
858d983712dSmrg    wait_acl_queue(pTseng);
859d983712dSmrg
860d983712dSmrg#ifdef ET6K_TRANSPARENCY
861d983712dSmrg    if ((pTseng->ChipType == ET6000) && (trans_color != -1)) {
862d983712dSmrg	SET_BG_COLOR(trans_color);
863d983712dSmrg	SET_FUNCTION_BLT_TR;
864d983712dSmrg    } else
865d983712dSmrg	SET_FUNCTION_BLT;
866d983712dSmrg
867d983712dSmrg    SET_FG_ROP(rop);
868d983712dSmrg#else
869d983712dSmrg    if ((planemask & pTseng->planemask_mask) != pTseng->planemask_mask) {
870d983712dSmrg	SET_FG_ROP_PLANEMASK(rop);
871d983712dSmrg	SET_BG_COLOR(pTseng, planemask);
872d983712dSmrg    } else {
873d983712dSmrg	SET_FG_ROP(rop);
874d983712dSmrg    }
875d983712dSmrg    SET_FUNCTION_BLT;
876d983712dSmrg#endif
877d983712dSmrg    SET_XYDIR(blit_dir);
878d983712dSmrg}
879d983712dSmrg
880d983712dSmrgstatic void
881d983712dSmrgTsengSetupForScreenToScreenCopy(ScrnInfoPtr pScrn,
882d983712dSmrg    int xdir, int ydir, int rop,
883d983712dSmrg    unsigned int planemask, int trans_color)
884d983712dSmrg{
885d983712dSmrg    /*
886d983712dSmrg     * xdir can be either 1 (left-to-right) or -1 (right-to-left).
887d983712dSmrg     * ydir can be either 1 (top-to-bottom) or -1 (bottom-to-top).
888d983712dSmrg     */
889d983712dSmrg
890d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
891d983712dSmrg    int blit_dir = 0;
892d983712dSmrg
893d983712dSmrg/*    ErrorF("C%d ", trans_color); */
894d983712dSmrg
895d983712dSmrg    pTseng->acl_blitxdir = xdir;
896d983712dSmrg    pTseng->acl_blitydir = ydir;
897d983712dSmrg
898d983712dSmrg    if (xdir == -1)
899d983712dSmrg	blit_dir |= 0x1;
900d983712dSmrg    if (ydir == -1)
901d983712dSmrg	blit_dir |= 0x2;
902d983712dSmrg
903d983712dSmrg    Tseng_setup_screencopy(pTseng, rop, planemask, trans_color, blit_dir);
904d983712dSmrg
905d983712dSmrg    ACL_SOURCE_WRAP(0x77);	       /* no wrap */
906d983712dSmrg    ACL_SOURCE_Y_OFFSET(pTseng->line_width - 1);
907d983712dSmrg}
908d983712dSmrg
909d983712dSmrg/*
910d983712dSmrg * This is the implementation of the SubsequentForScreenToScreenCopy
911d983712dSmrg * that sends commands to the coprocessor to perform a screen-to-screen
912d983712dSmrg * copy of the specified areas, with the parameters from the SetUp call.
913d983712dSmrg * In this sample implementation, the direction must be taken into
914d983712dSmrg * account when calculating the addresses (with coordinates, it might be
915d983712dSmrg * a little easier).
916d983712dSmrg *
917d983712dSmrg * Splitting up the SubsequentScreenToScreenCopy between ET4000 and ET6000
918d983712dSmrg * doesn't seem to improve speed for small blits (as it did with
919d983712dSmrg * SolidFillRect).
920d983712dSmrg */
921d983712dSmrgstatic void
922d983712dSmrgTsengSubsequentScreenToScreenCopy(ScrnInfoPtr pScrn,
923d983712dSmrg    int x1, int y1, int x2, int y2,
924d983712dSmrg    int w, int h)
925d983712dSmrg{
926d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
927d983712dSmrg    int srcaddr, destaddr;
928d983712dSmrg
929d983712dSmrg    /*
930d983712dSmrg     * Optimizing note: the pre-calc code below (i.e. until the first
931d983712dSmrg     * register write) doesn't significantly affect performance. Removing it
932d983712dSmrg     * all boosts small blits from 24.22 to 25.47 MB/sec. Don't waste time
933d983712dSmrg     * on that. One less PCI bus write would boost us to 30.00 MB/sec, up
934d983712dSmrg     * from 24.22. Waste time on _that_...
935d983712dSmrg     */
936d983712dSmrg
937d983712dSmrg    /* tseng chips want x-sizes in bytes, not pixels */
938d983712dSmrg    x1 = MULBPP(pTseng, x1);
939d983712dSmrg    x2 = MULBPP(pTseng, x2);
940d983712dSmrg
941d983712dSmrg    /*
942d983712dSmrg     * If the direction is "decreasing", the chip wants the addresses
943d983712dSmrg     * to be at the other end, so we must be aware of that in our
944d983712dSmrg     * calculations.
945d983712dSmrg     */
946d983712dSmrg    if (pTseng->acl_blitydir == -1) {
947d983712dSmrg	srcaddr = (y1 + h - 1) * pTseng->line_width;
948d983712dSmrg	destaddr = (y2 + h - 1) * pTseng->line_width;
949d983712dSmrg    } else {
950d983712dSmrg	srcaddr = y1 * pTseng->line_width;
951d983712dSmrg	destaddr = y2 * pTseng->line_width;
952d983712dSmrg    }
953d983712dSmrg    if (pTseng->acl_blitxdir == -1) {
954d983712dSmrg	/* Accelerator start address must point to first byte to be processed.
955d983712dSmrg	 * Depending on the direction, this is the first or the last byte
956d983712dSmrg	 * in the multi-byte pixel.
957d983712dSmrg	 */
958d983712dSmrg	int eol = MULBPP(pTseng, w);
959d983712dSmrg
960d983712dSmrg	srcaddr += x1 + eol - 1;
961d983712dSmrg	destaddr += x2 + eol - 1;
962d983712dSmrg    } else {
963d983712dSmrg	srcaddr += x1;
964d983712dSmrg	destaddr += x2;
965d983712dSmrg    }
966d983712dSmrg
967d983712dSmrg    wait_acl_queue(pTseng);
968d983712dSmrg
969d983712dSmrg    SET_XY(pTseng, w, h);
970d983712dSmrg    ACL_SOURCE_ADDRESS(srcaddr);
971d983712dSmrg    START_ACL(pTseng, destaddr);
972d983712dSmrg}
973d983712dSmrg
974d983712dSmrg#if 0
975d983712dSmrgstatic int pat_src_addr;
976d983712dSmrg
977d983712dSmrgstatic void
978d983712dSmrgTsengSetupForColor8x8PatternFill(ScrnInfoPtr pScrn,
979d983712dSmrg    int patx, int paty, int rop, unsigned int planemask, int trans_color)
980d983712dSmrg{
981d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
982d983712dSmrg
983d983712dSmrg    pat_src_addr = FBADDR(pTseng, patx, paty);
984d983712dSmrg
985d983712dSmrg    ErrorF("P");
986d983712dSmrg
987d983712dSmrg    Tseng_setup_screencopy(pTseng, rop, planemask, trans_color, 0);
988d983712dSmrg
989d983712dSmrg    switch (pTseng->Bytesperpixel) {
990d983712dSmrg    case 1:
991d983712dSmrg	ACL_SOURCE_WRAP(0x33);       /* 8x8 wrap */
992d983712dSmrg	ACL_SOURCE_Y_OFFSET(8 - 1);
993d983712dSmrg	break;
994d983712dSmrg    case 2:
995d983712dSmrg	ACL_SOURCE_WRAP(0x34);       /* 16x8 wrap */
996d983712dSmrg	ACL_SOURCE_Y_OFFSET(16 - 1);
997d983712dSmrg	break;
998d983712dSmrg    case 3:
999d983712dSmrg	ACL_SOURCE_WRAP(0x3D);       /* 24x8 wrap --- only for ET6000 !!! */
1000d983712dSmrg	ACL_SOURCE_Y_OFFSET(32 - 1); /* this is no error -- see databook */
1001d983712dSmrg	break;
1002d983712dSmrg    case 4:
1003d983712dSmrg	ACL_SOURCE_WRAP(0x35);       /* 32x8 wrap */
1004d983712dSmrg	ACL_SOURCE_Y_OFFSET(32 - 1);
1005d983712dSmrg    }
1006d983712dSmrg}
1007d983712dSmrg
1008d983712dSmrgstatic void
1009d983712dSmrgTsengSubsequentColor8x8PatternFillRect(ScrnInfoPtr pScrn,
1010d983712dSmrg    int patx, int paty, int x, int y, int w, int h)
1011d983712dSmrg{
1012d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
1013d983712dSmrg    int destaddr = FBADDR(pTseng, x, y);
1014d983712dSmrg    int srcaddr = pat_src_addr + MULBPP(pTseng, paty * 8 + patx);
1015d983712dSmrg
1016d983712dSmrg    wait_acl_queue(pTseng);
1017d983712dSmrg
1018d983712dSmrg    ACL_SOURCE_ADDRESS(srcaddr);
1019d983712dSmrg
1020d983712dSmrg    SET_XY(pTseng, w, h);
1021d983712dSmrg    START_ACL(pTseng, destaddr);
1022d983712dSmrg}
1023d983712dSmrg#endif
1024d983712dSmrg
1025d983712dSmrg#if 0
1026d983712dSmrg/*
1027d983712dSmrg * ImageWrite is nothing more than a per-scanline screencopy.
1028d983712dSmrg */
1029d983712dSmrg
1030d983712dSmrgstatic void
1031d983712dSmrgTsengSetupForScanlineImageWrite(ScrnInfoPtr pScrn,
1032d983712dSmrg    int rop, unsigned int planemask, int trans_color, int bpp, int depth)
1033d983712dSmrg{
1034d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
1035d983712dSmrg
1036d983712dSmrg/*    ErrorF("IW"); */
1037d983712dSmrg
1038d983712dSmrg    Tseng_setup_screencopy(pTseng, rop, planemask, trans_color, 0);
1039d983712dSmrg
1040d983712dSmrg    ACL_SOURCE_WRAP(0x77);	       /* no wrap */
1041d983712dSmrg    ACL_SOURCE_Y_OFFSET(pTseng->line_width - 1);
1042d983712dSmrg}
1043d983712dSmrg
1044d983712dSmrgstatic void
1045d983712dSmrgTsengSubsequentScanlineImageWriteRect(ScrnInfoPtr pScrn,
1046d983712dSmrg    int x, int y, int w, int h, int skipleft)
1047d983712dSmrg{
1048d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
1049d983712dSmrg
1050d983712dSmrg/*    ErrorF("r%d",h); */
1051d983712dSmrg
1052d983712dSmrg    pTseng->acl_iw_dest = y * pTseng->line_width + MULBPP(pTseng, x);
1053d983712dSmrg    pTseng->acl_skipleft = MULBPP(pTseng, skipleft);
1054d983712dSmrg
1055d983712dSmrg    wait_acl_queue(pTseng);
1056d983712dSmrg    SET_XY(pTseng, w, 1);
1057d983712dSmrg}
1058d983712dSmrg
1059d983712dSmrgstatic void
1060d983712dSmrgTsengSubsequentImageWriteScanline(ScrnInfoPtr pScrn,
1061d983712dSmrg    int bufno)
1062d983712dSmrg{
1063d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
1064d983712dSmrg
1065d983712dSmrg/*    ErrorF("%d", bufno); */
1066d983712dSmrg
1067d983712dSmrg    wait_acl_queue(pTseng);
1068d983712dSmrg
1069d983712dSmrg    ACL_SOURCE_ADDRESS(pTseng->AccelImageWriteBufferOffsets[bufno]
1070d983712dSmrg		       + pTseng->acl_skipleft);
1071d983712dSmrg    START_ACL(pTseng, pTseng->acl_iw_dest);
1072d983712dSmrg    pTseng->acl_iw_dest += pTseng->line_width;
1073d983712dSmrg}
1074d983712dSmrg#endif
1075d983712dSmrg
1076d983712dSmrg#if 0
1077d983712dSmrg/*
1078d983712dSmrg * W32p/ET6000 hardware linedraw code.
1079d983712dSmrg *
1080d983712dSmrg * TsengSetupForSolidFill() is used as a setup function.
1081d983712dSmrg *
1082d983712dSmrg * Three major problems that needed to be solved here:
1083d983712dSmrg *
1084d983712dSmrg * 1. The "bias" value must be translated into the "line draw algorithm"
1085d983712dSmrg *    parameter in the Tseng accelerators. This parameter, although not
1086d983712dSmrg *    documented as such, needs to be set to the _inverse_ of the
1087d983712dSmrg *    appropriate bias bit (i.e. for the appropriate octant).
1088d983712dSmrg *
1089d983712dSmrg * 2. In >8bpp modes, the accelerator will render BYTES in the same order as
1090d983712dSmrg *    it is drawing the line. This means it will render the colors in the
1091d983712dSmrg *    same order as well, reversing the byte-order in pixels that are drawn
1092d983712dSmrg *    right-to-left. This causes wrong colors to be rendered.
1093d983712dSmrg *
1094d983712dSmrg * 3. The Tseng data book says that the ACL Y count register needs to be
1095d983712dSmrg *    programmed with "dy-1". A similar thing is said about ACL X count. But
1096d983712dSmrg *    this assumes (x2,y2) is NOT drawn (although that is not mentionned in
1097d983712dSmrg *    the data book). X assumes the endpoint _is_ drawn. If "dy-1" is used,
1098d983712dSmrg *    this sometimes results in a negative value (if dx==dy==0),
1099d983712dSmrg *    causing a complete accelerator hang.
1100d983712dSmrg */
1101d983712dSmrg
1102d983712dSmrgstatic void
1103d983712dSmrgTsengSubsequentSolidBresenhamLine(ScrnInfoPtr pScrn,
1104d983712dSmrg    int x, int y, int major, int minor, int err, int len, int octant)
1105d983712dSmrg{
1106d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
1107d983712dSmrg    int destaddr = FBADDR(pTseng, x, y);
1108d983712dSmrg    int xydir = pTseng->BresenhamTable[octant];
1109d983712dSmrg
1110d983712dSmrg    /* Tseng wants the real dx/dy in major/minor. Bresenham uses 2*dx and 2*dy */
1111d983712dSmrg    minor >>= 1;
1112d983712dSmrg    major >>= 1;
1113d983712dSmrg
1114d983712dSmrg    wait_acl_queue(pTseng);
1115d983712dSmrg
1116d983712dSmrg    if (!(octant & YMAJOR)) {
1117d983712dSmrg	SET_X_YRAW(pTseng, len, 0xFFF);
1118d983712dSmrg    } else {
1119d983712dSmrg	SET_XY_RAW(pTseng,0xFFF, len - 1);
1120d983712dSmrg    }
1121d983712dSmrg
1122d983712dSmrg    SET_DELTA(minor, major);
1123d983712dSmrg    ACL_ERROR_TERM(-err);  /* error term from XAA is NEGATIVE */
1124d983712dSmrg
1125d983712dSmrg    /* make sure colors are rendered correctly if >8bpp */
1126d983712dSmrg    if (octant & XDECREASING) {
1127d983712dSmrg	destaddr += pTseng->Bytesperpixel - 1;
1128d983712dSmrg	ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset
1129d983712dSmrg			   + pTseng->tsengFg + pTseng->neg_x_pixel_offset);
1130d983712dSmrg    } else
1131d983712dSmrg	ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengFg);
1132d983712dSmrg
1133d983712dSmrg    SET_XYDIR(xydir);
1134d983712dSmrg
1135d983712dSmrg    START_ACL(pTseng, destaddr);
1136d983712dSmrg}
1137d983712dSmrg#endif
1138d983712dSmrg
1139d983712dSmrg#ifdef TODO
1140d983712dSmrg/*
1141d983712dSmrg * Trapezoid filling code.
1142d983712dSmrg *
1143d983712dSmrg * TsengSetupForSolidFill() is used as a setup function
1144d983712dSmrg */
1145d983712dSmrg
1146d983712dSmrg#undef DEBUG_TRAP
1147d983712dSmrg
1148d983712dSmrg#ifdef TSENG_TRAPEZOIDS
1149d983712dSmrgstatic void
1150d983712dSmrgTsengSubsequentFillTrapezoidSolid(ytop, height, left, dxL, dyL, eL, right, dxR, dyR, eR)
1151d983712dSmrg    int ytop;
1152d983712dSmrg    int height;
1153d983712dSmrg    int left;
1154d983712dSmrg    int dxL, dyL;
1155d983712dSmrg    int eL;
1156d983712dSmrg    int right;
1157d983712dSmrg    int dxR, dyR;
1158d983712dSmrg    int eR;
1159d983712dSmrg{
1160d983712dSmrg    unsigned int tseng_bias_compensate = 0xd8;
1161d983712dSmrg    int destaddr, algrthm;
1162d983712dSmrg    int xcount = right - left + 1;     /* both edges included */
1163d983712dSmrg    int dir_reg = 0x60;		       /* trapezoid drawing; use error term for primary edge */
1164d983712dSmrg    int sec_dir_reg = 0x20;	       /* use error term for secondary edge */
1165d983712dSmrg    int octant = 0;
1166d983712dSmrg
1167d983712dSmrg    /*    ErrorF("#"); */
1168d983712dSmrg
1169d983712dSmrg    int destaddr, algrthm;
1170d983712dSmrg    int xcount = right - left + 1;
1171d983712dSmrg
1172d983712dSmrg#ifdef USE_ERROR_TERM
1173d983712dSmrg    int dir_reg = 0x60;
1174d983712dSmrg    int sec_dir_reg = 0x20;
1175d983712dSmrg
1176d983712dSmrg#else
1177d983712dSmrg    int dir_reg = 0x40;
1178d983712dSmrg    int sec_dir_reg = 0x00;
1179d983712dSmrg
1180d983712dSmrg#endif
1181d983712dSmrg    int octant = 0;
1182d983712dSmrg    int bias = 0x00;		       /* FIXME !!! */
1183d983712dSmrg
1184d983712dSmrg/*    ErrorF("#"); */
1185d983712dSmrg
1186d983712dSmrg#ifdef DEBUG_TRAP
1187d983712dSmrg    ErrorF("ytop=%d, height=%d, left=%d, dxL=%d, dyL=%d, eL=%d, right=%d, dxR=%d, dyR=%d, eR=%d ",
1188d983712dSmrg	ytop, height, left, dxL, dyL, eL, right, dxR, dyR, eR);
1189d983712dSmrg#endif
1190d983712dSmrg
1191d983712dSmrg    if ((dyL < 0) || (dyR < 0))
1192d983712dSmrg	ErrorF("Tseng Trapezoids: Wrong assumption: dyL/R < 0\n");
1193d983712dSmrg
1194d983712dSmrg    destaddr = FBADDR(pTseng, left, ytop);
1195d983712dSmrg
1196d983712dSmrg    /* left edge */
1197d983712dSmrg    if (dxL < 0) {
1198d983712dSmrg	dir_reg |= 1;
1199d983712dSmrg	octant |= XDECREASING;
1200d983712dSmrg	dxL = -dxL;
1201d983712dSmrg    }
1202d983712dSmrg    /* Y direction is always positive (top-to-bottom drawing) */
1203d983712dSmrg
1204d983712dSmrg    wait_acl_queue(pTseng);
1205d983712dSmrg
1206d983712dSmrg    /* left edge */
1207d983712dSmrg    /* compute axial direction and load registers */
1208d983712dSmrg    if (dxL >= dyL) {		       /* X is major axis */
1209d983712dSmrg	dir_reg |= 4;
1210d983712dSmrg	SET_DELTA(dyL, dxL);
1211d983712dSmrg	if (dir_reg & 1) {	       /* edge coherency: draw left edge */
1212d983712dSmrg	    destaddr += pTseng->Bytesperpixel;
1213d983712dSmrg	    sec_dir_reg |= 0x80;
1214d983712dSmrg	    xcount--;
1215d983712dSmrg	}
1216d983712dSmrg    } else {			       /* Y is major axis */
1217d983712dSmrg	SetYMajorOctant(octant);
1218d983712dSmrg	SET_DELTA(dxL, dyL);
1219d983712dSmrg    }
1220d983712dSmrg    ACL_ERROR_TERM(eL);
1221d983712dSmrg
1222d983712dSmrg    /* select "linedraw algorithm" (=bias) and load direction register */
1223d983712dSmrg    /* ErrorF(" o=%d ", octant); */
1224d983712dSmrg    algrthm = ((tseng_bias_compensate >> octant) & 1) ^ 1;
1225d983712dSmrg    dir_reg |= algrthm << 4;
1226d983712dSmrg    SET_XYDIR(dir_reg);
1227d983712dSmrg
1228d983712dSmrg    /* right edge */
1229d983712dSmrg    if (dxR < 0) {
1230d983712dSmrg	sec_dir_reg |= 1;
1231d983712dSmrg	dxR = -dxR;
1232d983712dSmrg    }
1233d983712dSmrg    /* compute axial direction and load registers */
1234d983712dSmrg    if (dxR >= dyR) {		       /* X is major axis */
1235d983712dSmrg	sec_dir_reg |= 4;
1236d983712dSmrg	SET_SECONDARY_DELTA(dyR, dxR);
1237d983712dSmrg	if (dir_reg & 1) {	       /* edge coherency: do not draw right edge */
1238d983712dSmrg	    sec_dir_reg |= 0x40;
1239d983712dSmrg	    xcount++;
1240d983712dSmrg	}
1241d983712dSmrg    } else {			       /* Y is major axis */
1242d983712dSmrg	SET_SECONDARY_DELTA(dxR, dyR);
1243d983712dSmrg    }
1244d983712dSmrg    ACL_SECONDARY_ERROR_TERM(eR);
1245d983712dSmrg
1246d983712dSmrg    /* ErrorF("%02x", sec_dir_reg); */
1247d983712dSmrg    SET_SECONDARY_XYDIR(sec_dir_reg);
1248d983712dSmrg
1249d983712dSmrg    SET_XY_6(pTseng, xcount, height);
1250d983712dSmrg
1251d983712dSmrg#ifdef DEBUG_TRAP
1252d983712dSmrg    ErrorF("-> %d,%d\n", xcount, height);
1253d983712dSmrg#endif
1254d983712dSmrg
1255d983712dSmrg    START_ACL_6(destaddr);
1256d983712dSmrg}
1257d983712dSmrg#endif
1258d983712dSmrg
1259d983712dSmrg#endif
1260d983712dSmrg
12614b9470b1Smrg#endif
1262d983712dSmrg
1263d983712dSmrg/*
1264d983712dSmrg * The following function sets up the supported acceleration. Call it from
1265d983712dSmrg * the FbInit() function in the SVGA driver. Do NOT initialize any hardware
1266d983712dSmrg * in here. That belongs in tseng_init_acl().
1267d983712dSmrg */
1268d983712dSmrgBool
1269d983712dSmrgTsengXAAInit(ScreenPtr pScreen)
1270d983712dSmrg{
12714b9470b1Smrg#ifdef HAVE_XAA_H
12724b9470b1Smrg    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);
1273d983712dSmrg    TsengPtr pTseng = TsengPTR(pScrn);
1274d983712dSmrg    XAAInfoRecPtr pXAAinfo;
1275d983712dSmrg    BoxRec AvailFBArea;
1276d983712dSmrg
1277d983712dSmrg    PDEBUG("	TsengXAAInit\n");
1278d983712dSmrg    pTseng->AccelInfoRec = pXAAinfo = XAACreateInfoRec();
1279d983712dSmrg    if (!pXAAinfo)
1280d983712dSmrg	return FALSE;
1281d983712dSmrg
1282d983712dSmrg    /*
1283d983712dSmrg     * Set up the main acceleration flags.
1284d983712dSmrg     */
1285d983712dSmrg    pXAAinfo->Flags = PIXMAP_CACHE;
1286d983712dSmrg
1287d983712dSmrg    /*
1288d983712dSmrg     * The following line installs a "Sync" function, that waits for
1289d983712dSmrg     * all coprocessor operations to complete.
1290d983712dSmrg     */
1291d983712dSmrg    pXAAinfo->Sync = TsengSync;
1292d983712dSmrg
1293d983712dSmrg    /* W32 and W32i must wait for ACL before changing registers */
1294d983712dSmrg    if (pTseng->ChipType == ET4000)
1295d983712dSmrg        pTseng->need_wait_acl = TRUE;
1296d983712dSmrg    else
1297d983712dSmrg        pTseng->need_wait_acl = FALSE;
1298d983712dSmrg
1299d983712dSmrg    pTseng->line_width = pScrn->displayWidth * pTseng->Bytesperpixel;
1300d983712dSmrg
1301d983712dSmrg#if 1
1302d983712dSmrg    /*
1303d983712dSmrg     * SolidFillRect.
1304d983712dSmrg     *
1305d983712dSmrg     * The W32 and W32i chips don't have a register to set the amount of
1306d983712dSmrg     * bytes per pixel, and hence they don't skip 1 byte in each 4-byte word
1307d983712dSmrg     * at 24bpp. Therefor, the FG or BG colors would have to be concatenated
1308d983712dSmrg     * in video memory (R-G-B-R-G-B-... instead of R-G-B-X-R-G-B-X-..., with
1309d983712dSmrg     * X = dont' care), plus a wrap value that is a multiple of 3 would have
1310d983712dSmrg     * to be set. There is no such wrap combination available.
1311d983712dSmrg     */
1312d983712dSmrg#ifdef OBSOLETE
1313d983712dSmrg    pXAAinfo->SolidFillFlags |= NO_PLANEMASK;
1314d983712dSmrg#endif
1315d983712dSmrg
1316d983712dSmrg    pXAAinfo->SetupForSolidFill = TsengSetupForSolidFill;
1317d983712dSmrg    if (pTseng->ChipType == ET6000)
1318d983712dSmrg        pXAAinfo->SubsequentSolidFillRect = Tseng6KSubsequentSolidFillRect;
1319d983712dSmrg    else
1320d983712dSmrg        pXAAinfo->SubsequentSolidFillRect = TsengW32pSubsequentSolidFillRect;
1321d983712dSmrg
1322d983712dSmrg#ifdef TSENG_TRAPEZOIDS
1323d983712dSmrg    if (pTseng->ChipType == ET6000)
1324d983712dSmrg	/* disabled for now: not fully compliant yet */
1325d983712dSmrg	pXAAinfo->SubsequentFillTrapezoidSolid = TsengSubsequentFillTrapezoidSolid;
1326d983712dSmrg#endif
1327d983712dSmrg#endif
1328d983712dSmrg
1329d983712dSmrg#if 1
1330d983712dSmrg    /*
1331d983712dSmrg     * SceenToScreenCopy (BitBLT).
1332d983712dSmrg     *
1333d983712dSmrg     * Restrictions: On ET6000, we support EITHER a planemask OR
1334d983712dSmrg     * TRANSPARENCY, but not both (they use the same Pattern map).
1335d983712dSmrg     * All other chips can't do TRANSPARENCY at all.
1336d983712dSmrg     */
1337d983712dSmrg#ifdef ET6K_TRANSPARENCY
1338d983712dSmrg    pXAAinfo->CopyAreaFlags = NO_PLANEMASK;
1339d983712dSmrg    if (pTseng->ChipType == ET4000)
1340d983712dSmrg	pXAAinfo->CopyAreaFlags |= NO_TRANSPARENCY;
1341d983712dSmrg
1342d983712dSmrg#else
1343d983712dSmrg    pXAAinfo->CopyAreaFlags = NO_TRANSPARENCY;
1344d983712dSmrg#endif
1345d983712dSmrg
1346d983712dSmrg    pXAAinfo->SetupForScreenToScreenCopy =
1347d983712dSmrg	TsengSetupForScreenToScreenCopy;
1348d983712dSmrg    pXAAinfo->SubsequentScreenToScreenCopy =
1349d983712dSmrg	TsengSubsequentScreenToScreenCopy;
1350d983712dSmrg#endif
1351d983712dSmrg
1352d983712dSmrg#if 0
1353d983712dSmrg    /*
1354d983712dSmrg     * ImageWrite.
1355d983712dSmrg     *
1356d983712dSmrg     * SInce this uses off-screen scanline buffers, it is only of use when
1357d983712dSmrg     * complex ROPs are used. But since the current XAA pixmap cache code
1358d983712dSmrg     * only works when an ImageWrite is provided, the NO_GXCOPY flag is
1359d983712dSmrg     * temporarily disabled.
1360d983712dSmrg     */
1361d983712dSmrg
1362d983712dSmrg    if (pTseng->AccelImageWriteBufferOffsets[0]) {
1363d983712dSmrg	pXAAinfo->ScanlineImageWriteFlags =
1364d983712dSmrg	    pXAAinfo->CopyAreaFlags | LEFT_EDGE_CLIPPING /* | NO_GXCOPY */ ;
1365d983712dSmrg	pXAAinfo->NumScanlineImageWriteBuffers = 2;
1366d983712dSmrg	pXAAinfo->SetupForScanlineImageWrite =
1367d983712dSmrg	    TsengSetupForScanlineImageWrite;
1368d983712dSmrg	pXAAinfo->SubsequentScanlineImageWriteRect =
1369d983712dSmrg	    TsengSubsequentScanlineImageWriteRect;
1370d983712dSmrg	pXAAinfo->SubsequentImageWriteScanline =
1371d983712dSmrg	    TsengSubsequentImageWriteScanline;
1372d983712dSmrg
1373d983712dSmrg	/* calculate memory addresses from video memory offsets */
1374d983712dSmrg	for (i = 0; i < pXAAinfo->NumScanlineImageWriteBuffers; i++) {
1375d983712dSmrg	    pTseng->XAAScanlineImageWriteBuffers[i] =
1376d983712dSmrg		pTseng->FbBase + pTseng->AccelImageWriteBufferOffsets[i];
1377d983712dSmrg	}
1378d983712dSmrg
1379d983712dSmrg	pXAAinfo->ScanlineImageWriteBuffers = pTseng->XAAScanlineImageWriteBuffers;
1380d983712dSmrg    }
1381d983712dSmrg#endif
1382d983712dSmrg    /*
1383d983712dSmrg     * 8x8 pattern tiling not possible on W32/i/p chips in 24bpp mode.
1384d983712dSmrg     * Currently, 24bpp pattern tiling doesn't work at all on those.
1385d983712dSmrg     *
1386d983712dSmrg     * FIXME: On W32 cards, pattern tiling doesn't work as expected.
1387d983712dSmrg     */
1388d983712dSmrg    pXAAinfo->Color8x8PatternFillFlags = HARDWARE_PATTERN_PROGRAMMED_ORIGIN;
1389d983712dSmrg
1390d983712dSmrg    pXAAinfo->CachePixelGranularity = 8 * 8;
1391d983712dSmrg
1392d983712dSmrg#ifdef ET6K_TRANSPARENCY
1393d983712dSmrg    pXAAinfo->PatternFlags |= HARDWARE_PATTERN_NO_PLANEMASK;
1394d983712dSmrg    if (pTseng->ChipType == ET6000)
1395d983712dSmrg	pXAAinfo->PatternFlags |= HARDWARE_PATTERN_TRANSPARENCY;
1396d983712dSmrg#endif
1397d983712dSmrg
1398d983712dSmrg#if 0
1399d983712dSmrg    /* FIXME! This needs to be fixed for W32 and W32i (it "should work") */
1400d983712dSmrg    if (pScrn->bitsPerPixel != 24) {
1401d983712dSmrg	pXAAinfo->SetupForColor8x8PatternFill =
1402d983712dSmrg	    TsengSetupForColor8x8PatternFill;
1403d983712dSmrg	pXAAinfo->SubsequentColor8x8PatternFillRect =
1404d983712dSmrg	    TsengSubsequentColor8x8PatternFillRect;
1405d983712dSmrg    }
1406d983712dSmrg#endif
1407d983712dSmrg
1408d983712dSmrg#if 0 /*1*/
1409d983712dSmrg    /*
1410d983712dSmrg     * SolidLine.
1411d983712dSmrg     *
1412d983712dSmrg     * We use Bresenham by preference, because it supports hardware clipping
1413d983712dSmrg     * (using the error term). TwoPointLines() is implemented, but not used,
1414d983712dSmrg     * because clipped lines are not accelerated (hardware clipping support
1415d983712dSmrg     * is lacking)...
1416d983712dSmrg     */
1417d983712dSmrg
1418d983712dSmrg    /*
1419d983712dSmrg     * Fill in the hardware linedraw ACL_XY_DIRECTION table
1420d983712dSmrg     *
1421d983712dSmrg     * W32BresTable[] converts XAA interface Bresenham octants to direct
1422d983712dSmrg     * ACL direction register contents. This includes the correct bias
1423d983712dSmrg     * setting etc.
1424d983712dSmrg     *
1425d983712dSmrg     * According to miline.h (but with base 0 instead of base 1 as in
1426d983712dSmrg     * miline.h), the octants are numbered as follows:
1427d983712dSmrg     *
1428d983712dSmrg     *   \    |    /
1429d983712dSmrg     *    \ 2 | 1 /
1430d983712dSmrg     *     \  |  /
1431d983712dSmrg     *    3 \ | / 0
1432d983712dSmrg     *       \|/
1433d983712dSmrg     *   -----------
1434d983712dSmrg     *       /|\
1435d983712dSmrg     *    4 / | \ 7
1436d983712dSmrg     *     /  |  \
1437d983712dSmrg     *    / 5 | 6 \
1438d983712dSmrg     *   /    |    \
1439d983712dSmrg     *
1440d983712dSmrg     * In ACL_XY_DIRECTION, bits 2:0 are defined as follows:
1441d983712dSmrg     *	0: '1' if XDECREASING
1442d983712dSmrg     *	1: '1' if YDECREASING
1443d983712dSmrg     *	2: '1' if XMAJOR (== not YMAJOR)
1444d983712dSmrg     *
1445d983712dSmrg     * Bit 4 defines the bias.  It should be set to '1' for all octants
1446d983712dSmrg     * NOT passed to miSetZeroLineBias(). i.e. the inverse of the X bias.
1447d983712dSmrg     *
1448d983712dSmrg     * (For MS compatible bias, the data book says to set to the same as
1449d983712dSmrg     * YDIR, i.e. bit 1 of the same register, = '1' if YDECREASING. MS
1450d983712dSmrg     * bias is towards octants 0..3 (i.e. Y decreasing), hence this
1451d983712dSmrg     * definition of bit 4)
1452d983712dSmrg     *
1453d983712dSmrg     */
1454d983712dSmrg    pTseng->BresenhamTable = xnfalloc(8);
1455d983712dSmrg    if (pTseng->BresenhamTable == NULL) {
1456d983712dSmrg        xf86Msg(X_ERROR, "Could not malloc Bresenham Table.\n");
1457d983712dSmrg        return FALSE;
1458d983712dSmrg    }
1459d983712dSmrg    for (i=0; i<8; i++) {
1460d983712dSmrg        unsigned char zerolinebias = miGetZeroLineBias(pScreen);
1461d983712dSmrg        pTseng->BresenhamTable[i] = 0xA0; /* command=linedraw, use error term */
1462d983712dSmrg        if (i & XDECREASING) pTseng->BresenhamTable[i] |= 0x01;
1463d983712dSmrg        if (i & YDECREASING) pTseng->BresenhamTable[i] |= 0x02;
1464d983712dSmrg        if (!(i & YMAJOR))   pTseng->BresenhamTable[i] |= 0x04;
1465d983712dSmrg        if ((1 << i) & zerolinebias) pTseng->BresenhamTable[i] |= 0x10;
1466d983712dSmrg        /* ErrorF("BresenhamTable[%d]=0x%x\n", i, pTseng->BresenhamTable[i]); */
1467d983712dSmrg    }
1468d983712dSmrg
1469d983712dSmrg    pXAAinfo->SolidLineFlags = 0;
1470d983712dSmrg    pXAAinfo->SetupForSolidLine = TsengSetupForSolidFill;
1471d983712dSmrg    pXAAinfo->SubsequentSolidBresenhamLine =
1472d983712dSmrg        TsengSubsequentSolidBresenhamLine;
1473d983712dSmrg    /*
1474d983712dSmrg     * ErrorTermBits is used to limit minor, major and error term, so it
1475d983712dSmrg     * must be min(errorterm_size, delta_major_size, delta_minor_size)
1476d983712dSmrg     * But the calculation for major and minor is done on the DOUBLED
1477d983712dSmrg     * values (as per the Bresenham algorithm), so they can also have 13
1478d983712dSmrg     * bits (inside XAA). They are divided by 2 in this driver, so they
1479d983712dSmrg     * are then again limited to 12 bits.
1480d983712dSmrg     */
1481d983712dSmrg    pXAAinfo->SolidBresenhamLineErrorTermBits = 13;
1482d983712dSmrg
1483d983712dSmrg#endif
1484d983712dSmrg
1485d983712dSmrg#if 1
1486d983712dSmrg    /* set up color expansion acceleration */
1487d983712dSmrg    if (!TsengXAAInit_Colexp(pScrn))
1488d983712dSmrg	return FALSE;
1489d983712dSmrg#endif
1490d983712dSmrg
1491d983712dSmrg
1492d983712dSmrg    /*
1493d983712dSmrg     * For Tseng, we set up some often-used values
1494d983712dSmrg     */
1495d983712dSmrg
1496d983712dSmrg    switch (pTseng->Bytesperpixel) {   /* for MULBPP optimization */
1497d983712dSmrg    case 1:
1498d983712dSmrg	pTseng->powerPerPixel = 0;
1499d983712dSmrg	pTseng->planemask_mask = 0x000000FF;
1500d983712dSmrg	pTseng->neg_x_pixel_offset = 0;
1501d983712dSmrg	break;
1502d983712dSmrg    case 2:
1503d983712dSmrg	pTseng->powerPerPixel = 1;
1504d983712dSmrg	pTseng->planemask_mask = 0x0000FFFF;
1505d983712dSmrg	pTseng->neg_x_pixel_offset = 1;
1506d983712dSmrg	break;
1507d983712dSmrg    case 3:
1508d983712dSmrg	pTseng->powerPerPixel = 1;
1509d983712dSmrg	pTseng->planemask_mask = 0x00FFFFFF;
1510d983712dSmrg	pTseng->neg_x_pixel_offset = 2;		/* is this correct ??? */
1511d983712dSmrg	break;
1512d983712dSmrg    case 4:
1513d983712dSmrg	pTseng->powerPerPixel = 2;
1514d983712dSmrg	pTseng->planemask_mask = 0xFFFFFFFF;
1515d983712dSmrg	pTseng->neg_x_pixel_offset = 3;
1516d983712dSmrg	break;
1517d983712dSmrg    }
1518d983712dSmrg
1519d983712dSmrg    /*
1520d983712dSmrg     * Init ping-pong registers.
1521d983712dSmrg     * This might be obsoleted by the BACKGROUND_OPERATIONS flag.
1522d983712dSmrg     */
1523d983712dSmrg    pTseng->tsengFg = 0;
1524d983712dSmrg    pTseng->tsengBg = 16;
1525d983712dSmrg    pTseng->tsengPat = 32;
1526d983712dSmrg
1527d983712dSmrg    /* for register write optimisation */
1528d983712dSmrg    pTseng->tseng_old_dir = -1;
1529d983712dSmrg    pTseng->old_x = 0;
1530d983712dSmrg    pTseng->old_y = 0;
1531d983712dSmrg
1532d983712dSmrg    /*
1533d983712dSmrg     * Finally, we set up the video memory space available to the pixmap
1534d983712dSmrg     * cache. In this case, all memory from the end of the virtual screen to
1535d983712dSmrg     * the end of video memory minus 1K (which we already reserved), can be
1536d983712dSmrg     * used.
1537d983712dSmrg     */
1538d983712dSmrg
1539d983712dSmrg    AvailFBArea.x1 = 0;
1540d983712dSmrg    AvailFBArea.y1 = 0;
1541d983712dSmrg    AvailFBArea.x2 = pScrn->displayWidth;
1542d983712dSmrg    AvailFBArea.y2 = (pScrn->videoRam * 1024) /
1543d983712dSmrg	(pScrn->displayWidth * pTseng->Bytesperpixel);
1544d983712dSmrg
1545d983712dSmrg    xf86InitFBManager(pScreen, &AvailFBArea);
1546d983712dSmrg
1547d983712dSmrg    return (XAAInit(pScreen, pXAAinfo));
15484b9470b1Smrg#else
15494b9470b1Smrg    return FALSE;
15504b9470b1Smrg#endif
1551d983712dSmrg}
1552