tseng_accel.c revision 962c3257
1
2#ifdef HAVE_CONFIG_H
3#include "config.h"
4#endif
5
/*
 * If NO_OPTIMIZE is defined, some optimizations are disabled.
 *
 * Those optimizations try to minimize the number of writes to accelerator
 * registers, since these writes are what slows down small operations the
 * most.
 */
13/* #define NO_OPTIMIZE */
14
15/*
16 * if ET6K_TRANSPARENCY is set, ScreentoScreenCopy operations (and pattern
17 * fills) will support transparency. But then the planemask support has to
18 * be dropped. The default here is to support planemasks, because all Tseng
19 * chips can do this. Only the ET6000 supports a transparency compare. The
20 * code could be easily changed to support transparency on the ET6000 and
21 * planemasks on the others, but that's only useful when transparency is
22 * more important than planemasks.
23 */
24#undef ET6K_TRANSPARENCY
25
26#include "tseng.h"
27#include "tseng_accel.h"
28
29#include "miline.h"
30
/*
 * X ROP -> Microsoft ROP conversion (source/destination operations).
 * The table is indexed by the X raster operation number (0..15).
 */
static int W32OpTable[] =
{
    /*  0  Xclear         0                    */  0x00,
    /*  1  Xand           src AND dst          */  0x88,
    /*  2  XandReverse    src AND NOT dst      */  0x44,
    /*  3  Xcopy          src                  */  0xcc,
    /*  4  XandInverted   NOT src AND dst      */  0x22,
    /*  5  Xnoop          dst                  */  0xaa,
    /*  6  Xxor           src XOR dst          */  0x66,
    /*  7  Xor            src OR dst           */  0xee,
    /*  8  Xnor           NOT src AND NOT dst  */  0x11,
    /*  9  Xequiv         NOT src XOR dst      */  0x99,
    /* 10  Xinvert        NOT dst              */  0x55,
    /* 11  XorReverse     src OR NOT dst       */  0xdd,
    /* 12  XcopyInverted  NOT src              */  0x33,
    /* 13  XorInverted    NOT src OR dst       */  0xbb,
    /* 14  Xnand          NOT src OR NOT dst   */  0x77,
    /* 15  Xset           1                    */  0xff
};
54
/*
 * ROP codes used when planemask emulation is active.  Compared with
 * W32OpTable, every entry keeps its high nibble and has 0xa as low nibble.
 * Indexed by the X raster operation number (0..15).
 */
static int W32OpTable_planemask[] =
{
    /*  0  Xclear         0                    */  0x0a,
    /*  1  Xand           src AND dst          */  0x8a,
    /*  2  XandReverse    src AND NOT dst      */  0x4a,
    /*  3  Xcopy          src                  */  0xca,
    /*  4  XandInverted   NOT src AND dst      */  0x2a,
    /*  5  Xnoop          dst                  */  0xaa,
    /*  6  Xxor           src XOR dst          */  0x6a,
    /*  7  Xor            src OR dst           */  0xea,
    /*  8  Xnor           NOT src AND NOT dst  */  0x1a,
    /*  9  Xequiv         NOT src XOR dst      */  0x9a,
    /* 10  Xinvert        NOT dst              */  0x5a,
    /* 11  XorReverse     src OR NOT dst       */  0xda,
    /* 12  XcopyInverted  NOT src              */  0x3a,
    /* 13  XorInverted    NOT src OR dst       */  0xba,
    /* 14  Xnand          NOT src OR NOT dst   */  0x7a,
    /* 15  Xset           1                    */  0xfa
};
74
/*
 * X ROP -> Microsoft ROP conversion for pattern/destination operations.
 * Indexed by the X raster operation number (0..15).
 */
static int W32PatternOpTable[] =
{
    /*  0  Xclear         0                    */  0x00,
    /*  1  Xand           pat AND dst          */  0xa0,
    /*  2  XandReverse    pat AND NOT dst      */  0x50,
    /*  3  Xcopy          pat                  */  0xf0,
    /*  4  XandInverted   NOT pat AND dst      */  0x0a,
    /*  5  Xnoop          dst                  */  0xaa,
    /*  6  Xxor           pat XOR dst          */  0x5a,
    /*  7  Xor            pat OR dst           */  0xfa,
    /*  8  Xnor           NOT pat AND NOT dst  */  0x05,
    /*  9  Xequiv         NOT pat XOR dst      */  0xa5,
    /* 10  Xinvert        NOT dst              */  0x55,
    /* 11  XorReverse     pat OR NOT dst       */  0xf5,
    /* 12  XcopyInverted  NOT pat              */  0x0f,
    /* 13  XorInverted    NOT pat OR dst       */  0xaf,
    /* 14  Xnand          NOT pat OR NOT dst   */  0x5f,
    /* 15  Xset           1                    */  0xff
};
94
95
96
97/**********************************************************************/
98
99static void
100tseng_terminate_acl(TsengPtr pTseng)
101{
102    /* only terminate when needed */
103/*  if (*(volatile unsigned char *)ACL_ACCELERATOR_STATUS & 0x06) */
104    {
105	ACL_SUSPEND_TERMINATE(0x00);
106	/* suspend any running operation */
107	ACL_SUSPEND_TERMINATE(0x01);
108	WAIT_ACL;
109	ACL_SUSPEND_TERMINATE(0x00);
110	/* ... and now terminate it */
111	ACL_SUSPEND_TERMINATE(0x10);
112	WAIT_ACL;
113	ACL_SUSPEND_TERMINATE(0x00);
114    }
115}
116
117static void
118tseng_recover_timeout(TsengPtr pTseng)
119{
120    if (pTseng->ChipType == ET4000) {
121	ErrorF("trying to unlock......................................\n");
122	MMIO_OUT32(pTseng->tsengCPU2ACLBase,0,0L); /* try unlocking the bus when CPU-to-accel gets stuck */
123
124        /* flush the accelerator pipeline */
125	ACL_SUSPEND_TERMINATE(0x00);
126	ACL_SUSPEND_TERMINATE(0x02);
127	ACL_SUSPEND_TERMINATE(0x00);
128    }
129}
130
131void
132tseng_init_acl(ScrnInfoPtr pScrn)
133{
134    TsengPtr pTseng = TsengPTR(pScrn);
135
136    PDEBUG("	tseng_init_acl\n");
137    /*
138     * prepare some shortcuts for faster access to memory mapped registers
139     */
140
141    pTseng->scratchMemBase = pTseng->FbBase + pTseng->AccelColorBufferOffset;
142    /*
143     * we won't be using tsengCPU2ACLBase in linear memory mode anyway, since
144     * using the MMU apertures restricts the amount of useable video memory
145     * to only 2MB, supposing we ONLY redirect MMU aperture 2 to the CPU.
146     * (see data book W32p, page 207)
147     */
148    pTseng->tsengCPU2ACLBase = pTseng->FbBase + 0x200000;	/* MMU aperture 2 */
149
150#ifdef DEBUG
151    ErrorF("MMioBase = 0x%x, scratchMemBase = 0x%x\n", pTseng->MMioBase, pTseng->scratchMemBase);
152#endif
153
154    /*
155     * prepare the accelerator for some real work
156     */
157
158    tseng_terminate_acl(pTseng);
159
160    ACL_INTERRUPT_STATUS(0xe);       /* clear interrupts */
161    ACL_INTERRUPT_MASK(0x04);	       /* disable interrupts, but enable deadlock exit */
162    ACL_INTERRUPT_STATUS(0x0);
163    ACL_ACCELERATOR_STATUS_SET(0x0);
164
165    if (pTseng->ChipType == ET6000) {
166	ACL_STEPPING_INHIBIT(0x0);   /* Undefined at power-on, let all maps (Src, Dst, Mix, Pat) step */
167	ACL_6K_CONFIG(0x00);	       /* maximum performance -- what did you think? */
168	ACL_POWER_CONTROL(0x01);     /* conserve power when ACL is idle */
169	ACL_MIX_CONTROL(0x33);
170	ACL_TRANSFER_DISABLE(0x00);  /* Undefined at power-on, enable all transfers */
171    } else {			       /* W32i/W32p */
172  	ACL_RELOAD_CONTROL(0x0);
173	ACL_SYNC_ENABLE(0x1);	       /* | 0x2 = 0WS ACL read. Yields up to 10% faster operation for small blits */
174	ACL_ROUTING_CONTROL(0x00);
175    }
176
177    /* Enable the W32p startup bit and set use an eight-bit pixel depth */
178    ACL_NQ_X_POSITION(0);
179    ACL_NQ_Y_POSITION(0);
180    ACL_PIXEL_DEPTH((pScrn->bitsPerPixel - 8) << 1);
181    /* writing destination address will start ACL */
182    ACL_OPERATION_STATE(0x10);
183
184    ACL_DESTINATION_Y_OFFSET(pScrn->displayWidth * pTseng->Bytesperpixel - 1);
185    ACL_XY_DIRECTION(0);
186
187    MMU_CONTROL(0x74);
188
189    if (pTseng->ChipType == ET4000) {
190	/*
191	 * Since the w32p revs C and D don't have any memory mapped when the
192	 * accelerator registers are used it is necessary to use the MMUs to
193	 * provide a semblance of linear memory. Fortunately on these chips
194	 * the MMU appertures are 1 megabyte each. So as long as we are
195	 * willing to only use 3 megs of video memory we can have some
196	 * acceleration. If we ever get the CPU-to-screen-color-expansion
197	 * stuff working then we will NOT need to sacrifice the extra 1MB
198	 * provided by MBP2, because we could do dynamic switching of the APT
199	 * bit in the MMU control register.
200	 *
201	 * On W32p rev c and d MBP2 is hardwired to 0x200000 when linear
202	 * memory mode is enabled. (On rev a it is programmable).
203	 *
204	 * W32p rev a and b have their first 2M mapped in the normal (non-MMU)
205	 * way, and MMU0 and MMU1, each 512 kb wide, can be used to access
206	 * another 1MB of memory. This totals to 3MB of mem. available in
207	 * linear memory when the accelerator is enabled.
208	 */
209	if ((pTseng->ChipRev == REV_A) || (pTseng->ChipRev == REV_B)) {
210	    MMIO_OUT32(pTseng->MMioBase, 0x00<<0, 0x200000L);
211	    MMIO_OUT32(pTseng->MMioBase, 0x04<<0, 0x280000L);
212	} else {		       /* rev C & D */
213	    MMIO_OUT32(pTseng->MMioBase, 0x00<<0, 0x0L);
214	    MMIO_OUT32 (pTseng->MMioBase, 0x04<<0, 0x100000L);
215	}
216    }
217}
218
219/*
220 * ET4/6K acceleration interface -- color expansion primitives.
221 *
222 * Uses Harm Hanemaayer's generic acceleration interface (XAA).
223 *
224 * Author: Koen Gadeyne
225 *
226 * Much of the acceleration code is based on the XF86_W32 server code from
227 * Glenn Lai.
228 *
229 *
230 *     Color expansion capabilities of the Tseng chip families:
231 *
232 *     Chip     screen-to-screen   CPU-to-screen   Supported depths
233 *
234 *   ET4000W32/W32i   No               Yes             8bpp only
235 *   ET4000W32p       Yes              Yes             8bpp only
236 *   ET6000           Yes              No              8/16/24/32 bpp
237 */
/*
 * Select the color-expansion function of the accelerator.  On the ET6000
 * this writes the mix control register, on all other chips the routing
 * control register.  Relies on a local "pTseng" being in scope.
 *
 * Both macros are wrapped in do { } while (0) so that they behave as one
 * single statement: "if (x) SET_FUNCTION_COLOREXPAND; else ..." now
 * compiles and binds the "else" to the caller's "if".  Callers must
 * terminate the macro invocation with a semicolon.
 */
#define SET_FUNCTION_COLOREXPAND \
    do { \
        if (pTseng->ChipType == ET6000) \
            ACL_MIX_CONTROL(0x32); \
        else \
            ACL_ROUTING_CONTROL(0x08); \
    } while (0)

/* Select CPU-to-screen color expansion through the routing control register. */
#define SET_FUNCTION_COLOREXPAND_CPU \
    do { \
        ACL_ROUTING_CONTROL(0x02); \
    } while (0)
246
247
248static void
249TsengSubsequentScanlineCPUToScreenColorExpandFill(ScrnInfoPtr pScrn,
250    int x, int y, int w, int h, int skipleft)
251{
252    TsengPtr pTseng = TsengPTR(pScrn);
253
254    if (pTseng->ChipType == ET4000) {
255	/* the accelerator needs DWORD padding, and "w" is in PIXELS... */
256	pTseng->acl_colexp_width_dwords = (MULBPP(pTseng, w) + 31) >> 5;
257	pTseng->acl_colexp_width_bytes = (MULBPP(pTseng, w) + 7) >> 3;
258    }
259
260    pTseng->acl_ColorExpandDst = FBADDR(pTseng, x, y);
261    pTseng->acl_skipleft = skipleft;
262
263    wait_acl_queue(pTseng);
264
265#if 0
266    ACL_MIX_Y_OFFSET(w - 1);
267
268    ErrorF(" W=%d", w);
269#endif
270    SET_XY(pTseng, w, 1);
271}
272
273static void
274TsengSubsequentColorExpandScanline(ScrnInfoPtr pScrn,
275    int bufno)
276{
277    TsengPtr pTseng = TsengPTR(pScrn);
278
279    wait_acl_queue(pTseng);
280
281    ACL_MIX_ADDRESS((pTseng->AccelColorExpandBufferOffsets[bufno] << 3) + pTseng->acl_skipleft);
282    START_ACL(pTseng, pTseng->acl_ColorExpandDst);
283
284    /* move to next scanline */
285    pTseng->acl_ColorExpandDst += pTseng->line_width;
286
287    /*
288     * If not using triple-buffering, we need to wait for the queued
289     * register set to be transferred to the working register set here,
290     * because otherwise an e.g. double-buffering mechanism could overwrite
291     * the buffer that's currently being worked with with new data too soon.
292     *
293     * WAIT_QUEUE; // not needed with triple-buffering
294     */
295}
296
297
298
299/*
300 * We use this intermediate CPU-to-Screen color expansion because the one
301 * provided by XAA seems to lock up the accelerator engine.
302 *
303 * One of the main differences between the XAA approach and this one is that
304 * transfers are done per byte. I'm not sure if that is needed though.
305 */
306static void
307TsengSubsequentColorExpandScanline_8bpp(ScrnInfoPtr pScrn, int bufno)
308{
309    TsengPtr pTseng = TsengPTR(pScrn);
310    pointer dest = pTseng->tsengCPU2ACLBase;
311    int i,j;
312    CARD8 *bufptr;
313
314    i = pTseng->acl_colexp_width_bytes;
315    bufptr = (CARD8 *) (pTseng->XAAScanlineColorExpandBuffers[bufno]);
316
317    wait_acl_queue(pTseng);
318    START_ACL (pTseng, pTseng->acl_ColorExpandDst);
319
320/*  *((LongP) (MMioBase + 0x08)) = (CARD32) pTseng->acl_ColorExpandDst;*/
321/*  MMIO_OUT32(tsengCPU2ACLBase,0, (CARD32)pTseng->acl_ColorExpandDst); */
322    j = 0;
323    /* Copy scanline data to accelerator MMU aperture byte by byte */
324    while (i--) {		       /* FIXME: we need to take care of PCI bursting and MMU overflow here! */
325	MMIO_OUT8(dest,j++, *bufptr++);
326    }
327
328    /* move to next scanline */
329    pTseng->acl_ColorExpandDst += pTseng->line_width;
330}
331
332/*
333 * This function does direct memory-to-CPU bit doubling for color-expansion
334 * at 16bpp on W32 chips. They can only do 8bpp color expansion, so we have
335 * to expand the incoming data to 2bpp first.
336 */
337static void
338TsengSubsequentColorExpandScanline_16bpp(ScrnInfoPtr pScrn, int bufno)
339{
340    TsengPtr pTseng = TsengPTR(pScrn);
341    pointer dest = pTseng->tsengCPU2ACLBase;
342    int i,j;
343    CARD8 *bufptr;
344    register CARD32 bits16;
345
346    i = pTseng->acl_colexp_width_dwords * 2;
347    bufptr = (CARD8 *) (pTseng->XAAScanlineColorExpandBuffers[bufno]);
348
349    wait_acl_queue(pTseng);
350    START_ACL(pTseng, pTseng->acl_ColorExpandDst);
351
352    j = 0;
353    while (i--) {
354	bits16 = pTseng->ColExpLUT[*bufptr++];
355	MMIO_OUT8(dest,j++,bits16 & 0xFF);
356	MMIO_OUT8(dest,j++,(bits16 >> 8) & 0xFF);
357    }
358
359    /* move to next scanline */
360    pTseng->acl_ColorExpandDst += pTseng->line_width;
361}
362
363/*
364 * This function does direct memory-to-CPU bit doubling for color-expansion
365 * at 24bpp on W32 chips. They can only do 8bpp color expansion, so we have
366 * to expand the incoming data to 3bpp first.
367 */
368static void
369TsengSubsequentColorExpandScanline_24bpp(ScrnInfoPtr pScrn, int bufno)
370{
371    TsengPtr pTseng = TsengPTR(pScrn);
372    pointer dest = pTseng->tsengCPU2ACLBase;
373    int i, k, j = -1;
374    CARD8 *bufptr;
375    register CARD32 bits24;
376
377    i = pTseng->acl_colexp_width_dwords * 4;
378    bufptr = (CARD8 *) (pTseng->XAAScanlineColorExpandBuffers[bufno]);
379
380    wait_acl_queue(pTseng);
381    START_ACL(pTseng, pTseng->acl_ColorExpandDst);
382
383    /* take 8 input bits, expand to 3 output bytes */
384    bits24 = pTseng->ColExpLUT[*bufptr++];
385    k = 0;
386    while (i--) {
387	if ((j++) == 2) {	       /* "i % 3" operation is much to expensive */
388	    j = 0;
389	    bits24 = pTseng->ColExpLUT[*bufptr++];
390	}
391	MMIO_OUT8(dest,k++,bits24 & 0xFF);
392	bits24 >>= 8;
393    }
394
395    /* move to next scanline */
396    pTseng->acl_ColorExpandDst += pTseng->line_width;
397}
398
399/*
400 * This function does direct memory-to-CPU bit doubling for color-expansion
401 * at 32bpp on W32 chips. They can only do 8bpp color expansion, so we have
402 * to expand the incoming data to 4bpp first.
403 */
404static void
405TsengSubsequentColorExpandScanline_32bpp(ScrnInfoPtr pScrn, int bufno)
406{
407    TsengPtr pTseng = TsengPTR(pScrn);
408    pointer dest = pTseng->tsengCPU2ACLBase;
409    int i,j;
410    CARD8 *bufptr;
411    register CARD32 bits32;
412
413    i = pTseng->acl_colexp_width_dwords;
414   /* amount of blocks of 8 bits to expand to 32 bits (=1 DWORD) */
415    bufptr = (CARD8 *) (pTseng->XAAScanlineColorExpandBuffers[bufno]);
416
417    wait_acl_queue(pTseng);
418    START_ACL(pTseng, pTseng->acl_ColorExpandDst);
419
420    j = 0;
421    while (i--) {
422	bits32 = pTseng->ColExpLUT[*bufptr++];
423	MMIO_OUT8(dest,j++,bits32 & 0xFF);
424	MMIO_OUT8(dest,j++,(bits32 >> 8) & 0xFF);
425	MMIO_OUT8(dest,j++,(bits32 >> 16) & 0xFF);
426	MMIO_OUT8(dest,j++,(bits32 >> 24) & 0xFF);
427    }
428
429    /* move to next scanline */
430    pTseng->acl_ColorExpandDst += pTseng->line_width;
431}
432
433/*
434 * CPU-to-Screen color expansion.
435 *   This is for ET4000 only (The ET6000 cannot do this)
436 */
437static void
438TsengSetupForCPUToScreenColorExpandFill(ScrnInfoPtr pScrn,
439    int fg, int bg, int rop, unsigned int planemask)
440{
441    TsengPtr pTseng = TsengPTR(pScrn);
442
443/*  ErrorF("X"); */
444
445    PINGPONG(pTseng);
446
447    wait_acl_queue(pTseng);
448
449    SET_FG_ROP(rop);
450    SET_BG_ROP_TR(rop, bg);
451
452    SET_XYDIR(0);
453
454    SET_FG_BG_COLOR(pTseng, fg, bg);
455
456    SET_FUNCTION_COLOREXPAND_CPU;
457
458    /* assure correct alignment of MIX address (ACL needs same alignment here as in MMU aperture) */
459    ACL_MIX_ADDRESS(0);
460}
461
462#ifdef TSENG_CPU_TO_SCREEN_COLOREXPAND
463/*
464 * TsengSubsequentCPUToScreenColorExpand() is potentially dangerous:
465 *   Not writing enough data to the MMU aperture for CPU-to-screen color
466 *   expansion will eventually cause a system deadlock!
467 *
468 * Note that CPUToScreenColorExpand operations _always_ require a
469 * WAIT_INTERFACE before starting a new operation (this is empyrical,
470 * though)
471 */
472static void
473TsengSubsequentCPUToScreenColorExpandFill(ScrnInfoPtr pScrn,
474    int x, int y, int w, int h, int skipleft)
475{
476    TsengPtr pTseng = TsengPTR(pScrn);
477    int destaddr = FBADDR(pTseng, x, y);
478
479    /* ErrorF(" %dx%d|%d ",w,h,skipleft); */
480    if (skipleft)
481	ErrorF("Can't do: Skipleft = %d\n", skipleft);
482
483/*  wait_acl_queue(); */
484    ErrorF("=========WAIT     FIXME!\n");
485    WAIT_INTERFACE;
486
487    ACL_MIX_Y_OFFSET(w - 1);
488    SET_XY(pTseng, w, h);
489    START_ACL(pTseng, destaddr);
490}
491#endif
492
493static void
494TsengSetupForScreenToScreenColorExpandFill(ScrnInfoPtr pScrn,
495    int fg, int bg, int rop, unsigned int planemask)
496{
497    TsengPtr pTseng = TsengPTR(pScrn);
498
499/*  ErrorF("SSC "); */
500
501    PINGPONG(pTseng);
502
503    wait_acl_queue(pTseng);
504
505    SET_FG_ROP(rop);
506    SET_BG_ROP_TR(rop, bg);
507
508    SET_FG_BG_COLOR(pTseng, fg, bg);
509
510    SET_FUNCTION_COLOREXPAND;
511
512    SET_XYDIR(0);
513}
514
515static void
516TsengSubsequentScreenToScreenColorExpandFill(ScrnInfoPtr pScrn,
517    int x, int y, int w, int h, int srcx, int srcy, int skipleft)
518{
519    TsengPtr pTseng = TsengPTR(pScrn);
520    int destaddr = FBADDR(pTseng, x, y);
521
522/*    int srcaddr = FBADDR(pTseng, srcx, srcy); */
523
524    wait_acl_queue(pTseng);
525
526    SET_XY(pTseng, w, h);
527    ACL_MIX_ADDRESS(		       /* MIX address is in BITS */
528	(((srcy * pScrn->displayWidth) + srcx) * pScrn->bitsPerPixel) + skipleft);
529
530    ACL_MIX_Y_OFFSET(pTseng->line_width << 3);
531
532    START_ACL(pTseng, destaddr);
533}
534
535/*
536 *
537 */
538static Bool
539TsengXAAInit_Colexp(ScrnInfoPtr pScrn)
540{
541    int i, j, r;
542    TsengPtr pTseng = TsengPTR(pScrn);
543    XAAInfoRecPtr pXAAInfo = pTseng->AccelInfoRec;
544
545    PDEBUG("	TsengXAAInit_Colexp\n");
546
547#ifdef TODO
548    if (OFLG_ISSET(OPTION_XAA_NO_COL_EXP, &vga256InfoRec.options))
549	return;
550#endif
551
552    /* FIXME! disable accelerated color expansion for W32/W32i until it's fixed */
553/*  if (Is_W32 || Is_W32i) return; */
554
555    /*
556     * Screen-to-screen color expansion.
557     *
558     * Scanline-screen-to-screen color expansion is slower than
559     * CPU-to-screen color expansion.
560     */
561
562    pXAAInfo->ScreenToScreenColorExpandFillFlags =
563	BIT_ORDER_IN_BYTE_LSBFIRST |
564	SCANLINE_PAD_DWORD |
565	LEFT_EDGE_CLIPPING |
566	NO_PLANEMASK;
567
568#if 1
569    if ((pTseng->ChipType == ET6000) || (pScrn->bitsPerPixel == 8)) {
570	pXAAInfo->SetupForScreenToScreenColorExpandFill =
571	    TsengSetupForScreenToScreenColorExpandFill;
572	pXAAInfo->SubsequentScreenToScreenColorExpandFill =
573	    TsengSubsequentScreenToScreenColorExpandFill;
574    }
575#endif
576
577    /*
578     * Scanline CPU to screen color expansion for all W32 engines.
579     *
580     * real CPU-to-screen color expansion is extremely tricky, and only
581     * works for 8bpp anyway.
582     *
583     * This also allows us to do 16, 24 and 32 bpp color expansion by first
584     * doubling the bitmap pattern before color-expanding it, because W32s
585     * can only do 8bpp color expansion.
586     */
587
588    pXAAInfo->ScanlineCPUToScreenColorExpandFillFlags =
589	BIT_ORDER_IN_BYTE_LSBFIRST |
590	SCANLINE_PAD_DWORD |
591	NO_PLANEMASK;
592
593    if (pTseng->ChipType == ET4000) {
594	pTseng->XAAScanlineColorExpandBuffers[0] =
595	    xnfalloc(((pScrn->virtualX + 31)/32) * 4 * pTseng->Bytesperpixel);
596	if (pTseng->XAAScanlineColorExpandBuffers[0] == NULL) {
597	    xf86Msg(X_ERROR, "Could not malloc color expansion scanline buffer.\n");
598	    return FALSE;
599	}
600	pXAAInfo->NumScanlineColorExpandBuffers = 1;
601	pXAAInfo->ScanlineColorExpandBuffers = pTseng->XAAScanlineColorExpandBuffers;
602
603	pXAAInfo->SetupForScanlineCPUToScreenColorExpandFill =
604	    TsengSetupForCPUToScreenColorExpandFill;
605
606	pXAAInfo->SubsequentScanlineCPUToScreenColorExpandFill =
607	    TsengSubsequentScanlineCPUToScreenColorExpandFill;
608
609	switch (pScrn->bitsPerPixel) {
610	case 8:
611	    pXAAInfo->SubsequentColorExpandScanline =
612		TsengSubsequentColorExpandScanline_8bpp;
613	    break;
614	case 15:
615	case 16:
616	    pXAAInfo->SubsequentColorExpandScanline =
617		TsengSubsequentColorExpandScanline_16bpp;
618	    break;
619	case 24:
620	    pXAAInfo->SubsequentColorExpandScanline =
621		TsengSubsequentColorExpandScanline_24bpp;
622	    break;
623	case 32:
624	    pXAAInfo->SubsequentColorExpandScanline =
625		TsengSubsequentColorExpandScanline_32bpp;
626	    break;
627	}
628	/* create color expansion LUT (used for >8bpp only) */
629	pTseng->ColExpLUT = xnfalloc(sizeof(CARD32)*256);
630	if (pTseng->ColExpLUT == NULL) {
631	    xf86Msg(X_ERROR, "Could not malloc color expansion tables.\n");
632	    return FALSE;
633	}
634	for (i = 0; i < 256; i++) {
635	    r = 0;
636	    for (j = 7; j >= 0; j--) {
637		r <<= pTseng->Bytesperpixel;
638		if ((i >> j) & 1)
639		    r |= (1 << pTseng->Bytesperpixel) - 1;
640	    }
641	    pTseng->ColExpLUT[i] = r;
642	    /* ErrorF("0x%08X, ",r ); if ((i%8)==7) ErrorF("\n"); */
643	}
644    } else {
645	/*
646	 * Triple-buffering is needed to account for double-buffering of Tseng
647	 * acceleration registers.
648	 */
649	pXAAInfo->NumScanlineColorExpandBuffers = 3;
650	pXAAInfo->ScanlineColorExpandBuffers =
651	    pTseng->XAAColorExpandBuffers;
652	pXAAInfo->SetupForScanlineCPUToScreenColorExpandFill =
653	    TsengSetupForScreenToScreenColorExpandFill;
654	pXAAInfo->SubsequentScanlineCPUToScreenColorExpandFill =
655	    TsengSubsequentScanlineCPUToScreenColorExpandFill;
656	pXAAInfo->SubsequentColorExpandScanline =
657	    TsengSubsequentColorExpandScanline;
658
659	/* calculate memory addresses from video memory offsets */
660	for (i = 0; i < pXAAInfo->NumScanlineColorExpandBuffers; i++) {
661	    pTseng->XAAColorExpandBuffers[i] =
662		pTseng->FbBase + pTseng->AccelColorExpandBufferOffsets[i];
663	}
664
665	pXAAInfo->ScanlineColorExpandBuffers = pTseng->XAAColorExpandBuffers;
666    }
667
668#ifdef TSENG_CPU_TO_SCREEN_COLOREXPAND
669    /*
670     * CPU-to-screen color expansion doesn't seem to be reliable yet. The
671     * W32 needs the correct amount of data sent to it in this mode, or it
672     * hangs the machine until is does (?). Currently, the init code in this
673     * file or the XAA code that uses this does something wrong, so that
674     * occasionally we get accelerator timeouts, and after a few, complete
675     * system hangs.
676     *
677     * The W32 engine requires SCANLINE_NO_PAD, but that doesn't seem to
678     * work very well (accelerator hangs).
679     *
680     * What works is this: tell XAA that we have SCANLINE_PAD_DWORD, and then
681     * add the following code in TsengSubsequentCPUToScreenColorExpand():
682     *     w = (w + 31) & ~31; this code rounds the width up to the nearest
683     * multiple of 32, and together with SCANLINE_PAD_DWORD, this makes
684     * CPU-to-screen color expansion work. Of course, the display isn't
685     * correct (4 chars are "blanked out" when only one is written, for
686     * example). But this shows that the principle works. But the code
687     * doesn't...
688     *
689     * The same thing goes for PAD_BYTE: this also works (with the same
690     * problems as SCANLINE_PAD_DWORD, although less prominent)
691     */
692
693    pXAAInfo->CPUToScreenColorExpandFillFlags =
694	BIT_ORDER_IN_BYTE_LSBFIRST |
695	SCANLINE_PAD_DWORD |   /* no other choice */
696	CPU_TRANSFER_PAD_DWORD |
697	NO_PLANEMASK;
698
699    if (Is_W32_any && (pScrn->bitsPerPixel == 8)) {
700	pXAAInfo->SetupForCPUToScreenColorExpandFill =
701	    TsengSetupForCPUToScreenColorExpandFill;
702	pXAAInfo->SubsequentCPUToScreenColorExpandFill =
703	    TsengSubsequentCPUToScreenColorExpandFill;
704
705	/* we'll be using MMU aperture 2 */
706	pXAAInfo->ColorExpandBase = (CARD8 *)pTseng->tsengCPU2ACLBase;
707	/* ErrorF("tsengCPU2ACLBase = 0x%x\n", pTseng->tsengCPU2ACLBase); */
708	/* aperture size is 8kb in banked mode. Larger in linear mode, but 8kb is enough */
709	pXAAInfo->ColorExpandRange = 8192;
710    }
711#endif
712    return TRUE;
713}
714
715/*
716 * ET4/6K acceleration interface.
717 *
718 * Uses Harm Hanemaayer's generic acceleration interface (XAA).
719 *
720 * Author: Koen Gadeyne
721 *
722 * Much of the acceleration code is based on the XF86_W32 server code from
723 * Glenn Lai.
724 *
725 */
726
727/*
728 * This is the implementation of the Sync() function.
729 *
730 * To avoid pipeline/cache/buffer flushing in the PCI subsystem and the VGA
731 * controller, we might replace this read-intensive code with a dummy
732 * accelerator operation that causes a hardware-blocking (wait-states) until
733 * the running operation is done.
734 */
735static void
736TsengSync(ScrnInfoPtr pScrn)
737{
738    TsengPtr pTseng = TsengPTR(pScrn);
739
740    WAIT_ACL;
741}
742
743/*
744 * This is the implementation of the SetupForSolidFill function
745 * that sets up the coprocessor for a subsequent batch for solid
746 * rectangle fills.
747 */
748static void
749TsengSetupForSolidFill(ScrnInfoPtr pScrn,
750    int color, int rop, unsigned int planemask)
751{
752    TsengPtr pTseng = TsengPTR(pScrn);
753
754    /*
755     * all registers are queued in the Tseng chips, except of course for the
756     * stuff we want to store in off-screen memory. So we have to use a
757     * ping-pong method for those if we want to avoid having to wait for the
758     * accelerator when we want to write to these.
759     */
760
761/*    ErrorF("S"); */
762
763    PINGPONG(pTseng);
764
765    wait_acl_queue(pTseng);
766
767    /*
768     * planemask emulation uses a modified "standard" FG ROP (see ET6000
769     * data book p 66 or W32p databook p 37: "Bit masking"). We only enable
770     * the planemask emulation when the planemask is not a no-op, because
771     * blitting speed would suffer.
772     */
773
774    if ((planemask & pTseng->planemask_mask) != pTseng->planemask_mask) {
775	SET_FG_ROP_PLANEMASK(rop);
776	SET_BG_COLOR(pTseng, planemask);
777    } else {
778	SET_FG_ROP(rop);
779    }
780    SET_FG_COLOR(pTseng, color);
781
782    SET_FUNCTION_BLT;
783}
784
785/*
786 * This is the implementation of the SubsequentForSolidFillRect function
787 * that sends commands to the coprocessor to fill a solid rectangle of
788 * the specified location and size, with the parameters from the SetUp
789 * call.
790 *
791 * Splitting it up between ET4000 and ET6000 avoids lots of chipset type
792 * comparisons.
793 */
794static void
795TsengW32pSubsequentSolidFillRect(ScrnInfoPtr pScrn,
796    int x, int y, int w, int h)
797{
798    TsengPtr pTseng = TsengPTR(pScrn);
799    int destaddr = FBADDR(pTseng, x, y);
800
801    wait_acl_queue(pTseng);
802
803    /*
804     * Restoring the ACL_SOURCE_ADDRESS here is needed as long as Bresenham
805     * lines are enabled for >8bpp. Or until XAA allows us to render
806     * horizontal lines using the same Bresenham code instead of re-routing
807     * them to FillRectSolid. For XDECREASING lines, the SubsequentBresenham
808     * code adjusts the ACL_SOURCE_ADDRESS to make sure XDECREASING lines
809     * are drawn with the correct colors. But if a batch of subsequent
810     * operations also holds a few horizontal lines, they will be routed to
811     * here without calling the SetupFor... code again, and the
812     * ACL_SOURCE_ADDRESS will be wrong.
813     */
814    ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengFg);
815
816    SET_XYDIR(0);   /* FIXME: not needed with separate setupforsolidline */
817
818    SET_XY_4(pTseng, w, h);
819    START_ACL(pTseng, destaddr);
820}
821
822static void
823Tseng6KSubsequentSolidFillRect(ScrnInfoPtr pScrn,
824    int x, int y, int w, int h)
825{
826    TsengPtr pTseng = TsengPTR(pScrn);
827    int destaddr = FBADDR(pTseng, x, y);
828
829    wait_acl_queue(pTseng);
830
831    /* see comment in TsengW32pSubsequentFillRectSolid */
832    ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengFg);
833
834    /* if XYDIR is not reset here, drawing a hardware line in between
835     * blitting, with the same ROP, color, etc will not cause a call to
836     * SetupFor... (because linedrawing uses SetupForSolidFill() as its
837     * Setup() function), and thus the direction register will have been
838     * changed by the last LineDraw operation.
839     */
840    SET_XYDIR(0);
841
842    SET_XY_6(pTseng, w, h);
843    START_ACL_6(destaddr);
844}
845
846/*
847 * This is the implementation of the SetupForScreenToScreenCopy function
848 * that sets up the coprocessor for a subsequent batch of
849 * screen-to-screen copies.
850 */
851
852static __inline__ void
853Tseng_setup_screencopy(TsengPtr pTseng,
854    int rop, unsigned int planemask,
855    int trans_color, int blit_dir)
856{
857    wait_acl_queue(pTseng);
858
859#ifdef ET6K_TRANSPARENCY
860    if ((pTseng->ChipType == ET6000) && (trans_color != -1)) {
861	SET_BG_COLOR(trans_color);
862	SET_FUNCTION_BLT_TR;
863    } else
864	SET_FUNCTION_BLT;
865
866    SET_FG_ROP(rop);
867#else
868    if ((planemask & pTseng->planemask_mask) != pTseng->planemask_mask) {
869	SET_FG_ROP_PLANEMASK(rop);
870	SET_BG_COLOR(pTseng, planemask);
871    } else {
872	SET_FG_ROP(rop);
873    }
874    SET_FUNCTION_BLT;
875#endif
876    SET_XYDIR(blit_dir);
877}
878
879static void
880TsengSetupForScreenToScreenCopy(ScrnInfoPtr pScrn,
881    int xdir, int ydir, int rop,
882    unsigned int planemask, int trans_color)
883{
884    /*
885     * xdir can be either 1 (left-to-right) or -1 (right-to-left).
886     * ydir can be either 1 (top-to-bottom) or -1 (bottom-to-top).
887     */
888
889    TsengPtr pTseng = TsengPTR(pScrn);
890    int blit_dir = 0;
891
892/*    ErrorF("C%d ", trans_color); */
893
894    pTseng->acl_blitxdir = xdir;
895    pTseng->acl_blitydir = ydir;
896
897    if (xdir == -1)
898	blit_dir |= 0x1;
899    if (ydir == -1)
900	blit_dir |= 0x2;
901
902    Tseng_setup_screencopy(pTseng, rop, planemask, trans_color, blit_dir);
903
904    ACL_SOURCE_WRAP(0x77);	       /* no wrap */
905    ACL_SOURCE_Y_OFFSET(pTseng->line_width - 1);
906}
907
908/*
909 * This is the implementation of the SubsequentForScreenToScreenCopy
910 * that sends commands to the coprocessor to perform a screen-to-screen
911 * copy of the specified areas, with the parameters from the SetUp call.
912 * In this sample implementation, the direction must be taken into
913 * account when calculating the addresses (with coordinates, it might be
914 * a little easier).
915 *
916 * Splitting up the SubsequentScreenToScreenCopy between ET4000 and ET6000
917 * doesn't seem to improve speed for small blits (as it did with
918 * SolidFillRect).
919 */
920static void
921TsengSubsequentScreenToScreenCopy(ScrnInfoPtr pScrn,
922    int x1, int y1, int x2, int y2,
923    int w, int h)
924{
925    TsengPtr pTseng = TsengPTR(pScrn);
926    int srcaddr, destaddr;
927
928    /*
929     * Optimizing note: the pre-calc code below (i.e. until the first
930     * register write) doesn't significantly affect performance. Removing it
931     * all boosts small blits from 24.22 to 25.47 MB/sec. Don't waste time
932     * on that. One less PCI bus write would boost us to 30.00 MB/sec, up
933     * from 24.22. Waste time on _that_...
934     */
935
936    /* tseng chips want x-sizes in bytes, not pixels */
937    x1 = MULBPP(pTseng, x1);
938    x2 = MULBPP(pTseng, x2);
939
940    /*
941     * If the direction is "decreasing", the chip wants the addresses
942     * to be at the other end, so we must be aware of that in our
943     * calculations.
944     */
945    if (pTseng->acl_blitydir == -1) {
946	srcaddr = (y1 + h - 1) * pTseng->line_width;
947	destaddr = (y2 + h - 1) * pTseng->line_width;
948    } else {
949	srcaddr = y1 * pTseng->line_width;
950	destaddr = y2 * pTseng->line_width;
951    }
952    if (pTseng->acl_blitxdir == -1) {
953	/* Accelerator start address must point to first byte to be processed.
954	 * Depending on the direction, this is the first or the last byte
955	 * in the multi-byte pixel.
956	 */
957	int eol = MULBPP(pTseng, w);
958
959	srcaddr += x1 + eol - 1;
960	destaddr += x2 + eol - 1;
961    } else {
962	srcaddr += x1;
963	destaddr += x2;
964    }
965
966    wait_acl_queue(pTseng);
967
968    SET_XY(pTseng, w, h);
969    ACL_SOURCE_ADDRESS(srcaddr);
970    START_ACL(pTseng, destaddr);
971}
972
973#if 0
974static int pat_src_addr;
975
976static void
977TsengSetupForColor8x8PatternFill(ScrnInfoPtr pScrn,
978    int patx, int paty, int rop, unsigned int planemask, int trans_color)
979{
980    TsengPtr pTseng = TsengPTR(pScrn);
981
982    pat_src_addr = FBADDR(pTseng, patx, paty);
983
984    ErrorF("P");
985
986    Tseng_setup_screencopy(pTseng, rop, planemask, trans_color, 0);
987
988    switch (pTseng->Bytesperpixel) {
989    case 1:
990	ACL_SOURCE_WRAP(0x33);       /* 8x8 wrap */
991	ACL_SOURCE_Y_OFFSET(8 - 1);
992	break;
993    case 2:
994	ACL_SOURCE_WRAP(0x34);       /* 16x8 wrap */
995	ACL_SOURCE_Y_OFFSET(16 - 1);
996	break;
997    case 3:
998	ACL_SOURCE_WRAP(0x3D);       /* 24x8 wrap --- only for ET6000 !!! */
999	ACL_SOURCE_Y_OFFSET(32 - 1); /* this is no error -- see databook */
1000	break;
1001    case 4:
1002	ACL_SOURCE_WRAP(0x35);       /* 32x8 wrap */
1003	ACL_SOURCE_Y_OFFSET(32 - 1);
1004    }
1005}
1006
1007static void
1008TsengSubsequentColor8x8PatternFillRect(ScrnInfoPtr pScrn,
1009    int patx, int paty, int x, int y, int w, int h)
1010{
1011    TsengPtr pTseng = TsengPTR(pScrn);
1012    int destaddr = FBADDR(pTseng, x, y);
1013    int srcaddr = pat_src_addr + MULBPP(pTseng, paty * 8 + patx);
1014
1015    wait_acl_queue(pTseng);
1016
1017    ACL_SOURCE_ADDRESS(srcaddr);
1018
1019    SET_XY(pTseng, w, h);
1020    START_ACL(pTseng, destaddr);
1021}
1022#endif
1023
1024#if 0
1025/*
1026 * ImageWrite is nothing more than a per-scanline screencopy.
1027 */
1028
1029static void
1030TsengSetupForScanlineImageWrite(ScrnInfoPtr pScrn,
1031    int rop, unsigned int planemask, int trans_color, int bpp, int depth)
1032{
1033    TsengPtr pTseng = TsengPTR(pScrn);
1034
1035/*    ErrorF("IW"); */
1036
1037    Tseng_setup_screencopy(pTseng, rop, planemask, trans_color, 0);
1038
1039    ACL_SOURCE_WRAP(0x77);	       /* no wrap */
1040    ACL_SOURCE_Y_OFFSET(pTseng->line_width - 1);
1041}
1042
1043static void
1044TsengSubsequentScanlineImageWriteRect(ScrnInfoPtr pScrn,
1045    int x, int y, int w, int h, int skipleft)
1046{
1047    TsengPtr pTseng = TsengPTR(pScrn);
1048
1049/*    ErrorF("r%d",h); */
1050
1051    pTseng->acl_iw_dest = y * pTseng->line_width + MULBPP(pTseng, x);
1052    pTseng->acl_skipleft = MULBPP(pTseng, skipleft);
1053
1054    wait_acl_queue(pTseng);
1055    SET_XY(pTseng, w, 1);
1056}
1057
1058static void
1059TsengSubsequentImageWriteScanline(ScrnInfoPtr pScrn,
1060    int bufno)
1061{
1062    TsengPtr pTseng = TsengPTR(pScrn);
1063
1064/*    ErrorF("%d", bufno); */
1065
1066    wait_acl_queue(pTseng);
1067
1068    ACL_SOURCE_ADDRESS(pTseng->AccelImageWriteBufferOffsets[bufno]
1069		       + pTseng->acl_skipleft);
1070    START_ACL(pTseng, pTseng->acl_iw_dest);
1071    pTseng->acl_iw_dest += pTseng->line_width;
1072}
1073#endif
1074
1075#if 0
1076/*
1077 * W32p/ET6000 hardware linedraw code.
1078 *
1079 * TsengSetupForSolidFill() is used as a setup function.
1080 *
1081 * Three major problems that needed to be solved here:
1082 *
1083 * 1. The "bias" value must be translated into the "line draw algorithm"
1084 *    parameter in the Tseng accelerators. This parameter, although not
1085 *    documented as such, needs to be set to the _inverse_ of the
1086 *    appropriate bias bit (i.e. for the appropriate octant).
1087 *
1088 * 2. In >8bpp modes, the accelerator will render BYTES in the same order as
1089 *    it is drawing the line. This means it will render the colors in the
1090 *    same order as well, reversing the byte-order in pixels that are drawn
1091 *    right-to-left. This causes wrong colors to be rendered.
1092 *
 * 3. The Tseng data book says that the ACL Y count register needs to be
 *    programmed with "dy-1". A similar thing is said about ACL X count. But
 *    this assumes (x2,y2) is NOT drawn (although that is not mentioned in
 *    the data book). X assumes the endpoint _is_ drawn. If "dy-1" is used,
 *    this sometimes results in a negative value (if dx==dy==0),
 *    causing a complete accelerator hang.
1099 */
1100
1101static void
1102TsengSubsequentSolidBresenhamLine(ScrnInfoPtr pScrn,
1103    int x, int y, int major, int minor, int err, int len, int octant)
1104{
1105    TsengPtr pTseng = TsengPTR(pScrn);
1106    int destaddr = FBADDR(pTseng, x, y);
1107    int xydir = pTseng->BresenhamTable[octant];
1108
1109    /* Tseng wants the real dx/dy in major/minor. Bresenham uses 2*dx and 2*dy */
1110    minor >>= 1;
1111    major >>= 1;
1112
1113    wait_acl_queue(pTseng);
1114
1115    if (!(octant & YMAJOR)) {
1116	SET_X_YRAW(pTseng, len, 0xFFF);
1117    } else {
1118	SET_XY_RAW(pTseng,0xFFF, len - 1);
1119    }
1120
1121    SET_DELTA(minor, major);
1122    ACL_ERROR_TERM(-err);  /* error term from XAA is NEGATIVE */
1123
1124    /* make sure colors are rendered correctly if >8bpp */
1125    if (octant & XDECREASING) {
1126	destaddr += pTseng->Bytesperpixel - 1;
1127	ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset
1128			   + pTseng->tsengFg + pTseng->neg_x_pixel_offset);
1129    } else
1130	ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengFg);
1131
1132    SET_XYDIR(xydir);
1133
1134    START_ACL(pTseng, destaddr);
1135}
1136#endif
1137
1138#ifdef TODO
1139/*
1140 * Trapezoid filling code.
1141 *
1142 * TsengSetupForSolidFill() is used as a setup function
1143 */
1144
1145#undef DEBUG_TRAP
1146
1147#ifdef TSENG_TRAPEZOIDS
1148static void
1149TsengSubsequentFillTrapezoidSolid(ytop, height, left, dxL, dyL, eL, right, dxR, dyR, eR)
1150    int ytop;
1151    int height;
1152    int left;
1153    int dxL, dyL;
1154    int eL;
1155    int right;
1156    int dxR, dyR;
1157    int eR;
1158{
1159    unsigned int tseng_bias_compensate = 0xd8;
1160    int destaddr, algrthm;
1161    int xcount = right - left + 1;     /* both edges included */
1162    int dir_reg = 0x60;		       /* trapezoid drawing; use error term for primary edge */
1163    int sec_dir_reg = 0x20;	       /* use error term for secondary edge */
1164    int octant = 0;
1165
1166    /*    ErrorF("#"); */
1167
1168    int destaddr, algrthm;
1169    int xcount = right - left + 1;
1170
1171#ifdef USE_ERROR_TERM
1172    int dir_reg = 0x60;
1173    int sec_dir_reg = 0x20;
1174
1175#else
1176    int dir_reg = 0x40;
1177    int sec_dir_reg = 0x00;
1178
1179#endif
1180    int octant = 0;
1181    int bias = 0x00;		       /* FIXME !!! */
1182
1183/*    ErrorF("#"); */
1184
1185#ifdef DEBUG_TRAP
1186    ErrorF("ytop=%d, height=%d, left=%d, dxL=%d, dyL=%d, eL=%d, right=%d, dxR=%d, dyR=%d, eR=%d ",
1187	ytop, height, left, dxL, dyL, eL, right, dxR, dyR, eR);
1188#endif
1189
1190    if ((dyL < 0) || (dyR < 0))
1191	ErrorF("Tseng Trapezoids: Wrong assumption: dyL/R < 0\n");
1192
1193    destaddr = FBADDR(pTseng, left, ytop);
1194
1195    /* left edge */
1196    if (dxL < 0) {
1197	dir_reg |= 1;
1198	octant |= XDECREASING;
1199	dxL = -dxL;
1200    }
1201    /* Y direction is always positive (top-to-bottom drawing) */
1202
1203    wait_acl_queue(pTseng);
1204
1205    /* left edge */
1206    /* compute axial direction and load registers */
1207    if (dxL >= dyL) {		       /* X is major axis */
1208	dir_reg |= 4;
1209	SET_DELTA(dyL, dxL);
1210	if (dir_reg & 1) {	       /* edge coherency: draw left edge */
1211	    destaddr += pTseng->Bytesperpixel;
1212	    sec_dir_reg |= 0x80;
1213	    xcount--;
1214	}
1215    } else {			       /* Y is major axis */
1216	SetYMajorOctant(octant);
1217	SET_DELTA(dxL, dyL);
1218    }
1219    ACL_ERROR_TERM(eL);
1220
1221    /* select "linedraw algorithm" (=bias) and load direction register */
1222    /* ErrorF(" o=%d ", octant); */
1223    algrthm = ((tseng_bias_compensate >> octant) & 1) ^ 1;
1224    dir_reg |= algrthm << 4;
1225    SET_XYDIR(dir_reg);
1226
1227    /* right edge */
1228    if (dxR < 0) {
1229	sec_dir_reg |= 1;
1230	dxR = -dxR;
1231    }
1232    /* compute axial direction and load registers */
1233    if (dxR >= dyR) {		       /* X is major axis */
1234	sec_dir_reg |= 4;
1235	SET_SECONDARY_DELTA(dyR, dxR);
1236	if (dir_reg & 1) {	       /* edge coherency: do not draw right edge */
1237	    sec_dir_reg |= 0x40;
1238	    xcount++;
1239	}
1240    } else {			       /* Y is major axis */
1241	SET_SECONDARY_DELTA(dxR, dyR);
1242    }
1243    ACL_SECONDARY_ERROR_TERM(eR);
1244
1245    /* ErrorF("%02x", sec_dir_reg); */
1246    SET_SECONDARY_XYDIR(sec_dir_reg);
1247
1248    SET_XY_6(pTseng, xcount, height);
1249
1250#ifdef DEBUG_TRAP
1251    ErrorF("-> %d,%d\n", xcount, height);
1252#endif
1253
1254    START_ACL_6(destaddr);
1255}
1256#endif
1257
1258#endif
1259
1260
1261/*
1262 * The following function sets up the supported acceleration. Call it from
1263 * the FbInit() function in the SVGA driver. Do NOT initialize any hardware
1264 * in here. That belongs in tseng_init_acl().
1265 */
1266Bool
1267TsengXAAInit(ScreenPtr pScreen)
1268{
1269    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
1270    TsengPtr pTseng = TsengPTR(pScrn);
1271    XAAInfoRecPtr pXAAinfo;
1272    BoxRec AvailFBArea;
1273
1274    PDEBUG("	TsengXAAInit\n");
1275    pTseng->AccelInfoRec = pXAAinfo = XAACreateInfoRec();
1276    if (!pXAAinfo)
1277	return FALSE;
1278
1279    /*
1280     * Set up the main acceleration flags.
1281     */
1282    pXAAinfo->Flags = PIXMAP_CACHE;
1283
1284    /*
1285     * The following line installs a "Sync" function, that waits for
1286     * all coprocessor operations to complete.
1287     */
1288    pXAAinfo->Sync = TsengSync;
1289
1290    /* W32 and W32i must wait for ACL before changing registers */
1291    if (pTseng->ChipType == ET4000)
1292        pTseng->need_wait_acl = TRUE;
1293    else
1294        pTseng->need_wait_acl = FALSE;
1295
1296    pTseng->line_width = pScrn->displayWidth * pTseng->Bytesperpixel;
1297
1298#if 1
1299    /*
1300     * SolidFillRect.
1301     *
1302     * The W32 and W32i chips don't have a register to set the amount of
1303     * bytes per pixel, and hence they don't skip 1 byte in each 4-byte word
1304     * at 24bpp. Therefor, the FG or BG colors would have to be concatenated
1305     * in video memory (R-G-B-R-G-B-... instead of R-G-B-X-R-G-B-X-..., with
1306     * X = dont' care), plus a wrap value that is a multiple of 3 would have
1307     * to be set. There is no such wrap combination available.
1308     */
1309#ifdef OBSOLETE
1310    pXAAinfo->SolidFillFlags |= NO_PLANEMASK;
1311#endif
1312
1313    pXAAinfo->SetupForSolidFill = TsengSetupForSolidFill;
1314    if (pTseng->ChipType == ET6000)
1315        pXAAinfo->SubsequentSolidFillRect = Tseng6KSubsequentSolidFillRect;
1316    else
1317        pXAAinfo->SubsequentSolidFillRect = TsengW32pSubsequentSolidFillRect;
1318
1319#ifdef TSENG_TRAPEZOIDS
1320    if (pTseng->ChipType == ET6000)
1321	/* disabled for now: not fully compliant yet */
1322	pXAAinfo->SubsequentFillTrapezoidSolid = TsengSubsequentFillTrapezoidSolid;
1323#endif
1324#endif
1325
1326#if 1
1327    /*
1328     * SceenToScreenCopy (BitBLT).
1329     *
1330     * Restrictions: On ET6000, we support EITHER a planemask OR
1331     * TRANSPARENCY, but not both (they use the same Pattern map).
1332     * All other chips can't do TRANSPARENCY at all.
1333     */
1334#ifdef ET6K_TRANSPARENCY
1335    pXAAinfo->CopyAreaFlags = NO_PLANEMASK;
1336    if (pTseng->ChipType == ET4000)
1337	pXAAinfo->CopyAreaFlags |= NO_TRANSPARENCY;
1338
1339#else
1340    pXAAinfo->CopyAreaFlags = NO_TRANSPARENCY;
1341#endif
1342
1343    pXAAinfo->SetupForScreenToScreenCopy =
1344	TsengSetupForScreenToScreenCopy;
1345    pXAAinfo->SubsequentScreenToScreenCopy =
1346	TsengSubsequentScreenToScreenCopy;
1347#endif
1348
1349#if 0
1350    /*
1351     * ImageWrite.
1352     *
1353     * SInce this uses off-screen scanline buffers, it is only of use when
1354     * complex ROPs are used. But since the current XAA pixmap cache code
1355     * only works when an ImageWrite is provided, the NO_GXCOPY flag is
1356     * temporarily disabled.
1357     */
1358
1359    if (pTseng->AccelImageWriteBufferOffsets[0]) {
1360	pXAAinfo->ScanlineImageWriteFlags =
1361	    pXAAinfo->CopyAreaFlags | LEFT_EDGE_CLIPPING /* | NO_GXCOPY */ ;
1362	pXAAinfo->NumScanlineImageWriteBuffers = 2;
1363	pXAAinfo->SetupForScanlineImageWrite =
1364	    TsengSetupForScanlineImageWrite;
1365	pXAAinfo->SubsequentScanlineImageWriteRect =
1366	    TsengSubsequentScanlineImageWriteRect;
1367	pXAAinfo->SubsequentImageWriteScanline =
1368	    TsengSubsequentImageWriteScanline;
1369
1370	/* calculate memory addresses from video memory offsets */
1371	for (i = 0; i < pXAAinfo->NumScanlineImageWriteBuffers; i++) {
1372	    pTseng->XAAScanlineImageWriteBuffers[i] =
1373		pTseng->FbBase + pTseng->AccelImageWriteBufferOffsets[i];
1374	}
1375
1376	pXAAinfo->ScanlineImageWriteBuffers = pTseng->XAAScanlineImageWriteBuffers;
1377    }
1378#endif
1379    /*
1380     * 8x8 pattern tiling not possible on W32/i/p chips in 24bpp mode.
1381     * Currently, 24bpp pattern tiling doesn't work at all on those.
1382     *
1383     * FIXME: On W32 cards, pattern tiling doesn't work as expected.
1384     */
1385    pXAAinfo->Color8x8PatternFillFlags = HARDWARE_PATTERN_PROGRAMMED_ORIGIN;
1386
1387    pXAAinfo->CachePixelGranularity = 8 * 8;
1388
1389#ifdef ET6K_TRANSPARENCY
1390    pXAAinfo->PatternFlags |= HARDWARE_PATTERN_NO_PLANEMASK;
1391    if (pTseng->ChipType == ET6000)
1392	pXAAinfo->PatternFlags |= HARDWARE_PATTERN_TRANSPARENCY;
1393#endif
1394
1395#if 0
1396    /* FIXME! This needs to be fixed for W32 and W32i (it "should work") */
1397    if (pScrn->bitsPerPixel != 24) {
1398	pXAAinfo->SetupForColor8x8PatternFill =
1399	    TsengSetupForColor8x8PatternFill;
1400	pXAAinfo->SubsequentColor8x8PatternFillRect =
1401	    TsengSubsequentColor8x8PatternFillRect;
1402    }
1403#endif
1404
1405#if 0 /*1*/
1406    /*
1407     * SolidLine.
1408     *
1409     * We use Bresenham by preference, because it supports hardware clipping
1410     * (using the error term). TwoPointLines() is implemented, but not used,
1411     * because clipped lines are not accelerated (hardware clipping support
1412     * is lacking)...
1413     */
1414
1415    /*
1416     * Fill in the hardware linedraw ACL_XY_DIRECTION table
1417     *
1418     * W32BresTable[] converts XAA interface Bresenham octants to direct
1419     * ACL direction register contents. This includes the correct bias
1420     * setting etc.
1421     *
1422     * According to miline.h (but with base 0 instead of base 1 as in
1423     * miline.h), the octants are numbered as follows:
1424     *
1425     *   \    |    /
1426     *    \ 2 | 1 /
1427     *     \  |  /
1428     *    3 \ | / 0
1429     *       \|/
1430     *   -----------
1431     *       /|                                 \
1432     *    4 / | \ 7
1433     *     /  |       \
1434     *    / 5 | 6      \
1435     *   /    |        \
1436     *
1437     * In ACL_XY_DIRECTION, bits 2:0 are defined as follows:
1438     *	0: '1' if XDECREASING
1439     *	1: '1' if YDECREASING
1440     *	2: '1' if XMAJOR (== not YMAJOR)
1441     *
1442     * Bit 4 defines the bias.  It should be set to '1' for all octants
1443     * NOT passed to miSetZeroLineBias(). i.e. the inverse of the X bias.
1444     *
1445     * (For MS compatible bias, the data book says to set to the same as
1446     * YDIR, i.e. bit 1 of the same register, = '1' if YDECREASING. MS
1447     * bias is towards octants 0..3 (i.e. Y decreasing), hence this
1448     * definition of bit 4)
1449     *
1450     */
1451    pTseng->BresenhamTable = xnfalloc(8);
1452    if (pTseng->BresenhamTable == NULL) {
1453        xf86Msg(X_ERROR, "Could not malloc Bresenham Table.\n");
1454        return FALSE;
1455    }
1456    for (i=0; i<8; i++) {
1457        unsigned char zerolinebias = miGetZeroLineBias(pScreen);
1458        pTseng->BresenhamTable[i] = 0xA0; /* command=linedraw, use error term */
1459        if (i & XDECREASING) pTseng->BresenhamTable[i] |= 0x01;
1460        if (i & YDECREASING) pTseng->BresenhamTable[i] |= 0x02;
1461        if (!(i & YMAJOR))   pTseng->BresenhamTable[i] |= 0x04;
1462        if ((1 << i) & zerolinebias) pTseng->BresenhamTable[i] |= 0x10;
1463        /* ErrorF("BresenhamTable[%d]=0x%x\n", i, pTseng->BresenhamTable[i]); */
1464    }
1465
1466    pXAAinfo->SolidLineFlags = 0;
1467    pXAAinfo->SetupForSolidLine = TsengSetupForSolidFill;
1468    pXAAinfo->SubsequentSolidBresenhamLine =
1469        TsengSubsequentSolidBresenhamLine;
1470    /*
1471     * ErrorTermBits is used to limit minor, major and error term, so it
1472     * must be min(errorterm_size, delta_major_size, delta_minor_size)
1473     * But the calculation for major and minor is done on the DOUBLED
1474     * values (as per the Bresenham algorithm), so they can also have 13
1475     * bits (inside XAA). They are divided by 2 in this driver, so they
1476     * are then again limited to 12 bits.
1477     */
1478    pXAAinfo->SolidBresenhamLineErrorTermBits = 13;
1479
1480#endif
1481
1482#if 1
1483    /* set up color expansion acceleration */
1484    if (!TsengXAAInit_Colexp(pScrn))
1485	return FALSE;
1486#endif
1487
1488
1489    /*
1490     * For Tseng, we set up some often-used values
1491     */
1492
1493    switch (pTseng->Bytesperpixel) {   /* for MULBPP optimization */
1494    case 1:
1495	pTseng->powerPerPixel = 0;
1496	pTseng->planemask_mask = 0x000000FF;
1497	pTseng->neg_x_pixel_offset = 0;
1498	break;
1499    case 2:
1500	pTseng->powerPerPixel = 1;
1501	pTseng->planemask_mask = 0x0000FFFF;
1502	pTseng->neg_x_pixel_offset = 1;
1503	break;
1504    case 3:
1505	pTseng->powerPerPixel = 1;
1506	pTseng->planemask_mask = 0x00FFFFFF;
1507	pTseng->neg_x_pixel_offset = 2;		/* is this correct ??? */
1508	break;
1509    case 4:
1510	pTseng->powerPerPixel = 2;
1511	pTseng->planemask_mask = 0xFFFFFFFF;
1512	pTseng->neg_x_pixel_offset = 3;
1513	break;
1514    }
1515
1516    /*
1517     * Init ping-pong registers.
1518     * This might be obsoleted by the BACKGROUND_OPERATIONS flag.
1519     */
1520    pTseng->tsengFg = 0;
1521    pTseng->tsengBg = 16;
1522    pTseng->tsengPat = 32;
1523
1524    /* for register write optimisation */
1525    pTseng->tseng_old_dir = -1;
1526    pTseng->old_x = 0;
1527    pTseng->old_y = 0;
1528
1529    /*
1530     * Finally, we set up the video memory space available to the pixmap
1531     * cache. In this case, all memory from the end of the virtual screen to
1532     * the end of video memory minus 1K (which we already reserved), can be
1533     * used.
1534     */
1535
1536    AvailFBArea.x1 = 0;
1537    AvailFBArea.y1 = 0;
1538    AvailFBArea.x2 = pScrn->displayWidth;
1539    AvailFBArea.y2 = (pScrn->videoRam * 1024) /
1540	(pScrn->displayWidth * pTseng->Bytesperpixel);
1541
1542    xf86InitFBManager(pScreen, &AvailFBArea);
1543
1544    return (XAAInit(pScreen, pXAAinfo));
1545
1546}
1547