/* tseng_accel.h revision 3cb82e98 */
#ifndef _TSENG_ACCEL_H
#define _TSENG_ACCEL_H
3/*
4 * Shortcuts to Tseng memory-mapped accelerator-control registers
5 */
6
/* 8-bit MMU/accelerator control writes (offsets are from MMioBase) */
#define MMU_CONTROL(x)  MMIO_OUT8(pTseng->MMioBase, 0x13<<0, x)
#define ACL_SUSPEND_TERMINATE(x)  MMIO_OUT8(pTseng->MMioBase, 0x30<<0, x)
#define ACL_OPERATION_STATE(x)  MMIO_OUT8(pTseng->MMioBase, 0x31<<0, x)

#define ACL_SYNC_ENABLE(x)  MMIO_OUT8(pTseng->MMioBase, 0x32<<0, x)
    /* for ET6000, ACL_SYNC_ENABLE becomes ACL_6K_CONFIG */

#define ACL_INTERRUPT_STATUS(x) \
                        MMIO_OUT8(pTseng->MMioBase, 0x35<<0, x)
#define ACL_INTERRUPT_MASK(x) MMIO_OUT8(pTseng->MMioBase, 0x34<<0, x)
/* Plain register offset (not a write macro): polled via tseng_wait() and
 * written through ACL_ACCELERATOR_STATUS_SET() below. */
#define ACL_ACCELERATOR_STATUS (0x36 << 0)
#define ACL_ACCELERATOR_STATUS_SET(x) \
                        MMIO_OUT8(pTseng->MMioBase, ACL_ACCELERATOR_STATUS, x)
/* Plain register offset, polled by WAIT_INTERFACE (W32p rev b..d only) */
#define ACL_WRITE_INTERFACE_VALID (0x33 << 0)

    /* and this is only for the ET6000 */
#define ACL_POWER_CONTROL(x) MMIO_OUT8(pTseng->MMioBase, 0x37<<0, x)

    /* non-queued for w32p's and ET6000 */
#define ACL_NQ_X_POSITION(x)  MMIO_OUT16(pTseng->MMioBase, 0x38<<0, x)
#define ACL_NQ_Y_POSITION(x)  MMIO_OUT16(pTseng->MMioBase, 0x3A<<0, x)
    /* queued for w32 and w32i */
#define ACL_X_POSITION(x)  MMIO_OUT16(pTseng->MMioBase, 0x94<<0, x)
#define ACL_Y_POSITION(x)  MMIO_OUT16(pTseng->MMioBase, 0x96<<0, x)

#define ACL_PATTERN_ADDRESS(x)  MMIO_OUT32(pTseng->MMioBase, 0x80<<0, x)
#define ACL_SOURCE_ADDRESS(x)  MMIO_OUT32(pTseng->MMioBase, 0x84<<0, x)

/* Y-offset (pitch) registers.  The "32" variant spans 0x88..0x8B, i.e. it
 * writes both the pattern (0x88) and source (0x8A) Y offsets in one access. */
#define ACL_PATTERN_Y_OFFSET(x)  MMIO_OUT16(pTseng->MMioBase, 0x88<<0, x)
#define ACL_PATTERN_Y_OFFSET32(x)  MMIO_OUT32(pTseng->MMioBase, 0x88<<0, x)
#define ACL_SOURCE_Y_OFFSET(x)  MMIO_OUT16(pTseng->MMioBase, 0x8A<<0, x)
#define ACL_DESTINATION_Y_OFFSET(x)  MMIO_OUT16(pTseng->MMioBase, 0x8C<<0, x)

    /* W32i */
#define ACL_VIRTUAL_BUS_SIZE(x) MMIO_OUT8(pTseng->MMioBase, 0x8E<<0, x)
    /* w32p -- same offset as ACL_VIRTUAL_BUS_SIZE, chip-dependent meaning */
#define ACL_PIXEL_DEPTH(x)  MMIO_OUT8(pTseng->MMioBase, 0x8E<<0, x)
44
    /* w32 and w32i */
#define ACL_XY_DIRECTION(x)  MMIO_OUT8(pTseng->MMioBase, 0x8F<<0, x)

#define ACL_PATTERN_WRAP(x)   MMIO_OUT8(pTseng->MMioBase, 0x90<<0, x)
/* 32-bit access at 0x90 spans pattern wrap (0x90) and source wrap (0x92) */
#define ACL_PATTERN_WRAP32(x)   MMIO_OUT32(pTseng->MMioBase, 0x90<<0, x)
#define ACL_TRANSFER_DISABLE(x)  MMIO_OUT8(pTseng->MMioBase, 0x91<<0, x) /* ET6000 only */
#define ACL_SOURCE_WRAP(x) MMIO_OUT8(pTseng->MMioBase, 0x92<<0, x)

#define ACL_X_COUNT(x) MMIO_OUT16(pTseng->MMioBase, 0x98<<0, x)
#define ACL_Y_COUNT(x) MMIO_OUT16(pTseng->MMioBase, 0x9A<<0, x)
/* shortcut. not a real register: one 32-bit access covering X count (0x98,
 * low word) and Y count (0x9A, high word) -- see the SET_XY helpers below */
#define ACL_XY_COUNT(x) MMIO_OUT32(pTseng->MMioBase, 0x98<<0, x)

#define ACL_ROUTING_CONTROL(x) MMIO_OUT8(pTseng->MMioBase, 0x9C<<0, x)
    /* for ET6000, ACL_ROUTING_CONTROL becomes ACL_MIX_CONTROL */
#define ACL_RELOAD_CONTROL(x) MMIO_OUT8(pTseng->MMioBase, 0x9D<<0, x)
    /* for ET6000, ACL_RELOAD_CONTROL becomes ACL_STEPPING_INHIBIT */

#define ACL_BACKGROUND_RASTER_OPERATION(x)  MMIO_OUT8(pTseng->MMioBase, 0x9E<<0, x)
#define ACL_FOREGROUND_RASTER_OPERATION(x)  MMIO_OUT8(pTseng->MMioBase, 0x9F<<0, x)

#define ACL_DESTINATION_ADDRESS(x) MMIO_OUT32(pTseng->MMioBase, 0xA0<<0, x)

    /* the following is for the w32p's only */
#define ACL_MIX_ADDRESS(x) MMIO_OUT32(pTseng->MMioBase, 0xA4<<0, x)

#define ACL_MIX_Y_OFFSET(x) MMIO_OUT16(pTseng->MMioBase, 0xA8<<0, x)
#define ACL_ERROR_TERM(x) MMIO_OUT16(pTseng->MMioBase, 0xAA<<0, x)
#define ACL_DELTA_MINOR(x) MMIO_OUT16(pTseng->MMioBase, 0xAC<<0, x)
/* 32-bit access at 0xAC spans delta minor (0xAC) and delta major (0xAE) */
#define ACL_DELTA_MINOR32(x) MMIO_OUT32(pTseng->MMioBase, 0xAC<<0, x)
#define ACL_DELTA_MAJOR(x) MMIO_OUT16(pTseng->MMioBase, 0xAE<<0, x)

    /* ET6000 only (trapezoids) */
#define ACL_SECONDARY_EDGE(x) MMIO_OUT8(pTseng->MMioBase, 0x93<<0, x)
#define ACL_SECONDARY_ERROR_TERM(x) MMIO_OUT16(pTseng->MMioBase, 0xB2<<0, x)
#define ACL_SECONDARY_DELTA_MINOR(x) MMIO_OUT16(pTseng->MMioBase, 0xB4<<0, x)
/* 32-bit access at 0xB4 spans secondary delta minor (0xB4) and major (0xB6) */
#define ACL_SECONDARY_DELTA_MINOR32(x) MMIO_OUT32(pTseng->MMioBase, 0xB4<<0, x)
#define ACL_SECONDARY_DELTA_MAJOR(x) MMIO_OUT16(pTseng->MMioBase, 0xB6<<0, x)

/* for ET6000: */
#define ACL_6K_CONFIG ACL_SYNC_ENABLE

/* for ET6000: */
#define ACL_MIX_CONTROL ACL_ROUTING_CONTROL
#define ACL_STEPPING_INHIBIT ACL_RELOAD_CONTROL
90
91/*
92 * Some shortcuts.
93 */
94
/* Poll limit used by tseng_wait() before declaring a timeout. */
#define MAX_WAIT_CNT 500000	       /* how long we wait before we time out */
#undef WAIT_VERBOSE		       /* if defined: print out how long we waited */

/* Timeout recovery hook, defined elsewhere in the driver. */
void tseng_recover_timeout(TsengPtr pTseng);
99
100static __inline__ void
101tseng_wait(TsengPtr pTseng, int reg, char *name, unsigned char mask)
102{
103    int cnt = MAX_WAIT_CNT;
104
105    while ((MMIO_IN32(pTseng->MMioBase,reg)) & mask)
106	if (--cnt < 0) {
107	    ErrorF("WAIT_%s: timeout.\n", name);
108	    tseng_recover_timeout(pTseng);
109	    break;
110	}
111#ifdef WAIT_VERBOSE
112    ErrorF("%s%d ", name, MAX_WAIT_CNT - cnt);
113#endif
114}
115
/* Wait until accelerator-status bit 0 ("QUEUE") clears. */
#define WAIT_QUEUE tseng_wait(pTseng, ACL_ACCELERATOR_STATUS, "QUEUE", 0x1)

/* This is only for W32p rev b...d */
#define WAIT_INTERFACE tseng_wait(pTseng, ACL_WRITE_INTERFACE_VALID, "INTERFACE", 0xf)

/* Wait until accelerator-status bit 1 ("ACL") clears. */
#define WAIT_ACL tseng_wait(pTseng, ACL_ACCELERATOR_STATUS, "ACL", 0x2)

/* Wait until accelerator-status bit 2 ("XY") clears. */
#define WAIT_XY tseng_wait(pTseng, ACL_ACCELERATOR_STATUS, "XY", 0x4)
124
/*
 * Select the chip's plain-blit mix/routing function.
 * NOTE: expands to a bare if/else (no do { } while (0) wrapper), so it must
 * be used only as a complete statement, never as the body of another
 * if/else without braces.
 */
#define SET_FUNCTION_BLT \
    if (pTseng->ChipType == ET6000) \
        ACL_MIX_CONTROL(0x33); \
    else \
        ACL_ROUTING_CONTROL(0x00);

/* Transparent-blit variant (mix-control value 0x13). */
#define SET_FUNCTION_BLT_TR \
        ACL_MIX_CONTROL(0x13);
133
/* Byte offset of pixel (x,y) in the framebuffer (line_width is in bytes). */
#define FBADDR(pTseng, x,y) ( (y) * pTseng->line_width + MULBPP(pTseng, x) )

/* Program the foreground ROP via the W32 ROP translation table. */
#define SET_FG_ROP(rop) \
    ACL_FOREGROUND_RASTER_OPERATION(W32OpTable[rop]);

/* Foreground ROP variant for when a planemask is in effect. */
#define SET_FG_ROP_PLANEMASK(rop) \
    ACL_FOREGROUND_RASTER_OPERATION(W32OpTable_planemask[rop]);

/* Program the background ROP via the pattern ROP translation table. */
#define SET_BG_ROP(rop) \
    ACL_BACKGROUND_RASTER_OPERATION(W32PatternOpTable[rop]);

/* Background ROP; bg_color == -1 requests transparent color expansion,
 * which uses the fixed ROP value 0xaa instead of the table lookup. */
#define SET_BG_ROP_TR(rop, bg_color) \
  if ((bg_color) == -1)    /* transparent color expansion */ \
    ACL_BACKGROUND_RASTER_OPERATION(0xaa); \
  else \
    ACL_BACKGROUND_RASTER_OPERATION(W32PatternOpTable[rop]);

/* Write minor (low word) and major (high word) line deltas in a single
 * 32-bit access. */
#define SET_DELTA(Min, Maj) \
    ACL_DELTA_MINOR32(((Maj) << 16) + (Min))
153
/* Write both secondary deltas (minor: low word, major: high word) in one
 * 32-bit access, mirroring SET_DELTA above.  Must use the 32-bit register
 * macro: the 16-bit ACL_SECONDARY_DELTA_MINOR would truncate (Maj) << 16
 * away entirely. */
#define SET_SECONDARY_DELTA(Min, Maj) \
    ACL_SECONDARY_DELTA_MINOR32(((Maj) << 16) + (Min))
156
#ifdef NO_OPTIMIZE
/* Unconditionally program the XY direction register. */
#define SET_XYDIR(dir) \
      ACL_XY_DIRECTION(dir);
#else
/*
 * only changing ACL_XY_DIRECTION when it needs to be changed avoids
 * unnecessary PCI bus writes, which are slow. This shows up very well
 * on consecutive small fills.
 *
 * The braces are essential: without them only the cache update is
 * conditional and the MMIO write happens on every call, defeating the
 * optimization described above.
 */
#define SET_XYDIR(dir) \
    if ((dir) != pTseng->tseng_old_dir) { \
      pTseng->tseng_old_dir = (dir); \
      ACL_XY_DIRECTION(pTseng->tseng_old_dir); \
    }
#endif
171
/* Program the secondary (trapezoid) edge register -- ET6000 only. */
#define SET_SECONDARY_XYDIR(dir) \
      ACL_SECONDARY_EDGE(dir);

/* Must do 0x09 (in one operation) for the W32 */
/* NOTE(review): despite the comment above, no ACL_OPERATION_STATE(0x09)
 * write is performed here -- confirm whether W32 support needs one. */
#define START_ACL(pTseng, dst) \
    ACL_DESTINATION_ADDRESS(dst);

/* START_ACL for the ET6000 */
#define START_ACL_6(dst) \
    ACL_DESTINATION_ADDRESS(dst);

/* START_ACL variant presumably for CPU-fed transfers -- currently identical
 * to the others (see the remark below). */
#define START_ACL_CPU(pTseng, dst) \
      ACL_DESTINATION_ADDRESS(dst);

/*    ACL_DESTINATION_ADDRESS(dst);    should be enough for START_ACL_CPU */
187
188/*
189 * Some commonly used inline functions and utility functions.
190 */
191
192static __inline__ int
193COLOR_REPLICATE_DWORD(TsengPtr pTseng, int color)
194{
195    switch (pTseng->Bytesperpixel) {
196    case 1:
197	color &= 0xFF;
198	color = (color << 8) | color;
199	color = (color << 16) | color;
200	break;
201    case 2:
202	color &= 0xFFFF;
203	color = (color << 16) | color;
204	break;
205    }
206    return color;
207}
208
209/*
210 * Optimizing note: increasing the wrap size for fixed-color source/pattern
211 * tiles from 4x1 (as below) to anything bigger doesn't seem to affect
212 * performance (it might have been better for larger wraps, but it isn't).
213 */
214
/*
 * Program the foreground color: point the ACL source at the scratch color
 * buffer, store the replicated color there, then enable the 4x1 wrap
 * (0x02).  The wrap register is written last, after the color data is in
 * place -- keep this ordering.
 */
static __inline__ void
SET_FG_COLOR(TsengPtr pTseng, int color)
{
    ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengFg);
    ACL_SOURCE_Y_OFFSET(3);
    color = COLOR_REPLICATE_DWORD(pTseng, color);
    MMIO_OUT32(pTseng->scratchMemBase, pTseng->tsengFg, color);

    ACL_SOURCE_WRAP(0x02);
}
225
/*
 * Program the background color through the pattern channel: point the ACL
 * pattern at the scratch color buffer, store the replicated color, then
 * enable the 4x1 wrap (0x02).  Mirrors SET_FG_COLOR; wrap written last.
 */
static __inline__ void
SET_BG_COLOR(TsengPtr pTseng, int color)
{
    ACL_PATTERN_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengPat);
    ACL_PATTERN_Y_OFFSET(3);
    color = COLOR_REPLICATE_DWORD(pTseng, color);
    MMIO_OUT32(pTseng->scratchMemBase,pTseng->tsengPat, color);

    ACL_PATTERN_WRAP(0x02);
}
236
237/*
238 * this does the same as SET_FG_COLOR and SET_BG_COLOR together, but is
239 * faster, because it allows the PCI chipset to chain the requests into a
240 * burst sequence. The order of the commands is partly linear.
241 * So far for the theory...
242 */
static __inline__ void
SET_FG_BG_COLOR(TsengPtr pTseng, int fgcolor, int bgcolor)
{
    /* Pattern (bg) and source (fg) addresses are written back-to-back so
     * the PCI chipset can burst them (see the comment above). */
    ACL_PATTERN_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengPat);
    ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengFg);
    /* one 32-bit write sets both pattern and source Y offsets to 3 */
    ACL_PATTERN_Y_OFFSET32(0x00030003);
    fgcolor = COLOR_REPLICATE_DWORD(pTseng, fgcolor);
    bgcolor = COLOR_REPLICATE_DWORD(pTseng, bgcolor);
    MMIO_OUT32(pTseng->scratchMemBase,pTseng->tsengFg, fgcolor);
    MMIO_OUT32(pTseng->scratchMemBase,pTseng->tsengPat, bgcolor);

    /* one 32-bit write sets both pattern and source wraps, done last */
    ACL_PATTERN_WRAP32(0x00020002);
}
256
257/*
258 * Real 32-bit multiplications are horribly slow compared to 16-bit (on i386).
259 */
260#ifdef NO_OPTIMIZE
261static __inline__ int
262MULBPP(TsengPtr pTseng, int x)
263{
264    return (x * pTseng->Bytesperpixel);
265}
266#else
267static __inline__ int
268MULBPP(TsengPtr pTseng, int x)
269{
270    int result = x << pTseng->powerPerPixel;
271
272    if (pTseng->Bytesperpixel != 3)
273	return result;
274    else
275	return result + x;
276}
277#endif
278
279static __inline__ int
280CALC_XY(TsengPtr pTseng, int x, int y)
281{
282    int new_x, xy;
283
284    if ((pTseng->old_y == y) && (pTseng->old_x == x))
285	return -1;
286
287    if (pTseng->ChipType == ET4000)
288	new_x = MULBPP(pTseng, x - 1);
289    else
290	new_x = MULBPP(pTseng, x) - 1;
291    xy = ((y - 1) << 16) + new_x;
292    pTseng->old_x = x;
293    pTseng->old_y = y;
294    return xy;
295}
296
297/* generic SET_XY */
298static __inline__ void
299SET_XY(TsengPtr pTseng, int x, int y)
300{
301    int new_x;
302
303    if (pTseng->ChipType == ET4000)
304	new_x = MULBPP(pTseng, x - 1);
305    else
306	new_x = MULBPP(pTseng, x) - 1;
307    ACL_XY_COUNT(((y - 1) << 16) + new_x);
308    pTseng->old_x = x;
309    pTseng->old_y = y;
310}
311
312static __inline__ void
313SET_X_YRAW(TsengPtr pTseng, int x, int y)
314{
315    int new_x;
316
317    if (pTseng->ChipType == ET4000)
318	new_x = MULBPP(pTseng, x - 1);
319    else
320	new_x = MULBPP(pTseng, x) - 1;
321    ACL_XY_COUNT((y << 16) + new_x);
322    pTseng->old_x = x;
323    pTseng->old_y = y - 1;	      /* old_y is invalid (raw transfer) */
324}
325
326/*
327 * This is plain and simple "benchmark rigging".
328 * (no real application does lots of subsequent same-size blits)
329 *
330 * The effect of this is amazingly good on e.g large blits: 400x400
331 * rectangle fill in 24 and 32 bpp on ET6000 jumps from 276 MB/sec to up to
332 * 490 MB/sec... But not always. There must be a good reason why this gives
333 * such a boost, but I don't know it.
334 */
335
336static __inline__ void
337SET_XY_4(TsengPtr pTseng, int x, int y)
338{
339    int new_xy;
340
341    if ((pTseng->old_y != y) || (pTseng->old_x != x)) {
342	new_xy = ((y - 1) << 16) + MULBPP(pTseng, x - 1);
343	ACL_XY_COUNT(new_xy);
344	pTseng->old_x = x;
345	pTseng->old_y = y;
346    }
347}
348
349static __inline__ void
350SET_XY_6(TsengPtr pTseng, int x, int y)
351{
352    int new_xy;			       /* using this intermediate variable is faster */
353
354    if ((pTseng->old_y != y) || (pTseng->old_x != x)) {
355	new_xy = ((y - 1) << 16) + MULBPP(pTseng, x) - 1;
356	ACL_XY_COUNT(new_xy);
357	pTseng->old_x = x;
358	pTseng->old_y = y;
359    }
360}
361
362/* generic SET_XY_RAW */
363static __inline__ void
364SET_XY_RAW(TsengPtr pTseng,int x, int y)
365{
366    ACL_XY_COUNT((y << 16) + x);
367    pTseng->old_x = pTseng->old_y = -1;   /* invalidate old_x/old_y (raw transfers) */
368}
369
370static __inline__ void
371PINGPONG(TsengPtr pTseng)
372{
373    if (pTseng->tsengFg == 0) {
374	pTseng->tsengFg = 8;
375	pTseng->tsengBg = 24;
376	pTseng->tsengPat = 40;
377    } else {
378	pTseng->tsengFg = 0;
379	pTseng->tsengBg = 16;
380	pTseng->tsengPat = 32;
381    }
382}
383
384/*
385 * This is called in each ACL function just before the first ACL register is
386 * written to. It waits for the accelerator to finish on cards that don't
387 * support hardware-wait-state locking, and waits for a free queue entry on
388 * others, if hardware-wait-states are not enabled.
389 */
static __inline__ void
wait_acl_queue(TsengPtr pTseng)
{
    /* PCI retry mode: a free queue slot is sufficient (see comment above) */
    if (pTseng->UsePCIRetry)
	WAIT_QUEUE;
    /* chips flagged need_wait_acl must be fully idle before register writes */
    if (pTseng->need_wait_acl)
	WAIT_ACL;
}
#endif /* _TSENG_ACCEL_H */
399