Home | History | Annotate | Line # | Download | only in src
      1 #ifndef _TSENG_ACCEL_H
      2 #define _TSENG_ACCEL_H
      3 /*
      4  * Shortcuts to Tseng memory-mapped accelerator-control registers
      5  */
      6 
/*
 * Control and status registers, all written with MMIO_OUT8 relative to
 * pTseng->MMioBase (a `pTseng` variable must be in scope at the use site).
 */
#define MMU_CONTROL(x)            MMIO_OUT8(pTseng->MMioBase, 0x13, x)
#define ACL_SUSPEND_TERMINATE(x)  MMIO_OUT8(pTseng->MMioBase, 0x30, x)
#define ACL_OPERATION_STATE(x)    MMIO_OUT8(pTseng->MMioBase, 0x31, x)

/* On the ET6000 this register goes by the name ACL_6K_CONFIG. */
#define ACL_SYNC_ENABLE(x)        MMIO_OUT8(pTseng->MMioBase, 0x32, x)

#define ACL_INTERRUPT_MASK(x)     MMIO_OUT8(pTseng->MMioBase, 0x34, x)
#define ACL_INTERRUPT_STATUS(x)   MMIO_OUT8(pTseng->MMioBase, 0x35, x)

/*
 * These two expand to bare register offsets (not to a write), so they can
 * be handed to code that reads the register back.
 */
#define ACL_WRITE_INTERFACE_VALID 0x33
#define ACL_ACCELERATOR_STATUS    0x36
#define ACL_ACCELERATOR_STATUS_SET(x) \
    MMIO_OUT8(pTseng->MMioBase, ACL_ACCELERATOR_STATUS, x)

/* ET6000 only. */
#define ACL_POWER_CONTROL(x)      MMIO_OUT8(pTseng->MMioBase, 0x37, x)
     24 
/* Non-queued position registers: w32p's and ET6000. */
#define ACL_NQ_X_POSITION(x)         MMIO_OUT16(pTseng->MMioBase, 0x38, x)
#define ACL_NQ_Y_POSITION(x)         MMIO_OUT16(pTseng->MMioBase, 0x3A, x)
/* Queued position registers: w32 and w32i. */
#define ACL_X_POSITION(x)            MMIO_OUT16(pTseng->MMioBase, 0x94, x)
#define ACL_Y_POSITION(x)            MMIO_OUT16(pTseng->MMioBase, 0x96, x)

/* 32-bit pattern and source addresses. */
#define ACL_PATTERN_ADDRESS(x)       MMIO_OUT32(pTseng->MMioBase, 0x80, x)
#define ACL_SOURCE_ADDRESS(x)        MMIO_OUT32(pTseng->MMioBase, 0x84, x)

/* 16-bit Y offsets for pattern, source and destination. */
#define ACL_PATTERN_Y_OFFSET(x)      MMIO_OUT16(pTseng->MMioBase, 0x88, x)
#define ACL_SOURCE_Y_OFFSET(x)       MMIO_OUT16(pTseng->MMioBase, 0x8A, x)
#define ACL_DESTINATION_Y_OFFSET(x)  MMIO_OUT16(pTseng->MMioBase, 0x8C, x)
/*
 * One 32-bit write starting at the pattern Y offset; it therefore also
 * covers offset 0x8A (the source Y offset).
 */
#define ACL_PATTERN_Y_OFFSET32(x)    MMIO_OUT32(pTseng->MMioBase, 0x88, x)
     39 
/*
 * Offset 0x8E has two names: ACL_VIRTUAL_BUS_SIZE on the W32i and
 * ACL_PIXEL_DEPTH on the w32p.
 */
#define ACL_VIRTUAL_BUS_SIZE(x)  MMIO_OUT8(pTseng->MMioBase, 0x8E, x)
#define ACL_PIXEL_DEPTH(x)       MMIO_OUT8(pTseng->MMioBase, 0x8E, x)

/* w32 and w32i */
#define ACL_XY_DIRECTION(x)      MMIO_OUT8(pTseng->MMioBase, 0x8F, x)

#define ACL_PATTERN_WRAP(x)      MMIO_OUT8(pTseng->MMioBase, 0x90, x)
/* ET6000 only */
#define ACL_TRANSFER_DISABLE(x)  MMIO_OUT8(pTseng->MMioBase, 0x91, x)
#define ACL_SOURCE_WRAP(x)       MMIO_OUT8(pTseng->MMioBase, 0x92, x)
/* One 32-bit write starting at the pattern wrap register (0x90..0x93). */
#define ACL_PATTERN_WRAP32(x)    MMIO_OUT32(pTseng->MMioBase, 0x90, x)

#define ACL_X_COUNT(x)           MMIO_OUT16(pTseng->MMioBase, 0x98, x)
#define ACL_Y_COUNT(x)           MMIO_OUT16(pTseng->MMioBase, 0x9A, x)
/* Shortcut, not a real register: X and Y count in a single 32-bit write. */
#define ACL_XY_COUNT(x)          MMIO_OUT32(pTseng->MMioBase, 0x98, x)

/* On the ET6000, ACL_ROUTING_CONTROL becomes ACL_MIX_CONTROL. */
#define ACL_ROUTING_CONTROL(x)   MMIO_OUT8(pTseng->MMioBase, 0x9C, x)
/* On the ET6000, ACL_RELOAD_CONTROL becomes ACL_STEPPING_INHIBIT. */
#define ACL_RELOAD_CONTROL(x)    MMIO_OUT8(pTseng->MMioBase, 0x9D, x)
     62 
/* Raster-operation registers. */
#define ACL_BACKGROUND_RASTER_OPERATION(x) \
    MMIO_OUT8(pTseng->MMioBase, 0x9E, x)
#define ACL_FOREGROUND_RASTER_OPERATION(x) \
    MMIO_OUT8(pTseng->MMioBase, 0x9F, x)

#define ACL_DESTINATION_ADDRESS(x)      MMIO_OUT32(pTseng->MMioBase, 0xA0, x)

/* The following is for the w32p's only. */
#define ACL_MIX_ADDRESS(x)              MMIO_OUT32(pTseng->MMioBase, 0xA4, x)

#define ACL_MIX_Y_OFFSET(x)             MMIO_OUT16(pTseng->MMioBase, 0xA8, x)
#define ACL_ERROR_TERM(x)               MMIO_OUT16(pTseng->MMioBase, 0xAA, x)
#define ACL_DELTA_MINOR(x)              MMIO_OUT16(pTseng->MMioBase, 0xAC, x)
#define ACL_DELTA_MAJOR(x)              MMIO_OUT16(pTseng->MMioBase, 0xAE, x)
/* 32-bit write at 0xAC: delta minor and delta major (0xAE) in one go. */
#define ACL_DELTA_MINOR32(x)            MMIO_OUT32(pTseng->MMioBase, 0xAC, x)

/* ET6000 only (trapezoids). */
#define ACL_SECONDARY_EDGE(x)           MMIO_OUT8(pTseng->MMioBase, 0x93, x)
#define ACL_SECONDARY_ERROR_TERM(x)     MMIO_OUT16(pTseng->MMioBase, 0xB2, x)
#define ACL_SECONDARY_DELTA_MINOR(x)    MMIO_OUT16(pTseng->MMioBase, 0xB4, x)
#define ACL_SECONDARY_DELTA_MAJOR(x)    MMIO_OUT16(pTseng->MMioBase, 0xB6, x)
/* 32-bit write at 0xB4: secondary delta minor and major (0xB6) together. */
#define ACL_SECONDARY_DELTA_MINOR32(x)  MMIO_OUT32(pTseng->MMioBase, 0xB4, x)
     83 
/*
 * ET6000 names for registers that share an offset with an older register.
 * Each alias expands to the older macro's name, so it is used the same way.
 */
#define ACL_6K_CONFIG         ACL_SYNC_ENABLE
#define ACL_MIX_CONTROL       ACL_ROUTING_CONTROL
#define ACL_STEPPING_INHIBIT  ACL_RELOAD_CONTROL
     90 
     91 /*
     92  * Some shortcuts.
     93  */
     94 
     95 #define MAX_WAIT_CNT 500000	       /* how long we wait before we time out */
     96 #undef WAIT_VERBOSE		       /* if defined: print out how long we waited */
     97 
     98 void tseng_recover_timeout(TsengPtr pTseng);
     99 
    100 static __inline__ void
    101 tseng_wait(TsengPtr pTseng, int reg, char *name, unsigned char mask)
    102 {
    103     int cnt = MAX_WAIT_CNT;
    104 
    105     while ((MMIO_IN32(pTseng->MMioBase,reg)) & mask)
    106 	if (--cnt < 0) {
    107 	    ErrorF("WAIT_%s: timeout.\n", name);
    108 	    tseng_recover_timeout(pTseng);
    109 	    break;
    110 	}
    111 #ifdef WAIT_VERBOSE
    112     ErrorF("%s%d ", name, MAX_WAIT_CNT - cnt);
    113 #endif
    114 }
    115 
/*
 * Convenience waits built on tseng_wait(); each needs `pTseng` in scope.
 * WAIT_QUEUE, WAIT_ACL and WAIT_XY poll different bits of the accelerator
 * status register.
 */
#define WAIT_QUEUE      tseng_wait(pTseng, ACL_ACCELERATOR_STATUS, "QUEUE", 0x01)
#define WAIT_ACL        tseng_wait(pTseng, ACL_ACCELERATOR_STATUS, "ACL", 0x02)
#define WAIT_XY         tseng_wait(pTseng, ACL_ACCELERATOR_STATUS, "XY", 0x04)

/* This is only for W32p rev b...d */
#define WAIT_INTERFACE  tseng_wait(pTseng, ACL_WRITE_INTERFACE_VALID, "INTERFACE", 0x0f)
    124 
/*
 * Select the blit function: the ET6000 gets 0x33 through its mix-control
 * name, every other chip gets 0x00 through routing control.
 */
#define SET_FUNCTION_BLT \
    if (pTseng->ChipType != ET6000) { \
        ACL_ROUTING_CONTROL(0x00); \
    } else { \
        ACL_MIX_CONTROL(0x33); \
    }

/* Variant that writes 0x13 to mix control unconditionally. */
#define SET_FUNCTION_BLT_TR ACL_MIX_CONTROL(0x13);
    133 
/*
 * Byte offset of pixel (x, y): y rows of line_width plus x scaled by
 * MULBPP(). The pTseng argument is parenthesized so that any pointer
 * expression (not just a plain identifier) can be passed.
 */
#define FBADDR(pTseng, x,y) ( (y) * (pTseng)->line_width + MULBPP((pTseng), (x)) )
    135 
/* Foreground raster op: `rop` indexes W32OpTable. */
#define SET_FG_ROP(rop) ACL_FOREGROUND_RASTER_OPERATION(W32OpTable[rop]);

/* Foreground raster op taken from the planemask table instead. */
#define SET_FG_ROP_PLANEMASK(rop) ACL_FOREGROUND_RASTER_OPERATION(W32OpTable_planemask[rop]);

/* Background raster op: `rop` indexes W32PatternOpTable. */
#define SET_BG_ROP(rop) ACL_BACKGROUND_RASTER_OPERATION(W32PatternOpTable[rop]);

/*
 * Background raster op for color expansion: a bg_color of -1 means
 * transparent expansion and selects 0xaa; any other value uses the
 * W32PatternOpTable entry for `rop`.
 */
#define SET_BG_ROP_TR(rop, bg_color) \
    if ((bg_color) != -1) { \
        ACL_BACKGROUND_RASTER_OPERATION(W32PatternOpTable[rop]); \
    } else { \
        ACL_BACKGROUND_RASTER_OPERATION(0xaa); \
    }
    150 
/*
 * Write the minor and major line deltas with a single 32-bit access:
 * Min lands in the low 16 bits, Maj in the high 16 bits.
 */
#define SET_DELTA(Min, Maj) \
    ACL_DELTA_MINOR32(((Maj) << 16) + (Min))

/*
 * Same for the secondary (ET6000 trapezoid) edge. This must go through the
 * 32-bit register macro: the 16-bit ACL_SECONDARY_DELTA_MINOR cannot carry
 * the Maj half of the packed value.
 */
#define SET_SECONDARY_DELTA(Min, Maj) \
    ACL_SECONDARY_DELTA_MINOR32(((Maj) << 16) + (Min))
    156 
#ifdef NO_OPTIMIZE
/* Unoptimized form: always write the direction register. */
#define SET_XYDIR(dir) \
      ACL_XY_DIRECTION(dir);
#else
/*
 * only changing ACL_XY_DIRECTION when it needs to be changed avoids
 * unnecessary PCI bus writes, which are slow. This shows up very well
 * on consecutive small fills.
 *
 * The last value written is cached in pTseng->tseng_old_dir; the register
 * write sits inside the braces with the cache update so that it really is
 * skipped when the direction has not changed. The outer braces make the
 * expansion a single statement that works with or without a trailing
 * semicolon at the call site.
 *
 * NOTE(review): this relies on tseng_old_dir not matching a real direction
 * whenever the register contents are unknown (e.g. after the accelerator is
 * (re)initialised) -- confirm where the field is reset.
 */
#define SET_XYDIR(dir) \
    { \
        if ((dir) != pTseng->tseng_old_dir) { \
            pTseng->tseng_old_dir = (dir); \
            ACL_XY_DIRECTION(pTseng->tseng_old_dir); \
        } \
    }
#endif
    171 
/* Direction of the secondary edge: a plain write to ACL_SECONDARY_EDGE. */
#define SET_SECONDARY_XYDIR(dir) ACL_SECONDARY_EDGE(dir);

/*
 * The START_ACL* macros all expand to a write of `dst` to the destination
 * address register; the pTseng parameter of START_ACL and START_ACL_CPU is
 * not used in the expansion itself (the register macro picks up `pTseng`
 * from the enclosing scope).
 *
 * Original note for START_ACL: "Must do 0x09 (in one operation) for the W32".
 */
#define START_ACL(pTseng, dst) ACL_DESTINATION_ADDRESS(dst);

/* START_ACL for the ET6000 */
#define START_ACL_6(dst) ACL_DESTINATION_ADDRESS(dst);

/* ACL_DESTINATION_ADDRESS(dst); should be enough for START_ACL_CPU */
#define START_ACL_CPU(pTseng, dst) ACL_DESTINATION_ADDRESS(dst);
    187 
    188 /*
    189  * Some commonly used inline functions and utility functions.
    190  */
    191 
    192 static __inline__ int
    193 COLOR_REPLICATE_DWORD(TsengPtr pTseng, int color)
    194 {
    195     switch (pTseng->Bytesperpixel) {
    196     case 1:
    197 	color &= 0xFF;
    198 	color = (color << 8) | color;
    199 	color = (color << 16) | color;
    200 	break;
    201     case 2:
    202 	color &= 0xFFFF;
    203 	color = (color << 16) | color;
    204 	break;
    205     }
    206     return color;
    207 }
    208 
    209 /*
    210  * Optimizing note: increasing the wrap size for fixed-color source/pattern
    211  * tiles from 4x1 (as below) to anything bigger doesn't seem to affect
    212  * performance (it might have been better for larger wraps, but it isn't).
    213  */
    214 
    215 static __inline__ void
    216 SET_FG_COLOR(TsengPtr pTseng, int color)
    217 {
    218     ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengFg);
    219     ACL_SOURCE_Y_OFFSET(3);
    220     color = COLOR_REPLICATE_DWORD(pTseng, color);
    221     MMIO_OUT32(pTseng->scratchMemBase, pTseng->tsengFg, color);
    222 
    223     ACL_SOURCE_WRAP(0x02);
    224 }
    225 
    226 static __inline__ void
    227 SET_BG_COLOR(TsengPtr pTseng, int color)
    228 {
    229     ACL_PATTERN_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengPat);
    230     ACL_PATTERN_Y_OFFSET(3);
    231     color = COLOR_REPLICATE_DWORD(pTseng, color);
    232     MMIO_OUT32(pTseng->scratchMemBase,pTseng->tsengPat, color);
    233 
    234     ACL_PATTERN_WRAP(0x02);
    235 }
    236 
    237 /*
    238  * this does the same as SET_FG_COLOR and SET_BG_COLOR together, but is
    239  * faster, because it allows the PCI chipset to chain the requests into a
    240  * burst sequence. The order of the commands is partly linear.
    241  * So far for the theory...
    242  */
    243 static __inline__ void
    244 SET_FG_BG_COLOR(TsengPtr pTseng, int fgcolor, int bgcolor)
    245 {
    246     ACL_PATTERN_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengPat);
    247     ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengFg);
    248     ACL_PATTERN_Y_OFFSET32(0x00030003);
    249     fgcolor = COLOR_REPLICATE_DWORD(pTseng, fgcolor);
    250     bgcolor = COLOR_REPLICATE_DWORD(pTseng, bgcolor);
    251     MMIO_OUT32(pTseng->scratchMemBase,pTseng->tsengFg, fgcolor);
    252     MMIO_OUT32(pTseng->scratchMemBase,pTseng->tsengPat, bgcolor);
    253 
    254     ACL_PATTERN_WRAP32(0x00020002);
    255 }
    256 
    257 /*
    258  * Real 32-bit multiplications are horribly slow compared to 16-bit (on i386).
    259  */
    260 #ifdef NO_OPTIMIZE
    261 static __inline__ int
    262 MULBPP(TsengPtr pTseng, int x)
    263 {
    264     return (x * pTseng->Bytesperpixel);
    265 }
    266 #else
    267 static __inline__ int
    268 MULBPP(TsengPtr pTseng, int x)
    269 {
    270     int result = x << pTseng->powerPerPixel;
    271 
    272     if (pTseng->Bytesperpixel != 3)
    273 	return result;
    274     else
    275 	return result + x;
    276 }
    277 #endif
    278 
    279 static __inline__ int
    280 CALC_XY(TsengPtr pTseng, int x, int y)
    281 {
    282     int new_x, xy;
    283 
    284     if ((pTseng->old_y == y) && (pTseng->old_x == x))
    285 	return -1;
    286 
    287     if (pTseng->ChipType == ET4000)
    288 	new_x = MULBPP(pTseng, x - 1);
    289     else
    290 	new_x = MULBPP(pTseng, x) - 1;
    291     xy = ((y - 1) << 16) + new_x;
    292     pTseng->old_x = x;
    293     pTseng->old_y = y;
    294     return xy;
    295 }
    296 
    297 /* generic SET_XY */
    298 static __inline__ void
    299 SET_XY(TsengPtr pTseng, int x, int y)
    300 {
    301     int new_x;
    302 
    303     if (pTseng->ChipType == ET4000)
    304 	new_x = MULBPP(pTseng, x - 1);
    305     else
    306 	new_x = MULBPP(pTseng, x) - 1;
    307     ACL_XY_COUNT(((y - 1) << 16) + new_x);
    308     pTseng->old_x = x;
    309     pTseng->old_y = y;
    310 }
    311 
    312 static __inline__ void
    313 SET_X_YRAW(TsengPtr pTseng, int x, int y)
    314 {
    315     int new_x;
    316 
    317     if (pTseng->ChipType == ET4000)
    318 	new_x = MULBPP(pTseng, x - 1);
    319     else
    320 	new_x = MULBPP(pTseng, x) - 1;
    321     ACL_XY_COUNT((y << 16) + new_x);
    322     pTseng->old_x = x;
    323     pTseng->old_y = y - 1;	      /* old_y is invalid (raw transfer) */
    324 }
    325 
    326 /*
    327  * This is plain and simple "benchmark rigging".
    328  * (no real application does lots of subsequent same-size blits)
    329  *
    330  * The effect of this is amazingly good on e.g large blits: 400x400
    331  * rectangle fill in 24 and 32 bpp on ET6000 jumps from 276 MB/sec to up to
    332  * 490 MB/sec... But not always. There must be a good reason why this gives
    333  * such a boost, but I don't know it.
    334  */
    335 
    336 static __inline__ void
    337 SET_XY_4(TsengPtr pTseng, int x, int y)
    338 {
    339     int new_xy;
    340 
    341     if ((pTseng->old_y != y) || (pTseng->old_x != x)) {
    342 	new_xy = ((y - 1) << 16) + MULBPP(pTseng, x - 1);
    343 	ACL_XY_COUNT(new_xy);
    344 	pTseng->old_x = x;
    345 	pTseng->old_y = y;
    346     }
    347 }
    348 
    349 static __inline__ void
    350 SET_XY_6(TsengPtr pTseng, int x, int y)
    351 {
    352     int new_xy;			       /* using this intermediate variable is faster */
    353 
    354     if ((pTseng->old_y != y) || (pTseng->old_x != x)) {
    355 	new_xy = ((y - 1) << 16) + MULBPP(pTseng, x) - 1;
    356 	ACL_XY_COUNT(new_xy);
    357 	pTseng->old_x = x;
    358 	pTseng->old_y = y;
    359     }
    360 }
    361 
    362 /* generic SET_XY_RAW */
    363 static __inline__ void
    364 SET_XY_RAW(TsengPtr pTseng,int x, int y)
    365 {
    366     ACL_XY_COUNT((y << 16) + x);
    367     pTseng->old_x = pTseng->old_y = -1;   /* invalidate old_x/old_y (raw transfers) */
    368 }
    369 
    370 static __inline__ void
    371 PINGPONG(TsengPtr pTseng)
    372 {
    373     if (pTseng->tsengFg == 0) {
    374 	pTseng->tsengFg = 8;
    375 	pTseng->tsengBg = 24;
    376 	pTseng->tsengPat = 40;
    377     } else {
    378 	pTseng->tsengFg = 0;
    379 	pTseng->tsengBg = 16;
    380 	pTseng->tsengPat = 32;
    381     }
    382 }
    383 
    384 /*
    385  * This is called in each ACL function just before the first ACL register is
    386  * written to. It waits for the accelerator to finish on cards that don't
    387  * support hardware-wait-state locking, and waits for a free queue entry on
    388  * others, if hardware-wait-states are not enabled.
    389  */
    390 static __inline__ void
    391 wait_acl_queue(TsengPtr pTseng)
    392 {
    393     if (pTseng->UsePCIRetry)
    394 	WAIT_QUEUE;
    395     if (pTseng->need_wait_acl)
    396 	WAIT_ACL;
    397 }
    398 #endif /* _TSENG_ACCEL_H */
    399