/* radeon_textured_videofuncs.c revision de2362d3 */
/*
 * Copyright 2008 Alex Deucher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *
 * Based on radeon_exa_render.c and kdrive ati_video.c by Eric Anholt, et al.
 *
 */

/*
 * Emit one vertex made of six values into the ring: destination X/Y
 * followed by two texture coordinate pairs (src and mask).  Each value is
 * converted with F_TO_DW() and written with OUT_RING(), so both must be
 * in scope where the macro is expanded, inside a BEGIN_RING/ADVANCE_RING
 * section.  Wrapped in do { } while (0) so it behaves as one statement.
 */
#define VTX_OUT_6(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)	\
do {								\
    OUT_RING(F_TO_DW(_dstX));					\
    OUT_RING(F_TO_DW(_dstY));					\
    OUT_RING(F_TO_DW(_srcX));					\
    OUT_RING(F_TO_DW(_srcY));					\
    OUT_RING(F_TO_DW(_maskX));					\
    OUT_RING(F_TO_DW(_maskY));					\
} while (0)

/*
 * Emit one vertex made of four values into the ring: destination X/Y
 * followed by a single texture coordinate pair.  Each value is converted
 * with F_TO_DW() and written with OUT_RING(), so both must be in scope
 * where the macro is expanded, inside a BEGIN_RING/ADVANCE_RING section.
 * Wrapped in do { } while (0) so it behaves as one statement.
 */
#define VTX_OUT_4(_dstX, _dstY, _srcX, _srcY)			\
do {								\
    OUT_RING(F_TO_DW(_dstX));					\
    OUT_RING(F_TO_DW(_dstY));					\
    OUT_RING(F_TO_DW(_srcX));					\
    OUT_RING(F_TO_DW(_srcY));					\
} while (0)


471.1Scjsstatic Bool
481.1ScjsRADEONPrepareTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
491.1Scjs{
501.1Scjs    RADEONInfoPtr info = RADEONPTR(pScrn);
511.1Scjs    PixmapPtr pPixmap = pPriv->pPixmap;
521.1Scjs    struct radeon_exa_pixmap_priv *driver_priv;
531.1Scjs    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
541.1Scjs    uint32_t txformat, txsize, txpitch;
551.1Scjs    uint32_t dst_pitch, dst_format;
561.1Scjs    uint32_t colorpitch;
571.1Scjs    int pixel_shift;
581.1Scjs    int scissor_w = MIN(pPixmap->drawable.width, 2048) - 1;
591.1Scjs    int scissor_h = MIN(pPixmap->drawable.height, 2048) - 1;
601.2Scjs    int ret;
611.4Schristos
621.4Schristos    radeon_cs_space_reset_bos(info->cs);
631.4Schristos    radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
641.4Schristos
651.4Schristos    if (pPriv->bicubic_enabled)
661.4Schristos	radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
671.1Scjs
681.1Scjs    driver_priv = exaGetPixmapDriverPrivate(pPixmap);
691.1Scjs    radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
701.1Scjs
711.1Scjs    ret = radeon_cs_space_check(info->cs);
721.1Scjs    if (ret) {
731.1Scjs	ErrorF("Not enough RAM to hw accel xv operation\n");
741.1Scjs	return FALSE;
751.1Scjs    }
761.1Scjs
771.4Schristos    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
781.1Scjs
791.1Scjs    dst_pitch = exaGetPixmapPitch(pPixmap);
801.1Scjs    RADEON_SWITCH_TO_3D();
811.1Scjs
821.1Scjs    /* Same for R100/R200 */
831.4Schristos    switch (pPixmap->drawable.bitsPerPixel) {
841.1Scjs    case 16:
851.1Scjs	if (pPixmap->drawable.depth == 15)
861.1Scjs	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
871.1Scjs	else
881.1Scjs	    dst_format = RADEON_COLOR_FORMAT_RGB565;
891.1Scjs	break;
901.1Scjs    case 32:
911.1Scjs	dst_format = RADEON_COLOR_FORMAT_ARGB8888;
921.1Scjs	break;
931.1Scjs    default:
941.1Scjs	return FALSE;
951.1Scjs    }
961.1Scjs
971.1Scjs    if (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12) {
981.1Scjs	pPriv->is_planar = TRUE;
991.1Scjs	txformat = RADEON_TXFORMAT_Y8;
1001.1Scjs    } else {
1011.1Scjs	pPriv->is_planar = FALSE;
1021.1Scjs	if (pPriv->id == FOURCC_UYVY)
1031.1Scjs	    txformat = RADEON_TXFORMAT_YVYU422;
1041.1Scjs	else
1051.1Scjs	    txformat = RADEON_TXFORMAT_VYUY422;
1061.1Scjs    }
1071.1Scjs
1081.1Scjs    txformat |= RADEON_TXFORMAT_NON_POWER2;
1091.1Scjs
1101.1Scjs    colorpitch = dst_pitch >> pixel_shift;
1111.1Scjs
1121.1Scjs    if (RADEONTilingEnabled(pScrn, pPixmap))
1131.1Scjs	colorpitch |= RADEON_COLOR_TILE_ENABLE;
1141.1Scjs
1151.1Scjs    BEGIN_ACCEL_RELOC(4,2);
1161.1Scjs
1171.1Scjs    OUT_RING_REG(RADEON_RB3D_CNTL, dst_format);
1181.1Scjs    EMIT_WRITE_OFFSET(RADEON_RB3D_COLOROFFSET, 0, pPixmap);
1191.1Scjs    EMIT_COLORPITCH(RADEON_RB3D_COLORPITCH, colorpitch, pPixmap);
1201.1Scjs    OUT_RING_REG(RADEON_RB3D_BLENDCNTL,
1211.1Scjs		  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
1221.1Scjs
1231.1Scjs    ADVANCE_RING();
1241.1Scjs
1251.1Scjs    if (pPriv->is_planar) {
1261.1Scjs	/* need 2 texcoord sets (even though they are identical) due
1271.1Scjs	   to denormalization! hw apparently can't premultiply
1281.1Scjs	   same coord set by different texture size */
1291.1Scjs	pPriv->vtx_count = 6;
1301.1Scjs
1311.1Scjs	txsize = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
1321.1Scjs		  (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
1331.5Sitojun	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
1341.5Sitojun	txpitch -= 32;
1351.1Scjs
1361.1Scjs	BEGIN_ACCEL_RELOC(23, 3);
1371.1Scjs
1381.1Scjs	OUT_RING_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
1391.1Scjs					  RADEON_SE_VTX_FMT_ST0 |
1401.1Scjs					  RADEON_SE_VTX_FMT_ST1));
1411.1Scjs
1421.1Scjs	OUT_RING_REG(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE |
1431.1Scjs				       RADEON_TEX_1_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
1441.1Scjs				       RADEON_TEX_2_ENABLE | RADEON_TEX_BLEND_2_ENABLE |
1451.1Scjs				       RADEON_PLANAR_YUV_ENABLE));
1461.1Scjs
1471.1Scjs	/* Y */
1481.1Scjs	OUT_RING_REG(RADEON_PP_TXFILTER_0,
1491.1Scjs		      RADEON_MAG_FILTER_LINEAR |
1501.1Scjs		      RADEON_MIN_FILTER_LINEAR |
1511.1Scjs		      RADEON_CLAMP_S_CLAMP_LAST |
1521.1Scjs		      RADEON_CLAMP_T_CLAMP_LAST |
1531.1Scjs		      RADEON_YUV_TO_RGB);
1541.1Scjs	OUT_RING_REG(RADEON_PP_TXFORMAT_0, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ0);
1551.1Scjs	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_0, 0, src_bo);
1561.1Scjs	OUT_RING_REG(RADEON_PP_TXCBLEND_0,
1571.1Scjs		      RADEON_COLOR_ARG_A_ZERO |
1581.1Scjs		      RADEON_COLOR_ARG_B_ZERO |
1591.4Schristos		      RADEON_COLOR_ARG_C_T0_COLOR |
1601.1Scjs		      RADEON_BLEND_CTL_ADD |
1611.1Scjs		      RADEON_CLAMP_TX);
1621.1Scjs	OUT_RING_REG(RADEON_PP_TXABLEND_0,
1631.1Scjs		      RADEON_ALPHA_ARG_A_ZERO |
1641.1Scjs		      RADEON_ALPHA_ARG_B_ZERO |
1651.1Scjs		      RADEON_ALPHA_ARG_C_T0_ALPHA |
1661.1Scjs		      RADEON_BLEND_CTL_ADD |
1671.1Scjs		      RADEON_CLAMP_TX);
1681.4Schristos
1691.1Scjs	OUT_RING_REG(RADEON_PP_TEX_SIZE_0,
1701.1Scjs		      (pPriv->w - 1) |
1711.1Scjs		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
1721.1Scjs	OUT_RING_REG(RADEON_PP_TEX_PITCH_0,
1731.1Scjs		      pPriv->src_pitch - 32);
1741.4Schristos
1751.1Scjs	/* U */
1761.1Scjs	OUT_RING_REG(RADEON_PP_TXFILTER_1,
1771.1Scjs		      RADEON_MAG_FILTER_LINEAR |
1781.1Scjs		      RADEON_MIN_FILTER_LINEAR |
1791.1Scjs		      RADEON_CLAMP_S_CLAMP_LAST |
1801.1Scjs		      RADEON_CLAMP_T_CLAMP_LAST);
1811.1Scjs	OUT_RING_REG(RADEON_PP_TXFORMAT_1, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ1);
1821.1Scjs	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_1, pPriv->planeu_offset, src_bo);
1831.1Scjs	OUT_RING_REG(RADEON_PP_TXCBLEND_1,
1841.1Scjs		      RADEON_COLOR_ARG_A_ZERO |
1851.1Scjs		      RADEON_COLOR_ARG_B_ZERO |
1861.1Scjs		      RADEON_COLOR_ARG_C_T0_COLOR |
1871.1Scjs		      RADEON_BLEND_CTL_ADD |
1881.1Scjs		      RADEON_CLAMP_TX);
1891.1Scjs	OUT_RING_REG(RADEON_PP_TXABLEND_1,
1901.1Scjs		      RADEON_ALPHA_ARG_A_ZERO |
1911.4Schristos		      RADEON_ALPHA_ARG_B_ZERO |
1921.1Scjs		      RADEON_ALPHA_ARG_C_T0_ALPHA |
1931.1Scjs		      RADEON_BLEND_CTL_ADD |
1941.1Scjs		      RADEON_CLAMP_TX);
1951.1Scjs
1961.1Scjs	OUT_RING_REG(RADEON_PP_TEX_SIZE_1, txsize);
1971.1Scjs	OUT_RING_REG(RADEON_PP_TEX_PITCH_1, txpitch);
1981.1Scjs
1991.1Scjs	/* V */
2001.1Scjs	OUT_RING_REG(RADEON_PP_TXFILTER_2,
2011.1Scjs		      RADEON_MAG_FILTER_LINEAR |
2021.1Scjs		      RADEON_MIN_FILTER_LINEAR |
2031.1Scjs		      RADEON_CLAMP_S_CLAMP_LAST |
2041.1Scjs		      RADEON_CLAMP_T_CLAMP_LAST);
2051.1Scjs	OUT_RING_REG(RADEON_PP_TXFORMAT_2, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ1);
2061.1Scjs	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_2, pPriv->planev_offset, src_bo);
2071.1Scjs	OUT_RING_REG(RADEON_PP_TXCBLEND_2,
2081.1Scjs		      RADEON_COLOR_ARG_A_ZERO |
2091.1Scjs		      RADEON_COLOR_ARG_B_ZERO |
2101.4Schristos		      RADEON_COLOR_ARG_C_T0_COLOR |
2111.1Scjs		      RADEON_BLEND_CTL_ADD |
2121.1Scjs		      RADEON_CLAMP_TX);
2131.1Scjs	OUT_RING_REG(RADEON_PP_TXABLEND_2,
2141.1Scjs		      RADEON_ALPHA_ARG_A_ZERO |
2151.1Scjs		      RADEON_ALPHA_ARG_B_ZERO |
2161.1Scjs		      RADEON_ALPHA_ARG_C_T0_ALPHA |
2171.1Scjs		      RADEON_BLEND_CTL_ADD |
2181.1Scjs		      RADEON_CLAMP_TX);
2191.1Scjs
2201.1Scjs	OUT_RING_REG(RADEON_PP_TEX_SIZE_2, txsize);
2211.1Scjs	OUT_RING_REG(RADEON_PP_TEX_PITCH_2, txpitch);
2221.1Scjs	ADVANCE_RING();
2231.1Scjs    } else {
2241.1Scjs	pPriv->vtx_count = 4;
2251.1Scjs	BEGIN_ACCEL_RELOC(9, 1);
2261.5Sitojun
2271.1Scjs	OUT_RING_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
2281.1Scjs					  RADEON_SE_VTX_FMT_ST0));
2291.1Scjs
2301.1Scjs	OUT_RING_REG(RADEON_PP_CNTL, RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
2311.1Scjs
2321.1Scjs	OUT_RING_REG(RADEON_PP_TXFILTER_0,
2331.1Scjs		      RADEON_MAG_FILTER_LINEAR |
2341.5Sitojun		      RADEON_MIN_FILTER_LINEAR |
2351.1Scjs		      RADEON_CLAMP_S_CLAMP_LAST |
2361.1Scjs		      RADEON_CLAMP_T_CLAMP_LAST |
2371.1Scjs		      RADEON_YUV_TO_RGB);
2381.1Scjs	OUT_RING_REG(RADEON_PP_TXFORMAT_0, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ0);
2391.9Schristos	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_0, 0, src_bo);
2401.9Schristos	OUT_RING_REG(RADEON_PP_TXCBLEND_0,
2411.1Scjs		      RADEON_COLOR_ARG_A_ZERO |
2421.9Schristos		      RADEON_COLOR_ARG_B_ZERO |
2431.1Scjs		      RADEON_COLOR_ARG_C_T0_COLOR |
2441.1Scjs		      RADEON_BLEND_CTL_ADD |
2451.5Sitojun		      RADEON_CLAMP_TX);
2461.1Scjs	OUT_RING_REG(RADEON_PP_TXABLEND_0,
2471.1Scjs		      RADEON_ALPHA_ARG_A_ZERO |
2481.1Scjs		      RADEON_ALPHA_ARG_B_ZERO |
2491.1Scjs		      RADEON_ALPHA_ARG_C_T0_ALPHA |
2501.1Scjs		      RADEON_BLEND_CTL_ADD |
2511.1Scjs		      RADEON_CLAMP_TX);
2521.1Scjs
2531.1Scjs	OUT_RING_REG(RADEON_PP_TEX_SIZE_0,
2541.1Scjs		      (pPriv->w - 1) |
2551.1Scjs		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
2561.1Scjs	OUT_RING_REG(RADEON_PP_TEX_PITCH_0,
2571.1Scjs		      pPriv->src_pitch - 32);
2581.1Scjs	ADVANCE_RING();
2591.1Scjs    }
2601.1Scjs
2611.1Scjs    BEGIN_RING(2*2);
2621.1Scjs    OUT_RING_REG(RADEON_RE_TOP_LEFT, 0);
2631.1Scjs    OUT_RING_REG(RADEON_RE_WIDTH_HEIGHT, ((scissor_w << RADEON_RE_WIDTH_SHIFT) |
2641.5Sitojun					   (scissor_h << RADEON_RE_HEIGHT_SHIFT)));
2651.5Sitojun    ADVANCE_RING();
2661.1Scjs
2671.1Scjs    if (pPriv->vsync) {
2681.1Scjs	xf86CrtcPtr crtc;
2691.1Scjs	if (pPriv->desired_crtc)
2701.1Scjs	    crtc = pPriv->desired_crtc;
2711.1Scjs	else
2721.5Sitojun	    crtc = radeon_pick_best_crtc(pScrn, FALSE,
2731.1Scjs					 pPriv->drw_x,
2741.1Scjs					 pPriv->drw_x + pPriv->dst_w,
2751.1Scjs					 pPriv->drw_y,
2761.1Scjs					 pPriv->drw_y + pPriv->dst_h);
2771.1Scjs	if (crtc)
2781.1Scjs	    RADEONWaitForVLine(pScrn, pPixmap,
2791.1Scjs				 crtc,
2801.1Scjs				 pPriv->drw_y - crtc->y,
2811.1Scjs				 (pPriv->drw_y - crtc->y) + pPriv->dst_h);
2821.4Schristos    }
2831.1Scjs
2841.1Scjs    return TRUE;
2851.1Scjs}
2861.1Scjs
2871.1Scjsstatic void
2881.1ScjsRADEONDisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
2891.1Scjs{
2901.1Scjs    RADEONInfoPtr info = RADEONPTR(pScrn);
291    PixmapPtr pPixmap = pPriv->pPixmap;
292    int dstxoff, dstyoff;
293    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
294    int nBox = REGION_NUM_RECTS(&pPriv->clip);
295
296#ifdef COMPOSITE
297    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
298    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
299#else
300    dstxoff = 0;
301    dstyoff = 0;
302#endif
303
304    if (!RADEONPrepareTexturedVideo(pScrn, pPriv))
305	return;
306
307    /*
308     * Rendering of the actual polygon is done in two different
309     * ways depending on chip generation:
310     *
311     * < R300:
312     *
313     *     These chips can render a rectangle in one pass, so
314     *     handling is pretty straight-forward.
315     *
316     * >= R300:
317     *
318     *     These chips can accept a quad, but will render it as
319     *     two triangles which results in a diagonal tear. Instead
320     *     We render a single, large triangle and use the scissor
321     *     functionality to restrict it to the desired rectangle.
322     *     Due to guardband limits on r3xx/r4xx, we can only use
323     *     the single triangle up to 2560/4021 pixels; above that we
324     *     render as a quad.
325     */
326    while (nBox) {
327	int draw_size = 3 * pPriv->vtx_count + 5;
328	int loop_boxes;
329
330	if (draw_size > radeon_cs_space_remaining(pScrn)) {
331	    radeon_cs_flush_indirect(pScrn);
332	    if (!RADEONPrepareTexturedVideo(pScrn, pPriv))
333		return;
334	}
335	loop_boxes = MIN(radeon_cs_space_remaining(pScrn) / draw_size, nBox);
336	nBox -= loop_boxes;
337
338	BEGIN_RING(loop_boxes * 3 * pPriv->vtx_count + 5);
339	OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD,
340			    loop_boxes * 3 * pPriv->vtx_count + 1));
341	if (pPriv->is_planar)
342	    OUT_RING(RADEON_CP_VC_FRMT_XY |
343		     RADEON_CP_VC_FRMT_ST0 |
344		     RADEON_CP_VC_FRMT_ST1);
345	else
346	    OUT_RING(RADEON_CP_VC_FRMT_XY |
347		     RADEON_CP_VC_FRMT_ST0);
348	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
349		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
350		 RADEON_CP_VC_CNTL_MAOS_ENABLE |
351		 RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
352		 ((loop_boxes * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
353
354	while (loop_boxes--) {
355	    float srcX, srcY, srcw, srch;
356	    int dstX, dstY, dstw, dsth;
357	    dstX = pBox->x1 + dstxoff;
358	    dstY = pBox->y1 + dstyoff;
359	    dstw = pBox->x2 - pBox->x1;
360	    dsth = pBox->y2 - pBox->y1;
361
362	    srcX = pPriv->src_x;
363	    srcX += ((pBox->x1 - pPriv->drw_x) *
364		     pPriv->src_w) / (float)pPriv->dst_w;
365	    srcY = pPriv->src_y;
366	    srcY += ((pBox->y1 - pPriv->drw_y) *
367		     pPriv->src_h) / (float)pPriv->dst_h;
368
369	    srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
370	    srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
371
372
373	    if (pPriv->is_planar) {
374		/*
375		 * Just render a rect (using three coords).
376		 */
377		VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
378			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
379			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
380		VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
381			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
382			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
383		VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
384			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
385			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
386	    } else {
387		/*
388		 * Just render a rect (using three coords).
389		 */
390		VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
391			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
392		VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
393			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
394		VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
395			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
396	    }
397
398	    pBox++;
399	}
400
401	OUT_RING_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
402	ADVANCE_RING();
403    }
404    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
405}
406
407static Bool
408R200PrepareTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
409{
410    RADEONInfoPtr info = RADEONPTR(pScrn);
411    PixmapPtr pPixmap = pPriv->pPixmap;
412    struct radeon_exa_pixmap_priv *driver_priv;
413    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
414    uint32_t txformat;
415    uint32_t txfilter, txsize, txpitch;
416    uint32_t dst_pitch, dst_format;
417    uint32_t colorpitch;
418    int pixel_shift;
419    int scissor_w = MIN(pPixmap->drawable.width, 2048) - 1;
420    int scissor_h = MIN(pPixmap->drawable.height, 2048) - 1;
421    /* note: in contrast to r300, use input biasing on uv components */
422    const float Loff = -0.0627;
423    float uvcosf, uvsinf;
424    float yco, yoff;
425    float uco[3], vco[3];
426    float bright, cont, sat;
427    int ref = pPriv->transform_index;
428    float ucscale = 0.25, vcscale = 0.25;
429    Bool needux8 = FALSE, needvx8 = FALSE;
430    int ret;
431
432    radeon_cs_space_reset_bos(info->cs);
433    radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
434
435    if (pPriv->bicubic_enabled)
436	radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
437
438    driver_priv = exaGetPixmapDriverPrivate(pPixmap);
439    radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
440
441    ret = radeon_cs_space_check(info->cs);
442    if (ret) {
443	ErrorF("Not enough RAM to hw accel xv operation\n");
444	return FALSE;
445    }
446
447    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
448
449    dst_pitch = exaGetPixmapPitch(pPixmap);
450
451    RADEON_SWITCH_TO_3D();
452
453    /* Same for R100/R200 */
454    switch (pPixmap->drawable.bitsPerPixel) {
455    case 16:
456	if (pPixmap->drawable.depth == 15)
457	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
458	else
459	    dst_format = RADEON_COLOR_FORMAT_RGB565;
460	break;
461    case 32:
462	dst_format = RADEON_COLOR_FORMAT_ARGB8888;
463	break;
464    default:
465	return FALSE;
466    }
467
468    if (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12) {
469	pPriv->is_planar = TRUE;
470	txformat = RADEON_TXFORMAT_I8;
471    } else {
472	pPriv->is_planar = FALSE;
473	if (pPriv->id == FOURCC_UYVY)
474	    txformat = RADEON_TXFORMAT_YVYU422;
475	else
476	    txformat = RADEON_TXFORMAT_VYUY422;
477    }
478
479    txformat |= RADEON_TXFORMAT_NON_POWER2;
480
481    colorpitch = dst_pitch >> pixel_shift;
482
483    if (RADEONTilingEnabled(pScrn, pPixmap))
484	colorpitch |= RADEON_COLOR_TILE_ENABLE;
485
486    BEGIN_ACCEL_RELOC(4,2);
487
488    OUT_RING_REG(RADEON_RB3D_CNTL, dst_format);
489    EMIT_WRITE_OFFSET(RADEON_RB3D_COLOROFFSET, 0, pPixmap);
490    EMIT_COLORPITCH(RADEON_RB3D_COLORPITCH, colorpitch, pPixmap);
491
492    OUT_RING_REG(RADEON_RB3D_BLENDCNTL,
493		  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
494
495    ADVANCE_RING();
496
497    txfilter =  R200_MAG_FILTER_LINEAR |
498	R200_MIN_FILTER_LINEAR |
499	R200_CLAMP_S_CLAMP_LAST |
500	R200_CLAMP_T_CLAMP_LAST;
501
502    /* contrast can cause constant overflow, clamp */
503    cont = RTFContrast(pPriv->contrast);
504    if (cont * trans[ref].RefLuma > 2.0)
505	cont = 2.0 / trans[ref].RefLuma;
506    /* brightness is only from -0.5 to 0.5 should be safe */
507    bright = RTFBrightness(pPriv->brightness);
508    /* saturation can also cause overflow, clamp */
509    sat = RTFSaturation(pPriv->saturation);
510    if (sat * trans[ref].RefBCb > 4.0)
511	sat = 4.0 / trans[ref].RefBCb;
512    uvcosf = sat * cos(RTFHue(pPriv->hue));
513    uvsinf = sat * sin(RTFHue(pPriv->hue));
514
515    yco = trans[ref].RefLuma * cont;
516    uco[0] = -trans[ref].RefRCr * uvsinf;
517    uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
518    uco[2] = trans[ref].RefBCb * uvcosf;
519    vco[0] = trans[ref].RefRCr * uvcosf;
520    vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
521    vco[2] = trans[ref].RefBCb * uvsinf;
522    yoff = Loff * yco + bright;
523
524    if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
525	needux8 = TRUE;
526	ucscale = 0.125;
527    }
528    if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
529	needvx8 = TRUE;
530	vcscale = 0.125;
531    }
532
533    if (pPriv->is_planar) {
534	/* need 2 texcoord sets (even though they are identical) due
535	   to denormalization! hw apparently can't premultiply
536	   same coord set by different texture size */
537	pPriv->vtx_count = 6;
538
539	txsize = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
540		  (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
541	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
542	txpitch -= 32;
543
544	BEGIN_ACCEL_RELOC(36, 3);
545
546	OUT_RING_REG(RADEON_PP_CNTL,
547		      RADEON_TEX_0_ENABLE | RADEON_TEX_1_ENABLE | RADEON_TEX_2_ENABLE |
548		      RADEON_TEX_BLEND_0_ENABLE |
549		      RADEON_TEX_BLEND_1_ENABLE |
550		      RADEON_TEX_BLEND_2_ENABLE);
551
552	OUT_RING_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
553	OUT_RING_REG(R200_SE_VTX_FMT_1,
554		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT) |
555		      (2 << R200_VTX_TEX1_COMP_CNT_SHIFT));
556
557	OUT_RING_REG(R200_PP_TXFILTER_0, txfilter);
558	OUT_RING_REG(R200_PP_TXFORMAT_0, txformat);
559	OUT_RING_REG(R200_PP_TXFORMAT_X_0, 0);
560	OUT_RING_REG(R200_PP_TXSIZE_0,
561		      (pPriv->w - 1) |
562		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
563	OUT_RING_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
564	OUT_TEXTURE_REG(R200_PP_TXOFFSET_0, 0, src_bo);
565
566	OUT_RING_REG(R200_PP_TXFILTER_1, txfilter);
567	OUT_RING_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
568	OUT_RING_REG(R200_PP_TXFORMAT_X_1, 0);
569	OUT_RING_REG(R200_PP_TXSIZE_1, txsize);
570	OUT_RING_REG(R200_PP_TXPITCH_1, txpitch);
571	OUT_TEXTURE_REG(R200_PP_TXOFFSET_1, pPriv->planeu_offset, src_bo);
572
573	OUT_RING_REG(R200_PP_TXFILTER_2, txfilter);
574	OUT_RING_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
575	OUT_RING_REG(R200_PP_TXFORMAT_X_2, 0);
576	OUT_RING_REG(R200_PP_TXSIZE_2, txsize);
577	OUT_RING_REG(R200_PP_TXPITCH_2, txpitch);
578	OUT_TEXTURE_REG(R200_PP_TXOFFSET_2, pPriv->planev_offset, src_bo);
579
580	/* similar to r300 code. Note the big problem is that hardware constants
581	 * are 8 bits only, representing 0.0-1.0. We can get that up (using bias
582	 * + scale) to -1.0-1.0 (but precision will suffer). AFAIK the hw actually
583	 * has 12 bits fractional precision (plus 1 sign bit, 3 range bits) but
584	 * the constants not. To get larger range can use output scale, but for
585	 * that 2.018 value we need a total scale by 8, which means the constants
586	 * really have no accuracy whatsoever (5 fractional bits only).
587	 * The only direct way to get high  precision "constants" into the fragment
588	 * pipe I know of is to use the texcoord interpolator (not color, this one
589	 * is 8 bit only too), which seems a bit expensive. We're lucky though it
590	 * seems the values we need seem to fit better than worst case (get about
591	 * 6 fractional bits for this instead of 5, at least when not correcting for
592	 * hue/saturation/contrast/brightness, which is the same as for vco - yco and
593	 * yoff get 8 fractional bits). Try to preserve as much accuracy as possible
594	 * even with non-default saturation/hue/contrast/brightness adjustments,
595	 * it gets a little crazy and ultimately precision might still be lacking.
596	 *
597	 * A higher precision (8 fractional bits) version might just put uco into
598	 * a texcoord, and calculate a new vcoconst in the shader, like so:
599	 * cohelper = {1.0, 0.0, 0.0} - shouldn't use 0.5 since not exactly representable
600	 * vco = {1.5958 - 1.0, -0.8129 + 1.0, 1.0}
601	 * vcocalc = ADD temp, bias/scale(cohelper), vco
602	 * would in total use 4 tex units, 4 instructions which seems fairly
603	 * balanced for this architecture (instead of 3 + 3 for the solution here)
604	 *
605	 * temp = MAD(yco, yuv.yyyy, yoff)
606	 * temp = MAD(uco, yuv.uuuu, temp)
607	 * result = MAD(vco, yuv.vvvv, temp)
608	 *
609	 * note first mad produces actually scalar, hence we transform
610	 * it into a dp2a to get 8 bit precision of yco instead of 7 -
611	 * That's assuming hw correctly expands consts to internal precision.
612	 * (y * 1 + y * (yco - 1) + yoff)
613	 * temp = DP2A / 2 (yco, yuv.yyyy, yoff)
614	 * temp = MAD (uco / 4, yuv.uuuu * 2, temp)
615	 * result = MAD x2 (vco / 2, yuv.vvvv, temp)
616	 *
617	 * vco, uco need bias (and hence scale too)
618	 *
619	 */
620
621	/* MAD temp0 / 2, const0.a * 2, temp0, -const0.rgb */
622	OUT_RING_REG(R200_PP_TXCBLEND_0,
623		      R200_TXC_ARG_A_TFACTOR_COLOR |
624		      R200_TXC_ARG_B_R0_COLOR |
625		      R200_TXC_ARG_C_TFACTOR_COLOR |
626		      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
627		      R200_TXC_OP_DOT2_ADD);
628	OUT_RING_REG(R200_PP_TXCBLEND2_0,
629		      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
630		      R200_TXC_SCALE_INV2 |
631		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
632	OUT_RING_REG(R200_PP_TXABLEND_0,
633		      R200_TXA_ARG_A_ZERO |
634		      R200_TXA_ARG_B_ZERO |
635		      R200_TXA_ARG_C_ZERO |
636		      R200_TXA_OP_MADD);
637	OUT_RING_REG(R200_PP_TXABLEND2_0,
638		      R200_TXA_OUTPUT_REG_NONE);
639
640	/* MAD temp0, (const1 - 0.5) * 2, (temp1 - 0.5) * 2, temp0 */
641	OUT_RING_REG(R200_PP_TXCBLEND_1,
642		      R200_TXC_ARG_A_TFACTOR_COLOR |
643		      R200_TXC_BIAS_ARG_A |
644		      R200_TXC_SCALE_ARG_A |
645		      R200_TXC_ARG_B_R1_COLOR |
646		      R200_TXC_BIAS_ARG_B |
647		      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
648		      R200_TXC_ARG_C_R0_COLOR |
649		      R200_TXC_OP_MADD);
650	OUT_RING_REG(R200_PP_TXCBLEND2_1,
651		      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
652		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
653	OUT_RING_REG(R200_PP_TXABLEND_1,
654		      R200_TXA_ARG_A_ZERO |
655		      R200_TXA_ARG_B_ZERO |
656		      R200_TXA_ARG_C_ZERO |
657		      R200_TXA_OP_MADD);
658	OUT_RING_REG(R200_PP_TXABLEND2_1,
659		      R200_TXA_OUTPUT_REG_NONE);
660
661	/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp2 - 0.5), temp0 */
662	OUT_RING_REG(R200_PP_TXCBLEND_2,
663		      R200_TXC_ARG_A_TFACTOR_COLOR |
664		      R200_TXC_BIAS_ARG_A |
665		      R200_TXC_SCALE_ARG_A |
666		      R200_TXC_ARG_B_R2_COLOR |
667		      R200_TXC_BIAS_ARG_B |
668		      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
669		      R200_TXC_ARG_C_R0_COLOR |
670		      R200_TXC_OP_MADD);
671	OUT_RING_REG(R200_PP_TXCBLEND2_2,
672		      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
673		      R200_TXC_SCALE_2X |
674		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
675	OUT_RING_REG(R200_PP_TXABLEND_2,
676		      R200_TXA_ARG_A_ZERO |
677		      R200_TXA_ARG_B_ZERO |
678		      R200_TXA_ARG_C_ZERO |
679		      R200_TXA_COMP_ARG_C |
680		      R200_TXA_OP_MADD);
681	OUT_RING_REG(R200_PP_TXABLEND2_2,
682		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
683
684	/* shader constants */
685	OUT_RING_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
686						      yco > 1.0 ? yco - 1.0: yco,
687						      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
688						      0.0));
689	OUT_RING_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
690						      uco[1] * ucscale + 0.5, /* or [-2, 2] */
691						      uco[2] * ucscale + 0.5,
692						      0.0));
693	OUT_RING_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
694						      vco[1] * vcscale + 0.5, /* or [-4, 4] */
695						      vco[2] * vcscale + 0.5,
696						      0.0));
697
698	ADVANCE_RING();
699    } else {
700	pPriv->vtx_count = 4;
701
702	BEGIN_ACCEL_RELOC(24, 1);
703
704	OUT_RING_REG(RADEON_PP_CNTL,
705		      RADEON_TEX_0_ENABLE |
706		      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
707		      RADEON_TEX_BLEND_2_ENABLE);
708
709	OUT_RING_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
710	OUT_RING_REG(R200_SE_VTX_FMT_1,
711		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
712
713	OUT_RING_REG(R200_PP_TXFILTER_0, txfilter);
714	OUT_RING_REG(R200_PP_TXFORMAT_0, txformat);
715	OUT_RING_REG(R200_PP_TXFORMAT_X_0, 0);
716	OUT_RING_REG(R200_PP_TXSIZE_0,
717		      (pPriv->w - 1) |
718		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
719	OUT_RING_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
720	OUT_TEXTURE_REG(R200_PP_TXOFFSET_0, 0, src_bo);
721
722	/* MAD temp1 / 2, const0.a * 2, temp0.ggg, -const0.rgb */
723	OUT_RING_REG(R200_PP_TXCBLEND_0,
724		      R200_TXC_ARG_A_TFACTOR_COLOR |
725		      R200_TXC_ARG_B_R0_COLOR |
726		      R200_TXC_ARG_C_TFACTOR_COLOR |
727		      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
728		      R200_TXC_OP_DOT2_ADD);
729	OUT_RING_REG(R200_PP_TXCBLEND2_0,
730		      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
731		      R200_TXC_SCALE_INV2 |
732		      (R200_TXC_REPL_GREEN << R200_TXC_REPL_ARG_B_SHIFT) |
733		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
734	OUT_RING_REG(R200_PP_TXABLEND_0,
735		      R200_TXA_ARG_A_ZERO |
736		      R200_TXA_ARG_B_ZERO |
737		      R200_TXA_ARG_C_ZERO |
738		      R200_TXA_OP_MADD);
739	OUT_RING_REG(R200_PP_TXABLEND2_0,
740		      R200_TXA_OUTPUT_REG_NONE);
741
742	/* MAD temp1, (const1 - 0.5) * 2, (temp0.rrr - 0.5) * 2, temp1 */
743	OUT_RING_REG(R200_PP_TXCBLEND_1,
744		      R200_TXC_ARG_A_TFACTOR_COLOR |
745		      R200_TXC_BIAS_ARG_A |
746		      R200_TXC_SCALE_ARG_A |
747		      R200_TXC_ARG_B_R0_COLOR |
748		      R200_TXC_BIAS_ARG_B |
749		      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
750		      R200_TXC_ARG_C_R1_COLOR |
751		      R200_TXC_OP_MADD);
752	OUT_RING_REG(R200_PP_TXCBLEND2_1,
753		      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
754		      (R200_TXC_REPL_BLUE << R200_TXC_REPL_ARG_B_SHIFT) |
755		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
756	OUT_RING_REG(R200_PP_TXABLEND_1,
757		      R200_TXA_ARG_A_ZERO |
758		      R200_TXA_ARG_B_ZERO |
759		      R200_TXA_ARG_C_ZERO |
760		      R200_TXA_OP_MADD);
761	OUT_RING_REG(R200_PP_TXABLEND2_1,
762		      R200_TXA_OUTPUT_REG_NONE);
763
764	/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp0.bbb - 0.5), temp1 */
765	OUT_RING_REG(R200_PP_TXCBLEND_2,
766		      R200_TXC_ARG_A_TFACTOR_COLOR |
767		      R200_TXC_BIAS_ARG_A |
768		      R200_TXC_SCALE_ARG_A |
769		      R200_TXC_ARG_B_R0_COLOR |
770		      R200_TXC_BIAS_ARG_B |
771		      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
772		      R200_TXC_ARG_C_R1_COLOR |
773		      R200_TXC_OP_MADD);
774	OUT_RING_REG(R200_PP_TXCBLEND2_2,
775		      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
776		      R200_TXC_SCALE_2X |
777		      (R200_TXC_REPL_RED << R200_TXC_REPL_ARG_B_SHIFT) |
778		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
779	OUT_RING_REG(R200_PP_TXABLEND_2,
780		      R200_TXA_ARG_A_ZERO |
781		      R200_TXA_ARG_B_ZERO |
782		      R200_TXA_ARG_C_ZERO |
783		      R200_TXA_COMP_ARG_C |
784		      R200_TXA_OP_MADD);
785	OUT_RING_REG(R200_PP_TXABLEND2_2,
786		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
787
788	/* shader constants */
789	OUT_RING_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
790						      yco > 1.0 ? yco - 1.0: yco,
791						      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
792						      0.0));
793	OUT_RING_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
794						      uco[1] * ucscale + 0.5, /* or [-2, 2] */
795						      uco[2] * ucscale + 0.5,
796						      0.0));
797	OUT_RING_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
798						      vco[1] * vcscale + 0.5, /* or [-4, 4] */
799						      vco[2] * vcscale + 0.5,
800						      0.0));
801
802	ADVANCE_RING();
803    }
804
805    BEGIN_RING(2*2);
806    OUT_RING_REG(RADEON_RE_TOP_LEFT, 0);
807    OUT_RING_REG(RADEON_RE_WIDTH_HEIGHT, ((scissor_w << RADEON_RE_WIDTH_SHIFT) |
808					   (scissor_h << RADEON_RE_HEIGHT_SHIFT)));
809    ADVANCE_RING();
810
811    if (pPriv->vsync) {
812	xf86CrtcPtr crtc;
813	if (pPriv->desired_crtc)
814	    crtc = pPriv->desired_crtc;
815	else
816	    crtc = radeon_pick_best_crtc(pScrn, FALSE,
817					 pPriv->drw_x,
818					 pPriv->drw_x + pPriv->dst_w,
819					 pPriv->drw_y,
820					 pPriv->drw_y + pPriv->dst_h);
821	if (crtc)
822	    RADEONWaitForVLine(pScrn, pPixmap,
823				 crtc,
824				 pPriv->drw_y - crtc->y,
825				 (pPriv->drw_y - crtc->y) + pPriv->dst_h);
826    }
827
828    return TRUE;
829}
830
/*
 * R200DisplayTexturedVideo - emit the draw commands for one textured-video
 * frame.
 *
 * Runs R200PrepareTexturedVideo() and returns without drawing if it fails.
 * Otherwise walks the boxes of pPriv->clip and, for each one, emits three
 * vertices of a RECT_LIST primitive whose texture coordinates select the
 * matching sub-rectangle of the source image.  Boxes are batched into as
 * few draw packets as the remaining command-stream space allows.  When the
 * loop finishes, pPriv->clip is reported as damage on pPriv->pDraw; the
 * early returns skip that report.
 *
 * pScrn: passed through to the prepare / flush / space-query helpers.
 * pPriv: port state; the fields read here are pPixmap, clip, vtx_count,
 *        is_planar, src_x/src_y/src_w/src_h, drw_x/drw_y, dst_w/dst_h,
 *        w/h and pDraw.
 */
831static void
832R200DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
833{
    /* NOTE(review): info is not named below; presumably the ring macros use it. */
834    RADEONInfoPtr info = RADEONPTR(pScrn);
835    PixmapPtr pPixmap = pPriv->pPixmap;
836    int dstxoff, dstyoff;
837    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
838    int nBox = REGION_NUM_RECTS(&pPriv->clip);
839
    /*
     * Offset added to every clip-box origin to get destination coordinates.
     * Presumably this converts screen coordinates to pixmap coordinates for
     * redirected windows -- TODO confirm.
     */
840#ifdef COMPOSITE
841    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
842    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
843#else
844    dstxoff = 0;
845    dstyoff = 0;
846#endif
847
    /* Nothing is drawn (and no damage is reported) if the prepare step fails. */
848    if (!R200PrepareTexturedVideo(pScrn, pPriv))
849	return;
850
851    /*
852     * Rendering of the actual polygon is done in two different
853     * ways depending on chip generation:
854     *
855     * < R300:
856     *
857     *     These chips can render a rectangle in one pass, so
858     *     handling is pretty straight-forward.
859     *
860     * >= R300:
861     *
862     *     These chips can accept a quad, but will render it as
863     *     two triangles which results in a diagonal tear. Instead
864     *     we render a single, large triangle and use the scissor
865     *     functionality to restrict it to the desired rectangle.
866     *     Due to guardband limits on r3xx/r4xx, we can only use
867     *     the single triangle up to 2560/4021 pixels; above that we
868     *     render as a quad.
     *
     * NOTE(review): the code below only ever emits a RECT_LIST with three
     * vertices per box (the "< R300" case).  The ">= R300" paragraph
     * appears to describe a path that is not taken in this function --
     * confirm before relying on it.
869     */
870
871    while (nBox) {
	/*
	 * Dwords needed to draw a single box: three vertices of vtx_count
	 * dwords each, plus 4 -- presumably 2 for the packet header and
	 * VC_CNTL word and 2 for the trailing WAIT_UNTIL register write.
	 */
872	int draw_size = 3 * pPriv->vtx_count + 4;
873	int loop_boxes;
874
	/*
	 * Not even one box fits: flush, then run the prepare step again
	 * before drawing (presumably the state has to be re-emitted after
	 * a flush -- TODO confirm).
	 */
875	if (draw_size > radeon_cs_space_remaining(pScrn)) {
876	    radeon_cs_flush_indirect(pScrn);
877	    if (!R200PrepareTexturedVideo(pScrn, pPriv))
878		return;
879	}
	/* Boxes in this batch: what fits in the remaining space, capped at nBox. */
880	loop_boxes = MIN(radeon_cs_space_remaining(pScrn) / draw_size, nBox);
881	nBox -= loop_boxes;
882
	/* Reserve all vertices of the batch plus the 4 overhead dwords (once). */
883	BEGIN_RING(loop_boxes * 3 * pPriv->vtx_count + 4);
884	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
885			    loop_boxes * 3 * pPriv->vtx_count));
	/* Rect list, vertices taken from the ring, three vertices per box. */
886	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
887		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
888		 ((loop_boxes * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
889
890	while (loop_boxes--) {
891	    float srcX, srcY, srcw, srch;
892	    int dstX, dstY, dstw, dsth;
	    /* Destination rectangle: the clip box shifted by the offsets above. */
893	    dstX = pBox->x1 + dstxoff;
894	    dstY = pBox->y1 + dstyoff;
895	    dstw = pBox->x2 - pBox->x1;
896	    dsth = pBox->y2 - pBox->y1;
897
	    /*
	     * Source origin: src_x/src_y plus the box's offset from
	     * drw_x/drw_y, scaled by src size over dst size.
	     */
898	    srcX = pPriv->src_x;
899	    srcX += ((pBox->x1 - pPriv->drw_x) *
900		     pPriv->src_w) / (float)pPriv->dst_w;
901	    srcY = pPriv->src_y;
902	    srcY += ((pBox->y1 - pPriv->drw_y) *
903		     pPriv->src_h) / (float)pPriv->dst_h;
904
	    /* Source extent: the box size scaled by the same ratios. */
905	    srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
906	    srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
907
	    /*
	     * Vertices are emitted in the order (x1, y2), (x2, y2), (x2, y1).
	     * Texture coordinates are the source position divided by
	     * pPriv->w / pPriv->h.
	     */
908	    if (pPriv->is_planar) {
909		/*
910		 * Just render a rect (using three coords).
		 *
		 * Each vertex carries the same source coordinate pair twice
		 * (six dwords per vertex) -- presumably a second texcoord
		 * set for the extra plane textures; TODO confirm.
911		 */
912		VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
913			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
914			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
915		VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
916			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
917			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
918		VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
919			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
920			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
921	    } else {
922		/*
923		 * Just render a rect (using three coords).
		 *
		 * One destination pair and one source pair per vertex.
924		 */
925		VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
926			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
927		VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
928			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
929		VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
930			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
931	    }
932
933	    pBox++;
934	}
935
	/* Close the batch with a WAIT_UNTIL write of RADEON_WAIT_3D_IDLECLEAN. */
936	OUT_RING_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
937	ADVANCE_RING();
938    }
939
    /* Report the whole clip region as damaged on the target drawable. */
940    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
941}
942
943static Bool
944R300PrepareTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
945{
946    RADEONInfoPtr info = RADEONPTR(pScrn);
947    PixmapPtr pPixmap = pPriv->pPixmap;
948    struct radeon_exa_pixmap_priv *driver_priv;
949    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
950    uint32_t txfilter, txformat0, txformat1, txpitch;
951    uint32_t dst_pitch, dst_format;
952    uint32_t txenable, colorpitch;
953    uint32_t output_fmt;
954    int pixel_shift;
955    int ret;
956
957    radeon_cs_space_reset_bos(info->cs);
958    radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
959
960    if (pPriv->bicubic_enabled)
961	radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
962
963    driver_priv = exaGetPixmapDriverPrivate(pPixmap);
964    radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
965
966    ret = radeon_cs_space_check(info->cs);
967    if (ret) {
968	ErrorF("Not enough RAM to hw accel xv operation\n");
969	return FALSE;
970    }
971
972    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
973
974    dst_pitch = exaGetPixmapPitch(pPixmap);
975    RADEON_SWITCH_TO_3D();
976
977    if (pPriv->bicubic_enabled)
978	pPriv->vtx_count = 6;
979    else
980	pPriv->vtx_count = 4;
981
982    switch (pPixmap->drawable.bitsPerPixel) {
983    case 16:
984	if (pPixmap->drawable.depth == 15)
985	    dst_format = R300_COLORFORMAT_ARGB1555;
986	else
987	    dst_format = R300_COLORFORMAT_RGB565;
988	break;
989    case 32:
990	dst_format = R300_COLORFORMAT_ARGB8888;
991	break;
992    default:
993	return FALSE;
994    }
995
996    output_fmt = (R300_OUT_FMT_C4_8 |
997		  R300_OUT_FMT_C0_SEL_BLUE |
998		  R300_OUT_FMT_C1_SEL_GREEN |
999		  R300_OUT_FMT_C2_SEL_RED |
1000		  R300_OUT_FMT_C3_SEL_ALPHA);
1001
1002    colorpitch = dst_pitch >> pixel_shift;
1003    colorpitch |= dst_format;
1004
1005    if (RADEONTilingEnabled(pScrn, pPixmap))
1006	colorpitch |= R300_COLORTILE;
1007
1008
1009    if (((pPriv->bicubic_state == BICUBIC_OFF)) &&
1010	(pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12))
1011	pPriv->is_planar = TRUE;
1012    else
1013	pPriv->is_planar = FALSE;
1014
1015    if (pPriv->is_planar) {
1016	txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
1017	txpitch = pPriv->src_pitch;
1018    } else {
1019	if (pPriv->id == FOURCC_UYVY)
1020	    txformat1 = R300_TX_FORMAT_YVYU422;
1021	else
1022	    txformat1 = R300_TX_FORMAT_VYUY422;
1023
1024	if (pPriv->bicubic_state != BICUBIC_OFF)
1025	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
1026
1027	/* pitch is in pixels */
1028	txpitch = pPriv->src_pitch / 2;
1029    }
1030    txpitch -= 1;
1031
1032    txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
1033		 (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
1034		 R300_TXPITCH_EN);
1035
1036    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
1037		R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
1038		R300_TX_MAG_FILTER_LINEAR |
1039		R300_TX_MIN_FILTER_LINEAR |
1040		(0 << R300_TX_ID_SHIFT));
1041
1042    BEGIN_ACCEL_RELOC(6, 1);
1043    OUT_RING_REG(R300_TX_FILTER0_0, txfilter);
1044    OUT_RING_REG(R300_TX_FILTER1_0, 0);
1045    OUT_RING_REG(R300_TX_FORMAT0_0, txformat0);
1046    if (pPriv->is_planar)
1047	OUT_RING_REG(R300_TX_FORMAT1_0, txformat1 | R300_TX_FORMAT_CACHE_HALF_REGION_0);
1048    else
1049	OUT_RING_REG(R300_TX_FORMAT1_0, txformat1);
1050    OUT_RING_REG(R300_TX_FORMAT2_0, txpitch);
1051    OUT_TEXTURE_REG(R300_TX_OFFSET_0, 0, src_bo);
1052    ADVANCE_RING();
1053
1054    txenable = R300_TEX_0_ENABLE;
1055
1056    if (pPriv->is_planar) {
1057	txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
1058		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
1059		     R300_TXPITCH_EN);
1060	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
1061	txpitch -= 1;
1062	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
1063		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
1064		    R300_TX_MIN_FILTER_LINEAR |
1065		    R300_TX_MAG_FILTER_LINEAR);
1066
1067	BEGIN_ACCEL_RELOC(12, 2);
1068	OUT_RING_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
1069	OUT_RING_REG(R300_TX_FILTER1_1, 0);
1070	OUT_RING_REG(R300_TX_FORMAT0_1, txformat0);
1071	OUT_RING_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2);
1072	OUT_RING_REG(R300_TX_FORMAT2_1, txpitch);
1073	OUT_TEXTURE_REG(R300_TX_OFFSET_1, pPriv->planeu_offset, src_bo);
1074	OUT_RING_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
1075	OUT_RING_REG(R300_TX_FILTER1_2, 0);
1076	OUT_RING_REG(R300_TX_FORMAT0_2, txformat0);
1077	OUT_RING_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3);
1078	OUT_RING_REG(R300_TX_FORMAT2_2, txpitch);
1079	OUT_TEXTURE_REG(R300_TX_OFFSET_2, pPriv->planev_offset, src_bo);
1080	ADVANCE_RING();
1081	txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
1082    }
1083
1084    if (pPriv->bicubic_enabled) {
1085	/* Size is 128x1 */
1086	txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
1087		     (0x0 << R300_TXHEIGHT_SHIFT) |
1088		     R300_TXPITCH_EN);
1089	/* Format is four 16-bit float components per texel (FL_R16G16B16A16) */
1090	txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
1091	/* Pitch is 127 (128-1) */
1092	txpitch = 0x7f;
1093	/* Tex filter */
1094	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
1095		    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
1096		    R300_TX_MIN_FILTER_NEAREST |
1097		    R300_TX_MAG_FILTER_NEAREST |
1098		    (1 << R300_TX_ID_SHIFT));
1099
1100	BEGIN_ACCEL_RELOC(6, 1);
1101	OUT_RING_REG(R300_TX_FILTER0_1, txfilter);
1102	OUT_RING_REG(R300_TX_FILTER1_1, 0);
1103	OUT_RING_REG(R300_TX_FORMAT0_1, txformat0);
1104	OUT_RING_REG(R300_TX_FORMAT1_1, txformat1);
1105	OUT_RING_REG(R300_TX_FORMAT2_1, txpitch);
1106	OUT_TEXTURE_REG(R300_TX_OFFSET_1, 0, info->bicubic_bo);
1107	ADVANCE_RING();
1108
1109	/* Enable tex 1 */
1110	txenable |= R300_TEX_1_ENABLE;
1111    }
1112
1113    /* setup the VAP */
1114    if (info->accel_state->has_tcl) {
1115	if (pPriv->bicubic_enabled)
1116	    BEGIN_RING(2*7);
1117	else
1118	    BEGIN_RING(2*6);
1119    } else {
1120	if (pPriv->bicubic_enabled)
1121	    BEGIN_RING(2*5);
1122	else
1123	    BEGIN_RING(2*4);
1124    }
1125
1126    /* These registers define the number, type, and location of data submitted
1127     * to the PVS unit or GA input (when PVS is disabled).
1128     * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
1129     * enabled.  This memory provides the inputs to the vertex shader program
1130     * and ordering is not important.  When PVS/TCL is disabled, this field maps
1131     * directly to the GA input memory and the order is significant.  In
1132     * PVS_BYPASS mode the order is as follows:
1133     * Position
1134     * Point Size
1135     * Color 0-3
1136     * Textures 0-7
1137     * Fog
1138     */
1139    if (pPriv->bicubic_enabled) {
1140	OUT_RING_REG(R300_VAP_PROG_STREAM_CNTL_0,
1141		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
1142		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
1143		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
1144		       R300_SIGNED_0 |
1145		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
1146		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
1147		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
1148		       R300_SIGNED_1));
1149	OUT_RING_REG(R300_VAP_PROG_STREAM_CNTL_1,
1150		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
1151		       (0 << R300_SKIP_DWORDS_2_SHIFT) |
1152		       (7 << R300_DST_VEC_LOC_2_SHIFT) |
1153		       R300_LAST_VEC_2 |
1154		       R300_SIGNED_2));
1155    } else {
1156	OUT_RING_REG(R300_VAP_PROG_STREAM_CNTL_0,
1157		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
1158		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
1159		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
1160		       R300_SIGNED_0 |
1161		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
1162		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
1163		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
1164		       R300_LAST_VEC_1 |
1165		       R300_SIGNED_1));
1166    }
1167
1168    /* load the vertex shader
1169     * We pre-load vertex programs in RADEONInit3DEngine():
1170     * - exa
1171     * - Xv
1172     * - Xv bicubic
1173     * Here we select the offset of the vertex program we want to use
1174     */
1175    if (info->accel_state->has_tcl) {
1176	if (pPriv->bicubic_enabled) {
1177	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_0,
1178			  ((11 << R300_PVS_FIRST_INST_SHIFT) |
1179			   (13 << R300_PVS_XYZW_VALID_INST_SHIFT) |
1180			   (13 << R300_PVS_LAST_INST_SHIFT)));
1181	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_1,
1182			  (13 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
1183	} else {
1184	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_0,
1185			  ((9 << R300_PVS_FIRST_INST_SHIFT) |
1186			   (10 << R300_PVS_XYZW_VALID_INST_SHIFT) |
1187			   (10 << R300_PVS_LAST_INST_SHIFT)));
1188	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_1,
1189			  (10 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
1190	}
1191    }
1192
1193    /* Position and one set of 2 texture coordinates */
1194    OUT_RING_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
1195    if (pPriv->bicubic_enabled)
1196	OUT_RING_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
1197					       (2 << R300_TEX_1_COMP_CNT_SHIFT)));
1198    else
1199	OUT_RING_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
1200
1201    OUT_RING_REG(R300_US_OUT_FMT_0, output_fmt);
1202    ADVANCE_RING();
1203
1204    /* setup pixel shader */
1205    if (pPriv->bicubic_state != BICUBIC_OFF) {
1206	if (pPriv->bicubic_enabled) {
1207	    BEGIN_RING(2*79);
1208
1209	    /* 4 components: 2 for tex0 and 2 for tex1 */
1210	    OUT_RING_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1211					  R300_RS_COUNT_HIRES_EN));
1212
1213	    /* R300_INST_COUNT_RS - highest RS instruction used */
1214	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1));
1215
1216	    /* Pixel stack frame size. */
1217	    OUT_RING_REG(R300_US_PIXSIZE, 5);
1218
1219	    /* Indirection levels */
1220	    OUT_RING_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) |
1221					   R300_FIRST_TEX));
1222
1223	    /* Set nodes. */
1224	    OUT_RING_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1225						R300_ALU_CODE_SIZE(14) |
1226						R300_TEX_CODE_OFFSET(0) |
1227						R300_TEX_CODE_SIZE(6)));
1228
1229	    /* Nodes are allocated highest first, but executed lowest first */
1230	    OUT_RING_REG(R300_US_CODE_ADDR_0, 0);
1231	    OUT_RING_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) |
1232						R300_ALU_SIZE(0) |
1233						R300_TEX_START(0) |
1234						R300_TEX_SIZE(0)));
1235	    OUT_RING_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) |
1236						R300_ALU_SIZE(9) |
1237						R300_TEX_START(1) |
1238						R300_TEX_SIZE(0)));
1239	    OUT_RING_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) |
1240						R300_ALU_SIZE(2) |
1241						R300_TEX_START(2) |
1242						R300_TEX_SIZE(3) |
1243						R300_RGBA_OUT));
1244
1245	    /* ** BICUBIC FP ** */
1246
1247	    /* texcoord0 => temp0
1248	     * texcoord1 => temp1 */
1249
1250	    // first node
1251	    /* TEX temp2, temp1.rrr0, tex1, 1D */
1252	    OUT_RING_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) |
1253						R300_TEX_ID(1) |
1254						R300_TEX_SRC_ADDR(1) |
1255						R300_TEX_DST_ADDR(2)));
1256
1257	    /* MOV temp1.r, temp1.ggg0 */
1258	    OUT_RING_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1259						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1260						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1261						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1262	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) |
1263						    R300_ALU_RGB_ADDRD(1) |
1264						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
1265	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1266						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1267						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1268						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1269	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) |
1270						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1271
1272
1273	    // second node
1274	    /* TEX temp1, temp1, tex1, 1D */
1275	    OUT_RING_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) |
1276						R300_TEX_ID(1) |
1277						R300_TEX_SRC_ADDR(1) |
1278						R300_TEX_DST_ADDR(1)));
1279
1280	    /* MUL temp3.rg, temp2.ggg0, const0.rgb0 */
1281	    OUT_RING_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1282						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1283						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1284						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1285	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) |
1286						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
1287						    R300_ALU_RGB_ADDRD(3) |
1288						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1289	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1290						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1291						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1292						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1293	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) |
1294						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1295
1296
1297	    /* MUL temp2.rg, temp2.rrr0, const0.rgb */
1298	    OUT_RING_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1299						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1300						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1301						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1302	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) |
1303						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
1304						    R300_ALU_RGB_ADDRD(2) |
1305						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1306	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1307						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1308						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1309						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1310	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) |
1311						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1312
1313	    /* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */
1314	    OUT_RING_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1315						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1316						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1317						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1318	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) |
1319						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1320						    R300_ALU_RGB_ADDR2(3) |
1321						    R300_ALU_RGB_ADDRD(4) |
1322						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1323	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1324						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1325						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1326						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1327	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) |
1328						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1329
1330	    /* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */
1331	    OUT_RING_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1332						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1333						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1334						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1335	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) |
1336						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1337						    R300_ALU_RGB_ADDR2(2) |
1338						    R300_ALU_RGB_ADDRD(5) |
1339						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1340	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1341						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1342						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1343						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1344	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) |
1345						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1346
1347	    /* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */
1348	    OUT_RING_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1349						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1350						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1351						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1352	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) |
1353						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1354						    R300_ALU_RGB_ADDR2(3) |
1355						    R300_ALU_RGB_ADDRD(3) |
1356						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1357	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1358						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1359						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1360						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1361	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) |
1362						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1363
1364	    /* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */
1365	    OUT_RING_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1366						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1367						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1368						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1369	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) |
1370						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1371						    R300_ALU_RGB_ADDR2(2) |
1372						    R300_ALU_RGB_ADDRD(1) |
1373						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1374	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1375						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1376						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1377						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1378	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) |
1379						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1380
1381	    /* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */
1382	    OUT_RING_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1383						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1384						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1385						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1386	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
1387						    R300_ALU_RGB_ADDR2(1) |
1388						    R300_ALU_RGB_ADDRD(1) |
1389						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1390	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1391						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1392						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1393						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1394	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) |
1395						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1396
1397	    /* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */
1398	    OUT_RING_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1399						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1400						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1401						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1402	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
1403						    R300_ALU_RGB_ADDR2(3) |
1404						    R300_ALU_RGB_ADDRD(2) |
1405						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1406	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1407						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1408						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1409						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1410	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) |
1411						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1412
1413	    /* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */
1414	    OUT_RING_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1415						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1416						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1417						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1418	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
1419						    R300_ALU_RGB_ADDR2(5) |
1420						    R300_ALU_RGB_ADDRD(3) |
1421						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1422	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1423						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1424						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1425						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1426	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) |
1427						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1428
1429	    /* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */
1430	    OUT_RING_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1431						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1432						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1433						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1434	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) |
1435						     R300_ALU_RGB_ADDR2(4) |
1436						     R300_ALU_RGB_ADDRD(0) |
1437						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1438	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1439						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1440						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1441						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1442	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) |
1443						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1444
1445
1446	    // third node
1447	    /* TEX temp4, temp1.rg--, tex0, 1D */
1448	    OUT_RING_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) |
1449						R300_TEX_ID(0) |
1450						R300_TEX_SRC_ADDR(1) |
1451						R300_TEX_DST_ADDR(4)));
1452
1453	    /* TEX temp3, temp3.rg--, tex0, 1D */
1454	    OUT_RING_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) |
1455						R300_TEX_ID(0) |
1456						R300_TEX_SRC_ADDR(3) |
1457						R300_TEX_DST_ADDR(3)));
1458
1459	    /* TEX temp5, temp2.rg--, tex0, 1D */
1460	    OUT_RING_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) |
1461						R300_TEX_ID(0) |
1462						R300_TEX_SRC_ADDR(2) |
1463						R300_TEX_DST_ADDR(5)));
1464
1465	    /* TEX temp0, temp0.rg--, tex0, 1D */
1466	    OUT_RING_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) |
1467						R300_TEX_ID(0) |
1468						R300_TEX_SRC_ADDR(0) |
1469						R300_TEX_DST_ADDR(0)));
1470
1471	    /* LRP temp3, temp1.bbbb, temp4, temp3 ->
1472	     * - PRESUB temps, temp4 - temp3
1473	     * - MAD temp3, temp1.bbbb, temps, temp3 */
1474	    OUT_RING_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1475						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1476						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1477						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1478						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
1479	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) |
1480						     R300_ALU_RGB_ADDR1(4) |
1481						     R300_ALU_RGB_ADDR2(1) |
1482						     R300_ALU_RGB_ADDRD(3) |
1483						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1484	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1485						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1486						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1487						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1488	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) |
1489						       R300_ALU_ALPHA_ADDR1(4) |
1490						       R300_ALU_ALPHA_ADDR2(1) |
1491						       R300_ALU_ALPHA_ADDRD(3) |
1492						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
1493
1494	    /* LRP temp0, temp1.bbbb, temp5, temp0 ->
1495	     * - PRESUB temps, temp5 - temp0
1496	     * - MAD temp0, temp1.bbbb, temps, temp0 */
1497	    OUT_RING_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1498						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1499						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1500						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1501						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) |
1502						     R300_ALU_RGB_INSERT_NOP));
1503	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) |
1504						     R300_ALU_RGB_ADDR1(5) |
1505						     R300_ALU_RGB_ADDR2(1) |
1506						     R300_ALU_RGB_ADDRD(0) |
1507						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1508	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1509						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1510						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1511						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1512	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) |
1513						       R300_ALU_ALPHA_ADDR1(5) |
1514						       R300_ALU_ALPHA_ADDR2(1) |
1515						       R300_ALU_ALPHA_ADDRD(0) |
1516						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
1517
1518	    /* LRP output, temp2.bbbb, temp3, temp0 ->
1519	     * - PRESUB temps, temp3 - temp0
1520	     * - MAD output, temp2.bbbb, temps, temp0 */
1521	    OUT_RING_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1522						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1523						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1524						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1525						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
1526	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) |
1527						     R300_ALU_RGB_ADDR1(3) |
1528						     R300_ALU_RGB_ADDR2(2) |
1529						     R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
1530	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1531						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1532						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1533						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1534	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) |
1535						       R300_ALU_ALPHA_ADDR1(3) |
1536						       R300_ALU_ALPHA_ADDR2(2) |
1537						       R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A)));
1538
1539	    /* Shader constants. */
1540	    OUT_RING_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w));
1541	    OUT_RING_REG(R300_US_ALU_CONST_G(0), 0);
1542	    OUT_RING_REG(R300_US_ALU_CONST_B(0), 0);
1543	    OUT_RING_REG(R300_US_ALU_CONST_A(0), 0);
1544
1545	    OUT_RING_REG(R300_US_ALU_CONST_R(1), 0);
1546	    OUT_RING_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h));
1547	    OUT_RING_REG(R300_US_ALU_CONST_B(1), 0);
1548	    OUT_RING_REG(R300_US_ALU_CONST_A(1), 0);
1549
1550	    ADVANCE_RING();
1551	} else {
1552	    BEGIN_RING(2*11);
1553	    /* 2 components: 2 for tex0 */
1554	    OUT_RING_REG(R300_RS_COUNT,
1555                          ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1556                           R300_RS_COUNT_HIRES_EN));
1557	    /* R300_INST_COUNT_RS - highest RS instruction used */
1558	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
1559
1560	    OUT_RING_REG(R300_US_PIXSIZE, 0); /* highest temp used */
1561
1562	    /* Indirection levels */
1563	    OUT_RING_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
1564					   R300_FIRST_TEX));
1565
1566	    OUT_RING_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1567						R300_ALU_CODE_SIZE(1) |
1568						R300_TEX_CODE_OFFSET(0) |
1569						R300_TEX_CODE_SIZE(1)));
1570
1571	    OUT_RING_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
1572						R300_ALU_SIZE(0) |
1573						R300_TEX_START(0) |
1574						R300_TEX_SIZE(0) |
1575						R300_RGBA_OUT));
1576
1577	    /* tex inst */
1578	    OUT_RING_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
1579					       R300_TEX_DST_ADDR(0) |
1580					       R300_TEX_ID(0) |
1581					       R300_TEX_INST(R300_TEX_INST_LD)));
1582
1583	    /* ALU inst */
1584	    /* RGB */
1585	    OUT_RING_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) |
1586                                                   R300_ALU_RGB_ADDR1(0) |
1587                                                   R300_ALU_RGB_ADDR2(0) |
1588                                                   R300_ALU_RGB_ADDRD(0) |
1589                                                   R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R |
1590								       R300_ALU_RGB_MASK_G |
1591								       R300_ALU_RGB_MASK_B)) |
1592                                                   R300_ALU_RGB_TARGET_A));
1593	    OUT_RING_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1594                                                   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1595                                                   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1596						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1597                                                   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
1598                                                   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1599                                                   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1600                                                   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
1601                                                   R300_ALU_RGB_CLAMP));
1602	    /* Alpha */
1603	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) |
1604						     R300_ALU_ALPHA_ADDR1(0) |
1605						     R300_ALU_ALPHA_ADDR2(0) |
1606						     R300_ALU_ALPHA_ADDRD(0) |
1607						     R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
1608						     R300_ALU_ALPHA_TARGET_A |
1609						     R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE)));
1610	    OUT_RING_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) |
1611						     R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) |
1612						     R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) |
1613						     R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) |
1614						     R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) |
1615						     R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) |
1616						     R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1617						     R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) |
1618						     R300_ALU_ALPHA_CLAMP));
1619	    ADVANCE_RING();
1620	}
1621    } else {
1622	/*
1623	 * y' = y - .0625
1624	 * u' = u - .5
1625	 * v' = v - .5;
1626	 *
1627	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
1628	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
1629	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
1630	 *
1631	 * DP3 might look like the straightforward solution
1632	 * but we'd need to move the texture yuv values in
1633	 * the same reg for this to work. Therefore use MADs.
1634	 * Brightness just adds to the off constant.
1635	 * Contrast is multiplication of luminance.
1636	 * Saturation and hue change the u and v coeffs.
1637	 * Default values (before adjustments - depend on colorspace):
1638	 * yco = 1.1643
1639	 * uco = 0, -0.39173, 2.017
1640	 * vco = 1.5958, -0.8129, 0
1641	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
1642	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
1643	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
1644	 *
1645	 * temp = MAD(yco, yuv.yyyy, off)
1646	 * temp = MAD(uco, yuv.uuuu, temp)
1647	 * result = MAD(vco, yuv.vvvv, temp)
1648	 */
1649	/* TODO: don't recalc consts always */
1650	const float Loff = -0.0627;
1651	const float Coff = -0.502;
1652	float uvcosf, uvsinf;
1653	float yco;
1654	float uco[3], vco[3], off[3];
1655	float bright, cont, gamma;
1656	int ref = pPriv->transform_index;
1657	Bool needgamma = FALSE;
1658
1659	cont = RTFContrast(pPriv->contrast);
1660	bright = RTFBrightness(pPriv->brightness);
1661	gamma = (float)pPriv->gamma / 1000.0;
1662	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
1663	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
1664	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
1665
1666	yco = trans[ref].RefLuma * cont;
1667	uco[0] = -trans[ref].RefRCr * uvsinf;
1668	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
1669	uco[2] = trans[ref].RefBCb * uvcosf;
1670	vco[0] = trans[ref].RefRCr * uvcosf;
1671	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
1672	vco[2] = trans[ref].RefBCb * uvsinf;
1673	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
1674	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
1675	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
1676
1677	if (gamma != 1.0) {
1678	    needgamma = TRUE;
1679	    /* note: gamma correction is out = in ^ gamma;
1680	       gpu can only do LG2/EX2 therefore we transform into
1681	       in ^ gamma = 2 ^ (log2(in) * gamma).
1682	       Lots of scalar ops, unfortunately (better solution?) -
1683	       without gamma that's 3 inst, with gamma it's 10...
1684	       could use different gamma factors per channel,
1685	       if that's of any use. */
1686	}
1687
1688	if (pPriv->is_planar) {
1689	    BEGIN_RING(2 * (needgamma ? (28 + 33) : 33));
1690	    /* 2 components: same 2 for tex0/1/2 */
1691	    OUT_RING_REG(R300_RS_COUNT,
1692			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1693			   R300_RS_COUNT_HIRES_EN));
1694	    /* R300_INST_COUNT_RS - highest RS instruction used */
1695	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
1696
1697	    OUT_RING_REG(R300_US_PIXSIZE, 2); /* highest temp used */
1698
1699	    /* Indirection levels */
1700	    OUT_RING_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
1701					   R300_FIRST_TEX));
1702
1703	    OUT_RING_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1704						R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
1705						R300_TEX_CODE_OFFSET(0) |
1706						R300_TEX_CODE_SIZE(3)));
1707
1708	    OUT_RING_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
1709						R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
1710						R300_TEX_START(0) |
1711						R300_TEX_SIZE(2) |
1712						R300_RGBA_OUT));
1713
1714	    /* tex inst */
1715	    OUT_RING_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
1716					       R300_TEX_DST_ADDR(2) |
1717					       R300_TEX_ID(0) |
1718					       R300_TEX_INST(R300_TEX_INST_LD)));
1719	    OUT_RING_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
1720					       R300_TEX_DST_ADDR(1) |
1721					       R300_TEX_ID(1) |
1722					       R300_TEX_INST(R300_TEX_INST_LD)));
1723	    OUT_RING_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
1724					       R300_TEX_DST_ADDR(0) |
1725					       R300_TEX_ID(2) |
1726					       R300_TEX_INST(R300_TEX_INST_LD)));
1727
1728	    /* ALU inst */
1729	    /* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
1730	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
1731						    R300_ALU_RGB_ADDR1(2) |
1732						    R300_ALU_RGB_ADDR2(0) |
1733						    R300_ALU_RGB_ADDRD(2) |
1734						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1735	    OUT_RING_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
1736						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1737						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1738						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1739						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1740						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1741						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1742						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
1743	    /* alpha nop, but need to set up alpha source for rgb usage */
1744	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
1745						      R300_ALU_ALPHA_ADDR1(2) |
1746						      R300_ALU_ALPHA_ADDR2(0) |
1747						      R300_ALU_ALPHA_ADDRD(2) |
1748						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1749	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1750						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1751						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1752						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1753
1754	    /* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
1755	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
1756						    R300_ALU_RGB_ADDR1(1) |
1757						    R300_ALU_RGB_ADDR2(2) |
1758						    R300_ALU_RGB_ADDRD(2) |
1759						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1760	    OUT_RING_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1761						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1762						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1763						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1764						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
1765						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1766						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1767						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
1768	    /* alpha nop */
1769	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(2) |
1770						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1771	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1772						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1773						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1774						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1775
1776	    /* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
1777	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
1778						    R300_ALU_RGB_ADDR1(0) |
1779						    R300_ALU_RGB_ADDR2(2) |
1780						    R300_ALU_RGB_ADDRD(0) |
1781						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
1782						    (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
1783	    OUT_RING_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1784						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1785						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1786						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1787						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
1788						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1789						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1790						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
1791						    R300_ALU_RGB_CLAMP));
1792	    /* write alpha 1 */
1793	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
1794						      R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
1795						      R300_ALU_ALPHA_TARGET_A));
1796	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1797						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1798						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1799						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
1800
1801	    if (needgamma) {
1802		/* rgb temp0.r = op_sop, set up src0 reg */
1803		OUT_RING_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
1804							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
1805		OUT_RING_REG(R300_US_ALU_RGB_INST(3),
1806			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1807			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1808		/* alpha lg2 temp0, temp0.r */
1809		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
1810							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1811		OUT_RING_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
1812							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
1813							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1814							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1815
1816		/* rgb temp0.g = op_sop, set up src0 reg */
1817		OUT_RING_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
1818							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
1819		OUT_RING_REG(R300_US_ALU_RGB_INST(4),
1820			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1821			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1822		/* alpha lg2 temp0, temp0.g */
1823		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
1824							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1825		OUT_RING_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
1826							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
1827							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1828							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1829
1830		/* rgb temp0.b = op_sop, set up src0 reg */
1831		OUT_RING_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
1832							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
1833		OUT_RING_REG(R300_US_ALU_RGB_INST(5),
1834			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1835			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1836		/* alpha lg2 temp0, temp0.b */
1837		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
1838							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1839		OUT_RING_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
1840							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
1841							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1842							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1843
1844		/* MUL temp0.rgb, temp0.rgb, const1.aaa (scale the log2 values by gamma) */
1845		OUT_RING_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
1846							R300_ALU_RGB_ADDR1(0) |
1847							R300_ALU_RGB_ADDR2(0) |
1848							R300_ALU_RGB_ADDRD(0) |
1849							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1850		OUT_RING_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1851							R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1852							R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
1853							R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1854							R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
1855							R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1856							R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1857							R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
1858		/* alpha nop, but set up const1 */
1859		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
1860							  R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
1861							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1862		OUT_RING_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1863							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1864							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1865							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1866
1867		/* rgb out0.r = op_sop, set up src0 reg */
1868		OUT_RING_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
1869							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
1870							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
1871		OUT_RING_REG(R300_US_ALU_RGB_INST(7),
1872			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1873			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1874		/* alpha ex2 temp0, temp0.r */
1875		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
1876							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1877		OUT_RING_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
1878							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
1879							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1880							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1881
1882		/* rgb out0.g = op_sop, set up src0 reg */
1883		OUT_RING_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
1884							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
1885							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
1886		OUT_RING_REG(R300_US_ALU_RGB_INST(8),
1887			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1888			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1889		/* alpha ex2 temp0, temp0.g */
1890		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
1891							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1892		OUT_RING_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
1893							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
1894							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1895							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1896
1897		/* rgb out0.b = op_sop, set up src0 reg */
1898		OUT_RING_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
1899							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
1900							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
1901		OUT_RING_REG(R300_US_ALU_RGB_INST(9),
1902			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1903			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1904		/* alpha ex2 temp0, temp0.b */
1905		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
1906							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1907		OUT_RING_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
1908							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
1909							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1910							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1911	    }
1912	} else {
1913	    BEGIN_RING(2 * (needgamma ? (28 + 31) : 31));
1914	    /* 2 components */
1915	    OUT_RING_REG(R300_RS_COUNT,
1916			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1917			   R300_RS_COUNT_HIRES_EN));
1918	    /* R300_INST_COUNT_RS - highest RS instruction used */
1919	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
1920
1921	    OUT_RING_REG(R300_US_PIXSIZE, 1); /* highest temp used */
1922
1923	    /* Indirection levels */
1924	    OUT_RING_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
1925					   R300_FIRST_TEX));
1926
1927	    OUT_RING_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1928						R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
1929						R300_TEX_CODE_OFFSET(0) |
1930						R300_TEX_CODE_SIZE(1)));
1931
1932	    OUT_RING_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
1933						R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
1934						R300_TEX_START(0) |
1935						R300_TEX_SIZE(0) |
1936						R300_RGBA_OUT));
1937
1938	    /* tex inst */
1939	    OUT_RING_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
1940					       R300_TEX_DST_ADDR(0) |
1941					       R300_TEX_ID(0) |
1942					       R300_TEX_INST(R300_TEX_INST_LD)));
1943
1944	    /* ALU inst */
1945	    /* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
1946	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
1947						    R300_ALU_RGB_ADDR1(0) |
1948						    R300_ALU_RGB_ADDR2(0) |
1949						    R300_ALU_RGB_ADDRD(1) |
1950						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1951	    OUT_RING_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
1952						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1953						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_GGG) |
1954						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1955						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1956						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1957						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1958						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
1959	    /* alpha nop, but need to set up alpha source for rgb usage */
1960	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
1961						      R300_ALU_ALPHA_ADDR1(0) |
1962						      R300_ALU_ALPHA_ADDR2(0) |
1963						      R300_ALU_ALPHA_ADDRD(0) |
1964						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1965	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1966						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1967						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1968						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1969
1970	    /* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
1971	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
1972						    R300_ALU_RGB_ADDR1(0) |
1973						    R300_ALU_RGB_ADDR2(1) |
1974						    R300_ALU_RGB_ADDRD(1) |
1975						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1976	    OUT_RING_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1977						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1978						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_BBB) |
1979						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1980						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
1981						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1982						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1983						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
1984	    /* alpha nop */
1985	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
1986						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1987	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1988						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1989						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1990						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1991
1992	    /* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
1993	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
1994						    R300_ALU_RGB_ADDR1(0) |
1995						    R300_ALU_RGB_ADDR2(1) |
1996						    R300_ALU_RGB_ADDRD(0) |
1997						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
1998						    (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
1999	    OUT_RING_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2000						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2001						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RRR) |
2002						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2003						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
2004						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2005						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2006						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
2007						    R300_ALU_RGB_CLAMP));
2008	    /* write alpha 1 */
2009	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
2010						      R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
2011						      R300_ALU_ALPHA_TARGET_A));
2012	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2013						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2014						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2015						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
2016
2017	    if (needgamma) {
2018		/* rgb temp0.r = op_sop, set up src0 reg */
2019		OUT_RING_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
2020							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
2021		OUT_RING_REG(R300_US_ALU_RGB_INST(3),
2022			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2023			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2024		/* alpha lg2 temp0, temp0.r */
2025		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
2026							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2027		OUT_RING_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2028							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2029							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2030							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2031
2032		/* rgb temp0.g = op_sop, set up src0 reg */
2033		OUT_RING_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
2034							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
2035		OUT_RING_REG(R300_US_ALU_RGB_INST(4),
2036			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2037			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2038		/* alpha lg2 temp0, temp0.g */
2039		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
2040							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2041		OUT_RING_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2042							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2043							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2044							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2045
2046		/* rgb temp0.b = op_sop, set up src0 reg */
2047		OUT_RING_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
2048							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
2049		OUT_RING_REG(R300_US_ALU_RGB_INST(5),
2050			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2051			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2052		/* alpha lg2 temp0, temp0.b */
2053		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
2054							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2055		OUT_RING_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2056							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2057							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2058							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2059
2060		/* MUL temp0.rgb, temp0.rgb, const1.aaa (scale the log2 values by gamma) */
2061		OUT_RING_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
2062							R300_ALU_RGB_ADDR1(0) |
2063							R300_ALU_RGB_ADDR2(0) |
2064							R300_ALU_RGB_ADDRD(0) |
2065							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2066		OUT_RING_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2067							R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2068							R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
2069							R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2070							R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
2071							R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2072							R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2073							R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2074		/* alpha nop, but set up const1 */
2075		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
2076							  R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
2077							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2078		OUT_RING_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2079							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2080							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2081							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2082
2083		/* rgb out0.r = op_sop, set up src0 reg */
2084		OUT_RING_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
2085							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
2086							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
2087		OUT_RING_REG(R300_US_ALU_RGB_INST(7),
2088			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2089			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2090		/* alpha ex2 temp0, temp0.r */
2091		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
2092							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2093		OUT_RING_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2094							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2095							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2096							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2097
2098		/* rgb out0.g = op_sop, set up src0 reg */
2099		OUT_RING_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
2100							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
2101							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
2102		OUT_RING_REG(R300_US_ALU_RGB_INST(8),
2103			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2104			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2105		/* alpha ex2 temp0, temp0.g */
2106		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
2107							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2108		OUT_RING_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2109							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2110							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2111							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2112
2113		/* rgb out0.b = op_sop, set up src0 reg */
2114		OUT_RING_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
2115							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
2116							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
2117		OUT_RING_REG(R300_US_ALU_RGB_INST(9),
2118			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2119			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2120		/* alpha ex2 temp0, temp0.b */
2121		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
2122							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2123		OUT_RING_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2124							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2125							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2126							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2127	    }
2128	}
2129
2130	/* Shader constants. */
2131	/* constant 0: off, yco */
2132	OUT_RING_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
2133	OUT_RING_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1]));
2134	OUT_RING_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2]));
2135	OUT_RING_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco));
2136	/* constant 1: uco */
2137	OUT_RING_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
2138	OUT_RING_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
2139	OUT_RING_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
2140	OUT_RING_REG(R300_US_ALU_CONST_A(1), F_TO_24(gamma));
2141	/* constant 2: vco */
2142	OUT_RING_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
2143	OUT_RING_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
2144	OUT_RING_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2]));
2145	OUT_RING_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0));
2146
2147	ADVANCE_RING();
2148    }
2149
2150    BEGIN_ACCEL_RELOC(6, 2);
2151    OUT_RING_REG(R300_TX_INVALTAGS, 0);
2152    OUT_RING_REG(R300_TX_ENABLE, txenable);
2153
2154    EMIT_WRITE_OFFSET(R300_RB3D_COLOROFFSET0, 0, pPixmap);
2155    EMIT_COLORPITCH(R300_RB3D_COLORPITCH0, colorpitch, pPixmap);
2156
2157    /* no need to enable blending */
2158    OUT_RING_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
2159
2160    OUT_RING_REG(R300_VAP_VTX_SIZE, pPriv->vtx_count);
2161    ADVANCE_RING();
2162
2163    if (pPriv->vsync) {
2164	xf86CrtcPtr crtc;
2165	if (pPriv->desired_crtc)
2166	    crtc = pPriv->desired_crtc;
2167	else
2168	    crtc = radeon_pick_best_crtc(pScrn, FALSE,
2169					 pPriv->drw_x,
2170					 pPriv->drw_x + pPriv->dst_w,
2171					 pPriv->drw_y,
2172					 pPriv->drw_y + pPriv->dst_h);
2173	if (crtc)
2174	    RADEONWaitForVLine(pScrn, pPixmap,
2175			       crtc,
2176			       pPriv->drw_y - crtc->y,
2177			       (pPriv->drw_y - crtc->y) + pPriv->dst_h);
2178    }
2179
2180    return TRUE;
2181}
2182
2183static void
2184R300DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
2185{
2186    RADEONInfoPtr info = RADEONPTR(pScrn);
2187    PixmapPtr pPixmap = pPriv->pPixmap;
2188    int dstxoff, dstyoff;
2189    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
2190    int nBox = REGION_NUM_RECTS(&pPriv->clip);
2191
2192#ifdef COMPOSITE
2193    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
2194    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
2195#else
2196    dstxoff = 0;
2197    dstyoff = 0;
2198#endif
2199
2200    if (!R300PrepareTexturedVideo(pScrn, pPriv))
2201	return;
2202
2203    /*
2204     * Rendering of the actual polygon is done in two different
2205     * ways depending on chip generation:
2206     *
2207     * < R300:
2208     *
2209     *     These chips can render a rectangle in one pass, so
2210     *     handling is pretty straight-forward.
2211     *
2212     * >= R300:
2213     *
2214     *     These chips can accept a quad, but will render it as
2215     *     two triangles which results in a diagonal tear. Instead
2216     *     We render a single, large triangle and use the scissor
2217     *     functionality to restrict it to the desired rectangle.
2218     *     Due to guardband limits on r3xx/r4xx, we can only use
2219     *     the single triangle up to 2560/4021 pixels; above that we
2220     *     render as a quad.
2221     */
2222
2223    while (nBox--) {
2224	float srcX, srcY, srcw, srch;
2225	int dstX, dstY, dstw, dsth;
2226	Bool use_quad = FALSE;
2227	int draw_size = 4 * pPriv->vtx_count + 4 + 2 + 3;
2228
2229	if (draw_size > radeon_cs_space_remaining(pScrn)) {
2230	    radeon_cs_flush_indirect(pScrn);
2231	    if (!R300PrepareTexturedVideo(pScrn, pPriv))
2232		return;
2233	}
2234
2235	dstX = pBox->x1 + dstxoff;
2236	dstY = pBox->y1 + dstyoff;
2237	dstw = pBox->x2 - pBox->x1;
2238	dsth = pBox->y2 - pBox->y1;
2239
2240	srcX = pPriv->src_x;
2241	srcX += ((pBox->x1 - pPriv->drw_x) *
2242		 pPriv->src_w) / (float)pPriv->dst_w;
2243	srcY = pPriv->src_y;
2244	srcY += ((pBox->y1 - pPriv->drw_y) *
2245		 pPriv->src_h) / (float)pPriv->dst_h;
2246
2247	srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
2248	srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
2249
2250	if (IS_R400_3D) {
2251	    if ((dstw+dsth) > 4021)
2252		use_quad = TRUE;
2253	} else {
2254	    if ((dstw+dsth) > 2560)
2255		use_quad = TRUE;
2256	}
2257	/*
2258	 * Set up the scissor area to that of the output size.
2259	 */
2260	BEGIN_RING(2*2);
2261	/* R300 has an offset */
2262	OUT_RING_REG(R300_SC_SCISSOR0, (((dstX + 1440) << R300_SCISSOR_X_SHIFT) |
2263					 ((dstY + 1440) << R300_SCISSOR_Y_SHIFT)));
2264	OUT_RING_REG(R300_SC_SCISSOR1, (((dstX + dstw + 1440 - 1) << R300_SCISSOR_X_SHIFT) |
2265					 ((dstY + dsth + 1440 - 1) << R300_SCISSOR_Y_SHIFT)));
2266	ADVANCE_RING();
2267
2268	if (use_quad) {
2269	    BEGIN_RING(4 * pPriv->vtx_count + 4);
2270	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
2271				4 * pPriv->vtx_count));
2272	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_QUAD_LIST |
2273		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2274		     (4 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2275	} else {
2276	    BEGIN_RING(3 * pPriv->vtx_count + 4);
2277	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
2278				3 * pPriv->vtx_count));
2279	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
2280		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2281		     (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2282	}
2283
2284	if (pPriv->bicubic_enabled) {
2285		/*
2286		 * This code is only executed on >= R300, so we don't
2287		 * have to deal with the legacy handling.
2288		 */
2289	    if (use_quad) {
2290		VTX_OUT_6((float)dstX,                     (float)dstY,
2291			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h,
2292			  (float)srcX + 0.5,               (float)srcY + 0.5);
2293		VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
2294			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
2295			  (float)srcX + 0.5,               (float)(srcY + srch) + 0.5);
2296		VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
2297			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
2298			  (float)(srcX + srcw) + 0.5,      (float)(srcY + srch) + 0.5);
2299		VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
2300			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
2301			  (float)(srcX + srcw) + 0.5,      (float)srcY + 0.5);
2302	    } else {
2303		VTX_OUT_6((float)dstX,                     (float)dstY,
2304			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h,
2305			  (float)srcX + 0.5,               (float)srcY + 0.5);
2306		VTX_OUT_6((float)dstX,                     (float)(dstY + dstw + dsth),
2307			  (float)srcX / pPriv->w,
2308			  ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h,
2309			  (float)srcX + 0.5,
2310			  (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
2311		VTX_OUT_6((float)(dstX + dstw + dsth),     (float)dstY,
2312			  ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
2313			  (float)srcY / pPriv->h,
2314			  (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
2315			  (float)srcY + 0.5);
2316	    }
2317	} else {
2318	    if (use_quad) {
2319		VTX_OUT_4((float)dstX,                     (float)dstY,
2320			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h);
2321		VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
2322			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
2323		VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
2324			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
2325		VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
2326			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
2327	    } else {
2328		/*
2329		 * Render a big, scissored triangle. This means
2330		 * increasing the triangle size and adjusting
2331		 * texture coordinates.
2332		 */
2333		VTX_OUT_4((float)dstX,                 (float)dstY,
2334			  (float)srcX / pPriv->w,      (float)srcY / pPriv->h);
2335		VTX_OUT_4((float)dstX,                 (float)(dstY + dsth + dstw),
2336			  (float)srcX / pPriv->w,
2337			  ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h);
2338		VTX_OUT_4((float)(dstX + dstw + dsth), (float)dstY,
2339			  ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
2340			  (float)srcY / pPriv->h);
2341	    }
2342	}
2343
2344	/* flushing is pipelined, free/finish is not */
2345	OUT_RING_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
2346
2347	ADVANCE_RING();
2348
2349	pBox++;
2350    }
2351
2352    BEGIN_RING(2*3);
2353    OUT_RING_REG(R300_SC_CLIP_RULE, 0xAAAA);
2354    OUT_RING_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
2355    OUT_RING_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
2356    ADVANCE_RING();
2357
2358    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
2359}
2360
2361static Bool
2362R500PrepareTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
2363{
2364    RADEONInfoPtr info = RADEONPTR(pScrn);
2365    PixmapPtr pPixmap = pPriv->pPixmap;
2366    struct radeon_exa_pixmap_priv *driver_priv;
2367    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
2368    uint32_t txfilter, txformat0, txformat1, txpitch, us_format = 0;
2369    uint32_t dst_pitch, dst_format;
2370    uint32_t txenable, colorpitch;
2371    uint32_t output_fmt;
2372    int pixel_shift, out_size = 6;
2373    int ret;
2374
2375    radeon_cs_space_reset_bos(info->cs);
2376    radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
2377
2378    if (pPriv->bicubic_enabled)
2379	radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
2380
2381    driver_priv = exaGetPixmapDriverPrivate(pPixmap);
2382    radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
2383
2384    ret = radeon_cs_space_check(info->cs);
2385    if (ret) {
2386	ErrorF("Not enough RAM to hw accel xv operation\n");
2387	return FALSE;
2388    }
2389
2390    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
2391
2392    dst_pitch = exaGetPixmapPitch(pPixmap);
2393    RADEON_SWITCH_TO_3D();
2394
2395    if (pPriv->bicubic_enabled)
2396	pPriv->vtx_count = 6;
2397    else
2398	pPriv->vtx_count = 4;
2399
2400    switch (pPixmap->drawable.bitsPerPixel) {
2401    case 16:
2402	if (pPixmap->drawable.depth == 15)
2403	    dst_format = R300_COLORFORMAT_ARGB1555;
2404	else
2405	    dst_format = R300_COLORFORMAT_RGB565;
2406	break;
2407    case 32:
2408	dst_format = R300_COLORFORMAT_ARGB8888;
2409	break;
2410    default:
2411	return FALSE;
2412    }
2413
2414    output_fmt = (R300_OUT_FMT_C4_8 |
2415		  R300_OUT_FMT_C0_SEL_BLUE |
2416		  R300_OUT_FMT_C1_SEL_GREEN |
2417		  R300_OUT_FMT_C2_SEL_RED |
2418		  R300_OUT_FMT_C3_SEL_ALPHA);
2419
2420    colorpitch = dst_pitch >> pixel_shift;
2421    colorpitch |= dst_format;
2422
2423    if (RADEONTilingEnabled(pScrn, pPixmap))
2424	colorpitch |= R300_COLORTILE;
2425
2426    if (((pPriv->bicubic_state == BICUBIC_OFF)) &&
2427        (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12))
2428	pPriv->is_planar = TRUE;
2429    else
2430	pPriv->is_planar = FALSE;
2431
2432    if (pPriv->is_planar) {
2433	txformat1 = R300_TX_FORMAT_X8;
2434	txpitch = pPriv->src_pitch;
2435    } else {
2436	if (pPriv->id == FOURCC_UYVY)
2437	    txformat1 = R300_TX_FORMAT_YVYU422;
2438	else
2439	    txformat1 = R300_TX_FORMAT_VYUY422;
2440
2441	if (pPriv->bicubic_state != BICUBIC_OFF)
2442	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
2443
2444	/* pitch is in pixels */
2445	txpitch = pPriv->src_pitch / 2;
2446    }
2447    txpitch -= 1;
2448
2449    txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
2450		 (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
2451		 R300_TXPITCH_EN);
2452
2453    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
2454		R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
2455		R300_TX_MAG_FILTER_LINEAR |
2456		R300_TX_MIN_FILTER_LINEAR |
2457		(0 << R300_TX_ID_SHIFT));
2458
2459
2460    if ((pPriv->w - 1) & 0x800)
2461	txpitch |= R500_TXWIDTH_11;
2462
2463    if ((pPriv->h - 1) & 0x800)
2464	txpitch |= R500_TXHEIGHT_11;
2465
2466    if (info->ChipFamily == CHIP_FAMILY_R520) {
2467	unsigned us_width = (pPriv->w - 1) & 0x7ff;
2468	unsigned us_height = (pPriv->h - 1) & 0x7ff;
2469	unsigned us_depth = 0;
2470
2471	if (pPriv->w > 2048) {
2472	    us_width = (0x7ff + us_width) >> 1;
2473	    us_depth |= 0x0d;
2474	}
2475	if (pPriv->h > 2048) {
2476	    us_height = (0x7ff + us_height) >> 1;
2477	    us_depth |= 0x0e;
2478	}
2479	us_format = (us_width << R300_TXWIDTH_SHIFT) |
2480		    (us_height << R300_TXHEIGHT_SHIFT) |
2481		    (us_depth << R300_TXDEPTH_SHIFT);
2482	out_size++;
2483    }
2484
2485    BEGIN_ACCEL_RELOC(out_size, 1);
2486    OUT_RING_REG(R300_TX_FILTER0_0, txfilter);
2487    OUT_RING_REG(R300_TX_FILTER1_0, 0);
2488    OUT_RING_REG(R300_TX_FORMAT0_0, txformat0);
2489    OUT_RING_REG(R300_TX_FORMAT1_0, txformat1);
2490    OUT_RING_REG(R300_TX_FORMAT2_0, txpitch);
2491    OUT_TEXTURE_REG(R300_TX_OFFSET_0, 0, src_bo);
2492    if (info->ChipFamily == CHIP_FAMILY_R520)
2493	OUT_RING_REG(R500_US_FORMAT0_0, us_format);
2494    ADVANCE_RING();
2495
2496    txenable = R300_TEX_0_ENABLE;
2497
2498    if (pPriv->is_planar) {
2499	txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
2500		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
2501		     R300_TXPITCH_EN);
2502	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
2503	txpitch -= 1;
2504	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
2505		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
2506		    R300_TX_MIN_FILTER_LINEAR |
2507		    R300_TX_MAG_FILTER_LINEAR);
2508
2509	BEGIN_ACCEL_RELOC(12, 2);
2510	OUT_RING_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
2511	OUT_RING_REG(R300_TX_FILTER1_1, 0);
2512	OUT_RING_REG(R300_TX_FORMAT0_1, txformat0);
2513	OUT_RING_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8);
2514	OUT_RING_REG(R300_TX_FORMAT2_1, txpitch);
2515	OUT_TEXTURE_REG(R300_TX_OFFSET_1, pPriv->planeu_offset, src_bo);
2516	OUT_RING_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
2517	OUT_RING_REG(R300_TX_FILTER1_2, 0);
2518	OUT_RING_REG(R300_TX_FORMAT0_2, txformat0);
2519	OUT_RING_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8);
2520	OUT_RING_REG(R300_TX_FORMAT2_2, txpitch);
2521	OUT_TEXTURE_REG(R300_TX_OFFSET_2, pPriv->planev_offset, src_bo);
2522	ADVANCE_RING();
2523	txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
2524    }
2525
2526    if (pPriv->bicubic_enabled) {
2527	/* Size is 128x1 */
2528	txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
2529		     (0x0 << R300_TXHEIGHT_SHIFT) |
2530		     R300_TXPITCH_EN);
2531	/* Format is 32-bit floats, 4bpp */
2532	txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
2533	/* Pitch is 127 (128-1) */
2534	txpitch = 0x7f;
2535	/* Tex filter */
2536	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
2537		    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
2538		    R300_TX_MIN_FILTER_NEAREST |
2539		    R300_TX_MAG_FILTER_NEAREST |
2540		    (1 << R300_TX_ID_SHIFT));
2541
2542	BEGIN_ACCEL_RELOC(6, 1);
2543	OUT_RING_REG(R300_TX_FILTER0_1, txfilter);
2544	OUT_RING_REG(R300_TX_FILTER1_1, 0);
2545	OUT_RING_REG(R300_TX_FORMAT0_1, txformat0);
2546	OUT_RING_REG(R300_TX_FORMAT1_1, txformat1);
2547	OUT_RING_REG(R300_TX_FORMAT2_1, txpitch);
2548	OUT_TEXTURE_REG(R300_TX_OFFSET_1, 0, info->bicubic_bo);
2549	ADVANCE_RING();
2550
2551	/* Enable tex 1 */
2552	txenable |= R300_TEX_1_ENABLE;
2553    }
2554
2555    /* setup the VAP */
2556    if (info->accel_state->has_tcl) {
2557	if (pPriv->bicubic_enabled)
2558	    BEGIN_RING(2*7);
2559	else
2560	    BEGIN_RING(2*6);
2561    } else {
2562	if (pPriv->bicubic_enabled)
2563	    BEGIN_RING(2*5);
2564	else
2565	    BEGIN_RING(2*4);
2566    }
2567
2568    /* These registers define the number, type, and location of data submitted
2569     * to the PVS unit of GA input (when PVS is disabled)
2570     * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
2571     * enabled.  This memory provides the imputs to the vertex shader program
2572     * and ordering is not important.  When PVS/TCL is disabled, this field maps
2573     * directly to the GA input memory and the order is signifigant.  In
2574     * PVS_BYPASS mode the order is as follows:
2575     * Position
2576     * Point Size
2577     * Color 0-3
2578     * Textures 0-7
2579     * Fog
2580     */
2581    if (pPriv->bicubic_enabled) {
2582	OUT_RING_REG(R300_VAP_PROG_STREAM_CNTL_0,
2583		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
2584		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
2585		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
2586		       R300_SIGNED_0 |
2587		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
2588		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
2589		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
2590		       R300_SIGNED_1));
2591	OUT_RING_REG(R300_VAP_PROG_STREAM_CNTL_1,
2592		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
2593		       (0 << R300_SKIP_DWORDS_2_SHIFT) |
2594		       (7 << R300_DST_VEC_LOC_2_SHIFT) |
2595		       R300_LAST_VEC_2 |
2596		       R300_SIGNED_2));
2597    } else {
2598	OUT_RING_REG(R300_VAP_PROG_STREAM_CNTL_0,
2599		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
2600		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
2601		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
2602		       R300_SIGNED_0 |
2603		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
2604		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
2605		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
2606		       R300_LAST_VEC_1 |
2607		       R300_SIGNED_1));
2608    }
2609
2610    /* load the vertex shader
2611     * We pre-load vertex programs in RADEONInit3DEngine():
2612     * - exa
2613     * - Xv
2614     * - Xv bicubic
2615     * Here we select the offset of the vertex program we want to use
2616     */
2617    if (info->accel_state->has_tcl) {
2618	if (pPriv->bicubic_enabled) {
2619	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_0,
2620			  ((11 << R300_PVS_FIRST_INST_SHIFT) |
2621			   (13 << R300_PVS_XYZW_VALID_INST_SHIFT) |
2622			   (13 << R300_PVS_LAST_INST_SHIFT)));
2623	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_1,
2624			  (13 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
2625	} else {
2626	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_0,
2627			  ((9 << R300_PVS_FIRST_INST_SHIFT) |
2628			   (10 << R300_PVS_XYZW_VALID_INST_SHIFT) |
2629			   (10 << R300_PVS_LAST_INST_SHIFT)));
2630	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_1,
2631			  (10 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
2632	}
2633    }
2634
2635    /* Position and one set of 2 texture coordinates */
2636    OUT_RING_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
2637    if (pPriv->bicubic_enabled)
2638	OUT_RING_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
2639					       (2 << R300_TEX_1_COMP_CNT_SHIFT)));
2640    else
2641	OUT_RING_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
2642
2643    OUT_RING_REG(R300_US_OUT_FMT_0, output_fmt);
2644    ADVANCE_RING();
2645
2646    /* setup pixel shader */
2647    if (pPriv->bicubic_state != BICUBIC_OFF) {
2648	if (pPriv->bicubic_enabled) {
2649	    BEGIN_RING(2*7);
2650
2651	    /* 4 components: 2 for tex0 and 2 for tex1 */
2652	    OUT_RING_REG(R300_RS_COUNT,
2653			  ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
2654			   R300_RS_COUNT_HIRES_EN));
2655
2656	    /* R300_INST_COUNT_RS - highest RS instruction used */
2657	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1));
2658
2659	    /* Pixel stack frame size. */
2660	    OUT_RING_REG(R300_US_PIXSIZE, 5);
2661
2662	    /* FP length. */
2663	    OUT_RING_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
2664					      R500_US_CODE_END_ADDR(13)));
2665	    OUT_RING_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
2666					       R500_US_CODE_RANGE_SIZE(13)));
2667
2668	    /* Prepare for FP emission. */
2669	    OUT_RING_REG(R500_US_CODE_OFFSET, 0);
2670	    OUT_RING_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
2671	    ADVANCE_RING();
2672
2673	    BEGIN_RING(2*89);
2674	    /* Pixel shader.
2675	     * I've gone ahead and annotated each instruction, since this
2676	     * thing is MASSIVE. :3
2677	     * Note: In order to avoid buggies with temps and multiple
2678	     * inputs, all temps are offset by 2. temp0 -> register2. */
2679
2680	    /* TEX temp2, input1.xxxx, tex1, 1D */
2681	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2682						   R500_INST_RGB_WMASK_R |
2683						   R500_INST_RGB_WMASK_G |
2684						   R500_INST_RGB_WMASK_B));
2685	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
2686						   R500_TEX_INST_LD |
2687						   R500_TEX_IGNORE_UNCOVERED));
2688	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
2689						   R500_TEX_SRC_S_SWIZ_R |
2690						   R500_TEX_SRC_T_SWIZ_R |
2691						   R500_TEX_SRC_R_SWIZ_R |
2692						   R500_TEX_SRC_Q_SWIZ_R |
2693						   R500_TEX_DST_ADDR(2) |
2694						   R500_TEX_DST_R_SWIZ_R |
2695						   R500_TEX_DST_G_SWIZ_G |
2696						   R500_TEX_DST_B_SWIZ_B |
2697						   R500_TEX_DST_A_SWIZ_A));
2698	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2699	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2700	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2701
2702	    /* TEX temp5, input1.yyyy, tex1, 1D */
2703	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2704						   R500_INST_TEX_SEM_WAIT |
2705						   R500_INST_RGB_WMASK_R |
2706						   R500_INST_RGB_WMASK_G |
2707						   R500_INST_RGB_WMASK_B));
2708	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
2709						   R500_TEX_INST_LD |
2710						   R500_TEX_SEM_ACQUIRE |
2711						   R500_TEX_IGNORE_UNCOVERED));
2712	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
2713						   R500_TEX_SRC_S_SWIZ_G |
2714						   R500_TEX_SRC_T_SWIZ_G |
2715						   R500_TEX_SRC_R_SWIZ_G |
2716						   R500_TEX_SRC_Q_SWIZ_G |
2717						   R500_TEX_DST_ADDR(5) |
2718						   R500_TEX_DST_R_SWIZ_R |
2719						   R500_TEX_DST_G_SWIZ_G |
2720						   R500_TEX_DST_B_SWIZ_B |
2721						   R500_TEX_DST_A_SWIZ_A));
2722	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2723	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2724	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2725
2726	    /* MUL temp4, const0.x0x0, temp2.yyxx */
2727	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2728						   R500_INST_TEX_SEM_WAIT |
2729						   R500_INST_RGB_WMASK_R |
2730						   R500_INST_RGB_WMASK_G |
2731						   R500_INST_RGB_WMASK_B |
2732						   R500_INST_ALPHA_WMASK));
2733	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
2734						   R500_RGB_ADDR0_CONST |
2735						   R500_RGB_ADDR1(2)));
2736	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
2737						   R500_ALPHA_ADDR0_CONST |
2738						   R500_ALPHA_ADDR1(2)));
2739	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
2740						   R500_ALU_RGB_R_SWIZ_A_R |
2741						   R500_ALU_RGB_G_SWIZ_A_0 |
2742						   R500_ALU_RGB_B_SWIZ_A_R |
2743						   R500_ALU_RGB_SEL_B_SRC1 |
2744						   R500_ALU_RGB_R_SWIZ_B_G |
2745						   R500_ALU_RGB_G_SWIZ_B_G |
2746						   R500_ALU_RGB_B_SWIZ_B_R));
2747	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
2748						   R500_ALPHA_OP_MAD |
2749						   R500_ALPHA_SEL_A_SRC0 |
2750						   R500_ALPHA_SWIZ_A_0 |
2751						   R500_ALPHA_SEL_B_SRC1 |
2752						   R500_ALPHA_SWIZ_B_R));
2753	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
2754						   R500_ALU_RGBA_OP_MAD |
2755						   R500_ALU_RGBA_R_SWIZ_0 |
2756						   R500_ALU_RGBA_G_SWIZ_0 |
2757						   R500_ALU_RGBA_B_SWIZ_0 |
2758						   R500_ALU_RGBA_A_SWIZ_0));
2759
2760	    /* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */
2761	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2762						   R500_INST_RGB_WMASK_R |
2763						   R500_INST_RGB_WMASK_G |
2764						   R500_INST_RGB_WMASK_B |
2765						   R500_INST_ALPHA_WMASK));
2766	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
2767						   R500_RGB_ADDR0_CONST |
2768						   R500_RGB_ADDR1(5) |
2769						   R500_RGB_ADDR2(4)));
2770	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
2771						   R500_ALPHA_ADDR0_CONST |
2772						   R500_ALPHA_ADDR1(5) |
2773						   R500_ALPHA_ADDR2(4)));
2774	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
2775						   R500_ALU_RGB_R_SWIZ_A_0 |
2776						   R500_ALU_RGB_G_SWIZ_A_G |
2777						   R500_ALU_RGB_B_SWIZ_A_0 |
2778						   R500_ALU_RGB_SEL_B_SRC1 |
2779						   R500_ALU_RGB_R_SWIZ_B_R |
2780						   R500_ALU_RGB_G_SWIZ_B_R |
2781						   R500_ALU_RGB_B_SWIZ_B_R));
2782	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
2783						   R500_ALPHA_OP_MAD |
2784						   R500_ALPHA_SEL_A_SRC0 |
2785						   R500_ALPHA_SWIZ_A_G |
2786						   R500_ALPHA_SEL_B_SRC1 |
2787						   R500_ALPHA_SWIZ_B_R));
2788	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
2789						   R500_ALU_RGBA_OP_MAD |
2790						   R500_ALU_RGBA_SEL_C_SRC2 |
2791						   R500_ALU_RGBA_R_SWIZ_R |
2792						   R500_ALU_RGBA_G_SWIZ_G |
2793						   R500_ALU_RGBA_B_SWIZ_B |
2794						   R500_ALU_RGBA_A_SWIZ_A));
2795
2796	    /* ADD temp3, temp3, input0.xyxy */
2797	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2798						   R500_INST_RGB_WMASK_R |
2799						   R500_INST_RGB_WMASK_G |
2800						   R500_INST_RGB_WMASK_B |
2801						   R500_INST_ALPHA_WMASK));
2802	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) |
2803						   R500_RGB_ADDR2(0)));
2804	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) |
2805						   R500_ALPHA_ADDR2(0)));
2806	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
2807						   R500_ALU_RGB_G_SWIZ_A_1 |
2808						   R500_ALU_RGB_B_SWIZ_A_1 |
2809						   R500_ALU_RGB_SEL_B_SRC1 |
2810						   R500_ALU_RGB_R_SWIZ_B_R |
2811						   R500_ALU_RGB_G_SWIZ_B_G |
2812						   R500_ALU_RGB_B_SWIZ_B_B));
2813	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
2814						   R500_ALPHA_OP_MAD |
2815						   R500_ALPHA_SWIZ_A_1 |
2816						   R500_ALPHA_SEL_B_SRC1 |
2817						   R500_ALPHA_SWIZ_B_A));
2818	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
2819						   R500_ALU_RGBA_OP_MAD |
2820						   R500_ALU_RGBA_SEL_C_SRC2 |
2821						   R500_ALU_RGBA_R_SWIZ_R |
2822						   R500_ALU_RGBA_G_SWIZ_G |
2823						   R500_ALU_RGBA_B_SWIZ_R |
2824						   R500_ALU_RGBA_A_SWIZ_G));
2825
2826	    /* TEX temp1, temp3.zwxy, tex0, 2D */
2827	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2828						   R500_INST_RGB_WMASK_R |
2829						   R500_INST_RGB_WMASK_G |
2830						   R500_INST_RGB_WMASK_B |
2831						   R500_INST_ALPHA_WMASK));
2832	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
2833						   R500_TEX_INST_LD |
2834						   R500_TEX_IGNORE_UNCOVERED));
2835	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
2836						   R500_TEX_SRC_S_SWIZ_B |
2837						   R500_TEX_SRC_T_SWIZ_A |
2838						   R500_TEX_SRC_R_SWIZ_R |
2839						   R500_TEX_SRC_Q_SWIZ_G |
2840						   R500_TEX_DST_ADDR(1) |
2841						   R500_TEX_DST_R_SWIZ_R |
2842						   R500_TEX_DST_G_SWIZ_G |
2843						   R500_TEX_DST_B_SWIZ_B |
2844						   R500_TEX_DST_A_SWIZ_A));
2845	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2846	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2847	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2848
2849	    /* TEX temp3, temp3.xyzw, tex0, 2D */
2850	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2851						   R500_INST_TEX_SEM_WAIT |
2852						   R500_INST_RGB_WMASK_R |
2853						   R500_INST_RGB_WMASK_G |
2854						   R500_INST_RGB_WMASK_B |
2855						   R500_INST_ALPHA_WMASK));
2856	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
2857						   R500_TEX_INST_LD |
2858						   R500_TEX_SEM_ACQUIRE |
2859						   R500_TEX_IGNORE_UNCOVERED));
2860	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
2861						   R500_TEX_SRC_S_SWIZ_R |
2862						   R500_TEX_SRC_T_SWIZ_G |
2863						   R500_TEX_SRC_R_SWIZ_B |
2864						   R500_TEX_SRC_Q_SWIZ_A |
2865						   R500_TEX_DST_ADDR(3) |
2866						   R500_TEX_DST_R_SWIZ_R |
2867						   R500_TEX_DST_G_SWIZ_G |
2868						   R500_TEX_DST_B_SWIZ_B |
2869						   R500_TEX_DST_A_SWIZ_A));
2870	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2871	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2872	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2873
2874	    /* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */
2875	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2876						   R500_INST_RGB_WMASK_R |
2877						   R500_INST_RGB_WMASK_G |
2878						   R500_INST_RGB_WMASK_B |
2879						   R500_INST_ALPHA_WMASK));
2880	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
2881						   R500_RGB_ADDR0_CONST |
2882						   R500_RGB_ADDR1(5) |
2883						   R500_RGB_ADDR2(4)));
2884	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
2885						   R500_ALPHA_ADDR0_CONST |
2886						   R500_ALPHA_ADDR1(5) |
2887						   R500_ALPHA_ADDR2(4)));
2888	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
2889						   R500_ALU_RGB_R_SWIZ_A_0 |
2890						   R500_ALU_RGB_G_SWIZ_A_G |
2891						   R500_ALU_RGB_B_SWIZ_A_0 |
2892						   R500_ALU_RGB_SEL_B_SRC1 |
2893						   R500_ALU_RGB_R_SWIZ_B_G |
2894						   R500_ALU_RGB_G_SWIZ_B_G |
2895						   R500_ALU_RGB_B_SWIZ_B_G));
2896	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
2897						   R500_ALPHA_OP_MAD |
2898						   R500_ALPHA_SEL_A_SRC0 |
2899						   R500_ALPHA_SWIZ_A_G |
2900						   R500_ALPHA_SEL_B_SRC1 |
2901						   R500_ALPHA_SWIZ_B_G));
2902	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
2903						   R500_ALU_RGBA_OP_MAD |
2904						   R500_ALU_RGBA_SEL_C_SRC2 |
2905						   R500_ALU_RGBA_R_SWIZ_R |
2906						   R500_ALU_RGBA_G_SWIZ_G |
2907						   R500_ALU_RGBA_B_SWIZ_B |
2908						   R500_ALU_RGBA_A_SWIZ_A));
2909
2910	    /* ADD temp0, temp4, input0.xyxy */
2911	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2912						   R500_INST_RGB_WMASK_R |
2913						   R500_INST_RGB_WMASK_G |
2914						   R500_INST_RGB_WMASK_B |
2915						   R500_INST_ALPHA_WMASK));
2916	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) |
2917						   R500_RGB_ADDR2(0)));
2918	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) |
2919						   R500_ALPHA_ADDR2(0)));
2920	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
2921						   R500_ALU_RGB_G_SWIZ_A_1 |
2922						   R500_ALU_RGB_B_SWIZ_A_1 |
2923						   R500_ALU_RGB_SEL_B_SRC1 |
2924						   R500_ALU_RGB_R_SWIZ_B_R |
2925						   R500_ALU_RGB_G_SWIZ_B_G |
2926						   R500_ALU_RGB_B_SWIZ_B_B));
2927	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
2928						   R500_ALPHA_OP_MAD |
2929						   R500_ALPHA_SWIZ_A_1 |
2930						   R500_ALPHA_SEL_B_SRC1 |
2931						   R500_ALPHA_SWIZ_B_A));
2932	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
2933						   R500_ALU_RGBA_OP_MAD |
2934						   R500_ALU_RGBA_SEL_C_SRC2 |
2935						   R500_ALU_RGBA_R_SWIZ_R |
2936						   R500_ALU_RGBA_G_SWIZ_G |
2937						   R500_ALU_RGBA_B_SWIZ_R |
2938						   R500_ALU_RGBA_A_SWIZ_G));
2939
2940	    /* TEX temp4, temp0.zwzw, tex0, 2D */
2941	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2942						   R500_INST_TEX_SEM_WAIT |
2943						   R500_INST_RGB_WMASK_R |
2944						   R500_INST_RGB_WMASK_G |
2945						   R500_INST_RGB_WMASK_B |
2946						   R500_INST_ALPHA_WMASK));
2947	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
2948						   R500_TEX_INST_LD |
2949						   R500_TEX_IGNORE_UNCOVERED));
2950	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
2951						   R500_TEX_SRC_S_SWIZ_B |
2952						   R500_TEX_SRC_T_SWIZ_A |
2953						   R500_TEX_SRC_R_SWIZ_B |
2954						   R500_TEX_SRC_Q_SWIZ_A |
2955						   R500_TEX_DST_ADDR(4) |
2956						   R500_TEX_DST_R_SWIZ_R |
2957						   R500_TEX_DST_G_SWIZ_G |
2958						   R500_TEX_DST_B_SWIZ_B |
2959						   R500_TEX_DST_A_SWIZ_A));
2960	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2961	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2962	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2963
2964	    /* TEX temp0, temp0.xyzw, tex0, 2D */
2965	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2966						   R500_INST_TEX_SEM_WAIT |
2967						   R500_INST_RGB_WMASK_R |
2968						   R500_INST_RGB_WMASK_G |
2969						   R500_INST_RGB_WMASK_B |
2970						   R500_INST_ALPHA_WMASK));
2971	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
2972						   R500_TEX_INST_LD |
2973						   R500_TEX_SEM_ACQUIRE |
2974						   R500_TEX_IGNORE_UNCOVERED));
2975	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
2976						   R500_TEX_SRC_S_SWIZ_R |
2977						   R500_TEX_SRC_T_SWIZ_G |
2978						   R500_TEX_SRC_R_SWIZ_B |
2979						   R500_TEX_SRC_Q_SWIZ_A |
2980						   R500_TEX_DST_ADDR(0) |
2981						   R500_TEX_DST_R_SWIZ_R |
2982						   R500_TEX_DST_G_SWIZ_G |
2983						   R500_TEX_DST_B_SWIZ_B |
2984						   R500_TEX_DST_A_SWIZ_A));
2985	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2986	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2987	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2988
2989	    /* LRP temp3, temp2.zzzz, temp1, temp3 ->
2990	     * - PRESUB temps, temp1 - temp3
2991	     * - MAD temp2.zzzz, temps, temp3 */
2992	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2993						   R500_INST_RGB_WMASK_R |
2994						   R500_INST_RGB_WMASK_G |
2995						   R500_INST_RGB_WMASK_B |
2996						   R500_INST_ALPHA_WMASK));
2997	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) |
2998						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
2999						   R500_RGB_ADDR1(1) |
3000						   R500_RGB_ADDR2(2)));
3001	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) |
3002						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3003						   R500_ALPHA_ADDR1(1) |
3004						   R500_ALPHA_ADDR2(2)));
3005	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3006						   R500_ALU_RGB_R_SWIZ_A_B |
3007						   R500_ALU_RGB_G_SWIZ_A_B |
3008						   R500_ALU_RGB_B_SWIZ_A_B |
3009						   R500_ALU_RGB_SEL_B_SRCP |
3010						   R500_ALU_RGB_R_SWIZ_B_R |
3011						   R500_ALU_RGB_G_SWIZ_B_G |
3012						   R500_ALU_RGB_B_SWIZ_B_B));
3013	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
3014						   R500_ALPHA_OP_MAD |
3015						   R500_ALPHA_SEL_A_SRC2 |
3016						   R500_ALPHA_SWIZ_A_B |
3017						   R500_ALPHA_SEL_B_SRCP |
3018						   R500_ALPHA_SWIZ_B_A));
3019	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
3020						   R500_ALU_RGBA_OP_MAD |
3021						   R500_ALU_RGBA_SEL_C_SRC0 |
3022						   R500_ALU_RGBA_R_SWIZ_R |
3023						   R500_ALU_RGBA_G_SWIZ_G |
3024						   R500_ALU_RGBA_B_SWIZ_B |
3025						   R500_ALU_RGBA_A_SWIZ_A));
3026
3027	    /* LRP temp0, temp2.zzzz, temp4, temp0 ->
3028	     * - PRESUB temps, temp4 - temp0
3029	     * - MAD temp2.zzzz, temps, temp0 */
3030	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3031						   R500_INST_TEX_SEM_WAIT |
3032						   R500_INST_RGB_WMASK_R |
3033						   R500_INST_RGB_WMASK_G |
3034						   R500_INST_RGB_WMASK_B |
3035						   R500_INST_ALPHA_WMASK));
3036	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3037						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3038						   R500_RGB_ADDR1(4) |
3039						   R500_RGB_ADDR2(2)));
3040	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3041						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3042						   R500_ALPHA_ADDR1(4) |
3043						   R500_ALPHA_ADDR2(2)));
3044	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3045						   R500_ALU_RGB_R_SWIZ_A_B |
3046						   R500_ALU_RGB_G_SWIZ_A_B |
3047						   R500_ALU_RGB_B_SWIZ_A_B |
3048						   R500_ALU_RGB_SEL_B_SRCP |
3049						   R500_ALU_RGB_R_SWIZ_B_R |
3050						   R500_ALU_RGB_G_SWIZ_B_G |
3051						   R500_ALU_RGB_B_SWIZ_B_B));
3052	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
3053						   R500_ALPHA_OP_MAD |
3054						   R500_ALPHA_SEL_A_SRC2 |
3055						   R500_ALPHA_SWIZ_A_B |
3056						   R500_ALPHA_SEL_B_SRCP |
3057						   R500_ALPHA_SWIZ_B_A));
3058	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
3059						   R500_ALU_RGBA_OP_MAD |
3060						   R500_ALU_RGBA_SEL_C_SRC0 |
3061						   R500_ALU_RGBA_R_SWIZ_R |
3062						   R500_ALU_RGBA_G_SWIZ_G |
3063						   R500_ALU_RGBA_B_SWIZ_B |
3064						   R500_ALU_RGBA_A_SWIZ_A));
3065
3066	    /* LRP output, temp5.zzzz, temp3, temp0 ->
3067	     * - PRESUB temps, temp3 - temp0
3068	     * - MAD temp5.zzzz, temps, temp0 */
3069	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3070						   R500_INST_LAST |
3071						   R500_INST_TEX_SEM_WAIT |
3072						   R500_INST_RGB_WMASK_R |
3073						   R500_INST_RGB_WMASK_G |
3074						   R500_INST_RGB_WMASK_B |
3075						   R500_INST_ALPHA_WMASK |
3076						   R500_INST_RGB_OMASK_R |
3077						   R500_INST_RGB_OMASK_G |
3078						   R500_INST_RGB_OMASK_B |
3079						   R500_INST_ALPHA_OMASK));
3080	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3081						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3082						   R500_RGB_ADDR1(3) |
3083						   R500_RGB_ADDR2(5)));
3084	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3085						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3086						   R500_ALPHA_ADDR1(3) |
3087						   R500_ALPHA_ADDR2(5)));
3088	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3089						   R500_ALU_RGB_R_SWIZ_A_B |
3090						   R500_ALU_RGB_G_SWIZ_A_B |
3091						   R500_ALU_RGB_B_SWIZ_A_B |
3092						   R500_ALU_RGB_SEL_B_SRCP |
3093						   R500_ALU_RGB_R_SWIZ_B_R |
3094						   R500_ALU_RGB_G_SWIZ_B_G |
3095						   R500_ALU_RGB_B_SWIZ_B_B));
3096	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
3097						   R500_ALPHA_OP_MAD |
3098						   R500_ALPHA_SEL_A_SRC2 |
3099						   R500_ALPHA_SWIZ_A_B |
3100						   R500_ALPHA_SEL_B_SRCP |
3101						   R500_ALPHA_SWIZ_B_A));
3102	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
3103						   R500_ALU_RGBA_OP_MAD |
3104						   R500_ALU_RGBA_SEL_C_SRC0 |
3105						   R500_ALU_RGBA_R_SWIZ_R |
3106						   R500_ALU_RGBA_G_SWIZ_G |
3107						   R500_ALU_RGBA_B_SWIZ_B |
3108						   R500_ALU_RGBA_A_SWIZ_A));
3109
3110	    /* Shader constants. */
3111	    OUT_RING_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
3112
3113	    /* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */
3114	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w));
3115	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h));
3116	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
3117	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
3118
3119	    ADVANCE_RING();
3120	} else {
3121	    BEGIN_RING(2*19);
3122	    /* 2 components: 2 for tex0 */
3123	    OUT_RING_REG(R300_RS_COUNT,
3124			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3125			   R300_RS_COUNT_HIRES_EN));
3126
3127	    /* R300_INST_COUNT_RS - highest RS instruction used */
3128	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3129
3130	    /* Pixel stack frame size. */
3131	    OUT_RING_REG(R300_US_PIXSIZE, 0); /* highest temp used */
3132
3133	    /* FP length. */
3134	    OUT_RING_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3135					      R500_US_CODE_END_ADDR(1)));
3136	    OUT_RING_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3137					       R500_US_CODE_RANGE_SIZE(1)));
3138
3139	    /* Prepare for FP emission. */
3140	    OUT_RING_REG(R500_US_CODE_OFFSET, 0);
3141	    OUT_RING_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3142
3143	    /* tex inst */
3144	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3145						   R500_INST_TEX_SEM_WAIT |
3146						   R500_INST_RGB_WMASK_R |
3147						   R500_INST_RGB_WMASK_G |
3148						   R500_INST_RGB_WMASK_B |
3149						   R500_INST_ALPHA_WMASK |
3150						   R500_INST_RGB_CLAMP |
3151						   R500_INST_ALPHA_CLAMP));
3152	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3153						   R500_TEX_INST_LD |
3154						   R500_TEX_SEM_ACQUIRE |
3155						   R500_TEX_IGNORE_UNCOVERED));
3156	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3157						   R500_TEX_SRC_S_SWIZ_R |
3158						   R500_TEX_SRC_T_SWIZ_G |
3159						   R500_TEX_DST_ADDR(0) |
3160						   R500_TEX_DST_R_SWIZ_R |
3161						   R500_TEX_DST_G_SWIZ_G |
3162						   R500_TEX_DST_B_SWIZ_B |
3163						   R500_TEX_DST_A_SWIZ_A));
3164	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3165						   R500_DX_S_SWIZ_R |
3166						   R500_DX_T_SWIZ_R |
3167						   R500_DX_R_SWIZ_R |
3168						   R500_DX_Q_SWIZ_R |
3169						   R500_DY_ADDR(0) |
3170						   R500_DY_S_SWIZ_R |
3171						   R500_DY_T_SWIZ_R |
3172						   R500_DY_R_SWIZ_R |
3173						   R500_DY_Q_SWIZ_R));
3174	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3175	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3176
3177	    /* ALU inst */
3178	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3179						   R500_INST_TEX_SEM_WAIT |
3180						   R500_INST_LAST |
3181						   R500_INST_RGB_OMASK_R |
3182						   R500_INST_RGB_OMASK_G |
3183						   R500_INST_RGB_OMASK_B |
3184						   R500_INST_ALPHA_OMASK |
3185						   R500_INST_RGB_CLAMP |
3186						   R500_INST_ALPHA_CLAMP));
3187	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3188						   R500_RGB_ADDR1(0) |
3189						   R500_RGB_ADDR1_CONST |
3190						   R500_RGB_ADDR2(0) |
3191						   R500_RGB_ADDR2_CONST));
3192	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3193						   R500_ALPHA_ADDR1(0) |
3194						   R500_ALPHA_ADDR1_CONST |
3195						   R500_ALPHA_ADDR2(0) |
3196						   R500_ALPHA_ADDR2_CONST));
3197	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3198						   R500_ALU_RGB_R_SWIZ_A_R |
3199						   R500_ALU_RGB_G_SWIZ_A_G |
3200						   R500_ALU_RGB_B_SWIZ_A_B |
3201						   R500_ALU_RGB_SEL_B_SRC0 |
3202						   R500_ALU_RGB_R_SWIZ_B_1 |
3203						   R500_ALU_RGB_B_SWIZ_B_1 |
3204						   R500_ALU_RGB_G_SWIZ_B_1));
3205	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3206						   R500_ALPHA_SWIZ_A_A |
3207						   R500_ALPHA_SWIZ_B_1));
3208	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3209						   R500_ALU_RGBA_R_SWIZ_0 |
3210						   R500_ALU_RGBA_G_SWIZ_0 |
3211						   R500_ALU_RGBA_B_SWIZ_0 |
3212						   R500_ALU_RGBA_A_SWIZ_0));
3213	    ADVANCE_RING();
3214	}
3215    } else {
3216	/*
3217	 * y' = y - .0625
3218	 * u' = u - .5
3219	 * v' = v - .5;
3220	 *
3221	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
3222	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
3223	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
3224	 *
3225	 * DP3 might look like the straightforward solution
3226	 * but we'd need to move the texture yuv values in
3227	 * the same reg for this to work. Therefore use MADs.
3228	 * Brightness just adds to the off constant.
3229	 * Contrast is multiplication of luminance.
3230	 * Saturation and hue change the u and v coeffs.
3231	 * Default values (before adjustments - depend on colorspace):
3232	 * yco = 1.1643
3233	 * uco = 0, -0.39173, 2.017
3234	 * vco = 1.5958, -0.8129, 0
3235	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
3236	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
3237	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
3238	 *
3239	 * temp = MAD(yco, yuv.yyyy, off)
3240	 * temp = MAD(uco, yuv.uuuu, temp)
3241	 * result = MAD(vco, yuv.vvvv, temp)
3242	 */
3243	/* TODO: don't recalc consts always */
3244	const float Loff = -0.0627;
3245	const float Coff = -0.502;
3246	float uvcosf, uvsinf;
3247	float yco;
3248	float uco[3], vco[3], off[3];
3249	float bright, cont, gamma;
3250	int ref = pPriv->transform_index;
3251
3252	cont = RTFContrast(pPriv->contrast);
3253	bright = RTFBrightness(pPriv->brightness);
3254	gamma = (float)pPriv->gamma / 1000.0;
3255	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
3256	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
3257	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
3258
3259	yco = trans[ref].RefLuma * cont;
3260	uco[0] = -trans[ref].RefRCr * uvsinf;
3261	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
3262	uco[2] = trans[ref].RefBCb * uvcosf;
3263	vco[0] = trans[ref].RefRCr * uvcosf;
3264	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
3265	vco[2] = trans[ref].RefBCb * uvsinf;
3266	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
3267	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
3268	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
3269
3270	//XXX gamma: the value is uploaded as const1.a below, but no shader instruction in this branch applies it yet
3271
3272	if (pPriv->is_planar) {
3273	    BEGIN_RING(2*56);
3274	    /* 2 components: 2 for tex0/1/2 */
3275	    OUT_RING_REG(R300_RS_COUNT,
3276			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3277			   R300_RS_COUNT_HIRES_EN));
3278
3279	    /* R300_INST_COUNT_RS - highest RS instruction used */
3280	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3281
3282	    /* Pixel stack frame size. */
3283	    OUT_RING_REG(R300_US_PIXSIZE, 2); /* highest temp used */
3284
3285	    /* FP length. */
3286	    OUT_RING_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3287					      R500_US_CODE_END_ADDR(5)));
3288	    OUT_RING_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3289					       R500_US_CODE_RANGE_SIZE(5)));
3290
3291	    /* Prepare for FP emission. */
3292	    OUT_RING_REG(R500_US_CODE_OFFSET, 0);
3293	    OUT_RING_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3294
3295	    /* tex inst */
3296	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3297						   R500_INST_TEX_SEM_WAIT |
3298						   R500_INST_RGB_WMASK_R |
3299						   R500_INST_RGB_WMASK_G |
3300						   R500_INST_RGB_WMASK_B |
3301						   R500_INST_ALPHA_WMASK |
3302						   R500_INST_RGB_CLAMP |
3303						   R500_INST_ALPHA_CLAMP));
3304	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3305						   R500_TEX_INST_LD |
3306						   R500_TEX_IGNORE_UNCOVERED));
3307	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3308						   R500_TEX_SRC_S_SWIZ_R |
3309						   R500_TEX_SRC_T_SWIZ_G |
3310						   R500_TEX_DST_ADDR(2) |
3311						   R500_TEX_DST_R_SWIZ_R |
3312						   R500_TEX_DST_G_SWIZ_G |
3313						   R500_TEX_DST_B_SWIZ_B |
3314						   R500_TEX_DST_A_SWIZ_A));
3315	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3316						   R500_DX_S_SWIZ_R |
3317						   R500_DX_T_SWIZ_R |
3318						   R500_DX_R_SWIZ_R |
3319						   R500_DX_Q_SWIZ_R |
3320						   R500_DY_ADDR(0) |
3321						   R500_DY_S_SWIZ_R |
3322						   R500_DY_T_SWIZ_R |
3323						   R500_DY_R_SWIZ_R |
3324						   R500_DY_Q_SWIZ_R));
3325	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3326	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3327
3328	    /* tex inst */
3329	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3330						   R500_INST_TEX_SEM_WAIT |
3331						   R500_INST_RGB_WMASK_R |
3332						   R500_INST_RGB_WMASK_G |
3333						   R500_INST_RGB_WMASK_B |
3334						   R500_INST_ALPHA_WMASK |
3335						   R500_INST_RGB_CLAMP |
3336						   R500_INST_ALPHA_CLAMP));
3337	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
3338						   R500_TEX_INST_LD |
3339						   R500_TEX_IGNORE_UNCOVERED));
3340	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3341						   R500_TEX_SRC_S_SWIZ_R |
3342						   R500_TEX_SRC_T_SWIZ_G |
3343						   R500_TEX_DST_ADDR(1) |
3344						   R500_TEX_DST_R_SWIZ_R |
3345						   R500_TEX_DST_G_SWIZ_G |
3346						   R500_TEX_DST_B_SWIZ_B |
3347						   R500_TEX_DST_A_SWIZ_A));
3348	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3349						   R500_DX_S_SWIZ_R |
3350						   R500_DX_T_SWIZ_R |
3351						   R500_DX_R_SWIZ_R |
3352						   R500_DX_Q_SWIZ_R |
3353						   R500_DY_ADDR(0) |
3354						   R500_DY_S_SWIZ_R |
3355						   R500_DY_T_SWIZ_R |
3356						   R500_DY_R_SWIZ_R |
3357						   R500_DY_Q_SWIZ_R));
3358	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3359	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3360
3361	    /* tex inst */
3362	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3363						   R500_INST_TEX_SEM_WAIT |
3364						   R500_INST_RGB_WMASK_R |
3365						   R500_INST_RGB_WMASK_G |
3366						   R500_INST_RGB_WMASK_B |
3367						   R500_INST_ALPHA_WMASK |
3368						   R500_INST_RGB_CLAMP |
3369						   R500_INST_ALPHA_CLAMP));
3370	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(2) |
3371						   R500_TEX_INST_LD |
3372						   R500_TEX_SEM_ACQUIRE |
3373						   R500_TEX_IGNORE_UNCOVERED));
3374	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3375						   R500_TEX_SRC_S_SWIZ_R |
3376						   R500_TEX_SRC_T_SWIZ_G |
3377						   R500_TEX_DST_ADDR(0) |
3378						   R500_TEX_DST_R_SWIZ_R |
3379						   R500_TEX_DST_G_SWIZ_G |
3380						   R500_TEX_DST_B_SWIZ_B |
3381						   R500_TEX_DST_A_SWIZ_A));
3382	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3383						   R500_DX_S_SWIZ_R |
3384						   R500_DX_T_SWIZ_R |
3385						   R500_DX_R_SWIZ_R |
3386						   R500_DX_Q_SWIZ_R |
3387						   R500_DY_ADDR(0) |
3388						   R500_DY_S_SWIZ_R |
3389						   R500_DY_T_SWIZ_R |
3390						   R500_DY_R_SWIZ_R |
3391						   R500_DY_Q_SWIZ_R));
3392	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3393	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3394
3395	    /* ALU inst */
3396	    /* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
3397	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3398						   R500_INST_TEX_SEM_WAIT |
3399						   R500_INST_RGB_WMASK_R |
3400						   R500_INST_RGB_WMASK_G |
3401						   R500_INST_RGB_WMASK_B |
3402						   R500_INST_ALPHA_WMASK));
3403	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3404						   R500_RGB_ADDR0_CONST |
3405						   R500_RGB_ADDR1(2) |
3406						   R500_RGB_ADDR2(0) |
3407						   R500_RGB_ADDR2_CONST));
3408	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3409						   R500_ALPHA_ADDR0_CONST |
3410						   R500_ALPHA_ADDR1(2) |
3411						   R500_ALPHA_ADDR2(0) |
3412						   R500_ALPHA_ADDR2_CONST));
3413	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3414						   R500_ALU_RGB_R_SWIZ_A_A |
3415						   R500_ALU_RGB_G_SWIZ_A_A |
3416						   R500_ALU_RGB_B_SWIZ_A_A |
3417						   R500_ALU_RGB_SEL_B_SRC1 |
3418						   R500_ALU_RGB_R_SWIZ_B_R |
3419						   R500_ALU_RGB_B_SWIZ_B_G |
3420						   R500_ALU_RGB_G_SWIZ_B_B));
3421	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3422						   R500_ALPHA_ADDRD(2) |
3423						   R500_ALPHA_SWIZ_A_0 |
3424						   R500_ALPHA_SWIZ_B_0));
3425	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3426						   R500_ALU_RGBA_ADDRD(2) |
3427						   R500_ALU_RGBA_SEL_C_SRC0 |
3428						   R500_ALU_RGBA_R_SWIZ_R |
3429						   R500_ALU_RGBA_G_SWIZ_G |
3430						   R500_ALU_RGBA_B_SWIZ_B |
3431						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3432						   R500_ALU_RGBA_A_SWIZ_0));
3433
3434	    /* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
3435	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3436						   R500_INST_TEX_SEM_WAIT |
3437						   R500_INST_RGB_WMASK_R |
3438						   R500_INST_RGB_WMASK_G |
3439						   R500_INST_RGB_WMASK_B |
3440						   R500_INST_ALPHA_WMASK));
3441	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
3442						   R500_RGB_ADDR0_CONST |
3443						   R500_RGB_ADDR1(1) |
3444						   R500_RGB_ADDR2(2)));
3445	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3446						   R500_ALPHA_ADDR0_CONST |
3447						   R500_ALPHA_ADDR1(1) |
3448						   R500_ALPHA_ADDR2(2)));
3449	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3450						   R500_ALU_RGB_R_SWIZ_A_R |
3451						   R500_ALU_RGB_G_SWIZ_A_G |
3452						   R500_ALU_RGB_B_SWIZ_A_B |
3453						   R500_ALU_RGB_SEL_B_SRC1 |
3454						   R500_ALU_RGB_R_SWIZ_B_R |
3455						   R500_ALU_RGB_B_SWIZ_B_G |
3456						   R500_ALU_RGB_G_SWIZ_B_B));
3457	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3458						   R500_ALPHA_ADDRD(2) |
3459						   R500_ALPHA_SWIZ_A_0 |
3460						   R500_ALPHA_SWIZ_B_0));
3461	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3462						   R500_ALU_RGBA_ADDRD(2) |
3463						   R500_ALU_RGBA_SEL_C_SRC2 |
3464						   R500_ALU_RGBA_R_SWIZ_R |
3465						   R500_ALU_RGBA_G_SWIZ_G |
3466						   R500_ALU_RGBA_B_SWIZ_B |
3467						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3468						   R500_ALU_RGBA_A_SWIZ_0));
3469
3470	    /* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
3471	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3472						   R500_INST_TEX_SEM_WAIT |
3473						   R500_INST_LAST |
3474						   R500_INST_RGB_OMASK_R |
3475						   R500_INST_RGB_OMASK_G |
3476						   R500_INST_RGB_OMASK_B |
3477						   R500_INST_ALPHA_OMASK |
3478						   R500_INST_RGB_CLAMP |
3479						   R500_INST_ALPHA_CLAMP));
3480	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
3481						   R500_RGB_ADDR0_CONST |
3482						   R500_RGB_ADDR1(0) |
3483						   R500_RGB_ADDR2(2)));
3484	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(2) |
3485						   R500_ALPHA_ADDR0_CONST |
3486						   R500_ALPHA_ADDR1(0) |
3487						   R500_ALPHA_ADDR2(2)));
3488	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3489						   R500_ALU_RGB_R_SWIZ_A_R |
3490						   R500_ALU_RGB_G_SWIZ_A_G |
3491						   R500_ALU_RGB_B_SWIZ_A_B |
3492						   R500_ALU_RGB_SEL_B_SRC1 |
3493						   R500_ALU_RGB_R_SWIZ_B_R |
3494						   R500_ALU_RGB_B_SWIZ_B_G |
3495						   R500_ALU_RGB_G_SWIZ_B_B));
3496	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3497						   R500_ALPHA_ADDRD(0) |
3498						   R500_ALPHA_SWIZ_A_0 |
3499						   R500_ALPHA_SWIZ_B_0));
3500	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3501						   R500_ALU_RGBA_ADDRD(0) |
3502						   R500_ALU_RGBA_SEL_C_SRC2 |
3503						   R500_ALU_RGBA_R_SWIZ_R |
3504						   R500_ALU_RGBA_G_SWIZ_G |
3505						   R500_ALU_RGBA_B_SWIZ_B |
3506						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3507						   R500_ALU_RGBA_A_SWIZ_1));
3508
3509	} else {
3510	    BEGIN_RING(2*44);
3511	    /* 2 components: 2 for tex0 */
3512	    OUT_RING_REG(R300_RS_COUNT,
3513			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3514			   R300_RS_COUNT_HIRES_EN));
3515
3516	    /* R300_INST_COUNT_RS - highest RS instruction used */
3517	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3518
3519	    /* Pixel stack frame size. */
3520	    OUT_RING_REG(R300_US_PIXSIZE, 1); /* highest temp used */
3521
3522	    /* FP length. */
3523	    OUT_RING_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3524					      R500_US_CODE_END_ADDR(3)));
3525	    OUT_RING_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3526					       R500_US_CODE_RANGE_SIZE(3)));
3527
3528	    /* Prepare for FP emission. */
3529	    OUT_RING_REG(R500_US_CODE_OFFSET, 0);
3530	    OUT_RING_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3531
3532	    /* tex inst */
3533	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3534						   R500_INST_TEX_SEM_WAIT |
3535						   R500_INST_RGB_WMASK_R |
3536						   R500_INST_RGB_WMASK_G |
3537						   R500_INST_RGB_WMASK_B |
3538						   R500_INST_ALPHA_WMASK |
3539						   R500_INST_RGB_CLAMP |
3540						   R500_INST_ALPHA_CLAMP));
3541	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3542						   R500_TEX_INST_LD |
3543						   R500_TEX_SEM_ACQUIRE |
3544						   R500_TEX_IGNORE_UNCOVERED));
3545	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3546						   R500_TEX_SRC_S_SWIZ_R |
3547						   R500_TEX_SRC_T_SWIZ_G |
3548						   R500_TEX_DST_ADDR(0) |
3549						   R500_TEX_DST_R_SWIZ_R |
3550						   R500_TEX_DST_G_SWIZ_G |
3551						   R500_TEX_DST_B_SWIZ_B |
3552						   R500_TEX_DST_A_SWIZ_A));
3553	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3554						   R500_DX_S_SWIZ_R |
3555						   R500_DX_T_SWIZ_R |
3556						   R500_DX_R_SWIZ_R |
3557						   R500_DX_Q_SWIZ_R |
3558						   R500_DY_ADDR(0) |
3559						   R500_DY_S_SWIZ_R |
3560						   R500_DY_T_SWIZ_R |
3561						   R500_DY_R_SWIZ_R |
3562						   R500_DY_Q_SWIZ_R));
3563	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3564	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3565
3566	    /* ALU inst */
3567	    /* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
3568	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3569						   R500_INST_TEX_SEM_WAIT |
3570						   R500_INST_RGB_WMASK_R |
3571						   R500_INST_RGB_WMASK_G |
3572						   R500_INST_RGB_WMASK_B |
3573						   R500_INST_ALPHA_WMASK));
3574	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3575						   R500_RGB_ADDR0_CONST |
3576						   R500_RGB_ADDR1(0) |
3577						   R500_RGB_ADDR2(0) |
3578						   R500_RGB_ADDR2_CONST));
3579	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3580						   R500_ALPHA_ADDR0_CONST |
3581						   R500_ALPHA_ADDR1(0) |
3582						   R500_ALPHA_ADDR2(0) |
3583						   R500_ALPHA_ADDR2_CONST));
3584	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3585						   R500_ALU_RGB_R_SWIZ_A_A |
3586						   R500_ALU_RGB_G_SWIZ_A_A |
3587						   R500_ALU_RGB_B_SWIZ_A_A |
3588						   R500_ALU_RGB_SEL_B_SRC1 |
3589						   R500_ALU_RGB_R_SWIZ_B_G |
3590						   R500_ALU_RGB_B_SWIZ_B_G |
3591						   R500_ALU_RGB_G_SWIZ_B_G));
3592	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3593						   R500_ALPHA_ADDRD(1) |
3594						   R500_ALPHA_SWIZ_A_0 |
3595						   R500_ALPHA_SWIZ_B_0));
3596	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3597						   R500_ALU_RGBA_ADDRD(1) |
3598						   R500_ALU_RGBA_SEL_C_SRC0 |
3599						   R500_ALU_RGBA_R_SWIZ_R |
3600						   R500_ALU_RGBA_G_SWIZ_G |
3601						   R500_ALU_RGBA_B_SWIZ_B |
3602						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3603						   R500_ALU_RGBA_A_SWIZ_0));
3604
3605	    /* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
3606	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3607						   R500_INST_TEX_SEM_WAIT |
3608						   R500_INST_RGB_WMASK_R |
3609						   R500_INST_RGB_WMASK_G |
3610						   R500_INST_RGB_WMASK_B |
3611						   R500_INST_ALPHA_WMASK));
3612	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
3613						   R500_RGB_ADDR0_CONST |
3614						   R500_RGB_ADDR1(0) |
3615						   R500_RGB_ADDR2(1)));
3616	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3617						   R500_ALPHA_ADDR0_CONST |
3618						   R500_ALPHA_ADDR1(0) |
3619						   R500_ALPHA_ADDR2(1)));
3620	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3621						   R500_ALU_RGB_R_SWIZ_A_R |
3622						   R500_ALU_RGB_G_SWIZ_A_G |
3623						   R500_ALU_RGB_B_SWIZ_A_B |
3624						   R500_ALU_RGB_SEL_B_SRC1 |
3625						   R500_ALU_RGB_R_SWIZ_B_B |
3626						   R500_ALU_RGB_B_SWIZ_B_B |
3627						   R500_ALU_RGB_G_SWIZ_B_B));
3628	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3629						   R500_ALPHA_ADDRD(1) |
3630						   R500_ALPHA_SWIZ_A_0 |
3631						   R500_ALPHA_SWIZ_B_0));
3632	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3633						   R500_ALU_RGBA_ADDRD(1) |
3634						   R500_ALU_RGBA_SEL_C_SRC2 |
3635						   R500_ALU_RGBA_R_SWIZ_R |
3636						   R500_ALU_RGBA_G_SWIZ_G |
3637						   R500_ALU_RGBA_B_SWIZ_B |
3638						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3639						   R500_ALU_RGBA_A_SWIZ_0));
3640
3641	    /* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
3642	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3643						   R500_INST_TEX_SEM_WAIT |
3644						   R500_INST_LAST |
3645						   R500_INST_RGB_OMASK_R |
3646						   R500_INST_RGB_OMASK_G |
3647						   R500_INST_RGB_OMASK_B |
3648						   R500_INST_ALPHA_OMASK |
3649						   R500_INST_RGB_CLAMP |
3650						   R500_INST_ALPHA_CLAMP));
3651	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
3652						   R500_RGB_ADDR0_CONST |
3653						   R500_RGB_ADDR1(0) |
3654						   R500_RGB_ADDR2(1)));
3655	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3656						   R500_ALPHA_ADDR0_CONST |
3657						   R500_ALPHA_ADDR1(0) |
3658						   R500_ALPHA_ADDR2(1)));
3659	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3660						   R500_ALU_RGB_R_SWIZ_A_R |
3661						   R500_ALU_RGB_G_SWIZ_A_G |
3662						   R500_ALU_RGB_B_SWIZ_A_B |
3663						   R500_ALU_RGB_SEL_B_SRC1 |
3664						   R500_ALU_RGB_R_SWIZ_B_R |
3665						   R500_ALU_RGB_B_SWIZ_B_R |
3666						   R500_ALU_RGB_G_SWIZ_B_R));
3667	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3668						   R500_ALPHA_ADDRD(1) |
3669						   R500_ALPHA_SWIZ_A_0 |
3670						   R500_ALPHA_SWIZ_B_0));
3671	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3672						   R500_ALU_RGBA_ADDRD(1) |
3673						   R500_ALU_RGBA_SEL_C_SRC2 |
3674						   R500_ALU_RGBA_R_SWIZ_R |
3675						   R500_ALU_RGBA_G_SWIZ_G |
3676						   R500_ALU_RGBA_B_SWIZ_B |
3677						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3678						   R500_ALU_RGBA_A_SWIZ_1));
3679	}
3680
3681	/* Shader constants. */
3682	OUT_RING_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
3683
3684	/* constant 0: off, yco */
3685	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[0]);
3686	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[1]);
3687	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[2]);
3688	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, yco);
3689	/* constant 1: uco */
3690	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[0]);
3691	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[1]);
3692	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[2]);
3693	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, gamma);
3694	/* constant 2: vco */
3695	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[0]);
3696	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[1]);
3697	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[2]);
3698	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0.0);
3699
3700	ADVANCE_RING();
3701    }
3702
3703    BEGIN_ACCEL_RELOC(6, 2);
3704    OUT_RING_REG(R300_TX_INVALTAGS, 0);
3705    OUT_RING_REG(R300_TX_ENABLE, txenable);
3706
3707    EMIT_WRITE_OFFSET(R300_RB3D_COLOROFFSET0, 0, pPixmap);
3708    EMIT_COLORPITCH(R300_RB3D_COLORPITCH0, colorpitch, pPixmap);
3709
3710    /* no need to enable blending */
3711    OUT_RING_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
3712
3713    OUT_RING_REG(R300_VAP_VTX_SIZE, pPriv->vtx_count);
3714    ADVANCE_RING();
3715
3716    if (pPriv->vsync) {
3717	xf86CrtcPtr crtc;
3718	if (pPriv->desired_crtc)
3719	    crtc = pPriv->desired_crtc;
3720	else
3721	    crtc = radeon_pick_best_crtc(pScrn, FALSE,
3722					 pPriv->drw_x,
3723					 pPriv->drw_x + pPriv->dst_w,
3724					 pPriv->drw_y,
3725					 pPriv->drw_y + pPriv->dst_h);
3726	if (crtc)
3727	    RADEONWaitForVLine(pScrn, pPixmap,
3728			       crtc,
3729			       pPriv->drw_y - crtc->y,
3730			       (pPriv->drw_y - crtc->y) + pPriv->dst_h);
3731    }
3732
3733    return TRUE;
3734}
3735
3736static void
3737R500DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
3738{
3739    RADEONInfoPtr info = RADEONPTR(pScrn);
3740    PixmapPtr pPixmap = pPriv->pPixmap;
3741    int dstxoff, dstyoff;
3742    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
3743    int nBox = REGION_NUM_RECTS(&pPriv->clip);
3744
3745#ifdef COMPOSITE
3746    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
3747    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
3748#else
3749    dstxoff = 0;
3750    dstyoff = 0;
3751#endif
3752
3753    if (!R500PrepareTexturedVideo(pScrn, pPriv))
3754	return;
3755
3756    /*
3757     * Rendering of the actual polygon is done in two different
3758     * ways depending on chip generation:
3759     *
3760     * < R300:
3761     *
3762     *     These chips can render a rectangle in one pass, so
3763     *     handling is pretty straight-forward.
3764     *
3765     * >= R300:
3766     *
3767     *     These chips can accept a quad, but will render it as
3768     *     two triangles which results in a diagonal tear. Instead,
3769     *     we render a single, large triangle and use the scissor
3770     *     functionality to restrict it to the desired rectangle.
3771     *     Due to guardband limits on r3xx/r4xx, we can only use
3772     *     the single triangle up to 2880 pixels; above that we
3773     *     render as a quad.
3774     */
3775
3776    while (nBox--) {
3777	float srcX, srcY, srcw, srch;
3778	int dstX, dstY, dstw, dsth;
3779	int draw_size = 3 * pPriv->vtx_count + 4 + 2 + 3;
3780
3781	if (draw_size > radeon_cs_space_remaining(pScrn)) {
3782	    radeon_cs_flush_indirect(pScrn);
3783	    if (!R500PrepareTexturedVideo(pScrn, pPriv))
3784		return;
3785	}
3786
3787	dstX = pBox->x1 + dstxoff;
3788	dstY = pBox->y1 + dstyoff;
3789	dstw = pBox->x2 - pBox->x1;
3790	dsth = pBox->y2 - pBox->y1;
3791
3792	srcX = pPriv->src_x;
3793	srcX += ((pBox->x1 - pPriv->drw_x) *
3794		 pPriv->src_w) / (float)pPriv->dst_w;
3795	srcY = pPriv->src_y;
3796	srcY += ((pBox->y1 - pPriv->drw_y) *
3797		 pPriv->src_h) / (float)pPriv->dst_h;
3798
3799	srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
3800	srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
3801
3802	BEGIN_RING(2*2);
3803	OUT_RING_REG(R300_SC_SCISSOR0, (((dstX) << R300_SCISSOR_X_SHIFT) |
3804					 ((dstY) << R300_SCISSOR_Y_SHIFT)));
3805	OUT_RING_REG(R300_SC_SCISSOR1, (((dstX + dstw - 1) << R300_SCISSOR_X_SHIFT) |
3806					 ((dstY + dsth - 1) << R300_SCISSOR_Y_SHIFT)));
3807	ADVANCE_RING();
3808
3809	BEGIN_RING(3 * pPriv->vtx_count + 4);
3810	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
3811			    3 * pPriv->vtx_count));
3812	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
3813		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
3814		 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
3815
3816	if (pPriv->bicubic_enabled) {
3817	    VTX_OUT_6((float)dstX,            (float)dstY,
3818		      (float)srcX / pPriv->w, (float)srcY / pPriv->h,
3819		      (float)srcX + 0.5,      (float)srcY + 0.5);
3820	    VTX_OUT_6((float)dstX,            (float)(dstY + dstw + dsth),
3821		      (float)srcX / pPriv->w, ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h,
3822		      (float)srcX + 0.5,      (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
3823	    VTX_OUT_6((float)(dstX + dstw + dsth),                       (float)dstY,
3824		      ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
3825		      (float)srcY / pPriv->h,
3826		      (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
3827		      (float)srcY + 0.5);
3828	} else {
3829	    /*
3830	     * Render a big, scissored triangle. This means
3831	     * increasing the triangle size and adjusting
3832	     * texture coordinates.
3833	     */
3834	    VTX_OUT_4((float)dstX,            (float)dstY,
3835		      (float)srcX / pPriv->w, (float)srcY / pPriv->h);
3836	    VTX_OUT_4((float)dstX,                              (float)(dstY + dsth + dstw),
3837		      (float)srcX / pPriv->w, ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h);
3838	    VTX_OUT_4((float)(dstX + dstw + dsth),              (float)dstY,
3839		      ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
3840		      (float)srcY / pPriv->h);
3841	}
3842
3843	/* flushing is pipelined, free/finish is not */
3844	OUT_RING_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
3845
3846	ADVANCE_RING();
3847
3848	pBox++;
3849    }
3850
3851    BEGIN_RING(2*3);
3852    OUT_RING_REG(R300_SC_CLIP_RULE, 0xAAAA);
3853    OUT_RING_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
3854    OUT_RING_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
3855    ADVANCE_RING();
3856
3857    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
3858}
3859
3860#undef VTX_OUT_4
/* Retire the vertex-emission helper macros so they do not leak past here. */
#undef VTX_OUT_6
#undef VTX_OUT_4