radeon_textured_videofuncs.c revision 0a1d3ae0
/*
 * Copyright 2008 Alex Deucher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *
 * Based on radeon_exa_render.c and kdrive ati_video.c by Eric Anholt, et al.
 *
 */

/*
 * Emit one six-component vertex into the ring.
 *
 * The six arguments are written in the order given (destination x/y,
 * first coordinate pair, second coordinate pair); each one is converted
 * with F_TO_DW() and pushed with OUT_RING().  The caller must already
 * have reserved ring space for six dwords.
 */
#define VTX_OUT_6(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)       \
do {                                                                \
    OUT_RING(F_TO_DW(_dstX));                                       \
    OUT_RING(F_TO_DW(_dstY));                                       \
    OUT_RING(F_TO_DW(_srcX));                                       \
    OUT_RING(F_TO_DW(_srcY));                                       \
    OUT_RING(F_TO_DW(_maskX));                                      \
    OUT_RING(F_TO_DW(_maskY));                                      \
} while (0)
37
/*
 * Emit one four-component vertex into the ring.
 *
 * The four arguments are written in the order given (destination x/y,
 * then one coordinate pair); each one is converted with F_TO_DW() and
 * pushed with OUT_RING().  The caller must already have reserved ring
 * space for four dwords.
 */
#define VTX_OUT_4(_dstX, _dstY, _srcX, _srcY)                       \
do {                                                                \
    OUT_RING(F_TO_DW(_dstX));                                       \
    OUT_RING(F_TO_DW(_dstY));                                       \
    OUT_RING(F_TO_DW(_srcX));                                       \
    OUT_RING(F_TO_DW(_srcY));                                       \
} while (0)
45
46
47static Bool
48RADEONPrepareTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
49{
50    RADEONInfoPtr info = RADEONPTR(pScrn);
51    PixmapPtr pPixmap = pPriv->pPixmap;
52    struct radeon_exa_pixmap_priv *driver_priv;
53    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
54    uint32_t txformat, txsize, txpitch;
55    uint32_t dst_pitch, dst_format;
56    uint32_t colorpitch;
57    int pixel_shift;
58    int scissor_w = MIN(pPixmap->drawable.width, 2048) - 1;
59    int scissor_h = MIN(pPixmap->drawable.height, 2048) - 1;
60    int ret;
61
62    radeon_cs_space_reset_bos(info->cs);
63    radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
64
65    if (pPriv->bicubic_enabled)
66	radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo,
67					  RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
68
69    driver_priv = exaGetPixmapDriverPrivate(pPixmap);
70    radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo->bo.radeon, 0,
71				      RADEON_GEM_DOMAIN_VRAM);
72
73    ret = radeon_cs_space_check(info->cs);
74    if (ret) {
75	ErrorF("Not enough RAM to hw accel xv operation\n");
76	return FALSE;
77    }
78
79    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
80
81    dst_pitch = exaGetPixmapPitch(pPixmap);
82    RADEON_SWITCH_TO_3D();
83
84    /* Same for R100/R200 */
85    switch (pPixmap->drawable.bitsPerPixel) {
86    case 16:
87	if (pPixmap->drawable.depth == 15)
88	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
89	else
90	    dst_format = RADEON_COLOR_FORMAT_RGB565;
91	break;
92    case 32:
93	dst_format = RADEON_COLOR_FORMAT_ARGB8888;
94	break;
95    default:
96	return FALSE;
97    }
98
99    if (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12) {
100	pPriv->is_planar = TRUE;
101	txformat = RADEON_TXFORMAT_Y8;
102    } else {
103	pPriv->is_planar = FALSE;
104	if (pPriv->id == FOURCC_UYVY)
105	    txformat = RADEON_TXFORMAT_YVYU422;
106	else
107	    txformat = RADEON_TXFORMAT_VYUY422;
108    }
109
110    txformat |= RADEON_TXFORMAT_NON_POWER2;
111
112    colorpitch = dst_pitch >> pixel_shift;
113
114    if (RADEONTilingEnabled(pScrn, pPixmap))
115	colorpitch |= RADEON_COLOR_TILE_ENABLE;
116
117    BEGIN_ACCEL_RELOC(4,2);
118
119    OUT_RING_REG(RADEON_RB3D_CNTL, dst_format);
120    EMIT_WRITE_OFFSET(RADEON_RB3D_COLOROFFSET, 0, pPixmap);
121    EMIT_COLORPITCH(RADEON_RB3D_COLORPITCH, colorpitch, pPixmap);
122    OUT_RING_REG(RADEON_RB3D_BLENDCNTL,
123		  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
124
125    ADVANCE_RING();
126
127    if (pPriv->is_planar) {
128	/* need 2 texcoord sets (even though they are identical) due
129	   to denormalization! hw apparently can't premultiply
130	   same coord set by different texture size */
131	pPriv->vtx_count = 6;
132
133	txsize = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
134		  (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
135	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
136	txpitch -= 32;
137
138	BEGIN_ACCEL_RELOC(23, 3);
139
140	OUT_RING_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
141					  RADEON_SE_VTX_FMT_ST0 |
142					  RADEON_SE_VTX_FMT_ST1));
143
144	OUT_RING_REG(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE |
145				       RADEON_TEX_1_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
146				       RADEON_TEX_2_ENABLE | RADEON_TEX_BLEND_2_ENABLE |
147				       RADEON_PLANAR_YUV_ENABLE));
148
149	/* Y */
150	OUT_RING_REG(RADEON_PP_TXFILTER_0,
151		      RADEON_MAG_FILTER_LINEAR |
152		      RADEON_MIN_FILTER_LINEAR |
153		      RADEON_CLAMP_S_CLAMP_LAST |
154		      RADEON_CLAMP_T_CLAMP_LAST |
155		      RADEON_YUV_TO_RGB);
156	OUT_RING_REG(RADEON_PP_TXFORMAT_0, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ0);
157	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_0, 0, src_bo);
158	OUT_RING_REG(RADEON_PP_TXCBLEND_0,
159		      RADEON_COLOR_ARG_A_ZERO |
160		      RADEON_COLOR_ARG_B_ZERO |
161		      RADEON_COLOR_ARG_C_T0_COLOR |
162		      RADEON_BLEND_CTL_ADD |
163		      RADEON_CLAMP_TX);
164	OUT_RING_REG(RADEON_PP_TXABLEND_0,
165		      RADEON_ALPHA_ARG_A_ZERO |
166		      RADEON_ALPHA_ARG_B_ZERO |
167		      RADEON_ALPHA_ARG_C_T0_ALPHA |
168		      RADEON_BLEND_CTL_ADD |
169		      RADEON_CLAMP_TX);
170
171	OUT_RING_REG(RADEON_PP_TEX_SIZE_0,
172		      (pPriv->w - 1) |
173		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
174	OUT_RING_REG(RADEON_PP_TEX_PITCH_0,
175		      pPriv->src_pitch - 32);
176
177	/* U */
178	OUT_RING_REG(RADEON_PP_TXFILTER_1,
179		      RADEON_MAG_FILTER_LINEAR |
180		      RADEON_MIN_FILTER_LINEAR |
181		      RADEON_CLAMP_S_CLAMP_LAST |
182		      RADEON_CLAMP_T_CLAMP_LAST);
183	OUT_RING_REG(RADEON_PP_TXFORMAT_1, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ1);
184	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_1, pPriv->planeu_offset, src_bo);
185	OUT_RING_REG(RADEON_PP_TXCBLEND_1,
186		      RADEON_COLOR_ARG_A_ZERO |
187		      RADEON_COLOR_ARG_B_ZERO |
188		      RADEON_COLOR_ARG_C_T0_COLOR |
189		      RADEON_BLEND_CTL_ADD |
190		      RADEON_CLAMP_TX);
191	OUT_RING_REG(RADEON_PP_TXABLEND_1,
192		      RADEON_ALPHA_ARG_A_ZERO |
193		      RADEON_ALPHA_ARG_B_ZERO |
194		      RADEON_ALPHA_ARG_C_T0_ALPHA |
195		      RADEON_BLEND_CTL_ADD |
196		      RADEON_CLAMP_TX);
197
198	OUT_RING_REG(RADEON_PP_TEX_SIZE_1, txsize);
199	OUT_RING_REG(RADEON_PP_TEX_PITCH_1, txpitch);
200
201	/* V */
202	OUT_RING_REG(RADEON_PP_TXFILTER_2,
203		      RADEON_MAG_FILTER_LINEAR |
204		      RADEON_MIN_FILTER_LINEAR |
205		      RADEON_CLAMP_S_CLAMP_LAST |
206		      RADEON_CLAMP_T_CLAMP_LAST);
207	OUT_RING_REG(RADEON_PP_TXFORMAT_2, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ1);
208	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_2, pPriv->planev_offset, src_bo);
209	OUT_RING_REG(RADEON_PP_TXCBLEND_2,
210		      RADEON_COLOR_ARG_A_ZERO |
211		      RADEON_COLOR_ARG_B_ZERO |
212		      RADEON_COLOR_ARG_C_T0_COLOR |
213		      RADEON_BLEND_CTL_ADD |
214		      RADEON_CLAMP_TX);
215	OUT_RING_REG(RADEON_PP_TXABLEND_2,
216		      RADEON_ALPHA_ARG_A_ZERO |
217		      RADEON_ALPHA_ARG_B_ZERO |
218		      RADEON_ALPHA_ARG_C_T0_ALPHA |
219		      RADEON_BLEND_CTL_ADD |
220		      RADEON_CLAMP_TX);
221
222	OUT_RING_REG(RADEON_PP_TEX_SIZE_2, txsize);
223	OUT_RING_REG(RADEON_PP_TEX_PITCH_2, txpitch);
224	ADVANCE_RING();
225    } else {
226	pPriv->vtx_count = 4;
227	BEGIN_ACCEL_RELOC(9, 1);
228
229	OUT_RING_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
230					  RADEON_SE_VTX_FMT_ST0));
231
232	OUT_RING_REG(RADEON_PP_CNTL, RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
233
234	OUT_RING_REG(RADEON_PP_TXFILTER_0,
235		      RADEON_MAG_FILTER_LINEAR |
236		      RADEON_MIN_FILTER_LINEAR |
237		      RADEON_CLAMP_S_CLAMP_LAST |
238		      RADEON_CLAMP_T_CLAMP_LAST |
239		      RADEON_YUV_TO_RGB);
240	OUT_RING_REG(RADEON_PP_TXFORMAT_0, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ0);
241	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_0, 0, src_bo);
242	OUT_RING_REG(RADEON_PP_TXCBLEND_0,
243		      RADEON_COLOR_ARG_A_ZERO |
244		      RADEON_COLOR_ARG_B_ZERO |
245		      RADEON_COLOR_ARG_C_T0_COLOR |
246		      RADEON_BLEND_CTL_ADD |
247		      RADEON_CLAMP_TX);
248	OUT_RING_REG(RADEON_PP_TXABLEND_0,
249		      RADEON_ALPHA_ARG_A_ZERO |
250		      RADEON_ALPHA_ARG_B_ZERO |
251		      RADEON_ALPHA_ARG_C_T0_ALPHA |
252		      RADEON_BLEND_CTL_ADD |
253		      RADEON_CLAMP_TX);
254
255	OUT_RING_REG(RADEON_PP_TEX_SIZE_0,
256		      (pPriv->w - 1) |
257		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
258	OUT_RING_REG(RADEON_PP_TEX_PITCH_0,
259		      pPriv->src_pitch - 32);
260	ADVANCE_RING();
261    }
262
263    BEGIN_RING(2*2);
264    OUT_RING_REG(RADEON_RE_TOP_LEFT, 0);
265    OUT_RING_REG(RADEON_RE_WIDTH_HEIGHT, ((scissor_w << RADEON_RE_WIDTH_SHIFT) |
266					   (scissor_h << RADEON_RE_HEIGHT_SHIFT)));
267    ADVANCE_RING();
268
269    if (pPriv->vsync) {
270	xf86CrtcPtr crtc;
271	if (pPriv->desired_crtc)
272	    crtc = pPriv->desired_crtc;
273	else
274	    crtc = radeon_pick_best_crtc(pScrn, FALSE,
275					 pPriv->drw_x,
276					 pPriv->drw_x + pPriv->dst_w,
277					 pPriv->drw_y,
278					 pPriv->drw_y + pPriv->dst_h);
279	if (crtc)
280	    RADEONWaitForVLine(pScrn, pPixmap,
281				 crtc,
282				 pPriv->drw_y - crtc->y,
283				 (pPriv->drw_y - crtc->y) + pPriv->dst_h);
284    }
285
286    return TRUE;
287}
288
289static void
290RADEONDisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
291{
292    RADEONInfoPtr info = RADEONPTR(pScrn);
293    PixmapPtr pPixmap = pPriv->pPixmap;
294    int dstxoff, dstyoff;
295    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
296    int nBox = REGION_NUM_RECTS(&pPriv->clip);
297
298#ifdef COMPOSITE
299    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
300    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
301#else
302    dstxoff = 0;
303    dstyoff = 0;
304#endif
305
306    if (!RADEONPrepareTexturedVideo(pScrn, pPriv))
307	return;
308
309    /*
310     * Rendering of the actual polygon is done in two different
311     * ways depending on chip generation:
312     *
313     * < R300:
314     *
315     *     These chips can render a rectangle in one pass, so
316     *     handling is pretty straight-forward.
317     *
318     * >= R300:
319     *
320     *     These chips can accept a quad, but will render it as
321     *     two triangles which results in a diagonal tear. Instead
322     *     We render a single, large triangle and use the scissor
323     *     functionality to restrict it to the desired rectangle.
324     *     Due to guardband limits on r3xx/r4xx, we can only use
325     *     the single triangle up to 2560/4021 pixels; above that we
326     *     render as a quad.
327     */
328    while (nBox) {
329	int draw_size = 3 * pPriv->vtx_count + 5;
330	int loop_boxes;
331
332	if (draw_size > radeon_cs_space_remaining(pScrn)) {
333	    radeon_cs_flush_indirect(pScrn);
334	    if (!RADEONPrepareTexturedVideo(pScrn, pPriv))
335		return;
336	}
337	loop_boxes = MIN(radeon_cs_space_remaining(pScrn) / draw_size, nBox);
338	nBox -= loop_boxes;
339
340	BEGIN_RING(loop_boxes * 3 * pPriv->vtx_count + 5);
341	OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD,
342			    loop_boxes * 3 * pPriv->vtx_count + 1));
343	if (pPriv->is_planar)
344	    OUT_RING(RADEON_CP_VC_FRMT_XY |
345		     RADEON_CP_VC_FRMT_ST0 |
346		     RADEON_CP_VC_FRMT_ST1);
347	else
348	    OUT_RING(RADEON_CP_VC_FRMT_XY |
349		     RADEON_CP_VC_FRMT_ST0);
350	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
351		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
352		 RADEON_CP_VC_CNTL_MAOS_ENABLE |
353		 RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
354		 ((loop_boxes * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
355
356	while (loop_boxes--) {
357	    float srcX, srcY, srcw, srch;
358	    int dstX, dstY, dstw, dsth;
359	    dstX = pBox->x1 + dstxoff;
360	    dstY = pBox->y1 + dstyoff;
361	    dstw = pBox->x2 - pBox->x1;
362	    dsth = pBox->y2 - pBox->y1;
363
364	    srcX = pPriv->src_x;
365	    srcX += ((pBox->x1 - pPriv->drw_x) *
366		     pPriv->src_w) / (float)pPriv->dst_w;
367	    srcY = pPriv->src_y;
368	    srcY += ((pBox->y1 - pPriv->drw_y) *
369		     pPriv->src_h) / (float)pPriv->dst_h;
370
371	    srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
372	    srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
373
374
375	    if (pPriv->is_planar) {
376		/*
377		 * Just render a rect (using three coords).
378		 */
379		VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
380			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
381			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
382		VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
383			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
384			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
385		VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
386			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
387			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
388	    } else {
389		/*
390		 * Just render a rect (using three coords).
391		 */
392		VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
393			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
394		VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
395			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
396		VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
397			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
398	    }
399
400	    pBox++;
401	}
402
403	OUT_RING_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
404	ADVANCE_RING();
405    }
406    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
407}
408
409static Bool
410R200PrepareTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
411{
412    RADEONInfoPtr info = RADEONPTR(pScrn);
413    PixmapPtr pPixmap = pPriv->pPixmap;
414    struct radeon_exa_pixmap_priv *driver_priv;
415    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
416    uint32_t txformat;
417    uint32_t txfilter, txsize, txpitch;
418    uint32_t dst_pitch, dst_format;
419    uint32_t colorpitch;
420    int pixel_shift;
421    int scissor_w = MIN(pPixmap->drawable.width, 2048) - 1;
422    int scissor_h = MIN(pPixmap->drawable.height, 2048) - 1;
423    /* note: in contrast to r300, use input biasing on uv components */
424    const float Loff = -0.0627;
425    float uvcosf, uvsinf;
426    float yco, yoff;
427    float uco[3], vco[3];
428    float bright, cont, sat;
429    int ref = pPriv->transform_index;
430    float ucscale = 0.25, vcscale = 0.25;
431    Bool needux8 = FALSE, needvx8 = FALSE;
432    int ret;
433
434    radeon_cs_space_reset_bos(info->cs);
435    radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
436
437    if (pPriv->bicubic_enabled)
438	radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo,
439					  RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
440
441    driver_priv = exaGetPixmapDriverPrivate(pPixmap);
442    radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo->bo.radeon, 0,
443				      RADEON_GEM_DOMAIN_VRAM);
444
445    ret = radeon_cs_space_check(info->cs);
446    if (ret) {
447	ErrorF("Not enough RAM to hw accel xv operation\n");
448	return FALSE;
449    }
450
451    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
452
453    dst_pitch = exaGetPixmapPitch(pPixmap);
454
455    RADEON_SWITCH_TO_3D();
456
457    /* Same for R100/R200 */
458    switch (pPixmap->drawable.bitsPerPixel) {
459    case 16:
460	if (pPixmap->drawable.depth == 15)
461	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
462	else
463	    dst_format = RADEON_COLOR_FORMAT_RGB565;
464	break;
465    case 32:
466	dst_format = RADEON_COLOR_FORMAT_ARGB8888;
467	break;
468    default:
469	return FALSE;
470    }
471
472    if (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12) {
473	pPriv->is_planar = TRUE;
474	txformat = RADEON_TXFORMAT_I8;
475    } else {
476	pPriv->is_planar = FALSE;
477	if (pPriv->id == FOURCC_UYVY)
478	    txformat = RADEON_TXFORMAT_YVYU422;
479	else
480	    txformat = RADEON_TXFORMAT_VYUY422;
481    }
482
483    txformat |= RADEON_TXFORMAT_NON_POWER2;
484
485    colorpitch = dst_pitch >> pixel_shift;
486
487    if (RADEONTilingEnabled(pScrn, pPixmap))
488	colorpitch |= RADEON_COLOR_TILE_ENABLE;
489
490    BEGIN_ACCEL_RELOC(4,2);
491
492    OUT_RING_REG(RADEON_RB3D_CNTL, dst_format);
493    EMIT_WRITE_OFFSET(RADEON_RB3D_COLOROFFSET, 0, pPixmap);
494    EMIT_COLORPITCH(RADEON_RB3D_COLORPITCH, colorpitch, pPixmap);
495
496    OUT_RING_REG(RADEON_RB3D_BLENDCNTL,
497		  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
498
499    ADVANCE_RING();
500
501    txfilter =  R200_MAG_FILTER_LINEAR |
502	R200_MIN_FILTER_LINEAR |
503	R200_CLAMP_S_CLAMP_LAST |
504	R200_CLAMP_T_CLAMP_LAST;
505
506    /* contrast can cause constant overflow, clamp */
507    cont = RTFContrast(pPriv->contrast);
508    if (cont * trans[ref].RefLuma > 2.0)
509	cont = 2.0 / trans[ref].RefLuma;
510    /* brightness is only from -0.5 to 0.5 should be safe */
511    bright = RTFBrightness(pPriv->brightness);
512    /* saturation can also cause overflow, clamp */
513    sat = RTFSaturation(pPriv->saturation);
514    if (sat * trans[ref].RefBCb > 4.0)
515	sat = 4.0 / trans[ref].RefBCb;
516    uvcosf = sat * cos(RTFHue(pPriv->hue));
517    uvsinf = sat * sin(RTFHue(pPriv->hue));
518
519    yco = trans[ref].RefLuma * cont;
520    uco[0] = -trans[ref].RefRCr * uvsinf;
521    uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
522    uco[2] = trans[ref].RefBCb * uvcosf;
523    vco[0] = trans[ref].RefRCr * uvcosf;
524    vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
525    vco[2] = trans[ref].RefBCb * uvsinf;
526    yoff = Loff * yco + bright;
527
528    if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
529	needux8 = TRUE;
530	ucscale = 0.125;
531    }
532    if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
533	needvx8 = TRUE;
534	vcscale = 0.125;
535    }
536
537    if (pPriv->is_planar) {
538	/* need 2 texcoord sets (even though they are identical) due
539	   to denormalization! hw apparently can't premultiply
540	   same coord set by different texture size */
541	pPriv->vtx_count = 6;
542
543	txsize = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
544		  (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
545	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
546	txpitch -= 32;
547
548	BEGIN_ACCEL_RELOC(36, 3);
549
550	OUT_RING_REG(RADEON_PP_CNTL,
551		      RADEON_TEX_0_ENABLE | RADEON_TEX_1_ENABLE | RADEON_TEX_2_ENABLE |
552		      RADEON_TEX_BLEND_0_ENABLE |
553		      RADEON_TEX_BLEND_1_ENABLE |
554		      RADEON_TEX_BLEND_2_ENABLE);
555
556	OUT_RING_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
557	OUT_RING_REG(R200_SE_VTX_FMT_1,
558		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT) |
559		      (2 << R200_VTX_TEX1_COMP_CNT_SHIFT));
560
561	OUT_RING_REG(R200_PP_TXFILTER_0, txfilter);
562	OUT_RING_REG(R200_PP_TXFORMAT_0, txformat);
563	OUT_RING_REG(R200_PP_TXFORMAT_X_0, 0);
564	OUT_RING_REG(R200_PP_TXSIZE_0,
565		      (pPriv->w - 1) |
566		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
567	OUT_RING_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
568	OUT_TEXTURE_REG(R200_PP_TXOFFSET_0, 0, src_bo);
569
570	OUT_RING_REG(R200_PP_TXFILTER_1, txfilter);
571	OUT_RING_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
572	OUT_RING_REG(R200_PP_TXFORMAT_X_1, 0);
573	OUT_RING_REG(R200_PP_TXSIZE_1, txsize);
574	OUT_RING_REG(R200_PP_TXPITCH_1, txpitch);
575	OUT_TEXTURE_REG(R200_PP_TXOFFSET_1, pPriv->planeu_offset, src_bo);
576
577	OUT_RING_REG(R200_PP_TXFILTER_2, txfilter);
578	OUT_RING_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
579	OUT_RING_REG(R200_PP_TXFORMAT_X_2, 0);
580	OUT_RING_REG(R200_PP_TXSIZE_2, txsize);
581	OUT_RING_REG(R200_PP_TXPITCH_2, txpitch);
582	OUT_TEXTURE_REG(R200_PP_TXOFFSET_2, pPriv->planev_offset, src_bo);
583
584	/* similar to r300 code. Note the big problem is that hardware constants
585	 * are 8 bits only, representing 0.0-1.0. We can get that up (using bias
586	 * + scale) to -1.0-1.0 (but precision will suffer). AFAIK the hw actually
587	 * has 12 bits fractional precision (plus 1 sign bit, 3 range bits) but
588	 * the constants not. To get larger range can use output scale, but for
589	 * that 2.018 value we need a total scale by 8, which means the constants
590	 * really have no accuracy whatsoever (5 fractional bits only).
591	 * The only direct way to get high  precision "constants" into the fragment
592	 * pipe I know of is to use the texcoord interpolator (not color, this one
593	 * is 8 bit only too), which seems a bit expensive. We're lucky though it
594	 * seems the values we need seem to fit better than worst case (get about
595	 * 6 fractional bits for this instead of 5, at least when not correcting for
596	 * hue/saturation/contrast/brightness, which is the same as for vco - yco and
597	 * yoff get 8 fractional bits). Try to preserve as much accuracy as possible
598	 * even with non-default saturation/hue/contrast/brightness adjustments,
599	 * it gets a little crazy and ultimately precision might still be lacking.
600	 *
601	 * A higher precision (8 fractional bits) version might just put uco into
602	 * a texcoord, and calculate a new vcoconst in the shader, like so:
603	 * cohelper = {1.0, 0.0, 0.0} - shouldn't use 0.5 since not exactly representable
604	 * vco = {1.5958 - 1.0, -0.8129 + 1.0, 1.0}
605	 * vcocalc = ADD temp, bias/scale(cohelper), vco
606	 * would in total use 4 tex units, 4 instructions which seems fairly
607	 * balanced for this architecture (instead of 3 + 3 for the solution here)
608	 *
609	 * temp = MAD(yco, yuv.yyyy, yoff)
610	 * temp = MAD(uco, yuv.uuuu, temp)
611	 * result = MAD(vco, yuv.vvvv, temp)
612	 *
613	 * note first mad produces actually scalar, hence we transform
614	 * it into a dp2a to get 8 bit precision of yco instead of 7 -
615	 * That's assuming hw correctly expands consts to internal precision.
616	 * (y * 1 + y * (yco - 1) + yoff)
617	 * temp = DP2A / 2 (yco, yuv.yyyy, yoff)
618	 * temp = MAD (uco / 4, yuv.uuuu * 2, temp)
619	 * result = MAD x2 (vco / 2, yuv.vvvv, temp)
620	 *
621	 * vco, uco need bias (and hence scale too)
622	 *
623	 */
624
625	/* MAD temp0 / 2, const0.a * 2, temp0, -const0.rgb */
626	OUT_RING_REG(R200_PP_TXCBLEND_0,
627		      R200_TXC_ARG_A_TFACTOR_COLOR |
628		      R200_TXC_ARG_B_R0_COLOR |
629		      R200_TXC_ARG_C_TFACTOR_COLOR |
630		      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
631		      R200_TXC_OP_DOT2_ADD);
632	OUT_RING_REG(R200_PP_TXCBLEND2_0,
633		      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
634		      R200_TXC_SCALE_INV2 |
635		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
636	OUT_RING_REG(R200_PP_TXABLEND_0,
637		      R200_TXA_ARG_A_ZERO |
638		      R200_TXA_ARG_B_ZERO |
639		      R200_TXA_ARG_C_ZERO |
640		      R200_TXA_OP_MADD);
641	OUT_RING_REG(R200_PP_TXABLEND2_0,
642		      R200_TXA_OUTPUT_REG_NONE);
643
644	/* MAD temp0, (const1 - 0.5) * 2, (temp1 - 0.5) * 2, temp0 */
645	OUT_RING_REG(R200_PP_TXCBLEND_1,
646		      R200_TXC_ARG_A_TFACTOR_COLOR |
647		      R200_TXC_BIAS_ARG_A |
648		      R200_TXC_SCALE_ARG_A |
649		      R200_TXC_ARG_B_R1_COLOR |
650		      R200_TXC_BIAS_ARG_B |
651		      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
652		      R200_TXC_ARG_C_R0_COLOR |
653		      R200_TXC_OP_MADD);
654	OUT_RING_REG(R200_PP_TXCBLEND2_1,
655		      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
656		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
657	OUT_RING_REG(R200_PP_TXABLEND_1,
658		      R200_TXA_ARG_A_ZERO |
659		      R200_TXA_ARG_B_ZERO |
660		      R200_TXA_ARG_C_ZERO |
661		      R200_TXA_OP_MADD);
662	OUT_RING_REG(R200_PP_TXABLEND2_1,
663		      R200_TXA_OUTPUT_REG_NONE);
664
665	/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp2 - 0.5), temp0 */
666	OUT_RING_REG(R200_PP_TXCBLEND_2,
667		      R200_TXC_ARG_A_TFACTOR_COLOR |
668		      R200_TXC_BIAS_ARG_A |
669		      R200_TXC_SCALE_ARG_A |
670		      R200_TXC_ARG_B_R2_COLOR |
671		      R200_TXC_BIAS_ARG_B |
672		      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
673		      R200_TXC_ARG_C_R0_COLOR |
674		      R200_TXC_OP_MADD);
675	OUT_RING_REG(R200_PP_TXCBLEND2_2,
676		      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
677		      R200_TXC_SCALE_2X |
678		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
679	OUT_RING_REG(R200_PP_TXABLEND_2,
680		      R200_TXA_ARG_A_ZERO |
681		      R200_TXA_ARG_B_ZERO |
682		      R200_TXA_ARG_C_ZERO |
683		      R200_TXA_COMP_ARG_C |
684		      R200_TXA_OP_MADD);
685	OUT_RING_REG(R200_PP_TXABLEND2_2,
686		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
687
688	/* shader constants */
689	OUT_RING_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
690						      yco > 1.0 ? yco - 1.0: yco,
691						      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
692						      0.0));
693	OUT_RING_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
694						      uco[1] * ucscale + 0.5, /* or [-2, 2] */
695						      uco[2] * ucscale + 0.5,
696						      0.0));
697	OUT_RING_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
698						      vco[1] * vcscale + 0.5, /* or [-4, 4] */
699						      vco[2] * vcscale + 0.5,
700						      0.0));
701
702	ADVANCE_RING();
703    } else {
704	pPriv->vtx_count = 4;
705
706	BEGIN_ACCEL_RELOC(24, 1);
707
708	OUT_RING_REG(RADEON_PP_CNTL,
709		      RADEON_TEX_0_ENABLE |
710		      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
711		      RADEON_TEX_BLEND_2_ENABLE);
712
713	OUT_RING_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
714	OUT_RING_REG(R200_SE_VTX_FMT_1,
715		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
716
717	OUT_RING_REG(R200_PP_TXFILTER_0, txfilter);
718	OUT_RING_REG(R200_PP_TXFORMAT_0, txformat);
719	OUT_RING_REG(R200_PP_TXFORMAT_X_0, 0);
720	OUT_RING_REG(R200_PP_TXSIZE_0,
721		      (pPriv->w - 1) |
722		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
723	OUT_RING_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
724	OUT_TEXTURE_REG(R200_PP_TXOFFSET_0, 0, src_bo);
725
726	/* MAD temp1 / 2, const0.a * 2, temp0.ggg, -const0.rgb */
727	OUT_RING_REG(R200_PP_TXCBLEND_0,
728		      R200_TXC_ARG_A_TFACTOR_COLOR |
729		      R200_TXC_ARG_B_R0_COLOR |
730		      R200_TXC_ARG_C_TFACTOR_COLOR |
731		      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
732		      R200_TXC_OP_DOT2_ADD);
733	OUT_RING_REG(R200_PP_TXCBLEND2_0,
734		      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
735		      R200_TXC_SCALE_INV2 |
736		      (R200_TXC_REPL_GREEN << R200_TXC_REPL_ARG_B_SHIFT) |
737		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
738	OUT_RING_REG(R200_PP_TXABLEND_0,
739		      R200_TXA_ARG_A_ZERO |
740		      R200_TXA_ARG_B_ZERO |
741		      R200_TXA_ARG_C_ZERO |
742		      R200_TXA_OP_MADD);
743	OUT_RING_REG(R200_PP_TXABLEND2_0,
744		      R200_TXA_OUTPUT_REG_NONE);
745
746	/* MAD temp1, (const1 - 0.5) * 2, (temp0.rrr - 0.5) * 2, temp1 */
747	OUT_RING_REG(R200_PP_TXCBLEND_1,
748		      R200_TXC_ARG_A_TFACTOR_COLOR |
749		      R200_TXC_BIAS_ARG_A |
750		      R200_TXC_SCALE_ARG_A |
751		      R200_TXC_ARG_B_R0_COLOR |
752		      R200_TXC_BIAS_ARG_B |
753		      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
754		      R200_TXC_ARG_C_R1_COLOR |
755		      R200_TXC_OP_MADD);
756	OUT_RING_REG(R200_PP_TXCBLEND2_1,
757		      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
758		      (R200_TXC_REPL_BLUE << R200_TXC_REPL_ARG_B_SHIFT) |
759		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
760	OUT_RING_REG(R200_PP_TXABLEND_1,
761		      R200_TXA_ARG_A_ZERO |
762		      R200_TXA_ARG_B_ZERO |
763		      R200_TXA_ARG_C_ZERO |
764		      R200_TXA_OP_MADD);
765	OUT_RING_REG(R200_PP_TXABLEND2_1,
766		      R200_TXA_OUTPUT_REG_NONE);
767
768	/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp0.bbb - 0.5), temp1 */
769	OUT_RING_REG(R200_PP_TXCBLEND_2,
770		      R200_TXC_ARG_A_TFACTOR_COLOR |
771		      R200_TXC_BIAS_ARG_A |
772		      R200_TXC_SCALE_ARG_A |
773		      R200_TXC_ARG_B_R0_COLOR |
774		      R200_TXC_BIAS_ARG_B |
775		      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
776		      R200_TXC_ARG_C_R1_COLOR |
777		      R200_TXC_OP_MADD);
778	OUT_RING_REG(R200_PP_TXCBLEND2_2,
779		      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
780		      R200_TXC_SCALE_2X |
781		      (R200_TXC_REPL_RED << R200_TXC_REPL_ARG_B_SHIFT) |
782		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
783	OUT_RING_REG(R200_PP_TXABLEND_2,
784		      R200_TXA_ARG_A_ZERO |
785		      R200_TXA_ARG_B_ZERO |
786		      R200_TXA_ARG_C_ZERO |
787		      R200_TXA_COMP_ARG_C |
788		      R200_TXA_OP_MADD);
789	OUT_RING_REG(R200_PP_TXABLEND2_2,
790		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
791
792	/* shader constants */
793	OUT_RING_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
794						      yco > 1.0 ? yco - 1.0: yco,
795						      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
796						      0.0));
797	OUT_RING_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
798						      uco[1] * ucscale + 0.5, /* or [-2, 2] */
799						      uco[2] * ucscale + 0.5,
800						      0.0));
801	OUT_RING_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
802						      vco[1] * vcscale + 0.5, /* or [-4, 4] */
803						      vco[2] * vcscale + 0.5,
804						      0.0));
805
806	ADVANCE_RING();
807    }
808
809    BEGIN_RING(2*2);
810    OUT_RING_REG(RADEON_RE_TOP_LEFT, 0);
811    OUT_RING_REG(RADEON_RE_WIDTH_HEIGHT, ((scissor_w << RADEON_RE_WIDTH_SHIFT) |
812					   (scissor_h << RADEON_RE_HEIGHT_SHIFT)));
813    ADVANCE_RING();
814
815    if (pPriv->vsync) {
816	xf86CrtcPtr crtc;
817	if (pPriv->desired_crtc)
818	    crtc = pPriv->desired_crtc;
819	else
820	    crtc = radeon_pick_best_crtc(pScrn, FALSE,
821					 pPriv->drw_x,
822					 pPriv->drw_x + pPriv->dst_w,
823					 pPriv->drw_y,
824					 pPriv->drw_y + pPriv->dst_h);
825	if (crtc)
826	    RADEONWaitForVLine(pScrn, pPixmap,
827				 crtc,
828				 pPriv->drw_y - crtc->y,
829				 (pPriv->drw_y - crtc->y) + pPriv->dst_h);
830    }
831
832    return TRUE;
833}
834
835static void
836R200DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
837{
838    RADEONInfoPtr info = RADEONPTR(pScrn);
839    PixmapPtr pPixmap = pPriv->pPixmap;
840    int dstxoff, dstyoff;
841    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
842    int nBox = REGION_NUM_RECTS(&pPriv->clip);
843
844#ifdef COMPOSITE
845    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
846    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
847#else
848    dstxoff = 0;
849    dstyoff = 0;
850#endif
851
852    if (!R200PrepareTexturedVideo(pScrn, pPriv))
853	return;
854
855    /*
856     * Rendering of the actual polygon is done in two different
857     * ways depending on chip generation:
858     *
859     * < R300:
860     *
861     *     These chips can render a rectangle in one pass, so
862     *     handling is pretty straight-forward.
863     *
864     * >= R300:
865     *
866     *     These chips can accept a quad, but will render it as
867     *     two triangles which results in a diagonal tear. Instead
868     *     We render a single, large triangle and use the scissor
869     *     functionality to restrict it to the desired rectangle.
870     *     Due to guardband limits on r3xx/r4xx, we can only use
871     *     the single triangle up to 2560/4021 pixels; above that we
872     *     render as a quad.
873     */
874
875    while (nBox) {
876	int draw_size = 3 * pPriv->vtx_count + 4;
877	int loop_boxes;
878
879	if (draw_size > radeon_cs_space_remaining(pScrn)) {
880	    radeon_cs_flush_indirect(pScrn);
881	    if (!R200PrepareTexturedVideo(pScrn, pPriv))
882		return;
883	}
884	loop_boxes = MIN(radeon_cs_space_remaining(pScrn) / draw_size, nBox);
885	nBox -= loop_boxes;
886
887	BEGIN_RING(loop_boxes * 3 * pPriv->vtx_count + 4);
888	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
889			    loop_boxes * 3 * pPriv->vtx_count));
890	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
891		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
892		 ((loop_boxes * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
893
894	while (loop_boxes--) {
895	    float srcX, srcY, srcw, srch;
896	    int dstX, dstY, dstw, dsth;
897	    dstX = pBox->x1 + dstxoff;
898	    dstY = pBox->y1 + dstyoff;
899	    dstw = pBox->x2 - pBox->x1;
900	    dsth = pBox->y2 - pBox->y1;
901
902	    srcX = pPriv->src_x;
903	    srcX += ((pBox->x1 - pPriv->drw_x) *
904		     pPriv->src_w) / (float)pPriv->dst_w;
905	    srcY = pPriv->src_y;
906	    srcY += ((pBox->y1 - pPriv->drw_y) *
907		     pPriv->src_h) / (float)pPriv->dst_h;
908
909	    srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
910	    srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
911
912	    if (pPriv->is_planar) {
913		/*
914		 * Just render a rect (using three coords).
915		 */
916		VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
917			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
918			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
919		VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
920			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
921			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
922		VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
923			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
924			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
925	    } else {
926		/*
927		 * Just render a rect (using three coords).
928		 */
929		VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
930			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
931		VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
932			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
933		VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
934			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
935	    }
936
937	    pBox++;
938	}
939
940	OUT_RING_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
941	ADVANCE_RING();
942    }
943
944    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
945}
946
947static Bool
948R300PrepareTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
949{
950    RADEONInfoPtr info = RADEONPTR(pScrn);
951    PixmapPtr pPixmap = pPriv->pPixmap;
952    struct radeon_exa_pixmap_priv *driver_priv;
953    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
954    uint32_t txfilter, txformat0, txformat1, txpitch;
955    uint32_t dst_pitch, dst_format;
956    uint32_t txenable, colorpitch;
957    uint32_t output_fmt;
958    int pixel_shift;
959    int ret;
960
961    radeon_cs_space_reset_bos(info->cs);
962    radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
963
964    if (pPriv->bicubic_enabled)
965	radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo,
966					  RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
967
968    driver_priv = exaGetPixmapDriverPrivate(pPixmap);
969    radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo->bo.radeon, 0,
970				      RADEON_GEM_DOMAIN_VRAM);
971
972    ret = radeon_cs_space_check(info->cs);
973    if (ret) {
974	ErrorF("Not enough RAM to hw accel xv operation\n");
975	return FALSE;
976    }
977
978    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
979
980    dst_pitch = exaGetPixmapPitch(pPixmap);
981    RADEON_SWITCH_TO_3D();
982
983    if (pPriv->bicubic_enabled)
984	pPriv->vtx_count = 6;
985    else
986	pPriv->vtx_count = 4;
987
988    switch (pPixmap->drawable.bitsPerPixel) {
989    case 16:
990	if (pPixmap->drawable.depth == 15)
991	    dst_format = R300_COLORFORMAT_ARGB1555;
992	else
993	    dst_format = R300_COLORFORMAT_RGB565;
994	break;
995    case 32:
996	dst_format = R300_COLORFORMAT_ARGB8888;
997	break;
998    default:
999	return FALSE;
1000    }
1001
1002    output_fmt = (R300_OUT_FMT_C4_8 |
1003		  R300_OUT_FMT_C0_SEL_BLUE |
1004		  R300_OUT_FMT_C1_SEL_GREEN |
1005		  R300_OUT_FMT_C2_SEL_RED |
1006		  R300_OUT_FMT_C3_SEL_ALPHA);
1007
1008    colorpitch = dst_pitch >> pixel_shift;
1009    colorpitch |= dst_format;
1010
1011    if (RADEONTilingEnabled(pScrn, pPixmap))
1012	colorpitch |= R300_COLORTILE;
1013
1014
1015    if (((pPriv->bicubic_state == BICUBIC_OFF)) &&
1016	(pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12))
1017	pPriv->is_planar = TRUE;
1018    else
1019	pPriv->is_planar = FALSE;
1020
1021    if (pPriv->is_planar) {
1022	txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
1023	txpitch = pPriv->src_pitch;
1024    } else {
1025	if (pPriv->id == FOURCC_UYVY)
1026	    txformat1 = R300_TX_FORMAT_YVYU422;
1027	else
1028	    txformat1 = R300_TX_FORMAT_VYUY422;
1029
1030	if (pPriv->bicubic_state != BICUBIC_OFF)
1031	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
1032
1033	/* pitch is in pixels */
1034	txpitch = pPriv->src_pitch / 2;
1035    }
1036    txpitch -= 1;
1037
1038    txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
1039		 (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
1040		 R300_TXPITCH_EN);
1041
1042    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
1043		R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
1044		R300_TX_MAG_FILTER_LINEAR |
1045		R300_TX_MIN_FILTER_LINEAR |
1046		(0 << R300_TX_ID_SHIFT));
1047
1048    BEGIN_ACCEL_RELOC(6, 1);
1049    OUT_RING_REG(R300_TX_FILTER0_0, txfilter);
1050    OUT_RING_REG(R300_TX_FILTER1_0, 0);
1051    OUT_RING_REG(R300_TX_FORMAT0_0, txformat0);
1052    if (pPriv->is_planar)
1053	OUT_RING_REG(R300_TX_FORMAT1_0, txformat1 | R300_TX_FORMAT_CACHE_HALF_REGION_0);
1054    else
1055	OUT_RING_REG(R300_TX_FORMAT1_0, txformat1);
1056    OUT_RING_REG(R300_TX_FORMAT2_0, txpitch);
1057    OUT_TEXTURE_REG(R300_TX_OFFSET_0, 0, src_bo);
1058    ADVANCE_RING();
1059
1060    txenable = R300_TEX_0_ENABLE;
1061
1062    if (pPriv->is_planar) {
1063	txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
1064		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
1065		     R300_TXPITCH_EN);
1066	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
1067	txpitch -= 1;
1068	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
1069		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
1070		    R300_TX_MIN_FILTER_LINEAR |
1071		    R300_TX_MAG_FILTER_LINEAR);
1072
1073	BEGIN_ACCEL_RELOC(12, 2);
1074	OUT_RING_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
1075	OUT_RING_REG(R300_TX_FILTER1_1, 0);
1076	OUT_RING_REG(R300_TX_FORMAT0_1, txformat0);
1077	OUT_RING_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2);
1078	OUT_RING_REG(R300_TX_FORMAT2_1, txpitch);
1079	OUT_TEXTURE_REG(R300_TX_OFFSET_1, pPriv->planeu_offset, src_bo);
1080	OUT_RING_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
1081	OUT_RING_REG(R300_TX_FILTER1_2, 0);
1082	OUT_RING_REG(R300_TX_FORMAT0_2, txformat0);
1083	OUT_RING_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3);
1084	OUT_RING_REG(R300_TX_FORMAT2_2, txpitch);
1085	OUT_TEXTURE_REG(R300_TX_OFFSET_2, pPriv->planev_offset, src_bo);
1086	ADVANCE_RING();
1087	txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
1088    }
1089
1090    if (pPriv->bicubic_enabled) {
1091	/* Size is 128x1 */
1092	txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
1093		     (0x0 << R300_TXHEIGHT_SHIFT) |
1094		     R300_TXPITCH_EN);
1095	/* Format is 32-bit floats, 4bpp */
1096	txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
1097	/* Pitch is 127 (128-1) */
1098	txpitch = 0x7f;
1099	/* Tex filter */
1100	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
1101		    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
1102		    R300_TX_MIN_FILTER_NEAREST |
1103		    R300_TX_MAG_FILTER_NEAREST |
1104		    (1 << R300_TX_ID_SHIFT));
1105
1106	BEGIN_ACCEL_RELOC(6, 1);
1107	OUT_RING_REG(R300_TX_FILTER0_1, txfilter);
1108	OUT_RING_REG(R300_TX_FILTER1_1, 0);
1109	OUT_RING_REG(R300_TX_FORMAT0_1, txformat0);
1110	OUT_RING_REG(R300_TX_FORMAT1_1, txformat1);
1111	OUT_RING_REG(R300_TX_FORMAT2_1, txpitch);
1112	OUT_TEXTURE_REG(R300_TX_OFFSET_1, 0, info->bicubic_bo);
1113	ADVANCE_RING();
1114
1115	/* Enable tex 1 */
1116	txenable |= R300_TEX_1_ENABLE;
1117    }
1118
1119    /* setup the VAP */
1120    if (info->accel_state->has_tcl) {
1121	if (pPriv->bicubic_enabled)
1122	    BEGIN_RING(2*7);
1123	else
1124	    BEGIN_RING(2*6);
1125    } else {
1126	if (pPriv->bicubic_enabled)
1127	    BEGIN_RING(2*5);
1128	else
1129	    BEGIN_RING(2*4);
1130    }
1131
1132    /* These registers define the number, type, and location of data submitted
1133     * to the PVS unit of GA input (when PVS is disabled)
1134     * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
1135     * enabled.  This memory provides the imputs to the vertex shader program
1136     * and ordering is not important.  When PVS/TCL is disabled, this field maps
1137     * directly to the GA input memory and the order is significant.  In
1138     * PVS_BYPASS mode the order is as follows:
1139     * Position
1140     * Point Size
1141     * Color 0-3
1142     * Textures 0-7
1143     * Fog
1144     */
1145    if (pPriv->bicubic_enabled) {
1146	OUT_RING_REG(R300_VAP_PROG_STREAM_CNTL_0,
1147		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
1148		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
1149		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
1150		       R300_SIGNED_0 |
1151		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
1152		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
1153		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
1154		       R300_SIGNED_1));
1155	OUT_RING_REG(R300_VAP_PROG_STREAM_CNTL_1,
1156		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
1157		       (0 << R300_SKIP_DWORDS_2_SHIFT) |
1158		       (7 << R300_DST_VEC_LOC_2_SHIFT) |
1159		       R300_LAST_VEC_2 |
1160		       R300_SIGNED_2));
1161    } else {
1162	OUT_RING_REG(R300_VAP_PROG_STREAM_CNTL_0,
1163		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
1164		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
1165		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
1166		       R300_SIGNED_0 |
1167		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
1168		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
1169		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
1170		       R300_LAST_VEC_1 |
1171		       R300_SIGNED_1));
1172    }
1173
1174    /* load the vertex shader
1175     * We pre-load vertex programs in RADEONInit3DEngine():
1176     * - exa
1177     * - Xv
1178     * - Xv bicubic
1179     * Here we select the offset of the vertex program we want to use
1180     */
1181    if (info->accel_state->has_tcl) {
1182	if (pPriv->bicubic_enabled) {
1183	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_0,
1184			  ((11 << R300_PVS_FIRST_INST_SHIFT) |
1185			   (13 << R300_PVS_XYZW_VALID_INST_SHIFT) |
1186			   (13 << R300_PVS_LAST_INST_SHIFT)));
1187	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_1,
1188			  (13 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
1189	} else {
1190	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_0,
1191			  ((9 << R300_PVS_FIRST_INST_SHIFT) |
1192			   (10 << R300_PVS_XYZW_VALID_INST_SHIFT) |
1193			   (10 << R300_PVS_LAST_INST_SHIFT)));
1194	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_1,
1195			  (10 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
1196	}
1197    }
1198
1199    /* Position and one set of 2 texture coordinates */
1200    OUT_RING_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
1201    if (pPriv->bicubic_enabled)
1202	OUT_RING_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
1203					       (2 << R300_TEX_1_COMP_CNT_SHIFT)));
1204    else
1205	OUT_RING_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
1206
1207    OUT_RING_REG(R300_US_OUT_FMT_0, output_fmt);
1208    ADVANCE_RING();
1209
1210    /* setup pixel shader */
1211    if (pPriv->bicubic_state != BICUBIC_OFF) {
1212	if (pPriv->bicubic_enabled) {
1213	    BEGIN_RING(2*79);
1214
1215	    /* 4 components: 2 for tex0 and 2 for tex1 */
1216	    OUT_RING_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1217					  R300_RS_COUNT_HIRES_EN));
1218
1219	    /* R300_INST_COUNT_RS - highest RS instruction used */
1220	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1));
1221
1222	    /* Pixel stack frame size. */
1223	    OUT_RING_REG(R300_US_PIXSIZE, 5);
1224
1225	    /* Indirection levels */
1226	    OUT_RING_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) |
1227					   R300_FIRST_TEX));
1228
1229	    /* Set nodes. */
1230	    OUT_RING_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1231						R300_ALU_CODE_SIZE(14) |
1232						R300_TEX_CODE_OFFSET(0) |
1233						R300_TEX_CODE_SIZE(6)));
1234
1235	    /* Nodes are allocated highest first, but executed lowest first */
1236	    OUT_RING_REG(R300_US_CODE_ADDR_0, 0);
1237	    OUT_RING_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) |
1238						R300_ALU_SIZE(0) |
1239						R300_TEX_START(0) |
1240						R300_TEX_SIZE(0)));
1241	    OUT_RING_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) |
1242						R300_ALU_SIZE(9) |
1243						R300_TEX_START(1) |
1244						R300_TEX_SIZE(0)));
1245	    OUT_RING_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) |
1246						R300_ALU_SIZE(2) |
1247						R300_TEX_START(2) |
1248						R300_TEX_SIZE(3) |
1249						R300_RGBA_OUT));
1250
1251	    /* ** BICUBIC FP ** */
1252
1253	    /* texcoord0 => temp0
1254	     * texcoord1 => temp1 */
1255
1256	    // first node
1257	    /* TEX temp2, temp1.rrr0, tex1, 1D */
1258	    OUT_RING_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) |
1259						R300_TEX_ID(1) |
1260						R300_TEX_SRC_ADDR(1) |
1261						R300_TEX_DST_ADDR(2)));
1262
1263	    /* MOV temp1.r, temp1.ggg0 */
1264	    OUT_RING_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1265						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1266						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1267						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1268	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) |
1269						    R300_ALU_RGB_ADDRD(1) |
1270						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
1271	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1272						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1273						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1274						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1275	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) |
1276						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1277
1278
1279	    // second node
1280	    /* TEX temp1, temp1, tex1, 1D */
1281	    OUT_RING_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) |
1282						R300_TEX_ID(1) |
1283						R300_TEX_SRC_ADDR(1) |
1284						R300_TEX_DST_ADDR(1)));
1285
1286	    /* MUL temp3.rg, temp2.ggg0, const0.rgb0 */
1287	    OUT_RING_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1288						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1289						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1290						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1291	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) |
1292						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
1293						    R300_ALU_RGB_ADDRD(3) |
1294						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1295	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1296						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1297						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1298						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1299	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) |
1300						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1301
1302
1303	    /* MUL temp2.rg, temp2.rrr0, const0.rgb */
1304	    OUT_RING_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1305						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1306						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1307						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1308	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) |
1309						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
1310						    R300_ALU_RGB_ADDRD(2) |
1311						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1312	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1313						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1314						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1315						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1316	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) |
1317						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1318
1319	    /* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */
1320	    OUT_RING_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1321						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1322						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1323						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1324	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) |
1325						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1326						    R300_ALU_RGB_ADDR2(3) |
1327						    R300_ALU_RGB_ADDRD(4) |
1328						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1329	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1330						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1331						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1332						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1333	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) |
1334						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1335
1336	    /* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */
1337	    OUT_RING_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1338						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1339						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1340						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1341	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) |
1342						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1343						    R300_ALU_RGB_ADDR2(2) |
1344						    R300_ALU_RGB_ADDRD(5) |
1345						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1346	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1347						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1348						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1349						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1350	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) |
1351						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1352
1353	    /* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */
1354	    OUT_RING_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1355						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1356						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1357						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1358	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) |
1359						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1360						    R300_ALU_RGB_ADDR2(3) |
1361						    R300_ALU_RGB_ADDRD(3) |
1362						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1363	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1364						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1365						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1366						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1367	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) |
1368						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1369
1370	    /* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */
1371	    OUT_RING_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1372						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1373						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1374						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1375	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) |
1376						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1377						    R300_ALU_RGB_ADDR2(2) |
1378						    R300_ALU_RGB_ADDRD(1) |
1379						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1380	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1381						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1382						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1383						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1384	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) |
1385						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1386
1387	    /* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */
1388	    OUT_RING_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1389						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1390						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1391						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1392	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
1393						    R300_ALU_RGB_ADDR2(1) |
1394						    R300_ALU_RGB_ADDRD(1) |
1395						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1396	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1397						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1398						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1399						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1400	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) |
1401						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1402
1403	    /* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */
1404	    OUT_RING_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1405						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1406						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1407						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1408	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
1409						    R300_ALU_RGB_ADDR2(3) |
1410						    R300_ALU_RGB_ADDRD(2) |
1411						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1412	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1413						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1414						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1415						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1416	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) |
1417						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1418
1419	    /* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */
1420	    OUT_RING_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1421						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1422						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1423						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1424	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
1425						    R300_ALU_RGB_ADDR2(5) |
1426						    R300_ALU_RGB_ADDRD(3) |
1427						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1428	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1429						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1430						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1431						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1432	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) |
1433						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1434
1435	    /* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */
1436	    OUT_RING_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1437						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1438						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1439						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1440	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) |
1441						     R300_ALU_RGB_ADDR2(4) |
1442						     R300_ALU_RGB_ADDRD(0) |
1443						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1444	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1445						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1446						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1447						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1448	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) |
1449						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1450
1451
1452	    // third node
1453	    /* TEX temp4, temp1.rg--, tex0, 1D */
1454	    OUT_RING_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) |
1455						R300_TEX_ID(0) |
1456						R300_TEX_SRC_ADDR(1) |
1457						R300_TEX_DST_ADDR(4)));
1458
1459	    /* TEX temp3, temp3.rg--, tex0, 1D */
1460	    OUT_RING_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) |
1461						R300_TEX_ID(0) |
1462						R300_TEX_SRC_ADDR(3) |
1463						R300_TEX_DST_ADDR(3)));
1464
1465	    /* TEX temp5, temp2.rg--, tex0, 1D */
1466	    OUT_RING_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) |
1467						R300_TEX_ID(0) |
1468						R300_TEX_SRC_ADDR(2) |
1469						R300_TEX_DST_ADDR(5)));
1470
1471	    /* TEX temp0, temp0.rg--, tex0, 1D */
1472	    OUT_RING_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) |
1473						R300_TEX_ID(0) |
1474						R300_TEX_SRC_ADDR(0) |
1475						R300_TEX_DST_ADDR(0)));
1476
1477	    /* LRP temp3, temp1.bbbb, temp4, temp3 ->
1478	     * - PRESUB temps, temp4 - temp3
1479	     * - MAD temp3, temp1.bbbb, temps, temp3 */
1480	    OUT_RING_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1481						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1482						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1483						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1484						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
1485	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) |
1486						     R300_ALU_RGB_ADDR1(4) |
1487						     R300_ALU_RGB_ADDR2(1) |
1488						     R300_ALU_RGB_ADDRD(3) |
1489						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1490	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1491						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1492						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1493						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1494	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) |
1495						       R300_ALU_ALPHA_ADDR1(4) |
1496						       R300_ALU_ALPHA_ADDR2(1) |
1497						       R300_ALU_ALPHA_ADDRD(3) |
1498						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
1499
1500	    /* LRP temp0, temp1.bbbb, temp5, temp0 ->
1501	     * - PRESUB temps, temp5 - temp0
1502	     * - MAD temp0, temp1.bbbb, temps, temp0 */
1503	    OUT_RING_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1504						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1505						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1506						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1507						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) |
1508						     R300_ALU_RGB_INSERT_NOP));
1509	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) |
1510						     R300_ALU_RGB_ADDR1(5) |
1511						     R300_ALU_RGB_ADDR2(1) |
1512						     R300_ALU_RGB_ADDRD(0) |
1513						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1514	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1515						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1516						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1517						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1518	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) |
1519						       R300_ALU_ALPHA_ADDR1(5) |
1520						       R300_ALU_ALPHA_ADDR2(1) |
1521						       R300_ALU_ALPHA_ADDRD(0) |
1522						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
1523
1524	    /* LRP output, temp2.bbbb, temp3, temp0 ->
1525	     * - PRESUB temps, temp3 - temp0
1526	     * - MAD output, temp2.bbbb, temps, temp0 */
1527	    OUT_RING_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1528						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1529						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1530						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1531						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
1532	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) |
1533						     R300_ALU_RGB_ADDR1(3) |
1534						     R300_ALU_RGB_ADDR2(2) |
1535						     R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
1536	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1537						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1538						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1539						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1540	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) |
1541						       R300_ALU_ALPHA_ADDR1(3) |
1542						       R300_ALU_ALPHA_ADDR2(2) |
1543						       R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A)));
1544
1545	    /* Shader constants. */
1546	    OUT_RING_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w));
1547	    OUT_RING_REG(R300_US_ALU_CONST_G(0), 0);
1548	    OUT_RING_REG(R300_US_ALU_CONST_B(0), 0);
1549	    OUT_RING_REG(R300_US_ALU_CONST_A(0), 0);
1550
1551	    OUT_RING_REG(R300_US_ALU_CONST_R(1), 0);
1552	    OUT_RING_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h));
1553	    OUT_RING_REG(R300_US_ALU_CONST_B(1), 0);
1554	    OUT_RING_REG(R300_US_ALU_CONST_A(1), 0);
1555
1556	    ADVANCE_RING();
1557	} else {
1558	    BEGIN_RING(2*11);
1559	    /* 2 components: 2 for tex0 */
1560	    OUT_RING_REG(R300_RS_COUNT,
1561                          ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1562                           R300_RS_COUNT_HIRES_EN));
1563	    /* R300_INST_COUNT_RS - highest RS instruction used */
1564	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
1565
1566	    OUT_RING_REG(R300_US_PIXSIZE, 0); /* highest temp used */
1567
1568	    /* Indirection levels */
1569	    OUT_RING_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
1570					   R300_FIRST_TEX));
1571
1572	    OUT_RING_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1573						R300_ALU_CODE_SIZE(1) |
1574						R300_TEX_CODE_OFFSET(0) |
1575						R300_TEX_CODE_SIZE(1)));
1576
1577	    OUT_RING_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
1578						R300_ALU_SIZE(0) |
1579						R300_TEX_START(0) |
1580						R300_TEX_SIZE(0) |
1581						R300_RGBA_OUT));
1582
1583	    /* tex inst */
1584	    OUT_RING_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
1585					       R300_TEX_DST_ADDR(0) |
1586					       R300_TEX_ID(0) |
1587					       R300_TEX_INST(R300_TEX_INST_LD)));
1588
1589	    /* ALU inst */
1590	    /* RGB */
1591	    OUT_RING_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) |
1592                                                   R300_ALU_RGB_ADDR1(0) |
1593                                                   R300_ALU_RGB_ADDR2(0) |
1594                                                   R300_ALU_RGB_ADDRD(0) |
1595                                                   R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R |
1596								       R300_ALU_RGB_MASK_G |
1597								       R300_ALU_RGB_MASK_B)) |
1598                                                   R300_ALU_RGB_TARGET_A));
1599	    OUT_RING_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1600                                                   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1601                                                   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1602						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1603                                                   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
1604                                                   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1605                                                   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1606                                                   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
1607                                                   R300_ALU_RGB_CLAMP));
1608	    /* Alpha */
1609	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) |
1610						     R300_ALU_ALPHA_ADDR1(0) |
1611						     R300_ALU_ALPHA_ADDR2(0) |
1612						     R300_ALU_ALPHA_ADDRD(0) |
1613						     R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
1614						     R300_ALU_ALPHA_TARGET_A |
1615						     R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE)));
1616	    OUT_RING_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) |
1617						     R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) |
1618						     R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) |
1619						     R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) |
1620						     R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) |
1621						     R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) |
1622						     R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1623						     R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) |
1624						     R300_ALU_ALPHA_CLAMP));
1625	    ADVANCE_RING();
1626	}
1627    } else {
1628	/*
1629	 * y' = y - .0625
1630	 * u' = u - .5
1631	 * v' = v - .5;
1632	 *
1633	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
1634	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
1635	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
1636	 *
1637	 * DP3 might look like the straightforward solution
1638	 * but we'd need to move the texture yuv values in
1639	 * the same reg for this to work. Therefore use MADs.
1640	 * Brightness just adds to the off constant.
1641	 * Contrast is multiplication of luminance.
1642	 * Saturation and hue change the u and v coeffs.
1643	 * Default values (before adjustments - depend on colorspace):
1644	 * yco = 1.1643
1645	 * uco = 0, -0.39173, 2.017
1646	 * vco = 1.5958, -0.8129, 0
1647	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
1648	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
1649	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
1650	 *
1651	 * temp = MAD(yco, yuv.yyyy, off)
1652	 * temp = MAD(uco, yuv.uuuu, temp)
1653	 * result = MAD(vco, yuv.vvvv, temp)
1654	 */
1655	/* TODO: don't recalc consts always */
1656	const float Loff = -0.0627;
1657	const float Coff = -0.502;
1658	float uvcosf, uvsinf;
1659	float yco;
1660	float uco[3], vco[3], off[3];
1661	float bright, cont, gamma;
1662	int ref = pPriv->transform_index;
1663	Bool needgamma = FALSE;
1664
1665	cont = RTFContrast(pPriv->contrast);
1666	bright = RTFBrightness(pPriv->brightness);
1667	gamma = (float)pPriv->gamma / 1000.0;
1668	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
1669	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
1670	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
1671
1672	yco = trans[ref].RefLuma * cont;
1673	uco[0] = -trans[ref].RefRCr * uvsinf;
1674	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
1675	uco[2] = trans[ref].RefBCb * uvcosf;
1676	vco[0] = trans[ref].RefRCr * uvcosf;
1677	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
1678	vco[2] = trans[ref].RefBCb * uvsinf;
1679	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
1680	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
1681	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
1682
1683	if (gamma != 1.0) {
1684	    needgamma = TRUE;
1685	    /* note: gamma correction is out = in ^ gamma;
1686	       gpu can only do LG2/EX2 therefore we transform into
1687	       in ^ gamma = 2 ^ (log2(in) * gamma).
1688	       Lots of scalar ops, unfortunately (better solution?) -
1689	       without gamma that's 3 inst, with gamma it's 10...
1690	       could use different gamma factors per channel,
1691	       if that's of any use. */
1692	}
1693
1694	if (pPriv->is_planar) {
1695	    BEGIN_RING(2 * (needgamma ? (28 + 33) : 33));
1696	    /* 2 components: same 2 for tex0/1/2 */
1697	    OUT_RING_REG(R300_RS_COUNT,
1698			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1699			   R300_RS_COUNT_HIRES_EN));
1700	    /* R300_INST_COUNT_RS - highest RS instruction used */
1701	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
1702
1703	    OUT_RING_REG(R300_US_PIXSIZE, 2); /* highest temp used */
1704
1705	    /* Indirection levels */
1706	    OUT_RING_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
1707					   R300_FIRST_TEX));
1708
1709	    OUT_RING_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1710						R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
1711						R300_TEX_CODE_OFFSET(0) |
1712						R300_TEX_CODE_SIZE(3)));
1713
1714	    OUT_RING_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
1715						R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
1716						R300_TEX_START(0) |
1717						R300_TEX_SIZE(2) |
1718						R300_RGBA_OUT));
1719
1720	    /* tex inst */
1721	    OUT_RING_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
1722					       R300_TEX_DST_ADDR(2) |
1723					       R300_TEX_ID(0) |
1724					       R300_TEX_INST(R300_TEX_INST_LD)));
1725	    OUT_RING_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
1726					       R300_TEX_DST_ADDR(1) |
1727					       R300_TEX_ID(1) |
1728					       R300_TEX_INST(R300_TEX_INST_LD)));
1729	    OUT_RING_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
1730					       R300_TEX_DST_ADDR(0) |
1731					       R300_TEX_ID(2) |
1732					       R300_TEX_INST(R300_TEX_INST_LD)));
1733
1734	    /* ALU inst */
1735	    /* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
1736	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
1737						    R300_ALU_RGB_ADDR1(2) |
1738						    R300_ALU_RGB_ADDR2(0) |
1739						    R300_ALU_RGB_ADDRD(2) |
1740						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1741	    OUT_RING_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
1742						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1743						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1744						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1745						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1746						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1747						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1748						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
1749	    /* alpha nop, but need to set up alpha source for rgb usage */
1750	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
1751						      R300_ALU_ALPHA_ADDR1(2) |
1752						      R300_ALU_ALPHA_ADDR2(0) |
1753						      R300_ALU_ALPHA_ADDRD(2) |
1754						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1755	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1756						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1757						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1758						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1759
1760	    /* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
1761	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
1762						    R300_ALU_RGB_ADDR1(1) |
1763						    R300_ALU_RGB_ADDR2(2) |
1764						    R300_ALU_RGB_ADDRD(2) |
1765						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1766	    OUT_RING_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1767						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1768						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1769						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1770						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
1771						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1772						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1773						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
1774	    /* alpha nop */
1775	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(2) |
1776						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1777	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1778						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1779						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1780						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1781
1782	    /* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
1783	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
1784						    R300_ALU_RGB_ADDR1(0) |
1785						    R300_ALU_RGB_ADDR2(2) |
1786						    R300_ALU_RGB_ADDRD(0) |
1787						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
1788						    (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
1789	    OUT_RING_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1790						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1791						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1792						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1793						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
1794						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1795						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1796						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
1797						    R300_ALU_RGB_CLAMP));
1798	    /* write alpha 1 */
1799	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
1800						      R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
1801						      R300_ALU_ALPHA_TARGET_A));
1802	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1803						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1804						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1805						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
1806
1807	    if (needgamma) {
1808		/* rgb temp0.r = op_sop, set up src0 reg */
1809		OUT_RING_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
1810							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
1811		OUT_RING_REG(R300_US_ALU_RGB_INST(3),
1812			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1813			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1814		/* alpha lg2 temp0, temp0.r */
1815		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
1816							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1817		OUT_RING_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
1818							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
1819							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1820							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1821
1822		/* rgb temp0.g = op_sop, set up src0 reg */
1823		OUT_RING_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
1824							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
1825		OUT_RING_REG(R300_US_ALU_RGB_INST(4),
1826			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1827			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1828		/* alpha lg2 temp0, temp0.g */
1829		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
1830							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1831		OUT_RING_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
1832							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
1833							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1834							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1835
1836		/* rgb temp0.b = op_sop, set up src0 reg */
1837		OUT_RING_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
1838							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
1839		OUT_RING_REG(R300_US_ALU_RGB_INST(5),
1840			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1841			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1842		/* alpha lg2 temp0, temp0.b */
1843		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
1844							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1845		OUT_RING_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
1846							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
1847							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1848							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1849
1850		/* MUL temp0.rgb, temp0.rgb, const1.aaa (log2(in) * gamma; const1.a holds gamma) */
1851		OUT_RING_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
1852							R300_ALU_RGB_ADDR1(0) |
1853							R300_ALU_RGB_ADDR2(0) |
1854							R300_ALU_RGB_ADDRD(0) |
1855							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1856		OUT_RING_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1857							R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1858							R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
1859							R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1860							R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
1861							R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1862							R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1863							R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
1864		/* alpha nop, but set up const1 */
1865		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
1866							  R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
1867							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1868		OUT_RING_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1869							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1870							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1871							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1872
1873		/* rgb out0.r = op_sop, set up src0 reg */
1874		OUT_RING_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
1875							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
1876							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
1877		OUT_RING_REG(R300_US_ALU_RGB_INST(7),
1878			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1879			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1880		/* alpha ex2 temp0, temp0.r */
1881		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
1882							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1883		OUT_RING_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
1884							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
1885							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1886							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1887
1888		/* rgb out0.g = op_sop, set up src0 reg */
1889		OUT_RING_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
1890							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
1891							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
1892		OUT_RING_REG(R300_US_ALU_RGB_INST(8),
1893			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1894			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1895		/* alpha ex2 temp0, temp0.g */
1896		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
1897							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1898		OUT_RING_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
1899							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
1900							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1901							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1902
1903		/* rgb out0.b = op_sop, set up src0 reg */
1904		OUT_RING_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
1905							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
1906							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
1907		OUT_RING_REG(R300_US_ALU_RGB_INST(9),
1908			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1909			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1910		/* alpha ex2 temp0, temp0.b */
1911		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
1912							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1913		OUT_RING_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
1914							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
1915							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1916							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1917	    }
1918	} else {
1919	    BEGIN_RING(2 * (needgamma ? (28 + 31) : 31));
1920	    /* 2 components */
1921	    OUT_RING_REG(R300_RS_COUNT,
1922			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1923			   R300_RS_COUNT_HIRES_EN));
1924	    /* R300_INST_COUNT_RS - highest RS instruction used */
1925	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
1926
1927	    OUT_RING_REG(R300_US_PIXSIZE, 1); /* highest temp used */
1928
1929	    /* Indirection levels */
1930	    OUT_RING_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
1931					   R300_FIRST_TEX));
1932
1933	    OUT_RING_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1934						R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
1935						R300_TEX_CODE_OFFSET(0) |
1936						R300_TEX_CODE_SIZE(1)));
1937
1938	    OUT_RING_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
1939						R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
1940						R300_TEX_START(0) |
1941						R300_TEX_SIZE(0) |
1942						R300_RGBA_OUT));
1943
1944	    /* tex inst */
1945	    OUT_RING_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
1946					       R300_TEX_DST_ADDR(0) |
1947					       R300_TEX_ID(0) |
1948					       R300_TEX_INST(R300_TEX_INST_LD)));
1949
1950	    /* ALU inst */
1951	    /* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
1952	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
1953						    R300_ALU_RGB_ADDR1(0) |
1954						    R300_ALU_RGB_ADDR2(0) |
1955						    R300_ALU_RGB_ADDRD(1) |
1956						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1957	    OUT_RING_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
1958						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1959						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_GGG) |
1960						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1961						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1962						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1963						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1964						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
1965	    /* alpha nop, but need to set up alpha source for rgb usage */
1966	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
1967						      R300_ALU_ALPHA_ADDR1(0) |
1968						      R300_ALU_ALPHA_ADDR2(0) |
1969						      R300_ALU_ALPHA_ADDRD(0) |
1970						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1971	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1972						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1973						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1974						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1975
1976	    /* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
1977	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
1978						    R300_ALU_RGB_ADDR1(0) |
1979						    R300_ALU_RGB_ADDR2(1) |
1980						    R300_ALU_RGB_ADDRD(1) |
1981						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1982	    OUT_RING_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1983						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1984						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_BBB) |
1985						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1986						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
1987						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1988						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1989						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
1990	    /* alpha nop */
1991	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
1992						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1993	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1994						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1995						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1996						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1997
1998	    /* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
1999	    OUT_RING_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
2000						    R300_ALU_RGB_ADDR1(0) |
2001						    R300_ALU_RGB_ADDR2(1) |
2002						    R300_ALU_RGB_ADDRD(0) |
2003						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
2004						    (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
2005	    OUT_RING_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2006						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2007						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RRR) |
2008						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2009						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
2010						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2011						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2012						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
2013						    R300_ALU_RGB_CLAMP));
2014	    /* write alpha 1 */
2015	    OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
2016						      R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
2017						      R300_ALU_ALPHA_TARGET_A));
2018	    OUT_RING_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2019						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2020						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2021						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
2022
2023	    if (needgamma) {
2024		/* rgb temp0.r = op_sop, set up src0 reg */
2025		OUT_RING_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
2026							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
2027		OUT_RING_REG(R300_US_ALU_RGB_INST(3),
2028			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2029			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2030		/* alpha lg2 temp0, temp0.r */
2031		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
2032							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2033		OUT_RING_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2034							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2035							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2036							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2037
2038		/* rgb temp0.g = op_sop, set up src0 reg */
2039		OUT_RING_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
2040							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
2041		OUT_RING_REG(R300_US_ALU_RGB_INST(4),
2042			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2043			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2044		/* alpha lg2 temp0, temp0.g */
2045		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
2046							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2047		OUT_RING_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2048							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2049							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2050							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2051
2052		/* rgb temp0.b = op_sop, set up src0 reg */
2053		OUT_RING_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
2054							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
2055		OUT_RING_REG(R300_US_ALU_RGB_INST(5),
2056			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2057			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2058		/* alpha lg2 temp0, temp0.b */
2059		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
2060							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2061		OUT_RING_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2062							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2063							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2064							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2065
2066		/* MUL temp0.rgb, temp0.rgb, const1.aaa (log2(in) * gamma; const1.a holds gamma) */
2067		OUT_RING_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
2068							R300_ALU_RGB_ADDR1(0) |
2069							R300_ALU_RGB_ADDR2(0) |
2070							R300_ALU_RGB_ADDRD(0) |
2071							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2072		OUT_RING_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2073							R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2074							R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
2075							R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2076							R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
2077							R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2078							R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2079							R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2080		/* alpha nop, but set up const1 */
2081		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
2082							  R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
2083							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2084		OUT_RING_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2085							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2086							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2087							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2088
2089		/* rgb out0.r = op_sop, set up src0 reg */
2090		OUT_RING_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
2091							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
2092							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
2093		OUT_RING_REG(R300_US_ALU_RGB_INST(7),
2094			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2095			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2096		/* alpha ex2 temp0, temp0.r */
2097		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
2098							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2099		OUT_RING_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2100							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2101							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2102							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2103
2104		/* rgb out0.g = op_sop, set up src0 reg */
2105		OUT_RING_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
2106							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
2107							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
2108		OUT_RING_REG(R300_US_ALU_RGB_INST(8),
2109			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2110			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2111		/* alpha ex2 temp0, temp0.g */
2112		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
2113							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2114		OUT_RING_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2115							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2116							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2117							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2118
2119		/* rgb out0.b = op_sop, set up src0 reg */
2120		OUT_RING_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
2121							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
2122							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
2123		OUT_RING_REG(R300_US_ALU_RGB_INST(9),
2124			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2125			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2126		/* alpha ex2 temp0, temp0.b */
2127		OUT_RING_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
2128							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2129		OUT_RING_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2130							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2131							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2132							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2133	    }
2134	}
2135
2136	/* Shader constants. */
2137	/* constant 0: off, yco */
2138	OUT_RING_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
2139	OUT_RING_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1]));
2140	OUT_RING_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2]));
2141	OUT_RING_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco));
2142	/* constant 1: uco */
2143	OUT_RING_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
2144	OUT_RING_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
2145	OUT_RING_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
2146	OUT_RING_REG(R300_US_ALU_CONST_A(1), F_TO_24(gamma));
2147	/* constant 2: vco */
2148	OUT_RING_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
2149	OUT_RING_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
2150	OUT_RING_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2]));
2151	OUT_RING_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0));
2152
2153	ADVANCE_RING();
2154    }
2155
2156    BEGIN_ACCEL_RELOC(6, 2);
2157    OUT_RING_REG(R300_TX_INVALTAGS, 0);
2158    OUT_RING_REG(R300_TX_ENABLE, txenable);
2159
2160    EMIT_WRITE_OFFSET(R300_RB3D_COLOROFFSET0, 0, pPixmap);
2161    EMIT_COLORPITCH(R300_RB3D_COLORPITCH0, colorpitch, pPixmap);
2162
2163    /* no need to enable blending */
2164    OUT_RING_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
2165
2166    OUT_RING_REG(R300_VAP_VTX_SIZE, pPriv->vtx_count);
2167    ADVANCE_RING();
2168
2169    if (pPriv->vsync) {
2170	xf86CrtcPtr crtc;
2171	if (pPriv->desired_crtc)
2172	    crtc = pPriv->desired_crtc;
2173	else
2174	    crtc = radeon_pick_best_crtc(pScrn, FALSE,
2175					 pPriv->drw_x,
2176					 pPriv->drw_x + pPriv->dst_w,
2177					 pPriv->drw_y,
2178					 pPriv->drw_y + pPriv->dst_h);
2179	if (crtc)
2180	    RADEONWaitForVLine(pScrn, pPixmap,
2181			       crtc,
2182			       pPriv->drw_y - crtc->y,
2183			       (pPriv->drw_y - crtc->y) + pPriv->dst_h);
2184    }
2185
2186    return TRUE;
2187}
2188
2189static void
2190R300DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
2191{
2192    RADEONInfoPtr info = RADEONPTR(pScrn);
2193    PixmapPtr pPixmap = pPriv->pPixmap;
2194    int dstxoff, dstyoff;
2195    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
2196    int nBox = REGION_NUM_RECTS(&pPriv->clip);
2197
2198#ifdef COMPOSITE
2199    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
2200    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
2201#else
2202    dstxoff = 0;
2203    dstyoff = 0;
2204#endif
2205
2206    if (!R300PrepareTexturedVideo(pScrn, pPriv))
2207	return;
2208
2209    /*
2210     * Rendering of the actual polygon is done in two different
2211     * ways depending on chip generation:
2212     *
2213     * < R300:
2214     *
2215     *     These chips can render a rectangle in one pass, so
2216     *     handling is pretty straight-forward.
2217     *
2218     * >= R300:
2219     *
2220     *     These chips can accept a quad, but will render it as
2221     *     two triangles which results in a diagonal tear. Instead
2222     *     We render a single, large triangle and use the scissor
2223     *     functionality to restrict it to the desired rectangle.
2224     *     Due to guardband limits on r3xx/r4xx, we can only use
2225     *     the single triangle up to 2560/4021 pixels; above that we
2226     *     render as a quad.
2227     */
2228
2229    while (nBox--) {
2230	float srcX, srcY, srcw, srch;
2231	int dstX, dstY, dstw, dsth;
2232	Bool use_quad = FALSE;
2233	int draw_size = 4 * pPriv->vtx_count + 4 + 2 + 3;
2234
2235	if (draw_size > radeon_cs_space_remaining(pScrn)) {
2236	    radeon_cs_flush_indirect(pScrn);
2237	    if (!R300PrepareTexturedVideo(pScrn, pPriv))
2238		return;
2239	}
2240
2241	dstX = pBox->x1 + dstxoff;
2242	dstY = pBox->y1 + dstyoff;
2243	dstw = pBox->x2 - pBox->x1;
2244	dsth = pBox->y2 - pBox->y1;
2245
2246	srcX = pPriv->src_x;
2247	srcX += ((pBox->x1 - pPriv->drw_x) *
2248		 pPriv->src_w) / (float)pPriv->dst_w;
2249	srcY = pPriv->src_y;
2250	srcY += ((pBox->y1 - pPriv->drw_y) *
2251		 pPriv->src_h) / (float)pPriv->dst_h;
2252
2253	srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
2254	srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
2255
2256	if (IS_R400_3D) {
2257	    if ((dstw+dsth) > 4021)
2258		use_quad = TRUE;
2259	} else {
2260	    if ((dstw+dsth) > 2560)
2261		use_quad = TRUE;
2262	}
2263	/*
2264	 * Set up the scissor area to that of the output size.
2265	 */
2266	BEGIN_RING(2*2);
2267	/* R300 has an offset */
2268	OUT_RING_REG(R300_SC_SCISSOR0, (((dstX + 1440) << R300_SCISSOR_X_SHIFT) |
2269					 ((dstY + 1440) << R300_SCISSOR_Y_SHIFT)));
2270	OUT_RING_REG(R300_SC_SCISSOR1, (((dstX + dstw + 1440 - 1) << R300_SCISSOR_X_SHIFT) |
2271					 ((dstY + dsth + 1440 - 1) << R300_SCISSOR_Y_SHIFT)));
2272	ADVANCE_RING();
2273
2274	if (use_quad) {
2275	    BEGIN_RING(4 * pPriv->vtx_count + 4);
2276	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
2277				4 * pPriv->vtx_count));
2278	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_QUAD_LIST |
2279		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2280		     (4 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2281	} else {
2282	    BEGIN_RING(3 * pPriv->vtx_count + 4);
2283	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
2284				3 * pPriv->vtx_count));
2285	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
2286		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2287		     (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2288	}
2289
2290	if (pPriv->bicubic_enabled) {
2291		/*
2292		 * This code is only executed on >= R300, so we don't
2293		 * have to deal with the legacy handling.
2294		 */
2295	    if (use_quad) {
2296		VTX_OUT_6((float)dstX,                     (float)dstY,
2297			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h,
2298			  (float)srcX + 0.5,               (float)srcY + 0.5);
2299		VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
2300			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
2301			  (float)srcX + 0.5,               (float)(srcY + srch) + 0.5);
2302		VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
2303			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
2304			  (float)(srcX + srcw) + 0.5,      (float)(srcY + srch) + 0.5);
2305		VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
2306			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
2307			  (float)(srcX + srcw) + 0.5,      (float)srcY + 0.5);
2308	    } else {
2309		VTX_OUT_6((float)dstX,                     (float)dstY,
2310			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h,
2311			  (float)srcX + 0.5,               (float)srcY + 0.5);
2312		VTX_OUT_6((float)dstX,                     (float)(dstY + dstw + dsth),
2313			  (float)srcX / pPriv->w,
2314			  ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h,
2315			  (float)srcX + 0.5,
2316			  (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
2317		VTX_OUT_6((float)(dstX + dstw + dsth),     (float)dstY,
2318			  ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
2319			  (float)srcY / pPriv->h,
2320			  (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
2321			  (float)srcY + 0.5);
2322	    }
2323	} else {
2324	    if (use_quad) {
2325		VTX_OUT_4((float)dstX,                     (float)dstY,
2326			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h);
2327		VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
2328			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
2329		VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
2330			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
2331		VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
2332			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
2333	    } else {
2334		/*
2335		 * Render a big, scissored triangle. This means
2336		 * increasing the triangle size and adjusting
2337		 * texture coordinates.
2338		 */
2339		VTX_OUT_4((float)dstX,                 (float)dstY,
2340			  (float)srcX / pPriv->w,      (float)srcY / pPriv->h);
2341		VTX_OUT_4((float)dstX,                 (float)(dstY + dsth + dstw),
2342			  (float)srcX / pPriv->w,
2343			  ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h);
2344		VTX_OUT_4((float)(dstX + dstw + dsth), (float)dstY,
2345			  ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
2346			  (float)srcY / pPriv->h);
2347	    }
2348	}
2349
2350	/* flushing is pipelined, free/finish is not */
2351	OUT_RING_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
2352
2353	ADVANCE_RING();
2354
2355	pBox++;
2356    }
2357
2358    BEGIN_RING(2*3);
2359    OUT_RING_REG(R300_SC_CLIP_RULE, 0xAAAA);
2360    OUT_RING_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
2361    OUT_RING_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
2362    ADVANCE_RING();
2363
2364    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
2365}
2366
2367static Bool
2368R500PrepareTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
2369{
2370    RADEONInfoPtr info = RADEONPTR(pScrn);
2371    PixmapPtr pPixmap = pPriv->pPixmap;
2372    struct radeon_exa_pixmap_priv *driver_priv;
2373    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
2374    uint32_t txfilter, txformat0, txformat1, txpitch, us_format = 0;
2375    uint32_t dst_pitch, dst_format;
2376    uint32_t txenable, colorpitch;
2377    uint32_t output_fmt;
2378    int pixel_shift, out_size = 6;
2379    int ret;
2380
2381    radeon_cs_space_reset_bos(info->cs);
2382    radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
2383
2384    if (pPriv->bicubic_enabled)
2385	radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo,
2386					  RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
2387
2388    driver_priv = exaGetPixmapDriverPrivate(pPixmap);
2389    radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo->bo.radeon, 0,
2390				      RADEON_GEM_DOMAIN_VRAM);
2391
2392    ret = radeon_cs_space_check(info->cs);
2393    if (ret) {
2394	ErrorF("Not enough RAM to hw accel xv operation\n");
2395	return FALSE;
2396    }
2397
2398    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
2399
2400    dst_pitch = exaGetPixmapPitch(pPixmap);
2401    RADEON_SWITCH_TO_3D();
2402
2403    if (pPriv->bicubic_enabled)
2404	pPriv->vtx_count = 6;
2405    else
2406	pPriv->vtx_count = 4;
2407
2408    switch (pPixmap->drawable.bitsPerPixel) {
2409    case 16:
2410	if (pPixmap->drawable.depth == 15)
2411	    dst_format = R300_COLORFORMAT_ARGB1555;
2412	else
2413	    dst_format = R300_COLORFORMAT_RGB565;
2414	break;
2415    case 32:
2416	dst_format = R300_COLORFORMAT_ARGB8888;
2417	break;
2418    default:
2419	return FALSE;
2420    }
2421
2422    output_fmt = (R300_OUT_FMT_C4_8 |
2423		  R300_OUT_FMT_C0_SEL_BLUE |
2424		  R300_OUT_FMT_C1_SEL_GREEN |
2425		  R300_OUT_FMT_C2_SEL_RED |
2426		  R300_OUT_FMT_C3_SEL_ALPHA);
2427
2428    colorpitch = dst_pitch >> pixel_shift;
2429    colorpitch |= dst_format;
2430
2431    if (RADEONTilingEnabled(pScrn, pPixmap))
2432	colorpitch |= R300_COLORTILE;
2433
2434    if (((pPriv->bicubic_state == BICUBIC_OFF)) &&
2435        (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12))
2436	pPriv->is_planar = TRUE;
2437    else
2438	pPriv->is_planar = FALSE;
2439
2440    if (pPriv->is_planar) {
2441	txformat1 = R300_TX_FORMAT_X8;
2442	txpitch = pPriv->src_pitch;
2443    } else {
2444	if (pPriv->id == FOURCC_UYVY)
2445	    txformat1 = R300_TX_FORMAT_YVYU422;
2446	else
2447	    txformat1 = R300_TX_FORMAT_VYUY422;
2448
2449	if (pPriv->bicubic_state != BICUBIC_OFF)
2450	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
2451
2452	/* pitch is in pixels */
2453	txpitch = pPriv->src_pitch / 2;
2454    }
2455    txpitch -= 1;
2456
2457    txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
2458		 (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
2459		 R300_TXPITCH_EN);
2460
2461    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
2462		R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
2463		R300_TX_MAG_FILTER_LINEAR |
2464		R300_TX_MIN_FILTER_LINEAR |
2465		(0 << R300_TX_ID_SHIFT));
2466
2467
2468    if ((pPriv->w - 1) & 0x800)
2469	txpitch |= R500_TXWIDTH_11;
2470
2471    if ((pPriv->h - 1) & 0x800)
2472	txpitch |= R500_TXHEIGHT_11;
2473
2474    if (info->ChipFamily == CHIP_FAMILY_R520) {
2475	unsigned us_width = (pPriv->w - 1) & 0x7ff;
2476	unsigned us_height = (pPriv->h - 1) & 0x7ff;
2477	unsigned us_depth = 0;
2478
2479	if (pPriv->w > 2048) {
2480	    us_width = (0x7ff + us_width) >> 1;
2481	    us_depth |= 0x0d;
2482	}
2483	if (pPriv->h > 2048) {
2484	    us_height = (0x7ff + us_height) >> 1;
2485	    us_depth |= 0x0e;
2486	}
2487	us_format = (us_width << R300_TXWIDTH_SHIFT) |
2488		    (us_height << R300_TXHEIGHT_SHIFT) |
2489		    (us_depth << R300_TXDEPTH_SHIFT);
2490	out_size++;
2491    }
2492
2493    BEGIN_ACCEL_RELOC(out_size, 1);
2494    OUT_RING_REG(R300_TX_FILTER0_0, txfilter);
2495    OUT_RING_REG(R300_TX_FILTER1_0, 0);
2496    OUT_RING_REG(R300_TX_FORMAT0_0, txformat0);
2497    OUT_RING_REG(R300_TX_FORMAT1_0, txformat1);
2498    OUT_RING_REG(R300_TX_FORMAT2_0, txpitch);
2499    OUT_TEXTURE_REG(R300_TX_OFFSET_0, 0, src_bo);
2500    if (info->ChipFamily == CHIP_FAMILY_R520)
2501	OUT_RING_REG(R500_US_FORMAT0_0, us_format);
2502    ADVANCE_RING();
2503
2504    txenable = R300_TEX_0_ENABLE;
2505
2506    if (pPriv->is_planar) {
2507	txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
2508		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
2509		     R300_TXPITCH_EN);
2510	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
2511	txpitch -= 1;
2512	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
2513		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
2514		    R300_TX_MIN_FILTER_LINEAR |
2515		    R300_TX_MAG_FILTER_LINEAR);
2516
2517	BEGIN_ACCEL_RELOC(12, 2);
2518	OUT_RING_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
2519	OUT_RING_REG(R300_TX_FILTER1_1, 0);
2520	OUT_RING_REG(R300_TX_FORMAT0_1, txformat0);
2521	OUT_RING_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8);
2522	OUT_RING_REG(R300_TX_FORMAT2_1, txpitch);
2523	OUT_TEXTURE_REG(R300_TX_OFFSET_1, pPriv->planeu_offset, src_bo);
2524	OUT_RING_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
2525	OUT_RING_REG(R300_TX_FILTER1_2, 0);
2526	OUT_RING_REG(R300_TX_FORMAT0_2, txformat0);
2527	OUT_RING_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8);
2528	OUT_RING_REG(R300_TX_FORMAT2_2, txpitch);
2529	OUT_TEXTURE_REG(R300_TX_OFFSET_2, pPriv->planev_offset, src_bo);
2530	ADVANCE_RING();
2531	txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
2532    }
2533
2534    if (pPriv->bicubic_enabled) {
2535	/* Size is 128x1 */
2536	txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
2537		     (0x0 << R300_TXHEIGHT_SHIFT) |
2538		     R300_TXPITCH_EN);
2539	/* Format is 32-bit floats, 4bpp */
2540	txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
2541	/* Pitch is 127 (128-1) */
2542	txpitch = 0x7f;
2543	/* Tex filter */
2544	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
2545		    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
2546		    R300_TX_MIN_FILTER_NEAREST |
2547		    R300_TX_MAG_FILTER_NEAREST |
2548		    (1 << R300_TX_ID_SHIFT));
2549
2550	BEGIN_ACCEL_RELOC(6, 1);
2551	OUT_RING_REG(R300_TX_FILTER0_1, txfilter);
2552	OUT_RING_REG(R300_TX_FILTER1_1, 0);
2553	OUT_RING_REG(R300_TX_FORMAT0_1, txformat0);
2554	OUT_RING_REG(R300_TX_FORMAT1_1, txformat1);
2555	OUT_RING_REG(R300_TX_FORMAT2_1, txpitch);
2556	OUT_TEXTURE_REG(R300_TX_OFFSET_1, 0, info->bicubic_bo);
2557	ADVANCE_RING();
2558
2559	/* Enable tex 1 */
2560	txenable |= R300_TEX_1_ENABLE;
2561    }
2562
2563    /* setup the VAP */
2564    if (info->accel_state->has_tcl) {
2565	if (pPriv->bicubic_enabled)
2566	    BEGIN_RING(2*7);
2567	else
2568	    BEGIN_RING(2*6);
2569    } else {
2570	if (pPriv->bicubic_enabled)
2571	    BEGIN_RING(2*5);
2572	else
2573	    BEGIN_RING(2*4);
2574    }
2575
2576    /* These registers define the number, type, and location of data submitted
2577     * to the PVS unit of GA input (when PVS is disabled)
2578     * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
2579     * enabled.  This memory provides the imputs to the vertex shader program
2580     * and ordering is not important.  When PVS/TCL is disabled, this field maps
2581     * directly to the GA input memory and the order is significant.  In
2582     * PVS_BYPASS mode the order is as follows:
2583     * Position
2584     * Point Size
2585     * Color 0-3
2586     * Textures 0-7
2587     * Fog
2588     */
2589    if (pPriv->bicubic_enabled) {
2590	OUT_RING_REG(R300_VAP_PROG_STREAM_CNTL_0,
2591		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
2592		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
2593		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
2594		       R300_SIGNED_0 |
2595		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
2596		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
2597		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
2598		       R300_SIGNED_1));
2599	OUT_RING_REG(R300_VAP_PROG_STREAM_CNTL_1,
2600		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
2601		       (0 << R300_SKIP_DWORDS_2_SHIFT) |
2602		       (7 << R300_DST_VEC_LOC_2_SHIFT) |
2603		       R300_LAST_VEC_2 |
2604		       R300_SIGNED_2));
2605    } else {
2606	OUT_RING_REG(R300_VAP_PROG_STREAM_CNTL_0,
2607		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
2608		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
2609		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
2610		       R300_SIGNED_0 |
2611		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
2612		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
2613		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
2614		       R300_LAST_VEC_1 |
2615		       R300_SIGNED_1));
2616    }
2617
2618    /* load the vertex shader
2619     * We pre-load vertex programs in RADEONInit3DEngine():
2620     * - exa
2621     * - Xv
2622     * - Xv bicubic
2623     * Here we select the offset of the vertex program we want to use
2624     */
2625    if (info->accel_state->has_tcl) {
2626	if (pPriv->bicubic_enabled) {
2627	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_0,
2628			  ((11 << R300_PVS_FIRST_INST_SHIFT) |
2629			   (13 << R300_PVS_XYZW_VALID_INST_SHIFT) |
2630			   (13 << R300_PVS_LAST_INST_SHIFT)));
2631	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_1,
2632			  (13 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
2633	} else {
2634	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_0,
2635			  ((9 << R300_PVS_FIRST_INST_SHIFT) |
2636			   (10 << R300_PVS_XYZW_VALID_INST_SHIFT) |
2637			   (10 << R300_PVS_LAST_INST_SHIFT)));
2638	    OUT_RING_REG(R300_VAP_PVS_CODE_CNTL_1,
2639			  (10 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
2640	}
2641    }
2642
2643    /* Position and one set of 2 texture coordinates */
2644    OUT_RING_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
2645    if (pPriv->bicubic_enabled)
2646	OUT_RING_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
2647					       (2 << R300_TEX_1_COMP_CNT_SHIFT)));
2648    else
2649	OUT_RING_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
2650
2651    OUT_RING_REG(R300_US_OUT_FMT_0, output_fmt);
2652    ADVANCE_RING();
2653
2654    /* setup pixel shader */
2655    if (pPriv->bicubic_state != BICUBIC_OFF) {
2656	if (pPriv->bicubic_enabled) {
2657	    BEGIN_RING(2*7);
2658
2659	    /* 4 components: 2 for tex0 and 2 for tex1 */
2660	    OUT_RING_REG(R300_RS_COUNT,
2661			  ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
2662			   R300_RS_COUNT_HIRES_EN));
2663
2664	    /* R300_INST_COUNT_RS - highest RS instruction used */
2665	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1));
2666
2667	    /* Pixel stack frame size. */
2668	    OUT_RING_REG(R300_US_PIXSIZE, 5);
2669
2670	    /* FP length. */
2671	    OUT_RING_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
2672					      R500_US_CODE_END_ADDR(13)));
2673	    OUT_RING_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
2674					       R500_US_CODE_RANGE_SIZE(13)));
2675
2676	    /* Prepare for FP emission. */
2677	    OUT_RING_REG(R500_US_CODE_OFFSET, 0);
2678	    OUT_RING_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
2679	    ADVANCE_RING();
2680
2681	    BEGIN_RING(2*89);
2682	    /* Pixel shader.
2683	     * I've gone ahead and annotated each instruction, since this
2684	     * thing is MASSIVE. :3
2685	     * Note: In order to avoid buggies with temps and multiple
2686	     * inputs, all temps are offset by 2. temp0 -> register2. */
2687
2688	    /* TEX temp2, input1.xxxx, tex1, 1D */
2689	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2690						   R500_INST_RGB_WMASK_R |
2691						   R500_INST_RGB_WMASK_G |
2692						   R500_INST_RGB_WMASK_B));
2693	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
2694						   R500_TEX_INST_LD |
2695						   R500_TEX_IGNORE_UNCOVERED));
2696	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
2697						   R500_TEX_SRC_S_SWIZ_R |
2698						   R500_TEX_SRC_T_SWIZ_R |
2699						   R500_TEX_SRC_R_SWIZ_R |
2700						   R500_TEX_SRC_Q_SWIZ_R |
2701						   R500_TEX_DST_ADDR(2) |
2702						   R500_TEX_DST_R_SWIZ_R |
2703						   R500_TEX_DST_G_SWIZ_G |
2704						   R500_TEX_DST_B_SWIZ_B |
2705						   R500_TEX_DST_A_SWIZ_A));
2706	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2707	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2708	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2709
2710	    /* TEX temp5, input1.yyyy, tex1, 1D */
2711	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2712						   R500_INST_TEX_SEM_WAIT |
2713						   R500_INST_RGB_WMASK_R |
2714						   R500_INST_RGB_WMASK_G |
2715						   R500_INST_RGB_WMASK_B));
2716	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
2717						   R500_TEX_INST_LD |
2718						   R500_TEX_SEM_ACQUIRE |
2719						   R500_TEX_IGNORE_UNCOVERED));
2720	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
2721						   R500_TEX_SRC_S_SWIZ_G |
2722						   R500_TEX_SRC_T_SWIZ_G |
2723						   R500_TEX_SRC_R_SWIZ_G |
2724						   R500_TEX_SRC_Q_SWIZ_G |
2725						   R500_TEX_DST_ADDR(5) |
2726						   R500_TEX_DST_R_SWIZ_R |
2727						   R500_TEX_DST_G_SWIZ_G |
2728						   R500_TEX_DST_B_SWIZ_B |
2729						   R500_TEX_DST_A_SWIZ_A));
2730	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2731	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2732	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2733
2734	    /* MUL temp4, const0.x0x0, temp2.yyxx */
2735	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2736						   R500_INST_TEX_SEM_WAIT |
2737						   R500_INST_RGB_WMASK_R |
2738						   R500_INST_RGB_WMASK_G |
2739						   R500_INST_RGB_WMASK_B |
2740						   R500_INST_ALPHA_WMASK));
2741	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
2742						   R500_RGB_ADDR0_CONST |
2743						   R500_RGB_ADDR1(2)));
2744	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
2745						   R500_ALPHA_ADDR0_CONST |
2746						   R500_ALPHA_ADDR1(2)));
2747	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
2748						   R500_ALU_RGB_R_SWIZ_A_R |
2749						   R500_ALU_RGB_G_SWIZ_A_0 |
2750						   R500_ALU_RGB_B_SWIZ_A_R |
2751						   R500_ALU_RGB_SEL_B_SRC1 |
2752						   R500_ALU_RGB_R_SWIZ_B_G |
2753						   R500_ALU_RGB_G_SWIZ_B_G |
2754						   R500_ALU_RGB_B_SWIZ_B_R));
2755	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
2756						   R500_ALPHA_OP_MAD |
2757						   R500_ALPHA_SEL_A_SRC0 |
2758						   R500_ALPHA_SWIZ_A_0 |
2759						   R500_ALPHA_SEL_B_SRC1 |
2760						   R500_ALPHA_SWIZ_B_R));
2761	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
2762						   R500_ALU_RGBA_OP_MAD |
2763						   R500_ALU_RGBA_R_SWIZ_0 |
2764						   R500_ALU_RGBA_G_SWIZ_0 |
2765						   R500_ALU_RGBA_B_SWIZ_0 |
2766						   R500_ALU_RGBA_A_SWIZ_0));
2767
2768	    /* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */
2769	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2770						   R500_INST_RGB_WMASK_R |
2771						   R500_INST_RGB_WMASK_G |
2772						   R500_INST_RGB_WMASK_B |
2773						   R500_INST_ALPHA_WMASK));
2774	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
2775						   R500_RGB_ADDR0_CONST |
2776						   R500_RGB_ADDR1(5) |
2777						   R500_RGB_ADDR2(4)));
2778	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
2779						   R500_ALPHA_ADDR0_CONST |
2780						   R500_ALPHA_ADDR1(5) |
2781						   R500_ALPHA_ADDR2(4)));
2782	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
2783						   R500_ALU_RGB_R_SWIZ_A_0 |
2784						   R500_ALU_RGB_G_SWIZ_A_G |
2785						   R500_ALU_RGB_B_SWIZ_A_0 |
2786						   R500_ALU_RGB_SEL_B_SRC1 |
2787						   R500_ALU_RGB_R_SWIZ_B_R |
2788						   R500_ALU_RGB_G_SWIZ_B_R |
2789						   R500_ALU_RGB_B_SWIZ_B_R));
2790	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
2791						   R500_ALPHA_OP_MAD |
2792						   R500_ALPHA_SEL_A_SRC0 |
2793						   R500_ALPHA_SWIZ_A_G |
2794						   R500_ALPHA_SEL_B_SRC1 |
2795						   R500_ALPHA_SWIZ_B_R));
2796	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
2797						   R500_ALU_RGBA_OP_MAD |
2798						   R500_ALU_RGBA_SEL_C_SRC2 |
2799						   R500_ALU_RGBA_R_SWIZ_R |
2800						   R500_ALU_RGBA_G_SWIZ_G |
2801						   R500_ALU_RGBA_B_SWIZ_B |
2802						   R500_ALU_RGBA_A_SWIZ_A));
2803
2804	    /* ADD temp3, temp3, input0.xyxy */
2805	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2806						   R500_INST_RGB_WMASK_R |
2807						   R500_INST_RGB_WMASK_G |
2808						   R500_INST_RGB_WMASK_B |
2809						   R500_INST_ALPHA_WMASK));
2810	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) |
2811						   R500_RGB_ADDR2(0)));
2812	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) |
2813						   R500_ALPHA_ADDR2(0)));
2814	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
2815						   R500_ALU_RGB_G_SWIZ_A_1 |
2816						   R500_ALU_RGB_B_SWIZ_A_1 |
2817						   R500_ALU_RGB_SEL_B_SRC1 |
2818						   R500_ALU_RGB_R_SWIZ_B_R |
2819						   R500_ALU_RGB_G_SWIZ_B_G |
2820						   R500_ALU_RGB_B_SWIZ_B_B));
2821	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
2822						   R500_ALPHA_OP_MAD |
2823						   R500_ALPHA_SWIZ_A_1 |
2824						   R500_ALPHA_SEL_B_SRC1 |
2825						   R500_ALPHA_SWIZ_B_A));
2826	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
2827						   R500_ALU_RGBA_OP_MAD |
2828						   R500_ALU_RGBA_SEL_C_SRC2 |
2829						   R500_ALU_RGBA_R_SWIZ_R |
2830						   R500_ALU_RGBA_G_SWIZ_G |
2831						   R500_ALU_RGBA_B_SWIZ_R |
2832						   R500_ALU_RGBA_A_SWIZ_G));
2833
2834	    /* TEX temp1, temp3.zwxy, tex0, 2D */
2835	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2836						   R500_INST_RGB_WMASK_R |
2837						   R500_INST_RGB_WMASK_G |
2838						   R500_INST_RGB_WMASK_B |
2839						   R500_INST_ALPHA_WMASK));
2840	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
2841						   R500_TEX_INST_LD |
2842						   R500_TEX_IGNORE_UNCOVERED));
2843	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
2844						   R500_TEX_SRC_S_SWIZ_B |
2845						   R500_TEX_SRC_T_SWIZ_A |
2846						   R500_TEX_SRC_R_SWIZ_R |
2847						   R500_TEX_SRC_Q_SWIZ_G |
2848						   R500_TEX_DST_ADDR(1) |
2849						   R500_TEX_DST_R_SWIZ_R |
2850						   R500_TEX_DST_G_SWIZ_G |
2851						   R500_TEX_DST_B_SWIZ_B |
2852						   R500_TEX_DST_A_SWIZ_A));
2853	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2854	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2855	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2856
2857	    /* TEX temp3, temp3.xyzw, tex0, 2D */
2858	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2859						   R500_INST_TEX_SEM_WAIT |
2860						   R500_INST_RGB_WMASK_R |
2861						   R500_INST_RGB_WMASK_G |
2862						   R500_INST_RGB_WMASK_B |
2863						   R500_INST_ALPHA_WMASK));
2864	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
2865						   R500_TEX_INST_LD |
2866						   R500_TEX_SEM_ACQUIRE |
2867						   R500_TEX_IGNORE_UNCOVERED));
2868	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
2869						   R500_TEX_SRC_S_SWIZ_R |
2870						   R500_TEX_SRC_T_SWIZ_G |
2871						   R500_TEX_SRC_R_SWIZ_B |
2872						   R500_TEX_SRC_Q_SWIZ_A |
2873						   R500_TEX_DST_ADDR(3) |
2874						   R500_TEX_DST_R_SWIZ_R |
2875						   R500_TEX_DST_G_SWIZ_G |
2876						   R500_TEX_DST_B_SWIZ_B |
2877						   R500_TEX_DST_A_SWIZ_A));
2878	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2879	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2880	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2881
2882	    /* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */
2883	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2884						   R500_INST_RGB_WMASK_R |
2885						   R500_INST_RGB_WMASK_G |
2886						   R500_INST_RGB_WMASK_B |
2887						   R500_INST_ALPHA_WMASK));
2888	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
2889						   R500_RGB_ADDR0_CONST |
2890						   R500_RGB_ADDR1(5) |
2891						   R500_RGB_ADDR2(4)));
2892	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
2893						   R500_ALPHA_ADDR0_CONST |
2894						   R500_ALPHA_ADDR1(5) |
2895						   R500_ALPHA_ADDR2(4)));
2896	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
2897						   R500_ALU_RGB_R_SWIZ_A_0 |
2898						   R500_ALU_RGB_G_SWIZ_A_G |
2899						   R500_ALU_RGB_B_SWIZ_A_0 |
2900						   R500_ALU_RGB_SEL_B_SRC1 |
2901						   R500_ALU_RGB_R_SWIZ_B_G |
2902						   R500_ALU_RGB_G_SWIZ_B_G |
2903						   R500_ALU_RGB_B_SWIZ_B_G));
2904	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
2905						   R500_ALPHA_OP_MAD |
2906						   R500_ALPHA_SEL_A_SRC0 |
2907						   R500_ALPHA_SWIZ_A_G |
2908						   R500_ALPHA_SEL_B_SRC1 |
2909						   R500_ALPHA_SWIZ_B_G));
2910	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
2911						   R500_ALU_RGBA_OP_MAD |
2912						   R500_ALU_RGBA_SEL_C_SRC2 |
2913						   R500_ALU_RGBA_R_SWIZ_R |
2914						   R500_ALU_RGBA_G_SWIZ_G |
2915						   R500_ALU_RGBA_B_SWIZ_B |
2916						   R500_ALU_RGBA_A_SWIZ_A));
2917
2918	    /* ADD temp0, temp4, input0.xyxy */
2919	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2920						   R500_INST_RGB_WMASK_R |
2921						   R500_INST_RGB_WMASK_G |
2922						   R500_INST_RGB_WMASK_B |
2923						   R500_INST_ALPHA_WMASK));
2924	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) |
2925						   R500_RGB_ADDR2(0)));
2926	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) |
2927						   R500_ALPHA_ADDR2(0)));
2928	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
2929						   R500_ALU_RGB_G_SWIZ_A_1 |
2930						   R500_ALU_RGB_B_SWIZ_A_1 |
2931						   R500_ALU_RGB_SEL_B_SRC1 |
2932						   R500_ALU_RGB_R_SWIZ_B_R |
2933						   R500_ALU_RGB_G_SWIZ_B_G |
2934						   R500_ALU_RGB_B_SWIZ_B_B));
2935	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
2936						   R500_ALPHA_OP_MAD |
2937						   R500_ALPHA_SWIZ_A_1 |
2938						   R500_ALPHA_SEL_B_SRC1 |
2939						   R500_ALPHA_SWIZ_B_A));
2940	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
2941						   R500_ALU_RGBA_OP_MAD |
2942						   R500_ALU_RGBA_SEL_C_SRC2 |
2943						   R500_ALU_RGBA_R_SWIZ_R |
2944						   R500_ALU_RGBA_G_SWIZ_G |
2945						   R500_ALU_RGBA_B_SWIZ_R |
2946						   R500_ALU_RGBA_A_SWIZ_G));
2947
2948	    /* TEX temp4, temp0.zwzw, tex0, 2D */
2949	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2950						   R500_INST_TEX_SEM_WAIT |
2951						   R500_INST_RGB_WMASK_R |
2952						   R500_INST_RGB_WMASK_G |
2953						   R500_INST_RGB_WMASK_B |
2954						   R500_INST_ALPHA_WMASK));
2955	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
2956						   R500_TEX_INST_LD |
2957						   R500_TEX_IGNORE_UNCOVERED));
2958	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
2959						   R500_TEX_SRC_S_SWIZ_B |
2960						   R500_TEX_SRC_T_SWIZ_A |
2961						   R500_TEX_SRC_R_SWIZ_B |
2962						   R500_TEX_SRC_Q_SWIZ_A |
2963						   R500_TEX_DST_ADDR(4) |
2964						   R500_TEX_DST_R_SWIZ_R |
2965						   R500_TEX_DST_G_SWIZ_G |
2966						   R500_TEX_DST_B_SWIZ_B |
2967						   R500_TEX_DST_A_SWIZ_A));
2968	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2969	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2970	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2971
2972	    /* TEX temp0, temp0.xyzw, tex0, 2D */
2973	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2974						   R500_INST_TEX_SEM_WAIT |
2975						   R500_INST_RGB_WMASK_R |
2976						   R500_INST_RGB_WMASK_G |
2977						   R500_INST_RGB_WMASK_B |
2978						   R500_INST_ALPHA_WMASK));
2979	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
2980						   R500_TEX_INST_LD |
2981						   R500_TEX_SEM_ACQUIRE |
2982						   R500_TEX_IGNORE_UNCOVERED));
2983	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
2984						   R500_TEX_SRC_S_SWIZ_R |
2985						   R500_TEX_SRC_T_SWIZ_G |
2986						   R500_TEX_SRC_R_SWIZ_B |
2987						   R500_TEX_SRC_Q_SWIZ_A |
2988						   R500_TEX_DST_ADDR(0) |
2989						   R500_TEX_DST_R_SWIZ_R |
2990						   R500_TEX_DST_G_SWIZ_G |
2991						   R500_TEX_DST_B_SWIZ_B |
2992						   R500_TEX_DST_A_SWIZ_A));
2993	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2994	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2995	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2996
2997	    /* LRP temp3, temp2.zzzz, temp1, temp3 ->
2998	     * - PRESUB temps, temp1 - temp3
2999	     * - MAD temp2.zzzz, temps, temp3 */
3000	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3001						   R500_INST_RGB_WMASK_R |
3002						   R500_INST_RGB_WMASK_G |
3003						   R500_INST_RGB_WMASK_B |
3004						   R500_INST_ALPHA_WMASK));
3005	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) |
3006						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3007						   R500_RGB_ADDR1(1) |
3008						   R500_RGB_ADDR2(2)));
3009	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) |
3010						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3011						   R500_ALPHA_ADDR1(1) |
3012						   R500_ALPHA_ADDR2(2)));
3013	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3014						   R500_ALU_RGB_R_SWIZ_A_B |
3015						   R500_ALU_RGB_G_SWIZ_A_B |
3016						   R500_ALU_RGB_B_SWIZ_A_B |
3017						   R500_ALU_RGB_SEL_B_SRCP |
3018						   R500_ALU_RGB_R_SWIZ_B_R |
3019						   R500_ALU_RGB_G_SWIZ_B_G |
3020						   R500_ALU_RGB_B_SWIZ_B_B));
3021	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
3022						   R500_ALPHA_OP_MAD |
3023						   R500_ALPHA_SEL_A_SRC2 |
3024						   R500_ALPHA_SWIZ_A_B |
3025						   R500_ALPHA_SEL_B_SRCP |
3026						   R500_ALPHA_SWIZ_B_A));
3027	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
3028						   R500_ALU_RGBA_OP_MAD |
3029						   R500_ALU_RGBA_SEL_C_SRC0 |
3030						   R500_ALU_RGBA_R_SWIZ_R |
3031						   R500_ALU_RGBA_G_SWIZ_G |
3032						   R500_ALU_RGBA_B_SWIZ_B |
3033						   R500_ALU_RGBA_A_SWIZ_A));
3034
3035	    /* LRP temp0, temp2.zzzz, temp4, temp0 ->
3036	     * - PRESUB temps, temp4 - temp1
3037	     * - MAD temp2.zzzz, temps, temp0 */
3038	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3039						   R500_INST_TEX_SEM_WAIT |
3040						   R500_INST_RGB_WMASK_R |
3041						   R500_INST_RGB_WMASK_G |
3042						   R500_INST_RGB_WMASK_B |
3043						   R500_INST_ALPHA_WMASK));
3044	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3045						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3046						   R500_RGB_ADDR1(4) |
3047						   R500_RGB_ADDR2(2)));
3048	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3049						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3050						   R500_ALPHA_ADDR1(4) |
3051						   R500_ALPHA_ADDR2(2)));
3052	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3053						   R500_ALU_RGB_R_SWIZ_A_B |
3054						   R500_ALU_RGB_G_SWIZ_A_B |
3055						   R500_ALU_RGB_B_SWIZ_A_B |
3056						   R500_ALU_RGB_SEL_B_SRCP |
3057						   R500_ALU_RGB_R_SWIZ_B_R |
3058						   R500_ALU_RGB_G_SWIZ_B_G |
3059						   R500_ALU_RGB_B_SWIZ_B_B));
3060	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
3061						   R500_ALPHA_OP_MAD |
3062						   R500_ALPHA_SEL_A_SRC2 |
3063						   R500_ALPHA_SWIZ_A_B |
3064						   R500_ALPHA_SEL_B_SRCP |
3065						   R500_ALPHA_SWIZ_B_A));
3066	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
3067						   R500_ALU_RGBA_OP_MAD |
3068						   R500_ALU_RGBA_SEL_C_SRC0 |
3069						   R500_ALU_RGBA_R_SWIZ_R |
3070						   R500_ALU_RGBA_G_SWIZ_G |
3071						   R500_ALU_RGBA_B_SWIZ_B |
3072						   R500_ALU_RGBA_A_SWIZ_A));
3073
3074	    /* LRP output, temp5.zzzz, temp3, temp0 ->
3075	     * - PRESUB temps, temp3 - temp0
3076	     * - MAD temp5.zzzz, temps, temp0 */
3077	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3078						   R500_INST_LAST |
3079						   R500_INST_TEX_SEM_WAIT |
3080						   R500_INST_RGB_WMASK_R |
3081						   R500_INST_RGB_WMASK_G |
3082						   R500_INST_RGB_WMASK_B |
3083						   R500_INST_ALPHA_WMASK |
3084						   R500_INST_RGB_OMASK_R |
3085						   R500_INST_RGB_OMASK_G |
3086						   R500_INST_RGB_OMASK_B |
3087						   R500_INST_ALPHA_OMASK));
3088	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3089						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3090						   R500_RGB_ADDR1(3) |
3091						   R500_RGB_ADDR2(5)));
3092	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3093						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3094						   R500_ALPHA_ADDR1(3) |
3095						   R500_ALPHA_ADDR2(5)));
3096	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3097						   R500_ALU_RGB_R_SWIZ_A_B |
3098						   R500_ALU_RGB_G_SWIZ_A_B |
3099						   R500_ALU_RGB_B_SWIZ_A_B |
3100						   R500_ALU_RGB_SEL_B_SRCP |
3101						   R500_ALU_RGB_R_SWIZ_B_R |
3102						   R500_ALU_RGB_G_SWIZ_B_G |
3103						   R500_ALU_RGB_B_SWIZ_B_B));
3104	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
3105						   R500_ALPHA_OP_MAD |
3106						   R500_ALPHA_SEL_A_SRC2 |
3107						   R500_ALPHA_SWIZ_A_B |
3108						   R500_ALPHA_SEL_B_SRCP |
3109						   R500_ALPHA_SWIZ_B_A));
3110	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
3111						   R500_ALU_RGBA_OP_MAD |
3112						   R500_ALU_RGBA_SEL_C_SRC0 |
3113						   R500_ALU_RGBA_R_SWIZ_R |
3114						   R500_ALU_RGBA_G_SWIZ_G |
3115						   R500_ALU_RGBA_B_SWIZ_B |
3116						   R500_ALU_RGBA_A_SWIZ_A));
3117
3118	    /* Shader constants. */
3119	    OUT_RING_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
3120
3121	    /* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */
3122	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w));
3123	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h));
3124	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
3125	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
3126
3127	    ADVANCE_RING();
3128	} else {
3129	    BEGIN_RING(2*19);
3130	    /* 2 components: 2 for tex0 */
3131	    OUT_RING_REG(R300_RS_COUNT,
3132			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3133			   R300_RS_COUNT_HIRES_EN));
3134
3135	    /* R300_INST_COUNT_RS - highest RS instruction used */
3136	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3137
3138	    /* Pixel stack frame size. */
3139	    OUT_RING_REG(R300_US_PIXSIZE, 0); /* highest temp used */
3140
3141	    /* FP length. */
3142	    OUT_RING_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3143					      R500_US_CODE_END_ADDR(1)));
3144	    OUT_RING_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3145					       R500_US_CODE_RANGE_SIZE(1)));
3146
3147	    /* Prepare for FP emission. */
3148	    OUT_RING_REG(R500_US_CODE_OFFSET, 0);
3149	    OUT_RING_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3150
3151	    /* tex inst */
3152	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3153						   R500_INST_TEX_SEM_WAIT |
3154						   R500_INST_RGB_WMASK_R |
3155						   R500_INST_RGB_WMASK_G |
3156						   R500_INST_RGB_WMASK_B |
3157						   R500_INST_ALPHA_WMASK |
3158						   R500_INST_RGB_CLAMP |
3159						   R500_INST_ALPHA_CLAMP));
3160	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3161						   R500_TEX_INST_LD |
3162						   R500_TEX_SEM_ACQUIRE |
3163						   R500_TEX_IGNORE_UNCOVERED));
3164	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3165						   R500_TEX_SRC_S_SWIZ_R |
3166						   R500_TEX_SRC_T_SWIZ_G |
3167						   R500_TEX_DST_ADDR(0) |
3168						   R500_TEX_DST_R_SWIZ_R |
3169						   R500_TEX_DST_G_SWIZ_G |
3170						   R500_TEX_DST_B_SWIZ_B |
3171						   R500_TEX_DST_A_SWIZ_A));
3172	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3173						   R500_DX_S_SWIZ_R |
3174						   R500_DX_T_SWIZ_R |
3175						   R500_DX_R_SWIZ_R |
3176						   R500_DX_Q_SWIZ_R |
3177						   R500_DY_ADDR(0) |
3178						   R500_DY_S_SWIZ_R |
3179						   R500_DY_T_SWIZ_R |
3180						   R500_DY_R_SWIZ_R |
3181						   R500_DY_Q_SWIZ_R));
3182	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3183	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3184
3185	    /* ALU inst */
3186	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3187						   R500_INST_TEX_SEM_WAIT |
3188						   R500_INST_LAST |
3189						   R500_INST_RGB_OMASK_R |
3190						   R500_INST_RGB_OMASK_G |
3191						   R500_INST_RGB_OMASK_B |
3192						   R500_INST_ALPHA_OMASK |
3193						   R500_INST_RGB_CLAMP |
3194						   R500_INST_ALPHA_CLAMP));
3195	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3196						   R500_RGB_ADDR1(0) |
3197						   R500_RGB_ADDR1_CONST |
3198						   R500_RGB_ADDR2(0) |
3199						   R500_RGB_ADDR2_CONST));
3200	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3201						   R500_ALPHA_ADDR1(0) |
3202						   R500_ALPHA_ADDR1_CONST |
3203						   R500_ALPHA_ADDR2(0) |
3204						   R500_ALPHA_ADDR2_CONST));
3205	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3206						   R500_ALU_RGB_R_SWIZ_A_R |
3207						   R500_ALU_RGB_G_SWIZ_A_G |
3208						   R500_ALU_RGB_B_SWIZ_A_B |
3209						   R500_ALU_RGB_SEL_B_SRC0 |
3210						   R500_ALU_RGB_R_SWIZ_B_1 |
3211						   R500_ALU_RGB_B_SWIZ_B_1 |
3212						   R500_ALU_RGB_G_SWIZ_B_1));
3213	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3214						   R500_ALPHA_SWIZ_A_A |
3215						   R500_ALPHA_SWIZ_B_1));
3216	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3217						   R500_ALU_RGBA_R_SWIZ_0 |
3218						   R500_ALU_RGBA_G_SWIZ_0 |
3219						   R500_ALU_RGBA_B_SWIZ_0 |
3220						   R500_ALU_RGBA_A_SWIZ_0));
3221	    ADVANCE_RING();
3222	}
3223    } else {
3224	/*
3225	 * y' = y - .0625
3226	 * u' = u - .5
3227	 * v' = v - .5;
3228	 *
3229	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
3230	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
3231	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
3232	 *
3233	 * DP3 might look like the straightforward solution
3234	 * but we'd need to move the texture yuv values in
3235	 * the same reg for this to work. Therefore use MADs.
3236	 * Brightness just adds to the off constant.
3237	 * Contrast is multiplication of luminance.
3238	 * Saturation and hue change the u and v coeffs.
3239	 * Default values (before adjustments - depend on colorspace):
3240	 * yco = 1.1643
3241	 * uco = 0, -0.39173, 2.017
3242	 * vco = 1.5958, -0.8129, 0
3243	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
3244	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
3245	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
3246	 *
3247	 * temp = MAD(yco, yuv.yyyy, off)
3248	 * temp = MAD(uco, yuv.uuuu, temp)
3249	 * result = MAD(vco, yuv.vvvv, temp)
3250	 */
3251	/* TODO: don't recalc consts always */
3252	const float Loff = -0.0627;
3253	const float Coff = -0.502;
3254	float uvcosf, uvsinf;
3255	float yco;
3256	float uco[3], vco[3], off[3];
3257	float bright, cont, gamma;
3258	int ref = pPriv->transform_index;
3259
3260	cont = RTFContrast(pPriv->contrast);
3261	bright = RTFBrightness(pPriv->brightness);
3262	gamma = (float)pPriv->gamma / 1000.0;
3263	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
3264	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
3265	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
3266
3267	yco = trans[ref].RefLuma * cont;
3268	uco[0] = -trans[ref].RefRCr * uvsinf;
3269	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
3270	uco[2] = trans[ref].RefBCb * uvcosf;
3271	vco[0] = trans[ref].RefRCr * uvcosf;
3272	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
3273	vco[2] = trans[ref].RefBCb * uvsinf;
3274	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
3275	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
3276	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
3277
3278	//XXX gamma
3279
3280	if (pPriv->is_planar) {
3281	    BEGIN_RING(2*56);
3282	    /* 2 components: 2 for tex0 */
3283	    OUT_RING_REG(R300_RS_COUNT,
3284			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3285			   R300_RS_COUNT_HIRES_EN));
3286
3287	    /* R300_INST_COUNT_RS - highest RS instruction used */
3288	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3289
3290	    /* Pixel stack frame size. */
3291	    OUT_RING_REG(R300_US_PIXSIZE, 2); /* highest temp used */
3292
3293	    /* FP length. */
3294	    OUT_RING_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3295					      R500_US_CODE_END_ADDR(5)));
3296	    OUT_RING_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3297					       R500_US_CODE_RANGE_SIZE(5)));
3298
3299	    /* Prepare for FP emission. */
3300	    OUT_RING_REG(R500_US_CODE_OFFSET, 0);
3301	    OUT_RING_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3302
3303	    /* tex inst */
3304	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3305						   R500_INST_TEX_SEM_WAIT |
3306						   R500_INST_RGB_WMASK_R |
3307						   R500_INST_RGB_WMASK_G |
3308						   R500_INST_RGB_WMASK_B |
3309						   R500_INST_ALPHA_WMASK |
3310						   R500_INST_RGB_CLAMP |
3311						   R500_INST_ALPHA_CLAMP));
3312	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3313						   R500_TEX_INST_LD |
3314						   R500_TEX_IGNORE_UNCOVERED));
3315	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3316						   R500_TEX_SRC_S_SWIZ_R |
3317						   R500_TEX_SRC_T_SWIZ_G |
3318						   R500_TEX_DST_ADDR(2) |
3319						   R500_TEX_DST_R_SWIZ_R |
3320						   R500_TEX_DST_G_SWIZ_G |
3321						   R500_TEX_DST_B_SWIZ_B |
3322						   R500_TEX_DST_A_SWIZ_A));
3323	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3324						   R500_DX_S_SWIZ_R |
3325						   R500_DX_T_SWIZ_R |
3326						   R500_DX_R_SWIZ_R |
3327						   R500_DX_Q_SWIZ_R |
3328						   R500_DY_ADDR(0) |
3329						   R500_DY_S_SWIZ_R |
3330						   R500_DY_T_SWIZ_R |
3331						   R500_DY_R_SWIZ_R |
3332						   R500_DY_Q_SWIZ_R));
3333	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3334	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3335
3336	    /* tex inst */
3337	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3338						   R500_INST_TEX_SEM_WAIT |
3339						   R500_INST_RGB_WMASK_R |
3340						   R500_INST_RGB_WMASK_G |
3341						   R500_INST_RGB_WMASK_B |
3342						   R500_INST_ALPHA_WMASK |
3343						   R500_INST_RGB_CLAMP |
3344						   R500_INST_ALPHA_CLAMP));
3345	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
3346						   R500_TEX_INST_LD |
3347						   R500_TEX_IGNORE_UNCOVERED));
3348	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3349						   R500_TEX_SRC_S_SWIZ_R |
3350						   R500_TEX_SRC_T_SWIZ_G |
3351						   R500_TEX_DST_ADDR(1) |
3352						   R500_TEX_DST_R_SWIZ_R |
3353						   R500_TEX_DST_G_SWIZ_G |
3354						   R500_TEX_DST_B_SWIZ_B |
3355						   R500_TEX_DST_A_SWIZ_A));
3356	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3357						   R500_DX_S_SWIZ_R |
3358						   R500_DX_T_SWIZ_R |
3359						   R500_DX_R_SWIZ_R |
3360						   R500_DX_Q_SWIZ_R |
3361						   R500_DY_ADDR(0) |
3362						   R500_DY_S_SWIZ_R |
3363						   R500_DY_T_SWIZ_R |
3364						   R500_DY_R_SWIZ_R |
3365						   R500_DY_Q_SWIZ_R));
3366	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3367	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3368
3369	    /* tex inst */
3370	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3371						   R500_INST_TEX_SEM_WAIT |
3372						   R500_INST_RGB_WMASK_R |
3373						   R500_INST_RGB_WMASK_G |
3374						   R500_INST_RGB_WMASK_B |
3375						   R500_INST_ALPHA_WMASK |
3376						   R500_INST_RGB_CLAMP |
3377						   R500_INST_ALPHA_CLAMP));
3378	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(2) |
3379						   R500_TEX_INST_LD |
3380						   R500_TEX_SEM_ACQUIRE |
3381						   R500_TEX_IGNORE_UNCOVERED));
3382	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3383						   R500_TEX_SRC_S_SWIZ_R |
3384						   R500_TEX_SRC_T_SWIZ_G |
3385						   R500_TEX_DST_ADDR(0) |
3386						   R500_TEX_DST_R_SWIZ_R |
3387						   R500_TEX_DST_G_SWIZ_G |
3388						   R500_TEX_DST_B_SWIZ_B |
3389						   R500_TEX_DST_A_SWIZ_A));
3390	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3391						   R500_DX_S_SWIZ_R |
3392						   R500_DX_T_SWIZ_R |
3393						   R500_DX_R_SWIZ_R |
3394						   R500_DX_Q_SWIZ_R |
3395						   R500_DY_ADDR(0) |
3396						   R500_DY_S_SWIZ_R |
3397						   R500_DY_T_SWIZ_R |
3398						   R500_DY_R_SWIZ_R |
3399						   R500_DY_Q_SWIZ_R));
3400	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3401	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3402
3403	    /* ALU inst */
3404	    /* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
3405	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3406						   R500_INST_TEX_SEM_WAIT |
3407						   R500_INST_RGB_WMASK_R |
3408						   R500_INST_RGB_WMASK_G |
3409						   R500_INST_RGB_WMASK_B |
3410						   R500_INST_ALPHA_WMASK));
3411	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3412						   R500_RGB_ADDR0_CONST |
3413						   R500_RGB_ADDR1(2) |
3414						   R500_RGB_ADDR2(0) |
3415						   R500_RGB_ADDR2_CONST));
3416	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3417						   R500_ALPHA_ADDR0_CONST |
3418						   R500_ALPHA_ADDR1(2) |
3419						   R500_ALPHA_ADDR2(0) |
3420						   R500_ALPHA_ADDR2_CONST));
3421	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3422						   R500_ALU_RGB_R_SWIZ_A_A |
3423						   R500_ALU_RGB_G_SWIZ_A_A |
3424						   R500_ALU_RGB_B_SWIZ_A_A |
3425						   R500_ALU_RGB_SEL_B_SRC1 |
3426						   R500_ALU_RGB_R_SWIZ_B_R |
3427						   R500_ALU_RGB_B_SWIZ_B_G |
3428						   R500_ALU_RGB_G_SWIZ_B_B));
3429	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3430						   R500_ALPHA_ADDRD(2) |
3431						   R500_ALPHA_SWIZ_A_0 |
3432						   R500_ALPHA_SWIZ_B_0));
3433	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3434						   R500_ALU_RGBA_ADDRD(2) |
3435						   R500_ALU_RGBA_SEL_C_SRC0 |
3436						   R500_ALU_RGBA_R_SWIZ_R |
3437						   R500_ALU_RGBA_G_SWIZ_G |
3438						   R500_ALU_RGBA_B_SWIZ_B |
3439						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3440						   R500_ALU_RGBA_A_SWIZ_0));
3441
3442	    /* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
3443	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3444						   R500_INST_TEX_SEM_WAIT |
3445						   R500_INST_RGB_WMASK_R |
3446						   R500_INST_RGB_WMASK_G |
3447						   R500_INST_RGB_WMASK_B |
3448						   R500_INST_ALPHA_WMASK));
3449	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
3450						   R500_RGB_ADDR0_CONST |
3451						   R500_RGB_ADDR1(1) |
3452						   R500_RGB_ADDR2(2)));
3453	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3454						   R500_ALPHA_ADDR0_CONST |
3455						   R500_ALPHA_ADDR1(1) |
3456						   R500_ALPHA_ADDR2(2)));
3457	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3458						   R500_ALU_RGB_R_SWIZ_A_R |
3459						   R500_ALU_RGB_G_SWIZ_A_G |
3460						   R500_ALU_RGB_B_SWIZ_A_B |
3461						   R500_ALU_RGB_SEL_B_SRC1 |
3462						   R500_ALU_RGB_R_SWIZ_B_R |
3463						   R500_ALU_RGB_B_SWIZ_B_G |
3464						   R500_ALU_RGB_G_SWIZ_B_B));
3465	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3466						   R500_ALPHA_ADDRD(2) |
3467						   R500_ALPHA_SWIZ_A_0 |
3468						   R500_ALPHA_SWIZ_B_0));
3469	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3470						   R500_ALU_RGBA_ADDRD(2) |
3471						   R500_ALU_RGBA_SEL_C_SRC2 |
3472						   R500_ALU_RGBA_R_SWIZ_R |
3473						   R500_ALU_RGBA_G_SWIZ_G |
3474						   R500_ALU_RGBA_B_SWIZ_B |
3475						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3476						   R500_ALU_RGBA_A_SWIZ_0));
3477
3478	    /* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
3479	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3480						   R500_INST_TEX_SEM_WAIT |
3481						   R500_INST_LAST |
3482						   R500_INST_RGB_OMASK_R |
3483						   R500_INST_RGB_OMASK_G |
3484						   R500_INST_RGB_OMASK_B |
3485						   R500_INST_ALPHA_OMASK |
3486						   R500_INST_RGB_CLAMP |
3487						   R500_INST_ALPHA_CLAMP));
3488	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
3489						   R500_RGB_ADDR0_CONST |
3490						   R500_RGB_ADDR1(0) |
3491						   R500_RGB_ADDR2(2)));
3492	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(2) |
3493						   R500_ALPHA_ADDR0_CONST |
3494						   R500_ALPHA_ADDR1(0) |
3495						   R500_ALPHA_ADDR2(2)));
3496	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3497						   R500_ALU_RGB_R_SWIZ_A_R |
3498						   R500_ALU_RGB_G_SWIZ_A_G |
3499						   R500_ALU_RGB_B_SWIZ_A_B |
3500						   R500_ALU_RGB_SEL_B_SRC1 |
3501						   R500_ALU_RGB_R_SWIZ_B_R |
3502						   R500_ALU_RGB_B_SWIZ_B_G |
3503						   R500_ALU_RGB_G_SWIZ_B_B));
3504	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3505						   R500_ALPHA_ADDRD(0) |
3506						   R500_ALPHA_SWIZ_A_0 |
3507						   R500_ALPHA_SWIZ_B_0));
3508	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3509						   R500_ALU_RGBA_ADDRD(0) |
3510						   R500_ALU_RGBA_SEL_C_SRC2 |
3511						   R500_ALU_RGBA_R_SWIZ_R |
3512						   R500_ALU_RGBA_G_SWIZ_G |
3513						   R500_ALU_RGBA_B_SWIZ_B |
3514						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3515						   R500_ALU_RGBA_A_SWIZ_1));
3516
3517	} else {
3518	    BEGIN_RING(2*44);
3519	    /* 2 components: 2 for tex0/1/2 */
3520	    OUT_RING_REG(R300_RS_COUNT,
3521			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3522			   R300_RS_COUNT_HIRES_EN));
3523
3524	    /* R300_INST_COUNT_RS - highest RS instruction used */
3525	    OUT_RING_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3526
3527	    /* Pixel stack frame size. */
3528	    OUT_RING_REG(R300_US_PIXSIZE, 1); /* highest temp used */
3529
3530	    /* FP length. */
3531	    OUT_RING_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3532					      R500_US_CODE_END_ADDR(3)));
3533	    OUT_RING_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3534					       R500_US_CODE_RANGE_SIZE(3)));
3535
3536	    /* Prepare for FP emission. */
3537	    OUT_RING_REG(R500_US_CODE_OFFSET, 0);
3538	    OUT_RING_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3539
3540	    /* tex inst */
3541	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3542						   R500_INST_TEX_SEM_WAIT |
3543						   R500_INST_RGB_WMASK_R |
3544						   R500_INST_RGB_WMASK_G |
3545						   R500_INST_RGB_WMASK_B |
3546						   R500_INST_ALPHA_WMASK |
3547						   R500_INST_RGB_CLAMP |
3548						   R500_INST_ALPHA_CLAMP));
3549	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3550						   R500_TEX_INST_LD |
3551						   R500_TEX_SEM_ACQUIRE |
3552						   R500_TEX_IGNORE_UNCOVERED));
3553	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3554						   R500_TEX_SRC_S_SWIZ_R |
3555						   R500_TEX_SRC_T_SWIZ_G |
3556						   R500_TEX_DST_ADDR(0) |
3557						   R500_TEX_DST_R_SWIZ_R |
3558						   R500_TEX_DST_G_SWIZ_G |
3559						   R500_TEX_DST_B_SWIZ_B |
3560						   R500_TEX_DST_A_SWIZ_A));
3561	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3562						   R500_DX_S_SWIZ_R |
3563						   R500_DX_T_SWIZ_R |
3564						   R500_DX_R_SWIZ_R |
3565						   R500_DX_Q_SWIZ_R |
3566						   R500_DY_ADDR(0) |
3567						   R500_DY_S_SWIZ_R |
3568						   R500_DY_T_SWIZ_R |
3569						   R500_DY_R_SWIZ_R |
3570						   R500_DY_Q_SWIZ_R));
3571	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3572	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3573
3574	    /* ALU inst */
3575	    /* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
3576	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3577						   R500_INST_TEX_SEM_WAIT |
3578						   R500_INST_RGB_WMASK_R |
3579						   R500_INST_RGB_WMASK_G |
3580						   R500_INST_RGB_WMASK_B |
3581						   R500_INST_ALPHA_WMASK));
3582	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3583						   R500_RGB_ADDR0_CONST |
3584						   R500_RGB_ADDR1(0) |
3585						   R500_RGB_ADDR2(0) |
3586						   R500_RGB_ADDR2_CONST));
3587	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3588						   R500_ALPHA_ADDR0_CONST |
3589						   R500_ALPHA_ADDR1(0) |
3590						   R500_ALPHA_ADDR2(0) |
3591						   R500_ALPHA_ADDR2_CONST));
3592	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3593						   R500_ALU_RGB_R_SWIZ_A_A |
3594						   R500_ALU_RGB_G_SWIZ_A_A |
3595						   R500_ALU_RGB_B_SWIZ_A_A |
3596						   R500_ALU_RGB_SEL_B_SRC1 |
3597						   R500_ALU_RGB_R_SWIZ_B_G |
3598						   R500_ALU_RGB_B_SWIZ_B_G |
3599						   R500_ALU_RGB_G_SWIZ_B_G));
3600	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3601						   R500_ALPHA_ADDRD(1) |
3602						   R500_ALPHA_SWIZ_A_0 |
3603						   R500_ALPHA_SWIZ_B_0));
3604	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3605						   R500_ALU_RGBA_ADDRD(1) |
3606						   R500_ALU_RGBA_SEL_C_SRC0 |
3607						   R500_ALU_RGBA_R_SWIZ_R |
3608						   R500_ALU_RGBA_G_SWIZ_G |
3609						   R500_ALU_RGBA_B_SWIZ_B |
3610						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3611						   R500_ALU_RGBA_A_SWIZ_0));
3612
3613	    /* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
3614	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3615						   R500_INST_TEX_SEM_WAIT |
3616						   R500_INST_RGB_WMASK_R |
3617						   R500_INST_RGB_WMASK_G |
3618						   R500_INST_RGB_WMASK_B |
3619						   R500_INST_ALPHA_WMASK));
3620	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
3621						   R500_RGB_ADDR0_CONST |
3622						   R500_RGB_ADDR1(0) |
3623						   R500_RGB_ADDR2(1)));
3624	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3625						   R500_ALPHA_ADDR0_CONST |
3626						   R500_ALPHA_ADDR1(0) |
3627						   R500_ALPHA_ADDR2(1)));
3628	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3629						   R500_ALU_RGB_R_SWIZ_A_R |
3630						   R500_ALU_RGB_G_SWIZ_A_G |
3631						   R500_ALU_RGB_B_SWIZ_A_B |
3632						   R500_ALU_RGB_SEL_B_SRC1 |
3633						   R500_ALU_RGB_R_SWIZ_B_B |
3634						   R500_ALU_RGB_B_SWIZ_B_B |
3635						   R500_ALU_RGB_G_SWIZ_B_B));
3636	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3637						   R500_ALPHA_ADDRD(1) |
3638						   R500_ALPHA_SWIZ_A_0 |
3639						   R500_ALPHA_SWIZ_B_0));
3640	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3641						   R500_ALU_RGBA_ADDRD(1) |
3642						   R500_ALU_RGBA_SEL_C_SRC2 |
3643						   R500_ALU_RGBA_R_SWIZ_R |
3644						   R500_ALU_RGBA_G_SWIZ_G |
3645						   R500_ALU_RGBA_B_SWIZ_B |
3646						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3647						   R500_ALU_RGBA_A_SWIZ_0));
3648
3649	    /* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
3650	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3651						   R500_INST_TEX_SEM_WAIT |
3652						   R500_INST_LAST |
3653						   R500_INST_RGB_OMASK_R |
3654						   R500_INST_RGB_OMASK_G |
3655						   R500_INST_RGB_OMASK_B |
3656						   R500_INST_ALPHA_OMASK |
3657						   R500_INST_RGB_CLAMP |
3658						   R500_INST_ALPHA_CLAMP));
3659	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
3660						   R500_RGB_ADDR0_CONST |
3661						   R500_RGB_ADDR1(0) |
3662						   R500_RGB_ADDR2(1)));
3663	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3664						   R500_ALPHA_ADDR0_CONST |
3665						   R500_ALPHA_ADDR1(0) |
3666						   R500_ALPHA_ADDR2(1)));
3667	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3668						   R500_ALU_RGB_R_SWIZ_A_R |
3669						   R500_ALU_RGB_G_SWIZ_A_G |
3670						   R500_ALU_RGB_B_SWIZ_A_B |
3671						   R500_ALU_RGB_SEL_B_SRC1 |
3672						   R500_ALU_RGB_R_SWIZ_B_R |
3673						   R500_ALU_RGB_B_SWIZ_B_R |
3674						   R500_ALU_RGB_G_SWIZ_B_R));
3675	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3676						   R500_ALPHA_ADDRD(1) |
3677						   R500_ALPHA_SWIZ_A_0 |
3678						   R500_ALPHA_SWIZ_B_0));
3679	    OUT_RING_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3680						   R500_ALU_RGBA_ADDRD(1) |
3681						   R500_ALU_RGBA_SEL_C_SRC2 |
3682						   R500_ALU_RGBA_R_SWIZ_R |
3683						   R500_ALU_RGBA_G_SWIZ_G |
3684						   R500_ALU_RGBA_B_SWIZ_B |
3685						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3686						   R500_ALU_RGBA_A_SWIZ_1));
3687	}
3688
3689	/* Shader constants. */
3690	OUT_RING_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
3691
3692	/* constant 0: off, yco */
3693	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[0]);
3694	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[1]);
3695	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[2]);
3696	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, yco);
3697	/* constant 1: uco */
3698	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[0]);
3699	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[1]);
3700	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[2]);
3701	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, gamma);
3702	/* constant 2: vco */
3703	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[0]);
3704	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[1]);
3705	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[2]);
3706	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0.0);
3707
3708	ADVANCE_RING();
3709    }
3710
3711    BEGIN_ACCEL_RELOC(6, 2);
3712    OUT_RING_REG(R300_TX_INVALTAGS, 0);
3713    OUT_RING_REG(R300_TX_ENABLE, txenable);
3714
3715    EMIT_WRITE_OFFSET(R300_RB3D_COLOROFFSET0, 0, pPixmap);
3716    EMIT_COLORPITCH(R300_RB3D_COLORPITCH0, colorpitch, pPixmap);
3717
3718    /* no need to enable blending */
3719    OUT_RING_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
3720
3721    OUT_RING_REG(R300_VAP_VTX_SIZE, pPriv->vtx_count);
3722    ADVANCE_RING();
3723
3724    if (pPriv->vsync) {
3725	xf86CrtcPtr crtc;
3726	if (pPriv->desired_crtc)
3727	    crtc = pPriv->desired_crtc;
3728	else
3729	    crtc = radeon_pick_best_crtc(pScrn, FALSE,
3730					 pPriv->drw_x,
3731					 pPriv->drw_x + pPriv->dst_w,
3732					 pPriv->drw_y,
3733					 pPriv->drw_y + pPriv->dst_h);
3734	if (crtc)
3735	    RADEONWaitForVLine(pScrn, pPixmap,
3736			       crtc,
3737			       pPriv->drw_y - crtc->y,
3738			       (pPriv->drw_y - crtc->y) + pPriv->dst_h);
3739    }
3740
3741    return TRUE;
3742}
3743
3744static void
3745R500DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
3746{
3747    RADEONInfoPtr info = RADEONPTR(pScrn);
3748    PixmapPtr pPixmap = pPriv->pPixmap;
3749    int dstxoff, dstyoff;
3750    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
3751    int nBox = REGION_NUM_RECTS(&pPriv->clip);
3752
3753#ifdef COMPOSITE
3754    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
3755    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
3756#else
3757    dstxoff = 0;
3758    dstyoff = 0;
3759#endif
3760
3761    if (!R500PrepareTexturedVideo(pScrn, pPriv))
3762	return;
3763
3764    /*
3765     * Rendering of the actual polygon is done in two different
3766     * ways depending on chip generation:
3767     *
3768     * < R300:
3769     *
3770     *     These chips can render a rectangle in one pass, so
3771     *     handling is pretty straight-forward.
3772     *
3773     * >= R300:
3774     *
3775     *     These chips can accept a quad, but will render it as
3776     *     two triangles which results in a diagonal tear. Instead
3777     *     We render a single, large triangle and use the scissor
3778     *     functionality to restrict it to the desired rectangle.
3779     *     Due to guardband limits on r3xx/r4xx, we can only use
3780     *     the single triangle up to 2880 pixels; above that we
3781     *     render as a quad.
3782     */
3783
3784    while (nBox--) {
3785	float srcX, srcY, srcw, srch;
3786	int dstX, dstY, dstw, dsth;
3787	int draw_size = 3 * pPriv->vtx_count + 4 + 2 + 3;
3788
3789	if (draw_size > radeon_cs_space_remaining(pScrn)) {
3790	    radeon_cs_flush_indirect(pScrn);
3791	    if (!R500PrepareTexturedVideo(pScrn, pPriv))
3792		return;
3793	}
3794
3795	dstX = pBox->x1 + dstxoff;
3796	dstY = pBox->y1 + dstyoff;
3797	dstw = pBox->x2 - pBox->x1;
3798	dsth = pBox->y2 - pBox->y1;
3799
3800	srcX = pPriv->src_x;
3801	srcX += ((pBox->x1 - pPriv->drw_x) *
3802		 pPriv->src_w) / (float)pPriv->dst_w;
3803	srcY = pPriv->src_y;
3804	srcY += ((pBox->y1 - pPriv->drw_y) *
3805		 pPriv->src_h) / (float)pPriv->dst_h;
3806
3807	srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
3808	srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
3809
3810	BEGIN_RING(2*2);
3811	OUT_RING_REG(R300_SC_SCISSOR0, (((dstX) << R300_SCISSOR_X_SHIFT) |
3812					 ((dstY) << R300_SCISSOR_Y_SHIFT)));
3813	OUT_RING_REG(R300_SC_SCISSOR1, (((dstX + dstw - 1) << R300_SCISSOR_X_SHIFT) |
3814					 ((dstY + dsth - 1) << R300_SCISSOR_Y_SHIFT)));
3815	ADVANCE_RING();
3816
3817	BEGIN_RING(3 * pPriv->vtx_count + 4);
3818	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
3819			    3 * pPriv->vtx_count));
3820	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
3821		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
3822		 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
3823
3824	if (pPriv->bicubic_enabled) {
3825	    VTX_OUT_6((float)dstX,            (float)dstY,
3826		      (float)srcX / pPriv->w, (float)srcY / pPriv->h,
3827		      (float)srcX + 0.5,      (float)srcY + 0.5);
3828	    VTX_OUT_6((float)dstX,            (float)(dstY + dstw + dsth),
3829		      (float)srcX / pPriv->w, ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h,
3830		      (float)srcX + 0.5,      (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
3831	    VTX_OUT_6((float)(dstX + dstw + dsth),                       (float)dstY,
3832		      ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
3833		      (float)srcY / pPriv->h,
3834		      (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
3835		      (float)srcY + 0.5);
3836	} else {
3837	    /*
3838	     * Render a big, scissored triangle. This means
3839	     * increasing the triangle size and adjusting
3840	     * texture coordinates.
3841	     */
3842	    VTX_OUT_4((float)dstX,            (float)dstY,
3843		      (float)srcX / pPriv->w, (float)srcY / pPriv->h);
3844	    VTX_OUT_4((float)dstX,                              (float)(dstY + dsth + dstw),
3845		      (float)srcX / pPriv->w, ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h);
3846	    VTX_OUT_4((float)(dstX + dstw + dsth),              (float)dstY,
3847		      ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
3848		      (float)srcY / pPriv->h);
3849	}
3850
3851	/* flushing is pipelined, free/finish is not */
3852	OUT_RING_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
3853
3854	ADVANCE_RING();
3855
3856	pBox++;
3857    }
3858
3859    BEGIN_RING(2*3);
3860    OUT_RING_REG(R300_SC_CLIP_RULE, 0xAAAA);
3861    OUT_RING_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
3862    OUT_RING_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
3863    ADVANCE_RING();
3864
3865    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
3866}
3867
3868#undef VTX_OUT_4
3869#undef VTX_OUT_6
3870