/* radeon_textured_videofuncs.c revision 921a55d8 */
/*
 * Copyright 2008 Alex Deucher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *
 * Based on radeon_exa_render.c and kdrive ati_video.c by Eric Anholt, et al.
 *
 */

/*
 * Exactly one of ACCEL_MMIO / ACCEL_CP must be defined by whoever
 * compiles or includes this file; it selects the suffix that FUNC_NAME()
 * appends to every function name defined below.
 */
#if defined(ACCEL_MMIO) && defined(ACCEL_CP)
#error Cannot define both MMIO and CP acceleration!
#endif

/*
 * Token pasting helper.  ANSI preprocessors use ##; the fallback relies on
 * an empty comment being removed between the two tokens.
 */
#if !defined(UNIXCPP) || defined(ANSICPP)
#define FUNC_NAME_CAT(prefix,suffix) prefix##suffix
#else
#define FUNC_NAME_CAT(prefix,suffix) prefix/**/suffix
#endif

/* FUNC_NAME(Foo) expands to FooMMIO or FooCP depending on the accel type. */
#if defined(ACCEL_MMIO)
#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,MMIO)
#elif defined(ACCEL_CP)
#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,CP)
#else
#error No accel type defined!
#endif

/*
 * Vertex emission helpers.
 *
 * VTX_OUT_6 emits one vertex made of a destination position plus two
 * coordinate pairs (named src and mask); VTX_OUT_4 emits a destination
 * position plus a single coordinate pair.  Values are written in argument
 * order.
 *
 * With ACCEL_CP each value goes out through OUT_RING_F(); otherwise each
 * value is written to RADEON_SE_PORT_DATA0 through OUT_ACCEL_REG_F().
 */
#ifdef ACCEL_CP

#define VTX_OUT_6(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)	\
do {								\
    OUT_RING_F(_dstX);						\
    OUT_RING_F(_dstY);						\
    OUT_RING_F(_srcX);						\
    OUT_RING_F(_srcY);						\
    OUT_RING_F(_maskX);						\
    OUT_RING_F(_maskY);						\
} while (0)

#define VTX_OUT_4(_dstX, _dstY, _srcX, _srcY)			\
do {								\
    OUT_RING_F(_dstX);						\
    OUT_RING_F(_dstY);						\
    OUT_RING_F(_srcX);						\
    OUT_RING_F(_srcY);						\
} while (0)

#else /* ACCEL_CP */

#define VTX_OUT_6(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)		\
do {									\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstX);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstY);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcX);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcY);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _maskX);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _maskY);			\
} while (0)

#define VTX_OUT_4(_dstX, _dstY, _srcX, _srcY)			\
do {								\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstX);		\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstY);		\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcX);		\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcY);		\
} while (0)

#endif /* !ACCEL_CP */

90static Bool
91FUNC_NAME(RADEONPrepareTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
92{
93    RADEONInfoPtr info = RADEONPTR(pScrn);
94    PixmapPtr pPixmap = pPriv->pPixmap;
95    struct radeon_exa_pixmap_priv *driver_priv;
96    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
97    uint32_t txformat, txsize, txpitch, txoffset;
98    uint32_t dst_pitch, dst_format;
99    uint32_t colorpitch;
100    int pixel_shift;
101    int scissor_w = MIN(pPixmap->drawable.width, 2047);
102    int scissor_h = MIN(pPixmap->drawable.height, 2047);
103    ACCEL_PREAMBLE();
104
105#ifdef XF86DRM_MODE
106    if (info->cs) {
107	int ret;
108
109	radeon_cs_space_reset_bos(info->cs);
110        radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
111
112	if (pPriv->bicubic_enabled)
113	    radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
114
115	driver_priv = exaGetPixmapDriverPrivate(pPixmap);
116	radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
117
118	ret = radeon_cs_space_check(info->cs);
119	if (ret) {
120	    ErrorF("Not enough RAM to hw accel xv operation\n");
121	    return FALSE;
122	}
123    }
124#endif
125
126    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
127
128
129#ifdef USE_EXA
130    if (info->useEXA) {
131	dst_pitch = exaGetPixmapPitch(pPixmap);
132    } else
133#endif
134    {
135        dst_pitch = pPixmap->devKind;
136    }
137
138#ifdef USE_EXA
139    if (info->useEXA) {
140	RADEON_SWITCH_TO_3D();
141    } else
142#endif
143    {
144	BEGIN_ACCEL(2);
145	OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
146	/* We must wait for 3d to idle, in case source was just written as a dest. */
147	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
148		      RADEON_WAIT_HOST_IDLECLEAN |
149		      RADEON_WAIT_2D_IDLECLEAN |
150		      RADEON_WAIT_3D_IDLECLEAN |
151		      RADEON_WAIT_DMA_GUI_IDLE);
152	FINISH_ACCEL();
153
154	if (!info->accel_state->XInited3D)
155	    RADEONInit3DEngine(pScrn);
156    }
157
158    /* Same for R100/R200 */
159    switch (pPixmap->drawable.bitsPerPixel) {
160    case 16:
161	if (pPixmap->drawable.depth == 15)
162	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
163	else
164	    dst_format = RADEON_COLOR_FORMAT_RGB565;
165	break;
166    case 32:
167	dst_format = RADEON_COLOR_FORMAT_ARGB8888;
168	break;
169    default:
170	return FALSE;
171    }
172
173    if (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12) {
174	pPriv->is_planar = TRUE;
175	txformat = RADEON_TXFORMAT_Y8;
176    } else {
177	pPriv->is_planar = FALSE;
178	if (pPriv->id == FOURCC_UYVY)
179	    txformat = RADEON_TXFORMAT_YVYU422;
180	else
181	    txformat = RADEON_TXFORMAT_VYUY422;
182    }
183
184    txformat |= RADEON_TXFORMAT_NON_POWER2;
185
186    colorpitch = dst_pitch >> pixel_shift;
187
188    if (RADEONTilingEnabled(pScrn, pPixmap))
189	colorpitch |= RADEON_COLOR_TILE_ENABLE;
190
191    txoffset = info->cs ? 0 : pPriv->src_offset;
192
193    BEGIN_ACCEL_RELOC(4,2);
194
195    OUT_ACCEL_REG(RADEON_RB3D_CNTL, dst_format);
196    EMIT_WRITE_OFFSET(RADEON_RB3D_COLOROFFSET, 0, pPixmap);
197    EMIT_COLORPITCH(RADEON_RB3D_COLORPITCH, colorpitch, pPixmap);
198    OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
199		  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
200
201    FINISH_ACCEL();
202
203    if (pPriv->is_planar) {
204	/* need 2 texcoord sets (even though they are identical) due
205	   to denormalization! hw apparently can't premultiply
206	   same coord set by different texture size */
207	pPriv->vtx_count = 6;
208
209	txsize = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
210		  (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
211	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
212	txpitch -= 32;
213
214	BEGIN_ACCEL_RELOC(23, 3);
215
216	OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
217					  RADEON_SE_VTX_FMT_ST0 |
218					  RADEON_SE_VTX_FMT_ST1));
219
220	OUT_ACCEL_REG(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE |
221				       RADEON_TEX_1_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
222				       RADEON_TEX_2_ENABLE | RADEON_TEX_BLEND_2_ENABLE |
223				       RADEON_PLANAR_YUV_ENABLE));
224
225	/* Y */
226	OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
227		      RADEON_MAG_FILTER_LINEAR |
228		      RADEON_MIN_FILTER_LINEAR |
229		      RADEON_CLAMP_S_CLAMP_LAST |
230		      RADEON_CLAMP_T_CLAMP_LAST |
231		      RADEON_YUV_TO_RGB);
232	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ0);
233	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_0, txoffset, src_bo);
234	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
235		      RADEON_COLOR_ARG_A_ZERO |
236		      RADEON_COLOR_ARG_B_ZERO |
237		      RADEON_COLOR_ARG_C_T0_COLOR |
238		      RADEON_BLEND_CTL_ADD |
239		      RADEON_CLAMP_TX);
240	OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
241		      RADEON_ALPHA_ARG_A_ZERO |
242		      RADEON_ALPHA_ARG_B_ZERO |
243		      RADEON_ALPHA_ARG_C_T0_ALPHA |
244		      RADEON_BLEND_CTL_ADD |
245		      RADEON_CLAMP_TX);
246
247	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
248		      (pPriv->w - 1) |
249		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
250	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
251		      pPriv->src_pitch - 32);
252
253	/* U */
254	OUT_ACCEL_REG(RADEON_PP_TXFILTER_1,
255		      RADEON_MAG_FILTER_LINEAR |
256		      RADEON_MIN_FILTER_LINEAR |
257		      RADEON_CLAMP_S_CLAMP_LAST |
258		      RADEON_CLAMP_T_CLAMP_LAST);
259	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_1, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ1);
260	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_1, txoffset + pPriv->planeu_offset, src_bo);
261	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_1,
262		      RADEON_COLOR_ARG_A_ZERO |
263		      RADEON_COLOR_ARG_B_ZERO |
264		      RADEON_COLOR_ARG_C_T0_COLOR |
265		      RADEON_BLEND_CTL_ADD |
266		      RADEON_CLAMP_TX);
267	OUT_ACCEL_REG(RADEON_PP_TXABLEND_1,
268		      RADEON_ALPHA_ARG_A_ZERO |
269		      RADEON_ALPHA_ARG_B_ZERO |
270		      RADEON_ALPHA_ARG_C_T0_ALPHA |
271		      RADEON_BLEND_CTL_ADD |
272		      RADEON_CLAMP_TX);
273
274	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_1, txsize);
275	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_1, txpitch);
276
277	/* V */
278	OUT_ACCEL_REG(RADEON_PP_TXFILTER_2,
279		      RADEON_MAG_FILTER_LINEAR |
280		      RADEON_MIN_FILTER_LINEAR |
281		      RADEON_CLAMP_S_CLAMP_LAST |
282		      RADEON_CLAMP_T_CLAMP_LAST);
283	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_2, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ1);
284	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_2, txoffset + pPriv->planev_offset, src_bo);
285	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_2,
286		      RADEON_COLOR_ARG_A_ZERO |
287		      RADEON_COLOR_ARG_B_ZERO |
288		      RADEON_COLOR_ARG_C_T0_COLOR |
289		      RADEON_BLEND_CTL_ADD |
290		      RADEON_CLAMP_TX);
291	OUT_ACCEL_REG(RADEON_PP_TXABLEND_2,
292		      RADEON_ALPHA_ARG_A_ZERO |
293		      RADEON_ALPHA_ARG_B_ZERO |
294		      RADEON_ALPHA_ARG_C_T0_ALPHA |
295		      RADEON_BLEND_CTL_ADD |
296		      RADEON_CLAMP_TX);
297
298	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_2, txsize);
299	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_2, txpitch);
300	FINISH_ACCEL();
301    } else {
302	pPriv->vtx_count = 4;
303	BEGIN_ACCEL_RELOC(9, 1);
304
305	OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
306					  RADEON_SE_VTX_FMT_ST0));
307
308	OUT_ACCEL_REG(RADEON_PP_CNTL, RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
309
310	OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
311		      RADEON_MAG_FILTER_LINEAR |
312		      RADEON_MIN_FILTER_LINEAR |
313		      RADEON_CLAMP_S_CLAMP_LAST |
314		      RADEON_CLAMP_T_CLAMP_LAST |
315		      RADEON_YUV_TO_RGB);
316	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ0);
317	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_0, txoffset, src_bo);
318	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
319		      RADEON_COLOR_ARG_A_ZERO |
320		      RADEON_COLOR_ARG_B_ZERO |
321		      RADEON_COLOR_ARG_C_T0_COLOR |
322		      RADEON_BLEND_CTL_ADD |
323		      RADEON_CLAMP_TX);
324	OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
325		      RADEON_ALPHA_ARG_A_ZERO |
326		      RADEON_ALPHA_ARG_B_ZERO |
327		      RADEON_ALPHA_ARG_C_T0_ALPHA |
328		      RADEON_BLEND_CTL_ADD |
329		      RADEON_CLAMP_TX);
330
331	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
332		      (pPriv->w - 1) |
333		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
334	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
335		      pPriv->src_pitch - 32);
336	FINISH_ACCEL();
337    }
338
339    BEGIN_ACCEL(2);
340    OUT_ACCEL_REG(RADEON_RE_TOP_LEFT, 0);
341    OUT_ACCEL_REG(RADEON_RE_WIDTH_HEIGHT, ((scissor_w << RADEON_RE_WIDTH_SHIFT) |
342					   (scissor_h << RADEON_RE_HEIGHT_SHIFT)));
343    FINISH_ACCEL();
344
345    if (pPriv->vsync) {
346	xf86CrtcPtr crtc;
347	if (pPriv->desired_crtc)
348	    crtc = pPriv->desired_crtc;
349	else
350	    crtc = radeon_pick_best_crtc(pScrn,
351					 pPriv->drw_x,
352					 pPriv->drw_x + pPriv->dst_w,
353					 pPriv->drw_y,
354					 pPriv->drw_y + pPriv->dst_h);
355	if (crtc)
356	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
357					  crtc,
358					  pPriv->drw_y - crtc->y,
359					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
360    }
361
362    return TRUE;
363}
364
365static void
366FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
367{
368    RADEONInfoPtr info = RADEONPTR(pScrn);
369    PixmapPtr pPixmap = pPriv->pPixmap;
370    int dstxoff, dstyoff;
371    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
372    int nBox = REGION_NUM_RECTS(&pPriv->clip);
373    ACCEL_PREAMBLE();
374
375#ifdef COMPOSITE
376    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
377    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
378#else
379    dstxoff = 0;
380    dstyoff = 0;
381#endif
382
383    if (!FUNC_NAME(RADEONPrepareTexturedVideo)(pScrn, pPriv))
384	return;
385
386    /*
387     * Rendering of the actual polygon is done in two different
388     * ways depending on chip generation:
389     *
390     * < R300:
391     *
392     *     These chips can render a rectangle in one pass, so
393     *     handling is pretty straight-forward.
394     *
395     * >= R300:
396     *
397     *     These chips can accept a quad, but will render it as
398     *     two triangles which results in a diagonal tear. Instead
399     *     We render a single, large triangle and use the scissor
400     *     functionality to restrict it to the desired rectangle.
401     *     Due to guardband limits on r3xx/r4xx, we can only use
402     *     the single triangle up to 2560/4021 pixels; above that we
403     *     render as a quad.
404     */
405#ifdef ACCEL_CP
406    while (nBox) {
407	int draw_size = 3 * pPriv->vtx_count + 5;
408	int loop_boxes;
409
410	if (draw_size > radeon_cs_space_remaining(pScrn)) {
411	    if (info->cs)
412		radeon_cs_flush_indirect(pScrn);
413	    else
414		RADEONCPFlushIndirect(pScrn, 1);
415	    if (!FUNC_NAME(RADEONPrepareTexturedVideo)(pScrn, pPriv))
416		return;
417	}
418	loop_boxes = MIN(radeon_cs_space_remaining(pScrn) / draw_size, nBox);
419	nBox -= loop_boxes;
420
421	BEGIN_RING(loop_boxes * 3 * pPriv->vtx_count + 5);
422	OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD,
423			    loop_boxes * 3 * pPriv->vtx_count + 1));
424	if (pPriv->is_planar)
425	    OUT_RING(RADEON_CP_VC_FRMT_XY |
426		     RADEON_CP_VC_FRMT_ST0 |
427		     RADEON_CP_VC_FRMT_ST1);
428	else
429	    OUT_RING(RADEON_CP_VC_FRMT_XY |
430		     RADEON_CP_VC_FRMT_ST0);
431	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
432		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
433		 RADEON_CP_VC_CNTL_MAOS_ENABLE |
434		 RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
435		 ((loop_boxes * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
436
437	while (loop_boxes--) {
438	    int srcX, srcY, srcw, srch;
439	    int dstX, dstY, dstw, dsth;
440	    dstX = pBox->x1 + dstxoff;
441	    dstY = pBox->y1 + dstyoff;
442	    dstw = pBox->x2 - pBox->x1;
443	    dsth = pBox->y2 - pBox->y1;
444
445	    srcX = pPriv->src_x;
446	    srcX += ((pBox->x1 - pPriv->drw_x) *
447		     pPriv->src_w) / pPriv->dst_w;
448	    srcY = pPriv->src_y;
449	    srcY += ((pBox->y1 - pPriv->drw_y) *
450		     pPriv->src_h) / pPriv->dst_h;
451
452	    srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
453	    srch = (pPriv->src_h * dsth) / pPriv->dst_h;
454
455
456	    if (pPriv->is_planar) {
457		/*
458		 * Just render a rect (using three coords).
459		 */
460		VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
461			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
462			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
463		VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
464			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
465			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
466		VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
467			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
468			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
469	    } else {
470		/*
471		 * Just render a rect (using three coords).
472		 */
473		VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
474			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
475		VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
476			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
477		VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
478			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
479	    }
480
481	    pBox++;
482	}
483
484	OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
485	ADVANCE_RING();
486    }
487#else /* ACCEL_CP */
488    BEGIN_ACCEL(nBox * pPriv->vtx_count * 3 + 2);
489    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
490				      RADEON_VF_PRIM_WALK_DATA |
491				      RADEON_VF_RADEON_MODE |
492				      ((nBox * 3) << RADEON_VF_NUM_VERTICES_SHIFT)));
493    while (nBox--) {
494	int srcX, srcY, srcw, srch;
495	int dstX, dstY, dstw, dsth;
496	dstX = pBox->x1 + dstxoff;
497	dstY = pBox->y1 + dstyoff;
498	dstw = pBox->x2 - pBox->x1;
499	dsth = pBox->y2 - pBox->y1;
500
501	srcX = pPriv->src_x;
502	srcX += ((pBox->x1 - pPriv->drw_x) *
503		 pPriv->src_w) / pPriv->dst_w;
504	srcY = pPriv->src_y;
505	srcY += ((pBox->y1 - pPriv->drw_y) *
506		 pPriv->src_h) / pPriv->dst_h;
507
508	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
509	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
510
511
512	if (pPriv->is_planar) {
513	    /*
514	     * Just render a rect (using three coords).
515	     */
516	    VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
517		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
518		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
519	    VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
520		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
521		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
522	    VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
523		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
524		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
525	} else {
526	    /*
527	     * Just render a rect (using three coords).
528	     */
529	    VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
530		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
531	    VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
532		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
533	    VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
534		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
535	}
536
537	pBox++;
538    }
539
540    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
541    FINISH_ACCEL();
542#endif /* !ACCEL_CP */
543
544    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
545}
546
547static Bool
548FUNC_NAME(R200PrepareTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
549{
550    RADEONInfoPtr info = RADEONPTR(pScrn);
551    PixmapPtr pPixmap = pPriv->pPixmap;
552    struct radeon_exa_pixmap_priv *driver_priv;
553    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
554    uint32_t txformat;
555    uint32_t txfilter, txsize, txpitch, txoffset;
556    uint32_t dst_pitch, dst_format;
557    uint32_t colorpitch;
558    int pixel_shift;
559    int scissor_w = MIN(pPixmap->drawable.width, 2047);
560    int scissor_h = MIN(pPixmap->drawable.height, 2047);
561    /* note: in contrast to r300, use input biasing on uv components */
562    const float Loff = -0.0627;
563    float uvcosf, uvsinf;
564    float yco, yoff;
565    float uco[3], vco[3];
566    float bright, cont, sat;
567    int ref = pPriv->transform_index;
568    float ucscale = 0.25, vcscale = 0.25;
569    Bool needux8 = FALSE, needvx8 = FALSE;
570    ACCEL_PREAMBLE();
571
572#ifdef XF86DRM_MODE
573    if (info->cs) {
574	int ret;
575
576	radeon_cs_space_reset_bos(info->cs);
577        radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
578
579	if (pPriv->bicubic_enabled)
580	    radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
581
582	driver_priv = exaGetPixmapDriverPrivate(pPixmap);
583	radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
584
585	ret = radeon_cs_space_check(info->cs);
586	if (ret) {
587	    ErrorF("Not enough RAM to hw accel xv operation\n");
588	    return FALSE;
589	}
590    }
591#endif
592
593    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
594
595#ifdef USE_EXA
596    if (info->useEXA) {
597	dst_pitch = exaGetPixmapPitch(pPixmap);
598    } else
599#endif
600    {
601	dst_pitch = pPixmap->devKind;
602    }
603
604#ifdef USE_EXA
605    if (info->useEXA) {
606	RADEON_SWITCH_TO_3D();
607    } else
608#endif
609    {
610	BEGIN_ACCEL(2);
611	OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
612	/* We must wait for 3d to idle, in case source was just written as a dest. */
613	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
614		      RADEON_WAIT_HOST_IDLECLEAN |
615		      RADEON_WAIT_2D_IDLECLEAN |
616		      RADEON_WAIT_3D_IDLECLEAN |
617		      RADEON_WAIT_DMA_GUI_IDLE);
618	FINISH_ACCEL();
619
620	if (!info->accel_state->XInited3D)
621	    RADEONInit3DEngine(pScrn);
622    }
623
624    /* Same for R100/R200 */
625    switch (pPixmap->drawable.bitsPerPixel) {
626    case 16:
627	if (pPixmap->drawable.depth == 15)
628	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
629	else
630	    dst_format = RADEON_COLOR_FORMAT_RGB565;
631	break;
632    case 32:
633	dst_format = RADEON_COLOR_FORMAT_ARGB8888;
634	break;
635    default:
636	return FALSE;
637    }
638
639    if (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12) {
640	pPriv->is_planar = TRUE;
641	txformat = RADEON_TXFORMAT_I8;
642    } else {
643	pPriv->is_planar = FALSE;
644	if (pPriv->id == FOURCC_UYVY)
645	    txformat = RADEON_TXFORMAT_YVYU422;
646	else
647	    txformat = RADEON_TXFORMAT_VYUY422;
648    }
649
650    txformat |= RADEON_TXFORMAT_NON_POWER2;
651
652    colorpitch = dst_pitch >> pixel_shift;
653
654    if (RADEONTilingEnabled(pScrn, pPixmap))
655	colorpitch |= RADEON_COLOR_TILE_ENABLE;
656
657    BEGIN_ACCEL_RELOC(4,2);
658
659    OUT_ACCEL_REG(RADEON_RB3D_CNTL, dst_format);
660    EMIT_WRITE_OFFSET(RADEON_RB3D_COLOROFFSET, 0, pPixmap);
661    EMIT_COLORPITCH(RADEON_RB3D_COLORPITCH, colorpitch, pPixmap);
662
663    OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
664		  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
665
666    FINISH_ACCEL();
667
668    txfilter =  R200_MAG_FILTER_LINEAR |
669	R200_MIN_FILTER_LINEAR |
670	R200_CLAMP_S_CLAMP_LAST |
671	R200_CLAMP_T_CLAMP_LAST;
672
673    /* contrast can cause constant overflow, clamp */
674    cont = RTFContrast(pPriv->contrast);
675    if (cont * trans[ref].RefLuma > 2.0)
676	cont = 2.0 / trans[ref].RefLuma;
677    /* brightness is only from -0.5 to 0.5 should be safe */
678    bright = RTFBrightness(pPriv->brightness);
679    /* saturation can also cause overflow, clamp */
680    sat = RTFSaturation(pPriv->saturation);
681    if (sat * trans[ref].RefBCb > 4.0)
682	sat = 4.0 / trans[ref].RefBCb;
683    uvcosf = sat * cos(RTFHue(pPriv->hue));
684    uvsinf = sat * sin(RTFHue(pPriv->hue));
685
686    yco = trans[ref].RefLuma * cont;
687    uco[0] = -trans[ref].RefRCr * uvsinf;
688    uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
689    uco[2] = trans[ref].RefBCb * uvcosf;
690    vco[0] = trans[ref].RefRCr * uvcosf;
691    vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
692    vco[2] = trans[ref].RefBCb * uvsinf;
693    yoff = Loff * yco + bright;
694
695    if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
696	needux8 = TRUE;
697	ucscale = 0.125;
698    }
699    if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
700	needvx8 = TRUE;
701	vcscale = 0.125;
702    }
703
704    txoffset = info->cs ? 0 : pPriv->src_offset;
705
706    if (pPriv->is_planar) {
707	/* need 2 texcoord sets (even though they are identical) due
708	   to denormalization! hw apparently can't premultiply
709	   same coord set by different texture size */
710	pPriv->vtx_count = 6;
711
712	txsize = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
713		  (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
714	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
715	txpitch -= 32;
716
717	BEGIN_ACCEL_RELOC(36, 3);
718
719	OUT_ACCEL_REG(RADEON_PP_CNTL,
720		      RADEON_TEX_0_ENABLE | RADEON_TEX_1_ENABLE | RADEON_TEX_2_ENABLE |
721		      RADEON_TEX_BLEND_0_ENABLE |
722		      RADEON_TEX_BLEND_1_ENABLE |
723		      RADEON_TEX_BLEND_2_ENABLE);
724
725	OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
726	OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
727		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT) |
728		      (2 << R200_VTX_TEX1_COMP_CNT_SHIFT));
729
730	OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
731	OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
732	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
733	OUT_ACCEL_REG(R200_PP_TXSIZE_0,
734		      (pPriv->w - 1) |
735		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
736	OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
737	OUT_TEXTURE_REG(R200_PP_TXOFFSET_0, txoffset, src_bo);
738
739	OUT_ACCEL_REG(R200_PP_TXFILTER_1, txfilter);
740	OUT_ACCEL_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
741	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_1, 0);
742	OUT_ACCEL_REG(R200_PP_TXSIZE_1, txsize);
743	OUT_ACCEL_REG(R200_PP_TXPITCH_1, txpitch);
744	OUT_TEXTURE_REG(R200_PP_TXOFFSET_1, txoffset + pPriv->planeu_offset, src_bo);
745
746	OUT_ACCEL_REG(R200_PP_TXFILTER_2, txfilter);
747	OUT_ACCEL_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
748	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_2, 0);
749	OUT_ACCEL_REG(R200_PP_TXSIZE_2, txsize);
750	OUT_ACCEL_REG(R200_PP_TXPITCH_2, txpitch);
751	OUT_TEXTURE_REG(R200_PP_TXOFFSET_2, txoffset + pPriv->planev_offset, src_bo);
752
753	/* similar to r300 code. Note the big problem is that hardware constants
754	 * are 8 bits only, representing 0.0-1.0. We can get that up (using bias
755	 * + scale) to -1.0-1.0 (but precision will suffer). AFAIK the hw actually
756	 * has 12 bits fractional precision (plus 1 sign bit, 3 range bits) but
757	 * the constants not. To get larger range can use output scale, but for
758	 * that 2.018 value we need a total scale by 8, which means the constants
759	 * really have no accuracy whatsoever (5 fractional bits only).
760	 * The only direct way to get high  precision "constants" into the fragment
761	 * pipe I know of is to use the texcoord interpolator (not color, this one
762	 * is 8 bit only too), which seems a bit expensive. We're lucky though it
763	 * seems the values we need seem to fit better than worst case (get about
764	 * 6 fractional bits for this instead of 5, at least when not correcting for
765	 * hue/saturation/contrast/brightness, which is the same as for vco - yco and
766	 * yoff get 8 fractional bits). Try to preserve as much accuracy as possible
767	 * even with non-default saturation/hue/contrast/brightness adjustments,
768	 * it gets a little crazy and ultimately precision might still be lacking.
769	 *
770	 * A higher precision (8 fractional bits) version might just put uco into
771	 * a texcoord, and calculate a new vcoconst in the shader, like so:
772	 * cohelper = {1.0, 0.0, 0.0} - shouldn't use 0.5 since not exactly representable
773	 * vco = {1.5958 - 1.0, -0.8129 + 1.0, 1.0}
774	 * vcocalc = ADD temp, bias/scale(cohelper), vco
775	 * would in total use 4 tex units, 4 instructions which seems fairly
776	 * balanced for this architecture (instead of 3 + 3 for the solution here)
777	 *
778	 * temp = MAD(yco, yuv.yyyy, yoff)
779	 * temp = MAD(uco, yuv.uuuu, temp)
780	 * result = MAD(vco, yuv.vvvv, temp)
781	 *
782	 * note first mad produces actually scalar, hence we transform
783	 * it into a dp2a to get 8 bit precision of yco instead of 7 -
784	 * That's assuming hw correctly expands consts to internal precision.
785	 * (y * 1 + y * (yco - 1) + yoff)
786	 * temp = DP2A / 2 (yco, yuv.yyyy, yoff)
787	 * temp = MAD (uco / 4, yuv.uuuu * 2, temp)
788	 * result = MAD x2 (vco / 2, yuv.vvvv, temp)
789	 *
790	 * vco, uco need bias (and hence scale too)
791	 *
792	 */
793
794	/* MAD temp0 / 2, const0.a * 2, temp0, -const0.rgb */
795	OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
796		      R200_TXC_ARG_A_TFACTOR_COLOR |
797		      R200_TXC_ARG_B_R0_COLOR |
798		      R200_TXC_ARG_C_TFACTOR_COLOR |
799		      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
800		      R200_TXC_OP_DOT2_ADD);
801	OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
802		      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
803		      R200_TXC_SCALE_INV2 |
804		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
805	OUT_ACCEL_REG(R200_PP_TXABLEND_0,
806		      R200_TXA_ARG_A_ZERO |
807		      R200_TXA_ARG_B_ZERO |
808		      R200_TXA_ARG_C_ZERO |
809		      R200_TXA_OP_MADD);
810	OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
811		      R200_TXA_OUTPUT_REG_NONE);
812
813	/* MAD temp0, (const1 - 0.5) * 2, (temp1 - 0.5) * 2, temp0 */
814	OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
815		      R200_TXC_ARG_A_TFACTOR_COLOR |
816		      R200_TXC_BIAS_ARG_A |
817		      R200_TXC_SCALE_ARG_A |
818		      R200_TXC_ARG_B_R1_COLOR |
819		      R200_TXC_BIAS_ARG_B |
820		      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
821		      R200_TXC_ARG_C_R0_COLOR |
822		      R200_TXC_OP_MADD);
823	OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
824		      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
825		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
826	OUT_ACCEL_REG(R200_PP_TXABLEND_1,
827		      R200_TXA_ARG_A_ZERO |
828		      R200_TXA_ARG_B_ZERO |
829		      R200_TXA_ARG_C_ZERO |
830		      R200_TXA_OP_MADD);
831	OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
832		      R200_TXA_OUTPUT_REG_NONE);
833
834	/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp2 - 0.5), temp0 */
835	OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
836		      R200_TXC_ARG_A_TFACTOR_COLOR |
837		      R200_TXC_BIAS_ARG_A |
838		      R200_TXC_SCALE_ARG_A |
839		      R200_TXC_ARG_B_R2_COLOR |
840		      R200_TXC_BIAS_ARG_B |
841		      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
842		      R200_TXC_ARG_C_R0_COLOR |
843		      R200_TXC_OP_MADD);
844	OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
845		      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
846		      R200_TXC_SCALE_2X |
847		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
848	OUT_ACCEL_REG(R200_PP_TXABLEND_2,
849		      R200_TXA_ARG_A_ZERO |
850		      R200_TXA_ARG_B_ZERO |
851		      R200_TXA_ARG_C_ZERO |
852		      R200_TXA_COMP_ARG_C |
853		      R200_TXA_OP_MADD);
854	OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
855		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
856
857	/* shader constants */
858	OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
859						      yco > 1.0 ? yco - 1.0: yco,
860						      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
861						      0.0));
862	OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
863						      uco[1] * ucscale + 0.5, /* or [-2, 2] */
864						      uco[2] * ucscale + 0.5,
865						      0.0));
866	OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
867						      vco[1] * vcscale + 0.5, /* or [-4, 4] */
868						      vco[2] * vcscale + 0.5,
869						      0.0));
870
871	FINISH_ACCEL();
872    } else {
873	pPriv->vtx_count = 4;
874
875	BEGIN_ACCEL_RELOC(24, 1);
876
877	OUT_ACCEL_REG(RADEON_PP_CNTL,
878		      RADEON_TEX_0_ENABLE |
879		      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
880		      RADEON_TEX_BLEND_2_ENABLE);
881
882	OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
883	OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
884		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
885
886	OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
887	OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
888	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
889	OUT_ACCEL_REG(R200_PP_TXSIZE_0,
890		      (pPriv->w - 1) |
891		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
892	OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
893	OUT_TEXTURE_REG(R200_PP_TXOFFSET_0, txoffset, src_bo);
894
895	/* MAD temp1 / 2, const0.a * 2, temp0.ggg, -const0.rgb */
896	OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
897		      R200_TXC_ARG_A_TFACTOR_COLOR |
898		      R200_TXC_ARG_B_R0_COLOR |
899		      R200_TXC_ARG_C_TFACTOR_COLOR |
900		      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
901		      R200_TXC_OP_DOT2_ADD);
902	OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
903		      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
904		      R200_TXC_SCALE_INV2 |
905		      (R200_TXC_REPL_GREEN << R200_TXC_REPL_ARG_B_SHIFT) |
906		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
907	OUT_ACCEL_REG(R200_PP_TXABLEND_0,
908		      R200_TXA_ARG_A_ZERO |
909		      R200_TXA_ARG_B_ZERO |
910		      R200_TXA_ARG_C_ZERO |
911		      R200_TXA_OP_MADD);
912	OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
913		      R200_TXA_OUTPUT_REG_NONE);
914
915	/* MAD temp1, (const1 - 0.5) * 2, (temp0.rrr - 0.5) * 2, temp1 */
916	OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
917		      R200_TXC_ARG_A_TFACTOR_COLOR |
918		      R200_TXC_BIAS_ARG_A |
919		      R200_TXC_SCALE_ARG_A |
920		      R200_TXC_ARG_B_R0_COLOR |
921		      R200_TXC_BIAS_ARG_B |
922		      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
923		      R200_TXC_ARG_C_R1_COLOR |
924		      R200_TXC_OP_MADD);
925	OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
926		      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
927		      (R200_TXC_REPL_BLUE << R200_TXC_REPL_ARG_B_SHIFT) |
928		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
929	OUT_ACCEL_REG(R200_PP_TXABLEND_1,
930		      R200_TXA_ARG_A_ZERO |
931		      R200_TXA_ARG_B_ZERO |
932		      R200_TXA_ARG_C_ZERO |
933		      R200_TXA_OP_MADD);
934	OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
935		      R200_TXA_OUTPUT_REG_NONE);
936
937	/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp0.bbb - 0.5), temp1 */
938	OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
939		      R200_TXC_ARG_A_TFACTOR_COLOR |
940		      R200_TXC_BIAS_ARG_A |
941		      R200_TXC_SCALE_ARG_A |
942		      R200_TXC_ARG_B_R0_COLOR |
943		      R200_TXC_BIAS_ARG_B |
944		      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
945		      R200_TXC_ARG_C_R1_COLOR |
946		      R200_TXC_OP_MADD);
947	OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
948		      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
949		      R200_TXC_SCALE_2X |
950		      (R200_TXC_REPL_RED << R200_TXC_REPL_ARG_B_SHIFT) |
951		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
952	OUT_ACCEL_REG(R200_PP_TXABLEND_2,
953		      R200_TXA_ARG_A_ZERO |
954		      R200_TXA_ARG_B_ZERO |
955		      R200_TXA_ARG_C_ZERO |
956		      R200_TXA_COMP_ARG_C |
957		      R200_TXA_OP_MADD);
958	OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
959		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
960
961	/* shader constants */
962	OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
963						      yco > 1.0 ? yco - 1.0: yco,
964						      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
965						      0.0));
966	OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
967						      uco[1] * ucscale + 0.5, /* or [-2, 2] */
968						      uco[2] * ucscale + 0.5,
969						      0.0));
970	OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
971						      vco[1] * vcscale + 0.5, /* or [-4, 4] */
972						      vco[2] * vcscale + 0.5,
973						      0.0));
974
975	FINISH_ACCEL();
976    }
977
978    BEGIN_ACCEL(2);
979    OUT_ACCEL_REG(RADEON_RE_TOP_LEFT, 0);
980    OUT_ACCEL_REG(RADEON_RE_WIDTH_HEIGHT, ((scissor_w << RADEON_RE_WIDTH_SHIFT) |
981					   (scissor_h << RADEON_RE_HEIGHT_SHIFT)));
982    FINISH_ACCEL();
983
984    if (pPriv->vsync) {
985	xf86CrtcPtr crtc;
986	if (pPriv->desired_crtc)
987	    crtc = pPriv->desired_crtc;
988	else
989	    crtc = radeon_pick_best_crtc(pScrn,
990					 pPriv->drw_x,
991					 pPriv->drw_x + pPriv->dst_w,
992					 pPriv->drw_y,
993					 pPriv->drw_y + pPriv->dst_h);
994	if (crtc)
995	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
996					  crtc,
997					  pPriv->drw_y - crtc->y,
998					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
999    }
1000
1001    return TRUE;
1002}
1003
1004static void
1005FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
1006{
1007    RADEONInfoPtr info = RADEONPTR(pScrn);
1008    PixmapPtr pPixmap = pPriv->pPixmap;
1009    int dstxoff, dstyoff;
1010    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
1011    int nBox = REGION_NUM_RECTS(&pPriv->clip);
1012    ACCEL_PREAMBLE();
1013
1014#ifdef COMPOSITE
1015    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
1016    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
1017#else
1018    dstxoff = 0;
1019    dstyoff = 0;
1020#endif
1021
1022    if (!FUNC_NAME(R200PrepareTexturedVideo)(pScrn, pPriv))
1023	return;
1024
1025    /*
1026     * Rendering of the actual polygon is done in two different
1027     * ways depending on chip generation:
1028     *
1029     * < R300:
1030     *
1031     *     These chips can render a rectangle in one pass, so
1032     *     handling is pretty straight-forward.
1033     *
1034     * >= R300:
1035     *
1036     *     These chips can accept a quad, but will render it as
1037     *     two triangles which results in a diagonal tear. Instead
1038     *     We render a single, large triangle and use the scissor
1039     *     functionality to restrict it to the desired rectangle.
1040     *     Due to guardband limits on r3xx/r4xx, we can only use
1041     *     the single triangle up to 2560/4021 pixels; above that we
1042     *     render as a quad.
1043     */
1044
1045#ifdef ACCEL_CP
1046    while (nBox) {
1047	int draw_size = 3 * pPriv->vtx_count + 4;
1048	int loop_boxes;
1049
1050	if (draw_size > radeon_cs_space_remaining(pScrn)) {
1051	    if (info->cs)
1052		radeon_cs_flush_indirect(pScrn);
1053	    else
1054		RADEONCPFlushIndirect(pScrn, 1);
1055	    if (!FUNC_NAME(R200PrepareTexturedVideo)(pScrn, pPriv))
1056		return;
1057	}
1058	loop_boxes = MIN(radeon_cs_space_remaining(pScrn) / draw_size, nBox);
1059	nBox -= loop_boxes;
1060
1061	BEGIN_RING(loop_boxes * 3 * pPriv->vtx_count + 4);
1062	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
1063			    loop_boxes * 3 * pPriv->vtx_count));
1064	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
1065		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
1066		 ((loop_boxes * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
1067
1068	while (loop_boxes--) {
1069	    int srcX, srcY, srcw, srch;
1070	    int dstX, dstY, dstw, dsth;
1071	    dstX = pBox->x1 + dstxoff;
1072	    dstY = pBox->y1 + dstyoff;
1073	    dstw = pBox->x2 - pBox->x1;
1074	    dsth = pBox->y2 - pBox->y1;
1075
1076	    srcX = pPriv->src_x;
1077	    srcX += ((pBox->x1 - pPriv->drw_x) *
1078		     pPriv->src_w) / pPriv->dst_w;
1079	    srcY = pPriv->src_y;
1080	    srcY += ((pBox->y1 - pPriv->drw_y) *
1081		     pPriv->src_h) / pPriv->dst_h;
1082
1083	    srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
1084	    srch = (pPriv->src_h * dsth) / pPriv->dst_h;
1085
1086	    if (pPriv->is_planar) {
1087		/*
1088		 * Just render a rect (using three coords).
1089		 */
1090		VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
1091			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
1092			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
1093		VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
1094			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
1095			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
1096		VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
1097			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
1098			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
1099	    } else {
1100		/*
1101		 * Just render a rect (using three coords).
1102		 */
1103		VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
1104			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
1105		VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
1106			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
1107		VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
1108			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
1109	    }
1110
1111	    pBox++;
1112	}
1113
1114	OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
1115	ADVANCE_RING();
1116    }
1117#else /* ACCEL_CP */
1118    BEGIN_ACCEL(nBox * 3 * pPriv->vtx_count + 2);
1119    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
1120				      RADEON_VF_PRIM_WALK_DATA |
1121				      ((nBox * 3) << RADEON_VF_NUM_VERTICES_SHIFT)));
1122    while (nBox--) {
1123	int srcX, srcY, srcw, srch;
1124	int dstX, dstY, dstw, dsth;
1125	dstX = pBox->x1 + dstxoff;
1126	dstY = pBox->y1 + dstyoff;
1127	dstw = pBox->x2 - pBox->x1;
1128	dsth = pBox->y2 - pBox->y1;
1129
1130	srcX = pPriv->src_x;
1131	srcX += ((pBox->x1 - pPriv->drw_x) *
1132		 pPriv->src_w) / pPriv->dst_w;
1133	srcY = pPriv->src_y;
1134	srcY += ((pBox->y1 - pPriv->drw_y) *
1135		 pPriv->src_h) / pPriv->dst_h;
1136
1137	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
1138	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
1139
1140	if (pPriv->is_planar) {
1141	    /*
1142	     * Just render a rect (using three coords).
1143	     */
1144	    VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
1145		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
1146		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
1147	    VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
1148		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
1149		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
1150	    VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
1151		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
1152		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
1153	} else {
1154	    /*
1155	     * Just render a rect (using three coords).
1156	     */
1157	    VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
1158		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
1159	    VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
1160		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
1161	    VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
1162		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
1163	}
1164
1165	pBox++;
1166    }
1167
1168    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
1169    FINISH_ACCEL();
1170#endif /* !ACCEL_CP */
1171
1172    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
1173}
1174
1175static Bool
1176FUNC_NAME(R300PrepareTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
1177{
1178    RADEONInfoPtr info = RADEONPTR(pScrn);
1179    PixmapPtr pPixmap = pPriv->pPixmap;
1180    struct radeon_exa_pixmap_priv *driver_priv;
1181    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
1182    uint32_t txfilter, txformat0, txformat1, txoffset, txpitch;
1183    uint32_t dst_pitch, dst_format;
1184    uint32_t txenable, colorpitch, bicubic_offset;
1185    uint32_t output_fmt;
1186    int pixel_shift;
1187    ACCEL_PREAMBLE();
1188
1189#ifdef XF86DRM_MODE
1190    if (info->cs) {
1191	int ret;
1192
1193	radeon_cs_space_reset_bos(info->cs);
1194	radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
1195
1196	if (pPriv->bicubic_enabled)
1197	  radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
1198
1199	driver_priv = exaGetPixmapDriverPrivate(pPixmap);
1200	radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
1201
1202	ret = radeon_cs_space_check(info->cs);
1203	if (ret) {
1204	    ErrorF("Not enough RAM to hw accel xv operation\n");
1205	    return FALSE;
1206	}
1207    }
1208#endif
1209
1210    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
1211
1212#ifdef USE_EXA
1213    if (info->useEXA) {
1214	dst_pitch = exaGetPixmapPitch(pPixmap);
1215    } else
1216#endif
1217    {
1218	dst_pitch = pPixmap->devKind;
1219    }
1220
1221#ifdef USE_EXA
1222    if (info->useEXA) {
1223	RADEON_SWITCH_TO_3D();
1224    } else
1225#endif
1226    {
1227	BEGIN_ACCEL(2);
1228	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
1229	/* We must wait for 3d to idle, in case source was just written as a dest. */
1230	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
1231		      RADEON_WAIT_HOST_IDLECLEAN |
1232		      RADEON_WAIT_2D_IDLECLEAN |
1233		      RADEON_WAIT_3D_IDLECLEAN |
1234		      RADEON_WAIT_DMA_GUI_IDLE);
1235	FINISH_ACCEL();
1236
1237	if (!info->accel_state->XInited3D)
1238	    RADEONInit3DEngine(pScrn);
1239    }
1240
1241    if (pPriv->bicubic_enabled)
1242	pPriv->vtx_count = 6;
1243    else
1244	pPriv->vtx_count = 4;
1245
1246    switch (pPixmap->drawable.bitsPerPixel) {
1247    case 16:
1248	if (pPixmap->drawable.depth == 15)
1249	    dst_format = R300_COLORFORMAT_ARGB1555;
1250	else
1251	    dst_format = R300_COLORFORMAT_RGB565;
1252	break;
1253    case 32:
1254	dst_format = R300_COLORFORMAT_ARGB8888;
1255	break;
1256    default:
1257	return FALSE;
1258    }
1259
1260    output_fmt = (R300_OUT_FMT_C4_8 |
1261		  R300_OUT_FMT_C0_SEL_BLUE |
1262		  R300_OUT_FMT_C1_SEL_GREEN |
1263		  R300_OUT_FMT_C2_SEL_RED |
1264		  R300_OUT_FMT_C3_SEL_ALPHA);
1265
1266    colorpitch = dst_pitch >> pixel_shift;
1267    colorpitch |= dst_format;
1268
1269    if (RADEONTilingEnabled(pScrn, pPixmap))
1270	colorpitch |= R300_COLORTILE;
1271
1272
1273    if (((pPriv->bicubic_state == BICUBIC_OFF)) &&
1274	(pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12))
1275	pPriv->is_planar = TRUE;
1276    else
1277	pPriv->is_planar = FALSE;
1278
1279    if (pPriv->is_planar) {
1280	txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
1281	txpitch = pPriv->src_pitch;
1282    } else {
1283	if (pPriv->id == FOURCC_UYVY)
1284	    txformat1 = R300_TX_FORMAT_YVYU422;
1285	else
1286	    txformat1 = R300_TX_FORMAT_VYUY422;
1287
1288	if (pPriv->bicubic_state != BICUBIC_OFF)
1289	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
1290
1291	/* pitch is in pixels */
1292	txpitch = pPriv->src_pitch / 2;
1293    }
1294    txpitch -= 1;
1295
1296    txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
1297		 (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
1298		 R300_TXPITCH_EN);
1299
1300    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
1301		R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
1302		R300_TX_MAG_FILTER_LINEAR |
1303		R300_TX_MIN_FILTER_LINEAR |
1304		(0 << R300_TX_ID_SHIFT));
1305
1306    txoffset = info->cs ? 0 : pPriv->src_offset;
1307
1308    BEGIN_ACCEL_RELOC(6, 1);
1309    OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter);
1310    OUT_ACCEL_REG(R300_TX_FILTER1_0, 0);
1311    OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0);
1312    if (pPriv->is_planar)
1313	OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1 | R300_TX_FORMAT_CACHE_HALF_REGION_0);
1314    else
1315	OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
1316    OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch);
1317    OUT_TEXTURE_REG(R300_TX_OFFSET_0, txoffset, src_bo);
1318    FINISH_ACCEL();
1319
1320    txenable = R300_TEX_0_ENABLE;
1321
1322    if (pPriv->is_planar) {
1323	txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
1324		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
1325		     R300_TXPITCH_EN);
1326	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
1327	txpitch -= 1;
1328	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
1329		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
1330		    R300_TX_MIN_FILTER_LINEAR |
1331		    R300_TX_MAG_FILTER_LINEAR);
1332
1333	BEGIN_ACCEL_RELOC(12, 2);
1334	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
1335	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
1336	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
1337	OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2);
1338	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
1339	OUT_TEXTURE_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset, src_bo);
1340	OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
1341	OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
1342	OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
1343	OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3);
1344	OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
1345	OUT_TEXTURE_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset, src_bo);
1346	FINISH_ACCEL();
1347	txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
1348    }
1349
1350    if (pPriv->bicubic_enabled) {
1351	/* Size is 128x1 */
1352	txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
1353		     (0x0 << R300_TXHEIGHT_SHIFT) |
1354		     R300_TXPITCH_EN);
1355	/* Format is 32-bit floats, 4bpp */
1356	txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
1357	/* Pitch is 127 (128-1) */
1358	txpitch = 0x7f;
1359	/* Tex filter */
1360	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
1361		    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
1362		    R300_TX_MIN_FILTER_NEAREST |
1363		    R300_TX_MAG_FILTER_NEAREST |
1364		    (1 << R300_TX_ID_SHIFT));
1365
1366	if (info->cs)
1367	    bicubic_offset = 0;
1368	else
1369	    bicubic_offset = pPriv->bicubic_src_offset;
1370
1371	BEGIN_ACCEL_RELOC(6, 1);
1372	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter);
1373	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
1374	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
1375	OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1);
1376	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
1377	OUT_TEXTURE_REG(R300_TX_OFFSET_1, bicubic_offset, info->bicubic_bo);
1378	FINISH_ACCEL();
1379
1380	/* Enable tex 1 */
1381	txenable |= R300_TEX_1_ENABLE;
1382    }
1383
1384    /* setup the VAP */
1385    if (info->accel_state->has_tcl) {
1386	if (pPriv->bicubic_enabled)
1387	    BEGIN_ACCEL(7);
1388	else
1389	    BEGIN_ACCEL(6);
1390    } else {
1391	if (pPriv->bicubic_enabled)
1392	    BEGIN_ACCEL(5);
1393	else
1394	    BEGIN_ACCEL(4);
1395    }
1396
1397    /* These registers define the number, type, and location of data submitted
1398     * to the PVS unit of GA input (when PVS is disabled)
1399     * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
1400     * enabled.  This memory provides the imputs to the vertex shader program
1401     * and ordering is not important.  When PVS/TCL is disabled, this field maps
1402     * directly to the GA input memory and the order is signifigant.  In
1403     * PVS_BYPASS mode the order is as follows:
1404     * Position
1405     * Point Size
1406     * Color 0-3
1407     * Textures 0-7
1408     * Fog
1409     */
1410    if (pPriv->bicubic_enabled) {
1411	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
1412		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
1413		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
1414		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
1415		       R300_SIGNED_0 |
1416		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
1417		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
1418		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
1419		       R300_SIGNED_1));
1420	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1,
1421		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
1422		       (0 << R300_SKIP_DWORDS_2_SHIFT) |
1423		       (7 << R300_DST_VEC_LOC_2_SHIFT) |
1424		       R300_LAST_VEC_2 |
1425		       R300_SIGNED_2));
1426    } else {
1427	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
1428		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
1429		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
1430		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
1431		       R300_SIGNED_0 |
1432		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
1433		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
1434		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
1435		       R300_LAST_VEC_1 |
1436		       R300_SIGNED_1));
1437    }
1438
1439    /* load the vertex shader
1440     * We pre-load vertex programs in RADEONInit3DEngine():
1441     * - exa
1442     * - Xv
1443     * - Xv bicubic
1444     * Here we select the offset of the vertex program we want to use
1445     */
1446    if (info->accel_state->has_tcl) {
1447	if (pPriv->bicubic_enabled) {
1448	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
1449			  ((11 << R300_PVS_FIRST_INST_SHIFT) |
1450			   (13 << R300_PVS_XYZW_VALID_INST_SHIFT) |
1451			   (13 << R300_PVS_LAST_INST_SHIFT)));
1452	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
1453			  (13 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
1454	} else {
1455	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
1456			  ((9 << R300_PVS_FIRST_INST_SHIFT) |
1457			   (10 << R300_PVS_XYZW_VALID_INST_SHIFT) |
1458			   (10 << R300_PVS_LAST_INST_SHIFT)));
1459	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
1460			  (10 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
1461	}
1462    }
1463
1464    /* Position and one set of 2 texture coordinates */
1465    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
1466    if (pPriv->bicubic_enabled)
1467	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
1468					       (2 << R300_TEX_1_COMP_CNT_SHIFT)));
1469    else
1470	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
1471
1472    OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt);
1473    FINISH_ACCEL();
1474
1475    /* setup pixel shader */
1476    if (pPriv->bicubic_state != BICUBIC_OFF) {
1477	if (pPriv->bicubic_enabled) {
1478	    BEGIN_ACCEL(79);
1479
1480	    /* 4 components: 2 for tex0 and 2 for tex1 */
1481	    OUT_ACCEL_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1482					  R300_RS_COUNT_HIRES_EN));
1483
1484	    /* R300_INST_COUNT_RS - highest RS instruction used */
1485	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1));
1486
1487	    /* Pixel stack frame size. */
1488	    OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
1489
1490	    /* Indirection levels */
1491	    OUT_ACCEL_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) |
1492					   R300_FIRST_TEX));
1493
1494	    /* Set nodes. */
1495	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1496						R300_ALU_CODE_SIZE(14) |
1497						R300_TEX_CODE_OFFSET(0) |
1498						R300_TEX_CODE_SIZE(6)));
1499
1500	    /* Nodes are allocated highest first, but executed lowest first */
1501	    OUT_ACCEL_REG(R300_US_CODE_ADDR_0, 0);
1502	    OUT_ACCEL_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) |
1503						R300_ALU_SIZE(0) |
1504						R300_TEX_START(0) |
1505						R300_TEX_SIZE(0)));
1506	    OUT_ACCEL_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) |
1507						R300_ALU_SIZE(9) |
1508						R300_TEX_START(1) |
1509						R300_TEX_SIZE(0)));
1510	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) |
1511						R300_ALU_SIZE(2) |
1512						R300_TEX_START(2) |
1513						R300_TEX_SIZE(3) |
1514						R300_RGBA_OUT));
1515
1516	    /* ** BICUBIC FP ** */
1517
1518	    /* texcoord0 => temp0
1519	     * texcoord1 => temp1 */
1520
1521	    // first node
1522	    /* TEX temp2, temp1.rrr0, tex1, 1D */
1523	    OUT_ACCEL_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) |
1524						R300_TEX_ID(1) |
1525						R300_TEX_SRC_ADDR(1) |
1526						R300_TEX_DST_ADDR(2)));
1527
1528	    /* MOV temp1.r, temp1.ggg0 */
1529	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1530						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1531						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1532						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1533	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) |
1534						    R300_ALU_RGB_ADDRD(1) |
1535						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
1536	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1537						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1538						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1539						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1540	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) |
1541						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1542
1543
1544	    // second node
1545	    /* TEX temp1, temp1, tex1, 1D */
1546	    OUT_ACCEL_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) |
1547						R300_TEX_ID(1) |
1548						R300_TEX_SRC_ADDR(1) |
1549						R300_TEX_DST_ADDR(1)));
1550
1551	    /* MUL temp3.rg, temp2.ggg0, const0.rgb0 */
1552	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1553						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1554						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1555						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1556	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) |
1557						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
1558						    R300_ALU_RGB_ADDRD(3) |
1559						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1560	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1561						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1562						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1563						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1564	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) |
1565						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1566
1567
1568	    /* MUL temp2.rg, temp2.rrr0, const0.rgb */
1569	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1570						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1571						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1572						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1573	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) |
1574						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
1575						    R300_ALU_RGB_ADDRD(2) |
1576						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1577	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1578						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1579						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1580						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1581	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) |
1582						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1583
1584	    /* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */
1585	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1586						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1587						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1588						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1589	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) |
1590						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1591						    R300_ALU_RGB_ADDR2(3) |
1592						    R300_ALU_RGB_ADDRD(4) |
1593						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1594	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1595						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1596						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1597						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1598	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) |
1599						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1600
1601	    /* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */
1602	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1603						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1604						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1605						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1606	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) |
1607						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1608						    R300_ALU_RGB_ADDR2(2) |
1609						    R300_ALU_RGB_ADDRD(5) |
1610						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1611	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1612						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1613						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1614						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1615	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) |
1616						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1617
1618	    /* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */
1619	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1620						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1621						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1622						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1623	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) |
1624						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1625						    R300_ALU_RGB_ADDR2(3) |
1626						    R300_ALU_RGB_ADDRD(3) |
1627						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1628	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1629						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1630						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1631						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1632	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) |
1633						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1634
1635	    /* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */
1636	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1637						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1638						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1639						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1640	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) |
1641						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1642						    R300_ALU_RGB_ADDR2(2) |
1643						    R300_ALU_RGB_ADDRD(1) |
1644						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1645	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1646						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1647						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1648						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1649	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) |
1650						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1651
1652	    /* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */
1653	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1654						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1655						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1656						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1657	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
1658						    R300_ALU_RGB_ADDR2(1) |
1659						    R300_ALU_RGB_ADDRD(1) |
1660						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1661	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1662						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1663						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1664						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1665	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) |
1666						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1667
1668	    /* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */
1669	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1670						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1671						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1672						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1673	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
1674						    R300_ALU_RGB_ADDR2(3) |
1675						    R300_ALU_RGB_ADDRD(2) |
1676						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1677	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1678						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1679						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1680						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1681	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) |
1682						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1683
1684	    /* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */
1685	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1686						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1687						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1688						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1689	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
1690						    R300_ALU_RGB_ADDR2(5) |
1691						    R300_ALU_RGB_ADDRD(3) |
1692						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1693	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1694						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1695						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1696						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1697	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) |
1698						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1699
1700	    /* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */
1701	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1702						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1703						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1704						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1705	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) |
1706						     R300_ALU_RGB_ADDR2(4) |
1707						     R300_ALU_RGB_ADDRD(0) |
1708						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1709	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1710						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1711						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1712						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1713	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) |
1714						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1715
1716
1717	    // third node
1718	    /* TEX temp4, temp1.rg--, tex0, 1D */
1719	    OUT_ACCEL_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) |
1720						R300_TEX_ID(0) |
1721						R300_TEX_SRC_ADDR(1) |
1722						R300_TEX_DST_ADDR(4)));
1723
1724	    /* TEX temp3, temp3.rg--, tex0, 1D */
1725	    OUT_ACCEL_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) |
1726						R300_TEX_ID(0) |
1727						R300_TEX_SRC_ADDR(3) |
1728						R300_TEX_DST_ADDR(3)));
1729
1730	    /* TEX temp5, temp2.rg--, tex0, 1D */
1731	    OUT_ACCEL_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) |
1732						R300_TEX_ID(0) |
1733						R300_TEX_SRC_ADDR(2) |
1734						R300_TEX_DST_ADDR(5)));
1735
1736	    /* TEX temp0, temp0.rg--, tex0, 1D */
1737	    OUT_ACCEL_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) |
1738						R300_TEX_ID(0) |
1739						R300_TEX_SRC_ADDR(0) |
1740						R300_TEX_DST_ADDR(0)));
1741
1742	    /* LRP temp3, temp1.bbbb, temp4, temp3 ->
1743	     * - PRESUB temps, temp4 - temp3
1744	     * - MAD temp3, temp1.bbbb, temps, temp3 */
1745	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1746						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1747						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1748						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1749						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
1750	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) |
1751						     R300_ALU_RGB_ADDR1(4) |
1752						     R300_ALU_RGB_ADDR2(1) |
1753						     R300_ALU_RGB_ADDRD(3) |
1754						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1755	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1756						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1757						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1758						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1759	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) |
1760						       R300_ALU_ALPHA_ADDR1(4) |
1761						       R300_ALU_ALPHA_ADDR2(1) |
1762						       R300_ALU_ALPHA_ADDRD(3) |
1763						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
1764
1765	    /* LRP temp0, temp1.bbbb, temp5, temp0 ->
1766	     * - PRESUB temps, temp5 - temp0
1767	     * - MAD temp0, temp1.bbbb, temps, temp0 */
1768	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1769						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1770						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1771						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1772						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) |
1773						     R300_ALU_RGB_INSERT_NOP));
1774	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) |
1775						     R300_ALU_RGB_ADDR1(5) |
1776						     R300_ALU_RGB_ADDR2(1) |
1777						     R300_ALU_RGB_ADDRD(0) |
1778						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1779	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1780						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1781						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1782						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1783	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) |
1784						       R300_ALU_ALPHA_ADDR1(5) |
1785						       R300_ALU_ALPHA_ADDR2(1) |
1786						       R300_ALU_ALPHA_ADDRD(0) |
1787						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
1788
1789	    /* LRP output, temp2.bbbb, temp3, temp0 ->
1790	     * - PRESUB temps, temp3 - temp0
1791	     * - MAD output, temp2.bbbb, temps, temp0 */
1792	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1793						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1794						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1795						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1796						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
1797	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) |
1798						     R300_ALU_RGB_ADDR1(3) |
1799						     R300_ALU_RGB_ADDR2(2) |
1800						     R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
1801	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1802						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1803						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1804						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1805	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) |
1806						       R300_ALU_ALPHA_ADDR1(3) |
1807						       R300_ALU_ALPHA_ADDR2(2) |
1808						       R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A)));
1809
1810	    /* Shader constants. */
1811	    OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w));
1812	    OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), 0);
1813	    OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), 0);
1814	    OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), 0);
1815
1816	    OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), 0);
1817	    OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h));
1818	    OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), 0);
1819	    OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), 0);
1820
1821	    FINISH_ACCEL();
1822	} else {
1823	    BEGIN_ACCEL(11);
1824	    /* 2 components: 2 for tex0 */
1825	    OUT_ACCEL_REG(R300_RS_COUNT,
1826                          ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1827                           R300_RS_COUNT_HIRES_EN));
1828	    /* R300_INST_COUNT_RS - highest RS instruction used */
1829	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
1830
1831	    OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
1832
1833	    /* Indirection levels */
1834	    OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
1835					   R300_FIRST_TEX));
1836
1837	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1838						R300_ALU_CODE_SIZE(1) |
1839						R300_TEX_CODE_OFFSET(0) |
1840						R300_TEX_CODE_SIZE(1)));
1841
1842	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
1843						R300_ALU_SIZE(0) |
1844						R300_TEX_START(0) |
1845						R300_TEX_SIZE(0) |
1846						R300_RGBA_OUT));
1847
1848	    /* tex inst */
1849	    OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
1850					       R300_TEX_DST_ADDR(0) |
1851					       R300_TEX_ID(0) |
1852					       R300_TEX_INST(R300_TEX_INST_LD)));
1853
1854	    /* ALU inst */
1855	    /* RGB */
1856	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) |
1857                                                   R300_ALU_RGB_ADDR1(0) |
1858                                                   R300_ALU_RGB_ADDR2(0) |
1859                                                   R300_ALU_RGB_ADDRD(0) |
1860                                                   R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R |
1861								       R300_ALU_RGB_MASK_G |
1862								       R300_ALU_RGB_MASK_B)) |
1863                                                   R300_ALU_RGB_TARGET_A));
1864	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1865                                                   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1866                                                   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1867						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1868                                                   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
1869                                                   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1870                                                   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1871                                                   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
1872                                                   R300_ALU_RGB_CLAMP));
1873	    /* Alpha */
1874	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) |
1875						     R300_ALU_ALPHA_ADDR1(0) |
1876						     R300_ALU_ALPHA_ADDR2(0) |
1877						     R300_ALU_ALPHA_ADDRD(0) |
1878						     R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
1879						     R300_ALU_ALPHA_TARGET_A |
1880						     R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE)));
1881	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) |
1882						     R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) |
1883						     R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) |
1884						     R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) |
1885						     R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) |
1886						     R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) |
1887						     R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1888						     R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) |
1889						     R300_ALU_ALPHA_CLAMP));
1890	    FINISH_ACCEL();
1891	}
1892    } else {
1893	/*
1894	 * y' = y - .0625
1895	 * u' = u - .5
1896	 * v' = v - .5;
1897	 *
1898	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
1899	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
1900	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
1901	 *
1902	 * DP3 might look like the straightforward solution
1903	 * but we'd need to move the texture yuv values in
1904	 * the same reg for this to work. Therefore use MADs.
1905	 * Brightness just adds to the off constant.
1906	 * Contrast is multiplication of luminance.
1907	 * Saturation and hue change the u and v coeffs.
1908	 * Default values (before adjustments - depend on colorspace):
1909	 * yco = 1.1643
1910	 * uco = 0, -0.39173, 2.017
1911	 * vco = 1.5958, -0.8129, 0
1912	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
1913	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
1914	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
1915	 *
1916	 * temp = MAD(yco, yuv.yyyy, off)
1917	 * temp = MAD(uco, yuv.uuuu, temp)
1918	 * result = MAD(vco, yuv.vvvv, temp)
1919	 */
1920	/* TODO: don't recalc consts always */
1921	const float Loff = -0.0627;
1922	const float Coff = -0.502;
1923	float uvcosf, uvsinf;
1924	float yco;
1925	float uco[3], vco[3], off[3];
1926	float bright, cont, gamma;
1927	int ref = pPriv->transform_index;
1928	Bool needgamma = FALSE;
1929
1930	cont = RTFContrast(pPriv->contrast);
1931	bright = RTFBrightness(pPriv->brightness);
1932	gamma = (float)pPriv->gamma / 1000.0;
1933	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
1934	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
1935	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
1936
1937	yco = trans[ref].RefLuma * cont;
1938	uco[0] = -trans[ref].RefRCr * uvsinf;
1939	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
1940	uco[2] = trans[ref].RefBCb * uvcosf;
1941	vco[0] = trans[ref].RefRCr * uvcosf;
1942	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
1943	vco[2] = trans[ref].RefBCb * uvsinf;
1944	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
1945	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
1946	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
1947
1948	if (gamma != 1.0) {
1949	    needgamma = TRUE;
1950	    /* note: gamma correction is out = in ^ gamma;
1951	       gpu can only do LG2/EX2 therefore we transform into
1952	       in ^ gamma = 2 ^ (log2(in) * gamma).
1953	       Lots of scalar ops, unfortunately (better solution?) -
1954	       without gamma that's 3 inst, with gamma it's 10...
1955	       could use different gamma factors per channel,
1956	       if that's of any use. */
1957	}
1958
1959	if (pPriv->is_planar) {
1960	    BEGIN_ACCEL(needgamma ? 28 + 33 : 33);
1961	    /* 2 components: same 2 for tex0/1/2 */
1962	    OUT_ACCEL_REG(R300_RS_COUNT,
1963			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1964			   R300_RS_COUNT_HIRES_EN));
1965	    /* R300_INST_COUNT_RS - highest RS instruction used */
1966	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
1967
1968	    OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
1969
1970	    /* Indirection levels */
1971	    OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
1972					   R300_FIRST_TEX));
1973
1974	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1975						R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
1976						R300_TEX_CODE_OFFSET(0) |
1977						R300_TEX_CODE_SIZE(3)));
1978
1979	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
1980						R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
1981						R300_TEX_START(0) |
1982						R300_TEX_SIZE(2) |
1983						R300_RGBA_OUT));
1984
1985	    /* tex inst */
1986	    OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
1987					       R300_TEX_DST_ADDR(2) |
1988					       R300_TEX_ID(0) |
1989					       R300_TEX_INST(R300_TEX_INST_LD)));
1990	    OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
1991					       R300_TEX_DST_ADDR(1) |
1992					       R300_TEX_ID(1) |
1993					       R300_TEX_INST(R300_TEX_INST_LD)));
1994	    OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
1995					       R300_TEX_DST_ADDR(0) |
1996					       R300_TEX_ID(2) |
1997					       R300_TEX_INST(R300_TEX_INST_LD)));
1998
1999	    /* ALU inst */
2000	    /* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
2001	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
2002						    R300_ALU_RGB_ADDR1(2) |
2003						    R300_ALU_RGB_ADDR2(0) |
2004						    R300_ALU_RGB_ADDRD(2) |
2005						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2006	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
2007						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2008						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
2009						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2010						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
2011						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2012						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2013						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2014	    /* alpha nop, but need to set up alpha source for rgb usage */
2015	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
2016						      R300_ALU_ALPHA_ADDR1(2) |
2017						      R300_ALU_ALPHA_ADDR2(0) |
2018						      R300_ALU_ALPHA_ADDRD(2) |
2019						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2020	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2021						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2022						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2023						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2024
2025	    /* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
2026	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
2027						    R300_ALU_RGB_ADDR1(1) |
2028						    R300_ALU_RGB_ADDR2(2) |
2029						    R300_ALU_RGB_ADDRD(2) |
2030						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2031	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2032						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2033						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
2034						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2035						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
2036						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2037						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2038						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2039	    /* alpha nop */
2040	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(2) |
2041						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2042	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2043						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2044						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2045						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2046
2047	    /* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
2048	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
2049						    R300_ALU_RGB_ADDR1(0) |
2050						    R300_ALU_RGB_ADDR2(2) |
2051						    R300_ALU_RGB_ADDRD(0) |
2052						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
2053						    (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
2054	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2055						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2056						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
2057						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2058						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
2059						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2060						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2061						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
2062						    R300_ALU_RGB_CLAMP));
2063	    /* write alpha 1 */
2064	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
2065						      R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
2066						      R300_ALU_ALPHA_TARGET_A));
2067	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2068						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2069						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2070						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
2071
2072	    if (needgamma) {
2073		/* rgb temp0.r = op_sop, set up src0 reg */
2074		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
2075							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
2076		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
2077			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2078			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2079		/* alpha lg2 temp0, temp0.r */
2080		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
2081							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2082		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2083							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2084							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2085							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2086
2087		/* rgb temp0.g = op_sop, set up src0 reg */
2088		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
2089							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
2090		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
2091			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2092			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2093		/* alpha lg2 temp0, temp0.g */
2094		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
2095							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2096		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2097							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2098							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2099							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2100
2101		/* rgb temp0.b = op_sop, set up src0 reg */
2102		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
2103							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
2104		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
2105			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2106			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2107		/* alpha lg2 temp0, temp0.b */
2108		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
2109							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2110		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2111							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2112							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2113							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2114
2115		/* MUL temp0.rgb, temp0.rgb, const1.aaa */
2116		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
2117							R300_ALU_RGB_ADDR1(0) |
2118							R300_ALU_RGB_ADDR2(0) |
2119							R300_ALU_RGB_ADDRD(0) |
2120							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2121		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2122							R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2123							R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
2124							R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2125							R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
2126							R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2127							R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2128							R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2129		/* alpha nop, but set up const1 */
2130		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
2131							  R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
2132							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2133		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2134							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2135							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2136							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2137
2138		/* rgb out0.r = op_sop, set up src0 reg */
2139		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
2140							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
2141							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
2142		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7),
2143			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2144			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2145		/* alpha ex2 temp0, temp0.r */
2146		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
2147							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2148		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2149							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2150							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2151							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2152
2153		/* rgb out0.g = op_sop, set up src0 reg */
2154		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
2155							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
2156							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
2157		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
2158			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2159			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2160		/* alpha ex2 temp0, temp0.g */
2161		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
2162							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2163		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2164							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2165							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2166							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2167
2168		/* rgb out0.b = op_sop, set up src0 reg */
2169		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
2170							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
2171							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
2172		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
2173			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2174			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2175		/* alpha ex2 temp0, temp0.b */
2176		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
2177							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2178		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2179							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2180							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2181							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2182	    }
2183	} else {
2184	    BEGIN_ACCEL(needgamma ? 28 + 31 : 31);
2185	    /* 2 components */
2186	    OUT_ACCEL_REG(R300_RS_COUNT,
2187			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
2188			   R300_RS_COUNT_HIRES_EN));
2189	    /* R300_INST_COUNT_RS - highest RS instruction used */
2190	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
2191
2192	    OUT_ACCEL_REG(R300_US_PIXSIZE, 1); /* highest temp used */
2193
2194	    /* Indirection levels */
2195	    OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
2196					   R300_FIRST_TEX));
2197
2198	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
2199						R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
2200						R300_TEX_CODE_OFFSET(0) |
2201						R300_TEX_CODE_SIZE(1)));
2202
2203	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
2204						R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
2205						R300_TEX_START(0) |
2206						R300_TEX_SIZE(0) |
2207						R300_RGBA_OUT));
2208
2209	    /* tex inst */
2210	    OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
2211					       R300_TEX_DST_ADDR(0) |
2212					       R300_TEX_ID(0) |
2213					       R300_TEX_INST(R300_TEX_INST_LD)));
2214
2215	    /* ALU inst */
2216	    /* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
2217	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
2218						    R300_ALU_RGB_ADDR1(0) |
2219						    R300_ALU_RGB_ADDR2(0) |
2220						    R300_ALU_RGB_ADDRD(1) |
2221						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2222	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
2223						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2224						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_GGG) |
2225						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2226						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
2227						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2228						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2229						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2230	    /* alpha nop, but need to set up alpha source for rgb usage */
2231	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
2232						      R300_ALU_ALPHA_ADDR1(0) |
2233						      R300_ALU_ALPHA_ADDR2(0) |
2234						      R300_ALU_ALPHA_ADDRD(0) |
2235						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2236	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2237						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2238						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2239						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2240
2241	    /* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
2242	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
2243						    R300_ALU_RGB_ADDR1(0) |
2244						    R300_ALU_RGB_ADDR2(1) |
2245						    R300_ALU_RGB_ADDRD(1) |
2246						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2247	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2248						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2249						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_BBB) |
2250						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2251						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
2252						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2253						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2254						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2255	    /* alpha nop */
2256	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
2257						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2258	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2259						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2260						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2261						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2262
2263	    /* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
2264	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
2265						    R300_ALU_RGB_ADDR1(0) |
2266						    R300_ALU_RGB_ADDR2(1) |
2267						    R300_ALU_RGB_ADDRD(0) |
2268						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
2269						    (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
2270	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2271						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2272						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RRR) |
2273						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2274						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
2275						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2276						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2277						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
2278						    R300_ALU_RGB_CLAMP));
2279	    /* write alpha 1 */
2280	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
2281						      R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
2282						      R300_ALU_ALPHA_TARGET_A));
2283	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2284						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2285						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2286						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
2287
2288	    if (needgamma) {
2289		/* rgb temp0.r = op_sop, set up src0 reg */
2290		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
2291							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
2292		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
2293			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2294			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2295		/* alpha lg2 temp0, temp0.r */
2296		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
2297							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2298		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2299							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2300							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2301							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2302
2303		/* rgb temp0.g = op_sop, set up src0 reg */
2304		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
2305							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
2306		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
2307			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2308			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2309		/* alpha lg2 temp0, temp0.g */
2310		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
2311							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2312		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2313							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2314							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2315							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2316
2317		/* rgb temp0.b = op_sop, set up src0 reg */
2318		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
2319							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
2320		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
2321			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2322			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2323		/* alpha lg2 temp0, temp0.b */
2324		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
2325							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2326		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2327							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2328							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2329							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2330
2331		/* MUL const1, temp1, temp0 */
2332		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
2333							R300_ALU_RGB_ADDR1(0) |
2334							R300_ALU_RGB_ADDR2(0) |
2335							R300_ALU_RGB_ADDRD(0) |
2336							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2337		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2338							R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2339							R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
2340							R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2341							R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
2342							R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2343							R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2344							R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2345		/* alpha nop, but set up const1 */
2346		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
2347							  R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
2348							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2349		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2350							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2351							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2352							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2353
2354		/* rgb out0.r = op_sop, set up src0 reg */
2355		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
2356							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
2357							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
2358		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7),
2359			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2360			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2361		/* alpha ex2 temp0, temp0.r */
2362		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
2363							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2364		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2365							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2366							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2367							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2368
2369		/* rgb out0.g = op_sop, set up src0 reg */
2370		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
2371							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
2372							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
2373		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
2374			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2375			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2376		/* alpha ex2 temp0, temp0.g */
2377		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
2378							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2379		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2380							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2381							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2382							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2383
2384		/* rgb out0.b = op_sop, set up src0 reg */
2385		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
2386							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
2387							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
2388		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
2389			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2390			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2391		/* alpha ex2 temp0, temp0.b */
2392		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
2393							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2394		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2395							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2396							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2397							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2398	    }
2399	}
2400
2401	/* Shader constants. */
2402	/* constant 0: off, yco */
2403	OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
2404	OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1]));
2405	OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2]));
2406	OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco));
2407	/* constant 1: uco */
2408	OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
2409	OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
2410	OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
2411	OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(gamma));
2412	/* constant 2: vco */
2413	OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
2414	OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
2415	OUT_ACCEL_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2]));
2416	OUT_ACCEL_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0));
2417
2418	FINISH_ACCEL();
2419    }
2420
2421    BEGIN_ACCEL_RELOC(6, 2);
2422    OUT_ACCEL_REG(R300_TX_INVALTAGS, 0);
2423    OUT_ACCEL_REG(R300_TX_ENABLE, txenable);
2424
2425    EMIT_WRITE_OFFSET(R300_RB3D_COLOROFFSET0, 0, pPixmap);
2426    EMIT_COLORPITCH(R300_RB3D_COLORPITCH0, colorpitch, pPixmap);
2427
2428    /* no need to enable blending */
2429    OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
2430
2431    OUT_ACCEL_REG(R300_VAP_VTX_SIZE, pPriv->vtx_count);
2432    FINISH_ACCEL();
2433
2434    if (pPriv->vsync) {
2435	xf86CrtcPtr crtc;
2436	if (pPriv->desired_crtc)
2437	    crtc = pPriv->desired_crtc;
2438	else
2439	    crtc = radeon_pick_best_crtc(pScrn,
2440					 pPriv->drw_x,
2441					 pPriv->drw_x + pPriv->dst_w,
2442					 pPriv->drw_y,
2443					 pPriv->drw_y + pPriv->dst_h);
2444	if (crtc)
2445	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
2446					  crtc,
2447					  pPriv->drw_y - crtc->y,
2448					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
2449    }
2450
2451    return TRUE;
2452}
2453
2454static void
2455FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
2456{
2457    RADEONInfoPtr info = RADEONPTR(pScrn);
2458    PixmapPtr pPixmap = pPriv->pPixmap;
2459    int dstxoff, dstyoff;
2460    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
2461    int nBox = REGION_NUM_RECTS(&pPriv->clip);
2462    ACCEL_PREAMBLE();
2463
2464#ifdef COMPOSITE
2465    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
2466    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
2467#else
2468    dstxoff = 0;
2469    dstyoff = 0;
2470#endif
2471
2472    if (!FUNC_NAME(R300PrepareTexturedVideo)(pScrn, pPriv))
2473	return;
2474
2475    /*
2476     * Rendering of the actual polygon is done in two different
2477     * ways depending on chip generation:
2478     *
2479     * < R300:
2480     *
2481     *     These chips can render a rectangle in one pass, so
2482     *     handling is pretty straight-forward.
2483     *
2484     * >= R300:
2485     *
2486     *     These chips can accept a quad, but will render it as
2487     *     two triangles which results in a diagonal tear. Instead
2488     *     We render a single, large triangle and use the scissor
2489     *     functionality to restrict it to the desired rectangle.
2490     *     Due to guardband limits on r3xx/r4xx, we can only use
2491     *     the single triangle up to 2560/4021 pixels; above that we
2492     *     render as a quad.
2493     */
2494
2495    while (nBox--) {
2496	int srcX, srcY, srcw, srch;
2497	int dstX, dstY, dstw, dsth;
2498	Bool use_quad = FALSE;
2499#ifdef ACCEL_CP
2500	int draw_size = 4 * pPriv->vtx_count + 4 + 2 + 3;
2501
2502	if (draw_size > radeon_cs_space_remaining(pScrn)) {
2503	    if (info->cs)
2504		radeon_cs_flush_indirect(pScrn);
2505	    else
2506		RADEONCPFlushIndirect(pScrn, 1);
2507	    if (!FUNC_NAME(R300PrepareTexturedVideo)(pScrn, pPriv))
2508		return;
2509	}
2510#endif
2511
2512	dstX = pBox->x1 + dstxoff;
2513	dstY = pBox->y1 + dstyoff;
2514	dstw = pBox->x2 - pBox->x1;
2515	dsth = pBox->y2 - pBox->y1;
2516
2517	srcX = pPriv->src_x;
2518	srcX += ((pBox->x1 - pPriv->drw_x) *
2519		 pPriv->src_w) / pPriv->dst_w;
2520	srcY = pPriv->src_y;
2521	srcY += ((pBox->y1 - pPriv->drw_y) *
2522		 pPriv->src_h) / pPriv->dst_h;
2523
2524	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
2525	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
2526
2527	if (IS_R400_3D) {
2528	    if ((dstw+dsth) > 4021)
2529		use_quad = TRUE;
2530	} else {
2531	    if ((dstw+dsth) > 2560)
2532		use_quad = TRUE;
2533	}
2534	/*
2535	 * Set up the scissor area to that of the output size.
2536	 */
2537	BEGIN_ACCEL(2);
2538	/* R300 has an offset */
2539	OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX + 1440) << R300_SCISSOR_X_SHIFT) |
2540					 ((dstY + 1440) << R300_SCISSOR_Y_SHIFT)));
2541	OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw + 1440 - 1) << R300_SCISSOR_X_SHIFT) |
2542					 ((dstY + dsth + 1440 - 1) << R300_SCISSOR_Y_SHIFT)));
2543	FINISH_ACCEL();
2544
2545#ifdef ACCEL_CP
2546	if (use_quad) {
2547	    BEGIN_RING(4 * pPriv->vtx_count + 4);
2548	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
2549				4 * pPriv->vtx_count));
2550	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_QUAD_LIST |
2551		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2552		     (4 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2553	} else {
2554	    BEGIN_RING(3 * pPriv->vtx_count + 4);
2555	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
2556				3 * pPriv->vtx_count));
2557	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
2558		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2559		     (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2560	}
2561#else /* ACCEL_CP */
2562	if (use_quad)
2563	    BEGIN_ACCEL(2 + pPriv->vtx_count * 4);
2564	else
2565	    BEGIN_ACCEL(2 + pPriv->vtx_count * 3);
2566
2567	if (use_quad)
2568	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_QUAD_LIST |
2569					      RADEON_VF_PRIM_WALK_DATA |
2570					      (4 << RADEON_VF_NUM_VERTICES_SHIFT)));
2571	else
2572	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST |
2573					      RADEON_VF_PRIM_WALK_DATA |
2574					      (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
2575#endif
2576	if (pPriv->bicubic_enabled) {
2577		/*
2578		 * This code is only executed on >= R300, so we don't
2579		 * have to deal with the legacy handling.
2580		 */
2581	    if (use_quad) {
2582		VTX_OUT_6((float)dstX,                     (float)dstY,
2583			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h,
2584			  (float)srcX + 0.5,               (float)srcY + 0.5);
2585		VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
2586			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
2587			  (float)srcX + 0.5,               (float)(srcY + srch) + 0.5);
2588		VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
2589			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
2590			  (float)(srcX + srcw) + 0.5,      (float)(srcY + srch) + 0.5);
2591		VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
2592			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
2593			  (float)(srcX + srcw) + 0.5,      (float)srcY + 0.5);
2594	    } else {
2595		VTX_OUT_6((float)dstX,                     (float)dstY,
2596			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h,
2597			  (float)srcX + 0.5,               (float)srcY + 0.5);
2598		VTX_OUT_6((float)dstX,                     (float)(dstY + dstw + dsth),
2599			  (float)srcX / pPriv->w,
2600			  ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h,
2601			  (float)srcX + 0.5,
2602			  (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
2603		VTX_OUT_6((float)(dstX + dstw + dsth),     (float)dstY,
2604			  ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
2605			  (float)srcY / pPriv->h,
2606			  (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
2607			  (float)srcY + 0.5);
2608	    }
2609	} else {
2610	    if (use_quad) {
2611		VTX_OUT_4((float)dstX,                     (float)dstY,
2612			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h);
2613		VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
2614			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
2615		VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
2616			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
2617		VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
2618			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
2619	    } else {
2620		/*
2621		 * Render a big, scissored triangle. This means
2622		 * increasing the triangle size and adjusting
2623		 * texture coordinates.
2624		 */
2625		VTX_OUT_4((float)dstX,                 (float)dstY,
2626			  (float)srcX / pPriv->w,      (float)srcY / pPriv->h);
2627		VTX_OUT_4((float)dstX,                 (float)(dstY + dsth + dstw),
2628			  (float)srcX / pPriv->w,
2629			  ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h);
2630		VTX_OUT_4((float)(dstX + dstw + dsth), (float)dstY,
2631			  ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
2632			  (float)srcY / pPriv->h);
2633	    }
2634	}
2635
2636	/* flushing is pipelined, free/finish is not */
2637	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
2638
2639#ifdef ACCEL_CP
2640	ADVANCE_RING();
2641#else
2642	FINISH_ACCEL();
2643#endif /* !ACCEL_CP */
2644
2645	pBox++;
2646    }
2647
2648    BEGIN_ACCEL(3);
2649    OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA);
2650    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
2651    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
2652    FINISH_ACCEL();
2653
2654    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
2655}
2656
2657static Bool
2658FUNC_NAME(R500PrepareTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
2659{
2660    RADEONInfoPtr info = RADEONPTR(pScrn);
2661    PixmapPtr pPixmap = pPriv->pPixmap;
2662    struct radeon_exa_pixmap_priv *driver_priv;
2663    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
2664    uint32_t txfilter, txformat0, txformat1, txoffset, txpitch;
2665    uint32_t dst_pitch, dst_format;
2666    uint32_t txenable, colorpitch, bicubic_offset;
2667    uint32_t output_fmt;
2668    int pixel_shift;
2669    ACCEL_PREAMBLE();
2670
2671#ifdef XF86DRM_MODE
2672    if (info->cs) {
2673	int ret;
2674
2675	radeon_cs_space_reset_bos(info->cs);
2676	radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
2677
2678	if (pPriv->bicubic_enabled)
2679	    radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
2680
2681	driver_priv = exaGetPixmapDriverPrivate(pPixmap);
2682	radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
2683
2684	ret = radeon_cs_space_check(info->cs);
2685	if (ret) {
2686	    ErrorF("Not enough RAM to hw accel xv operation\n");
2687	    return FALSE;
2688	}
2689    }
2690#endif
2691
2692    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
2693
2694#ifdef USE_EXA
2695    if (info->useEXA) {
2696	dst_pitch = exaGetPixmapPitch(pPixmap);
2697    } else
2698#endif
2699    {
2700	dst_pitch = pPixmap->devKind;
2701    }
2702
2703#ifdef USE_EXA
2704    if (info->useEXA) {
2705	RADEON_SWITCH_TO_3D();
2706    } else
2707#endif
2708    {
2709	BEGIN_ACCEL(2);
2710	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
2711	/* We must wait for 3d to idle, in case source was just written as a dest. */
2712	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
2713		      RADEON_WAIT_HOST_IDLECLEAN |
2714		      RADEON_WAIT_2D_IDLECLEAN |
2715		      RADEON_WAIT_3D_IDLECLEAN |
2716		      RADEON_WAIT_DMA_GUI_IDLE);
2717	FINISH_ACCEL();
2718
2719	if (!info->accel_state->XInited3D)
2720	    RADEONInit3DEngine(pScrn);
2721    }
2722
2723    if (pPriv->bicubic_enabled)
2724	pPriv->vtx_count = 6;
2725    else
2726	pPriv->vtx_count = 4;
2727
2728    switch (pPixmap->drawable.bitsPerPixel) {
2729    case 16:
2730	if (pPixmap->drawable.depth == 15)
2731	    dst_format = R300_COLORFORMAT_ARGB1555;
2732	else
2733	    dst_format = R300_COLORFORMAT_RGB565;
2734	break;
2735    case 32:
2736	dst_format = R300_COLORFORMAT_ARGB8888;
2737	break;
2738    default:
2739	return FALSE;
2740    }
2741
2742    output_fmt = (R300_OUT_FMT_C4_8 |
2743		  R300_OUT_FMT_C0_SEL_BLUE |
2744		  R300_OUT_FMT_C1_SEL_GREEN |
2745		  R300_OUT_FMT_C2_SEL_RED |
2746		  R300_OUT_FMT_C3_SEL_ALPHA);
2747
2748    colorpitch = dst_pitch >> pixel_shift;
2749    colorpitch |= dst_format;
2750
2751    if (RADEONTilingEnabled(pScrn, pPixmap))
2752	colorpitch |= R300_COLORTILE;
2753
2754    if (((pPriv->bicubic_state == BICUBIC_OFF)) &&
2755        (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12))
2756	pPriv->is_planar = TRUE;
2757    else
2758	pPriv->is_planar = FALSE;
2759
2760    if (pPriv->is_planar) {
2761	txformat1 = R300_TX_FORMAT_X8;
2762	txpitch = pPriv->src_pitch;
2763    } else {
2764	if (pPriv->id == FOURCC_UYVY)
2765	    txformat1 = R300_TX_FORMAT_YVYU422;
2766	else
2767	    txformat1 = R300_TX_FORMAT_VYUY422;
2768
2769	if (pPriv->bicubic_state != BICUBIC_OFF)
2770	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
2771
2772	/* pitch is in pixels */
2773	txpitch = pPriv->src_pitch / 2;
2774    }
2775    txpitch -= 1;
2776
2777    txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
2778		 (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
2779		 R300_TXPITCH_EN);
2780
2781    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
2782		R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
2783		R300_TX_MAG_FILTER_LINEAR |
2784		R300_TX_MIN_FILTER_LINEAR |
2785		(0 << R300_TX_ID_SHIFT));
2786
2787
2788    if ((pPriv->w - 1) & 0x800)
2789	txpitch |= R500_TXWIDTH_11;
2790
2791    if ((pPriv->h - 1) & 0x800)
2792	txpitch |= R500_TXHEIGHT_11;
2793
2794    txoffset = info->cs ? 0 : pPriv->src_offset;
2795
2796    BEGIN_ACCEL_RELOC(6, 1);
2797    OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter);
2798    OUT_ACCEL_REG(R300_TX_FILTER1_0, 0);
2799    OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0);
2800    OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
2801    OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch);
2802    OUT_TEXTURE_REG(R300_TX_OFFSET_0, txoffset, src_bo);
2803    FINISH_ACCEL();
2804
2805    txenable = R300_TEX_0_ENABLE;
2806
2807    if (pPriv->is_planar) {
2808	txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
2809		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
2810		     R300_TXPITCH_EN);
2811	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
2812	txpitch -= 1;
2813	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
2814		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
2815		    R300_TX_MIN_FILTER_LINEAR |
2816		    R300_TX_MAG_FILTER_LINEAR);
2817
2818	BEGIN_ACCEL_RELOC(12, 2);
2819	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
2820	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
2821	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
2822	OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8);
2823	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
2824	OUT_TEXTURE_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset, src_bo);
2825	OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
2826	OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
2827	OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
2828	OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8);
2829	OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
2830	OUT_TEXTURE_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset, src_bo);
2831	FINISH_ACCEL();
2832	txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
2833    }
2834
2835    if (pPriv->bicubic_enabled) {
2836	/* Size is 128x1 */
2837	txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
2838		     (0x0 << R300_TXHEIGHT_SHIFT) |
2839		     R300_TXPITCH_EN);
2840	/* Format is 32-bit floats, 4bpp */
2841	txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
2842	/* Pitch is 127 (128-1) */
2843	txpitch = 0x7f;
2844	/* Tex filter */
2845	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
2846		    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
2847		    R300_TX_MIN_FILTER_NEAREST |
2848		    R300_TX_MAG_FILTER_NEAREST |
2849		    (1 << R300_TX_ID_SHIFT));
2850
2851	if (info->cs)
2852	    bicubic_offset = 0;
2853	else
2854	    bicubic_offset = pPriv->bicubic_src_offset;
2855
2856	BEGIN_ACCEL_RELOC(6, 1);
2857	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter);
2858	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
2859	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
2860	OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1);
2861	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
2862	OUT_TEXTURE_REG(R300_TX_OFFSET_1, bicubic_offset, info->bicubic_bo);
2863	FINISH_ACCEL();
2864
2865	/* Enable tex 1 */
2866	txenable |= R300_TEX_1_ENABLE;
2867    }
2868
2869    /* setup the VAP */
2870    if (info->accel_state->has_tcl) {
2871	if (pPriv->bicubic_enabled)
2872	    BEGIN_ACCEL(7);
2873	else
2874	    BEGIN_ACCEL(6);
2875    } else {
2876	if (pPriv->bicubic_enabled)
2877	    BEGIN_ACCEL(5);
2878	else
2879	    BEGIN_ACCEL(4);
2880    }
2881
2882    /* These registers define the number, type, and location of data submitted
2883     * to the PVS unit of GA input (when PVS is disabled)
2884     * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
2885     * enabled.  This memory provides the imputs to the vertex shader program
2886     * and ordering is not important.  When PVS/TCL is disabled, this field maps
2887     * directly to the GA input memory and the order is signifigant.  In
2888     * PVS_BYPASS mode the order is as follows:
2889     * Position
2890     * Point Size
2891     * Color 0-3
2892     * Textures 0-7
2893     * Fog
2894     */
2895    if (pPriv->bicubic_enabled) {
2896	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
2897		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
2898		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
2899		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
2900		       R300_SIGNED_0 |
2901		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
2902		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
2903		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
2904		       R300_SIGNED_1));
2905	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1,
2906		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
2907		       (0 << R300_SKIP_DWORDS_2_SHIFT) |
2908		       (7 << R300_DST_VEC_LOC_2_SHIFT) |
2909		       R300_LAST_VEC_2 |
2910		       R300_SIGNED_2));
2911    } else {
2912	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
2913		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
2914		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
2915		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
2916		       R300_SIGNED_0 |
2917		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
2918		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
2919		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
2920		       R300_LAST_VEC_1 |
2921		       R300_SIGNED_1));
2922    }
2923
2924    /* load the vertex shader
2925     * We pre-load vertex programs in RADEONInit3DEngine():
2926     * - exa
2927     * - Xv
2928     * - Xv bicubic
2929     * Here we select the offset of the vertex program we want to use
2930     */
2931    if (info->accel_state->has_tcl) {
2932	if (pPriv->bicubic_enabled) {
2933	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
2934			  ((11 << R300_PVS_FIRST_INST_SHIFT) |
2935			   (13 << R300_PVS_XYZW_VALID_INST_SHIFT) |
2936			   (13 << R300_PVS_LAST_INST_SHIFT)));
2937	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
2938			  (13 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
2939	} else {
2940	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
2941			  ((9 << R300_PVS_FIRST_INST_SHIFT) |
2942			   (10 << R300_PVS_XYZW_VALID_INST_SHIFT) |
2943			   (10 << R300_PVS_LAST_INST_SHIFT)));
2944	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
2945			  (10 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
2946	}
2947    }
2948
2949    /* Position and one set of 2 texture coordinates */
2950    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
2951    if (pPriv->bicubic_enabled)
2952	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
2953					       (2 << R300_TEX_1_COMP_CNT_SHIFT)));
2954    else
2955	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
2956
2957    OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt);
2958    FINISH_ACCEL();
2959
2960    /* setup pixel shader */
2961    if (pPriv->bicubic_state != BICUBIC_OFF) {
2962	if (pPriv->bicubic_enabled) {
2963	    BEGIN_ACCEL(7);
2964
2965	    /* 4 components: 2 for tex0 and 2 for tex1 */
2966	    OUT_ACCEL_REG(R300_RS_COUNT,
2967			  ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
2968			   R300_RS_COUNT_HIRES_EN));
2969
2970	    /* R300_INST_COUNT_RS - highest RS instruction used */
2971	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1));
2972
2973	    /* Pixel stack frame size. */
2974	    OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
2975
2976	    /* FP length. */
2977	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
2978					      R500_US_CODE_END_ADDR(13)));
2979	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
2980					       R500_US_CODE_RANGE_SIZE(13)));
2981
2982	    /* Prepare for FP emission. */
2983	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
2984	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
2985	    FINISH_ACCEL();
2986
2987	    BEGIN_ACCEL(89);
2988	    /* Pixel shader.
2989	     * I've gone ahead and annotated each instruction, since this
2990	     * thing is MASSIVE. :3
2991	     * Note: In order to avoid buggies with temps and multiple
2992	     * inputs, all temps are offset by 2. temp0 -> register2. */
2993
2994	    /* TEX temp2, input1.xxxx, tex1, 1D */
2995	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2996						   R500_INST_RGB_WMASK_R |
2997						   R500_INST_RGB_WMASK_G |
2998						   R500_INST_RGB_WMASK_B));
2999	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
3000						   R500_TEX_INST_LD |
3001						   R500_TEX_IGNORE_UNCOVERED));
3002	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
3003						   R500_TEX_SRC_S_SWIZ_R |
3004						   R500_TEX_SRC_T_SWIZ_R |
3005						   R500_TEX_SRC_R_SWIZ_R |
3006						   R500_TEX_SRC_Q_SWIZ_R |
3007						   R500_TEX_DST_ADDR(2) |
3008						   R500_TEX_DST_R_SWIZ_R |
3009						   R500_TEX_DST_G_SWIZ_G |
3010						   R500_TEX_DST_B_SWIZ_B |
3011						   R500_TEX_DST_A_SWIZ_A));
3012	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3013	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3014	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3015
3016	    /* TEX temp5, input1.yyyy, tex1, 1D */
3017	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3018						   R500_INST_TEX_SEM_WAIT |
3019						   R500_INST_RGB_WMASK_R |
3020						   R500_INST_RGB_WMASK_G |
3021						   R500_INST_RGB_WMASK_B));
3022	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
3023						   R500_TEX_INST_LD |
3024						   R500_TEX_SEM_ACQUIRE |
3025						   R500_TEX_IGNORE_UNCOVERED));
3026	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
3027						   R500_TEX_SRC_S_SWIZ_G |
3028						   R500_TEX_SRC_T_SWIZ_G |
3029						   R500_TEX_SRC_R_SWIZ_G |
3030						   R500_TEX_SRC_Q_SWIZ_G |
3031						   R500_TEX_DST_ADDR(5) |
3032						   R500_TEX_DST_R_SWIZ_R |
3033						   R500_TEX_DST_G_SWIZ_G |
3034						   R500_TEX_DST_B_SWIZ_B |
3035						   R500_TEX_DST_A_SWIZ_A));
3036	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3037	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3038	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3039
3040	    /* MUL temp4, const0.x0x0, temp2.yyxx */
3041	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3042						   R500_INST_TEX_SEM_WAIT |
3043						   R500_INST_RGB_WMASK_R |
3044						   R500_INST_RGB_WMASK_G |
3045						   R500_INST_RGB_WMASK_B |
3046						   R500_INST_ALPHA_WMASK));
3047	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3048						   R500_RGB_ADDR0_CONST |
3049						   R500_RGB_ADDR1(2)));
3050	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3051						   R500_ALPHA_ADDR0_CONST |
3052						   R500_ALPHA_ADDR1(2)));
3053	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3054						   R500_ALU_RGB_R_SWIZ_A_R |
3055						   R500_ALU_RGB_G_SWIZ_A_0 |
3056						   R500_ALU_RGB_B_SWIZ_A_R |
3057						   R500_ALU_RGB_SEL_B_SRC1 |
3058						   R500_ALU_RGB_R_SWIZ_B_G |
3059						   R500_ALU_RGB_G_SWIZ_B_G |
3060						   R500_ALU_RGB_B_SWIZ_B_R));
3061	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
3062						   R500_ALPHA_OP_MAD |
3063						   R500_ALPHA_SEL_A_SRC0 |
3064						   R500_ALPHA_SWIZ_A_0 |
3065						   R500_ALPHA_SEL_B_SRC1 |
3066						   R500_ALPHA_SWIZ_B_R));
3067	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
3068						   R500_ALU_RGBA_OP_MAD |
3069						   R500_ALU_RGBA_R_SWIZ_0 |
3070						   R500_ALU_RGBA_G_SWIZ_0 |
3071						   R500_ALU_RGBA_B_SWIZ_0 |
3072						   R500_ALU_RGBA_A_SWIZ_0));
3073
3074	    /* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */
3075	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3076						   R500_INST_RGB_WMASK_R |
3077						   R500_INST_RGB_WMASK_G |
3078						   R500_INST_RGB_WMASK_B |
3079						   R500_INST_ALPHA_WMASK));
3080	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3081						   R500_RGB_ADDR0_CONST |
3082						   R500_RGB_ADDR1(5) |
3083						   R500_RGB_ADDR2(4)));
3084	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3085						   R500_ALPHA_ADDR0_CONST |
3086						   R500_ALPHA_ADDR1(5) |
3087						   R500_ALPHA_ADDR2(4)));
3088	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3089						   R500_ALU_RGB_R_SWIZ_A_0 |
3090						   R500_ALU_RGB_G_SWIZ_A_G |
3091						   R500_ALU_RGB_B_SWIZ_A_0 |
3092						   R500_ALU_RGB_SEL_B_SRC1 |
3093						   R500_ALU_RGB_R_SWIZ_B_R |
3094						   R500_ALU_RGB_G_SWIZ_B_R |
3095						   R500_ALU_RGB_B_SWIZ_B_R));
3096	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
3097						   R500_ALPHA_OP_MAD |
3098						   R500_ALPHA_SEL_A_SRC0 |
3099						   R500_ALPHA_SWIZ_A_G |
3100						   R500_ALPHA_SEL_B_SRC1 |
3101						   R500_ALPHA_SWIZ_B_R));
3102	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
3103						   R500_ALU_RGBA_OP_MAD |
3104						   R500_ALU_RGBA_SEL_C_SRC2 |
3105						   R500_ALU_RGBA_R_SWIZ_R |
3106						   R500_ALU_RGBA_G_SWIZ_G |
3107						   R500_ALU_RGBA_B_SWIZ_B |
3108						   R500_ALU_RGBA_A_SWIZ_A));
3109
3110	    /* ADD temp3, temp3, input0.xyxy */
3111	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3112						   R500_INST_RGB_WMASK_R |
3113						   R500_INST_RGB_WMASK_G |
3114						   R500_INST_RGB_WMASK_B |
3115						   R500_INST_ALPHA_WMASK));
3116	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) |
3117						   R500_RGB_ADDR2(0)));
3118	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) |
3119						   R500_ALPHA_ADDR2(0)));
3120	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
3121						   R500_ALU_RGB_G_SWIZ_A_1 |
3122						   R500_ALU_RGB_B_SWIZ_A_1 |
3123						   R500_ALU_RGB_SEL_B_SRC1 |
3124						   R500_ALU_RGB_R_SWIZ_B_R |
3125						   R500_ALU_RGB_G_SWIZ_B_G |
3126						   R500_ALU_RGB_B_SWIZ_B_B));
3127	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
3128						   R500_ALPHA_OP_MAD |
3129						   R500_ALPHA_SWIZ_A_1 |
3130						   R500_ALPHA_SEL_B_SRC1 |
3131						   R500_ALPHA_SWIZ_B_A));
3132	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
3133						   R500_ALU_RGBA_OP_MAD |
3134						   R500_ALU_RGBA_SEL_C_SRC2 |
3135						   R500_ALU_RGBA_R_SWIZ_R |
3136						   R500_ALU_RGBA_G_SWIZ_G |
3137						   R500_ALU_RGBA_B_SWIZ_R |
3138						   R500_ALU_RGBA_A_SWIZ_G));
3139
3140	    /* TEX temp1, temp3.zwxy, tex0, 2D */
3141	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3142						   R500_INST_RGB_WMASK_R |
3143						   R500_INST_RGB_WMASK_G |
3144						   R500_INST_RGB_WMASK_B |
3145						   R500_INST_ALPHA_WMASK));
3146	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3147						   R500_TEX_INST_LD |
3148						   R500_TEX_IGNORE_UNCOVERED));
3149	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
3150						   R500_TEX_SRC_S_SWIZ_B |
3151						   R500_TEX_SRC_T_SWIZ_A |
3152						   R500_TEX_SRC_R_SWIZ_R |
3153						   R500_TEX_SRC_Q_SWIZ_G |
3154						   R500_TEX_DST_ADDR(1) |
3155						   R500_TEX_DST_R_SWIZ_R |
3156						   R500_TEX_DST_G_SWIZ_G |
3157						   R500_TEX_DST_B_SWIZ_B |
3158						   R500_TEX_DST_A_SWIZ_A));
3159	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3160	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3161	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3162
3163	    /* TEX temp3, temp3.xyzw, tex0, 2D */
3164	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3165						   R500_INST_TEX_SEM_WAIT |
3166						   R500_INST_RGB_WMASK_R |
3167						   R500_INST_RGB_WMASK_G |
3168						   R500_INST_RGB_WMASK_B |
3169						   R500_INST_ALPHA_WMASK));
3170	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3171						   R500_TEX_INST_LD |
3172						   R500_TEX_SEM_ACQUIRE |
3173						   R500_TEX_IGNORE_UNCOVERED));
3174	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
3175						   R500_TEX_SRC_S_SWIZ_R |
3176						   R500_TEX_SRC_T_SWIZ_G |
3177						   R500_TEX_SRC_R_SWIZ_B |
3178						   R500_TEX_SRC_Q_SWIZ_A |
3179						   R500_TEX_DST_ADDR(3) |
3180						   R500_TEX_DST_R_SWIZ_R |
3181						   R500_TEX_DST_G_SWIZ_G |
3182						   R500_TEX_DST_B_SWIZ_B |
3183						   R500_TEX_DST_A_SWIZ_A));
3184	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3185	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3186	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3187
3188	    /* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */
3189	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3190						   R500_INST_RGB_WMASK_R |
3191						   R500_INST_RGB_WMASK_G |
3192						   R500_INST_RGB_WMASK_B |
3193						   R500_INST_ALPHA_WMASK));
3194	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3195						   R500_RGB_ADDR0_CONST |
3196						   R500_RGB_ADDR1(5) |
3197						   R500_RGB_ADDR2(4)));
3198	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3199						   R500_ALPHA_ADDR0_CONST |
3200						   R500_ALPHA_ADDR1(5) |
3201						   R500_ALPHA_ADDR2(4)));
3202	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3203						   R500_ALU_RGB_R_SWIZ_A_0 |
3204						   R500_ALU_RGB_G_SWIZ_A_G |
3205						   R500_ALU_RGB_B_SWIZ_A_0 |
3206						   R500_ALU_RGB_SEL_B_SRC1 |
3207						   R500_ALU_RGB_R_SWIZ_B_G |
3208						   R500_ALU_RGB_G_SWIZ_B_G |
3209						   R500_ALU_RGB_B_SWIZ_B_G));
3210	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
3211						   R500_ALPHA_OP_MAD |
3212						   R500_ALPHA_SEL_A_SRC0 |
3213						   R500_ALPHA_SWIZ_A_G |
3214						   R500_ALPHA_SEL_B_SRC1 |
3215						   R500_ALPHA_SWIZ_B_G));
3216	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
3217						   R500_ALU_RGBA_OP_MAD |
3218						   R500_ALU_RGBA_SEL_C_SRC2 |
3219						   R500_ALU_RGBA_R_SWIZ_R |
3220						   R500_ALU_RGBA_G_SWIZ_G |
3221						   R500_ALU_RGBA_B_SWIZ_B |
3222						   R500_ALU_RGBA_A_SWIZ_A));
3223
3224	    /* ADD temp0, temp4, input0.xyxy */
3225	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3226						   R500_INST_RGB_WMASK_R |
3227						   R500_INST_RGB_WMASK_G |
3228						   R500_INST_RGB_WMASK_B |
3229						   R500_INST_ALPHA_WMASK));
3230	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) |
3231						   R500_RGB_ADDR2(0)));
3232	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) |
3233						   R500_ALPHA_ADDR2(0)));
3234	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
3235						   R500_ALU_RGB_G_SWIZ_A_1 |
3236						   R500_ALU_RGB_B_SWIZ_A_1 |
3237						   R500_ALU_RGB_SEL_B_SRC1 |
3238						   R500_ALU_RGB_R_SWIZ_B_R |
3239						   R500_ALU_RGB_G_SWIZ_B_G |
3240						   R500_ALU_RGB_B_SWIZ_B_B));
3241	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
3242						   R500_ALPHA_OP_MAD |
3243						   R500_ALPHA_SWIZ_A_1 |
3244						   R500_ALPHA_SEL_B_SRC1 |
3245						   R500_ALPHA_SWIZ_B_A));
3246	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
3247						   R500_ALU_RGBA_OP_MAD |
3248						   R500_ALU_RGBA_SEL_C_SRC2 |
3249						   R500_ALU_RGBA_R_SWIZ_R |
3250						   R500_ALU_RGBA_G_SWIZ_G |
3251						   R500_ALU_RGBA_B_SWIZ_R |
3252						   R500_ALU_RGBA_A_SWIZ_G));
3253
3254	    /* TEX temp4, temp0.zwzw, tex0, 2D */
3255	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3256						   R500_INST_TEX_SEM_WAIT |
3257						   R500_INST_RGB_WMASK_R |
3258						   R500_INST_RGB_WMASK_G |
3259						   R500_INST_RGB_WMASK_B |
3260						   R500_INST_ALPHA_WMASK));
3261	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3262						   R500_TEX_INST_LD |
3263						   R500_TEX_IGNORE_UNCOVERED));
3264	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3265						   R500_TEX_SRC_S_SWIZ_B |
3266						   R500_TEX_SRC_T_SWIZ_A |
3267						   R500_TEX_SRC_R_SWIZ_B |
3268						   R500_TEX_SRC_Q_SWIZ_A |
3269						   R500_TEX_DST_ADDR(4) |
3270						   R500_TEX_DST_R_SWIZ_R |
3271						   R500_TEX_DST_G_SWIZ_G |
3272						   R500_TEX_DST_B_SWIZ_B |
3273						   R500_TEX_DST_A_SWIZ_A));
3274	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3275	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3276	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3277
3278	    /* TEX temp0, temp0.xyzw, tex0, 2D */
3279	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3280						   R500_INST_TEX_SEM_WAIT |
3281						   R500_INST_RGB_WMASK_R |
3282						   R500_INST_RGB_WMASK_G |
3283						   R500_INST_RGB_WMASK_B |
3284						   R500_INST_ALPHA_WMASK));
3285	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3286						   R500_TEX_INST_LD |
3287						   R500_TEX_SEM_ACQUIRE |
3288						   R500_TEX_IGNORE_UNCOVERED));
3289	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3290						   R500_TEX_SRC_S_SWIZ_R |
3291						   R500_TEX_SRC_T_SWIZ_G |
3292						   R500_TEX_SRC_R_SWIZ_B |
3293						   R500_TEX_SRC_Q_SWIZ_A |
3294						   R500_TEX_DST_ADDR(0) |
3295						   R500_TEX_DST_R_SWIZ_R |
3296						   R500_TEX_DST_G_SWIZ_G |
3297						   R500_TEX_DST_B_SWIZ_B |
3298						   R500_TEX_DST_A_SWIZ_A));
3299	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3300	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3301	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3302
3303	    /* LRP temp3, temp2.zzzz, temp1, temp3 ->
3304	     * - PRESUB temps, temp1 - temp3
3305	     * - MAD temp2.zzzz, temps, temp3 */
3306	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3307						   R500_INST_RGB_WMASK_R |
3308						   R500_INST_RGB_WMASK_G |
3309						   R500_INST_RGB_WMASK_B |
3310						   R500_INST_ALPHA_WMASK));
3311	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) |
3312						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3313						   R500_RGB_ADDR1(1) |
3314						   R500_RGB_ADDR2(2)));
3315	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) |
3316						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3317						   R500_ALPHA_ADDR1(1) |
3318						   R500_ALPHA_ADDR2(2)));
3319	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3320						   R500_ALU_RGB_R_SWIZ_A_B |
3321						   R500_ALU_RGB_G_SWIZ_A_B |
3322						   R500_ALU_RGB_B_SWIZ_A_B |
3323						   R500_ALU_RGB_SEL_B_SRCP |
3324						   R500_ALU_RGB_R_SWIZ_B_R |
3325						   R500_ALU_RGB_G_SWIZ_B_G |
3326						   R500_ALU_RGB_B_SWIZ_B_B));
3327	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
3328						   R500_ALPHA_OP_MAD |
3329						   R500_ALPHA_SEL_A_SRC2 |
3330						   R500_ALPHA_SWIZ_A_B |
3331						   R500_ALPHA_SEL_B_SRCP |
3332						   R500_ALPHA_SWIZ_B_A));
3333	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
3334						   R500_ALU_RGBA_OP_MAD |
3335						   R500_ALU_RGBA_SEL_C_SRC0 |
3336						   R500_ALU_RGBA_R_SWIZ_R |
3337						   R500_ALU_RGBA_G_SWIZ_G |
3338						   R500_ALU_RGBA_B_SWIZ_B |
3339						   R500_ALU_RGBA_A_SWIZ_A));
3340
3341	    /* LRP temp0, temp2.zzzz, temp4, temp0 ->
3342	     * - PRESUB temps, temp4 - temp0
3343	     * - MAD temp2.zzzz, temps, temp0 */
3344	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3345						   R500_INST_TEX_SEM_WAIT |
3346						   R500_INST_RGB_WMASK_R |
3347						   R500_INST_RGB_WMASK_G |
3348						   R500_INST_RGB_WMASK_B |
3349						   R500_INST_ALPHA_WMASK));
3350	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3351						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3352						   R500_RGB_ADDR1(4) |
3353						   R500_RGB_ADDR2(2)));
3354	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3355						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3356						   R500_ALPHA_ADDR1(4) |
3357						   R500_ALPHA_ADDR2(2)));
3358	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3359						   R500_ALU_RGB_R_SWIZ_A_B |
3360						   R500_ALU_RGB_G_SWIZ_A_B |
3361						   R500_ALU_RGB_B_SWIZ_A_B |
3362						   R500_ALU_RGB_SEL_B_SRCP |
3363						   R500_ALU_RGB_R_SWIZ_B_R |
3364						   R500_ALU_RGB_G_SWIZ_B_G |
3365						   R500_ALU_RGB_B_SWIZ_B_B));
3366	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
3367						   R500_ALPHA_OP_MAD |
3368						   R500_ALPHA_SEL_A_SRC2 |
3369						   R500_ALPHA_SWIZ_A_B |
3370						   R500_ALPHA_SEL_B_SRCP |
3371						   R500_ALPHA_SWIZ_B_A));
3372	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
3373						   R500_ALU_RGBA_OP_MAD |
3374						   R500_ALU_RGBA_SEL_C_SRC0 |
3375						   R500_ALU_RGBA_R_SWIZ_R |
3376						   R500_ALU_RGBA_G_SWIZ_G |
3377						   R500_ALU_RGBA_B_SWIZ_B |
3378						   R500_ALU_RGBA_A_SWIZ_A));
3379
3380	    /* LRP output, temp5.zzzz, temp3, temp0 ->
3381	     * - PRESUB temps, temp3 - temp0
3382	     * - MAD temp5.zzzz, temps, temp0 */
3383	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3384						   R500_INST_LAST |
3385						   R500_INST_TEX_SEM_WAIT |
3386						   R500_INST_RGB_WMASK_R |
3387						   R500_INST_RGB_WMASK_G |
3388						   R500_INST_RGB_WMASK_B |
3389						   R500_INST_ALPHA_WMASK |
3390						   R500_INST_RGB_OMASK_R |
3391						   R500_INST_RGB_OMASK_G |
3392						   R500_INST_RGB_OMASK_B |
3393						   R500_INST_ALPHA_OMASK));
3394	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3395						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3396						   R500_RGB_ADDR1(3) |
3397						   R500_RGB_ADDR2(5)));
3398	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3399						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3400						   R500_ALPHA_ADDR1(3) |
3401						   R500_ALPHA_ADDR2(5)));
3402	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3403						   R500_ALU_RGB_R_SWIZ_A_B |
3404						   R500_ALU_RGB_G_SWIZ_A_B |
3405						   R500_ALU_RGB_B_SWIZ_A_B |
3406						   R500_ALU_RGB_SEL_B_SRCP |
3407						   R500_ALU_RGB_R_SWIZ_B_R |
3408						   R500_ALU_RGB_G_SWIZ_B_G |
3409						   R500_ALU_RGB_B_SWIZ_B_B));
3410	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
3411						   R500_ALPHA_OP_MAD |
3412						   R500_ALPHA_SEL_A_SRC2 |
3413						   R500_ALPHA_SWIZ_A_B |
3414						   R500_ALPHA_SEL_B_SRCP |
3415						   R500_ALPHA_SWIZ_B_A));
3416	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
3417						   R500_ALU_RGBA_OP_MAD |
3418						   R500_ALU_RGBA_SEL_C_SRC0 |
3419						   R500_ALU_RGBA_R_SWIZ_R |
3420						   R500_ALU_RGBA_G_SWIZ_G |
3421						   R500_ALU_RGBA_B_SWIZ_B |
3422						   R500_ALU_RGBA_A_SWIZ_A));
3423
3424	    /* Shader constants. */
3425	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
3426
3427	    /* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */
3428	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w));
3429	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h));
3430	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
3431	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
3432
3433	    FINISH_ACCEL();
3434	} else {
3435	    BEGIN_ACCEL(19);
3436	    /* 2 components: 2 for tex0 */
3437	    OUT_ACCEL_REG(R300_RS_COUNT,
3438			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3439			   R300_RS_COUNT_HIRES_EN));
3440
3441	    /* R300_INST_COUNT_RS - highest RS instruction used */
3442	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3443
3444	    /* Pixel stack frame size. */
3445	    OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
3446
3447	    /* FP length. */
3448	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3449					      R500_US_CODE_END_ADDR(1)));
3450	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3451					       R500_US_CODE_RANGE_SIZE(1)));
3452
3453	    /* Prepare for FP emission. */
3454	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
3455	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3456
3457	    /* tex inst */
3458	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3459						   R500_INST_TEX_SEM_WAIT |
3460						   R500_INST_RGB_WMASK_R |
3461						   R500_INST_RGB_WMASK_G |
3462						   R500_INST_RGB_WMASK_B |
3463						   R500_INST_ALPHA_WMASK |
3464						   R500_INST_RGB_CLAMP |
3465						   R500_INST_ALPHA_CLAMP));
3466	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3467						   R500_TEX_INST_LD |
3468						   R500_TEX_SEM_ACQUIRE |
3469						   R500_TEX_IGNORE_UNCOVERED));
3470	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3471						   R500_TEX_SRC_S_SWIZ_R |
3472						   R500_TEX_SRC_T_SWIZ_G |
3473						   R500_TEX_DST_ADDR(0) |
3474						   R500_TEX_DST_R_SWIZ_R |
3475						   R500_TEX_DST_G_SWIZ_G |
3476						   R500_TEX_DST_B_SWIZ_B |
3477						   R500_TEX_DST_A_SWIZ_A));
3478	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3479						   R500_DX_S_SWIZ_R |
3480						   R500_DX_T_SWIZ_R |
3481						   R500_DX_R_SWIZ_R |
3482						   R500_DX_Q_SWIZ_R |
3483						   R500_DY_ADDR(0) |
3484						   R500_DY_S_SWIZ_R |
3485						   R500_DY_T_SWIZ_R |
3486						   R500_DY_R_SWIZ_R |
3487						   R500_DY_Q_SWIZ_R));
3488	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3489	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3490
3491	    /* ALU inst */
3492	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3493						   R500_INST_TEX_SEM_WAIT |
3494						   R500_INST_LAST |
3495						   R500_INST_RGB_OMASK_R |
3496						   R500_INST_RGB_OMASK_G |
3497						   R500_INST_RGB_OMASK_B |
3498						   R500_INST_ALPHA_OMASK |
3499						   R500_INST_RGB_CLAMP |
3500						   R500_INST_ALPHA_CLAMP));
3501	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3502						   R500_RGB_ADDR1(0) |
3503						   R500_RGB_ADDR1_CONST |
3504						   R500_RGB_ADDR2(0) |
3505						   R500_RGB_ADDR2_CONST));
3506	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3507						   R500_ALPHA_ADDR1(0) |
3508						   R500_ALPHA_ADDR1_CONST |
3509						   R500_ALPHA_ADDR2(0) |
3510						   R500_ALPHA_ADDR2_CONST));
3511	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3512						   R500_ALU_RGB_R_SWIZ_A_R |
3513						   R500_ALU_RGB_G_SWIZ_A_G |
3514						   R500_ALU_RGB_B_SWIZ_A_B |
3515						   R500_ALU_RGB_SEL_B_SRC0 |
3516						   R500_ALU_RGB_R_SWIZ_B_1 |
3517						   R500_ALU_RGB_B_SWIZ_B_1 |
3518						   R500_ALU_RGB_G_SWIZ_B_1));
3519	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3520						   R500_ALPHA_SWIZ_A_A |
3521						   R500_ALPHA_SWIZ_B_1));
3522	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3523						   R500_ALU_RGBA_R_SWIZ_0 |
3524						   R500_ALU_RGBA_G_SWIZ_0 |
3525						   R500_ALU_RGBA_B_SWIZ_0 |
3526						   R500_ALU_RGBA_A_SWIZ_0));
3527	    FINISH_ACCEL();
3528	}
3529    } else {
3530	/*
3531	 * y' = y - .0625
3532	 * u' = u - .5
3533	 * v' = v - .5;
3534	 *
3535	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
3536	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
3537	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
3538	 *
3539	 * DP3 might look like the straightforward solution
3540	 * but we'd need to move the texture yuv values in
3541	 * the same reg for this to work. Therefore use MADs.
3542	 * Brightness just adds to the off constant.
3543	 * Contrast is multiplication of luminance.
3544	 * Saturation and hue change the u and v coeffs.
3545	 * Default values (before adjustments - depend on colorspace):
3546	 * yco = 1.1643
3547	 * uco = 0, -0.39173, 2.017
3548	 * vco = 1.5958, -0.8129, 0
3549	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
3550	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
3551	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
3552	 *
3553	 * temp = MAD(yco, yuv.yyyy, off)
3554	 * temp = MAD(uco, yuv.uuuu, temp)
3555	 * result = MAD(vco, yuv.vvvv, temp)
3556	 */
3557	/* TODO: don't recalc consts always */
3558	const float Loff = -0.0627;
3559	const float Coff = -0.502;
3560	float uvcosf, uvsinf;
3561	float yco;
3562	float uco[3], vco[3], off[3];
3563	float bright, cont, gamma;
3564	int ref = pPriv->transform_index;
3565	Bool needgamma = FALSE;
3566
3567	cont = RTFContrast(pPriv->contrast);
3568	bright = RTFBrightness(pPriv->brightness);
3569	gamma = (float)pPriv->gamma / 1000.0;
3570	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
3571	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
3572	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
3573
3574	yco = trans[ref].RefLuma * cont;
3575	uco[0] = -trans[ref].RefRCr * uvsinf;
3576	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
3577	uco[2] = trans[ref].RefBCb * uvcosf;
3578	vco[0] = trans[ref].RefRCr * uvcosf;
3579	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
3580	vco[2] = trans[ref].RefBCb * uvsinf;
3581	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
3582	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
3583	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
3584
3585	//XXX gamma
3586
3587	if (gamma != 1.0) {
3588	    needgamma = TRUE;
3589	    /* note: gamma correction is out = in ^ gamma;
3590	       gpu can only do LG2/EX2 therefore we transform into
3591	       in ^ gamma = 2 ^ (log2(in) * gamma).
3592	       Lots of scalar ops, unfortunately (better solution?) -
3593	       without gamma that's 3 inst, with gamma it's 10...
3594	       could use different gamma factors per channel,
3595	       if that's of any use. */
3596	}
3597
3598	if (pPriv->is_planar) {
3599	    BEGIN_ACCEL(56);
3600	    /* 2 components: one texcoord pair, shared by tex0/1/2 */
3601	    OUT_ACCEL_REG(R300_RS_COUNT,
3602			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3603			   R300_RS_COUNT_HIRES_EN));
3604
3605	    /* R300_INST_COUNT_RS - highest RS instruction used */
3606	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3607
3608	    /* Pixel stack frame size. */
3609	    OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
3610
3611	    /* FP length. */
3612	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3613					      R500_US_CODE_END_ADDR(5)));
3614	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3615					       R500_US_CODE_RANGE_SIZE(5)));
3616
3617	    /* Prepare for FP emission. */
3618	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
3619	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3620
3621	    /* tex inst */
3622	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3623						   R500_INST_TEX_SEM_WAIT |
3624						   R500_INST_RGB_WMASK_R |
3625						   R500_INST_RGB_WMASK_G |
3626						   R500_INST_RGB_WMASK_B |
3627						   R500_INST_ALPHA_WMASK |
3628						   R500_INST_RGB_CLAMP |
3629						   R500_INST_ALPHA_CLAMP));
3630	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3631						   R500_TEX_INST_LD |
3632						   R500_TEX_IGNORE_UNCOVERED));
3633	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3634						   R500_TEX_SRC_S_SWIZ_R |
3635						   R500_TEX_SRC_T_SWIZ_G |
3636						   R500_TEX_DST_ADDR(2) |
3637						   R500_TEX_DST_R_SWIZ_R |
3638						   R500_TEX_DST_G_SWIZ_G |
3639						   R500_TEX_DST_B_SWIZ_B |
3640						   R500_TEX_DST_A_SWIZ_A));
3641	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3642						   R500_DX_S_SWIZ_R |
3643						   R500_DX_T_SWIZ_R |
3644						   R500_DX_R_SWIZ_R |
3645						   R500_DX_Q_SWIZ_R |
3646						   R500_DY_ADDR(0) |
3647						   R500_DY_S_SWIZ_R |
3648						   R500_DY_T_SWIZ_R |
3649						   R500_DY_R_SWIZ_R |
3650						   R500_DY_Q_SWIZ_R));
3651	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3652	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3653
3654	    /* tex inst */
3655	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3656						   R500_INST_TEX_SEM_WAIT |
3657						   R500_INST_RGB_WMASK_R |
3658						   R500_INST_RGB_WMASK_G |
3659						   R500_INST_RGB_WMASK_B |
3660						   R500_INST_ALPHA_WMASK |
3661						   R500_INST_RGB_CLAMP |
3662						   R500_INST_ALPHA_CLAMP));
3663	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
3664						   R500_TEX_INST_LD |
3665						   R500_TEX_IGNORE_UNCOVERED));
3666	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3667						   R500_TEX_SRC_S_SWIZ_R |
3668						   R500_TEX_SRC_T_SWIZ_G |
3669						   R500_TEX_DST_ADDR(1) |
3670						   R500_TEX_DST_R_SWIZ_R |
3671						   R500_TEX_DST_G_SWIZ_G |
3672						   R500_TEX_DST_B_SWIZ_B |
3673						   R500_TEX_DST_A_SWIZ_A));
3674	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3675						   R500_DX_S_SWIZ_R |
3676						   R500_DX_T_SWIZ_R |
3677						   R500_DX_R_SWIZ_R |
3678						   R500_DX_Q_SWIZ_R |
3679						   R500_DY_ADDR(0) |
3680						   R500_DY_S_SWIZ_R |
3681						   R500_DY_T_SWIZ_R |
3682						   R500_DY_R_SWIZ_R |
3683						   R500_DY_Q_SWIZ_R));
3684	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3685	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3686
3687	    /* tex inst */
3688	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3689						   R500_INST_TEX_SEM_WAIT |
3690						   R500_INST_RGB_WMASK_R |
3691						   R500_INST_RGB_WMASK_G |
3692						   R500_INST_RGB_WMASK_B |
3693						   R500_INST_ALPHA_WMASK |
3694						   R500_INST_RGB_CLAMP |
3695						   R500_INST_ALPHA_CLAMP));
3696	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(2) |
3697						   R500_TEX_INST_LD |
3698						   R500_TEX_SEM_ACQUIRE |
3699						   R500_TEX_IGNORE_UNCOVERED));
3700	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3701						   R500_TEX_SRC_S_SWIZ_R |
3702						   R500_TEX_SRC_T_SWIZ_G |
3703						   R500_TEX_DST_ADDR(0) |
3704						   R500_TEX_DST_R_SWIZ_R |
3705						   R500_TEX_DST_G_SWIZ_G |
3706						   R500_TEX_DST_B_SWIZ_B |
3707						   R500_TEX_DST_A_SWIZ_A));
3708	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3709						   R500_DX_S_SWIZ_R |
3710						   R500_DX_T_SWIZ_R |
3711						   R500_DX_R_SWIZ_R |
3712						   R500_DX_Q_SWIZ_R |
3713						   R500_DY_ADDR(0) |
3714						   R500_DY_S_SWIZ_R |
3715						   R500_DY_T_SWIZ_R |
3716						   R500_DY_R_SWIZ_R |
3717						   R500_DY_Q_SWIZ_R));
3718	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3719	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3720
3721	    /* ALU inst */
3722	    /* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
3723	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3724						   R500_INST_TEX_SEM_WAIT |
3725						   R500_INST_RGB_WMASK_R |
3726						   R500_INST_RGB_WMASK_G |
3727						   R500_INST_RGB_WMASK_B |
3728						   R500_INST_ALPHA_WMASK));
3729	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3730						   R500_RGB_ADDR0_CONST |
3731						   R500_RGB_ADDR1(2) |
3732						   R500_RGB_ADDR2(0) |
3733						   R500_RGB_ADDR2_CONST));
3734	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3735						   R500_ALPHA_ADDR0_CONST |
3736						   R500_ALPHA_ADDR1(2) |
3737						   R500_ALPHA_ADDR2(0) |
3738						   R500_ALPHA_ADDR2_CONST));
3739	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3740						   R500_ALU_RGB_R_SWIZ_A_A |
3741						   R500_ALU_RGB_G_SWIZ_A_A |
3742						   R500_ALU_RGB_B_SWIZ_A_A |
3743						   R500_ALU_RGB_SEL_B_SRC1 |
3744						   R500_ALU_RGB_R_SWIZ_B_R |
3745						   R500_ALU_RGB_B_SWIZ_B_G |
3746						   R500_ALU_RGB_G_SWIZ_B_B));
3747	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3748						   R500_ALPHA_ADDRD(2) |
3749						   R500_ALPHA_SWIZ_A_0 |
3750						   R500_ALPHA_SWIZ_B_0));
3751	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3752						   R500_ALU_RGBA_ADDRD(2) |
3753						   R500_ALU_RGBA_SEL_C_SRC0 |
3754						   R500_ALU_RGBA_R_SWIZ_R |
3755						   R500_ALU_RGBA_G_SWIZ_G |
3756						   R500_ALU_RGBA_B_SWIZ_B |
3757						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3758						   R500_ALU_RGBA_A_SWIZ_0));
3759
3760	    /* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
3761	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3762						   R500_INST_TEX_SEM_WAIT |
3763						   R500_INST_RGB_WMASK_R |
3764						   R500_INST_RGB_WMASK_G |
3765						   R500_INST_RGB_WMASK_B |
3766						   R500_INST_ALPHA_WMASK));
3767	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
3768						   R500_RGB_ADDR0_CONST |
3769						   R500_RGB_ADDR1(1) |
3770						   R500_RGB_ADDR2(2)));
3771	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3772						   R500_ALPHA_ADDR0_CONST |
3773						   R500_ALPHA_ADDR1(1) |
3774						   R500_ALPHA_ADDR2(2)));
3775	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3776						   R500_ALU_RGB_R_SWIZ_A_R |
3777						   R500_ALU_RGB_G_SWIZ_A_G |
3778						   R500_ALU_RGB_B_SWIZ_A_B |
3779						   R500_ALU_RGB_SEL_B_SRC1 |
3780						   R500_ALU_RGB_R_SWIZ_B_R |
3781						   R500_ALU_RGB_B_SWIZ_B_G |
3782						   R500_ALU_RGB_G_SWIZ_B_B));
3783	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3784						   R500_ALPHA_ADDRD(2) |
3785						   R500_ALPHA_SWIZ_A_0 |
3786						   R500_ALPHA_SWIZ_B_0));
3787	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3788						   R500_ALU_RGBA_ADDRD(2) |
3789						   R500_ALU_RGBA_SEL_C_SRC2 |
3790						   R500_ALU_RGBA_R_SWIZ_R |
3791						   R500_ALU_RGBA_G_SWIZ_G |
3792						   R500_ALU_RGBA_B_SWIZ_B |
3793						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3794						   R500_ALU_RGBA_A_SWIZ_0));
3795
3796	    /* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
3797	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3798						   R500_INST_TEX_SEM_WAIT |
3799						   R500_INST_LAST |
3800						   R500_INST_RGB_OMASK_R |
3801						   R500_INST_RGB_OMASK_G |
3802						   R500_INST_RGB_OMASK_B |
3803						   R500_INST_ALPHA_OMASK |
3804						   R500_INST_RGB_CLAMP |
3805						   R500_INST_ALPHA_CLAMP));
3806	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
3807						   R500_RGB_ADDR0_CONST |
3808						   R500_RGB_ADDR1(0) |
3809						   R500_RGB_ADDR2(2)));
3810	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(2) |
3811						   R500_ALPHA_ADDR0_CONST |
3812						   R500_ALPHA_ADDR1(0) |
3813						   R500_ALPHA_ADDR2(2)));
3814	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3815						   R500_ALU_RGB_R_SWIZ_A_R |
3816						   R500_ALU_RGB_G_SWIZ_A_G |
3817						   R500_ALU_RGB_B_SWIZ_A_B |
3818						   R500_ALU_RGB_SEL_B_SRC1 |
3819						   R500_ALU_RGB_R_SWIZ_B_R |
3820						   R500_ALU_RGB_B_SWIZ_B_G |
3821						   R500_ALU_RGB_G_SWIZ_B_B));
3822	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3823						   R500_ALPHA_ADDRD(0) |
3824						   R500_ALPHA_SWIZ_A_0 |
3825						   R500_ALPHA_SWIZ_B_0));
3826	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3827						   R500_ALU_RGBA_ADDRD(0) |
3828						   R500_ALU_RGBA_SEL_C_SRC2 |
3829						   R500_ALU_RGBA_R_SWIZ_R |
3830						   R500_ALU_RGBA_G_SWIZ_G |
3831						   R500_ALU_RGBA_B_SWIZ_B |
3832						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3833						   R500_ALU_RGBA_A_SWIZ_1));
3834
3835	} else {
3836	    BEGIN_ACCEL(44);
3837	    /* 2 components: 2 for tex0 */
3838	    OUT_ACCEL_REG(R300_RS_COUNT,
3839			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3840			   R300_RS_COUNT_HIRES_EN));
3841
3842	    /* R300_INST_COUNT_RS - highest RS instruction used */
3843	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3844
3845	    /* Pixel stack frame size. */
3846	    OUT_ACCEL_REG(R300_US_PIXSIZE, 1); /* highest temp used */
3847
3848	    /* FP length. */
3849	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3850					      R500_US_CODE_END_ADDR(3)));
3851	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3852					       R500_US_CODE_RANGE_SIZE(3)));
3853
3854	    /* Prepare for FP emission. */
3855	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
3856	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3857
3858	    /* tex inst */
3859	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3860						   R500_INST_TEX_SEM_WAIT |
3861						   R500_INST_RGB_WMASK_R |
3862						   R500_INST_RGB_WMASK_G |
3863						   R500_INST_RGB_WMASK_B |
3864						   R500_INST_ALPHA_WMASK |
3865						   R500_INST_RGB_CLAMP |
3866						   R500_INST_ALPHA_CLAMP));
3867	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3868						   R500_TEX_INST_LD |
3869						   R500_TEX_SEM_ACQUIRE |
3870						   R500_TEX_IGNORE_UNCOVERED));
3871	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3872						   R500_TEX_SRC_S_SWIZ_R |
3873						   R500_TEX_SRC_T_SWIZ_G |
3874						   R500_TEX_DST_ADDR(0) |
3875						   R500_TEX_DST_R_SWIZ_R |
3876						   R500_TEX_DST_G_SWIZ_G |
3877						   R500_TEX_DST_B_SWIZ_B |
3878						   R500_TEX_DST_A_SWIZ_A));
3879	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3880						   R500_DX_S_SWIZ_R |
3881						   R500_DX_T_SWIZ_R |
3882						   R500_DX_R_SWIZ_R |
3883						   R500_DX_Q_SWIZ_R |
3884						   R500_DY_ADDR(0) |
3885						   R500_DY_S_SWIZ_R |
3886						   R500_DY_T_SWIZ_R |
3887						   R500_DY_R_SWIZ_R |
3888						   R500_DY_Q_SWIZ_R));
3889	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3890	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3891
3892	    /* ALU inst */
3893	    /* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
3894	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3895						   R500_INST_TEX_SEM_WAIT |
3896						   R500_INST_RGB_WMASK_R |
3897						   R500_INST_RGB_WMASK_G |
3898						   R500_INST_RGB_WMASK_B |
3899						   R500_INST_ALPHA_WMASK));
3900	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3901						   R500_RGB_ADDR0_CONST |
3902						   R500_RGB_ADDR1(0) |
3903						   R500_RGB_ADDR2(0) |
3904						   R500_RGB_ADDR2_CONST));
3905	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3906						   R500_ALPHA_ADDR0_CONST |
3907						   R500_ALPHA_ADDR1(0) |
3908						   R500_ALPHA_ADDR2(0) |
3909						   R500_ALPHA_ADDR2_CONST));
3910	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3911						   R500_ALU_RGB_R_SWIZ_A_A |
3912						   R500_ALU_RGB_G_SWIZ_A_A |
3913						   R500_ALU_RGB_B_SWIZ_A_A |
3914						   R500_ALU_RGB_SEL_B_SRC1 |
3915						   R500_ALU_RGB_R_SWIZ_B_G |
3916						   R500_ALU_RGB_B_SWIZ_B_G |
3917						   R500_ALU_RGB_G_SWIZ_B_G));
3918	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3919						   R500_ALPHA_ADDRD(1) |
3920						   R500_ALPHA_SWIZ_A_0 |
3921						   R500_ALPHA_SWIZ_B_0));
3922	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3923						   R500_ALU_RGBA_ADDRD(1) |
3924						   R500_ALU_RGBA_SEL_C_SRC0 |
3925						   R500_ALU_RGBA_R_SWIZ_R |
3926						   R500_ALU_RGBA_G_SWIZ_G |
3927						   R500_ALU_RGBA_B_SWIZ_B |
3928						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3929						   R500_ALU_RGBA_A_SWIZ_0));
3930
3931	    /* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
3932	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3933						   R500_INST_TEX_SEM_WAIT |
3934						   R500_INST_RGB_WMASK_R |
3935						   R500_INST_RGB_WMASK_G |
3936						   R500_INST_RGB_WMASK_B |
3937						   R500_INST_ALPHA_WMASK));
3938	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
3939						   R500_RGB_ADDR0_CONST |
3940						   R500_RGB_ADDR1(0) |
3941						   R500_RGB_ADDR2(1)));
3942	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3943						   R500_ALPHA_ADDR0_CONST |
3944						   R500_ALPHA_ADDR1(0) |
3945						   R500_ALPHA_ADDR2(1)));
3946	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3947						   R500_ALU_RGB_R_SWIZ_A_R |
3948						   R500_ALU_RGB_G_SWIZ_A_G |
3949						   R500_ALU_RGB_B_SWIZ_A_B |
3950						   R500_ALU_RGB_SEL_B_SRC1 |
3951						   R500_ALU_RGB_R_SWIZ_B_B |
3952						   R500_ALU_RGB_B_SWIZ_B_B |
3953						   R500_ALU_RGB_G_SWIZ_B_B));
3954	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3955						   R500_ALPHA_ADDRD(1) |
3956						   R500_ALPHA_SWIZ_A_0 |
3957						   R500_ALPHA_SWIZ_B_0));
3958	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3959						   R500_ALU_RGBA_ADDRD(1) |
3960						   R500_ALU_RGBA_SEL_C_SRC2 |
3961						   R500_ALU_RGBA_R_SWIZ_R |
3962						   R500_ALU_RGBA_G_SWIZ_G |
3963						   R500_ALU_RGBA_B_SWIZ_B |
3964						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3965						   R500_ALU_RGBA_A_SWIZ_0));
3966
3967	    /* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
3968	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3969						   R500_INST_TEX_SEM_WAIT |
3970						   R500_INST_LAST |
3971						   R500_INST_RGB_OMASK_R |
3972						   R500_INST_RGB_OMASK_G |
3973						   R500_INST_RGB_OMASK_B |
3974						   R500_INST_ALPHA_OMASK |
3975						   R500_INST_RGB_CLAMP |
3976						   R500_INST_ALPHA_CLAMP));
3977	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
3978						   R500_RGB_ADDR0_CONST |
3979						   R500_RGB_ADDR1(0) |
3980						   R500_RGB_ADDR2(1)));
3981	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3982						   R500_ALPHA_ADDR0_CONST |
3983						   R500_ALPHA_ADDR1(0) |
3984						   R500_ALPHA_ADDR2(1)));
3985	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3986						   R500_ALU_RGB_R_SWIZ_A_R |
3987						   R500_ALU_RGB_G_SWIZ_A_G |
3988						   R500_ALU_RGB_B_SWIZ_A_B |
3989						   R500_ALU_RGB_SEL_B_SRC1 |
3990						   R500_ALU_RGB_R_SWIZ_B_R |
3991						   R500_ALU_RGB_B_SWIZ_B_R |
3992						   R500_ALU_RGB_G_SWIZ_B_R));
3993	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3994						   R500_ALPHA_ADDRD(1) |
3995						   R500_ALPHA_SWIZ_A_0 |
3996						   R500_ALPHA_SWIZ_B_0));
3997	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3998						   R500_ALU_RGBA_ADDRD(1) |
3999						   R500_ALU_RGBA_SEL_C_SRC2 |
4000						   R500_ALU_RGBA_R_SWIZ_R |
4001						   R500_ALU_RGBA_G_SWIZ_G |
4002						   R500_ALU_RGBA_B_SWIZ_B |
4003						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
4004						   R500_ALU_RGBA_A_SWIZ_1));
4005	}
4006
4007	/* Shader constants. */
4008	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
4009
4010	/* constant 0: off, yco */
4011	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[0]);
4012	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[1]);
4013	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[2]);
4014	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, yco);
4015	/* constant 1: uco */
4016	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[0]);
4017	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[1]);
4018	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[2]);
4019	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, gamma);
4020	/* constant 2: vco */
4021	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[0]);
4022	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[1]);
4023	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[2]);
4024	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0.0);
4025
4026	FINISH_ACCEL();
4027    }
4028
4029    BEGIN_ACCEL_RELOC(6, 2);
4030    OUT_ACCEL_REG(R300_TX_INVALTAGS, 0);
4031    OUT_ACCEL_REG(R300_TX_ENABLE, txenable);
4032
4033    EMIT_WRITE_OFFSET(R300_RB3D_COLOROFFSET0, 0, pPixmap);
4034    EMIT_COLORPITCH(R300_RB3D_COLORPITCH0, colorpitch, pPixmap);
4035
4036    /* no need to enable blending */
4037    OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
4038
4039    OUT_ACCEL_REG(R300_VAP_VTX_SIZE, pPriv->vtx_count);
4040    FINISH_ACCEL();
4041
4042    if (pPriv->vsync) {
4043	xf86CrtcPtr crtc;
4044	if (pPriv->desired_crtc)
4045	    crtc = pPriv->desired_crtc;
4046	else
4047	    crtc = radeon_pick_best_crtc(pScrn,
4048					 pPriv->drw_x,
4049					 pPriv->drw_x + pPriv->dst_w,
4050					 pPriv->drw_y,
4051					 pPriv->drw_y + pPriv->dst_h);
4052	if (crtc)
4053	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
4054					  crtc,
4055					  pPriv->drw_y - crtc->y,
4056					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
4057    }
4058
4059    return TRUE;
4060}
4061
4062static void
4063FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
4064{
4065    RADEONInfoPtr info = RADEONPTR(pScrn);
4066    PixmapPtr pPixmap = pPriv->pPixmap;
4067    int dstxoff, dstyoff;
4068    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
4069    int nBox = REGION_NUM_RECTS(&pPriv->clip);
4070    ACCEL_PREAMBLE();
4071
4072#ifdef COMPOSITE
4073    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
4074    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
4075#else
4076    dstxoff = 0;
4077    dstyoff = 0;
4078#endif
4079
4080    if (!FUNC_NAME(R500PrepareTexturedVideo)(pScrn, pPriv))
4081	return;
4082
4083    /*
4084     * Rendering of the actual polygon is done in two different
4085     * ways depending on chip generation:
4086     *
4087     * < R300:
4088     *
4089     *     These chips can render a rectangle in one pass, so
4090     *     handling is pretty straight-forward.
4091     *
4092     * >= R300:
4093     *
4094     *     These chips can accept a quad, but will render it as
4095     *     two triangles which results in a diagonal tear. Instead
4096     *     We render a single, large triangle and use the scissor
4097     *     functionality to restrict it to the desired rectangle.
4098     *     Due to guardband limits on r3xx/r4xx, we can only use
4099     *     the single triangle up to 2880 pixels; above that we
4100     *     render as a quad.
4101     */
4102
4103    while (nBox--) {
4104	int srcX, srcY, srcw, srch;
4105	int dstX, dstY, dstw, dsth;
4106#ifdef ACCEL_CP
4107	int draw_size = 3 * pPriv->vtx_count + 4 + 2 + 3;
4108
4109	if (draw_size > radeon_cs_space_remaining(pScrn)) {
4110	    if (info->cs)
4111		radeon_cs_flush_indirect(pScrn);
4112	    else
4113		RADEONCPFlushIndirect(pScrn, 1);
4114	    if (!FUNC_NAME(R500PrepareTexturedVideo)(pScrn, pPriv))
4115		return;
4116	}
4117#endif
4118
4119	dstX = pBox->x1 + dstxoff;
4120	dstY = pBox->y1 + dstyoff;
4121	dstw = pBox->x2 - pBox->x1;
4122	dsth = pBox->y2 - pBox->y1;
4123
4124	srcX = pPriv->src_x;
4125	srcX += ((pBox->x1 - pPriv->drw_x) *
4126		 pPriv->src_w) / pPriv->dst_w;
4127	srcY = pPriv->src_y;
4128	srcY += ((pBox->y1 - pPriv->drw_y) *
4129		 pPriv->src_h) / pPriv->dst_h;
4130
4131	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
4132	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
4133
4134	BEGIN_ACCEL(2);
4135	OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX) << R300_SCISSOR_X_SHIFT) |
4136					 ((dstY) << R300_SCISSOR_Y_SHIFT)));
4137	OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw - 1) << R300_SCISSOR_X_SHIFT) |
4138					 ((dstY + dsth - 1) << R300_SCISSOR_Y_SHIFT)));
4139	FINISH_ACCEL();
4140
4141#ifdef ACCEL_CP
4142	BEGIN_RING(3 * pPriv->vtx_count + 4);
4143	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
4144			    3 * pPriv->vtx_count));
4145	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
4146		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
4147		 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
4148#else /* ACCEL_CP */
4149	BEGIN_ACCEL(2 + pPriv->vtx_count * 3);
4150	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST |
4151					  RADEON_VF_PRIM_WALK_DATA |
4152					  (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
4153#endif
4154	if (pPriv->bicubic_enabled) {
4155	    VTX_OUT_6((float)dstX,            (float)dstY,
4156		      (float)srcX / pPriv->w, (float)srcY / pPriv->h,
4157		      (float)srcX + 0.5,      (float)srcY + 0.5);
4158	    VTX_OUT_6((float)dstX,            (float)(dstY + dstw + dsth),
4159		      (float)srcX / pPriv->w, ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h,
4160		      (float)srcX + 0.5,      (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
4161	    VTX_OUT_6((float)(dstX + dstw + dsth),                       (float)dstY,
4162		      ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
4163		      (float)srcY / pPriv->h,
4164		      (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
4165		      (float)srcY + 0.5);
4166	} else {
4167	    /*
4168	     * Render a big, scissored triangle. This means
4169	     * increasing the triangle size and adjusting
4170	     * texture coordinates.
4171	     */
4172	    VTX_OUT_4((float)dstX,            (float)dstY,
4173		      (float)srcX / pPriv->w, (float)srcY / pPriv->h);
4174	    VTX_OUT_4((float)dstX,                              (float)(dstY + dsth + dstw),
4175		      (float)srcX / pPriv->w, ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h);
4176	    VTX_OUT_4((float)(dstX + dstw + dsth),              (float)dstY,
4177		      ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
4178		      (float)srcY / pPriv->h);
4179	}
4180
4181	/* flushing is pipelined, free/finish is not */
4182	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
4183
4184#ifdef ACCEL_CP
4185	ADVANCE_RING();
4186#else
4187	FINISH_ACCEL();
4188#endif /* !ACCEL_CP */
4189
4190	pBox++;
4191    }
4192
4193    BEGIN_ACCEL(3);
4194    OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA);
4195    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
4196    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
4197    FINISH_ACCEL();
4198
4199    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
4200}
4201
4202#undef VTX_OUT_4
4203#undef VTX_OUT_6
4204#undef FUNC_NAME
4205