/*
 * Copyright 2008 Alex Deucher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *
 * Based on radeon_exa_render.c and kdrive ati_video.c by Eric Anholt, et al.
 *
 */

/*
 * Acceleration-mode selection.
 *
 * Exactly one of ACCEL_MMIO or ACCEL_CP must be defined before this point;
 * defining both, or neither, is a hard preprocessing error.
 * NOTE(review): presumably the build compiles this code once per mode with
 * the matching macro set by the includer -- confirm against the caller.
 */
#if defined(ACCEL_MMIO) && defined(ACCEL_CP)
#error Cannot define both MMIO and CP acceleration!
#endif

/*
 * FUNC_NAME_CAT(prefix, suffix) pastes two tokens into one identifier.
 * The ANSI "##" operator is used unless UNIXCPP is defined without ANSICPP,
 * in which case the pre-ANSI empty-comment trick is used instead.
 */
#if !defined(UNIXCPP) || defined(ANSICPP)
#define FUNC_NAME_CAT(prefix,suffix) prefix##suffix
#else
#define FUNC_NAME_CAT(prefix,suffix) prefix/**/suffix
#endif

/*
 * FUNC_NAME(prefix) appends the acceleration-mode suffix to a function
 * name: prefix##MMIO under ACCEL_MMIO, prefix##CP under ACCEL_CP.
 */
#ifdef ACCEL_MMIO
#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,MMIO)
#else
#ifdef ACCEL_CP
#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,CP)
#else
#error No accel type defined!
#endif
#endif

/*
 * Vertex emission helpers.
 *
 * VTX_OUT_6 emits one vertex made of six floats, in this order:
 *   destination X, destination Y, source X, source Y, mask X, mask Y.
 * VTX_OUT_4 emits one vertex made of four floats:
 *   destination X, destination Y, source X, source Y.
 *
 * Both are wrapped in do { } while (0) so they behave as a single statement.
 * Each argument is expanded exactly once.
 */
#ifdef ACCEL_CP

/* CP variant: every component is pushed through OUT_RING_F. */
#define VTX_OUT_6(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)	\
do {								\
    OUT_RING_F(_dstX);						\
    OUT_RING_F(_dstY);						\
    OUT_RING_F(_srcX);						\
    OUT_RING_F(_srcY);						\
    OUT_RING_F(_maskX);						\
    OUT_RING_F(_maskY);						\
} while (0)

#define VTX_OUT_4(_dstX, _dstY, _srcX, _srcY)			\
do {								\
    OUT_RING_F(_dstX);						\
    OUT_RING_F(_dstY);						\
    OUT_RING_F(_srcX);						\
    OUT_RING_F(_srcY);						\
} while (0)

#else /* ACCEL_CP */

/*
 * Non-CP variant: every component is written, in order, to the same
 * register (RADEON_SE_PORT_DATA0) through OUT_ACCEL_REG_F.
 */
#define VTX_OUT_6(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)		\
do {									\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstX);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstY);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcX);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcY);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _maskX);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _maskY);			\
} while (0)

#define VTX_OUT_4(_dstX, _dstY, _srcX, _srcY)			\
do {								\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstX);		\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstY);		\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcX);		\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcY);		\
} while (0)

#endif /* !ACCEL_CP */

90static Bool
91FUNC_NAME(RADEONPrepareTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
92{
93    RADEONInfoPtr info = RADEONPTR(pScrn);
94    PixmapPtr pPixmap = pPriv->pPixmap;
95    struct radeon_exa_pixmap_priv *driver_priv;
96    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
97    uint32_t txformat, txsize, txpitch, txoffset;
98    uint32_t dst_pitch, dst_format;
99    uint32_t colorpitch;
100    int pixel_shift;
101    int scissor_w = MIN(pPixmap->drawable.width, 2047);
102    int scissor_h = MIN(pPixmap->drawable.height, 2047);
103    ACCEL_PREAMBLE();
104
105#ifdef XF86DRM_MODE
106    if (info->cs) {
107	int ret;
108
109	radeon_cs_space_reset_bos(info->cs);
110        radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
111
112	if (pPriv->bicubic_enabled)
113	    radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
114
115	driver_priv = exaGetPixmapDriverPrivate(pPixmap);
116	radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
117
118	ret = radeon_cs_space_check(info->cs);
119	if (ret) {
120	    ErrorF("Not enough RAM to hw accel xv operation\n");
121	    return FALSE;
122	}
123    }
124#else
125    (void)src_bo;
126#endif
127
128    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
129
130
131#ifdef USE_EXA
132    if (info->useEXA) {
133	dst_pitch = exaGetPixmapPitch(pPixmap);
134    } else
135#endif
136    {
137        dst_pitch = pPixmap->devKind;
138    }
139
140#ifdef USE_EXA
141    if (info->useEXA) {
142	RADEON_SWITCH_TO_3D();
143    } else
144#endif
145    {
146	BEGIN_ACCEL(2);
147	OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
148	/* We must wait for 3d to idle, in case source was just written as a dest. */
149	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
150		      RADEON_WAIT_HOST_IDLECLEAN |
151		      RADEON_WAIT_2D_IDLECLEAN |
152		      RADEON_WAIT_3D_IDLECLEAN |
153		      RADEON_WAIT_DMA_GUI_IDLE);
154	FINISH_ACCEL();
155
156	if (!info->accel_state->XInited3D)
157	    RADEONInit3DEngine(pScrn);
158    }
159
160    /* Same for R100/R200 */
161    switch (pPixmap->drawable.bitsPerPixel) {
162    case 16:
163	if (pPixmap->drawable.depth == 15)
164	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
165	else
166	    dst_format = RADEON_COLOR_FORMAT_RGB565;
167	break;
168    case 32:
169	dst_format = RADEON_COLOR_FORMAT_ARGB8888;
170	break;
171    default:
172	return FALSE;
173    }
174
175    if (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12) {
176	pPriv->is_planar = TRUE;
177	txformat = RADEON_TXFORMAT_Y8;
178    } else {
179	pPriv->is_planar = FALSE;
180	if (pPriv->id == FOURCC_UYVY)
181	    txformat = RADEON_TXFORMAT_YVYU422;
182	else
183	    txformat = RADEON_TXFORMAT_VYUY422;
184    }
185
186    txformat |= RADEON_TXFORMAT_NON_POWER2;
187
188    colorpitch = dst_pitch >> pixel_shift;
189
190    if (RADEONTilingEnabled(pScrn, pPixmap))
191	colorpitch |= RADEON_COLOR_TILE_ENABLE;
192
193    txoffset = info->cs ? 0 : pPriv->src_offset;
194
195    BEGIN_ACCEL_RELOC(4,2);
196
197    OUT_ACCEL_REG(RADEON_RB3D_CNTL, dst_format);
198    EMIT_WRITE_OFFSET(RADEON_RB3D_COLOROFFSET, 0, pPixmap);
199    EMIT_COLORPITCH(RADEON_RB3D_COLORPITCH, colorpitch, pPixmap);
200    OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
201		  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
202
203    FINISH_ACCEL();
204
205    if (pPriv->is_planar) {
206	/* need 2 texcoord sets (even though they are identical) due
207	   to denormalization! hw apparently can't premultiply
208	   same coord set by different texture size */
209	pPriv->vtx_count = 6;
210
211	txsize = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
212		  (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
213	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
214	txpitch -= 32;
215
216	BEGIN_ACCEL_RELOC(23, 3);
217
218	OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
219					  RADEON_SE_VTX_FMT_ST0 |
220					  RADEON_SE_VTX_FMT_ST1));
221
222	OUT_ACCEL_REG(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE |
223				       RADEON_TEX_1_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
224				       RADEON_TEX_2_ENABLE | RADEON_TEX_BLEND_2_ENABLE |
225				       RADEON_PLANAR_YUV_ENABLE));
226
227	/* Y */
228	OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
229		      RADEON_MAG_FILTER_LINEAR |
230		      RADEON_MIN_FILTER_LINEAR |
231		      RADEON_CLAMP_S_CLAMP_LAST |
232		      RADEON_CLAMP_T_CLAMP_LAST |
233		      RADEON_YUV_TO_RGB);
234	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ0);
235	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_0, txoffset, src_bo);
236	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
237		      RADEON_COLOR_ARG_A_ZERO |
238		      RADEON_COLOR_ARG_B_ZERO |
239		      RADEON_COLOR_ARG_C_T0_COLOR |
240		      RADEON_BLEND_CTL_ADD |
241		      RADEON_CLAMP_TX);
242	OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
243		      RADEON_ALPHA_ARG_A_ZERO |
244		      RADEON_ALPHA_ARG_B_ZERO |
245		      RADEON_ALPHA_ARG_C_T0_ALPHA |
246		      RADEON_BLEND_CTL_ADD |
247		      RADEON_CLAMP_TX);
248
249	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
250		      (pPriv->w - 1) |
251		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
252	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
253		      pPriv->src_pitch - 32);
254
255	/* U */
256	OUT_ACCEL_REG(RADEON_PP_TXFILTER_1,
257		      RADEON_MAG_FILTER_LINEAR |
258		      RADEON_MIN_FILTER_LINEAR |
259		      RADEON_CLAMP_S_CLAMP_LAST |
260		      RADEON_CLAMP_T_CLAMP_LAST);
261	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_1, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ1);
262	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_1, txoffset + pPriv->planeu_offset, src_bo);
263	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_1,
264		      RADEON_COLOR_ARG_A_ZERO |
265		      RADEON_COLOR_ARG_B_ZERO |
266		      RADEON_COLOR_ARG_C_T0_COLOR |
267		      RADEON_BLEND_CTL_ADD |
268		      RADEON_CLAMP_TX);
269	OUT_ACCEL_REG(RADEON_PP_TXABLEND_1,
270		      RADEON_ALPHA_ARG_A_ZERO |
271		      RADEON_ALPHA_ARG_B_ZERO |
272		      RADEON_ALPHA_ARG_C_T0_ALPHA |
273		      RADEON_BLEND_CTL_ADD |
274		      RADEON_CLAMP_TX);
275
276	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_1, txsize);
277	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_1, txpitch);
278
279	/* V */
280	OUT_ACCEL_REG(RADEON_PP_TXFILTER_2,
281		      RADEON_MAG_FILTER_LINEAR |
282		      RADEON_MIN_FILTER_LINEAR |
283		      RADEON_CLAMP_S_CLAMP_LAST |
284		      RADEON_CLAMP_T_CLAMP_LAST);
285	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_2, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ1);
286	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_2, txoffset + pPriv->planev_offset, src_bo);
287	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_2,
288		      RADEON_COLOR_ARG_A_ZERO |
289		      RADEON_COLOR_ARG_B_ZERO |
290		      RADEON_COLOR_ARG_C_T0_COLOR |
291		      RADEON_BLEND_CTL_ADD |
292		      RADEON_CLAMP_TX);
293	OUT_ACCEL_REG(RADEON_PP_TXABLEND_2,
294		      RADEON_ALPHA_ARG_A_ZERO |
295		      RADEON_ALPHA_ARG_B_ZERO |
296		      RADEON_ALPHA_ARG_C_T0_ALPHA |
297		      RADEON_BLEND_CTL_ADD |
298		      RADEON_CLAMP_TX);
299
300	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_2, txsize);
301	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_2, txpitch);
302	FINISH_ACCEL();
303    } else {
304	pPriv->vtx_count = 4;
305	BEGIN_ACCEL_RELOC(9, 1);
306
307	OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
308					  RADEON_SE_VTX_FMT_ST0));
309
310	OUT_ACCEL_REG(RADEON_PP_CNTL, RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
311
312	OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
313		      RADEON_MAG_FILTER_LINEAR |
314		      RADEON_MIN_FILTER_LINEAR |
315		      RADEON_CLAMP_S_CLAMP_LAST |
316		      RADEON_CLAMP_T_CLAMP_LAST |
317		      RADEON_YUV_TO_RGB);
318	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ0);
319	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_0, txoffset, src_bo);
320	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
321		      RADEON_COLOR_ARG_A_ZERO |
322		      RADEON_COLOR_ARG_B_ZERO |
323		      RADEON_COLOR_ARG_C_T0_COLOR |
324		      RADEON_BLEND_CTL_ADD |
325		      RADEON_CLAMP_TX);
326	OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
327		      RADEON_ALPHA_ARG_A_ZERO |
328		      RADEON_ALPHA_ARG_B_ZERO |
329		      RADEON_ALPHA_ARG_C_T0_ALPHA |
330		      RADEON_BLEND_CTL_ADD |
331		      RADEON_CLAMP_TX);
332
333	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
334		      (pPriv->w - 1) |
335		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
336	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
337		      pPriv->src_pitch - 32);
338	FINISH_ACCEL();
339    }
340
341    BEGIN_ACCEL(2);
342    OUT_ACCEL_REG(RADEON_RE_TOP_LEFT, 0);
343    OUT_ACCEL_REG(RADEON_RE_WIDTH_HEIGHT, ((scissor_w << RADEON_RE_WIDTH_SHIFT) |
344					   (scissor_h << RADEON_RE_HEIGHT_SHIFT)));
345    FINISH_ACCEL();
346
347    if (pPriv->vsync) {
348	xf86CrtcPtr crtc;
349	if (pPriv->desired_crtc)
350	    crtc = pPriv->desired_crtc;
351	else
352	    crtc = radeon_pick_best_crtc(pScrn,
353					 pPriv->drw_x,
354					 pPriv->drw_x + pPriv->dst_w,
355					 pPriv->drw_y,
356					 pPriv->drw_y + pPriv->dst_h);
357	if (crtc)
358	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
359					  crtc,
360					  pPriv->drw_y - crtc->y,
361					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
362    }
363
364    return TRUE;
365}
366
367static void
368FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
369{
370    RADEONInfoPtr info = RADEONPTR(pScrn);
371    PixmapPtr pPixmap = pPriv->pPixmap;
372    int dstxoff, dstyoff;
373    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
374    int nBox = REGION_NUM_RECTS(&pPriv->clip);
375    ACCEL_PREAMBLE();
376
377#ifdef COMPOSITE
378    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
379    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
380#else
381    dstxoff = 0;
382    dstyoff = 0;
383#endif
384
385    if (!FUNC_NAME(RADEONPrepareTexturedVideo)(pScrn, pPriv))
386	return;
387
388    /*
389     * Rendering of the actual polygon is done in two different
390     * ways depending on chip generation:
391     *
392     * < R300:
393     *
394     *     These chips can render a rectangle in one pass, so
395     *     handling is pretty straight-forward.
396     *
397     * >= R300:
398     *
399     *     These chips can accept a quad, but will render it as
400     *     two triangles which results in a diagonal tear. Instead
401     *     We render a single, large triangle and use the scissor
402     *     functionality to restrict it to the desired rectangle.
403     *     Due to guardband limits on r3xx/r4xx, we can only use
404     *     the single triangle up to 2560/4021 pixels; above that we
405     *     render as a quad.
406     */
407#ifdef ACCEL_CP
408    while (nBox) {
409	int draw_size = 3 * pPriv->vtx_count + 5;
410	int loop_boxes;
411
412	if (draw_size > radeon_cs_space_remaining(pScrn)) {
413	    if (info->cs)
414		radeon_cs_flush_indirect(pScrn);
415	    else
416		RADEONCPFlushIndirect(pScrn, 1);
417	    if (!FUNC_NAME(RADEONPrepareTexturedVideo)(pScrn, pPriv))
418		return;
419	}
420	loop_boxes = MIN(radeon_cs_space_remaining(pScrn) / draw_size, nBox);
421	nBox -= loop_boxes;
422
423	BEGIN_RING(loop_boxes * 3 * pPriv->vtx_count + 5);
424	OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD,
425			    loop_boxes * 3 * pPriv->vtx_count + 1));
426	if (pPriv->is_planar)
427	    OUT_RING(RADEON_CP_VC_FRMT_XY |
428		     RADEON_CP_VC_FRMT_ST0 |
429		     RADEON_CP_VC_FRMT_ST1);
430	else
431	    OUT_RING(RADEON_CP_VC_FRMT_XY |
432		     RADEON_CP_VC_FRMT_ST0);
433	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
434		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
435		 RADEON_CP_VC_CNTL_MAOS_ENABLE |
436		 RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
437		 ((loop_boxes * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
438
439	while (loop_boxes--) {
440	    float srcX, srcY, srcw, srch;
441	    int dstX, dstY, dstw, dsth;
442	    dstX = pBox->x1 + dstxoff;
443	    dstY = pBox->y1 + dstyoff;
444	    dstw = pBox->x2 - pBox->x1;
445	    dsth = pBox->y2 - pBox->y1;
446
447	    srcX = pPriv->src_x;
448	    srcX += ((pBox->x1 - pPriv->drw_x) *
449		     pPriv->src_w) / (float)pPriv->dst_w;
450	    srcY = pPriv->src_y;
451	    srcY += ((pBox->y1 - pPriv->drw_y) *
452		     pPriv->src_h) / (float)pPriv->dst_h;
453
454	    srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
455	    srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
456
457
458	    if (pPriv->is_planar) {
459		/*
460		 * Just render a rect (using three coords).
461		 */
462		VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
463			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
464			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
465		VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
466			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
467			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
468		VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
469			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
470			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
471	    } else {
472		/*
473		 * Just render a rect (using three coords).
474		 */
475		VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
476			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
477		VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
478			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
479		VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
480			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
481	    }
482
483	    pBox++;
484	}
485
486	OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
487	ADVANCE_RING();
488    }
489#else /* ACCEL_CP */
490    BEGIN_ACCEL(nBox * pPriv->vtx_count * 3 + 2);
491    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
492				      RADEON_VF_PRIM_WALK_DATA |
493				      RADEON_VF_RADEON_MODE |
494				      ((nBox * 3) << RADEON_VF_NUM_VERTICES_SHIFT)));
495    while (nBox--) {
496	float srcX, srcY, srcw, srch;
497	int dstX, dstY, dstw, dsth;
498	dstX = pBox->x1 + dstxoff;
499	dstY = pBox->y1 + dstyoff;
500	dstw = pBox->x2 - pBox->x1;
501	dsth = pBox->y2 - pBox->y1;
502
503	srcX = pPriv->src_x;
504	srcX += ((pBox->x1 - pPriv->drw_x) *
505		 pPriv->src_w) / (float)pPriv->dst_w;
506	srcY = pPriv->src_y;
507	srcY += ((pBox->y1 - pPriv->drw_y) *
508		 pPriv->src_h) / (float)pPriv->dst_h;
509
510	srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
511	srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
512
513
514	if (pPriv->is_planar) {
515	    /*
516	     * Just render a rect (using three coords).
517	     */
518	    VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
519		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
520		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
521	    VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
522		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
523		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
524	    VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
525		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
526		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
527	} else {
528	    /*
529	     * Just render a rect (using three coords).
530	     */
531	    VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
532		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
533	    VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
534		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
535	    VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
536		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
537	}
538
539	pBox++;
540    }
541
542    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
543    FINISH_ACCEL();
544#endif /* !ACCEL_CP */
545
546    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
547}
548
549static Bool
550FUNC_NAME(R200PrepareTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
551{
552    RADEONInfoPtr info = RADEONPTR(pScrn);
553    PixmapPtr pPixmap = pPriv->pPixmap;
554    struct radeon_exa_pixmap_priv *driver_priv;
555    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
556    uint32_t txformat;
557    uint32_t txfilter, txsize, txpitch, txoffset;
558    uint32_t dst_pitch, dst_format;
559    uint32_t colorpitch;
560    int pixel_shift;
561    int scissor_w = MIN(pPixmap->drawable.width, 2047);
562    int scissor_h = MIN(pPixmap->drawable.height, 2047);
563    /* note: in contrast to r300, use input biasing on uv components */
564    const float Loff = -0.0627;
565    float uvcosf, uvsinf;
566    float yco, yoff;
567    float uco[3], vco[3];
568    float bright, cont, sat;
569    int ref = pPriv->transform_index;
570    float ucscale = 0.25, vcscale = 0.25;
571    Bool needux8 = FALSE, needvx8 = FALSE;
572    ACCEL_PREAMBLE();
573
574#ifdef XF86DRM_MODE
575    if (info->cs) {
576	int ret;
577
578	radeon_cs_space_reset_bos(info->cs);
579        radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
580
581	if (pPriv->bicubic_enabled)
582	    radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
583
584	driver_priv = exaGetPixmapDriverPrivate(pPixmap);
585	radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
586
587	ret = radeon_cs_space_check(info->cs);
588	if (ret) {
589	    ErrorF("Not enough RAM to hw accel xv operation\n");
590	    return FALSE;
591	}
592    }
593#else
594    (void)src_bo;
595#endif
596
597    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
598
599#ifdef USE_EXA
600    if (info->useEXA) {
601	dst_pitch = exaGetPixmapPitch(pPixmap);
602    } else
603#endif
604    {
605	dst_pitch = pPixmap->devKind;
606    }
607
608#ifdef USE_EXA
609    if (info->useEXA) {
610	RADEON_SWITCH_TO_3D();
611    } else
612#endif
613    {
614	BEGIN_ACCEL(2);
615	OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
616	/* We must wait for 3d to idle, in case source was just written as a dest. */
617	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
618		      RADEON_WAIT_HOST_IDLECLEAN |
619		      RADEON_WAIT_2D_IDLECLEAN |
620		      RADEON_WAIT_3D_IDLECLEAN |
621		      RADEON_WAIT_DMA_GUI_IDLE);
622	FINISH_ACCEL();
623
624	if (!info->accel_state->XInited3D)
625	    RADEONInit3DEngine(pScrn);
626    }
627
628    /* Same for R100/R200 */
629    switch (pPixmap->drawable.bitsPerPixel) {
630    case 16:
631	if (pPixmap->drawable.depth == 15)
632	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
633	else
634	    dst_format = RADEON_COLOR_FORMAT_RGB565;
635	break;
636    case 32:
637	dst_format = RADEON_COLOR_FORMAT_ARGB8888;
638	break;
639    default:
640	return FALSE;
641    }
642
643    if (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12) {
644	pPriv->is_planar = TRUE;
645	txformat = RADEON_TXFORMAT_I8;
646    } else {
647	pPriv->is_planar = FALSE;
648	if (pPriv->id == FOURCC_UYVY)
649	    txformat = RADEON_TXFORMAT_YVYU422;
650	else
651	    txformat = RADEON_TXFORMAT_VYUY422;
652    }
653
654    txformat |= RADEON_TXFORMAT_NON_POWER2;
655
656    colorpitch = dst_pitch >> pixel_shift;
657
658    if (RADEONTilingEnabled(pScrn, pPixmap))
659	colorpitch |= RADEON_COLOR_TILE_ENABLE;
660
661    BEGIN_ACCEL_RELOC(4,2);
662
663    OUT_ACCEL_REG(RADEON_RB3D_CNTL, dst_format);
664    EMIT_WRITE_OFFSET(RADEON_RB3D_COLOROFFSET, 0, pPixmap);
665    EMIT_COLORPITCH(RADEON_RB3D_COLORPITCH, colorpitch, pPixmap);
666
667    OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
668		  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
669
670    FINISH_ACCEL();
671
672    txfilter =  R200_MAG_FILTER_LINEAR |
673	R200_MIN_FILTER_LINEAR |
674	R200_CLAMP_S_CLAMP_LAST |
675	R200_CLAMP_T_CLAMP_LAST;
676
677    /* contrast can cause constant overflow, clamp */
678    cont = RTFContrast(pPriv->contrast);
679    if (cont * trans[ref].RefLuma > 2.0)
680	cont = 2.0 / trans[ref].RefLuma;
681    /* brightness is only from -0.5 to 0.5 should be safe */
682    bright = RTFBrightness(pPriv->brightness);
683    /* saturation can also cause overflow, clamp */
684    sat = RTFSaturation(pPriv->saturation);
685    if (sat * trans[ref].RefBCb > 4.0)
686	sat = 4.0 / trans[ref].RefBCb;
687    uvcosf = sat * cos(RTFHue(pPriv->hue));
688    uvsinf = sat * sin(RTFHue(pPriv->hue));
689
690    yco = trans[ref].RefLuma * cont;
691    uco[0] = -trans[ref].RefRCr * uvsinf;
692    uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
693    uco[2] = trans[ref].RefBCb * uvcosf;
694    vco[0] = trans[ref].RefRCr * uvcosf;
695    vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
696    vco[2] = trans[ref].RefBCb * uvsinf;
697    yoff = Loff * yco + bright;
698
699    if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
700	needux8 = TRUE;
701	ucscale = 0.125;
702    }
703    if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
704	needvx8 = TRUE;
705	vcscale = 0.125;
706    }
707
708    txoffset = info->cs ? 0 : pPriv->src_offset;
709
710    if (pPriv->is_planar) {
711	/* need 2 texcoord sets (even though they are identical) due
712	   to denormalization! hw apparently can't premultiply
713	   same coord set by different texture size */
714	pPriv->vtx_count = 6;
715
716	txsize = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
717		  (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
718	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
719	txpitch -= 32;
720
721	BEGIN_ACCEL_RELOC(36, 3);
722
723	OUT_ACCEL_REG(RADEON_PP_CNTL,
724		      RADEON_TEX_0_ENABLE | RADEON_TEX_1_ENABLE | RADEON_TEX_2_ENABLE |
725		      RADEON_TEX_BLEND_0_ENABLE |
726		      RADEON_TEX_BLEND_1_ENABLE |
727		      RADEON_TEX_BLEND_2_ENABLE);
728
729	OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
730	OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
731		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT) |
732		      (2 << R200_VTX_TEX1_COMP_CNT_SHIFT));
733
734	OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
735	OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
736	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
737	OUT_ACCEL_REG(R200_PP_TXSIZE_0,
738		      (pPriv->w - 1) |
739		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
740	OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
741	OUT_TEXTURE_REG(R200_PP_TXOFFSET_0, txoffset, src_bo);
742
743	OUT_ACCEL_REG(R200_PP_TXFILTER_1, txfilter);
744	OUT_ACCEL_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
745	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_1, 0);
746	OUT_ACCEL_REG(R200_PP_TXSIZE_1, txsize);
747	OUT_ACCEL_REG(R200_PP_TXPITCH_1, txpitch);
748	OUT_TEXTURE_REG(R200_PP_TXOFFSET_1, txoffset + pPriv->planeu_offset, src_bo);
749
750	OUT_ACCEL_REG(R200_PP_TXFILTER_2, txfilter);
751	OUT_ACCEL_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
752	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_2, 0);
753	OUT_ACCEL_REG(R200_PP_TXSIZE_2, txsize);
754	OUT_ACCEL_REG(R200_PP_TXPITCH_2, txpitch);
755	OUT_TEXTURE_REG(R200_PP_TXOFFSET_2, txoffset + pPriv->planev_offset, src_bo);
756
757	/* similar to r300 code. Note the big problem is that hardware constants
758	 * are 8 bits only, representing 0.0-1.0. We can get that up (using bias
759	 * + scale) to -1.0-1.0 (but precision will suffer). AFAIK the hw actually
760	 * has 12 bits fractional precision (plus 1 sign bit, 3 range bits) but
761	 * the constants not. To get larger range can use output scale, but for
762	 * that 2.018 value we need a total scale by 8, which means the constants
763	 * really have no accuracy whatsoever (5 fractional bits only).
764	 * The only direct way to get high  precision "constants" into the fragment
765	 * pipe I know of is to use the texcoord interpolator (not color, this one
766	 * is 8 bit only too), which seems a bit expensive. We're lucky though it
767	 * seems the values we need seem to fit better than worst case (get about
768	 * 6 fractional bits for this instead of 5, at least when not correcting for
769	 * hue/saturation/contrast/brightness, which is the same as for vco - yco and
770	 * yoff get 8 fractional bits). Try to preserve as much accuracy as possible
771	 * even with non-default saturation/hue/contrast/brightness adjustments,
772	 * it gets a little crazy and ultimately precision might still be lacking.
773	 *
774	 * A higher precision (8 fractional bits) version might just put uco into
775	 * a texcoord, and calculate a new vcoconst in the shader, like so:
776	 * cohelper = {1.0, 0.0, 0.0} - shouldn't use 0.5 since not exactly representable
777	 * vco = {1.5958 - 1.0, -0.8129 + 1.0, 1.0}
778	 * vcocalc = ADD temp, bias/scale(cohelper), vco
779	 * would in total use 4 tex units, 4 instructions which seems fairly
780	 * balanced for this architecture (instead of 3 + 3 for the solution here)
781	 *
782	 * temp = MAD(yco, yuv.yyyy, yoff)
783	 * temp = MAD(uco, yuv.uuuu, temp)
784	 * result = MAD(vco, yuv.vvvv, temp)
785	 *
786	 * note first mad produces actually scalar, hence we transform
787	 * it into a dp2a to get 8 bit precision of yco instead of 7 -
788	 * That's assuming hw correctly expands consts to internal precision.
789	 * (y * 1 + y * (yco - 1) + yoff)
790	 * temp = DP2A / 2 (yco, yuv.yyyy, yoff)
791	 * temp = MAD (uco / 4, yuv.uuuu * 2, temp)
792	 * result = MAD x2 (vco / 2, yuv.vvvv, temp)
793	 *
794	 * vco, uco need bias (and hence scale too)
795	 *
796	 */
797
798	/* MAD temp0 / 2, const0.a * 2, temp0, -const0.rgb */
799	OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
800		      R200_TXC_ARG_A_TFACTOR_COLOR |
801		      R200_TXC_ARG_B_R0_COLOR |
802		      R200_TXC_ARG_C_TFACTOR_COLOR |
803		      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
804		      R200_TXC_OP_DOT2_ADD);
805	OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
806		      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
807		      R200_TXC_SCALE_INV2 |
808		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
809	OUT_ACCEL_REG(R200_PP_TXABLEND_0,
810		      R200_TXA_ARG_A_ZERO |
811		      R200_TXA_ARG_B_ZERO |
812		      R200_TXA_ARG_C_ZERO |
813		      R200_TXA_OP_MADD);
814	OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
815		      R200_TXA_OUTPUT_REG_NONE);
816
817	/* MAD temp0, (const1 - 0.5) * 2, (temp1 - 0.5) * 2, temp0 */
818	OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
819		      R200_TXC_ARG_A_TFACTOR_COLOR |
820		      R200_TXC_BIAS_ARG_A |
821		      R200_TXC_SCALE_ARG_A |
822		      R200_TXC_ARG_B_R1_COLOR |
823		      R200_TXC_BIAS_ARG_B |
824		      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
825		      R200_TXC_ARG_C_R0_COLOR |
826		      R200_TXC_OP_MADD);
827	OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
828		      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
829		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
830	OUT_ACCEL_REG(R200_PP_TXABLEND_1,
831		      R200_TXA_ARG_A_ZERO |
832		      R200_TXA_ARG_B_ZERO |
833		      R200_TXA_ARG_C_ZERO |
834		      R200_TXA_OP_MADD);
835	OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
836		      R200_TXA_OUTPUT_REG_NONE);
837
838	/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp2 - 0.5), temp0 */
839	OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
840		      R200_TXC_ARG_A_TFACTOR_COLOR |
841		      R200_TXC_BIAS_ARG_A |
842		      R200_TXC_SCALE_ARG_A |
843		      R200_TXC_ARG_B_R2_COLOR |
844		      R200_TXC_BIAS_ARG_B |
845		      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
846		      R200_TXC_ARG_C_R0_COLOR |
847		      R200_TXC_OP_MADD);
848	OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
849		      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
850		      R200_TXC_SCALE_2X |
851		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
852	OUT_ACCEL_REG(R200_PP_TXABLEND_2,
853		      R200_TXA_ARG_A_ZERO |
854		      R200_TXA_ARG_B_ZERO |
855		      R200_TXA_ARG_C_ZERO |
856		      R200_TXA_COMP_ARG_C |
857		      R200_TXA_OP_MADD);
858	OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
859		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
860
861	/* shader constants */
862	OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
863						      yco > 1.0 ? yco - 1.0: yco,
864						      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
865						      0.0));
866	OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
867						      uco[1] * ucscale + 0.5, /* or [-2, 2] */
868						      uco[2] * ucscale + 0.5,
869						      0.0));
870	OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
871						      vco[1] * vcscale + 0.5, /* or [-4, 4] */
872						      vco[2] * vcscale + 0.5,
873						      0.0));
874
875	FINISH_ACCEL();
876    } else {
877	pPriv->vtx_count = 4;
878
879	BEGIN_ACCEL_RELOC(24, 1);
880
881	OUT_ACCEL_REG(RADEON_PP_CNTL,
882		      RADEON_TEX_0_ENABLE |
883		      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
884		      RADEON_TEX_BLEND_2_ENABLE);
885
886	OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
887	OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
888		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
889
890	OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
891	OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
892	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
893	OUT_ACCEL_REG(R200_PP_TXSIZE_0,
894		      (pPriv->w - 1) |
895		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
896	OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
897	OUT_TEXTURE_REG(R200_PP_TXOFFSET_0, txoffset, src_bo);
898
899	/* MAD temp1 / 2, const0.a * 2, temp0.ggg, -const0.rgb */
900	OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
901		      R200_TXC_ARG_A_TFACTOR_COLOR |
902		      R200_TXC_ARG_B_R0_COLOR |
903		      R200_TXC_ARG_C_TFACTOR_COLOR |
904		      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
905		      R200_TXC_OP_DOT2_ADD);
906	OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
907		      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
908		      R200_TXC_SCALE_INV2 |
909		      (R200_TXC_REPL_GREEN << R200_TXC_REPL_ARG_B_SHIFT) |
910		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
911	OUT_ACCEL_REG(R200_PP_TXABLEND_0,
912		      R200_TXA_ARG_A_ZERO |
913		      R200_TXA_ARG_B_ZERO |
914		      R200_TXA_ARG_C_ZERO |
915		      R200_TXA_OP_MADD);
916	OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
917		      R200_TXA_OUTPUT_REG_NONE);
918
919	/* MAD temp1, (const1 - 0.5) * 2, (temp0.rrr - 0.5) * 2, temp1 */
920	OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
921		      R200_TXC_ARG_A_TFACTOR_COLOR |
922		      R200_TXC_BIAS_ARG_A |
923		      R200_TXC_SCALE_ARG_A |
924		      R200_TXC_ARG_B_R0_COLOR |
925		      R200_TXC_BIAS_ARG_B |
926		      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
927		      R200_TXC_ARG_C_R1_COLOR |
928		      R200_TXC_OP_MADD);
929	OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
930		      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
931		      (R200_TXC_REPL_BLUE << R200_TXC_REPL_ARG_B_SHIFT) |
932		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
933	OUT_ACCEL_REG(R200_PP_TXABLEND_1,
934		      R200_TXA_ARG_A_ZERO |
935		      R200_TXA_ARG_B_ZERO |
936		      R200_TXA_ARG_C_ZERO |
937		      R200_TXA_OP_MADD);
938	OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
939		      R200_TXA_OUTPUT_REG_NONE);
940
941	/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp0.bbb - 0.5), temp1 */
942	OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
943		      R200_TXC_ARG_A_TFACTOR_COLOR |
944		      R200_TXC_BIAS_ARG_A |
945		      R200_TXC_SCALE_ARG_A |
946		      R200_TXC_ARG_B_R0_COLOR |
947		      R200_TXC_BIAS_ARG_B |
948		      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
949		      R200_TXC_ARG_C_R1_COLOR |
950		      R200_TXC_OP_MADD);
951	OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
952		      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
953		      R200_TXC_SCALE_2X |
954		      (R200_TXC_REPL_RED << R200_TXC_REPL_ARG_B_SHIFT) |
955		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
956	OUT_ACCEL_REG(R200_PP_TXABLEND_2,
957		      R200_TXA_ARG_A_ZERO |
958		      R200_TXA_ARG_B_ZERO |
959		      R200_TXA_ARG_C_ZERO |
960		      R200_TXA_COMP_ARG_C |
961		      R200_TXA_OP_MADD);
962	OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
963		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
964
965	/* shader constants */
966	OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
967						      yco > 1.0 ? yco - 1.0: yco,
968						      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
969						      0.0));
970	OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
971						      uco[1] * ucscale + 0.5, /* or [-2, 2] */
972						      uco[2] * ucscale + 0.5,
973						      0.0));
974	OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
975						      vco[1] * vcscale + 0.5, /* or [-4, 4] */
976						      vco[2] * vcscale + 0.5,
977						      0.0));
978
979	FINISH_ACCEL();
980    }
981
982    BEGIN_ACCEL(2);
983    OUT_ACCEL_REG(RADEON_RE_TOP_LEFT, 0);
984    OUT_ACCEL_REG(RADEON_RE_WIDTH_HEIGHT, ((scissor_w << RADEON_RE_WIDTH_SHIFT) |
985					   (scissor_h << RADEON_RE_HEIGHT_SHIFT)));
986    FINISH_ACCEL();
987
988    if (pPriv->vsync) {
989	xf86CrtcPtr crtc;
990	if (pPriv->desired_crtc)
991	    crtc = pPriv->desired_crtc;
992	else
993	    crtc = radeon_pick_best_crtc(pScrn,
994					 pPriv->drw_x,
995					 pPriv->drw_x + pPriv->dst_w,
996					 pPriv->drw_y,
997					 pPriv->drw_y + pPriv->dst_h);
998	if (crtc)
999	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
1000					  crtc,
1001					  pPriv->drw_y - crtc->y,
1002					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
1003    }
1004
1005    return TRUE;
1006}
1007
1008static void
1009FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
1010{
1011    RADEONInfoPtr info = RADEONPTR(pScrn);
1012    PixmapPtr pPixmap = pPriv->pPixmap;
1013    int dstxoff, dstyoff;
1014    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
1015    int nBox = REGION_NUM_RECTS(&pPriv->clip);
1016    ACCEL_PREAMBLE();
1017
1018#ifdef COMPOSITE
1019    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
1020    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
1021#else
1022    dstxoff = 0;
1023    dstyoff = 0;
1024#endif
1025
1026    if (!FUNC_NAME(R200PrepareTexturedVideo)(pScrn, pPriv))
1027	return;
1028
1029    /*
1030     * Rendering of the actual polygon is done in two different
1031     * ways depending on chip generation:
1032     *
1033     * < R300:
1034     *
1035     *     These chips can render a rectangle in one pass, so
1036     *     handling is pretty straight-forward.
1037     *
1038     * >= R300:
1039     *
1040     *     These chips can accept a quad, but will render it as
1041     *     two triangles which results in a diagonal tear. Instead
1042     *     We render a single, large triangle and use the scissor
1043     *     functionality to restrict it to the desired rectangle.
1044     *     Due to guardband limits on r3xx/r4xx, we can only use
1045     *     the single triangle up to 2560/4021 pixels; above that we
1046     *     render as a quad.
1047     */
1048
1049#ifdef ACCEL_CP
1050    while (nBox) {
1051	int draw_size = 3 * pPriv->vtx_count + 4;
1052	int loop_boxes;
1053
1054	if (draw_size > radeon_cs_space_remaining(pScrn)) {
1055	    if (info->cs)
1056		radeon_cs_flush_indirect(pScrn);
1057	    else
1058		RADEONCPFlushIndirect(pScrn, 1);
1059	    if (!FUNC_NAME(R200PrepareTexturedVideo)(pScrn, pPriv))
1060		return;
1061	}
1062	loop_boxes = MIN(radeon_cs_space_remaining(pScrn) / draw_size, nBox);
1063	nBox -= loop_boxes;
1064
1065	BEGIN_RING(loop_boxes * 3 * pPriv->vtx_count + 4);
1066	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
1067			    loop_boxes * 3 * pPriv->vtx_count));
1068	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
1069		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
1070		 ((loop_boxes * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
1071
1072	while (loop_boxes--) {
1073	    float srcX, srcY, srcw, srch;
1074	    int dstX, dstY, dstw, dsth;
1075	    dstX = pBox->x1 + dstxoff;
1076	    dstY = pBox->y1 + dstyoff;
1077	    dstw = pBox->x2 - pBox->x1;
1078	    dsth = pBox->y2 - pBox->y1;
1079
1080	    srcX = pPriv->src_x;
1081	    srcX += ((pBox->x1 - pPriv->drw_x) *
1082		     pPriv->src_w) / (float)pPriv->dst_w;
1083	    srcY = pPriv->src_y;
1084	    srcY += ((pBox->y1 - pPriv->drw_y) *
1085		     pPriv->src_h) / (float)pPriv->dst_h;
1086
1087	    srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
1088	    srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
1089
1090	    if (pPriv->is_planar) {
1091		/*
1092		 * Just render a rect (using three coords).
1093		 */
1094		VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
1095			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
1096			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
1097		VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
1098			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
1099			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
1100		VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
1101			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
1102			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
1103	    } else {
1104		/*
1105		 * Just render a rect (using three coords).
1106		 */
1107		VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
1108			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
1109		VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
1110			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
1111		VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
1112			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
1113	    }
1114
1115	    pBox++;
1116	}
1117
1118	OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
1119	ADVANCE_RING();
1120    }
1121#else /* ACCEL_CP */
1122    BEGIN_ACCEL(nBox * 3 * pPriv->vtx_count + 2);
1123    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
1124				      RADEON_VF_PRIM_WALK_DATA |
1125				      ((nBox * 3) << RADEON_VF_NUM_VERTICES_SHIFT)));
1126    while (nBox--) {
1127	float srcX, srcY, srcw, srch;
1128	int dstX, dstY, dstw, dsth;
1129	dstX = pBox->x1 + dstxoff;
1130	dstY = pBox->y1 + dstyoff;
1131	dstw = pBox->x2 - pBox->x1;
1132	dsth = pBox->y2 - pBox->y1;
1133
1134	srcX = pPriv->src_x;
1135	srcX += ((pBox->x1 - pPriv->drw_x) *
1136		 pPriv->src_w) / (float)pPriv->dst_w;
1137	srcY = pPriv->src_y;
1138	srcY += ((pBox->y1 - pPriv->drw_y) *
1139		 pPriv->src_h) / (float)pPriv->dst_h;
1140
1141	srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
1142	srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
1143
1144	if (pPriv->is_planar) {
1145	    /*
1146	     * Just render a rect (using three coords).
1147	     */
1148	    VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
1149		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
1150		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
1151	    VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
1152		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
1153		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
1154	    VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
1155		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
1156		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
1157	} else {
1158	    /*
1159	     * Just render a rect (using three coords).
1160	     */
1161	    VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
1162		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
1163	    VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
1164		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
1165	    VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
1166		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
1167	}
1168
1169	pBox++;
1170    }
1171
1172    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
1173    FINISH_ACCEL();
1174#endif /* !ACCEL_CP */
1175
1176    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
1177}
1178
1179static Bool
1180FUNC_NAME(R300PrepareTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
1181{
1182    RADEONInfoPtr info = RADEONPTR(pScrn);
1183    PixmapPtr pPixmap = pPriv->pPixmap;
1184    struct radeon_exa_pixmap_priv *driver_priv;
1185    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
1186    uint32_t txfilter, txformat0, txformat1, txoffset, txpitch;
1187    uint32_t dst_pitch, dst_format;
1188    uint32_t txenable, colorpitch, bicubic_offset;
1189    uint32_t output_fmt;
1190    int pixel_shift;
1191    ACCEL_PREAMBLE();
1192
1193#ifdef XF86DRM_MODE
1194    if (info->cs) {
1195	int ret;
1196
1197	radeon_cs_space_reset_bos(info->cs);
1198	radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
1199
1200	if (pPriv->bicubic_enabled)
1201	  radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
1202
1203	driver_priv = exaGetPixmapDriverPrivate(pPixmap);
1204	radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
1205
1206	ret = radeon_cs_space_check(info->cs);
1207	if (ret) {
1208	    ErrorF("Not enough RAM to hw accel xv operation\n");
1209	    return FALSE;
1210	}
1211    }
1212#else
1213    (void)src_bo;
1214#endif
1215
1216    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
1217
1218#ifdef USE_EXA
1219    if (info->useEXA) {
1220	dst_pitch = exaGetPixmapPitch(pPixmap);
1221    } else
1222#endif
1223    {
1224	dst_pitch = pPixmap->devKind;
1225    }
1226
1227#ifdef USE_EXA
1228    if (info->useEXA) {
1229	RADEON_SWITCH_TO_3D();
1230    } else
1231#endif
1232    {
1233	BEGIN_ACCEL(2);
1234	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
1235	/* We must wait for 3d to idle, in case source was just written as a dest. */
1236	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
1237		      RADEON_WAIT_HOST_IDLECLEAN |
1238		      RADEON_WAIT_2D_IDLECLEAN |
1239		      RADEON_WAIT_3D_IDLECLEAN |
1240		      RADEON_WAIT_DMA_GUI_IDLE);
1241	FINISH_ACCEL();
1242
1243	if (!info->accel_state->XInited3D)
1244	    RADEONInit3DEngine(pScrn);
1245    }
1246
1247    if (pPriv->bicubic_enabled)
1248	pPriv->vtx_count = 6;
1249    else
1250	pPriv->vtx_count = 4;
1251
1252    switch (pPixmap->drawable.bitsPerPixel) {
1253    case 16:
1254	if (pPixmap->drawable.depth == 15)
1255	    dst_format = R300_COLORFORMAT_ARGB1555;
1256	else
1257	    dst_format = R300_COLORFORMAT_RGB565;
1258	break;
1259    case 32:
1260	dst_format = R300_COLORFORMAT_ARGB8888;
1261	break;
1262    default:
1263	return FALSE;
1264    }
1265
1266    output_fmt = (R300_OUT_FMT_C4_8 |
1267		  R300_OUT_FMT_C0_SEL_BLUE |
1268		  R300_OUT_FMT_C1_SEL_GREEN |
1269		  R300_OUT_FMT_C2_SEL_RED |
1270		  R300_OUT_FMT_C3_SEL_ALPHA);
1271
1272    colorpitch = dst_pitch >> pixel_shift;
1273    colorpitch |= dst_format;
1274
1275    if (RADEONTilingEnabled(pScrn, pPixmap))
1276	colorpitch |= R300_COLORTILE;
1277
1278
1279    if (((pPriv->bicubic_state == BICUBIC_OFF)) &&
1280	(pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12))
1281	pPriv->is_planar = TRUE;
1282    else
1283	pPriv->is_planar = FALSE;
1284
1285    if (pPriv->is_planar) {
1286	txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
1287	txpitch = pPriv->src_pitch;
1288    } else {
1289	if (pPriv->id == FOURCC_UYVY)
1290	    txformat1 = R300_TX_FORMAT_YVYU422;
1291	else
1292	    txformat1 = R300_TX_FORMAT_VYUY422;
1293
1294	if (pPriv->bicubic_state != BICUBIC_OFF)
1295	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
1296
1297	/* pitch is in pixels */
1298	txpitch = pPriv->src_pitch / 2;
1299    }
1300    txpitch -= 1;
1301
1302    txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
1303		 (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
1304		 R300_TXPITCH_EN);
1305
1306    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
1307		R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
1308		R300_TX_MAG_FILTER_LINEAR |
1309		R300_TX_MIN_FILTER_LINEAR |
1310		(0 << R300_TX_ID_SHIFT));
1311
1312    txoffset = info->cs ? 0 : pPriv->src_offset;
1313
1314    BEGIN_ACCEL_RELOC(6, 1);
1315    OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter);
1316    OUT_ACCEL_REG(R300_TX_FILTER1_0, 0);
1317    OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0);
1318    if (pPriv->is_planar)
1319	OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1 | R300_TX_FORMAT_CACHE_HALF_REGION_0);
1320    else
1321	OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
1322    OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch);
1323    OUT_TEXTURE_REG(R300_TX_OFFSET_0, txoffset, src_bo);
1324    FINISH_ACCEL();
1325
1326    txenable = R300_TEX_0_ENABLE;
1327
1328    if (pPriv->is_planar) {
1329	txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
1330		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
1331		     R300_TXPITCH_EN);
1332	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
1333	txpitch -= 1;
1334	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
1335		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
1336		    R300_TX_MIN_FILTER_LINEAR |
1337		    R300_TX_MAG_FILTER_LINEAR);
1338
1339	BEGIN_ACCEL_RELOC(12, 2);
1340	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
1341	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
1342	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
1343	OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2);
1344	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
1345	OUT_TEXTURE_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset, src_bo);
1346	OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
1347	OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
1348	OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
1349	OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3);
1350	OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
1351	OUT_TEXTURE_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset, src_bo);
1352	FINISH_ACCEL();
1353	txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
1354    }
1355
1356    if (pPriv->bicubic_enabled) {
1357	/* Size is 128x1 */
1358	txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
1359		     (0x0 << R300_TXHEIGHT_SHIFT) |
1360		     R300_TXPITCH_EN);
1361	/* Format is 32-bit floats, 4bpp */
1362	txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
1363	/* Pitch is 127 (128-1) */
1364	txpitch = 0x7f;
1365	/* Tex filter */
1366	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
1367		    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
1368		    R300_TX_MIN_FILTER_NEAREST |
1369		    R300_TX_MAG_FILTER_NEAREST |
1370		    (1 << R300_TX_ID_SHIFT));
1371
1372	if (info->cs)
1373	    bicubic_offset = 0;
1374	else
1375	    bicubic_offset = pPriv->bicubic_src_offset;
1376
1377	BEGIN_ACCEL_RELOC(6, 1);
1378	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter);
1379	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
1380	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
1381	OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1);
1382	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
1383	OUT_TEXTURE_REG(R300_TX_OFFSET_1, bicubic_offset, info->bicubic_bo);
1384	FINISH_ACCEL();
1385
1386	/* Enable tex 1 */
1387	txenable |= R300_TEX_1_ENABLE;
1388    }
1389
1390    /* setup the VAP */
1391    if (info->accel_state->has_tcl) {
1392	if (pPriv->bicubic_enabled)
1393	    BEGIN_ACCEL(7);
1394	else
1395	    BEGIN_ACCEL(6);
1396    } else {
1397	if (pPriv->bicubic_enabled)
1398	    BEGIN_ACCEL(5);
1399	else
1400	    BEGIN_ACCEL(4);
1401    }
1402
1403    /* These registers define the number, type, and location of data submitted
1404     * to the PVS unit of GA input (when PVS is disabled)
1405     * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
     * enabled.  This memory provides the inputs to the vertex shader program
     * and ordering is not important.  When PVS/TCL is disabled, this field maps
     * directly to the GA input memory and the order is significant.  In
1409     * PVS_BYPASS mode the order is as follows:
1410     * Position
1411     * Point Size
1412     * Color 0-3
1413     * Textures 0-7
1414     * Fog
1415     */
1416    if (pPriv->bicubic_enabled) {
1417	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
1418		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
1419		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
1420		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
1421		       R300_SIGNED_0 |
1422		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
1423		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
1424		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
1425		       R300_SIGNED_1));
1426	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1,
1427		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
1428		       (0 << R300_SKIP_DWORDS_2_SHIFT) |
1429		       (7 << R300_DST_VEC_LOC_2_SHIFT) |
1430		       R300_LAST_VEC_2 |
1431		       R300_SIGNED_2));
1432    } else {
1433	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
1434		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
1435		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
1436		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
1437		       R300_SIGNED_0 |
1438		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
1439		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
1440		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
1441		       R300_LAST_VEC_1 |
1442		       R300_SIGNED_1));
1443    }
1444
1445    /* load the vertex shader
1446     * We pre-load vertex programs in RADEONInit3DEngine():
1447     * - exa
1448     * - Xv
1449     * - Xv bicubic
1450     * Here we select the offset of the vertex program we want to use
1451     */
1452    if (info->accel_state->has_tcl) {
1453	if (pPriv->bicubic_enabled) {
1454	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
1455			  ((11 << R300_PVS_FIRST_INST_SHIFT) |
1456			   (13 << R300_PVS_XYZW_VALID_INST_SHIFT) |
1457			   (13 << R300_PVS_LAST_INST_SHIFT)));
1458	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
1459			  (13 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
1460	} else {
1461	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
1462			  ((9 << R300_PVS_FIRST_INST_SHIFT) |
1463			   (10 << R300_PVS_XYZW_VALID_INST_SHIFT) |
1464			   (10 << R300_PVS_LAST_INST_SHIFT)));
1465	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
1466			  (10 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
1467	}
1468    }
1469
1470    /* Position and one set of 2 texture coordinates */
1471    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
1472    if (pPriv->bicubic_enabled)
1473	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
1474					       (2 << R300_TEX_1_COMP_CNT_SHIFT)));
1475    else
1476	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
1477
1478    OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt);
1479    FINISH_ACCEL();
1480
1481    /* setup pixel shader */
1482    if (pPriv->bicubic_state != BICUBIC_OFF) {
1483	if (pPriv->bicubic_enabled) {
1484	    BEGIN_ACCEL(79);
1485
1486	    /* 4 components: 2 for tex0 and 2 for tex1 */
1487	    OUT_ACCEL_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1488					  R300_RS_COUNT_HIRES_EN));
1489
1490	    /* R300_INST_COUNT_RS - highest RS instruction used */
1491	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1));
1492
1493	    /* Pixel stack frame size. */
1494	    OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
1495
1496	    /* Indirection levels */
1497	    OUT_ACCEL_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) |
1498					   R300_FIRST_TEX));
1499
1500	    /* Set nodes. */
1501	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1502						R300_ALU_CODE_SIZE(14) |
1503						R300_TEX_CODE_OFFSET(0) |
1504						R300_TEX_CODE_SIZE(6)));
1505
1506	    /* Nodes are allocated highest first, but executed lowest first */
1507	    OUT_ACCEL_REG(R300_US_CODE_ADDR_0, 0);
1508	    OUT_ACCEL_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) |
1509						R300_ALU_SIZE(0) |
1510						R300_TEX_START(0) |
1511						R300_TEX_SIZE(0)));
1512	    OUT_ACCEL_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) |
1513						R300_ALU_SIZE(9) |
1514						R300_TEX_START(1) |
1515						R300_TEX_SIZE(0)));
1516	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) |
1517						R300_ALU_SIZE(2) |
1518						R300_TEX_START(2) |
1519						R300_TEX_SIZE(3) |
1520						R300_RGBA_OUT));
1521
1522	    /* ** BICUBIC FP ** */
1523
1524	    /* texcoord0 => temp0
1525	     * texcoord1 => temp1 */
1526
1527	    // first node
1528	    /* TEX temp2, temp1.rrr0, tex1, 1D */
1529	    OUT_ACCEL_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) |
1530						R300_TEX_ID(1) |
1531						R300_TEX_SRC_ADDR(1) |
1532						R300_TEX_DST_ADDR(2)));
1533
1534	    /* MOV temp1.r, temp1.ggg0 */
1535	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1536						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1537						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1538						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1539	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) |
1540						    R300_ALU_RGB_ADDRD(1) |
1541						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
1542	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1543						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1544						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1545						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1546	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) |
1547						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1548
1549
1550	    // second node
1551	    /* TEX temp1, temp1, tex1, 1D */
1552	    OUT_ACCEL_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) |
1553						R300_TEX_ID(1) |
1554						R300_TEX_SRC_ADDR(1) |
1555						R300_TEX_DST_ADDR(1)));
1556
1557	    /* MUL temp3.rg, temp2.ggg0, const0.rgb0 */
1558	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1559						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1560						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1561						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1562	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) |
1563						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
1564						    R300_ALU_RGB_ADDRD(3) |
1565						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1566	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1567						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1568						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1569						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1570	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) |
1571						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1572
1573
1574	    /* MUL temp2.rg, temp2.rrr0, const0.rgb */
1575	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1576						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1577						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1578						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1579	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) |
1580						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
1581						    R300_ALU_RGB_ADDRD(2) |
1582						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1583	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1584						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1585						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1586						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1587	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) |
1588						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1589
1590	    /* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */
1591	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1592						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1593						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1594						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1595	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) |
1596						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1597						    R300_ALU_RGB_ADDR2(3) |
1598						    R300_ALU_RGB_ADDRD(4) |
1599						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1600	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1601						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1602						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1603						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1604	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) |
1605						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1606
1607	    /* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */
1608	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1609						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1610						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1611						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1612	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) |
1613						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1614						    R300_ALU_RGB_ADDR2(2) |
1615						    R300_ALU_RGB_ADDRD(5) |
1616						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1617	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1618						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1619						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1620						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1621	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) |
1622						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1623
1624	    /* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */
1625	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1626						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1627						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1628						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1629	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) |
1630						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1631						    R300_ALU_RGB_ADDR2(3) |
1632						    R300_ALU_RGB_ADDRD(3) |
1633						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1634	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1635						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1636						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1637						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1638	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) |
1639						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1640
1641	    /* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */
1642	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1643						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1644						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1645						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1646	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) |
1647						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1648						    R300_ALU_RGB_ADDR2(2) |
1649						    R300_ALU_RGB_ADDRD(1) |
1650						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1651	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1652						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1653						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1654						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1655	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) |
1656						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1657
1658	    /* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */
1659	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1660						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1661						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1662						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1663	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
1664						    R300_ALU_RGB_ADDR2(1) |
1665						    R300_ALU_RGB_ADDRD(1) |
1666						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1667	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1668						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1669						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1670						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1671	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) |
1672						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1673
1674	    /* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */
1675	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1676						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1677						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1678						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1679	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
1680						    R300_ALU_RGB_ADDR2(3) |
1681						    R300_ALU_RGB_ADDRD(2) |
1682						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1683	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1684						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1685						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1686						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1687	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) |
1688						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1689
1690	    /* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */
1691	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1692						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1693						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1694						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1695	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
1696						    R300_ALU_RGB_ADDR2(5) |
1697						    R300_ALU_RGB_ADDRD(3) |
1698						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1699	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1700						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1701						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1702						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1703	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) |
1704						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1705
1706	    /* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */
1707	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1708						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1709						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1710						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1711	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) |
1712						     R300_ALU_RGB_ADDR2(4) |
1713						     R300_ALU_RGB_ADDRD(0) |
1714						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1715	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1716						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1717						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1718						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1719	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) |
1720						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1721
1722
1723	    // third node
1724	    /* TEX temp4, temp1.rg--, tex0, 1D */
1725	    OUT_ACCEL_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) |
1726						R300_TEX_ID(0) |
1727						R300_TEX_SRC_ADDR(1) |
1728						R300_TEX_DST_ADDR(4)));
1729
1730	    /* TEX temp3, temp3.rg--, tex0, 1D */
1731	    OUT_ACCEL_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) |
1732						R300_TEX_ID(0) |
1733						R300_TEX_SRC_ADDR(3) |
1734						R300_TEX_DST_ADDR(3)));
1735
1736	    /* TEX temp5, temp2.rg--, tex0, 1D */
1737	    OUT_ACCEL_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) |
1738						R300_TEX_ID(0) |
1739						R300_TEX_SRC_ADDR(2) |
1740						R300_TEX_DST_ADDR(5)));
1741
1742	    /* TEX temp0, temp0.rg--, tex0, 1D */
1743	    OUT_ACCEL_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) |
1744						R300_TEX_ID(0) |
1745						R300_TEX_SRC_ADDR(0) |
1746						R300_TEX_DST_ADDR(0)));
1747
1748	    /* LRP temp3, temp1.bbbb, temp4, temp3 ->
1749	     * - PRESUB temps, temp4 - temp3
1750	     * - MAD temp3, temp1.bbbb, temps, temp3 */
1751	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1752						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1753						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1754						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1755						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
1756	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) |
1757						     R300_ALU_RGB_ADDR1(4) |
1758						     R300_ALU_RGB_ADDR2(1) |
1759						     R300_ALU_RGB_ADDRD(3) |
1760						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1761	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1762						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1763						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1764						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1765	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) |
1766						       R300_ALU_ALPHA_ADDR1(4) |
1767						       R300_ALU_ALPHA_ADDR2(1) |
1768						       R300_ALU_ALPHA_ADDRD(3) |
1769						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
1770
1771	    /* LRP temp0, temp1.bbbb, temp5, temp0 ->
1772	     * - PRESUB temps, temp5 - temp0
1773	     * - MAD temp0, temp1.bbbb, temps, temp0 */
1774	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1775						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1776						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1777						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1778						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) |
1779						     R300_ALU_RGB_INSERT_NOP));
1780	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) |
1781						     R300_ALU_RGB_ADDR1(5) |
1782						     R300_ALU_RGB_ADDR2(1) |
1783						     R300_ALU_RGB_ADDRD(0) |
1784						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1785	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1786						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1787						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1788						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1789	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) |
1790						       R300_ALU_ALPHA_ADDR1(5) |
1791						       R300_ALU_ALPHA_ADDR2(1) |
1792						       R300_ALU_ALPHA_ADDRD(0) |
1793						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
1794
1795	    /* LRP output, temp2.bbbb, temp3, temp0 ->
1796	     * - PRESUB temps, temp3 - temp0
1797	     * - MAD output, temp2.bbbb, temps, temp0 */
1798	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1799						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1800						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1801						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1802						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
1803	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) |
1804						     R300_ALU_RGB_ADDR1(3) |
1805						     R300_ALU_RGB_ADDR2(2) |
1806						     R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
1807	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1808						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1809						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1810						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1811	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) |
1812						       R300_ALU_ALPHA_ADDR1(3) |
1813						       R300_ALU_ALPHA_ADDR2(2) |
1814						       R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A)));
1815
1816	    /* Shader constants. */
1817	    OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w));
1818	    OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), 0);
1819	    OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), 0);
1820	    OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), 0);
1821
1822	    OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), 0);
1823	    OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h));
1824	    OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), 0);
1825	    OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), 0);
1826
1827	    FINISH_ACCEL();
1828	} else {
1829	    BEGIN_ACCEL(11);
1830	    /* 2 components: 2 for tex0 */
1831	    OUT_ACCEL_REG(R300_RS_COUNT,
1832                          ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1833                           R300_RS_COUNT_HIRES_EN));
1834	    /* R300_INST_COUNT_RS - highest RS instruction used */
1835	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
1836
1837	    OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
1838
1839	    /* Indirection levels */
1840	    OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
1841					   R300_FIRST_TEX));
1842
1843	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1844						R300_ALU_CODE_SIZE(1) |
1845						R300_TEX_CODE_OFFSET(0) |
1846						R300_TEX_CODE_SIZE(1)));
1847
1848	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
1849						R300_ALU_SIZE(0) |
1850						R300_TEX_START(0) |
1851						R300_TEX_SIZE(0) |
1852						R300_RGBA_OUT));
1853
1854	    /* tex inst */
1855	    OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
1856					       R300_TEX_DST_ADDR(0) |
1857					       R300_TEX_ID(0) |
1858					       R300_TEX_INST(R300_TEX_INST_LD)));
1859
1860	    /* ALU inst */
1861	    /* RGB */
1862	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) |
1863                                                   R300_ALU_RGB_ADDR1(0) |
1864                                                   R300_ALU_RGB_ADDR2(0) |
1865                                                   R300_ALU_RGB_ADDRD(0) |
1866                                                   R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R |
1867								       R300_ALU_RGB_MASK_G |
1868								       R300_ALU_RGB_MASK_B)) |
1869                                                   R300_ALU_RGB_TARGET_A));
1870	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1871                                                   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1872                                                   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1873						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1874                                                   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
1875                                                   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1876                                                   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1877                                                   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
1878                                                   R300_ALU_RGB_CLAMP));
1879	    /* Alpha */
1880	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) |
1881						     R300_ALU_ALPHA_ADDR1(0) |
1882						     R300_ALU_ALPHA_ADDR2(0) |
1883						     R300_ALU_ALPHA_ADDRD(0) |
1884						     R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
1885						     R300_ALU_ALPHA_TARGET_A |
1886						     R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE)));
1887	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) |
1888						     R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) |
1889						     R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) |
1890						     R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) |
1891						     R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) |
1892						     R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) |
1893						     R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1894						     R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) |
1895						     R300_ALU_ALPHA_CLAMP));
1896	    FINISH_ACCEL();
1897	}
1898    } else {
1899	/*
1900	 * y' = y - .0625
1901	 * u' = u - .5
1902	 * v' = v - .5;
1903	 *
1904	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
1905	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
1906	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
1907	 *
1908	 * DP3 might look like the straightforward solution
1909	 * but we'd need to move the texture yuv values in
1910	 * the same reg for this to work. Therefore use MADs.
1911	 * Brightness just adds to the off constant.
1912	 * Contrast is multiplication of luminance.
1913	 * Saturation and hue change the u and v coeffs.
1914	 * Default values (before adjustments - depend on colorspace):
1915	 * yco = 1.1643
1916	 * uco = 0, -0.39173, 2.017
1917	 * vco = 1.5958, -0.8129, 0
1918	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
1919	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
1920	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
1921	 *
1922	 * temp = MAD(yco, yuv.yyyy, off)
1923	 * temp = MAD(uco, yuv.uuuu, temp)
1924	 * result = MAD(vco, yuv.vvvv, temp)
1925	 */
1926	/* TODO: don't recalc consts always */
1927	const float Loff = -0.0627;
1928	const float Coff = -0.502;
1929	float uvcosf, uvsinf;
1930	float yco;
1931	float uco[3], vco[3], off[3];
1932	float bright, cont, gamma;
1933	int ref = pPriv->transform_index;
1934	Bool needgamma = FALSE;
1935
1936	cont = RTFContrast(pPriv->contrast);
1937	bright = RTFBrightness(pPriv->brightness);
1938	gamma = (float)pPriv->gamma / 1000.0;
1939	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
1940	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
1941	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
1942
1943	yco = trans[ref].RefLuma * cont;
1944	uco[0] = -trans[ref].RefRCr * uvsinf;
1945	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
1946	uco[2] = trans[ref].RefBCb * uvcosf;
1947	vco[0] = trans[ref].RefRCr * uvcosf;
1948	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
1949	vco[2] = trans[ref].RefBCb * uvsinf;
1950	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
1951	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
1952	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
1953
1954	if (gamma != 1.0) {
1955	    needgamma = TRUE;
1956	    /* note: gamma correction is out = in ^ gamma;
1957	       gpu can only do LG2/EX2 therefore we transform into
1958	       in ^ gamma = 2 ^ (log2(in) * gamma).
1959	       Lots of scalar ops, unfortunately (better solution?) -
1960	       without gamma that's 3 inst, with gamma it's 10...
1961	       could use different gamma factors per channel,
1962	       if that's of any use. */
1963	}
1964
1965	if (pPriv->is_planar) {
1966	    BEGIN_ACCEL(needgamma ? 28 + 33 : 33);
1967	    /* 2 components: same 2 for tex0/1/2 */
1968	    OUT_ACCEL_REG(R300_RS_COUNT,
1969			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1970			   R300_RS_COUNT_HIRES_EN));
1971	    /* R300_INST_COUNT_RS - highest RS instruction used */
1972	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
1973
1974	    OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
1975
1976	    /* Indirection levels */
1977	    OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
1978					   R300_FIRST_TEX));
1979
1980	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1981						R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
1982						R300_TEX_CODE_OFFSET(0) |
1983						R300_TEX_CODE_SIZE(3)));
1984
1985	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
1986						R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
1987						R300_TEX_START(0) |
1988						R300_TEX_SIZE(2) |
1989						R300_RGBA_OUT));
1990
1991	    /* tex inst */
1992	    OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
1993					       R300_TEX_DST_ADDR(2) |
1994					       R300_TEX_ID(0) |
1995					       R300_TEX_INST(R300_TEX_INST_LD)));
1996	    OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
1997					       R300_TEX_DST_ADDR(1) |
1998					       R300_TEX_ID(1) |
1999					       R300_TEX_INST(R300_TEX_INST_LD)));
2000	    OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
2001					       R300_TEX_DST_ADDR(0) |
2002					       R300_TEX_ID(2) |
2003					       R300_TEX_INST(R300_TEX_INST_LD)));
2004
2005	    /* ALU inst */
2006	    /* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
2007	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
2008						    R300_ALU_RGB_ADDR1(2) |
2009						    R300_ALU_RGB_ADDR2(0) |
2010						    R300_ALU_RGB_ADDRD(2) |
2011						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2012	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
2013						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2014						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
2015						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2016						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
2017						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2018						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2019						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2020	    /* alpha nop, but need to set up alpha source for rgb usage */
2021	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
2022						      R300_ALU_ALPHA_ADDR1(2) |
2023						      R300_ALU_ALPHA_ADDR2(0) |
2024						      R300_ALU_ALPHA_ADDRD(2) |
2025						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2026	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2027						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2028						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2029						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2030
2031	    /* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
2032	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
2033						    R300_ALU_RGB_ADDR1(1) |
2034						    R300_ALU_RGB_ADDR2(2) |
2035						    R300_ALU_RGB_ADDRD(2) |
2036						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2037	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2038						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2039						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
2040						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2041						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
2042						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2043						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2044						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2045	    /* alpha nop */
2046	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(2) |
2047						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2048	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2049						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2050						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2051						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2052
2053	    /* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
2054	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
2055						    R300_ALU_RGB_ADDR1(0) |
2056						    R300_ALU_RGB_ADDR2(2) |
2057						    R300_ALU_RGB_ADDRD(0) |
2058						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
2059						    (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
2060	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2061						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2062						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
2063						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2064						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
2065						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2066						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2067						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
2068						    R300_ALU_RGB_CLAMP));
2069	    /* write alpha 1 */
2070	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
2071						      R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
2072						      R300_ALU_ALPHA_TARGET_A));
2073	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2074						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2075						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2076						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
2077
2078	    if (needgamma) {
2079		/* rgb temp0.r = op_sop, set up src0 reg */
2080		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
2081							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
2082		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
2083			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2084			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2085		/* alpha lg2 temp0, temp0.r */
2086		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
2087							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2088		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2089							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2090							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2091							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2092
2093		/* rgb temp0.g = op_sop, set up src0 reg */
2094		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
2095							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
2096		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
2097			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2098			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2099		/* alpha lg2 temp0, temp0.g */
2100		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
2101							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2102		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2103							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2104							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2105							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2106
2107		/* rgb temp0.b = op_sop, set up src0 reg */
2108		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
2109							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
2110		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
2111			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2112			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2113		/* alpha lg2 temp0, temp0.b */
2114		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
2115							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2116		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2117							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2118							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2119							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2120
2121		/* MUL temp0.rgb, temp0.rgb, const1.aaa */
2122		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
2123							R300_ALU_RGB_ADDR1(0) |
2124							R300_ALU_RGB_ADDR2(0) |
2125							R300_ALU_RGB_ADDRD(0) |
2126							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2127		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2128							R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2129							R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
2130							R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2131							R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
2132							R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2133							R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2134							R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2135		/* alpha nop, but set up const1 */
2136		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
2137							  R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
2138							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2139		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2140							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2141							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2142							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2143
2144		/* rgb out0.r = op_sop, set up src0 reg */
2145		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
2146							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
2147							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
2148		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7),
2149			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2150			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2151		/* alpha ex2 temp0, temp0.r */
2152		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
2153							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2154		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2155							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2156							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2157							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2158
2159		/* rgb out0.g = op_sop, set up src0 reg */
2160		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
2161							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
2162							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
2163		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
2164			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2165			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2166		/* alpha ex2 temp0, temp0.g */
2167		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
2168							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2169		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2170							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2171							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2172							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2173
2174		/* rgb out0.b = op_sop, set up src0 reg */
2175		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
2176							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
2177							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
2178		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
2179			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2180			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2181		/* alpha ex2 temp0, temp0.b */
2182		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
2183							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2184		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2185							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2186							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2187							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2188	    }
2189	} else {
2190	    BEGIN_ACCEL(needgamma ? 28 + 31 : 31);
2191	    /* 2 components */
2192	    OUT_ACCEL_REG(R300_RS_COUNT,
2193			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
2194			   R300_RS_COUNT_HIRES_EN));
2195	    /* R300_INST_COUNT_RS - highest RS instruction used */
2196	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
2197
2198	    OUT_ACCEL_REG(R300_US_PIXSIZE, 1); /* highest temp used */
2199
2200	    /* Indirection levels */
2201	    OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
2202					   R300_FIRST_TEX));
2203
2204	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
2205						R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
2206						R300_TEX_CODE_OFFSET(0) |
2207						R300_TEX_CODE_SIZE(1)));
2208
2209	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
2210						R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
2211						R300_TEX_START(0) |
2212						R300_TEX_SIZE(0) |
2213						R300_RGBA_OUT));
2214
2215	    /* tex inst */
2216	    OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
2217					       R300_TEX_DST_ADDR(0) |
2218					       R300_TEX_ID(0) |
2219					       R300_TEX_INST(R300_TEX_INST_LD)));
2220
2221	    /* ALU inst */
2222	    /* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
2223	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
2224						    R300_ALU_RGB_ADDR1(0) |
2225						    R300_ALU_RGB_ADDR2(0) |
2226						    R300_ALU_RGB_ADDRD(1) |
2227						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2228	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
2229						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2230						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_GGG) |
2231						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2232						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
2233						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2234						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2235						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2236	    /* alpha nop, but need to set up alpha source for rgb usage */
2237	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
2238						      R300_ALU_ALPHA_ADDR1(0) |
2239						      R300_ALU_ALPHA_ADDR2(0) |
2240						      R300_ALU_ALPHA_ADDRD(0) |
2241						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2242	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2243						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2244						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2245						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2246
2247	    /* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
2248	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
2249						    R300_ALU_RGB_ADDR1(0) |
2250						    R300_ALU_RGB_ADDR2(1) |
2251						    R300_ALU_RGB_ADDRD(1) |
2252						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2253	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2254						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2255						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_BBB) |
2256						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2257						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
2258						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2259						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2260						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2261	    /* alpha nop */
2262	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
2263						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2264	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2265						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2266						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2267						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2268
2269	    /* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
2270	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
2271						    R300_ALU_RGB_ADDR1(0) |
2272						    R300_ALU_RGB_ADDR2(1) |
2273						    R300_ALU_RGB_ADDRD(0) |
2274						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
2275						    (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
2276	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2277						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2278						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RRR) |
2279						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2280						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
2281						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2282						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2283						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
2284						    R300_ALU_RGB_CLAMP));
2285	    /* write alpha 1 */
2286	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
2287						      R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
2288						      R300_ALU_ALPHA_TARGET_A));
2289	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2290						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2291						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2292						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
2293
2294	    if (needgamma) {
2295		/* rgb temp0.r = op_sop, set up src0 reg */
2296		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
2297							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
2298		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
2299			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2300			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2301		/* alpha lg2 temp0, temp0.r */
2302		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
2303							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2304		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2305							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2306							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2307							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2308
2309		/* rgb temp0.g = op_sop, set up src0 reg */
2310		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
2311							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
2312		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
2313			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2314			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2315		/* alpha lg2 temp0, temp0.g */
2316		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
2317							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2318		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2319							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2320							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2321							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2322
2323		/* rgb temp0.b = op_sop, set up src0 reg */
2324		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
2325							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
2326		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
2327			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2328			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2329		/* alpha lg2 temp0, temp0.b */
2330		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
2331							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2332		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2333							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2334							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2335							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2336
2337		/* MUL const1, temp1, temp0 */
2338		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
2339							R300_ALU_RGB_ADDR1(0) |
2340							R300_ALU_RGB_ADDR2(0) |
2341							R300_ALU_RGB_ADDRD(0) |
2342							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2343		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2344							R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2345							R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
2346							R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2347							R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
2348							R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2349							R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2350							R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2351		/* alpha nop, but set up const1 */
2352		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
2353							  R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
2354							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2355		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2356							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2357							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2358							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2359
2360		/* rgb out0.r = op_sop, set up src0 reg */
2361		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
2362							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
2363							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
2364		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7),
2365			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2366			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2367		/* alpha ex2 temp0, temp0.r */
2368		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
2369							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2370		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2371							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2372							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2373							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2374
2375		/* rgb out0.g = op_sop, set up src0 reg */
2376		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
2377							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
2378							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
2379		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
2380			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2381			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2382		/* alpha ex2 temp0, temp0.g */
2383		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
2384							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2385		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2386							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2387							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2388							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2389
2390		/* rgb out0.b = op_sop, set up src0 reg */
2391		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
2392							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
2393							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
2394		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
2395			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2396			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2397		/* alpha ex2 temp0, temp0.b */
2398		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
2399							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2400		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2401							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2402							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2403							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2404	    }
2405	}
2406
2407	/* Shader constants. */
2408	/* constant 0: off, yco */
2409	OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
2410	OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1]));
2411	OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2]));
2412	OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco));
2413	/* constant 1: uco */
2414	OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
2415	OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
2416	OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
2417	OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(gamma));
2418	/* constant 2: vco */
2419	OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
2420	OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
2421	OUT_ACCEL_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2]));
2422	OUT_ACCEL_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0));
2423
2424	FINISH_ACCEL();
2425    }
2426
2427    BEGIN_ACCEL_RELOC(6, 2);
2428    OUT_ACCEL_REG(R300_TX_INVALTAGS, 0);
2429    OUT_ACCEL_REG(R300_TX_ENABLE, txenable);
2430
2431    EMIT_WRITE_OFFSET(R300_RB3D_COLOROFFSET0, 0, pPixmap);
2432    EMIT_COLORPITCH(R300_RB3D_COLORPITCH0, colorpitch, pPixmap);
2433
2434    /* no need to enable blending */
2435    OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
2436
2437    OUT_ACCEL_REG(R300_VAP_VTX_SIZE, pPriv->vtx_count);
2438    FINISH_ACCEL();
2439
2440    if (pPriv->vsync) {
2441	xf86CrtcPtr crtc;
2442	if (pPriv->desired_crtc)
2443	    crtc = pPriv->desired_crtc;
2444	else
2445	    crtc = radeon_pick_best_crtc(pScrn,
2446					 pPriv->drw_x,
2447					 pPriv->drw_x + pPriv->dst_w,
2448					 pPriv->drw_y,
2449					 pPriv->drw_y + pPriv->dst_h);
2450	if (crtc)
2451	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
2452					  crtc,
2453					  pPriv->drw_y - crtc->y,
2454					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
2455    }
2456
2457    return TRUE;
2458}
2459
2460static void
2461FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
2462{
2463    RADEONInfoPtr info = RADEONPTR(pScrn);
2464    PixmapPtr pPixmap = pPriv->pPixmap;
2465    int dstxoff, dstyoff;
2466    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
2467    int nBox = REGION_NUM_RECTS(&pPriv->clip);
2468    ACCEL_PREAMBLE();
2469
2470#ifdef COMPOSITE
2471    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
2472    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
2473#else
2474    dstxoff = 0;
2475    dstyoff = 0;
2476#endif
2477
2478    if (!FUNC_NAME(R300PrepareTexturedVideo)(pScrn, pPriv))
2479	return;
2480
2481    /*
2482     * Rendering of the actual polygon is done in two different
2483     * ways depending on chip generation:
2484     *
2485     * < R300:
2486     *
2487     *     These chips can render a rectangle in one pass, so
2488     *     handling is pretty straight-forward.
2489     *
2490     * >= R300:
2491     *
2492     *     These chips can accept a quad, but will render it as
2493     *     two triangles which results in a diagonal tear. Instead
2494     *     We render a single, large triangle and use the scissor
2495     *     functionality to restrict it to the desired rectangle.
2496     *     Due to guardband limits on r3xx/r4xx, we can only use
2497     *     the single triangle up to 2560/4021 pixels; above that we
2498     *     render as a quad.
2499     */
2500
2501    while (nBox--) {
2502	float srcX, srcY, srcw, srch;
2503	int dstX, dstY, dstw, dsth;
2504	Bool use_quad = FALSE;
2505#ifdef ACCEL_CP
2506	int draw_size = 4 * pPriv->vtx_count + 4 + 2 + 3;
2507
2508	if (draw_size > radeon_cs_space_remaining(pScrn)) {
2509	    if (info->cs)
2510		radeon_cs_flush_indirect(pScrn);
2511	    else
2512		RADEONCPFlushIndirect(pScrn, 1);
2513	    if (!FUNC_NAME(R300PrepareTexturedVideo)(pScrn, pPriv))
2514		return;
2515	}
2516#endif
2517
2518	dstX = pBox->x1 + dstxoff;
2519	dstY = pBox->y1 + dstyoff;
2520	dstw = pBox->x2 - pBox->x1;
2521	dsth = pBox->y2 - pBox->y1;
2522
2523	srcX = pPriv->src_x;
2524	srcX += ((pBox->x1 - pPriv->drw_x) *
2525		 pPriv->src_w) / (float)pPriv->dst_w;
2526	srcY = pPriv->src_y;
2527	srcY += ((pBox->y1 - pPriv->drw_y) *
2528		 pPriv->src_h) / (float)pPriv->dst_h;
2529
2530	srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
2531	srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
2532
2533	if (IS_R400_3D) {
2534	    if ((dstw+dsth) > 4021)
2535		use_quad = TRUE;
2536	} else {
2537	    if ((dstw+dsth) > 2560)
2538		use_quad = TRUE;
2539	}
2540	/*
2541	 * Set up the scissor area to that of the output size.
2542	 */
2543	BEGIN_ACCEL(2);
2544	/* R300 has an offset */
2545	OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX + 1440) << R300_SCISSOR_X_SHIFT) |
2546					 ((dstY + 1440) << R300_SCISSOR_Y_SHIFT)));
2547	OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw + 1440 - 1) << R300_SCISSOR_X_SHIFT) |
2548					 ((dstY + dsth + 1440 - 1) << R300_SCISSOR_Y_SHIFT)));
2549	FINISH_ACCEL();
2550
2551#ifdef ACCEL_CP
2552	if (use_quad) {
2553	    BEGIN_RING(4 * pPriv->vtx_count + 4);
2554	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
2555				4 * pPriv->vtx_count));
2556	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_QUAD_LIST |
2557		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2558		     (4 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2559	} else {
2560	    BEGIN_RING(3 * pPriv->vtx_count + 4);
2561	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
2562				3 * pPriv->vtx_count));
2563	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
2564		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2565		     (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2566	}
2567#else /* ACCEL_CP */
2568	if (use_quad)
2569	    BEGIN_ACCEL(2 + pPriv->vtx_count * 4);
2570	else
2571	    BEGIN_ACCEL(2 + pPriv->vtx_count * 3);
2572
2573	if (use_quad)
2574	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_QUAD_LIST |
2575					      RADEON_VF_PRIM_WALK_DATA |
2576					      (4 << RADEON_VF_NUM_VERTICES_SHIFT)));
2577	else
2578	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST |
2579					      RADEON_VF_PRIM_WALK_DATA |
2580					      (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
2581#endif
2582	if (pPriv->bicubic_enabled) {
2583		/*
2584		 * This code is only executed on >= R300, so we don't
2585		 * have to deal with the legacy handling.
2586		 */
2587	    if (use_quad) {
2588		VTX_OUT_6((float)dstX,                     (float)dstY,
2589			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h,
2590			  (float)srcX + 0.5,               (float)srcY + 0.5);
2591		VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
2592			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
2593			  (float)srcX + 0.5,               (float)(srcY + srch) + 0.5);
2594		VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
2595			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
2596			  (float)(srcX + srcw) + 0.5,      (float)(srcY + srch) + 0.5);
2597		VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
2598			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
2599			  (float)(srcX + srcw) + 0.5,      (float)srcY + 0.5);
2600	    } else {
2601		VTX_OUT_6((float)dstX,                     (float)dstY,
2602			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h,
2603			  (float)srcX + 0.5,               (float)srcY + 0.5);
2604		VTX_OUT_6((float)dstX,                     (float)(dstY + dstw + dsth),
2605			  (float)srcX / pPriv->w,
2606			  ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h,
2607			  (float)srcX + 0.5,
2608			  (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
2609		VTX_OUT_6((float)(dstX + dstw + dsth),     (float)dstY,
2610			  ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
2611			  (float)srcY / pPriv->h,
2612			  (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
2613			  (float)srcY + 0.5);
2614	    }
2615	} else {
2616	    if (use_quad) {
2617		VTX_OUT_4((float)dstX,                     (float)dstY,
2618			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h);
2619		VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
2620			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
2621		VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
2622			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
2623		VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
2624			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
2625	    } else {
2626		/*
2627		 * Render a big, scissored triangle. This means
2628		 * increasing the triangle size and adjusting
2629		 * texture coordinates.
2630		 */
2631		VTX_OUT_4((float)dstX,                 (float)dstY,
2632			  (float)srcX / pPriv->w,      (float)srcY / pPriv->h);
2633		VTX_OUT_4((float)dstX,                 (float)(dstY + dsth + dstw),
2634			  (float)srcX / pPriv->w,
2635			  ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h);
2636		VTX_OUT_4((float)(dstX + dstw + dsth), (float)dstY,
2637			  ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
2638			  (float)srcY / pPriv->h);
2639	    }
2640	}
2641
2642	/* flushing is pipelined, free/finish is not */
2643	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
2644
2645#ifdef ACCEL_CP
2646	ADVANCE_RING();
2647#else
2648	FINISH_ACCEL();
2649#endif /* !ACCEL_CP */
2650
2651	pBox++;
2652    }
2653
2654    BEGIN_ACCEL(3);
2655    OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA);
2656    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
2657    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
2658    FINISH_ACCEL();
2659
2660    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
2661}
2662
2663static Bool
2664FUNC_NAME(R500PrepareTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
2665{
2666    RADEONInfoPtr info = RADEONPTR(pScrn);
2667    PixmapPtr pPixmap = pPriv->pPixmap;
2668    struct radeon_exa_pixmap_priv *driver_priv;
2669    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
2670    uint32_t txfilter, txformat0, txformat1, txoffset, txpitch, us_format = 0;
2671    uint32_t dst_pitch, dst_format;
2672    uint32_t txenable, colorpitch, bicubic_offset;
2673    uint32_t output_fmt;
2674    int pixel_shift, out_size = 6;
2675    ACCEL_PREAMBLE();
2676
2677#ifdef XF86DRM_MODE
2678    if (info->cs) {
2679	int ret;
2680
2681	radeon_cs_space_reset_bos(info->cs);
2682	radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
2683
2684	if (pPriv->bicubic_enabled)
2685	    radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
2686
2687	driver_priv = exaGetPixmapDriverPrivate(pPixmap);
2688	radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
2689
2690	ret = radeon_cs_space_check(info->cs);
2691	if (ret) {
2692	    ErrorF("Not enough RAM to hw accel xv operation\n");
2693	    return FALSE;
2694	}
2695    }
2696#else
2697    (void)src_bo;
2698#endif
2699
2700    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
2701
2702#ifdef USE_EXA
2703    if (info->useEXA) {
2704	dst_pitch = exaGetPixmapPitch(pPixmap);
2705    } else
2706#endif
2707    {
2708	dst_pitch = pPixmap->devKind;
2709    }
2710
2711#ifdef USE_EXA
2712    if (info->useEXA) {
2713	RADEON_SWITCH_TO_3D();
2714    } else
2715#endif
2716    {
2717	BEGIN_ACCEL(2);
2718	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
2719	/* We must wait for 3d to idle, in case source was just written as a dest. */
2720	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
2721		      RADEON_WAIT_HOST_IDLECLEAN |
2722		      RADEON_WAIT_2D_IDLECLEAN |
2723		      RADEON_WAIT_3D_IDLECLEAN |
2724		      RADEON_WAIT_DMA_GUI_IDLE);
2725	FINISH_ACCEL();
2726
2727	if (!info->accel_state->XInited3D)
2728	    RADEONInit3DEngine(pScrn);
2729    }
2730
2731    if (pPriv->bicubic_enabled)
2732	pPriv->vtx_count = 6;
2733    else
2734	pPriv->vtx_count = 4;
2735
2736    switch (pPixmap->drawable.bitsPerPixel) {
2737    case 16:
2738	if (pPixmap->drawable.depth == 15)
2739	    dst_format = R300_COLORFORMAT_ARGB1555;
2740	else
2741	    dst_format = R300_COLORFORMAT_RGB565;
2742	break;
2743    case 32:
2744	dst_format = R300_COLORFORMAT_ARGB8888;
2745	break;
2746    default:
2747	return FALSE;
2748    }
2749
2750    output_fmt = (R300_OUT_FMT_C4_8 |
2751		  R300_OUT_FMT_C0_SEL_BLUE |
2752		  R300_OUT_FMT_C1_SEL_GREEN |
2753		  R300_OUT_FMT_C2_SEL_RED |
2754		  R300_OUT_FMT_C3_SEL_ALPHA);
2755
2756    colorpitch = dst_pitch >> pixel_shift;
2757    colorpitch |= dst_format;
2758
2759    if (RADEONTilingEnabled(pScrn, pPixmap))
2760	colorpitch |= R300_COLORTILE;
2761
2762    if (((pPriv->bicubic_state == BICUBIC_OFF)) &&
2763        (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12))
2764	pPriv->is_planar = TRUE;
2765    else
2766	pPriv->is_planar = FALSE;
2767
2768    if (pPriv->is_planar) {
2769	txformat1 = R300_TX_FORMAT_X8;
2770	txpitch = pPriv->src_pitch;
2771    } else {
2772	if (pPriv->id == FOURCC_UYVY)
2773	    txformat1 = R300_TX_FORMAT_YVYU422;
2774	else
2775	    txformat1 = R300_TX_FORMAT_VYUY422;
2776
2777	if (pPriv->bicubic_state != BICUBIC_OFF)
2778	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
2779
2780	/* pitch is in pixels */
2781	txpitch = pPriv->src_pitch / 2;
2782    }
2783    txpitch -= 1;
2784
2785    txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
2786		 (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
2787		 R300_TXPITCH_EN);
2788
2789    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
2790		R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
2791		R300_TX_MAG_FILTER_LINEAR |
2792		R300_TX_MIN_FILTER_LINEAR |
2793		(0 << R300_TX_ID_SHIFT));
2794
2795
2796    if ((pPriv->w - 1) & 0x800)
2797	txpitch |= R500_TXWIDTH_11;
2798
2799    if ((pPriv->h - 1) & 0x800)
2800	txpitch |= R500_TXHEIGHT_11;
2801
2802    if (info->ChipFamily == CHIP_FAMILY_R520) {
2803	unsigned us_width = (pPriv->w - 1) & 0x7ff;
2804	unsigned us_height = (pPriv->h - 1) & 0x7ff;
2805	unsigned us_depth = 0;
2806
2807	if (pPriv->w > 2048) {
2808	    us_width = (0x7ff + us_width) >> 1;
2809	    us_depth |= 0x0d;
2810	}
2811	if (pPriv->h > 2048) {
2812	    us_height = (0x7ff + us_height) >> 1;
2813	    us_depth |= 0x0e;
2814	}
2815	us_format = (us_width << R300_TXWIDTH_SHIFT) |
2816		    (us_height << R300_TXHEIGHT_SHIFT) |
2817		    (us_depth << R300_TXDEPTH_SHIFT);
2818	out_size++;
2819    }
2820
2821    txoffset = info->cs ? 0 : pPriv->src_offset;
2822
2823    BEGIN_ACCEL_RELOC(out_size, 1);
2824    OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter);
2825    OUT_ACCEL_REG(R300_TX_FILTER1_0, 0);
2826    OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0);
2827    OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
2828    OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch);
2829    OUT_TEXTURE_REG(R300_TX_OFFSET_0, txoffset, src_bo);
2830    if (info->ChipFamily == CHIP_FAMILY_R520)
2831	OUT_ACCEL_REG(R500_US_FORMAT0_0, us_format);
2832    FINISH_ACCEL();
2833
2834    txenable = R300_TEX_0_ENABLE;
2835
2836    if (pPriv->is_planar) {
2837	txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
2838		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
2839		     R300_TXPITCH_EN);
2840	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
2841	txpitch -= 1;
2842	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
2843		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
2844		    R300_TX_MIN_FILTER_LINEAR |
2845		    R300_TX_MAG_FILTER_LINEAR);
2846
2847	BEGIN_ACCEL_RELOC(12, 2);
2848	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
2849	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
2850	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
2851	OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8);
2852	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
2853	OUT_TEXTURE_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset, src_bo);
2854	OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
2855	OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
2856	OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
2857	OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8);
2858	OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
2859	OUT_TEXTURE_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset, src_bo);
2860	FINISH_ACCEL();
2861	txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
2862    }
2863
2864    if (pPriv->bicubic_enabled) {
2865	/* Size is 128x1 */
2866	txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
2867		     (0x0 << R300_TXHEIGHT_SHIFT) |
2868		     R300_TXPITCH_EN);
2869	/* Format is 32-bit floats, 4bpp */
2870	txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
2871	/* Pitch is 127 (128-1) */
2872	txpitch = 0x7f;
2873	/* Tex filter */
2874	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
2875		    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
2876		    R300_TX_MIN_FILTER_NEAREST |
2877		    R300_TX_MAG_FILTER_NEAREST |
2878		    (1 << R300_TX_ID_SHIFT));
2879
2880	if (info->cs)
2881	    bicubic_offset = 0;
2882	else
2883	    bicubic_offset = pPriv->bicubic_src_offset;
2884
2885	BEGIN_ACCEL_RELOC(6, 1);
2886	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter);
2887	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
2888	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
2889	OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1);
2890	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
2891	OUT_TEXTURE_REG(R300_TX_OFFSET_1, bicubic_offset, info->bicubic_bo);
2892	FINISH_ACCEL();
2893
2894	/* Enable tex 1 */
2895	txenable |= R300_TEX_1_ENABLE;
2896    }
2897
2898    /* setup the VAP */
2899    if (info->accel_state->has_tcl) {
2900	if (pPriv->bicubic_enabled)
2901	    BEGIN_ACCEL(7);
2902	else
2903	    BEGIN_ACCEL(6);
2904    } else {
2905	if (pPriv->bicubic_enabled)
2906	    BEGIN_ACCEL(5);
2907	else
2908	    BEGIN_ACCEL(4);
2909    }
2910
2911    /* These registers define the number, type, and location of data submitted
2912     * to the PVS unit of GA input (when PVS is disabled)
2913     * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
2914     * enabled.  This memory provides the imputs to the vertex shader program
2915     * and ordering is not important.  When PVS/TCL is disabled, this field maps
2916     * directly to the GA input memory and the order is signifigant.  In
2917     * PVS_BYPASS mode the order is as follows:
2918     * Position
2919     * Point Size
2920     * Color 0-3
2921     * Textures 0-7
2922     * Fog
2923     */
2924    if (pPriv->bicubic_enabled) {
2925	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
2926		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
2927		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
2928		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
2929		       R300_SIGNED_0 |
2930		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
2931		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
2932		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
2933		       R300_SIGNED_1));
2934	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1,
2935		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
2936		       (0 << R300_SKIP_DWORDS_2_SHIFT) |
2937		       (7 << R300_DST_VEC_LOC_2_SHIFT) |
2938		       R300_LAST_VEC_2 |
2939		       R300_SIGNED_2));
2940    } else {
2941	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
2942		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
2943		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
2944		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
2945		       R300_SIGNED_0 |
2946		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
2947		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
2948		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
2949		       R300_LAST_VEC_1 |
2950		       R300_SIGNED_1));
2951    }
2952
2953    /* load the vertex shader
2954     * We pre-load vertex programs in RADEONInit3DEngine():
2955     * - exa
2956     * - Xv
2957     * - Xv bicubic
2958     * Here we select the offset of the vertex program we want to use
2959     */
2960    if (info->accel_state->has_tcl) {
2961	if (pPriv->bicubic_enabled) {
2962	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
2963			  ((11 << R300_PVS_FIRST_INST_SHIFT) |
2964			   (13 << R300_PVS_XYZW_VALID_INST_SHIFT) |
2965			   (13 << R300_PVS_LAST_INST_SHIFT)));
2966	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
2967			  (13 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
2968	} else {
2969	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
2970			  ((9 << R300_PVS_FIRST_INST_SHIFT) |
2971			   (10 << R300_PVS_XYZW_VALID_INST_SHIFT) |
2972			   (10 << R300_PVS_LAST_INST_SHIFT)));
2973	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
2974			  (10 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
2975	}
2976    }
2977
2978    /* Position and one set of 2 texture coordinates */
2979    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
2980    if (pPriv->bicubic_enabled)
2981	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
2982					       (2 << R300_TEX_1_COMP_CNT_SHIFT)));
2983    else
2984	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
2985
2986    OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt);
2987    FINISH_ACCEL();
2988
2989    /* setup pixel shader */
2990    if (pPriv->bicubic_state != BICUBIC_OFF) {
2991	if (pPriv->bicubic_enabled) {
2992	    BEGIN_ACCEL(7);
2993
2994	    /* 4 components: 2 for tex0 and 2 for tex1 */
2995	    OUT_ACCEL_REG(R300_RS_COUNT,
2996			  ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
2997			   R300_RS_COUNT_HIRES_EN));
2998
2999	    /* R300_INST_COUNT_RS - highest RS instruction used */
3000	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1));
3001
3002	    /* Pixel stack frame size. */
3003	    OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
3004
3005	    /* FP length. */
3006	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3007					      R500_US_CODE_END_ADDR(13)));
3008	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3009					       R500_US_CODE_RANGE_SIZE(13)));
3010
3011	    /* Prepare for FP emission. */
3012	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
3013	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3014	    FINISH_ACCEL();
3015
3016	    BEGIN_ACCEL(89);
3017	    /* Pixel shader.
3018	     * I've gone ahead and annotated each instruction, since this
3019	     * thing is MASSIVE. :3
3020	     * Note: In order to avoid buggies with temps and multiple
3021	     * inputs, all temps are offset by 2. temp0 -> register2. */
3022
3023	    /* TEX temp2, input1.xxxx, tex1, 1D */
3024	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3025						   R500_INST_RGB_WMASK_R |
3026						   R500_INST_RGB_WMASK_G |
3027						   R500_INST_RGB_WMASK_B));
3028	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
3029						   R500_TEX_INST_LD |
3030						   R500_TEX_IGNORE_UNCOVERED));
3031	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
3032						   R500_TEX_SRC_S_SWIZ_R |
3033						   R500_TEX_SRC_T_SWIZ_R |
3034						   R500_TEX_SRC_R_SWIZ_R |
3035						   R500_TEX_SRC_Q_SWIZ_R |
3036						   R500_TEX_DST_ADDR(2) |
3037						   R500_TEX_DST_R_SWIZ_R |
3038						   R500_TEX_DST_G_SWIZ_G |
3039						   R500_TEX_DST_B_SWIZ_B |
3040						   R500_TEX_DST_A_SWIZ_A));
3041	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3042	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3043	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3044
3045	    /* TEX temp5, input1.yyyy, tex1, 1D */
3046	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3047						   R500_INST_TEX_SEM_WAIT |
3048						   R500_INST_RGB_WMASK_R |
3049						   R500_INST_RGB_WMASK_G |
3050						   R500_INST_RGB_WMASK_B));
3051	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
3052						   R500_TEX_INST_LD |
3053						   R500_TEX_SEM_ACQUIRE |
3054						   R500_TEX_IGNORE_UNCOVERED));
3055	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
3056						   R500_TEX_SRC_S_SWIZ_G |
3057						   R500_TEX_SRC_T_SWIZ_G |
3058						   R500_TEX_SRC_R_SWIZ_G |
3059						   R500_TEX_SRC_Q_SWIZ_G |
3060						   R500_TEX_DST_ADDR(5) |
3061						   R500_TEX_DST_R_SWIZ_R |
3062						   R500_TEX_DST_G_SWIZ_G |
3063						   R500_TEX_DST_B_SWIZ_B |
3064						   R500_TEX_DST_A_SWIZ_A));
3065	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3066	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3067	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3068
3069	    /* MUL temp4, const0.x0x0, temp2.yyxx */
3070	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3071						   R500_INST_TEX_SEM_WAIT |
3072						   R500_INST_RGB_WMASK_R |
3073						   R500_INST_RGB_WMASK_G |
3074						   R500_INST_RGB_WMASK_B |
3075						   R500_INST_ALPHA_WMASK));
3076	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3077						   R500_RGB_ADDR0_CONST |
3078						   R500_RGB_ADDR1(2)));
3079	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3080						   R500_ALPHA_ADDR0_CONST |
3081						   R500_ALPHA_ADDR1(2)));
3082	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3083						   R500_ALU_RGB_R_SWIZ_A_R |
3084						   R500_ALU_RGB_G_SWIZ_A_0 |
3085						   R500_ALU_RGB_B_SWIZ_A_R |
3086						   R500_ALU_RGB_SEL_B_SRC1 |
3087						   R500_ALU_RGB_R_SWIZ_B_G |
3088						   R500_ALU_RGB_G_SWIZ_B_G |
3089						   R500_ALU_RGB_B_SWIZ_B_R));
3090	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
3091						   R500_ALPHA_OP_MAD |
3092						   R500_ALPHA_SEL_A_SRC0 |
3093						   R500_ALPHA_SWIZ_A_0 |
3094						   R500_ALPHA_SEL_B_SRC1 |
3095						   R500_ALPHA_SWIZ_B_R));
3096	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
3097						   R500_ALU_RGBA_OP_MAD |
3098						   R500_ALU_RGBA_R_SWIZ_0 |
3099						   R500_ALU_RGBA_G_SWIZ_0 |
3100						   R500_ALU_RGBA_B_SWIZ_0 |
3101						   R500_ALU_RGBA_A_SWIZ_0));
3102
3103	    /* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */
3104	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3105						   R500_INST_RGB_WMASK_R |
3106						   R500_INST_RGB_WMASK_G |
3107						   R500_INST_RGB_WMASK_B |
3108						   R500_INST_ALPHA_WMASK));
3109	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3110						   R500_RGB_ADDR0_CONST |
3111						   R500_RGB_ADDR1(5) |
3112						   R500_RGB_ADDR2(4)));
3113	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3114						   R500_ALPHA_ADDR0_CONST |
3115						   R500_ALPHA_ADDR1(5) |
3116						   R500_ALPHA_ADDR2(4)));
3117	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3118						   R500_ALU_RGB_R_SWIZ_A_0 |
3119						   R500_ALU_RGB_G_SWIZ_A_G |
3120						   R500_ALU_RGB_B_SWIZ_A_0 |
3121						   R500_ALU_RGB_SEL_B_SRC1 |
3122						   R500_ALU_RGB_R_SWIZ_B_R |
3123						   R500_ALU_RGB_G_SWIZ_B_R |
3124						   R500_ALU_RGB_B_SWIZ_B_R));
3125	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
3126						   R500_ALPHA_OP_MAD |
3127						   R500_ALPHA_SEL_A_SRC0 |
3128						   R500_ALPHA_SWIZ_A_G |
3129						   R500_ALPHA_SEL_B_SRC1 |
3130						   R500_ALPHA_SWIZ_B_R));
3131	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
3132						   R500_ALU_RGBA_OP_MAD |
3133						   R500_ALU_RGBA_SEL_C_SRC2 |
3134						   R500_ALU_RGBA_R_SWIZ_R |
3135						   R500_ALU_RGBA_G_SWIZ_G |
3136						   R500_ALU_RGBA_B_SWIZ_B |
3137						   R500_ALU_RGBA_A_SWIZ_A));
3138
3139	    /* ADD temp3, temp3, input0.xyxy */
3140	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3141						   R500_INST_RGB_WMASK_R |
3142						   R500_INST_RGB_WMASK_G |
3143						   R500_INST_RGB_WMASK_B |
3144						   R500_INST_ALPHA_WMASK));
3145	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) |
3146						   R500_RGB_ADDR2(0)));
3147	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) |
3148						   R500_ALPHA_ADDR2(0)));
3149	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
3150						   R500_ALU_RGB_G_SWIZ_A_1 |
3151						   R500_ALU_RGB_B_SWIZ_A_1 |
3152						   R500_ALU_RGB_SEL_B_SRC1 |
3153						   R500_ALU_RGB_R_SWIZ_B_R |
3154						   R500_ALU_RGB_G_SWIZ_B_G |
3155						   R500_ALU_RGB_B_SWIZ_B_B));
3156	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
3157						   R500_ALPHA_OP_MAD |
3158						   R500_ALPHA_SWIZ_A_1 |
3159						   R500_ALPHA_SEL_B_SRC1 |
3160						   R500_ALPHA_SWIZ_B_A));
3161	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
3162						   R500_ALU_RGBA_OP_MAD |
3163						   R500_ALU_RGBA_SEL_C_SRC2 |
3164						   R500_ALU_RGBA_R_SWIZ_R |
3165						   R500_ALU_RGBA_G_SWIZ_G |
3166						   R500_ALU_RGBA_B_SWIZ_R |
3167						   R500_ALU_RGBA_A_SWIZ_G));
3168
3169	    /* TEX temp1, temp3.zwxy, tex0, 2D */
3170	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3171						   R500_INST_RGB_WMASK_R |
3172						   R500_INST_RGB_WMASK_G |
3173						   R500_INST_RGB_WMASK_B |
3174						   R500_INST_ALPHA_WMASK));
3175	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3176						   R500_TEX_INST_LD |
3177						   R500_TEX_IGNORE_UNCOVERED));
3178	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
3179						   R500_TEX_SRC_S_SWIZ_B |
3180						   R500_TEX_SRC_T_SWIZ_A |
3181						   R500_TEX_SRC_R_SWIZ_R |
3182						   R500_TEX_SRC_Q_SWIZ_G |
3183						   R500_TEX_DST_ADDR(1) |
3184						   R500_TEX_DST_R_SWIZ_R |
3185						   R500_TEX_DST_G_SWIZ_G |
3186						   R500_TEX_DST_B_SWIZ_B |
3187						   R500_TEX_DST_A_SWIZ_A));
3188	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3189	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3190	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3191
3192	    /* TEX temp3, temp3.xyzw, tex0, 2D */
3193	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3194						   R500_INST_TEX_SEM_WAIT |
3195						   R500_INST_RGB_WMASK_R |
3196						   R500_INST_RGB_WMASK_G |
3197						   R500_INST_RGB_WMASK_B |
3198						   R500_INST_ALPHA_WMASK));
3199	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3200						   R500_TEX_INST_LD |
3201						   R500_TEX_SEM_ACQUIRE |
3202						   R500_TEX_IGNORE_UNCOVERED));
3203	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
3204						   R500_TEX_SRC_S_SWIZ_R |
3205						   R500_TEX_SRC_T_SWIZ_G |
3206						   R500_TEX_SRC_R_SWIZ_B |
3207						   R500_TEX_SRC_Q_SWIZ_A |
3208						   R500_TEX_DST_ADDR(3) |
3209						   R500_TEX_DST_R_SWIZ_R |
3210						   R500_TEX_DST_G_SWIZ_G |
3211						   R500_TEX_DST_B_SWIZ_B |
3212						   R500_TEX_DST_A_SWIZ_A));
3213	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3214	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3215	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3216
3217	    /* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */
3218	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3219						   R500_INST_RGB_WMASK_R |
3220						   R500_INST_RGB_WMASK_G |
3221						   R500_INST_RGB_WMASK_B |
3222						   R500_INST_ALPHA_WMASK));
3223	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3224						   R500_RGB_ADDR0_CONST |
3225						   R500_RGB_ADDR1(5) |
3226						   R500_RGB_ADDR2(4)));
3227	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3228						   R500_ALPHA_ADDR0_CONST |
3229						   R500_ALPHA_ADDR1(5) |
3230						   R500_ALPHA_ADDR2(4)));
3231	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3232						   R500_ALU_RGB_R_SWIZ_A_0 |
3233						   R500_ALU_RGB_G_SWIZ_A_G |
3234						   R500_ALU_RGB_B_SWIZ_A_0 |
3235						   R500_ALU_RGB_SEL_B_SRC1 |
3236						   R500_ALU_RGB_R_SWIZ_B_G |
3237						   R500_ALU_RGB_G_SWIZ_B_G |
3238						   R500_ALU_RGB_B_SWIZ_B_G));
3239	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
3240						   R500_ALPHA_OP_MAD |
3241						   R500_ALPHA_SEL_A_SRC0 |
3242						   R500_ALPHA_SWIZ_A_G |
3243						   R500_ALPHA_SEL_B_SRC1 |
3244						   R500_ALPHA_SWIZ_B_G));
3245	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
3246						   R500_ALU_RGBA_OP_MAD |
3247						   R500_ALU_RGBA_SEL_C_SRC2 |
3248						   R500_ALU_RGBA_R_SWIZ_R |
3249						   R500_ALU_RGBA_G_SWIZ_G |
3250						   R500_ALU_RGBA_B_SWIZ_B |
3251						   R500_ALU_RGBA_A_SWIZ_A));
3252
3253	    /* ADD temp0, temp4, input0.xyxy */
3254	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3255						   R500_INST_RGB_WMASK_R |
3256						   R500_INST_RGB_WMASK_G |
3257						   R500_INST_RGB_WMASK_B |
3258						   R500_INST_ALPHA_WMASK));
3259	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) |
3260						   R500_RGB_ADDR2(0)));
3261	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) |
3262						   R500_ALPHA_ADDR2(0)));
3263	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
3264						   R500_ALU_RGB_G_SWIZ_A_1 |
3265						   R500_ALU_RGB_B_SWIZ_A_1 |
3266						   R500_ALU_RGB_SEL_B_SRC1 |
3267						   R500_ALU_RGB_R_SWIZ_B_R |
3268						   R500_ALU_RGB_G_SWIZ_B_G |
3269						   R500_ALU_RGB_B_SWIZ_B_B));
3270	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
3271						   R500_ALPHA_OP_MAD |
3272						   R500_ALPHA_SWIZ_A_1 |
3273						   R500_ALPHA_SEL_B_SRC1 |
3274						   R500_ALPHA_SWIZ_B_A));
3275	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
3276						   R500_ALU_RGBA_OP_MAD |
3277						   R500_ALU_RGBA_SEL_C_SRC2 |
3278						   R500_ALU_RGBA_R_SWIZ_R |
3279						   R500_ALU_RGBA_G_SWIZ_G |
3280						   R500_ALU_RGBA_B_SWIZ_R |
3281						   R500_ALU_RGBA_A_SWIZ_G));
3282
3283	    /* TEX temp4, temp0.zwzw, tex0, 2D */
3284	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3285						   R500_INST_TEX_SEM_WAIT |
3286						   R500_INST_RGB_WMASK_R |
3287						   R500_INST_RGB_WMASK_G |
3288						   R500_INST_RGB_WMASK_B |
3289						   R500_INST_ALPHA_WMASK));
3290	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3291						   R500_TEX_INST_LD |
3292						   R500_TEX_IGNORE_UNCOVERED));
3293	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3294						   R500_TEX_SRC_S_SWIZ_B |
3295						   R500_TEX_SRC_T_SWIZ_A |
3296						   R500_TEX_SRC_R_SWIZ_B |
3297						   R500_TEX_SRC_Q_SWIZ_A |
3298						   R500_TEX_DST_ADDR(4) |
3299						   R500_TEX_DST_R_SWIZ_R |
3300						   R500_TEX_DST_G_SWIZ_G |
3301						   R500_TEX_DST_B_SWIZ_B |
3302						   R500_TEX_DST_A_SWIZ_A));
3303	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3304	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3305	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3306
3307	    /* TEX temp0, temp0.xyzw, tex0, 2D */
3308	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3309						   R500_INST_TEX_SEM_WAIT |
3310						   R500_INST_RGB_WMASK_R |
3311						   R500_INST_RGB_WMASK_G |
3312						   R500_INST_RGB_WMASK_B |
3313						   R500_INST_ALPHA_WMASK));
3314	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3315						   R500_TEX_INST_LD |
3316						   R500_TEX_SEM_ACQUIRE |
3317						   R500_TEX_IGNORE_UNCOVERED));
3318	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3319						   R500_TEX_SRC_S_SWIZ_R |
3320						   R500_TEX_SRC_T_SWIZ_G |
3321						   R500_TEX_SRC_R_SWIZ_B |
3322						   R500_TEX_SRC_Q_SWIZ_A |
3323						   R500_TEX_DST_ADDR(0) |
3324						   R500_TEX_DST_R_SWIZ_R |
3325						   R500_TEX_DST_G_SWIZ_G |
3326						   R500_TEX_DST_B_SWIZ_B |
3327						   R500_TEX_DST_A_SWIZ_A));
3328	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3329	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3330	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3331
3332	    /* LRP temp3, temp2.zzzz, temp1, temp3 ->
3333	     * - PRESUB temps, temp1 - temp3
3334	     * - MAD temp2.zzzz, temps, temp3 */
3335	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3336						   R500_INST_RGB_WMASK_R |
3337						   R500_INST_RGB_WMASK_G |
3338						   R500_INST_RGB_WMASK_B |
3339						   R500_INST_ALPHA_WMASK));
3340	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) |
3341						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3342						   R500_RGB_ADDR1(1) |
3343						   R500_RGB_ADDR2(2)));
3344	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) |
3345						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3346						   R500_ALPHA_ADDR1(1) |
3347						   R500_ALPHA_ADDR2(2)));
3348	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3349						   R500_ALU_RGB_R_SWIZ_A_B |
3350						   R500_ALU_RGB_G_SWIZ_A_B |
3351						   R500_ALU_RGB_B_SWIZ_A_B |
3352						   R500_ALU_RGB_SEL_B_SRCP |
3353						   R500_ALU_RGB_R_SWIZ_B_R |
3354						   R500_ALU_RGB_G_SWIZ_B_G |
3355						   R500_ALU_RGB_B_SWIZ_B_B));
3356	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
3357						   R500_ALPHA_OP_MAD |
3358						   R500_ALPHA_SEL_A_SRC2 |
3359						   R500_ALPHA_SWIZ_A_B |
3360						   R500_ALPHA_SEL_B_SRCP |
3361						   R500_ALPHA_SWIZ_B_A));
3362	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
3363						   R500_ALU_RGBA_OP_MAD |
3364						   R500_ALU_RGBA_SEL_C_SRC0 |
3365						   R500_ALU_RGBA_R_SWIZ_R |
3366						   R500_ALU_RGBA_G_SWIZ_G |
3367						   R500_ALU_RGBA_B_SWIZ_B |
3368						   R500_ALU_RGBA_A_SWIZ_A));
3369
3370	    /* LRP temp0, temp2.zzzz, temp4, temp0 ->
3371	     * - PRESUB temps, temp4 - temp0
3372	     * - MAD temp2.zzzz, temps, temp0 */
3373	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3374						   R500_INST_TEX_SEM_WAIT |
3375						   R500_INST_RGB_WMASK_R |
3376						   R500_INST_RGB_WMASK_G |
3377						   R500_INST_RGB_WMASK_B |
3378						   R500_INST_ALPHA_WMASK));
3379	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3380						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3381						   R500_RGB_ADDR1(4) |
3382						   R500_RGB_ADDR2(2)));
3383	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3384						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3385						   R500_ALPHA_ADDR1(4) |
3386						   R500_ALPHA_ADDR2(2)));
3387	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3388						   R500_ALU_RGB_R_SWIZ_A_B |
3389						   R500_ALU_RGB_G_SWIZ_A_B |
3390						   R500_ALU_RGB_B_SWIZ_A_B |
3391						   R500_ALU_RGB_SEL_B_SRCP |
3392						   R500_ALU_RGB_R_SWIZ_B_R |
3393						   R500_ALU_RGB_G_SWIZ_B_G |
3394						   R500_ALU_RGB_B_SWIZ_B_B));
3395	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
3396						   R500_ALPHA_OP_MAD |
3397						   R500_ALPHA_SEL_A_SRC2 |
3398						   R500_ALPHA_SWIZ_A_B |
3399						   R500_ALPHA_SEL_B_SRCP |
3400						   R500_ALPHA_SWIZ_B_A));
3401	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
3402						   R500_ALU_RGBA_OP_MAD |
3403						   R500_ALU_RGBA_SEL_C_SRC0 |
3404						   R500_ALU_RGBA_R_SWIZ_R |
3405						   R500_ALU_RGBA_G_SWIZ_G |
3406						   R500_ALU_RGBA_B_SWIZ_B |
3407						   R500_ALU_RGBA_A_SWIZ_A));
3408
3409	    /* LRP output, temp5.zzzz, temp3, temp0 ->
3410	     * - PRESUB temps, temp3 - temp0
3411	     * - MAD temp5.zzzz, temps, temp0 */
3412	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3413						   R500_INST_LAST |
3414						   R500_INST_TEX_SEM_WAIT |
3415						   R500_INST_RGB_WMASK_R |
3416						   R500_INST_RGB_WMASK_G |
3417						   R500_INST_RGB_WMASK_B |
3418						   R500_INST_ALPHA_WMASK |
3419						   R500_INST_RGB_OMASK_R |
3420						   R500_INST_RGB_OMASK_G |
3421						   R500_INST_RGB_OMASK_B |
3422						   R500_INST_ALPHA_OMASK));
3423	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3424						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3425						   R500_RGB_ADDR1(3) |
3426						   R500_RGB_ADDR2(5)));
3427	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3428						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3429						   R500_ALPHA_ADDR1(3) |
3430						   R500_ALPHA_ADDR2(5)));
3431	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3432						   R500_ALU_RGB_R_SWIZ_A_B |
3433						   R500_ALU_RGB_G_SWIZ_A_B |
3434						   R500_ALU_RGB_B_SWIZ_A_B |
3435						   R500_ALU_RGB_SEL_B_SRCP |
3436						   R500_ALU_RGB_R_SWIZ_B_R |
3437						   R500_ALU_RGB_G_SWIZ_B_G |
3438						   R500_ALU_RGB_B_SWIZ_B_B));
3439	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
3440						   R500_ALPHA_OP_MAD |
3441						   R500_ALPHA_SEL_A_SRC2 |
3442						   R500_ALPHA_SWIZ_A_B |
3443						   R500_ALPHA_SEL_B_SRCP |
3444						   R500_ALPHA_SWIZ_B_A));
3445	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
3446						   R500_ALU_RGBA_OP_MAD |
3447						   R500_ALU_RGBA_SEL_C_SRC0 |
3448						   R500_ALU_RGBA_R_SWIZ_R |
3449						   R500_ALU_RGBA_G_SWIZ_G |
3450						   R500_ALU_RGBA_B_SWIZ_B |
3451						   R500_ALU_RGBA_A_SWIZ_A));
3452
3453	    /* Shader constants. */
3454	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
3455
3456	    /* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */
3457	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w));
3458	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h));
3459	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
3460	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
3461
3462	    FINISH_ACCEL();
3463	} else {
3464	    BEGIN_ACCEL(19);
3465	    /* 2 components: 2 for tex0 */
3466	    OUT_ACCEL_REG(R300_RS_COUNT,
3467			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3468			   R300_RS_COUNT_HIRES_EN));
3469
3470	    /* R300_INST_COUNT_RS - highest RS instruction used */
3471	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3472
3473	    /* Pixel stack frame size. */
3474	    OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
3475
3476	    /* FP length. */
3477	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3478					      R500_US_CODE_END_ADDR(1)));
3479	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3480					       R500_US_CODE_RANGE_SIZE(1)));
3481
3482	    /* Prepare for FP emission. */
3483	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
3484	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3485
3486	    /* tex inst */
3487	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3488						   R500_INST_TEX_SEM_WAIT |
3489						   R500_INST_RGB_WMASK_R |
3490						   R500_INST_RGB_WMASK_G |
3491						   R500_INST_RGB_WMASK_B |
3492						   R500_INST_ALPHA_WMASK |
3493						   R500_INST_RGB_CLAMP |
3494						   R500_INST_ALPHA_CLAMP));
3495	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3496						   R500_TEX_INST_LD |
3497						   R500_TEX_SEM_ACQUIRE |
3498						   R500_TEX_IGNORE_UNCOVERED));
3499	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3500						   R500_TEX_SRC_S_SWIZ_R |
3501						   R500_TEX_SRC_T_SWIZ_G |
3502						   R500_TEX_DST_ADDR(0) |
3503						   R500_TEX_DST_R_SWIZ_R |
3504						   R500_TEX_DST_G_SWIZ_G |
3505						   R500_TEX_DST_B_SWIZ_B |
3506						   R500_TEX_DST_A_SWIZ_A));
3507	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3508						   R500_DX_S_SWIZ_R |
3509						   R500_DX_T_SWIZ_R |
3510						   R500_DX_R_SWIZ_R |
3511						   R500_DX_Q_SWIZ_R |
3512						   R500_DY_ADDR(0) |
3513						   R500_DY_S_SWIZ_R |
3514						   R500_DY_T_SWIZ_R |
3515						   R500_DY_R_SWIZ_R |
3516						   R500_DY_Q_SWIZ_R));
3517	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3518	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3519
3520	    /* ALU inst */
3521	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3522						   R500_INST_TEX_SEM_WAIT |
3523						   R500_INST_LAST |
3524						   R500_INST_RGB_OMASK_R |
3525						   R500_INST_RGB_OMASK_G |
3526						   R500_INST_RGB_OMASK_B |
3527						   R500_INST_ALPHA_OMASK |
3528						   R500_INST_RGB_CLAMP |
3529						   R500_INST_ALPHA_CLAMP));
3530	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3531						   R500_RGB_ADDR1(0) |
3532						   R500_RGB_ADDR1_CONST |
3533						   R500_RGB_ADDR2(0) |
3534						   R500_RGB_ADDR2_CONST));
3535	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3536						   R500_ALPHA_ADDR1(0) |
3537						   R500_ALPHA_ADDR1_CONST |
3538						   R500_ALPHA_ADDR2(0) |
3539						   R500_ALPHA_ADDR2_CONST));
3540	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3541						   R500_ALU_RGB_R_SWIZ_A_R |
3542						   R500_ALU_RGB_G_SWIZ_A_G |
3543						   R500_ALU_RGB_B_SWIZ_A_B |
3544						   R500_ALU_RGB_SEL_B_SRC0 |
3545						   R500_ALU_RGB_R_SWIZ_B_1 |
3546						   R500_ALU_RGB_B_SWIZ_B_1 |
3547						   R500_ALU_RGB_G_SWIZ_B_1));
3548	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3549						   R500_ALPHA_SWIZ_A_A |
3550						   R500_ALPHA_SWIZ_B_1));
3551	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3552						   R500_ALU_RGBA_R_SWIZ_0 |
3553						   R500_ALU_RGBA_G_SWIZ_0 |
3554						   R500_ALU_RGBA_B_SWIZ_0 |
3555						   R500_ALU_RGBA_A_SWIZ_0));
3556	    FINISH_ACCEL();
3557	}
3558    } else {
3559	/*
3560	 * y' = y - .0625
3561	 * u' = u - .5
3562	 * v' = v - .5;
3563	 *
3564	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
3565	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
3566	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
3567	 *
3568	 * DP3 might look like the straightforward solution
3569	 * but we'd need to move the texture yuv values in
3570	 * the same reg for this to work. Therefore use MADs.
3571	 * Brightness just adds to the off constant.
3572	 * Contrast is multiplication of luminance.
3573	 * Saturation and hue change the u and v coeffs.
3574	 * Default values (before adjustments - depend on colorspace):
3575	 * yco = 1.1643
3576	 * uco = 0, -0.39173, 2.017
3577	 * vco = 1.5958, -0.8129, 0
3578	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
3579	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
3580	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
3581	 *
3582	 * temp = MAD(yco, yuv.yyyy, off)
3583	 * temp = MAD(uco, yuv.uuuu, temp)
3584	 * result = MAD(vco, yuv.vvvv, temp)
3585	 */
3586	/* TODO: don't recalc consts always */
3587	const float Loff = -0.0627;
3588	const float Coff = -0.502;
3589	float uvcosf, uvsinf;
3590	float yco;
3591	float uco[3], vco[3], off[3];
3592	float bright, cont, gamma;
3593	int ref = pPriv->transform_index;
3594	Bool needgamma = FALSE;
3595
3596	cont = RTFContrast(pPriv->contrast);
3597	bright = RTFBrightness(pPriv->brightness);
3598	gamma = (float)pPriv->gamma / 1000.0;
3599	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
3600	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
3601	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
3602
3603	yco = trans[ref].RefLuma * cont;
3604	uco[0] = -trans[ref].RefRCr * uvsinf;
3605	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
3606	uco[2] = trans[ref].RefBCb * uvcosf;
3607	vco[0] = trans[ref].RefRCr * uvcosf;
3608	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
3609	vco[2] = trans[ref].RefBCb * uvsinf;
3610	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
3611	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
3612	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
3613
3614	//XXX gamma
3615
3616	if (gamma != 1.0) {
3617	    needgamma = TRUE;
3618	    /* note: gamma correction is out = in ^ gamma;
3619	       gpu can only do LG2/EX2 therefore we transform into
3620	       in ^ gamma = 2 ^ (log2(in) * gamma).
3621	       Lots of scalar ops, unfortunately (better solution?) -
3622	       without gamma that's 3 inst, with gamma it's 10...
3623	       could use different gamma factors per channel,
3624	       if that's of any use. */
3625	}
3626
3627	if (pPriv->is_planar) {
3628	    BEGIN_ACCEL(56);
3629	    /* 2 components: 2 for tex0/1/2 */
3630	    OUT_ACCEL_REG(R300_RS_COUNT,
3631			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3632			   R300_RS_COUNT_HIRES_EN));
3633
3634	    /* R300_INST_COUNT_RS - highest RS instruction used */
3635	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3636
3637	    /* Pixel stack frame size. */
3638	    OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
3639
3640	    /* FP length. */
3641	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3642					      R500_US_CODE_END_ADDR(5)));
3643	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3644					       R500_US_CODE_RANGE_SIZE(5)));
3645
3646	    /* Prepare for FP emission. */
3647	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
3648	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3649
3650	    /* tex inst */
3651	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3652						   R500_INST_TEX_SEM_WAIT |
3653						   R500_INST_RGB_WMASK_R |
3654						   R500_INST_RGB_WMASK_G |
3655						   R500_INST_RGB_WMASK_B |
3656						   R500_INST_ALPHA_WMASK |
3657						   R500_INST_RGB_CLAMP |
3658						   R500_INST_ALPHA_CLAMP));
3659	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3660						   R500_TEX_INST_LD |
3661						   R500_TEX_IGNORE_UNCOVERED));
3662	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3663						   R500_TEX_SRC_S_SWIZ_R |
3664						   R500_TEX_SRC_T_SWIZ_G |
3665						   R500_TEX_DST_ADDR(2) |
3666						   R500_TEX_DST_R_SWIZ_R |
3667						   R500_TEX_DST_G_SWIZ_G |
3668						   R500_TEX_DST_B_SWIZ_B |
3669						   R500_TEX_DST_A_SWIZ_A));
3670	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3671						   R500_DX_S_SWIZ_R |
3672						   R500_DX_T_SWIZ_R |
3673						   R500_DX_R_SWIZ_R |
3674						   R500_DX_Q_SWIZ_R |
3675						   R500_DY_ADDR(0) |
3676						   R500_DY_S_SWIZ_R |
3677						   R500_DY_T_SWIZ_R |
3678						   R500_DY_R_SWIZ_R |
3679						   R500_DY_Q_SWIZ_R));
3680	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3681	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3682
3683	    /* tex inst */
3684	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3685						   R500_INST_TEX_SEM_WAIT |
3686						   R500_INST_RGB_WMASK_R |
3687						   R500_INST_RGB_WMASK_G |
3688						   R500_INST_RGB_WMASK_B |
3689						   R500_INST_ALPHA_WMASK |
3690						   R500_INST_RGB_CLAMP |
3691						   R500_INST_ALPHA_CLAMP));
3692	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
3693						   R500_TEX_INST_LD |
3694						   R500_TEX_IGNORE_UNCOVERED));
3695	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3696						   R500_TEX_SRC_S_SWIZ_R |
3697						   R500_TEX_SRC_T_SWIZ_G |
3698						   R500_TEX_DST_ADDR(1) |
3699						   R500_TEX_DST_R_SWIZ_R |
3700						   R500_TEX_DST_G_SWIZ_G |
3701						   R500_TEX_DST_B_SWIZ_B |
3702						   R500_TEX_DST_A_SWIZ_A));
3703	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3704						   R500_DX_S_SWIZ_R |
3705						   R500_DX_T_SWIZ_R |
3706						   R500_DX_R_SWIZ_R |
3707						   R500_DX_Q_SWIZ_R |
3708						   R500_DY_ADDR(0) |
3709						   R500_DY_S_SWIZ_R |
3710						   R500_DY_T_SWIZ_R |
3711						   R500_DY_R_SWIZ_R |
3712						   R500_DY_Q_SWIZ_R));
3713	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3714	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3715
3716	    /* tex inst */
3717	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3718						   R500_INST_TEX_SEM_WAIT |
3719						   R500_INST_RGB_WMASK_R |
3720						   R500_INST_RGB_WMASK_G |
3721						   R500_INST_RGB_WMASK_B |
3722						   R500_INST_ALPHA_WMASK |
3723						   R500_INST_RGB_CLAMP |
3724						   R500_INST_ALPHA_CLAMP));
3725	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(2) |
3726						   R500_TEX_INST_LD |
3727						   R500_TEX_SEM_ACQUIRE |
3728						   R500_TEX_IGNORE_UNCOVERED));
3729	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3730						   R500_TEX_SRC_S_SWIZ_R |
3731						   R500_TEX_SRC_T_SWIZ_G |
3732						   R500_TEX_DST_ADDR(0) |
3733						   R500_TEX_DST_R_SWIZ_R |
3734						   R500_TEX_DST_G_SWIZ_G |
3735						   R500_TEX_DST_B_SWIZ_B |
3736						   R500_TEX_DST_A_SWIZ_A));
3737	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3738						   R500_DX_S_SWIZ_R |
3739						   R500_DX_T_SWIZ_R |
3740						   R500_DX_R_SWIZ_R |
3741						   R500_DX_Q_SWIZ_R |
3742						   R500_DY_ADDR(0) |
3743						   R500_DY_S_SWIZ_R |
3744						   R500_DY_T_SWIZ_R |
3745						   R500_DY_R_SWIZ_R |
3746						   R500_DY_Q_SWIZ_R));
3747	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3748	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3749
3750	    /* ALU inst */
3751	    /* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
3752	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3753						   R500_INST_TEX_SEM_WAIT |
3754						   R500_INST_RGB_WMASK_R |
3755						   R500_INST_RGB_WMASK_G |
3756						   R500_INST_RGB_WMASK_B |
3757						   R500_INST_ALPHA_WMASK));
3758	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3759						   R500_RGB_ADDR0_CONST |
3760						   R500_RGB_ADDR1(2) |
3761						   R500_RGB_ADDR2(0) |
3762						   R500_RGB_ADDR2_CONST));
3763	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3764						   R500_ALPHA_ADDR0_CONST |
3765						   R500_ALPHA_ADDR1(2) |
3766						   R500_ALPHA_ADDR2(0) |
3767						   R500_ALPHA_ADDR2_CONST));
3768	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3769						   R500_ALU_RGB_R_SWIZ_A_A |
3770						   R500_ALU_RGB_G_SWIZ_A_A |
3771						   R500_ALU_RGB_B_SWIZ_A_A |
3772						   R500_ALU_RGB_SEL_B_SRC1 |
3773						   R500_ALU_RGB_R_SWIZ_B_R |
3774						   R500_ALU_RGB_B_SWIZ_B_G |
3775						   R500_ALU_RGB_G_SWIZ_B_B));
3776	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3777						   R500_ALPHA_ADDRD(2) |
3778						   R500_ALPHA_SWIZ_A_0 |
3779						   R500_ALPHA_SWIZ_B_0));
3780	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3781						   R500_ALU_RGBA_ADDRD(2) |
3782						   R500_ALU_RGBA_SEL_C_SRC0 |
3783						   R500_ALU_RGBA_R_SWIZ_R |
3784						   R500_ALU_RGBA_G_SWIZ_G |
3785						   R500_ALU_RGBA_B_SWIZ_B |
3786						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3787						   R500_ALU_RGBA_A_SWIZ_0));
3788
3789	    /* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
3790	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3791						   R500_INST_TEX_SEM_WAIT |
3792						   R500_INST_RGB_WMASK_R |
3793						   R500_INST_RGB_WMASK_G |
3794						   R500_INST_RGB_WMASK_B |
3795						   R500_INST_ALPHA_WMASK));
3796	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
3797						   R500_RGB_ADDR0_CONST |
3798						   R500_RGB_ADDR1(1) |
3799						   R500_RGB_ADDR2(2)));
3800	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3801						   R500_ALPHA_ADDR0_CONST |
3802						   R500_ALPHA_ADDR1(1) |
3803						   R500_ALPHA_ADDR2(2)));
3804	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3805						   R500_ALU_RGB_R_SWIZ_A_R |
3806						   R500_ALU_RGB_G_SWIZ_A_G |
3807						   R500_ALU_RGB_B_SWIZ_A_B |
3808						   R500_ALU_RGB_SEL_B_SRC1 |
3809						   R500_ALU_RGB_R_SWIZ_B_R |
3810						   R500_ALU_RGB_B_SWIZ_B_G |
3811						   R500_ALU_RGB_G_SWIZ_B_B));
3812	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3813						   R500_ALPHA_ADDRD(2) |
3814						   R500_ALPHA_SWIZ_A_0 |
3815						   R500_ALPHA_SWIZ_B_0));
3816	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3817						   R500_ALU_RGBA_ADDRD(2) |
3818						   R500_ALU_RGBA_SEL_C_SRC2 |
3819						   R500_ALU_RGBA_R_SWIZ_R |
3820						   R500_ALU_RGBA_G_SWIZ_G |
3821						   R500_ALU_RGBA_B_SWIZ_B |
3822						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3823						   R500_ALU_RGBA_A_SWIZ_0));
3824
3825	    /* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
3826	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3827						   R500_INST_TEX_SEM_WAIT |
3828						   R500_INST_LAST |
3829						   R500_INST_RGB_OMASK_R |
3830						   R500_INST_RGB_OMASK_G |
3831						   R500_INST_RGB_OMASK_B |
3832						   R500_INST_ALPHA_OMASK |
3833						   R500_INST_RGB_CLAMP |
3834						   R500_INST_ALPHA_CLAMP));
3835	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
3836						   R500_RGB_ADDR0_CONST |
3837						   R500_RGB_ADDR1(0) |
3838						   R500_RGB_ADDR2(2)));
3839	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(2) |
3840						   R500_ALPHA_ADDR0_CONST |
3841						   R500_ALPHA_ADDR1(0) |
3842						   R500_ALPHA_ADDR2(2)));
3843	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3844						   R500_ALU_RGB_R_SWIZ_A_R |
3845						   R500_ALU_RGB_G_SWIZ_A_G |
3846						   R500_ALU_RGB_B_SWIZ_A_B |
3847						   R500_ALU_RGB_SEL_B_SRC1 |
3848						   R500_ALU_RGB_R_SWIZ_B_R |
3849						   R500_ALU_RGB_B_SWIZ_B_G |
3850						   R500_ALU_RGB_G_SWIZ_B_B));
3851	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3852						   R500_ALPHA_ADDRD(0) |
3853						   R500_ALPHA_SWIZ_A_0 |
3854						   R500_ALPHA_SWIZ_B_0));
3855	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3856						   R500_ALU_RGBA_ADDRD(0) |
3857						   R500_ALU_RGBA_SEL_C_SRC2 |
3858						   R500_ALU_RGBA_R_SWIZ_R |
3859						   R500_ALU_RGBA_G_SWIZ_G |
3860						   R500_ALU_RGBA_B_SWIZ_B |
3861						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3862						   R500_ALU_RGBA_A_SWIZ_1));
3863
3864	} else {
3865	    BEGIN_ACCEL(44);
3866	    /* 2 components: 2 for tex0 */
3867	    OUT_ACCEL_REG(R300_RS_COUNT,
3868			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3869			   R300_RS_COUNT_HIRES_EN));
3870
3871	    /* R300_INST_COUNT_RS - highest RS instruction used */
3872	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3873
3874	    /* Pixel stack frame size. */
3875	    OUT_ACCEL_REG(R300_US_PIXSIZE, 1); /* highest temp used */
3876
3877	    /* FP length. */
3878	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3879					      R500_US_CODE_END_ADDR(3)));
3880	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3881					       R500_US_CODE_RANGE_SIZE(3)));
3882
3883	    /* Prepare for FP emission. */
3884	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
3885	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3886
3887	    /* tex inst */
3888	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3889						   R500_INST_TEX_SEM_WAIT |
3890						   R500_INST_RGB_WMASK_R |
3891						   R500_INST_RGB_WMASK_G |
3892						   R500_INST_RGB_WMASK_B |
3893						   R500_INST_ALPHA_WMASK |
3894						   R500_INST_RGB_CLAMP |
3895						   R500_INST_ALPHA_CLAMP));
3896	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3897						   R500_TEX_INST_LD |
3898						   R500_TEX_SEM_ACQUIRE |
3899						   R500_TEX_IGNORE_UNCOVERED));
3900	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3901						   R500_TEX_SRC_S_SWIZ_R |
3902						   R500_TEX_SRC_T_SWIZ_G |
3903						   R500_TEX_DST_ADDR(0) |
3904						   R500_TEX_DST_R_SWIZ_R |
3905						   R500_TEX_DST_G_SWIZ_G |
3906						   R500_TEX_DST_B_SWIZ_B |
3907						   R500_TEX_DST_A_SWIZ_A));
3908	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3909						   R500_DX_S_SWIZ_R |
3910						   R500_DX_T_SWIZ_R |
3911						   R500_DX_R_SWIZ_R |
3912						   R500_DX_Q_SWIZ_R |
3913						   R500_DY_ADDR(0) |
3914						   R500_DY_S_SWIZ_R |
3915						   R500_DY_T_SWIZ_R |
3916						   R500_DY_R_SWIZ_R |
3917						   R500_DY_Q_SWIZ_R));
3918	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3919	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3920
3921	    /* ALU inst */
3922	    /* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
3923	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3924						   R500_INST_TEX_SEM_WAIT |
3925						   R500_INST_RGB_WMASK_R |
3926						   R500_INST_RGB_WMASK_G |
3927						   R500_INST_RGB_WMASK_B |
3928						   R500_INST_ALPHA_WMASK));
3929	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3930						   R500_RGB_ADDR0_CONST |
3931						   R500_RGB_ADDR1(0) |
3932						   R500_RGB_ADDR2(0) |
3933						   R500_RGB_ADDR2_CONST));
3934	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3935						   R500_ALPHA_ADDR0_CONST |
3936						   R500_ALPHA_ADDR1(0) |
3937						   R500_ALPHA_ADDR2(0) |
3938						   R500_ALPHA_ADDR2_CONST));
3939	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3940						   R500_ALU_RGB_R_SWIZ_A_A |
3941						   R500_ALU_RGB_G_SWIZ_A_A |
3942						   R500_ALU_RGB_B_SWIZ_A_A |
3943						   R500_ALU_RGB_SEL_B_SRC1 |
3944						   R500_ALU_RGB_R_SWIZ_B_G |
3945						   R500_ALU_RGB_B_SWIZ_B_G |
3946						   R500_ALU_RGB_G_SWIZ_B_G));
3947	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3948						   R500_ALPHA_ADDRD(1) |
3949						   R500_ALPHA_SWIZ_A_0 |
3950						   R500_ALPHA_SWIZ_B_0));
3951	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3952						   R500_ALU_RGBA_ADDRD(1) |
3953						   R500_ALU_RGBA_SEL_C_SRC0 |
3954						   R500_ALU_RGBA_R_SWIZ_R |
3955						   R500_ALU_RGBA_G_SWIZ_G |
3956						   R500_ALU_RGBA_B_SWIZ_B |
3957						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3958						   R500_ALU_RGBA_A_SWIZ_0));
3959
3960	    /* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
3961	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3962						   R500_INST_TEX_SEM_WAIT |
3963						   R500_INST_RGB_WMASK_R |
3964						   R500_INST_RGB_WMASK_G |
3965						   R500_INST_RGB_WMASK_B |
3966						   R500_INST_ALPHA_WMASK));
3967	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
3968						   R500_RGB_ADDR0_CONST |
3969						   R500_RGB_ADDR1(0) |
3970						   R500_RGB_ADDR2(1)));
3971	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3972						   R500_ALPHA_ADDR0_CONST |
3973						   R500_ALPHA_ADDR1(0) |
3974						   R500_ALPHA_ADDR2(1)));
3975	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3976						   R500_ALU_RGB_R_SWIZ_A_R |
3977						   R500_ALU_RGB_G_SWIZ_A_G |
3978						   R500_ALU_RGB_B_SWIZ_A_B |
3979						   R500_ALU_RGB_SEL_B_SRC1 |
3980						   R500_ALU_RGB_R_SWIZ_B_B |
3981						   R500_ALU_RGB_B_SWIZ_B_B |
3982						   R500_ALU_RGB_G_SWIZ_B_B));
3983	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3984						   R500_ALPHA_ADDRD(1) |
3985						   R500_ALPHA_SWIZ_A_0 |
3986						   R500_ALPHA_SWIZ_B_0));
3987	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3988						   R500_ALU_RGBA_ADDRD(1) |
3989						   R500_ALU_RGBA_SEL_C_SRC2 |
3990						   R500_ALU_RGBA_R_SWIZ_R |
3991						   R500_ALU_RGBA_G_SWIZ_G |
3992						   R500_ALU_RGBA_B_SWIZ_B |
3993						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3994						   R500_ALU_RGBA_A_SWIZ_0));
3995
3996	    /* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
3997	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3998						   R500_INST_TEX_SEM_WAIT |
3999						   R500_INST_LAST |
4000						   R500_INST_RGB_OMASK_R |
4001						   R500_INST_RGB_OMASK_G |
4002						   R500_INST_RGB_OMASK_B |
4003						   R500_INST_ALPHA_OMASK |
4004						   R500_INST_RGB_CLAMP |
4005						   R500_INST_ALPHA_CLAMP));
4006	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
4007						   R500_RGB_ADDR0_CONST |
4008						   R500_RGB_ADDR1(0) |
4009						   R500_RGB_ADDR2(1)));
4010	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
4011						   R500_ALPHA_ADDR0_CONST |
4012						   R500_ALPHA_ADDR1(0) |
4013						   R500_ALPHA_ADDR2(1)));
4014	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
4015						   R500_ALU_RGB_R_SWIZ_A_R |
4016						   R500_ALU_RGB_G_SWIZ_A_G |
4017						   R500_ALU_RGB_B_SWIZ_A_B |
4018						   R500_ALU_RGB_SEL_B_SRC1 |
4019						   R500_ALU_RGB_R_SWIZ_B_R |
4020						   R500_ALU_RGB_B_SWIZ_B_R |
4021						   R500_ALU_RGB_G_SWIZ_B_R));
4022	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
4023						   R500_ALPHA_ADDRD(1) |
4024						   R500_ALPHA_SWIZ_A_0 |
4025						   R500_ALPHA_SWIZ_B_0));
4026	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
4027						   R500_ALU_RGBA_ADDRD(1) |
4028						   R500_ALU_RGBA_SEL_C_SRC2 |
4029						   R500_ALU_RGBA_R_SWIZ_R |
4030						   R500_ALU_RGBA_G_SWIZ_G |
4031						   R500_ALU_RGBA_B_SWIZ_B |
4032						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
4033						   R500_ALU_RGBA_A_SWIZ_1));
4034	}
4035
4036	/* Shader constants. */
4037	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
4038
4039	/* constant 0: off, yco */
4040	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[0]);
4041	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[1]);
4042	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[2]);
4043	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, yco);
4044	/* constant 1: uco */
4045	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[0]);
4046	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[1]);
4047	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[2]);
4048	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, gamma);
4049	/* constant 2: vco */
4050	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[0]);
4051	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[1]);
4052	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[2]);
4053	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0.0);
4054
4055	FINISH_ACCEL();
4056    }
4057
4058    BEGIN_ACCEL_RELOC(6, 2);
4059    OUT_ACCEL_REG(R300_TX_INVALTAGS, 0);
4060    OUT_ACCEL_REG(R300_TX_ENABLE, txenable);
4061
4062    EMIT_WRITE_OFFSET(R300_RB3D_COLOROFFSET0, 0, pPixmap);
4063    EMIT_COLORPITCH(R300_RB3D_COLORPITCH0, colorpitch, pPixmap);
4064
4065    /* no need to enable blending */
4066    OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
4067
4068    OUT_ACCEL_REG(R300_VAP_VTX_SIZE, pPriv->vtx_count);
4069    FINISH_ACCEL();
4070
4071    if (pPriv->vsync) {
4072	xf86CrtcPtr crtc;
4073	if (pPriv->desired_crtc)
4074	    crtc = pPriv->desired_crtc;
4075	else
4076	    crtc = radeon_pick_best_crtc(pScrn,
4077					 pPriv->drw_x,
4078					 pPriv->drw_x + pPriv->dst_w,
4079					 pPriv->drw_y,
4080					 pPriv->drw_y + pPriv->dst_h);
4081	if (crtc)
4082	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
4083					  crtc,
4084					  pPriv->drw_y - crtc->y,
4085					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
4086    }
4087
4088    return TRUE;
4089}
4090
4091static void
4092FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
4093{
4094    RADEONInfoPtr info = RADEONPTR(pScrn);
4095    PixmapPtr pPixmap = pPriv->pPixmap;
4096    int dstxoff, dstyoff;
4097    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
4098    int nBox = REGION_NUM_RECTS(&pPriv->clip);
4099    ACCEL_PREAMBLE();
4100
4101#ifdef COMPOSITE
4102    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
4103    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
4104#else
4105    dstxoff = 0;
4106    dstyoff = 0;
4107#endif
4108
4109    if (!FUNC_NAME(R500PrepareTexturedVideo)(pScrn, pPriv))
4110	return;
4111
4112    /*
4113     * Rendering of the actual polygon is done in two different
4114     * ways depending on chip generation:
4115     *
4116     * < R300:
4117     *
4118     *     These chips can render a rectangle in one pass, so
4119     *     handling is pretty straight-forward.
4120     *
4121     * >= R300:
4122     *
4123     *     These chips can accept a quad, but will render it as
4124     *     two triangles which results in a diagonal tear. Instead
4125     *     We render a single, large triangle and use the scissor
4126     *     functionality to restrict it to the desired rectangle.
4127     *     Due to guardband limits on r3xx/r4xx, we can only use
4128     *     the single triangle up to 2880 pixels; above that we
4129     *     render as a quad.
4130     */
4131
4132    while (nBox--) {
4133	float srcX, srcY, srcw, srch;
4134	int dstX, dstY, dstw, dsth;
4135#ifdef ACCEL_CP
4136	int draw_size = 3 * pPriv->vtx_count + 4 + 2 + 3;
4137
4138	if (draw_size > radeon_cs_space_remaining(pScrn)) {
4139	    if (info->cs)
4140		radeon_cs_flush_indirect(pScrn);
4141	    else
4142		RADEONCPFlushIndirect(pScrn, 1);
4143	    if (!FUNC_NAME(R500PrepareTexturedVideo)(pScrn, pPriv))
4144		return;
4145	}
4146#endif
4147
4148	dstX = pBox->x1 + dstxoff;
4149	dstY = pBox->y1 + dstyoff;
4150	dstw = pBox->x2 - pBox->x1;
4151	dsth = pBox->y2 - pBox->y1;
4152
4153	srcX = pPriv->src_x;
4154	srcX += ((pBox->x1 - pPriv->drw_x) *
4155		 pPriv->src_w) / (float)pPriv->dst_w;
4156	srcY = pPriv->src_y;
4157	srcY += ((pBox->y1 - pPriv->drw_y) *
4158		 pPriv->src_h) / (float)pPriv->dst_h;
4159
4160	srcw = (pPriv->src_w * dstw) / (float)pPriv->dst_w;
4161	srch = (pPriv->src_h * dsth) / (float)pPriv->dst_h;
4162
4163	BEGIN_ACCEL(2);
4164	OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX) << R300_SCISSOR_X_SHIFT) |
4165					 ((dstY) << R300_SCISSOR_Y_SHIFT)));
4166	OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw - 1) << R300_SCISSOR_X_SHIFT) |
4167					 ((dstY + dsth - 1) << R300_SCISSOR_Y_SHIFT)));
4168	FINISH_ACCEL();
4169
4170#ifdef ACCEL_CP
4171	BEGIN_RING(3 * pPriv->vtx_count + 4);
4172	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
4173			    3 * pPriv->vtx_count));
4174	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
4175		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
4176		 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
4177#else /* ACCEL_CP */
4178	BEGIN_ACCEL(2 + pPriv->vtx_count * 3);
4179	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST |
4180					  RADEON_VF_PRIM_WALK_DATA |
4181					  (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
4182#endif
4183	if (pPriv->bicubic_enabled) {
4184	    VTX_OUT_6((float)dstX,            (float)dstY,
4185		      (float)srcX / pPriv->w, (float)srcY / pPriv->h,
4186		      (float)srcX + 0.5,      (float)srcY + 0.5);
4187	    VTX_OUT_6((float)dstX,            (float)(dstY + dstw + dsth),
4188		      (float)srcX / pPriv->w, ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h,
4189		      (float)srcX + 0.5,      (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
4190	    VTX_OUT_6((float)(dstX + dstw + dsth),                       (float)dstY,
4191		      ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
4192		      (float)srcY / pPriv->h,
4193		      (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
4194		      (float)srcY + 0.5);
4195	} else {
4196	    /*
4197	     * Render a big, scissored triangle. This means
4198	     * increasing the triangle size and adjusting
4199	     * texture coordinates.
4200	     */
4201	    VTX_OUT_4((float)dstX,            (float)dstY,
4202		      (float)srcX / pPriv->w, (float)srcY / pPriv->h);
4203	    VTX_OUT_4((float)dstX,                              (float)(dstY + dsth + dstw),
4204		      (float)srcX / pPriv->w, ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h);
4205	    VTX_OUT_4((float)(dstX + dstw + dsth),              (float)dstY,
4206		      ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
4207		      (float)srcY / pPriv->h);
4208	}
4209
4210	/* flushing is pipelined, free/finish is not */
4211	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
4212
4213#ifdef ACCEL_CP
4214	ADVANCE_RING();
4215#else
4216	FINISH_ACCEL();
4217#endif /* !ACCEL_CP */
4218
4219	pBox++;
4220    }
4221
4222    BEGIN_ACCEL(3);
4223    OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA);
4224    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
4225    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
4226    FINISH_ACCEL();
4227
4228    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
4229}
4230
4231#undef VTX_OUT_4
4232#undef VTX_OUT_6
4233#undef FUNC_NAME
4234