radeon_textured_videofuncs.c revision ad43ddac
1/*
2 * Copyright 2008 Alex Deucher
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 *
24 * Based on radeon_exa_render.c and kdrive ati_video.c by Eric Anholt, et al.
25 *
26 */
27
/*
 * Backend selection.  This file must be compiled with exactly one of
 * ACCEL_MMIO or ACCEL_CP defined: defining both, or neither, is a
 * compile-time error.
 */
28#if defined(ACCEL_MMIO) && defined(ACCEL_CP)
29#error Cannot define both MMIO and CP acceleration!
30#endif
31
/*
 * FUNC_NAME_CAT(prefix, suffix) pastes two tokens together.  The ##
 * operator is used unless UNIXCPP is defined without ANSICPP, in which
 * case the empty-comment form is used instead.
 */
32#if !defined(UNIXCPP) || defined(ANSICPP)
33#define FUNC_NAME_CAT(prefix,suffix) prefix##suffix
34#else
35#define FUNC_NAME_CAT(prefix,suffix) prefix/**/suffix
36#endif
37
/*
 * FUNC_NAME(prefix) appends the backend suffix (MMIO or CP) to a function
 * name, so each function defined below gets a backend-specific symbol.
 */
38#ifdef ACCEL_MMIO
39#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,MMIO)
40#else
41#ifdef ACCEL_CP
42#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,CP)
43#else
44#error No accel type defined!
45#endif
46#endif
47
/*
 * Vertex emission helpers.
 *
 * VTX_OUT_6 emits one vertex made of six values (destination x/y, source
 * x/y, mask x/y); VTX_OUT_4 emits one vertex made of four values
 * (destination x/y, source x/y).  The values are written in argument
 * order.  Callers in this file pass float expressions.
 *
 * With ACCEL_CP each value is emitted with OUT_RING_F.  Otherwise each
 * value is written to the RADEON_SE_PORT_DATA0 register with
 * OUT_ACCEL_REG_F.
 */
48#ifdef ACCEL_CP
49
50#define VTX_OUT_6(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)	\
51do {								\
52    OUT_RING_F(_dstX);						\
53    OUT_RING_F(_dstY);						\
54    OUT_RING_F(_srcX);						\
55    OUT_RING_F(_srcY);						\
56    OUT_RING_F(_maskX);						\
57    OUT_RING_F(_maskY);						\
58} while (0)
59
60#define VTX_OUT_4(_dstX, _dstY, _srcX, _srcY)			\
61do {								\
62    OUT_RING_F(_dstX);						\
63    OUT_RING_F(_dstY);						\
64    OUT_RING_F(_srcX);						\
65    OUT_RING_F(_srcY);						\
66} while (0)
67
68#else /* ACCEL_CP */
69
70#define VTX_OUT_6(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)		\
71do {									\
72    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstX);			\
73    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstY);			\
74    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcX);			\
75    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcY);			\
76    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _maskX);			\
77    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _maskY);			\
78} while (0)
79
80#define VTX_OUT_4(_dstX, _dstY, _srcX, _srcY)			\
81do {								\
82    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstX);		\
83    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstY);		\
84    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcX);		\
85    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcY);		\
86} while (0)
87
88#endif /* !ACCEL_CP */
89
/*
 * Draw the current Xv frame as a textured rectangle list.
 *
 * The source is pPriv->src_bo[pPriv->currentBuffer] (or pPriv->src_offset
 * when there is no command stream), the destination is pPriv->pPixmap, and
 * one rectangle is emitted for every box of pPriv->clip.  Planar sources
 * (I420/YV12) use three texture units; packed sources use one.  After the
 * draw is queued, the clip region is reported as damaged on pPriv->pDraw.
 *
 * pScrn  screen whose acceleration state is used.
 * pPriv  port private holding the source geometry, clip and target.
 *
 * The function returns without drawing (and without reporting damage) when
 * the command-stream space check fails or when the destination pixmap is
 * neither 16 nor 32 bits per pixel.
 */
90static void
91FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
92{
93    RADEONInfoPtr info = RADEONPTR(pScrn);
94    PixmapPtr pPixmap = pPriv->pPixmap;
95    struct radeon_exa_pixmap_priv *driver_priv;
96    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
97    uint32_t txformat, txsize, txpitch, txoffset;
98    uint32_t dst_pitch, dst_format;
99    uint32_t colorpitch;
100    Bool isplanar = FALSE;
101    int dstxoff, dstyoff, pixel_shift, vtx_count;
102    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
103    int nBox = REGION_NUM_RECTS(&pPriv->clip);
104    ACCEL_PREAMBLE();
105
    /*
     * With XF86DRM_MODE and an active command stream: reset the CS buffer
     * list, then register the source buffer, the bicubic buffer (only when
     * bicubic is enabled) and the destination pixmap's buffer.  If
     * radeon_cs_space_check() reports failure, nothing is emitted.
     */
106#ifdef XF86DRM_MODE
107    if (info->cs) {
108	int ret;
109
110	radeon_cs_space_reset_bos(info->cs);
111        radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
112
113	if (pPriv->bicubic_enabled)
114	    radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
115
116	driver_priv = exaGetPixmapDriverPrivate(pPixmap);
117	radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
118
119	ret = radeon_cs_space_check(info->cs);
120	if (ret) {
121	    ErrorF("Not enough RAM to hw accel xv operation\n");
122	    return;
123	}
124    }
125#endif
126
    /* 16 bpp -> 1, 32 bpp -> 2; used below to scale dst_pitch into colorpitch. */
127    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
128
129
    /* Destination pitch comes from EXA when EXA is in use, else from devKind. */
130#ifdef USE_EXA
131    if (info->useEXA) {
132	dst_pitch = exaGetPixmapPitch(pPixmap);
133    } else
134#endif
135    {
136        dst_pitch = pPixmap->devKind;
137    }
138
    /* Offset added to every clip box origin below (zero without COMPOSITE). */
139#ifdef COMPOSITE
140    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
141    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
142#else
143    dstxoff = 0;
144    dstyoff = 0;
145#endif
146
    /*
     * Engine preparation.  The EXA path uses RADEON_SWITCH_TO_3D(); the
     * other path flushes the destination cache, waits for the engines to
     * go idle and initialises the 3D engine if XInited3D is not yet set.
     */
147#ifdef USE_EXA
148    if (info->useEXA) {
149	RADEON_SWITCH_TO_3D();
150    } else
151#endif
152    {
153	BEGIN_ACCEL(2);
154	OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
155	/* We must wait for 3d to idle, in case source was just written as a dest. */
156	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
157		      RADEON_WAIT_HOST_IDLECLEAN |
158		      RADEON_WAIT_2D_IDLECLEAN |
159		      RADEON_WAIT_3D_IDLECLEAN |
160		      RADEON_WAIT_DMA_GUI_IDLE);
161	FINISH_ACCEL();
162
163	if (!info->accel_state->XInited3D)
164	    RADEONInit3DEngine(pScrn);
165    }
166
    /*
     * Pick the destination colour format from the pixmap depth.  Note that
     * an unsupported bitsPerPixel returns here, after the engine
     * preparation above has already been emitted.
     */
167    /* Same for R100/R200 */
168    switch (pPixmap->drawable.bitsPerPixel) {
169    case 16:
170	if (pPixmap->drawable.depth == 15)
171	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
172	else
173	    dst_format = RADEON_COLOR_FORMAT_RGB565;
174	break;
175    case 32:
176	dst_format = RADEON_COLOR_FORMAT_ARGB8888;
177	break;
178    default:
179	return;
180    }
181
    /*
     * Source texture format: I420 and YV12 are treated as planar and use
     * the Y8 format for every plane; UYVY maps to YVYU422 and every other
     * id maps to VYUY422.
     */
182    if (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12) {
183	isplanar = TRUE;
184	txformat = RADEON_TXFORMAT_Y8;
185    } else {
186	if (pPriv->id == FOURCC_UYVY)
187	    txformat = RADEON_TXFORMAT_YVYU422;
188	else
189	    txformat = RADEON_TXFORMAT_VYUY422;
190    }
191
192    txformat |= RADEON_TXFORMAT_NON_POWER2;
193
194    colorpitch = dst_pitch >> pixel_shift;
195
196    if (RADEONTilingEnabled(pScrn, pPixmap))
197	colorpitch |= RADEON_COLOR_TILE_ENABLE;
198
    /*
     * With a command stream the texture offset is 0, otherwise the source
     * offset is used directly.
     * NOTE(review): presumably the relocation supplies the real address in
     * the command-stream case -- confirm against OUT_TEXTURE_REG.
     */
199    txoffset = info->cs ? 0 : pPriv->src_offset;
200
    /*
     * Destination setup: colour format, offset, pitch, and a blend with
     * source factor ONE and destination factor ZERO.  Four register writes;
     * the offset and pitch writes go through the EMIT_* macros that take
     * the pixmap.
     */
201    BEGIN_ACCEL_RELOC(4,2);
202
203    OUT_ACCEL_REG(RADEON_RB3D_CNTL, dst_format);
204    EMIT_WRITE_OFFSET(RADEON_RB3D_COLOROFFSET, 0, pPixmap);
205    EMIT_COLORPITCH(RADEON_RB3D_COLORPITCH, colorpitch, pPixmap);
206    OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
207		  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
208
209    FINISH_ACCEL();
210
211    if (isplanar) {
212	/* need 2 texcoord sets (even though they are identical) due
213	   to denormalization! hw apparently can't premultiply
214	   same coord set by different texture size */
215	vtx_count = 6;
216
	/*
	 * Size and pitch shared by the U and V planes: each dimension is
	 * half the luma dimension rounded up, stored minus one in an 11-bit
	 * field; the pitch is half the source pitch aligned to 64, minus 32.
	 */
217	txsize = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
218		  (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
219	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
220	txpitch -= 32;
221
	/*
	 * 23 register writes follow: 2 global ones plus 7 per texture unit.
	 * The second argument matches the 3 OUT_TEXTURE_REG writes.
	 */
222	BEGIN_ACCEL_RELOC(23, 3);
223
224	OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
225					  RADEON_SE_VTX_FMT_ST0 |
226					  RADEON_SE_VTX_FMT_ST1));
227
228	OUT_ACCEL_REG(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE |
229				       RADEON_TEX_1_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
230				       RADEON_TEX_2_ENABLE | RADEON_TEX_BLEND_2_ENABLE |
231				       RADEON_PLANAR_YUV_ENABLE));
232
	/* Unit 0 reads texcoord set 0 and is the only unit with RADEON_YUV_TO_RGB set. */
233	/* Y */
234	OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
235		      RADEON_MAG_FILTER_LINEAR |
236		      RADEON_MIN_FILTER_LINEAR |
237		      RADEON_CLAMP_S_CLAMP_LAST |
238		      RADEON_CLAMP_T_CLAMP_LAST |
239		      RADEON_YUV_TO_RGB);
240	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ0);
241	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_0, txoffset, src_bo);
242	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
243		      RADEON_COLOR_ARG_A_ZERO |
244		      RADEON_COLOR_ARG_B_ZERO |
245		      RADEON_COLOR_ARG_C_T0_COLOR |
246		      RADEON_BLEND_CTL_ADD |
247		      RADEON_CLAMP_TX);
248	OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
249		      RADEON_ALPHA_ARG_A_ZERO |
250		      RADEON_ALPHA_ARG_B_ZERO |
251		      RADEON_ALPHA_ARG_C_T0_ALPHA |
252		      RADEON_BLEND_CTL_ADD |
253		      RADEON_CLAMP_TX);
254
255	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
256		      (pPriv->w - 1) |
257		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
258	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
259		      pPriv->src_pitch - 32);
260
	/* Unit 1 reads texcoord set 1 and samples at planeu_offset within the source. */
261	/* U */
262	OUT_ACCEL_REG(RADEON_PP_TXFILTER_1,
263		      RADEON_MAG_FILTER_LINEAR |
264		      RADEON_MIN_FILTER_LINEAR |
265		      RADEON_CLAMP_S_CLAMP_LAST |
266		      RADEON_CLAMP_T_CLAMP_LAST);
267	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_1, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ1);
268	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_1, txoffset + pPriv->planeu_offset, src_bo);
269	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_1,
270		      RADEON_COLOR_ARG_A_ZERO |
271		      RADEON_COLOR_ARG_B_ZERO |
272		      RADEON_COLOR_ARG_C_T0_COLOR |
273		      RADEON_BLEND_CTL_ADD |
274		      RADEON_CLAMP_TX);
275	OUT_ACCEL_REG(RADEON_PP_TXABLEND_1,
276		      RADEON_ALPHA_ARG_A_ZERO |
277		      RADEON_ALPHA_ARG_B_ZERO |
278		      RADEON_ALPHA_ARG_C_T0_ALPHA |
279		      RADEON_BLEND_CTL_ADD |
280		      RADEON_CLAMP_TX);
281
282	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_1, txsize);
283	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_1, txpitch);
284
	/* Unit 2 also reads texcoord set 1 and samples at planev_offset within the source. */
285	/* V */
286	OUT_ACCEL_REG(RADEON_PP_TXFILTER_2,
287		      RADEON_MAG_FILTER_LINEAR |
288		      RADEON_MIN_FILTER_LINEAR |
289		      RADEON_CLAMP_S_CLAMP_LAST |
290		      RADEON_CLAMP_T_CLAMP_LAST);
291	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_2, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ1);
292	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_2, txoffset + pPriv->planev_offset, src_bo);
293	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_2,
294		      RADEON_COLOR_ARG_A_ZERO |
295		      RADEON_COLOR_ARG_B_ZERO |
296		      RADEON_COLOR_ARG_C_T0_COLOR |
297		      RADEON_BLEND_CTL_ADD |
298		      RADEON_CLAMP_TX);
299	OUT_ACCEL_REG(RADEON_PP_TXABLEND_2,
300		      RADEON_ALPHA_ARG_A_ZERO |
301		      RADEON_ALPHA_ARG_B_ZERO |
302		      RADEON_ALPHA_ARG_C_T0_ALPHA |
303		      RADEON_BLEND_CTL_ADD |
304		      RADEON_CLAMP_TX);
305
306	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_2, txsize);
307	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_2, txpitch);
308	FINISH_ACCEL();
309    } else {
	/*
	 * Packed source: one texture unit and one texcoord set.  9 register
	 * writes follow (2 global plus 7 for unit 0), one of them an
	 * OUT_TEXTURE_REG write.
	 */
310	vtx_count = 4;
311	BEGIN_ACCEL_RELOC(9, 1);
312
313	OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
314					  RADEON_SE_VTX_FMT_ST0));
315
316	OUT_ACCEL_REG(RADEON_PP_CNTL, RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
317
318	OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
319		      RADEON_MAG_FILTER_LINEAR |
320		      RADEON_MIN_FILTER_LINEAR |
321		      RADEON_CLAMP_S_CLAMP_LAST |
322		      RADEON_CLAMP_T_CLAMP_LAST |
323		      RADEON_YUV_TO_RGB);
324	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ0);
325	OUT_TEXTURE_REG(RADEON_PP_TXOFFSET_0, txoffset, src_bo);
326	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
327		      RADEON_COLOR_ARG_A_ZERO |
328		      RADEON_COLOR_ARG_B_ZERO |
329		      RADEON_COLOR_ARG_C_T0_COLOR |
330		      RADEON_BLEND_CTL_ADD |
331		      RADEON_CLAMP_TX);
332	OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
333		      RADEON_ALPHA_ARG_A_ZERO |
334		      RADEON_ALPHA_ARG_B_ZERO |
335		      RADEON_ALPHA_ARG_C_T0_ALPHA |
336		      RADEON_BLEND_CTL_ADD |
337		      RADEON_CLAMP_TX);
338
339	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
340		      (pPriv->w - 1) |
341		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
342	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
343		      pPriv->src_pitch - 32);
344	FINISH_ACCEL();
345    }
346
    /* Scissor from the origin to the pixmap size, each dimension capped at 2047. */
347    {
348      int scissor_w, scissor_h;
349      scissor_w = MIN(pPixmap->drawable.width, 2047);
350      scissor_h = MIN(pPixmap->drawable.height, 2047);
351
352      BEGIN_ACCEL(2);
353      OUT_ACCEL_REG(RADEON_RE_TOP_LEFT, 0);
354      OUT_ACCEL_REG(RADEON_RE_WIDTH_HEIGHT, ((scissor_w << RADEON_RE_WIDTH_SHIFT) |
355					     (scissor_h << RADEON_RE_HEIGHT_SHIFT)));
356      FINISH_ACCEL();
357    }
    /*
     * When vsync is requested, use the desired CRTC if one is set, else
     * the CRTC picked for the drawable's extents, and call
     * RADEONWaitForVLine with the destination's first and last line made
     * relative to crtc->y.  Nothing is waited on if no CRTC is found.
     */
358    if (pPriv->vsync) {
359	xf86CrtcPtr crtc;
360	if (pPriv->desired_crtc)
361	    crtc = pPriv->desired_crtc;
362	else
363	    crtc = radeon_pick_best_crtc(pScrn,
364					 pPriv->drw_x,
365					 pPriv->drw_x + pPriv->dst_w,
366					 pPriv->drw_y,
367					 pPriv->drw_y + pPriv->dst_h);
368	if (crtc)
369	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
370					  crtc,
371					  pPriv->drw_y - crtc->y,
372					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
373    }
374    /*
375     * Rendering of the actual polygon is done in two different
376     * ways depending on chip generation:
377     *
378     * < R300:
379     *
380     *     These chips can render a rectangle in one pass, so
381     *     handling is pretty straight-forward.
382     *
383     * >= R300:
384     *
385     *     These chips can accept a quad, but will render it as
386     *     two triangles which results in a diagonal tear. Instead
387     *     We render a single, large triangle and use the scissor
388     *     functionality to restrict it to the desired rectangle.
389     *     Due to guardband limits on r3xx/r4xx, we can only use
390     *     the single triangle up to 2560/4021 pixels; above that we
391     *     render as a quad.
392     */
    /*
     * NOTE(review): only the rectangle-list path exists in this function
     * (three vertices per clip box); the ">= R300" description above does
     * not correspond to any code here.
     *
     * All clip boxes are sent in a single draw.  Each box contributes
     * 3 vertices of vtx_count values.  The CP variant reserves 5 extra
     * entries: the three OUT_RING words below plus the RADEON_WAIT_UNTIL
     * register write issued after the loop (presumably two ring dwords --
     * confirm).  The other variant reserves 2 extra: the
     * RADEON_SE_VF_CNTL write and the RADEON_WAIT_UNTIL write.
     *
     * NOTE(review): nBox is not checked for zero, so an empty clip region
     * still emits a draw with zero vertices -- confirm callers never pass
     * an empty region.
     */
393
394#ifdef ACCEL_CP
395	BEGIN_RING(nBox * 3 * vtx_count + 5);
396	OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD,
397			    nBox * 3 * vtx_count + 1));
398	if (isplanar)
399	    OUT_RING(RADEON_CP_VC_FRMT_XY |
400		     RADEON_CP_VC_FRMT_ST0 |
401		     RADEON_CP_VC_FRMT_ST1);
402	else
403	    OUT_RING(RADEON_CP_VC_FRMT_XY |
404		     RADEON_CP_VC_FRMT_ST0);
405	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
406		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
407		 RADEON_CP_VC_CNTL_MAOS_ENABLE |
408		 RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
409		 ((nBox * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
410#else /* ACCEL_CP */
411	BEGIN_ACCEL(nBox * vtx_count * 3 + 2);
412	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
413					  RADEON_VF_PRIM_WALK_DATA |
414					  RADEON_VF_RADEON_MODE |
415					  ((nBox * 3) << RADEON_VF_NUM_VERTICES_SHIFT)));
416#endif
417
    /*
     * One rectangle per clip box.  The destination rectangle is the box
     * shifted by (dstxoff, dstyoff).  The source rectangle is obtained by
     * scaling the box position (relative to drw_x/drw_y) and size by
     * src_w/dst_w and src_h/dst_h using integer arithmetic.
     * NOTE(review): dst_w and dst_h are used as divisors without a check
     * here -- presumably validated by the caller; confirm.
     */
418    while (nBox--) {
419	int srcX, srcY, srcw, srch;
420	int dstX, dstY, dstw, dsth;
421	dstX = pBox->x1 + dstxoff;
422	dstY = pBox->y1 + dstyoff;
423	dstw = pBox->x2 - pBox->x1;
424	dsth = pBox->y2 - pBox->y1;
425
426	srcX = pPriv->src_x;
427	srcX += ((pBox->x1 - pPriv->drw_x) *
428		 pPriv->src_w) / pPriv->dst_w;
429	srcY = pPriv->src_y;
430	srcY += ((pBox->y1 - pPriv->drw_y) *
431		 pPriv->src_h) / pPriv->dst_h;
432
433	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
434	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
435
436
	/*
	 * Vertices are emitted in the order (x1, y2), (x2, y2), (x2, y1).
	 * Texture coordinates are divided by pPriv->w and pPriv->h; the
	 * planar case sends the same pair twice, once per texcoord set.
	 */
437	if (isplanar) {
438	    /*
439	     * Just render a rect (using three coords).
440	     */
441	    VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
442		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
443		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
444	    VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
445		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
446		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
447	    VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
448		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
449		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
450	} else {
451	    /*
452	     * Just render a rect (using three coords).
453	     */
454	    VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
455		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
456	    VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
457		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
458	    VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
459		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
460	}
461
462	pBox++;
463    }
464
    /*
     * Queue a wait for 3D idle/clean.  This write was counted in the
     * BEGIN_RING / BEGIN_ACCEL reservation made before the loop, so it
     * must stay ahead of ADVANCE_RING / FINISH_ACCEL.
     */
465    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
466#ifdef ACCEL_CP
467	ADVANCE_RING();
468#else
469	FINISH_ACCEL();
470#endif /* !ACCEL_CP */
471
    /* Report the whole clip region as damaged on the target drawable. */
472    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
473}
474
475static void
476FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
477{
478    RADEONInfoPtr info = RADEONPTR(pScrn);
479    PixmapPtr pPixmap = pPriv->pPixmap;
480    struct radeon_exa_pixmap_priv *driver_priv;
481    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
482    uint32_t txformat;
483    uint32_t txfilter, txsize, txpitch, txoffset;
484    uint32_t dst_pitch, dst_format;
485    uint32_t colorpitch;
486    Bool isplanar = FALSE;
487    int dstxoff, dstyoff, pixel_shift, vtx_count;
488    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
489    int nBox = REGION_NUM_RECTS(&pPriv->clip);
490
491    /* note: in contrast to r300, use input biasing on uv components */
492    const float Loff = -0.0627;
493    float uvcosf, uvsinf;
494    float yco, yoff;
495    float uco[3], vco[3];
496    float bright, cont, sat;
497    int ref = pPriv->transform_index;
498    float ucscale = 0.25, vcscale = 0.25;
499    Bool needux8 = FALSE, needvx8 = FALSE;
500    ACCEL_PREAMBLE();
501
502#ifdef XF86DRM_MODE
503    if (info->cs) {
504	int ret;
505
506	radeon_cs_space_reset_bos(info->cs);
507        radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
508
509	if (pPriv->bicubic_enabled)
510	    radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
511
512	driver_priv = exaGetPixmapDriverPrivate(pPixmap);
513	radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
514
515	ret = radeon_cs_space_check(info->cs);
516	if (ret) {
517	    ErrorF("Not enough RAM to hw accel xv operation\n");
518	    return;
519	}
520    }
521#endif
522
523    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
524
525#ifdef USE_EXA
526    if (info->useEXA) {
527	dst_pitch = exaGetPixmapPitch(pPixmap);
528    } else
529#endif
530    {
531	dst_pitch = pPixmap->devKind;
532    }
533
534#ifdef COMPOSITE
535    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
536    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
537#else
538    dstxoff = 0;
539    dstyoff = 0;
540#endif
541
542#ifdef USE_EXA
543    if (info->useEXA) {
544	RADEON_SWITCH_TO_3D();
545    } else
546#endif
547    {
548	BEGIN_ACCEL(2);
549	OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
550	/* We must wait for 3d to idle, in case source was just written as a dest. */
551	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
552		      RADEON_WAIT_HOST_IDLECLEAN |
553		      RADEON_WAIT_2D_IDLECLEAN |
554		      RADEON_WAIT_3D_IDLECLEAN |
555		      RADEON_WAIT_DMA_GUI_IDLE);
556	FINISH_ACCEL();
557
558	if (!info->accel_state->XInited3D)
559	    RADEONInit3DEngine(pScrn);
560    }
561
562    /* Same for R100/R200 */
563    switch (pPixmap->drawable.bitsPerPixel) {
564    case 16:
565	if (pPixmap->drawable.depth == 15)
566	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
567	else
568	    dst_format = RADEON_COLOR_FORMAT_RGB565;
569	break;
570    case 32:
571	dst_format = RADEON_COLOR_FORMAT_ARGB8888;
572	break;
573    default:
574	return;
575    }
576
577    if (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12) {
578	isplanar = TRUE;
579	txformat = RADEON_TXFORMAT_I8;
580    } else {
581	if (pPriv->id == FOURCC_UYVY)
582	    txformat = RADEON_TXFORMAT_YVYU422;
583	else
584	    txformat = RADEON_TXFORMAT_VYUY422;
585    }
586
587    txformat |= RADEON_TXFORMAT_NON_POWER2;
588
589    colorpitch = dst_pitch >> pixel_shift;
590
591    if (RADEONTilingEnabled(pScrn, pPixmap))
592	colorpitch |= RADEON_COLOR_TILE_ENABLE;
593
594    BEGIN_ACCEL_RELOC(4,2);
595
596    OUT_ACCEL_REG(RADEON_RB3D_CNTL, dst_format);
597    EMIT_WRITE_OFFSET(RADEON_RB3D_COLOROFFSET, 0, pPixmap);
598    EMIT_COLORPITCH(RADEON_RB3D_COLORPITCH, colorpitch, pPixmap);
599
600    OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
601		  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
602
603    FINISH_ACCEL();
604
605    txfilter =  R200_MAG_FILTER_LINEAR |
606	R200_MIN_FILTER_LINEAR |
607	R200_CLAMP_S_CLAMP_LAST |
608	R200_CLAMP_T_CLAMP_LAST;
609
610    /* contrast can cause constant overflow, clamp */
611    cont = RTFContrast(pPriv->contrast);
612    if (cont * trans[ref].RefLuma > 2.0)
613	cont = 2.0 / trans[ref].RefLuma;
614    /* brightness is only from -0.5 to 0.5 should be safe */
615    bright = RTFBrightness(pPriv->brightness);
616    /* saturation can also cause overflow, clamp */
617    sat = RTFSaturation(pPriv->saturation);
618    if (sat * trans[ref].RefBCb > 4.0)
619	sat = 4.0 / trans[ref].RefBCb;
620    uvcosf = sat * cos(RTFHue(pPriv->hue));
621    uvsinf = sat * sin(RTFHue(pPriv->hue));
622
623    yco = trans[ref].RefLuma * cont;
624    uco[0] = -trans[ref].RefRCr * uvsinf;
625    uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
626    uco[2] = trans[ref].RefBCb * uvcosf;
627    vco[0] = trans[ref].RefRCr * uvcosf;
628    vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
629    vco[2] = trans[ref].RefBCb * uvsinf;
630    yoff = Loff * yco + bright;
631
632    if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
633	needux8 = TRUE;
634	ucscale = 0.125;
635    }
636    if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
637	needvx8 = TRUE;
638	vcscale = 0.125;
639    }
640
641    txoffset = info->cs ? 0 : pPriv->src_offset;
642
643    if (isplanar) {
644	/* need 2 texcoord sets (even though they are identical) due
645	   to denormalization! hw apparently can't premultiply
646	   same coord set by different texture size */
647	vtx_count = 6;
648
649	txsize = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
650		  (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
651	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
652	txpitch -= 32;
653
654	BEGIN_ACCEL_RELOC(36, 3);
655
656	OUT_ACCEL_REG(RADEON_PP_CNTL,
657		      RADEON_TEX_0_ENABLE | RADEON_TEX_1_ENABLE | RADEON_TEX_2_ENABLE |
658		      RADEON_TEX_BLEND_0_ENABLE |
659		      RADEON_TEX_BLEND_1_ENABLE |
660		      RADEON_TEX_BLEND_2_ENABLE);
661
662	OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
663	OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
664		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT) |
665		      (2 << R200_VTX_TEX1_COMP_CNT_SHIFT));
666
667	OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
668	OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
669	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
670	OUT_ACCEL_REG(R200_PP_TXSIZE_0,
671		      (pPriv->w - 1) |
672		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
673	OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
674	OUT_TEXTURE_REG(R200_PP_TXOFFSET_0, txoffset, src_bo);
675
676	OUT_ACCEL_REG(R200_PP_TXFILTER_1, txfilter);
677	OUT_ACCEL_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
678	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_1, 0);
679	OUT_ACCEL_REG(R200_PP_TXSIZE_1, txsize);
680	OUT_ACCEL_REG(R200_PP_TXPITCH_1, txpitch);
681	OUT_TEXTURE_REG(R200_PP_TXOFFSET_1, txoffset + pPriv->planeu_offset, src_bo);
682
683	OUT_ACCEL_REG(R200_PP_TXFILTER_2, txfilter);
684	OUT_ACCEL_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
685	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_2, 0);
686	OUT_ACCEL_REG(R200_PP_TXSIZE_2, txsize);
687	OUT_ACCEL_REG(R200_PP_TXPITCH_2, txpitch);
688	OUT_TEXTURE_REG(R200_PP_TXOFFSET_2, txoffset + pPriv->planev_offset, src_bo);
689
690	/* similar to r300 code. Note the big problem is that hardware constants
691	 * are 8 bits only, representing 0.0-1.0. We can get that up (using bias
692	 * + scale) to -1.0-1.0 (but precision will suffer). AFAIK the hw actually
693	 * has 12 bits fractional precision (plus 1 sign bit, 3 range bits) but
694	 * the constants not. To get larger range can use output scale, but for
695	 * that 2.018 value we need a total scale by 8, which means the constants
696	 * really have no accuracy whatsoever (5 fractional bits only).
697	 * The only direct way to get high  precision "constants" into the fragment
698	 * pipe I know of is to use the texcoord interpolator (not color, this one
699	 * is 8 bit only too), which seems a bit expensive. We're lucky though it
700	 * seems the values we need seem to fit better than worst case (get about
701	 * 6 fractional bits for this instead of 5, at least when not correcting for
702	 * hue/saturation/contrast/brightness, which is the same as for vco - yco and
703	 * yoff get 8 fractional bits). Try to preserve as much accuracy as possible
704	 * even with non-default saturation/hue/contrast/brightness adjustments,
705	 * it gets a little crazy and ultimately precision might still be lacking.
706	 *
707	 * A higher precision (8 fractional bits) version might just put uco into
708	 * a texcoord, and calculate a new vcoconst in the shader, like so:
709	 * cohelper = {1.0, 0.0, 0.0} - shouldn't use 0.5 since not exactly representable
710	 * vco = {1.5958 - 1.0, -0.8129 + 1.0, 1.0}
711	 * vcocalc = ADD temp, bias/scale(cohelper), vco
712	 * would in total use 4 tex units, 4 instructions which seems fairly
713	 * balanced for this architecture (instead of 3 + 3 for the solution here)
714	 *
715	 * temp = MAD(yco, yuv.yyyy, yoff)
716	 * temp = MAD(uco, yuv.uuuu, temp)
717	 * result = MAD(vco, yuv.vvvv, temp)
718	 *
719	 * note first mad produces actually scalar, hence we transform
720	 * it into a dp2a to get 8 bit precision of yco instead of 7 -
721	 * That's assuming hw correctly expands consts to internal precision.
722	 * (y * 1 + y * (yco - 1) + yoff)
723	 * temp = DP2A / 2 (yco, yuv.yyyy, yoff)
724	 * temp = MAD (uco / 4, yuv.uuuu * 2, temp)
725	 * result = MAD x2 (vco / 2, yuv.vvvv, temp)
726	 *
727	 * vco, uco need bias (and hence scale too)
728	 *
729	 */
730
731	/* MAD temp0 / 2, const0.a * 2, temp0, -const0.rgb */
732	OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
733		      R200_TXC_ARG_A_TFACTOR_COLOR |
734		      R200_TXC_ARG_B_R0_COLOR |
735		      R200_TXC_ARG_C_TFACTOR_COLOR |
736		      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
737		      R200_TXC_OP_DOT2_ADD);
738	OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
739		      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
740		      R200_TXC_SCALE_INV2 |
741		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
742	OUT_ACCEL_REG(R200_PP_TXABLEND_0,
743		      R200_TXA_ARG_A_ZERO |
744		      R200_TXA_ARG_B_ZERO |
745		      R200_TXA_ARG_C_ZERO |
746		      R200_TXA_OP_MADD);
747	OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
748		      R200_TXA_OUTPUT_REG_NONE);
749
750	/* MAD temp0, (const1 - 0.5) * 2, (temp1 - 0.5) * 2, temp0 */
751	OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
752		      R200_TXC_ARG_A_TFACTOR_COLOR |
753		      R200_TXC_BIAS_ARG_A |
754		      R200_TXC_SCALE_ARG_A |
755		      R200_TXC_ARG_B_R1_COLOR |
756		      R200_TXC_BIAS_ARG_B |
757		      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
758		      R200_TXC_ARG_C_R0_COLOR |
759		      R200_TXC_OP_MADD);
760	OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
761		      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
762		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
763	OUT_ACCEL_REG(R200_PP_TXABLEND_1,
764		      R200_TXA_ARG_A_ZERO |
765		      R200_TXA_ARG_B_ZERO |
766		      R200_TXA_ARG_C_ZERO |
767		      R200_TXA_OP_MADD);
768	OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
769		      R200_TXA_OUTPUT_REG_NONE);
770
771	/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp2 - 0.5), temp0 */
772	OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
773		      R200_TXC_ARG_A_TFACTOR_COLOR |
774		      R200_TXC_BIAS_ARG_A |
775		      R200_TXC_SCALE_ARG_A |
776		      R200_TXC_ARG_B_R2_COLOR |
777		      R200_TXC_BIAS_ARG_B |
778		      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
779		      R200_TXC_ARG_C_R0_COLOR |
780		      R200_TXC_OP_MADD);
781	OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
782		      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
783		      R200_TXC_SCALE_2X |
784		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
785	OUT_ACCEL_REG(R200_PP_TXABLEND_2,
786		      R200_TXA_ARG_A_ZERO |
787		      R200_TXA_ARG_B_ZERO |
788		      R200_TXA_ARG_C_ZERO |
789		      R200_TXA_COMP_ARG_C |
790		      R200_TXA_OP_MADD);
791	OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
792		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
793
794	/* shader constants */
795	OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
796						      yco > 1.0 ? yco - 1.0: yco,
797						      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
798						      0.0));
799	OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
800						      uco[1] * ucscale + 0.5, /* or [-2, 2] */
801						      uco[2] * ucscale + 0.5,
802						      0.0));
803	OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
804						      vco[1] * vcscale + 0.5, /* or [-4, 4] */
805						      vco[2] * vcscale + 0.5,
806						      0.0));
807
808	FINISH_ACCEL();
809    } else {
810	vtx_count = 4;
811
812	BEGIN_ACCEL_RELOC(24, 1);
813
814	OUT_ACCEL_REG(RADEON_PP_CNTL,
815		      RADEON_TEX_0_ENABLE |
816		      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
817		      RADEON_TEX_BLEND_2_ENABLE);
818
819	OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
820	OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
821		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
822
823	OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
824	OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
825	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
826	OUT_ACCEL_REG(R200_PP_TXSIZE_0,
827		      (pPriv->w - 1) |
828		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
829	OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
830	OUT_TEXTURE_REG(R200_PP_TXOFFSET_0, txoffset, src_bo);
831
832	/* MAD temp1 / 2, const0.a * 2, temp0.ggg, -const0.rgb */
833	OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
834		      R200_TXC_ARG_A_TFACTOR_COLOR |
835		      R200_TXC_ARG_B_R0_COLOR |
836		      R200_TXC_ARG_C_TFACTOR_COLOR |
837		      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
838		      R200_TXC_OP_DOT2_ADD);
839	OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
840		      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
841		      R200_TXC_SCALE_INV2 |
842		      (R200_TXC_REPL_GREEN << R200_TXC_REPL_ARG_B_SHIFT) |
843		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
844	OUT_ACCEL_REG(R200_PP_TXABLEND_0,
845		      R200_TXA_ARG_A_ZERO |
846		      R200_TXA_ARG_B_ZERO |
847		      R200_TXA_ARG_C_ZERO |
848		      R200_TXA_OP_MADD);
849	OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
850		      R200_TXA_OUTPUT_REG_NONE);
851
852	/* MAD temp1, (const1 - 0.5) * 2, (temp0.rrr - 0.5) * 2, temp1 */
853	OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
854		      R200_TXC_ARG_A_TFACTOR_COLOR |
855		      R200_TXC_BIAS_ARG_A |
856		      R200_TXC_SCALE_ARG_A |
857		      R200_TXC_ARG_B_R0_COLOR |
858		      R200_TXC_BIAS_ARG_B |
859		      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
860		      R200_TXC_ARG_C_R1_COLOR |
861		      R200_TXC_OP_MADD);
862	OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
863		      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
864		      (R200_TXC_REPL_BLUE << R200_TXC_REPL_ARG_B_SHIFT) |
865		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
866	OUT_ACCEL_REG(R200_PP_TXABLEND_1,
867		      R200_TXA_ARG_A_ZERO |
868		      R200_TXA_ARG_B_ZERO |
869		      R200_TXA_ARG_C_ZERO |
870		      R200_TXA_OP_MADD);
871	OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
872		      R200_TXA_OUTPUT_REG_NONE);
873
874	/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp0.bbb - 0.5), temp1 */
875	OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
876		      R200_TXC_ARG_A_TFACTOR_COLOR |
877		      R200_TXC_BIAS_ARG_A |
878		      R200_TXC_SCALE_ARG_A |
879		      R200_TXC_ARG_B_R0_COLOR |
880		      R200_TXC_BIAS_ARG_B |
881		      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
882		      R200_TXC_ARG_C_R1_COLOR |
883		      R200_TXC_OP_MADD);
884	OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
885		      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
886		      R200_TXC_SCALE_2X |
887		      (R200_TXC_REPL_RED << R200_TXC_REPL_ARG_B_SHIFT) |
888		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
889	OUT_ACCEL_REG(R200_PP_TXABLEND_2,
890		      R200_TXA_ARG_A_ZERO |
891		      R200_TXA_ARG_B_ZERO |
892		      R200_TXA_ARG_C_ZERO |
893		      R200_TXA_COMP_ARG_C |
894		      R200_TXA_OP_MADD);
895	OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
896		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
897
898	/* shader constants */
899	OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
900						      yco > 1.0 ? yco - 1.0: yco,
901						      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
902						      0.0));
903	OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
904						      uco[1] * ucscale + 0.5, /* or [-2, 2] */
905						      uco[2] * ucscale + 0.5,
906						      0.0));
907	OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
908						      vco[1] * vcscale + 0.5, /* or [-4, 4] */
909						      vco[2] * vcscale + 0.5,
910						      0.0));
911
912	FINISH_ACCEL();
913    }
914
915    {
916      int scissor_w, scissor_h;
917      scissor_w = MIN(pPixmap->drawable.width, 2047);
918      scissor_h = MIN(pPixmap->drawable.height, 2047);
919      BEGIN_ACCEL(2);
920      OUT_ACCEL_REG(RADEON_RE_TOP_LEFT, 0);
921      OUT_ACCEL_REG(RADEON_RE_WIDTH_HEIGHT, ((scissor_w << RADEON_RE_WIDTH_SHIFT) |
922					     (scissor_h << RADEON_RE_HEIGHT_SHIFT)));
923    }
924    FINISH_ACCEL();
925
926    if (pPriv->vsync) {
927	xf86CrtcPtr crtc;
928	if (pPriv->desired_crtc)
929	    crtc = pPriv->desired_crtc;
930	else
931	    crtc = radeon_pick_best_crtc(pScrn,
932					 pPriv->drw_x,
933					 pPriv->drw_x + pPriv->dst_w,
934					 pPriv->drw_y,
935					 pPriv->drw_y + pPriv->dst_h);
936	if (crtc)
937	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
938					  crtc,
939					  pPriv->drw_y - crtc->y,
940					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
941    }
942    /*
943     * Rendering of the actual polygon is done in two different
944     * ways depending on chip generation:
945     *
946     * < R300:
947     *
948     *     These chips can render a rectangle in one pass, so
949     *     handling is pretty straight-forward.
950     *
951     * >= R300:
952     *
953     *     These chips can accept a quad, but will render it as
954     *     two triangles, which results in a diagonal tear. Instead,
955     *     we render a single, large triangle and use the scissor
956     *     functionality to restrict it to the desired rectangle.
957     *     Due to guardband limits on r3xx/r4xx, we can only use
958     *     the single triangle up to 2560/4021 pixels; above that we
959     *     render as a quad.
960     */
961
962#ifdef ACCEL_CP
963	BEGIN_RING(nBox * 3 * vtx_count + 4);
964	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
965			    nBox * 3 * vtx_count));
966	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
967		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
968		 ((nBox * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
969#else /* ACCEL_CP */
970	BEGIN_ACCEL(nBox * 3 * vtx_count + 2);
971	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
972					  RADEON_VF_PRIM_WALK_DATA |
973					  ((nBox * 3) << RADEON_VF_NUM_VERTICES_SHIFT)));
974
975#endif
976
977    while (nBox--) {
978	int srcX, srcY, srcw, srch;
979	int dstX, dstY, dstw, dsth;
980	dstX = pBox->x1 + dstxoff;
981	dstY = pBox->y1 + dstyoff;
982	dstw = pBox->x2 - pBox->x1;
983	dsth = pBox->y2 - pBox->y1;
984
985	srcX = pPriv->src_x;
986	srcX += ((pBox->x1 - pPriv->drw_x) *
987		 pPriv->src_w) / pPriv->dst_w;
988	srcY = pPriv->src_y;
989	srcY += ((pBox->y1 - pPriv->drw_y) *
990		 pPriv->src_h) / pPriv->dst_h;
991
992	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
993	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
994
995	if (isplanar) {
996	    /*
997	     * Just render a rect (using three coords).
998	     */
999	    VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
1000		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
1001		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
1002	    VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
1003		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
1004		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
1005	    VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
1006		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
1007		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
1008	} else {
1009	    /*
1010	     * Just render a rect (using three coords).
1011	     */
1012	    VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
1013		      (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
1014	    VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
1015		      (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
1016	    VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
1017		      (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
1018	}
1019
1020	pBox++;
1021    }
1022
1023    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
1024
1025#ifdef ACCEL_CP
1026	ADVANCE_RING();
1027#else
1028	FINISH_ACCEL();
1029#endif /* !ACCEL_CP */
1030
1031    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
1032}
1033
1034static void
1035FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
1036{
1037    RADEONInfoPtr info = RADEONPTR(pScrn);
1038    PixmapPtr pPixmap = pPriv->pPixmap;
1039    struct radeon_exa_pixmap_priv *driver_priv;
1040    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
1041    uint32_t txfilter, txformat0, txformat1, txoffset, txpitch;
1042    uint32_t dst_pitch, dst_format;
1043    uint32_t txenable, colorpitch, bicubic_offset;
1044    uint32_t output_fmt;
1045    Bool isplanar = FALSE;
1046    int dstxoff, dstyoff, pixel_shift, vtx_count;
1047    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
1048    int nBox = REGION_NUM_RECTS(&pPriv->clip);
1049    ACCEL_PREAMBLE();
1050
1051#ifdef XF86DRM_MODE
1052    if (info->cs) {
1053	int ret;
1054
1055	radeon_cs_space_reset_bos(info->cs);
1056	radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
1057
1058	if (pPriv->bicubic_enabled)
1059	  radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
1060
1061	driver_priv = exaGetPixmapDriverPrivate(pPixmap);
1062	radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
1063
1064	ret = radeon_cs_space_check(info->cs);
1065	if (ret) {
1066	    ErrorF("Not enough RAM to hw accel xv operation\n");
1067	    return;
1068	}
1069    }
1070#endif
1071
1072    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
1073
1074#ifdef USE_EXA
1075    if (info->useEXA) {
1076	dst_pitch = exaGetPixmapPitch(pPixmap);
1077    } else
1078#endif
1079    {
1080	dst_pitch = pPixmap->devKind;
1081    }
1082
1083#ifdef COMPOSITE
1084    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
1085    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
1086#else
1087    dstxoff = 0;
1088    dstyoff = 0;
1089#endif
1090
1091#ifdef USE_EXA
1092    if (info->useEXA) {
1093	RADEON_SWITCH_TO_3D();
1094    } else
1095#endif
1096    {
1097	BEGIN_ACCEL(2);
1098	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
1099	/* We must wait for 3d to idle, in case source was just written as a dest. */
1100	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
1101		      RADEON_WAIT_HOST_IDLECLEAN |
1102		      RADEON_WAIT_2D_IDLECLEAN |
1103		      RADEON_WAIT_3D_IDLECLEAN |
1104		      RADEON_WAIT_DMA_GUI_IDLE);
1105	FINISH_ACCEL();
1106
1107	if (!info->accel_state->XInited3D)
1108	    RADEONInit3DEngine(pScrn);
1109    }
1110
1111    if (pPriv->bicubic_enabled)
1112	vtx_count = 6;
1113    else
1114	vtx_count = 4;
1115
1116    switch (pPixmap->drawable.bitsPerPixel) {
1117    case 16:
1118	if (pPixmap->drawable.depth == 15)
1119	    dst_format = R300_COLORFORMAT_ARGB1555;
1120	else
1121	    dst_format = R300_COLORFORMAT_RGB565;
1122	break;
1123    case 32:
1124	dst_format = R300_COLORFORMAT_ARGB8888;
1125	break;
1126    default:
1127	return;
1128    }
1129
1130    output_fmt = (R300_OUT_FMT_C4_8 |
1131		  R300_OUT_FMT_C0_SEL_BLUE |
1132		  R300_OUT_FMT_C1_SEL_GREEN |
1133		  R300_OUT_FMT_C2_SEL_RED |
1134		  R300_OUT_FMT_C3_SEL_ALPHA);
1135
1136    colorpitch = dst_pitch >> pixel_shift;
1137    colorpitch |= dst_format;
1138
1139    if (RADEONTilingEnabled(pScrn, pPixmap))
1140	colorpitch |= R300_COLORTILE;
1141
1142
1143    if (((pPriv->bicubic_state == BICUBIC_OFF)) &&
1144	(pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12))
1145	isplanar = TRUE;
1146
1147    if (isplanar) {
1148	txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
1149	txpitch = pPriv->src_pitch;
1150    } else {
1151	if (pPriv->id == FOURCC_UYVY)
1152	    txformat1 = R300_TX_FORMAT_YVYU422;
1153	else
1154	    txformat1 = R300_TX_FORMAT_VYUY422;
1155
1156	if (pPriv->bicubic_state != BICUBIC_OFF)
1157	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
1158
1159	/* pitch is in pixels */
1160	txpitch = pPriv->src_pitch / 2;
1161    }
1162    txpitch -= 1;
1163
1164    txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
1165		 (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
1166		 R300_TXPITCH_EN);
1167
1168    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
1169		R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
1170		R300_TX_MAG_FILTER_LINEAR |
1171		R300_TX_MIN_FILTER_LINEAR |
1172		(0 << R300_TX_ID_SHIFT));
1173
1174    txoffset = info->cs ? 0 : pPriv->src_offset;
1175
1176    BEGIN_ACCEL_RELOC(6, 1);
1177    OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter);
1178    OUT_ACCEL_REG(R300_TX_FILTER1_0, 0);
1179    OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0);
1180    if (isplanar)
1181	OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1 | R300_TX_FORMAT_CACHE_HALF_REGION_0);
1182    else
1183	OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
1184    OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch);
1185    OUT_TEXTURE_REG(R300_TX_OFFSET_0, txoffset, src_bo);
1186    FINISH_ACCEL();
1187
1188    txenable = R300_TEX_0_ENABLE;
1189
1190    if (isplanar) {
1191	txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
1192		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
1193		     R300_TXPITCH_EN);
1194	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
1195	txpitch -= 1;
1196	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
1197		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
1198		    R300_TX_MIN_FILTER_LINEAR |
1199		    R300_TX_MAG_FILTER_LINEAR);
1200
1201	BEGIN_ACCEL_RELOC(12, 2);
1202	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
1203	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
1204	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
1205	OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2);
1206	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
1207	OUT_TEXTURE_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset, src_bo);
1208	OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
1209	OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
1210	OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
1211	OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3);
1212	OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
1213	OUT_TEXTURE_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset, src_bo);
1214	FINISH_ACCEL();
1215	txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
1216    }
1217
1218    if (pPriv->bicubic_enabled) {
1219	/* Size is 128x1 */
1220	txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
1221		     (0x0 << R300_TXHEIGHT_SHIFT) |
1222		     R300_TXPITCH_EN);
1223	/* Format is 32-bit floats, 4bpp */
1224	txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
1225	/* Pitch is 127 (128-1) */
1226	txpitch = 0x7f;
1227	/* Tex filter */
1228	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
1229		    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
1230		    R300_TX_MIN_FILTER_NEAREST |
1231		    R300_TX_MAG_FILTER_NEAREST |
1232		    (1 << R300_TX_ID_SHIFT));
1233
1234	if (info->cs)
1235	    bicubic_offset = 0;
1236	else
1237	    bicubic_offset = pPriv->bicubic_src_offset;
1238
1239	BEGIN_ACCEL_RELOC(6, 1);
1240	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter);
1241	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
1242	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
1243	OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1);
1244	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
1245	OUT_TEXTURE_REG(R300_TX_OFFSET_1, bicubic_offset, info->bicubic_bo);
1246	FINISH_ACCEL();
1247
1248	/* Enable tex 1 */
1249	txenable |= R300_TEX_1_ENABLE;
1250    }
1251
1252    /* setup the VAP */
1253    if (info->accel_state->has_tcl) {
1254	if (pPriv->bicubic_enabled)
1255	    BEGIN_ACCEL(7);
1256	else
1257	    BEGIN_ACCEL(6);
1258    } else {
1259	if (pPriv->bicubic_enabled)
1260	    BEGIN_ACCEL(5);
1261	else
1262	    BEGIN_ACCEL(4);
1263    }
1264
1265    /* These registers define the number, type, and location of data submitted
1266     * to the PVS unit or GA input (when PVS is disabled).
1267     * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
1268     * enabled.  This memory provides the inputs to the vertex shader program
1269     * and ordering is not important.  When PVS/TCL is disabled, this field maps
1270     * directly to the GA input memory and the order is significant.  In
1271     * PVS_BYPASS mode the order is as follows:
1272     * Position
1273     * Point Size
1274     * Color 0-3
1275     * Textures 0-7
1276     * Fog
1277     */
1278    if (pPriv->bicubic_enabled) {
1279	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
1280		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
1281		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
1282		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
1283		       R300_SIGNED_0 |
1284		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
1285		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
1286		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
1287		       R300_SIGNED_1));
1288	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1,
1289		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
1290		       (0 << R300_SKIP_DWORDS_2_SHIFT) |
1291		       (7 << R300_DST_VEC_LOC_2_SHIFT) |
1292		       R300_LAST_VEC_2 |
1293		       R300_SIGNED_2));
1294    } else {
1295	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
1296		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
1297		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
1298		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
1299		       R300_SIGNED_0 |
1300		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
1301		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
1302		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
1303		       R300_LAST_VEC_1 |
1304		       R300_SIGNED_1));
1305    }
1306
1307    /* load the vertex shader
1308     * We pre-load vertex programs in RADEONInit3DEngine():
1309     * - exa
1310     * - Xv
1311     * - Xv bicubic
1312     * Here we select the offset of the vertex program we want to use
1313     */
1314    if (info->accel_state->has_tcl) {
1315	if (pPriv->bicubic_enabled) {
1316	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
1317			  ((11 << R300_PVS_FIRST_INST_SHIFT) |
1318			   (13 << R300_PVS_XYZW_VALID_INST_SHIFT) |
1319			   (13 << R300_PVS_LAST_INST_SHIFT)));
1320	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
1321			  (13 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
1322	} else {
1323	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
1324			  ((9 << R300_PVS_FIRST_INST_SHIFT) |
1325			   (10 << R300_PVS_XYZW_VALID_INST_SHIFT) |
1326			   (10 << R300_PVS_LAST_INST_SHIFT)));
1327	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
1328			  (10 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
1329	}
1330    }
1331
1332    /* Position and one set of 2 texture coordinates */
1333    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
1334    if (pPriv->bicubic_enabled)
1335	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
1336					       (2 << R300_TEX_1_COMP_CNT_SHIFT)));
1337    else
1338	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
1339
1340    OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt);
1341    FINISH_ACCEL();
1342
1343    /* setup pixel shader */
1344    if (pPriv->bicubic_state != BICUBIC_OFF) {
1345	if (pPriv->bicubic_enabled) {
1346	    BEGIN_ACCEL(79);
1347
1348	    /* 4 components: 2 for tex0 and 2 for tex1 */
1349	    OUT_ACCEL_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1350					  R300_RS_COUNT_HIRES_EN));
1351
1352	    /* R300_INST_COUNT_RS - highest RS instruction used */
1353	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1));
1354
1355	    /* Pixel stack frame size. */
1356	    OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
1357
1358	    /* Indirection levels */
1359	    OUT_ACCEL_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) |
1360					   R300_FIRST_TEX));
1361
1362	    /* Set nodes. */
1363	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1364						R300_ALU_CODE_SIZE(14) |
1365						R300_TEX_CODE_OFFSET(0) |
1366						R300_TEX_CODE_SIZE(6)));
1367
1368	    /* Nodes are allocated highest first, but executed lowest first */
1369	    OUT_ACCEL_REG(R300_US_CODE_ADDR_0, 0);
1370	    OUT_ACCEL_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) |
1371						R300_ALU_SIZE(0) |
1372						R300_TEX_START(0) |
1373						R300_TEX_SIZE(0)));
1374	    OUT_ACCEL_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) |
1375						R300_ALU_SIZE(9) |
1376						R300_TEX_START(1) |
1377						R300_TEX_SIZE(0)));
1378	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) |
1379						R300_ALU_SIZE(2) |
1380						R300_TEX_START(2) |
1381						R300_TEX_SIZE(3) |
1382						R300_RGBA_OUT));
1383
1384	    /* ** BICUBIC FP ** */
1385
1386	    /* texcoord0 => temp0
1387	     * texcoord1 => temp1 */
1388
1389	    // first node
1390	    /* TEX temp2, temp1.rrr0, tex1, 1D */
1391	    OUT_ACCEL_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) |
1392						R300_TEX_ID(1) |
1393						R300_TEX_SRC_ADDR(1) |
1394						R300_TEX_DST_ADDR(2)));
1395
1396	    /* MOV temp1.r, temp1.ggg0 */
1397	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1398						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1399						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1400						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1401	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) |
1402						    R300_ALU_RGB_ADDRD(1) |
1403						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
1404	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1405						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1406						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1407						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1408	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) |
1409						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1410
1411
1412	    // second node
1413	    /* TEX temp1, temp1, tex1, 1D */
1414	    OUT_ACCEL_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) |
1415						R300_TEX_ID(1) |
1416						R300_TEX_SRC_ADDR(1) |
1417						R300_TEX_DST_ADDR(1)));
1418
1419	    /* MUL temp3.rg, temp2.ggg0, const0.rgb0 */
1420	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1421						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1422						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1423						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1424	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) |
1425						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
1426						    R300_ALU_RGB_ADDRD(3) |
1427						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1428	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1429						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1430						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1431						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1432	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) |
1433						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1434
1435
1436	    /* MUL temp2.rg, temp2.rrr0, const0.rgb */
1437	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1438						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1439						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1440						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
1441	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) |
1442						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
1443						    R300_ALU_RGB_ADDRD(2) |
1444						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1445	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1446						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1447						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1448						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1449	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) |
1450						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1451
1452	    /* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */
1453	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1454						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1455						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1456						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1457	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) |
1458						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1459						    R300_ALU_RGB_ADDR2(3) |
1460						    R300_ALU_RGB_ADDRD(4) |
1461						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1462	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1463						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1464						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1465						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1466	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) |
1467						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1468
1469	    /* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */
1470	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1471						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
1472						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1473						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1474	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) |
1475						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1476						    R300_ALU_RGB_ADDR2(2) |
1477						    R300_ALU_RGB_ADDRD(5) |
1478						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1479	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1480						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1481						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1482						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1483	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) |
1484						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1485
1486	    /* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */
1487	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1488						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1489						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1490						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1491	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) |
1492						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1493						    R300_ALU_RGB_ADDR2(3) |
1494						    R300_ALU_RGB_ADDRD(3) |
1495						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1496	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1497						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1498						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1499						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1500	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) |
1501						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1502
1503	    /* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */
1504	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1505						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
1506						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1507						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1508	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) |
1509						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
1510						    R300_ALU_RGB_ADDR2(2) |
1511						    R300_ALU_RGB_ADDRD(1) |
1512						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1513	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1514						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1515						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1516						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1517	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) |
1518						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1519
1520	    /* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */
1521	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1522						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1523						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1524						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1525	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
1526						    R300_ALU_RGB_ADDR2(1) |
1527						    R300_ALU_RGB_ADDRD(1) |
1528						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1529	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1530						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1531						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1532						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1533	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) |
1534						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1535
1536	    /* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */
1537	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1538						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1539						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1540						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1541	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
1542						    R300_ALU_RGB_ADDR2(3) |
1543						    R300_ALU_RGB_ADDRD(2) |
1544						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1545	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1546						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1547						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1548						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1549	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) |
1550						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1551
1552	    /* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */
1553	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1554						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1555						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1556						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1557	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
1558						    R300_ALU_RGB_ADDR2(5) |
1559						    R300_ALU_RGB_ADDRD(3) |
1560						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1561	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1562						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1563						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1564						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1565	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) |
1566						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1567
1568	    /* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */
1569	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1570						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1571						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1572						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
1573	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) |
1574						     R300_ALU_RGB_ADDR2(4) |
1575						     R300_ALU_RGB_ADDRD(0) |
1576						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
1577	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1578						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1579						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1580						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1581	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) |
1582						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1583
1584
1585	    // third node
1586	    /* TEX temp4, temp1.rg--, tex0, 1D */
1587	    OUT_ACCEL_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) |
1588						R300_TEX_ID(0) |
1589						R300_TEX_SRC_ADDR(1) |
1590						R300_TEX_DST_ADDR(4)));
1591
1592	    /* TEX temp3, temp3.rg--, tex0, 1D */
1593	    OUT_ACCEL_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) |
1594						R300_TEX_ID(0) |
1595						R300_TEX_SRC_ADDR(3) |
1596						R300_TEX_DST_ADDR(3)));
1597
1598	    /* TEX temp5, temp2.rg--, tex0, 1D */
1599	    OUT_ACCEL_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) |
1600						R300_TEX_ID(0) |
1601						R300_TEX_SRC_ADDR(2) |
1602						R300_TEX_DST_ADDR(5)));
1603
1604	    /* TEX temp0, temp0.rg--, tex0, 1D */
1605	    OUT_ACCEL_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) |
1606						R300_TEX_ID(0) |
1607						R300_TEX_SRC_ADDR(0) |
1608						R300_TEX_DST_ADDR(0)));
1609
1610	    /* LRP temp3, temp1.bbbb, temp4, temp3 ->
1611	     * - PRESUB temps, temp4 - temp3
1612	     * - MAD temp3, temp1.bbbb, temps, temp3 */
1613	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1614						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1615						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1616						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1617						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
1618	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) |
1619						     R300_ALU_RGB_ADDR1(4) |
1620						     R300_ALU_RGB_ADDR2(1) |
1621						     R300_ALU_RGB_ADDRD(3) |
1622						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1623	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1624						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1625						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1626						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1627	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) |
1628						       R300_ALU_ALPHA_ADDR1(4) |
1629						       R300_ALU_ALPHA_ADDR2(1) |
1630						       R300_ALU_ALPHA_ADDRD(3) |
1631						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
1632
1633	    /* LRP temp0, temp1.bbbb, temp5, temp0 ->
1634	     * - PRESUB temps, temp5 - temp0
1635	     * - MAD temp0, temp1.bbbb, temps, temp0 */
1636	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1637						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1638						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1639						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1640						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) |
1641						     R300_ALU_RGB_INSERT_NOP));
1642	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) |
1643						     R300_ALU_RGB_ADDR1(5) |
1644						     R300_ALU_RGB_ADDR2(1) |
1645						     R300_ALU_RGB_ADDRD(0) |
1646						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1647	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1648						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1649						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1650						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1651	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) |
1652						       R300_ALU_ALPHA_ADDR1(5) |
1653						       R300_ALU_ALPHA_ADDR2(1) |
1654						       R300_ALU_ALPHA_ADDRD(0) |
1655						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
1656
1657	    /* LRP output, temp2.bbbb, temp3, temp0 ->
1658	     * - PRESUB temps, temp3 - temp0
1659	     * - MAD output, temp2.bbbb, temps, temp0 */
1660	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1661						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
1662						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
1663						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1664						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
1665	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) |
1666						     R300_ALU_RGB_ADDR1(3) |
1667						     R300_ALU_RGB_ADDR2(2) |
1668						     R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
1669	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1670						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
1671						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
1672						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
1673	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) |
1674						       R300_ALU_ALPHA_ADDR1(3) |
1675						       R300_ALU_ALPHA_ADDR2(2) |
1676						       R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A)));
1677
1678	    /* Shader constants. */
1679	    OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w));
1680	    OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), 0);
1681	    OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), 0);
1682	    OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), 0);
1683
1684	    OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), 0);
1685	    OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h));
1686	    OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), 0);
1687	    OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), 0);
1688
1689	    FINISH_ACCEL();
1690	} else {
1691	    BEGIN_ACCEL(11);
1692	    /* 2 components: 2 for tex0 */
1693	    OUT_ACCEL_REG(R300_RS_COUNT,
1694                          ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1695                           R300_RS_COUNT_HIRES_EN));
1696	    /* R300_INST_COUNT_RS - highest RS instruction used */
1697	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
1698
1699	    OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
1700
1701	    /* Indirection levels */
1702	    OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
1703					   R300_FIRST_TEX));
1704
1705	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1706						R300_ALU_CODE_SIZE(1) |
1707						R300_TEX_CODE_OFFSET(0) |
1708						R300_TEX_CODE_SIZE(1)));
1709
1710	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
1711						R300_ALU_SIZE(0) |
1712						R300_TEX_START(0) |
1713						R300_TEX_SIZE(0) |
1714						R300_RGBA_OUT));
1715
1716	    /* tex inst */
1717	    OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
1718					       R300_TEX_DST_ADDR(0) |
1719					       R300_TEX_ID(0) |
1720					       R300_TEX_INST(R300_TEX_INST_LD)));
1721
1722	    /* ALU inst */
1723	    /* RGB */
1724	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) |
1725                                                   R300_ALU_RGB_ADDR1(0) |
1726                                                   R300_ALU_RGB_ADDR2(0) |
1727                                                   R300_ALU_RGB_ADDRD(0) |
1728                                                   R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R |
1729								       R300_ALU_RGB_MASK_G |
1730								       R300_ALU_RGB_MASK_B)) |
1731                                                   R300_ALU_RGB_TARGET_A));
1732	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1733                                                   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1734                                                   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
1735						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1736                                                   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
1737                                                   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1738                                                   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1739                                                   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
1740                                                   R300_ALU_RGB_CLAMP));
1741	    /* Alpha */
1742	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) |
1743						     R300_ALU_ALPHA_ADDR1(0) |
1744						     R300_ALU_ALPHA_ADDR2(0) |
1745						     R300_ALU_ALPHA_ADDRD(0) |
1746						     R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
1747						     R300_ALU_ALPHA_TARGET_A |
1748						     R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE)));
1749	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) |
1750						     R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) |
1751						     R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) |
1752						     R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) |
1753						     R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) |
1754						     R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) |
1755						     R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1756						     R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) |
1757						     R300_ALU_ALPHA_CLAMP));
1758	    FINISH_ACCEL();
1759	}
1760    } else {
1761	/*
1762	 * y' = y - .0625
1763	 * u' = u - .5
1764	 * v' = v - .5;
1765	 *
1766	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
1767	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
1768	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
1769	 *
1770	 * DP3 might look like the straightforward solution
1771	 * but we'd need to move the texture yuv values in
1772	 * the same reg for this to work. Therefore use MADs.
1773	 * Brightness just adds to the off constant.
1774	 * Contrast is multiplication of luminance.
1775	 * Saturation and hue change the u and v coeffs.
1776	 * Default values (before adjustments - depend on colorspace):
1777	 * yco = 1.1643
1778	 * uco = 0, -0.39173, 2.017
1779	 * vco = 1.5958, -0.8129, 0
1780	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
1781	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
1782	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
1783	 *
1784	 * temp = MAD(yco, yuv.yyyy, off)
1785	 * temp = MAD(uco, yuv.uuuu, temp)
1786	 * result = MAD(vco, yuv.vvvv, temp)
1787	 */
1788	/* TODO: don't recalc consts always */
1789	const float Loff = -0.0627;
1790	const float Coff = -0.502;
1791	float uvcosf, uvsinf;
1792	float yco;
1793	float uco[3], vco[3], off[3];
1794	float bright, cont, gamma;
1795	int ref = pPriv->transform_index;
1796	Bool needgamma = FALSE;
1797
1798	cont = RTFContrast(pPriv->contrast);
1799	bright = RTFBrightness(pPriv->brightness);
1800	gamma = (float)pPriv->gamma / 1000.0;
1801	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
1802	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
1803	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
1804
1805	yco = trans[ref].RefLuma * cont;
1806	uco[0] = -trans[ref].RefRCr * uvsinf;
1807	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
1808	uco[2] = trans[ref].RefBCb * uvcosf;
1809	vco[0] = trans[ref].RefRCr * uvcosf;
1810	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
1811	vco[2] = trans[ref].RefBCb * uvsinf;
1812	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
1813	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
1814	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
1815
1816	if (gamma != 1.0) {
1817	    needgamma = TRUE;
1818	    /* note: gamma correction is out = in ^ gamma;
1819	       gpu can only do LG2/EX2 therefore we transform into
1820	       in ^ gamma = 2 ^ (log2(in) * gamma).
1821	       Lots of scalar ops, unfortunately (better solution?) -
1822	       without gamma that's 3 inst, with gamma it's 10...
1823	       could use different gamma factors per channel,
1824	       if that's of any use. */
1825	}
1826
1827	if (isplanar) {
1828	    BEGIN_ACCEL(needgamma ? 28 + 33 : 33);
1829	    /* 2 components: same 2 for tex0/1/2 */
1830	    OUT_ACCEL_REG(R300_RS_COUNT,
1831			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1832			   R300_RS_COUNT_HIRES_EN));
1833	    /* R300_INST_COUNT_RS - highest RS instruction used */
1834	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
1835
1836	    OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
1837
1838	    /* Indirection levels */
1839	    OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
1840					   R300_FIRST_TEX));
1841
1842	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
1843						R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
1844						R300_TEX_CODE_OFFSET(0) |
1845						R300_TEX_CODE_SIZE(3)));
1846
1847	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
1848						R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
1849						R300_TEX_START(0) |
1850						R300_TEX_SIZE(2) |
1851						R300_RGBA_OUT));
1852
1853	    /* tex inst */
1854	    OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
1855					       R300_TEX_DST_ADDR(2) |
1856					       R300_TEX_ID(0) |
1857					       R300_TEX_INST(R300_TEX_INST_LD)));
1858	    OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
1859					       R300_TEX_DST_ADDR(1) |
1860					       R300_TEX_ID(1) |
1861					       R300_TEX_INST(R300_TEX_INST_LD)));
1862	    OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
1863					       R300_TEX_DST_ADDR(0) |
1864					       R300_TEX_ID(2) |
1865					       R300_TEX_INST(R300_TEX_INST_LD)));
1866
1867	    /* ALU inst */
1868	    /* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
1869	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
1870						    R300_ALU_RGB_ADDR1(2) |
1871						    R300_ALU_RGB_ADDR2(0) |
1872						    R300_ALU_RGB_ADDRD(2) |
1873						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1874	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
1875						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1876						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1877						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1878						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
1879						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1880						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1881						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
1882	    /* alpha nop, but need to set up alpha source for rgb usage */
1883	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
1884						      R300_ALU_ALPHA_ADDR1(2) |
1885						      R300_ALU_ALPHA_ADDR2(0) |
1886						      R300_ALU_ALPHA_ADDRD(2) |
1887						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1888	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1889						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1890						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1891						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1892
1893	    /* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
1894	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
1895						    R300_ALU_RGB_ADDR1(1) |
1896						    R300_ALU_RGB_ADDR2(2) |
1897						    R300_ALU_RGB_ADDRD(2) |
1898						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1899	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1900						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1901						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1902						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1903						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
1904						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1905						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1906						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
1907	    /* alpha nop */
1908	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(2) |
1909						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1910	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1911						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1912						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1913						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1914
1915	    /* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
1916	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
1917						    R300_ALU_RGB_ADDR1(0) |
1918						    R300_ALU_RGB_ADDR2(2) |
1919						    R300_ALU_RGB_ADDRD(0) |
1920						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
1921						    (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
1922	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1923						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1924						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
1925						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1926						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
1927						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1928						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1929						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
1930						    R300_ALU_RGB_CLAMP));
1931	    /* write alpha 1 */
1932	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
1933						      R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
1934						      R300_ALU_ALPHA_TARGET_A));
1935	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
1936						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
1937						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1938						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
1939
1940	    if (needgamma) {
1941		/* rgb temp0.r = op_sop, set up src0 reg */
1942		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
1943							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
1944		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
1945			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1946			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1947		/* alpha lg2 temp0, temp0.r */
1948		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
1949							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1950		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
1951							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
1952							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1953							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1954
1955		/* rgb temp0.g = op_sop, set up src0 reg */
1956		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
1957							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
1958		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
1959			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1960			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1961		/* alpha lg2 temp0, temp0.g */
1962		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
1963							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1964		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
1965							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
1966							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1967							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1968
1969		/* rgb temp0.b = op_sop, set up src0 reg */
1970		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
1971							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
1972		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
1973			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
1974			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
1975		/* alpha lg2 temp0, temp0.b */
1976		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
1977							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
1978		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
1979							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
1980							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
1981							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
1982
1983		/* MUL temp0.rgb, temp0.rgb, const1.aaa (const1.a = gamma) */
1984		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
1985							R300_ALU_RGB_ADDR1(0) |
1986							R300_ALU_RGB_ADDR2(0) |
1987							R300_ALU_RGB_ADDRD(0) |
1988							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
1989		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
1990							R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
1991							R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
1992							R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
1993							R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
1994							R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
1995							R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
1996							R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
1997		/* alpha nop, but set up const1 */
1998		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
1999							  R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
2000							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2001		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2002							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2003							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2004							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2005
2006		/* rgb out0.r = op_sop, set up src0 reg */
2007		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
2008							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
2009							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
2010		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7),
2011			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2012			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2013		/* alpha ex2 temp0, temp0.r */
2014		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
2015							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2016		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2017							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2018							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2019							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2020
2021		/* rgb out0.g = op_sop, set up src0 reg */
2022		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
2023							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
2024							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
2025		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
2026			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2027			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2028		/* alpha ex2 temp0, temp0.g */
2029		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
2030							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2031		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2032							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2033							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2034							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2035
2036		/* rgb out0.b = op_sop, set up src0 reg */
2037		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
2038							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
2039							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
2040		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
2041			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2042			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2043		/* alpha ex2 temp0, temp0.b */
2044		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
2045							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2046		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2047							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2048							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2049							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2050	    }
2051	} else {
2052	    BEGIN_ACCEL(needgamma ? 28 + 33 : 33);
2053	    /* 2 components */
2054	    OUT_ACCEL_REG(R300_RS_COUNT,
2055			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
2056			   R300_RS_COUNT_HIRES_EN));
2057	    /* R300_INST_COUNT_RS - highest RS instruction used */
2058	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
2059
2060	    OUT_ACCEL_REG(R300_US_PIXSIZE, 1); /* highest temp used */
2061
2062	    /* Indirection levels */
2063	    OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
2064					   R300_FIRST_TEX));
2065
2066	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
2067						R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
2068						R300_TEX_CODE_OFFSET(0) |
2069						R300_TEX_CODE_SIZE(1)));
2070
2071	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
2072						R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
2073						R300_TEX_START(0) |
2074						R300_TEX_SIZE(0) |
2075						R300_RGBA_OUT));
2076
2077	    /* tex inst */
2078	    OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
2079					       R300_TEX_DST_ADDR(0) |
2080					       R300_TEX_ID(0) |
2081					       R300_TEX_INST(R300_TEX_INST_LD)));
2082
2083	    /* ALU inst */
2084	    /* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
2085	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
2086						    R300_ALU_RGB_ADDR1(0) |
2087						    R300_ALU_RGB_ADDR2(0) |
2088						    R300_ALU_RGB_ADDRD(1) |
2089						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2090	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
2091						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2092						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_GGG) |
2093						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2094						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
2095						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2096						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2097						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2098	    /* alpha nop, but need to set up alpha source for rgb usage */
2099	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
2100						      R300_ALU_ALPHA_ADDR1(0) |
2101						      R300_ALU_ALPHA_ADDR2(0) |
2102						      R300_ALU_ALPHA_ADDRD(0) |
2103						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2104	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2105						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2106						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2107						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2108
2109	    /* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
2110	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
2111						    R300_ALU_RGB_ADDR1(0) |
2112						    R300_ALU_RGB_ADDR2(1) |
2113						    R300_ALU_RGB_ADDRD(1) |
2114						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2115	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2116						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2117						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_BBB) |
2118						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2119						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
2120						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2121						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2122						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2123	    /* alpha nop */
2124	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
2125						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2126	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2127						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2128						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2129						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2130
2131	    /* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
2132	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
2133						    R300_ALU_RGB_ADDR1(0) |
2134						    R300_ALU_RGB_ADDR2(1) |
2135						    R300_ALU_RGB_ADDRD(0) |
2136						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
2137						    (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
2138	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2139						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2140						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RRR) |
2141						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2142						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
2143						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2144						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2145						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
2146						    R300_ALU_RGB_CLAMP));
2147	    /* write alpha 1 */
2148	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
2149						      R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
2150						      R300_ALU_ALPHA_TARGET_A));
2151	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2152						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2153						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2154						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
2155
2156	    if (needgamma) {
2157		/* rgb temp0.r = op_sop, set up src0 reg */
2158		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
2159							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
2160		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
2161			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2162			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2163		/* alpha lg2 temp0, temp0.r */
2164		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
2165							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2166		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2167							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2168							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2169							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2170
2171		/* rgb temp0.g = op_sop, set up src0 reg */
2172		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
2173							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
2174		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
2175			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2176			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2177		/* alpha lg2 temp0, temp0.g */
2178		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
2179							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2180		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2181							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2182							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2183							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2184
2185		/* rgb temp0.b = op_sop, set up src0 reg */
2186		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
2187							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
2188		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
2189			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2190			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2191		/* alpha lg2 temp0, temp0.b */
2192		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
2193							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2194		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
2195							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2196							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2197							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2198
2199		/* MUL temp0.rgb, temp0.rgb, const1.aaa (const1.a = gamma) */
2200		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
2201							R300_ALU_RGB_ADDR1(0) |
2202							R300_ALU_RGB_ADDR2(0) |
2203							R300_ALU_RGB_ADDRD(0) |
2204							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
2205		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
2206							R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
2207							R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
2208							R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
2209							R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
2210							R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
2211							R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
2212							R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
2213		/* alpha nop, but set up const1 */
2214		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
2215							  R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
2216							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2217		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
2218							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
2219							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2220							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2221
2222		/* rgb out0.r = op_sop, set up src0 reg */
2223		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
2224							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
2225							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
2226		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7),
2227			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2228			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2229		/* alpha ex2 temp0, temp0.r */
2230		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
2231							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2232		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2233							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
2234							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2235							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2236
2237		/* rgb out0.g = op_sop, set up src0 reg */
2238		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
2239							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
2240							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
2241		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
2242			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2243			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2244		/* alpha ex2 temp0, temp0.g */
2245		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
2246							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2247		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2248							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
2249							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2250							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2251
2252		/* rgb out0.b = op_sop, set up src0 reg */
2253		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
2254							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
2255							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
2256		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
2257			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
2258			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
2259		/* alpha ex2 temp0, temp0.b */
2260		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
2261							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
2262		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
2263							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
2264							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
2265							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
2266	    }
2267	}
2268
2269	/* Shader constants. */
2270	/* constant 0: off, yco */
2271	OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
2272	OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1]));
2273	OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2]));
2274	OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco));
2275	/* constant 1: uco */
2276	OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
2277	OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
2278	OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
2279	OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(gamma));
2280	/* constant 2: vco */
2281	OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
2282	OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
2283	OUT_ACCEL_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2]));
2284	OUT_ACCEL_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0));
2285
2286	FINISH_ACCEL();
2287    }
2288
2289    BEGIN_ACCEL_RELOC(6, 2);
2290    OUT_ACCEL_REG(R300_TX_INVALTAGS, 0);
2291    OUT_ACCEL_REG(R300_TX_ENABLE, txenable);
2292
2293    EMIT_WRITE_OFFSET(R300_RB3D_COLOROFFSET0, 0, pPixmap);
2294    EMIT_COLORPITCH(R300_RB3D_COLORPITCH0, colorpitch, pPixmap);
2295
2296    /* no need to enable blending */
2297    OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
2298
2299    OUT_ACCEL_REG(R300_VAP_VTX_SIZE, vtx_count);
2300    FINISH_ACCEL();
2301
2302    if (pPriv->vsync) {
2303	xf86CrtcPtr crtc;
2304	if (pPriv->desired_crtc)
2305	    crtc = pPriv->desired_crtc;
2306	else
2307	    crtc = radeon_pick_best_crtc(pScrn,
2308					 pPriv->drw_x,
2309					 pPriv->drw_x + pPriv->dst_w,
2310					 pPriv->drw_y,
2311					 pPriv->drw_y + pPriv->dst_h);
2312	if (crtc)
2313	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
2314					  crtc,
2315					  pPriv->drw_y - crtc->y,
2316					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
2317    }
2318    /*
2319     * Rendering of the actual polygon is done in two different
2320     * ways depending on chip generation:
2321     *
2322     * < R300:
2323     *
2324     *     These chips can render a rectangle in one pass, so
2325     *     handling is pretty straight-forward.
2326     *
2327     * >= R300:
2328     *
2329     *     These chips can accept a quad, but will render it as
2330     *     two triangles which results in a diagonal tear. Instead
2331     *     we render a single, large triangle and use the scissor
2332     *     functionality to restrict it to the desired rectangle.
2333     *     Due to guardband limits on r3xx/r4xx, we can only use
2334     *     the single triangle up to 2560/4021 pixels; above that we
2335     *     render as a quad.
2336     */
2337
2338    while (nBox--) {
2339	int srcX, srcY, srcw, srch;
2340	int dstX, dstY, dstw, dsth;
2341	Bool use_quad = FALSE;
2342	dstX = pBox->x1 + dstxoff;
2343	dstY = pBox->y1 + dstyoff;
2344	dstw = pBox->x2 - pBox->x1;
2345	dsth = pBox->y2 - pBox->y1;
2346
2347	srcX = pPriv->src_x;
2348	srcX += ((pBox->x1 - pPriv->drw_x) *
2349		 pPriv->src_w) / pPriv->dst_w;
2350	srcY = pPriv->src_y;
2351	srcY += ((pBox->y1 - pPriv->drw_y) *
2352		 pPriv->src_h) / pPriv->dst_h;
2353
2354	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
2355	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
2356
2357#if 0
2358	ErrorF("dst: %d, %d, %d, %d\n", dstX, dstY, dstw, dsth);
2359	ErrorF("src: %d, %d, %d, %d\n", srcX, srcY, srcw, srch);
2360#endif
2361
2362	if (IS_R400_3D) {
2363	    if ((dstw+dsth) > 4021)
2364		use_quad = TRUE;
2365	} else {
2366	    if ((dstw+dsth) > 2560)
2367		use_quad = TRUE;
2368	}
2369	/*
2370	 * Set up the scissor area to that of the output size.
2371	 */
2372	BEGIN_ACCEL(2);
2373	/* R300 has an offset */
2374	OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX + 1440) << R300_SCISSOR_X_SHIFT) |
2375					 ((dstY + 1440) << R300_SCISSOR_Y_SHIFT)));
2376	OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw + 1440 - 1) << R300_SCISSOR_X_SHIFT) |
2377					 ((dstY + dsth + 1440 - 1) << R300_SCISSOR_Y_SHIFT)));
2378	FINISH_ACCEL();
2379
2380#ifdef ACCEL_CP
2381	if (use_quad) {
2382	    BEGIN_RING(4 * vtx_count + 4);
2383	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
2384				4 * vtx_count));
2385	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_QUAD_LIST |
2386		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2387		     (4 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2388	} else {
2389	    BEGIN_RING(3 * vtx_count + 4);
2390	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
2391				3 * vtx_count));
2392	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
2393		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2394		     (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2395	}
2396#else /* ACCEL_CP */
2397	if (use_quad)
2398	    BEGIN_ACCEL(2 + vtx_count * 4);
2399	else
2400	    BEGIN_ACCEL(2 + vtx_count * 3);
2401
2402	if (use_quad)
2403	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_QUAD_LIST |
2404					      RADEON_VF_PRIM_WALK_DATA |
2405					      (4 << RADEON_VF_NUM_VERTICES_SHIFT)));
2406	else
2407	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST |
2408					      RADEON_VF_PRIM_WALK_DATA |
2409					      (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
2410#endif
2411	if (pPriv->bicubic_enabled) {
2412		/*
2413		 * This code is only executed on >= R300, so we don't
2414		 * have to deal with the legacy handling.
2415		 */
2416	    if (use_quad) {
2417		VTX_OUT_6((float)dstX,                     (float)dstY,
2418			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h,
2419			  (float)srcX + 0.5,               (float)srcY + 0.5);
2420		VTX_OUT_6((float)dstX,                     (float)(dstY + dsth),
2421			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h,
2422			  (float)srcX + 0.5,               (float)(srcY + srch) + 0.5);
2423		VTX_OUT_6((float)(dstX + dstw),            (float)(dstY + dsth),
2424			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h,
2425			  (float)(srcX + srcw) + 0.5,      (float)(srcY + srch) + 0.5);
2426		VTX_OUT_6((float)(dstX + dstw),            (float)dstY,
2427			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h,
2428			  (float)(srcX + srcw) + 0.5,      (float)srcY + 0.5);
2429	    } else {
2430		VTX_OUT_6((float)dstX,                     (float)dstY,
2431			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h,
2432			  (float)srcX + 0.5,               (float)srcY + 0.5);
2433		VTX_OUT_6((float)dstX,                     (float)(dstY + dstw + dsth),
2434			  (float)srcX / pPriv->w,
2435			  ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h,
2436			  (float)srcX + 0.5,
2437			  (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
2438		VTX_OUT_6((float)(dstX + dstw + dsth),     (float)dstY,
2439			  ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
2440			  (float)srcY / pPriv->h,
2441			  (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
2442			  (float)srcY + 0.5);
2443	    }
2444	} else {
2445	    if (use_quad) {
2446		VTX_OUT_4((float)dstX,                     (float)dstY,
2447			  (float)srcX / pPriv->w,          (float)srcY / pPriv->h);
2448		VTX_OUT_4((float)dstX,                     (float)(dstY + dsth),
2449			  (float)srcX / pPriv->w,          (float)(srcY + srch) / pPriv->h);
2450		VTX_OUT_4((float)(dstX + dstw),            (float)(dstY + dsth),
2451			  (float)(srcX + srcw) / pPriv->w, (float)(srcY + srch) / pPriv->h);
2452		VTX_OUT_4((float)(dstX + dstw),            (float)dstY,
2453			  (float)(srcX + srcw) / pPriv->w, (float)srcY / pPriv->h);
2454	    } else {
2455		/*
2456		 * Render a big, scissored triangle. This means
2457		 * increasing the triangle size and adjusting
2458		 * texture coordinates.
2459		 */
2460		VTX_OUT_4((float)dstX,                 (float)dstY,
2461			  (float)srcX / pPriv->w,      (float)srcY / pPriv->h);
2462		VTX_OUT_4((float)dstX,                 (float)(dstY + dsth + dstw),
2463			  (float)srcX / pPriv->w,
2464			  ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h);
2465		VTX_OUT_4((float)(dstX + dstw + dsth), (float)dstY,
2466			  ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
2467			  (float)srcY / pPriv->h);
2468	    }
2469	}
2470
2471	/* flushing is pipelined, free/finish is not */
2472	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
2473
2474#ifdef ACCEL_CP
2475	ADVANCE_RING();
2476#else
2477	FINISH_ACCEL();
2478#endif /* !ACCEL_CP */
2479
2480	pBox++;
2481    }
2482
2483    BEGIN_ACCEL(3);
2484    OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA);
2485    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
2486    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
2487    FINISH_ACCEL();
2488
2489    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
2490}
2491
2492static void
2493FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
2494{
2495    RADEONInfoPtr info = RADEONPTR(pScrn);
2496    PixmapPtr pPixmap = pPriv->pPixmap;
2497    struct radeon_exa_pixmap_priv *driver_priv;
2498    struct radeon_bo *src_bo = pPriv->src_bo[pPriv->currentBuffer];
2499    uint32_t txfilter, txformat0, txformat1, txoffset, txpitch;
2500    uint32_t dst_pitch, dst_format;
2501    uint32_t txenable, colorpitch, bicubic_offset;
2502    uint32_t output_fmt;
2503    Bool isplanar = FALSE;
2504    int dstxoff, dstyoff, pixel_shift, vtx_count;
2505    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
2506    int nBox = REGION_NUM_RECTS(&pPriv->clip);
2507    ACCEL_PREAMBLE();
2508
2509#ifdef XF86DRM_MODE
2510    if (info->cs) {
2511	int ret;
2512
2513	radeon_cs_space_reset_bos(info->cs);
2514	radeon_cs_space_add_persistent_bo(info->cs, src_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
2515
2516	if (pPriv->bicubic_enabled)
2517	    radeon_cs_space_add_persistent_bo(info->cs, info->bicubic_bo, RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
2518
2519	driver_priv = exaGetPixmapDriverPrivate(pPixmap);
2520	radeon_cs_space_add_persistent_bo(info->cs, driver_priv->bo, 0, RADEON_GEM_DOMAIN_VRAM);
2521
2522	ret = radeon_cs_space_check(info->cs);
2523	if (ret) {
2524	    ErrorF("Not enough RAM to hw accel xv operation\n");
2525	    return;
2526	}
2527    }
2528#endif
2529
2530    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
2531
2532#ifdef USE_EXA
2533    if (info->useEXA) {
2534	dst_pitch = exaGetPixmapPitch(pPixmap);
2535    } else
2536#endif
2537    {
2538	dst_pitch = pPixmap->devKind;
2539    }
2540
2541#ifdef COMPOSITE
2542    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
2543    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
2544#else
2545    dstxoff = 0;
2546    dstyoff = 0;
2547#endif
2548
2549#ifdef USE_EXA
2550    if (info->useEXA) {
2551	RADEON_SWITCH_TO_3D();
2552    } else
2553#endif
2554    {
2555	BEGIN_ACCEL(2);
2556	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
2557	/* We must wait for 3d to idle, in case source was just written as a dest. */
2558	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
2559		      RADEON_WAIT_HOST_IDLECLEAN |
2560		      RADEON_WAIT_2D_IDLECLEAN |
2561		      RADEON_WAIT_3D_IDLECLEAN |
2562		      RADEON_WAIT_DMA_GUI_IDLE);
2563	FINISH_ACCEL();
2564
2565	if (!info->accel_state->XInited3D)
2566	    RADEONInit3DEngine(pScrn);
2567    }
2568
2569    if (pPriv->bicubic_enabled)
2570	vtx_count = 6;
2571    else
2572	vtx_count = 4;
2573
2574    switch (pPixmap->drawable.bitsPerPixel) {
2575    case 16:
2576	if (pPixmap->drawable.depth == 15)
2577	    dst_format = R300_COLORFORMAT_ARGB1555;
2578	else
2579	    dst_format = R300_COLORFORMAT_RGB565;
2580	break;
2581    case 32:
2582	dst_format = R300_COLORFORMAT_ARGB8888;
2583	break;
2584    default:
2585	return;
2586    }
2587
2588    output_fmt = (R300_OUT_FMT_C4_8 |
2589		  R300_OUT_FMT_C0_SEL_BLUE |
2590		  R300_OUT_FMT_C1_SEL_GREEN |
2591		  R300_OUT_FMT_C2_SEL_RED |
2592		  R300_OUT_FMT_C3_SEL_ALPHA);
2593
2594    colorpitch = dst_pitch >> pixel_shift;
2595    colorpitch |= dst_format;
2596
2597    if (RADEONTilingEnabled(pScrn, pPixmap))
2598	colorpitch |= R300_COLORTILE;
2599
2600    if (((pPriv->bicubic_state == BICUBIC_OFF)) &&
2601        (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12))
2602	isplanar = TRUE;
2603
2604    if (isplanar) {
2605	txformat1 = R300_TX_FORMAT_X8;
2606	txpitch = pPriv->src_pitch;
2607    } else {
2608	if (pPriv->id == FOURCC_UYVY)
2609	    txformat1 = R300_TX_FORMAT_YVYU422;
2610	else
2611	    txformat1 = R300_TX_FORMAT_VYUY422;
2612
2613	if (pPriv->bicubic_state != BICUBIC_OFF)
2614	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
2615
2616	/* pitch is in pixels */
2617	txpitch = pPriv->src_pitch / 2;
2618    }
2619    txpitch -= 1;
2620
2621    txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
2622		 (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
2623		 R300_TXPITCH_EN);
2624
2625    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
2626		R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
2627		R300_TX_MAG_FILTER_LINEAR |
2628		R300_TX_MIN_FILTER_LINEAR |
2629		(0 << R300_TX_ID_SHIFT));
2630
2631
2632    if ((pPriv->w - 1) & 0x800)
2633	txpitch |= R500_TXWIDTH_11;
2634
2635    if ((pPriv->h - 1) & 0x800)
2636	txpitch |= R500_TXHEIGHT_11;
2637
2638    txoffset = info->cs ? 0 : pPriv->src_offset;
2639
2640    BEGIN_ACCEL_RELOC(6, 1);
2641    OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter);
2642    OUT_ACCEL_REG(R300_TX_FILTER1_0, 0);
2643    OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0);
2644    OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
2645    OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch);
2646    OUT_TEXTURE_REG(R300_TX_OFFSET_0, txoffset, src_bo);
2647    FINISH_ACCEL();
2648
2649    txenable = R300_TEX_0_ENABLE;
2650
2651    if (isplanar) {
2652	txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
2653		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
2654		     R300_TXPITCH_EN);
2655	txpitch = RADEON_ALIGN(pPriv->src_pitch >> 1, 64);
2656	txpitch -= 1;
2657	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
2658		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
2659		    R300_TX_MIN_FILTER_LINEAR |
2660		    R300_TX_MAG_FILTER_LINEAR);
2661
2662	BEGIN_ACCEL_RELOC(12, 2);
2663	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
2664	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
2665	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
2666	OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8);
2667	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
2668	OUT_TEXTURE_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset, src_bo);
2669	OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
2670	OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
2671	OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
2672	OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8);
2673	OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
2674	OUT_TEXTURE_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset, src_bo);
2675	FINISH_ACCEL();
2676	txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
2677    }
2678
2679    if (pPriv->bicubic_enabled) {
2680	/* Size is 128x1 */
2681	txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
2682		     (0x0 << R300_TXHEIGHT_SHIFT) |
2683		     R300_TXPITCH_EN);
2684	/* Format is 16-bit floats, 4 components per texel (FL_R16G16B16A16) */
2685	txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
2686	/* Pitch is 127 (128-1) */
2687	txpitch = 0x7f;
2688	/* Tex filter */
2689	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
2690		    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
2691		    R300_TX_MIN_FILTER_NEAREST |
2692		    R300_TX_MAG_FILTER_NEAREST |
2693		    (1 << R300_TX_ID_SHIFT));
2694
2695	if (info->cs)
2696	    bicubic_offset = 0;
2697	else
2698	    bicubic_offset = pPriv->bicubic_src_offset;
2699
2700	BEGIN_ACCEL_RELOC(6, 1);
2701	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter);
2702	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
2703	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
2704	OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1);
2705	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
2706	OUT_TEXTURE_REG(R300_TX_OFFSET_1, bicubic_offset, info->bicubic_bo);
2707	FINISH_ACCEL();
2708
2709	/* Enable tex 1 */
2710	txenable |= R300_TEX_1_ENABLE;
2711    }
2712
2713    /* setup the VAP */
2714    if (info->accel_state->has_tcl) {
2715	if (pPriv->bicubic_enabled)
2716	    BEGIN_ACCEL(7);
2717	else
2718	    BEGIN_ACCEL(6);
2719    } else {
2720	if (pPriv->bicubic_enabled)
2721	    BEGIN_ACCEL(5);
2722	else
2723	    BEGIN_ACCEL(4);
2724    }
2725
2726    /* These registers define the number, type, and location of data submitted
2727     * to the PVS unit of GA input (when PVS is disabled)
2728     * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
2729     * enabled.  This memory provides the inputs to the vertex shader program
2730     * and ordering is not important.  When PVS/TCL is disabled, this field maps
2731     * directly to the GA input memory and the order is significant.  In
2732     * PVS_BYPASS mode the order is as follows:
2733     * Position
2734     * Point Size
2735     * Color 0-3
2736     * Textures 0-7
2737     * Fog
2738     */
2739    if (pPriv->bicubic_enabled) {
2740	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
2741		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
2742		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
2743		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
2744		       R300_SIGNED_0 |
2745		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
2746		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
2747		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
2748		       R300_SIGNED_1));
2749	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1,
2750		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
2751		       (0 << R300_SKIP_DWORDS_2_SHIFT) |
2752		       (7 << R300_DST_VEC_LOC_2_SHIFT) |
2753		       R300_LAST_VEC_2 |
2754		       R300_SIGNED_2));
2755    } else {
2756	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
2757		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
2758		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
2759		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
2760		       R300_SIGNED_0 |
2761		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
2762		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
2763		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
2764		       R300_LAST_VEC_1 |
2765		       R300_SIGNED_1));
2766    }
2767
2768    /* load the vertex shader
2769     * We pre-load vertex programs in RADEONInit3DEngine():
2770     * - exa
2771     * - Xv
2772     * - Xv bicubic
2773     * Here we select the offset of the vertex program we want to use
2774     */
2775    if (info->accel_state->has_tcl) {
2776	if (pPriv->bicubic_enabled) {
2777	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
2778			  ((11 << R300_PVS_FIRST_INST_SHIFT) |
2779			   (13 << R300_PVS_XYZW_VALID_INST_SHIFT) |
2780			   (13 << R300_PVS_LAST_INST_SHIFT)));
2781	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
2782			  (13 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
2783	} else {
2784	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
2785			  ((9 << R300_PVS_FIRST_INST_SHIFT) |
2786			   (10 << R300_PVS_XYZW_VALID_INST_SHIFT) |
2787			   (10 << R300_PVS_LAST_INST_SHIFT)));
2788	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
2789			  (10 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
2790	}
2791    }
2792
2793    /* Position and one set of 2 texture coordinates */
2794    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
2795    if (pPriv->bicubic_enabled)
2796	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
2797					       (2 << R300_TEX_1_COMP_CNT_SHIFT)));
2798    else
2799	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
2800
2801    OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt);
2802    FINISH_ACCEL();
2803
2804    /* setup pixel shader */
2805    if (pPriv->bicubic_state != BICUBIC_OFF) {
2806	if (pPriv->bicubic_enabled) {
2807	    BEGIN_ACCEL(7);
2808
2809	    /* 4 components: 2 for tex0 and 2 for tex1 */
2810	    OUT_ACCEL_REG(R300_RS_COUNT,
2811			  ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
2812			   R300_RS_COUNT_HIRES_EN));
2813
2814	    /* R300_INST_COUNT_RS - highest RS instruction used */
2815	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1));
2816
2817	    /* Pixel stack frame size. */
2818	    OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
2819
2820	    /* FP length. */
2821	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
2822					      R500_US_CODE_END_ADDR(13)));
2823	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
2824					       R500_US_CODE_RANGE_SIZE(13)));
2825
2826	    /* Prepare for FP emission. */
2827	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
2828	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
2829	    FINISH_ACCEL();
2830
2831	    BEGIN_ACCEL(89);
2832	    /* Pixel shader.
2833	     * I've gone ahead and annotated each instruction, since this
2834	     * thing is MASSIVE. :3
2835	     * Note: In order to avoid buggies with temps and multiple
2836	     * inputs, all temps are offset by 2. temp0 -> register2. */
2837
2838	    /* TEX temp2, input1.xxxx, tex1, 1D */
2839	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2840						   R500_INST_RGB_WMASK_R |
2841						   R500_INST_RGB_WMASK_G |
2842						   R500_INST_RGB_WMASK_B));
2843	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
2844						   R500_TEX_INST_LD |
2845						   R500_TEX_IGNORE_UNCOVERED));
2846	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
2847						   R500_TEX_SRC_S_SWIZ_R |
2848						   R500_TEX_SRC_T_SWIZ_R |
2849						   R500_TEX_SRC_R_SWIZ_R |
2850						   R500_TEX_SRC_Q_SWIZ_R |
2851						   R500_TEX_DST_ADDR(2) |
2852						   R500_TEX_DST_R_SWIZ_R |
2853						   R500_TEX_DST_G_SWIZ_G |
2854						   R500_TEX_DST_B_SWIZ_B |
2855						   R500_TEX_DST_A_SWIZ_A));
2856	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2857	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2858	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2859
2860	    /* TEX temp5, input1.yyyy, tex1, 1D */
2861	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2862						   R500_INST_TEX_SEM_WAIT |
2863						   R500_INST_RGB_WMASK_R |
2864						   R500_INST_RGB_WMASK_G |
2865						   R500_INST_RGB_WMASK_B));
2866	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
2867						   R500_TEX_INST_LD |
2868						   R500_TEX_SEM_ACQUIRE |
2869						   R500_TEX_IGNORE_UNCOVERED));
2870	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
2871						   R500_TEX_SRC_S_SWIZ_G |
2872						   R500_TEX_SRC_T_SWIZ_G |
2873						   R500_TEX_SRC_R_SWIZ_G |
2874						   R500_TEX_SRC_Q_SWIZ_G |
2875						   R500_TEX_DST_ADDR(5) |
2876						   R500_TEX_DST_R_SWIZ_R |
2877						   R500_TEX_DST_G_SWIZ_G |
2878						   R500_TEX_DST_B_SWIZ_B |
2879						   R500_TEX_DST_A_SWIZ_A));
2880	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2881	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2882	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
2883
2884	    /* MUL temp4, const0.x0x0, temp2.yyxx */
2885	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2886						   R500_INST_TEX_SEM_WAIT |
2887						   R500_INST_RGB_WMASK_R |
2888						   R500_INST_RGB_WMASK_G |
2889						   R500_INST_RGB_WMASK_B |
2890						   R500_INST_ALPHA_WMASK));
2891	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
2892						   R500_RGB_ADDR0_CONST |
2893						   R500_RGB_ADDR1(2)));
2894	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
2895						   R500_ALPHA_ADDR0_CONST |
2896						   R500_ALPHA_ADDR1(2)));
2897	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
2898						   R500_ALU_RGB_R_SWIZ_A_R |
2899						   R500_ALU_RGB_G_SWIZ_A_0 |
2900						   R500_ALU_RGB_B_SWIZ_A_R |
2901						   R500_ALU_RGB_SEL_B_SRC1 |
2902						   R500_ALU_RGB_R_SWIZ_B_G |
2903						   R500_ALU_RGB_G_SWIZ_B_G |
2904						   R500_ALU_RGB_B_SWIZ_B_R));
2905	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
2906						   R500_ALPHA_OP_MAD |
2907						   R500_ALPHA_SEL_A_SRC0 |
2908						   R500_ALPHA_SWIZ_A_0 |
2909						   R500_ALPHA_SEL_B_SRC1 |
2910						   R500_ALPHA_SWIZ_B_R));
2911	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
2912						   R500_ALU_RGBA_OP_MAD |
2913						   R500_ALU_RGBA_R_SWIZ_0 |
2914						   R500_ALU_RGBA_G_SWIZ_0 |
2915						   R500_ALU_RGBA_B_SWIZ_0 |
2916						   R500_ALU_RGBA_A_SWIZ_0));
2917
2918	    /* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */
2919	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2920						   R500_INST_RGB_WMASK_R |
2921						   R500_INST_RGB_WMASK_G |
2922						   R500_INST_RGB_WMASK_B |
2923						   R500_INST_ALPHA_WMASK));
2924	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
2925						   R500_RGB_ADDR0_CONST |
2926						   R500_RGB_ADDR1(5) |
2927						   R500_RGB_ADDR2(4)));
2928	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
2929						   R500_ALPHA_ADDR0_CONST |
2930						   R500_ALPHA_ADDR1(5) |
2931						   R500_ALPHA_ADDR2(4)));
2932	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
2933						   R500_ALU_RGB_R_SWIZ_A_0 |
2934						   R500_ALU_RGB_G_SWIZ_A_G |
2935						   R500_ALU_RGB_B_SWIZ_A_0 |
2936						   R500_ALU_RGB_SEL_B_SRC1 |
2937						   R500_ALU_RGB_R_SWIZ_B_R |
2938						   R500_ALU_RGB_G_SWIZ_B_R |
2939						   R500_ALU_RGB_B_SWIZ_B_R));
2940	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
2941						   R500_ALPHA_OP_MAD |
2942						   R500_ALPHA_SEL_A_SRC0 |
2943						   R500_ALPHA_SWIZ_A_G |
2944						   R500_ALPHA_SEL_B_SRC1 |
2945						   R500_ALPHA_SWIZ_B_R));
2946	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
2947						   R500_ALU_RGBA_OP_MAD |
2948						   R500_ALU_RGBA_SEL_C_SRC2 |
2949						   R500_ALU_RGBA_R_SWIZ_R |
2950						   R500_ALU_RGBA_G_SWIZ_G |
2951						   R500_ALU_RGBA_B_SWIZ_B |
2952						   R500_ALU_RGBA_A_SWIZ_A));
2953
2954	    /* ADD temp3, temp3, input0.xyxy */
2955	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
2956						   R500_INST_RGB_WMASK_R |
2957						   R500_INST_RGB_WMASK_G |
2958						   R500_INST_RGB_WMASK_B |
2959						   R500_INST_ALPHA_WMASK));
2960	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) |
2961						   R500_RGB_ADDR2(0)));
2962	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) |
2963						   R500_ALPHA_ADDR2(0)));
2964	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
2965						   R500_ALU_RGB_G_SWIZ_A_1 |
2966						   R500_ALU_RGB_B_SWIZ_A_1 |
2967						   R500_ALU_RGB_SEL_B_SRC1 |
2968						   R500_ALU_RGB_R_SWIZ_B_R |
2969						   R500_ALU_RGB_G_SWIZ_B_G |
2970						   R500_ALU_RGB_B_SWIZ_B_B));
2971	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
2972						   R500_ALPHA_OP_MAD |
2973						   R500_ALPHA_SWIZ_A_1 |
2974						   R500_ALPHA_SEL_B_SRC1 |
2975						   R500_ALPHA_SWIZ_B_A));
2976	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
2977						   R500_ALU_RGBA_OP_MAD |
2978						   R500_ALU_RGBA_SEL_C_SRC2 |
2979						   R500_ALU_RGBA_R_SWIZ_R |
2980						   R500_ALU_RGBA_G_SWIZ_G |
2981						   R500_ALU_RGBA_B_SWIZ_R |
2982						   R500_ALU_RGBA_A_SWIZ_G));
2983
2984	    /* TEX temp1, temp3.zwxy, tex0, 2D */
2985	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
2986						   R500_INST_RGB_WMASK_R |
2987						   R500_INST_RGB_WMASK_G |
2988						   R500_INST_RGB_WMASK_B |
2989						   R500_INST_ALPHA_WMASK));
2990	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
2991						   R500_TEX_INST_LD |
2992						   R500_TEX_IGNORE_UNCOVERED));
2993	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
2994						   R500_TEX_SRC_S_SWIZ_B |
2995						   R500_TEX_SRC_T_SWIZ_A |
2996						   R500_TEX_SRC_R_SWIZ_R |
2997						   R500_TEX_SRC_Q_SWIZ_G |
2998						   R500_TEX_DST_ADDR(1) |
2999						   R500_TEX_DST_R_SWIZ_R |
3000						   R500_TEX_DST_G_SWIZ_G |
3001						   R500_TEX_DST_B_SWIZ_B |
3002						   R500_TEX_DST_A_SWIZ_A));
3003	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3004	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3005	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3006
3007	    /* TEX temp3, temp3.xyzw, tex0, 2D */
3008	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3009						   R500_INST_TEX_SEM_WAIT |
3010						   R500_INST_RGB_WMASK_R |
3011						   R500_INST_RGB_WMASK_G |
3012						   R500_INST_RGB_WMASK_B |
3013						   R500_INST_ALPHA_WMASK));
3014	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3015						   R500_TEX_INST_LD |
3016						   R500_TEX_SEM_ACQUIRE |
3017						   R500_TEX_IGNORE_UNCOVERED));
3018	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
3019						   R500_TEX_SRC_S_SWIZ_R |
3020						   R500_TEX_SRC_T_SWIZ_G |
3021						   R500_TEX_SRC_R_SWIZ_B |
3022						   R500_TEX_SRC_Q_SWIZ_A |
3023						   R500_TEX_DST_ADDR(3) |
3024						   R500_TEX_DST_R_SWIZ_R |
3025						   R500_TEX_DST_G_SWIZ_G |
3026						   R500_TEX_DST_B_SWIZ_B |
3027						   R500_TEX_DST_A_SWIZ_A));
3028	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3029	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3030	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3031
3032	    /* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */
3033	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3034						   R500_INST_RGB_WMASK_R |
3035						   R500_INST_RGB_WMASK_G |
3036						   R500_INST_RGB_WMASK_B |
3037						   R500_INST_ALPHA_WMASK));
3038	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3039						   R500_RGB_ADDR0_CONST |
3040						   R500_RGB_ADDR1(5) |
3041						   R500_RGB_ADDR2(4)));
3042	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3043						   R500_ALPHA_ADDR0_CONST |
3044						   R500_ALPHA_ADDR1(5) |
3045						   R500_ALPHA_ADDR2(4)));
3046	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3047						   R500_ALU_RGB_R_SWIZ_A_0 |
3048						   R500_ALU_RGB_G_SWIZ_A_G |
3049						   R500_ALU_RGB_B_SWIZ_A_0 |
3050						   R500_ALU_RGB_SEL_B_SRC1 |
3051						   R500_ALU_RGB_R_SWIZ_B_G |
3052						   R500_ALU_RGB_G_SWIZ_B_G |
3053						   R500_ALU_RGB_B_SWIZ_B_G));
3054	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
3055						   R500_ALPHA_OP_MAD |
3056						   R500_ALPHA_SEL_A_SRC0 |
3057						   R500_ALPHA_SWIZ_A_G |
3058						   R500_ALPHA_SEL_B_SRC1 |
3059						   R500_ALPHA_SWIZ_B_G));
3060	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
3061						   R500_ALU_RGBA_OP_MAD |
3062						   R500_ALU_RGBA_SEL_C_SRC2 |
3063						   R500_ALU_RGBA_R_SWIZ_R |
3064						   R500_ALU_RGBA_G_SWIZ_G |
3065						   R500_ALU_RGBA_B_SWIZ_B |
3066						   R500_ALU_RGBA_A_SWIZ_A));
3067
3068	    /* ADD temp0, temp4, input0.xyxy */
3069	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3070						   R500_INST_RGB_WMASK_R |
3071						   R500_INST_RGB_WMASK_G |
3072						   R500_INST_RGB_WMASK_B |
3073						   R500_INST_ALPHA_WMASK));
3074	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) |
3075						   R500_RGB_ADDR2(0)));
3076	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) |
3077						   R500_ALPHA_ADDR2(0)));
3078	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
3079						   R500_ALU_RGB_G_SWIZ_A_1 |
3080						   R500_ALU_RGB_B_SWIZ_A_1 |
3081						   R500_ALU_RGB_SEL_B_SRC1 |
3082						   R500_ALU_RGB_R_SWIZ_B_R |
3083						   R500_ALU_RGB_G_SWIZ_B_G |
3084						   R500_ALU_RGB_B_SWIZ_B_B));
3085	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
3086						   R500_ALPHA_OP_MAD |
3087						   R500_ALPHA_SWIZ_A_1 |
3088						   R500_ALPHA_SEL_B_SRC1 |
3089						   R500_ALPHA_SWIZ_B_A));
3090	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
3091						   R500_ALU_RGBA_OP_MAD |
3092						   R500_ALU_RGBA_SEL_C_SRC2 |
3093						   R500_ALU_RGBA_R_SWIZ_R |
3094						   R500_ALU_RGBA_G_SWIZ_G |
3095						   R500_ALU_RGBA_B_SWIZ_R |
3096						   R500_ALU_RGBA_A_SWIZ_G));
3097
3098	    /* TEX temp4, temp0.zwzw, tex0, 2D */
3099	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3100						   R500_INST_TEX_SEM_WAIT |
3101						   R500_INST_RGB_WMASK_R |
3102						   R500_INST_RGB_WMASK_G |
3103						   R500_INST_RGB_WMASK_B |
3104						   R500_INST_ALPHA_WMASK));
3105	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3106						   R500_TEX_INST_LD |
3107						   R500_TEX_IGNORE_UNCOVERED));
3108	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3109						   R500_TEX_SRC_S_SWIZ_B |
3110						   R500_TEX_SRC_T_SWIZ_A |
3111						   R500_TEX_SRC_R_SWIZ_B |
3112						   R500_TEX_SRC_Q_SWIZ_A |
3113						   R500_TEX_DST_ADDR(4) |
3114						   R500_TEX_DST_R_SWIZ_R |
3115						   R500_TEX_DST_G_SWIZ_G |
3116						   R500_TEX_DST_B_SWIZ_B |
3117						   R500_TEX_DST_A_SWIZ_A));
3118	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3119	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3120	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3121
3122	    /* TEX temp0, temp0.xyzw, tex0, 2D */
3123	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3124						   R500_INST_TEX_SEM_WAIT |
3125						   R500_INST_RGB_WMASK_R |
3126						   R500_INST_RGB_WMASK_G |
3127						   R500_INST_RGB_WMASK_B |
3128						   R500_INST_ALPHA_WMASK));
3129	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3130						   R500_TEX_INST_LD |
3131						   R500_TEX_SEM_ACQUIRE |
3132						   R500_TEX_IGNORE_UNCOVERED));
3133	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3134						   R500_TEX_SRC_S_SWIZ_R |
3135						   R500_TEX_SRC_T_SWIZ_G |
3136						   R500_TEX_SRC_R_SWIZ_B |
3137						   R500_TEX_SRC_Q_SWIZ_A |
3138						   R500_TEX_DST_ADDR(0) |
3139						   R500_TEX_DST_R_SWIZ_R |
3140						   R500_TEX_DST_G_SWIZ_G |
3141						   R500_TEX_DST_B_SWIZ_B |
3142						   R500_TEX_DST_A_SWIZ_A));
3143	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3144	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3145	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3146
3147	    /* LRP temp3, temp2.zzzz, temp1, temp3 ->
3148	     * - PRESUB temps, temp1 - temp3
3149	     * - MAD temp2.zzzz, temps, temp3 */
3150	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3151						   R500_INST_RGB_WMASK_R |
3152						   R500_INST_RGB_WMASK_G |
3153						   R500_INST_RGB_WMASK_B |
3154						   R500_INST_ALPHA_WMASK));
3155	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) |
3156						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3157						   R500_RGB_ADDR1(1) |
3158						   R500_RGB_ADDR2(2)));
3159	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) |
3160						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3161						   R500_ALPHA_ADDR1(1) |
3162						   R500_ALPHA_ADDR2(2)));
3163	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3164						   R500_ALU_RGB_R_SWIZ_A_B |
3165						   R500_ALU_RGB_G_SWIZ_A_B |
3166						   R500_ALU_RGB_B_SWIZ_A_B |
3167						   R500_ALU_RGB_SEL_B_SRCP |
3168						   R500_ALU_RGB_R_SWIZ_B_R |
3169						   R500_ALU_RGB_G_SWIZ_B_G |
3170						   R500_ALU_RGB_B_SWIZ_B_B));
3171	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
3172						   R500_ALPHA_OP_MAD |
3173						   R500_ALPHA_SEL_A_SRC2 |
3174						   R500_ALPHA_SWIZ_A_B |
3175						   R500_ALPHA_SEL_B_SRCP |
3176						   R500_ALPHA_SWIZ_B_A));
3177	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
3178						   R500_ALU_RGBA_OP_MAD |
3179						   R500_ALU_RGBA_SEL_C_SRC0 |
3180						   R500_ALU_RGBA_R_SWIZ_R |
3181						   R500_ALU_RGBA_G_SWIZ_G |
3182						   R500_ALU_RGBA_B_SWIZ_B |
3183						   R500_ALU_RGBA_A_SWIZ_A));
3184
3185	    /* LRP temp0, temp2.zzzz, temp4, temp0 ->
3186	     * - PRESUB temps, temp4 - temp0
3187	     * - MAD temp2.zzzz, temps, temp0 */
3188	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3189						   R500_INST_TEX_SEM_WAIT |
3190						   R500_INST_RGB_WMASK_R |
3191						   R500_INST_RGB_WMASK_G |
3192						   R500_INST_RGB_WMASK_B |
3193						   R500_INST_ALPHA_WMASK));
3194	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3195						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3196						   R500_RGB_ADDR1(4) |
3197						   R500_RGB_ADDR2(2)));
3198	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3199						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3200						   R500_ALPHA_ADDR1(4) |
3201						   R500_ALPHA_ADDR2(2)));
3202	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3203						   R500_ALU_RGB_R_SWIZ_A_B |
3204						   R500_ALU_RGB_G_SWIZ_A_B |
3205						   R500_ALU_RGB_B_SWIZ_A_B |
3206						   R500_ALU_RGB_SEL_B_SRCP |
3207						   R500_ALU_RGB_R_SWIZ_B_R |
3208						   R500_ALU_RGB_G_SWIZ_B_G |
3209						   R500_ALU_RGB_B_SWIZ_B_B));
3210	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
3211						   R500_ALPHA_OP_MAD |
3212						   R500_ALPHA_SEL_A_SRC2 |
3213						   R500_ALPHA_SWIZ_A_B |
3214						   R500_ALPHA_SEL_B_SRCP |
3215						   R500_ALPHA_SWIZ_B_A));
3216	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
3217						   R500_ALU_RGBA_OP_MAD |
3218						   R500_ALU_RGBA_SEL_C_SRC0 |
3219						   R500_ALU_RGBA_R_SWIZ_R |
3220						   R500_ALU_RGBA_G_SWIZ_G |
3221						   R500_ALU_RGBA_B_SWIZ_B |
3222						   R500_ALU_RGBA_A_SWIZ_A));
3223
3224	    /* LRP output, temp5.zzzz, temp3, temp0 ->
3225	     * - PRESUB temps, temp3 - temp0
3226	     * - MAD temp5.zzzz, temps, temp0 */
3227	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3228						   R500_INST_LAST |
3229						   R500_INST_TEX_SEM_WAIT |
3230						   R500_INST_RGB_WMASK_R |
3231						   R500_INST_RGB_WMASK_G |
3232						   R500_INST_RGB_WMASK_B |
3233						   R500_INST_ALPHA_WMASK |
3234						   R500_INST_RGB_OMASK_R |
3235						   R500_INST_RGB_OMASK_G |
3236						   R500_INST_RGB_OMASK_B |
3237						   R500_INST_ALPHA_OMASK));
3238	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3239						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
3240						   R500_RGB_ADDR1(3) |
3241						   R500_RGB_ADDR2(5)));
3242	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3243						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
3244						   R500_ALPHA_ADDR1(3) |
3245						   R500_ALPHA_ADDR2(5)));
3246	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
3247						   R500_ALU_RGB_R_SWIZ_A_B |
3248						   R500_ALU_RGB_G_SWIZ_A_B |
3249						   R500_ALU_RGB_B_SWIZ_A_B |
3250						   R500_ALU_RGB_SEL_B_SRCP |
3251						   R500_ALU_RGB_R_SWIZ_B_R |
3252						   R500_ALU_RGB_G_SWIZ_B_G |
3253						   R500_ALU_RGB_B_SWIZ_B_B));
3254	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
3255						   R500_ALPHA_OP_MAD |
3256						   R500_ALPHA_SEL_A_SRC2 |
3257						   R500_ALPHA_SWIZ_A_B |
3258						   R500_ALPHA_SEL_B_SRCP |
3259						   R500_ALPHA_SWIZ_B_A));
3260	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
3261						   R500_ALU_RGBA_OP_MAD |
3262						   R500_ALU_RGBA_SEL_C_SRC0 |
3263						   R500_ALU_RGBA_R_SWIZ_R |
3264						   R500_ALU_RGBA_G_SWIZ_G |
3265						   R500_ALU_RGBA_B_SWIZ_B |
3266						   R500_ALU_RGBA_A_SWIZ_A));
3267
3268	    /* Shader constants. */
3269	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
3270
3271	    /* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */
3272	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w));
3273	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h));
3274	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
3275	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
3276
3277	    FINISH_ACCEL();
3278	} else {
3279	    BEGIN_ACCEL(19);
3280	    /* 2 components: 2 for tex0 */
3281	    OUT_ACCEL_REG(R300_RS_COUNT,
3282			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3283			   R300_RS_COUNT_HIRES_EN));
3284
3285	    /* R300_INST_COUNT_RS - highest RS instruction used */
3286	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3287
3288	    /* Pixel stack frame size. */
3289	    OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
3290
3291	    /* FP length. */
3292	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3293					      R500_US_CODE_END_ADDR(1)));
3294	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3295					       R500_US_CODE_RANGE_SIZE(1)));
3296
3297	    /* Prepare for FP emission. */
3298	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
3299	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3300
3301	    /* tex inst */
3302	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3303						   R500_INST_TEX_SEM_WAIT |
3304						   R500_INST_RGB_WMASK_R |
3305						   R500_INST_RGB_WMASK_G |
3306						   R500_INST_RGB_WMASK_B |
3307						   R500_INST_ALPHA_WMASK |
3308						   R500_INST_RGB_CLAMP |
3309						   R500_INST_ALPHA_CLAMP));
3310	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3311						   R500_TEX_INST_LD |
3312						   R500_TEX_SEM_ACQUIRE |
3313						   R500_TEX_IGNORE_UNCOVERED));
3314	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3315						   R500_TEX_SRC_S_SWIZ_R |
3316						   R500_TEX_SRC_T_SWIZ_G |
3317						   R500_TEX_DST_ADDR(0) |
3318						   R500_TEX_DST_R_SWIZ_R |
3319						   R500_TEX_DST_G_SWIZ_G |
3320						   R500_TEX_DST_B_SWIZ_B |
3321						   R500_TEX_DST_A_SWIZ_A));
3322	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3323						   R500_DX_S_SWIZ_R |
3324						   R500_DX_T_SWIZ_R |
3325						   R500_DX_R_SWIZ_R |
3326						   R500_DX_Q_SWIZ_R |
3327						   R500_DY_ADDR(0) |
3328						   R500_DY_S_SWIZ_R |
3329						   R500_DY_T_SWIZ_R |
3330						   R500_DY_R_SWIZ_R |
3331						   R500_DY_Q_SWIZ_R));
3332	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3333	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3334
3335	    /* ALU inst */
3336	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3337						   R500_INST_TEX_SEM_WAIT |
3338						   R500_INST_LAST |
3339						   R500_INST_RGB_OMASK_R |
3340						   R500_INST_RGB_OMASK_G |
3341						   R500_INST_RGB_OMASK_B |
3342						   R500_INST_ALPHA_OMASK |
3343						   R500_INST_RGB_CLAMP |
3344						   R500_INST_ALPHA_CLAMP));
3345	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3346						   R500_RGB_ADDR1(0) |
3347						   R500_RGB_ADDR1_CONST |
3348						   R500_RGB_ADDR2(0) |
3349						   R500_RGB_ADDR2_CONST));
3350	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3351						   R500_ALPHA_ADDR1(0) |
3352						   R500_ALPHA_ADDR1_CONST |
3353						   R500_ALPHA_ADDR2(0) |
3354						   R500_ALPHA_ADDR2_CONST));
3355	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3356						   R500_ALU_RGB_R_SWIZ_A_R |
3357						   R500_ALU_RGB_G_SWIZ_A_G |
3358						   R500_ALU_RGB_B_SWIZ_A_B |
3359						   R500_ALU_RGB_SEL_B_SRC0 |
3360						   R500_ALU_RGB_R_SWIZ_B_1 |
3361						   R500_ALU_RGB_B_SWIZ_B_1 |
3362						   R500_ALU_RGB_G_SWIZ_B_1));
3363	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3364						   R500_ALPHA_SWIZ_A_A |
3365						   R500_ALPHA_SWIZ_B_1));
3366	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3367						   R500_ALU_RGBA_R_SWIZ_0 |
3368						   R500_ALU_RGBA_G_SWIZ_0 |
3369						   R500_ALU_RGBA_B_SWIZ_0 |
3370						   R500_ALU_RGBA_A_SWIZ_0));
3371	    FINISH_ACCEL();
3372	}
3373    } else {
3374	/*
3375	 * y' = y - .0625
3376	 * u' = u - .5
3377	 * v' = v - .5;
3378	 *
3379	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
3380	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
3381	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
3382	 *
3383	 * DP3 might look like the straightforward solution
3384	 * but we'd need to move the texture yuv values in
3385	 * the same reg for this to work. Therefore use MADs.
3386	 * Brightness just adds to the off constant.
3387	 * Contrast is multiplication of luminance.
3388	 * Saturation and hue change the u and v coeffs.
3389	 * Default values (before adjustments - depend on colorspace):
3390	 * yco = 1.1643
3391	 * uco = 0, -0.39173, 2.017
3392	 * vco = 1.5958, -0.8129, 0
3393	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
3394	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
3395	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
3396	 *
3397	 * temp = MAD(yco, yuv.yyyy, off)
3398	 * temp = MAD(uco, yuv.uuuu, temp)
3399	 * result = MAD(vco, yuv.vvvv, temp)
3400	 */
3401	/* TODO: don't recalc consts always */
3402	const float Loff = -0.0627;
3403	const float Coff = -0.502;
3404	float uvcosf, uvsinf;
3405	float yco;
3406	float uco[3], vco[3], off[3];
3407	float bright, cont, gamma;
3408	int ref = pPriv->transform_index;
3409	Bool needgamma = FALSE;
3410
3411	cont = RTFContrast(pPriv->contrast);
3412	bright = RTFBrightness(pPriv->brightness);
3413	gamma = (float)pPriv->gamma / 1000.0;
3414	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
3415	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
3416	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
3417
3418	yco = trans[ref].RefLuma * cont;
3419	uco[0] = -trans[ref].RefRCr * uvsinf;
3420	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
3421	uco[2] = trans[ref].RefBCb * uvcosf;
3422	vco[0] = trans[ref].RefRCr * uvcosf;
3423	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
3424	vco[2] = trans[ref].RefBCb * uvsinf;
3425	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
3426	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
3427	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
3428
3429	//XXX gamma
3430
3431	if (gamma != 1.0) {
3432	    needgamma = TRUE;
3433	    /* note: gamma correction is out = in ^ gamma;
3434	       gpu can only do LG2/EX2 therefore we transform into
3435	       in ^ gamma = 2 ^ (log2(in) * gamma).
3436	       Lots of scalar ops, unfortunately (better solution?) -
3437	       without gamma that's 3 inst, with gamma it's 10...
3438	       could use different gamma factors per channel,
3439	       if that's of any use. */
3440	}
3441
3442	if (isplanar) {
3443	    BEGIN_ACCEL(56);
3444	    /* 2 components: 2 for tex0/1/2 */
3445	    OUT_ACCEL_REG(R300_RS_COUNT,
3446			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3447			   R300_RS_COUNT_HIRES_EN));
3448
3449	    /* R300_INST_COUNT_RS - highest RS instruction used */
3450	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3451
3452	    /* Pixel stack frame size. */
3453	    OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
3454
3455	    /* FP length. */
3456	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3457					      R500_US_CODE_END_ADDR(5)));
3458	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3459					       R500_US_CODE_RANGE_SIZE(5)));
3460
3461	    /* Prepare for FP emission. */
3462	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
3463	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3464
3465	    /* tex inst */
3466	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3467						   R500_INST_TEX_SEM_WAIT |
3468						   R500_INST_RGB_WMASK_R |
3469						   R500_INST_RGB_WMASK_G |
3470						   R500_INST_RGB_WMASK_B |
3471						   R500_INST_ALPHA_WMASK |
3472						   R500_INST_RGB_CLAMP |
3473						   R500_INST_ALPHA_CLAMP));
3474	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3475						   R500_TEX_INST_LD |
3476						   R500_TEX_IGNORE_UNCOVERED));
3477	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3478						   R500_TEX_SRC_S_SWIZ_R |
3479						   R500_TEX_SRC_T_SWIZ_G |
3480						   R500_TEX_DST_ADDR(2) |
3481						   R500_TEX_DST_R_SWIZ_R |
3482						   R500_TEX_DST_G_SWIZ_G |
3483						   R500_TEX_DST_B_SWIZ_B |
3484						   R500_TEX_DST_A_SWIZ_A));
3485	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3486						   R500_DX_S_SWIZ_R |
3487						   R500_DX_T_SWIZ_R |
3488						   R500_DX_R_SWIZ_R |
3489						   R500_DX_Q_SWIZ_R |
3490						   R500_DY_ADDR(0) |
3491						   R500_DY_S_SWIZ_R |
3492						   R500_DY_T_SWIZ_R |
3493						   R500_DY_R_SWIZ_R |
3494						   R500_DY_Q_SWIZ_R));
3495	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3496	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3497
3498	    /* tex inst */
3499	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3500						   R500_INST_TEX_SEM_WAIT |
3501						   R500_INST_RGB_WMASK_R |
3502						   R500_INST_RGB_WMASK_G |
3503						   R500_INST_RGB_WMASK_B |
3504						   R500_INST_ALPHA_WMASK |
3505						   R500_INST_RGB_CLAMP |
3506						   R500_INST_ALPHA_CLAMP));
3507	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
3508						   R500_TEX_INST_LD |
3509						   R500_TEX_IGNORE_UNCOVERED));
3510	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3511						   R500_TEX_SRC_S_SWIZ_R |
3512						   R500_TEX_SRC_T_SWIZ_G |
3513						   R500_TEX_DST_ADDR(1) |
3514						   R500_TEX_DST_R_SWIZ_R |
3515						   R500_TEX_DST_G_SWIZ_G |
3516						   R500_TEX_DST_B_SWIZ_B |
3517						   R500_TEX_DST_A_SWIZ_A));
3518	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3519						   R500_DX_S_SWIZ_R |
3520						   R500_DX_T_SWIZ_R |
3521						   R500_DX_R_SWIZ_R |
3522						   R500_DX_Q_SWIZ_R |
3523						   R500_DY_ADDR(0) |
3524						   R500_DY_S_SWIZ_R |
3525						   R500_DY_T_SWIZ_R |
3526						   R500_DY_R_SWIZ_R |
3527						   R500_DY_Q_SWIZ_R));
3528	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3529	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3530
3531	    /* tex inst */
3532	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3533						   R500_INST_TEX_SEM_WAIT |
3534						   R500_INST_RGB_WMASK_R |
3535						   R500_INST_RGB_WMASK_G |
3536						   R500_INST_RGB_WMASK_B |
3537						   R500_INST_ALPHA_WMASK |
3538						   R500_INST_RGB_CLAMP |
3539						   R500_INST_ALPHA_CLAMP));
3540	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(2) |
3541						   R500_TEX_INST_LD |
3542						   R500_TEX_SEM_ACQUIRE |
3543						   R500_TEX_IGNORE_UNCOVERED));
3544	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3545						   R500_TEX_SRC_S_SWIZ_R |
3546						   R500_TEX_SRC_T_SWIZ_G |
3547						   R500_TEX_DST_ADDR(0) |
3548						   R500_TEX_DST_R_SWIZ_R |
3549						   R500_TEX_DST_G_SWIZ_G |
3550						   R500_TEX_DST_B_SWIZ_B |
3551						   R500_TEX_DST_A_SWIZ_A));
3552	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3553						   R500_DX_S_SWIZ_R |
3554						   R500_DX_T_SWIZ_R |
3555						   R500_DX_R_SWIZ_R |
3556						   R500_DX_Q_SWIZ_R |
3557						   R500_DY_ADDR(0) |
3558						   R500_DY_S_SWIZ_R |
3559						   R500_DY_T_SWIZ_R |
3560						   R500_DY_R_SWIZ_R |
3561						   R500_DY_Q_SWIZ_R));
3562	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3563	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3564
3565	    /* ALU inst */
3566	    /* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
3567	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3568						   R500_INST_TEX_SEM_WAIT |
3569						   R500_INST_RGB_WMASK_R |
3570						   R500_INST_RGB_WMASK_G |
3571						   R500_INST_RGB_WMASK_B |
3572						   R500_INST_ALPHA_WMASK));
3573	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3574						   R500_RGB_ADDR0_CONST |
3575						   R500_RGB_ADDR1(2) |
3576						   R500_RGB_ADDR2(0) |
3577						   R500_RGB_ADDR2_CONST));
3578	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3579						   R500_ALPHA_ADDR0_CONST |
3580						   R500_ALPHA_ADDR1(2) |
3581						   R500_ALPHA_ADDR2(0) |
3582						   R500_ALPHA_ADDR2_CONST));
3583	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3584						   R500_ALU_RGB_R_SWIZ_A_A |
3585						   R500_ALU_RGB_G_SWIZ_A_A |
3586						   R500_ALU_RGB_B_SWIZ_A_A |
3587						   R500_ALU_RGB_SEL_B_SRC1 |
3588						   R500_ALU_RGB_R_SWIZ_B_R |
3589						   R500_ALU_RGB_B_SWIZ_B_G |
3590						   R500_ALU_RGB_G_SWIZ_B_B));
3591	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3592						   R500_ALPHA_ADDRD(2) |
3593						   R500_ALPHA_SWIZ_A_0 |
3594						   R500_ALPHA_SWIZ_B_0));
3595	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3596						   R500_ALU_RGBA_ADDRD(2) |
3597						   R500_ALU_RGBA_SEL_C_SRC0 |
3598						   R500_ALU_RGBA_R_SWIZ_R |
3599						   R500_ALU_RGBA_G_SWIZ_G |
3600						   R500_ALU_RGBA_B_SWIZ_B |
3601						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3602						   R500_ALU_RGBA_A_SWIZ_0));
3603
3604	    /* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
3605	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3606						   R500_INST_TEX_SEM_WAIT |
3607						   R500_INST_RGB_WMASK_R |
3608						   R500_INST_RGB_WMASK_G |
3609						   R500_INST_RGB_WMASK_B |
3610						   R500_INST_ALPHA_WMASK));
3611	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
3612						   R500_RGB_ADDR0_CONST |
3613						   R500_RGB_ADDR1(1) |
3614						   R500_RGB_ADDR2(2)));
3615	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3616						   R500_ALPHA_ADDR0_CONST |
3617						   R500_ALPHA_ADDR1(1) |
3618						   R500_ALPHA_ADDR2(2)));
3619	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3620						   R500_ALU_RGB_R_SWIZ_A_R |
3621						   R500_ALU_RGB_G_SWIZ_A_G |
3622						   R500_ALU_RGB_B_SWIZ_A_B |
3623						   R500_ALU_RGB_SEL_B_SRC1 |
3624						   R500_ALU_RGB_R_SWIZ_B_R |
3625						   R500_ALU_RGB_B_SWIZ_B_G |
3626						   R500_ALU_RGB_G_SWIZ_B_B));
3627	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3628						   R500_ALPHA_ADDRD(2) |
3629						   R500_ALPHA_SWIZ_A_0 |
3630						   R500_ALPHA_SWIZ_B_0));
3631	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3632						   R500_ALU_RGBA_ADDRD(2) |
3633						   R500_ALU_RGBA_SEL_C_SRC2 |
3634						   R500_ALU_RGBA_R_SWIZ_R |
3635						   R500_ALU_RGBA_G_SWIZ_G |
3636						   R500_ALU_RGBA_B_SWIZ_B |
3637						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3638						   R500_ALU_RGBA_A_SWIZ_0));
3639
3640	    /* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
3641	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3642						   R500_INST_TEX_SEM_WAIT |
3643						   R500_INST_LAST |
3644						   R500_INST_RGB_OMASK_R |
3645						   R500_INST_RGB_OMASK_G |
3646						   R500_INST_RGB_OMASK_B |
3647						   R500_INST_ALPHA_OMASK |
3648						   R500_INST_RGB_CLAMP |
3649						   R500_INST_ALPHA_CLAMP));
3650	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
3651						   R500_RGB_ADDR0_CONST |
3652						   R500_RGB_ADDR1(0) |
3653						   R500_RGB_ADDR2(2)));
3654	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(2) |
3655						   R500_ALPHA_ADDR0_CONST |
3656						   R500_ALPHA_ADDR1(0) |
3657						   R500_ALPHA_ADDR2(2)));
3658	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3659						   R500_ALU_RGB_R_SWIZ_A_R |
3660						   R500_ALU_RGB_G_SWIZ_A_G |
3661						   R500_ALU_RGB_B_SWIZ_A_B |
3662						   R500_ALU_RGB_SEL_B_SRC1 |
3663						   R500_ALU_RGB_R_SWIZ_B_R |
3664						   R500_ALU_RGB_B_SWIZ_B_G |
3665						   R500_ALU_RGB_G_SWIZ_B_B));
3666	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3667						   R500_ALPHA_ADDRD(0) |
3668						   R500_ALPHA_SWIZ_A_0 |
3669						   R500_ALPHA_SWIZ_B_0));
3670	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3671						   R500_ALU_RGBA_ADDRD(0) |
3672						   R500_ALU_RGBA_SEL_C_SRC2 |
3673						   R500_ALU_RGBA_R_SWIZ_R |
3674						   R500_ALU_RGBA_G_SWIZ_G |
3675						   R500_ALU_RGBA_B_SWIZ_B |
3676						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3677						   R500_ALU_RGBA_A_SWIZ_1));
3678
3679	} else {
3680	    BEGIN_ACCEL(44);
3681	    /* 2 components: 2 for tex0 */
3682	    OUT_ACCEL_REG(R300_RS_COUNT,
3683			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
3684			   R300_RS_COUNT_HIRES_EN));
3685
3686	    /* R300_INST_COUNT_RS - highest RS instruction used */
3687	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0));
3688
3689	    /* Pixel stack frame size. */
3690	    OUT_ACCEL_REG(R300_US_PIXSIZE, 1); /* highest temp used */
3691
3692	    /* FP length. */
3693	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
3694					      R500_US_CODE_END_ADDR(3)));
3695	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
3696					       R500_US_CODE_RANGE_SIZE(3)));
3697
3698	    /* Prepare for FP emission. */
3699	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
3700	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
3701
3702	    /* tex inst */
3703	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
3704						   R500_INST_TEX_SEM_WAIT |
3705						   R500_INST_RGB_WMASK_R |
3706						   R500_INST_RGB_WMASK_G |
3707						   R500_INST_RGB_WMASK_B |
3708						   R500_INST_ALPHA_WMASK |
3709						   R500_INST_RGB_CLAMP |
3710						   R500_INST_ALPHA_CLAMP));
3711	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
3712						   R500_TEX_INST_LD |
3713						   R500_TEX_SEM_ACQUIRE |
3714						   R500_TEX_IGNORE_UNCOVERED));
3715	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
3716						   R500_TEX_SRC_S_SWIZ_R |
3717						   R500_TEX_SRC_T_SWIZ_G |
3718						   R500_TEX_DST_ADDR(0) |
3719						   R500_TEX_DST_R_SWIZ_R |
3720						   R500_TEX_DST_G_SWIZ_G |
3721						   R500_TEX_DST_B_SWIZ_B |
3722						   R500_TEX_DST_A_SWIZ_A));
3723	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
3724						   R500_DX_S_SWIZ_R |
3725						   R500_DX_T_SWIZ_R |
3726						   R500_DX_R_SWIZ_R |
3727						   R500_DX_Q_SWIZ_R |
3728						   R500_DY_ADDR(0) |
3729						   R500_DY_S_SWIZ_R |
3730						   R500_DY_T_SWIZ_R |
3731						   R500_DY_R_SWIZ_R |
3732						   R500_DY_Q_SWIZ_R));
3733	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3734	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
3735
3736	    /* ALU inst */
3737	    /* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
3738	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3739						   R500_INST_TEX_SEM_WAIT |
3740						   R500_INST_RGB_WMASK_R |
3741						   R500_INST_RGB_WMASK_G |
3742						   R500_INST_RGB_WMASK_B |
3743						   R500_INST_ALPHA_WMASK));
3744	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
3745						   R500_RGB_ADDR0_CONST |
3746						   R500_RGB_ADDR1(0) |
3747						   R500_RGB_ADDR2(0) |
3748						   R500_RGB_ADDR2_CONST));
3749	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
3750						   R500_ALPHA_ADDR0_CONST |
3751						   R500_ALPHA_ADDR1(0) |
3752						   R500_ALPHA_ADDR2(0) |
3753						   R500_ALPHA_ADDR2_CONST));
3754	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3755						   R500_ALU_RGB_R_SWIZ_A_A |
3756						   R500_ALU_RGB_G_SWIZ_A_A |
3757						   R500_ALU_RGB_B_SWIZ_A_A |
3758						   R500_ALU_RGB_SEL_B_SRC1 |
3759						   R500_ALU_RGB_R_SWIZ_B_G |
3760						   R500_ALU_RGB_B_SWIZ_B_G |
3761						   R500_ALU_RGB_G_SWIZ_B_G));
3762	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3763						   R500_ALPHA_ADDRD(1) |
3764						   R500_ALPHA_SWIZ_A_0 |
3765						   R500_ALPHA_SWIZ_B_0));
3766	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3767						   R500_ALU_RGBA_ADDRD(1) |
3768						   R500_ALU_RGBA_SEL_C_SRC0 |
3769						   R500_ALU_RGBA_R_SWIZ_R |
3770						   R500_ALU_RGBA_G_SWIZ_G |
3771						   R500_ALU_RGBA_B_SWIZ_B |
3772						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3773						   R500_ALU_RGBA_A_SWIZ_0));
3774
3775	    /* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
3776	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
3777						   R500_INST_TEX_SEM_WAIT |
3778						   R500_INST_RGB_WMASK_R |
3779						   R500_INST_RGB_WMASK_G |
3780						   R500_INST_RGB_WMASK_B |
3781						   R500_INST_ALPHA_WMASK));
3782	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
3783						   R500_RGB_ADDR0_CONST |
3784						   R500_RGB_ADDR1(0) |
3785						   R500_RGB_ADDR2(1)));
3786	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3787						   R500_ALPHA_ADDR0_CONST |
3788						   R500_ALPHA_ADDR1(0) |
3789						   R500_ALPHA_ADDR2(1)));
3790	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3791						   R500_ALU_RGB_R_SWIZ_A_R |
3792						   R500_ALU_RGB_G_SWIZ_A_G |
3793						   R500_ALU_RGB_B_SWIZ_A_B |
3794						   R500_ALU_RGB_SEL_B_SRC1 |
3795						   R500_ALU_RGB_R_SWIZ_B_B |
3796						   R500_ALU_RGB_B_SWIZ_B_B |
3797						   R500_ALU_RGB_G_SWIZ_B_B));
3798	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3799						   R500_ALPHA_ADDRD(1) |
3800						   R500_ALPHA_SWIZ_A_0 |
3801						   R500_ALPHA_SWIZ_B_0));
3802	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3803						   R500_ALU_RGBA_ADDRD(1) |
3804						   R500_ALU_RGBA_SEL_C_SRC2 |
3805						   R500_ALU_RGBA_R_SWIZ_R |
3806						   R500_ALU_RGBA_G_SWIZ_G |
3807						   R500_ALU_RGBA_B_SWIZ_B |
3808						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3809						   R500_ALU_RGBA_A_SWIZ_0));
3810
3811	    /* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
3812	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
3813						   R500_INST_TEX_SEM_WAIT |
3814						   R500_INST_LAST |
3815						   R500_INST_RGB_OMASK_R |
3816						   R500_INST_RGB_OMASK_G |
3817						   R500_INST_RGB_OMASK_B |
3818						   R500_INST_ALPHA_OMASK |
3819						   R500_INST_RGB_CLAMP |
3820						   R500_INST_ALPHA_CLAMP));
3821	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
3822						   R500_RGB_ADDR0_CONST |
3823						   R500_RGB_ADDR1(0) |
3824						   R500_RGB_ADDR2(1)));
3825	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
3826						   R500_ALPHA_ADDR0_CONST |
3827						   R500_ALPHA_ADDR1(0) |
3828						   R500_ALPHA_ADDR2(1)));
3829	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
3830						   R500_ALU_RGB_R_SWIZ_A_R |
3831						   R500_ALU_RGB_G_SWIZ_A_G |
3832						   R500_ALU_RGB_B_SWIZ_A_B |
3833						   R500_ALU_RGB_SEL_B_SRC1 |
3834						   R500_ALU_RGB_R_SWIZ_B_R |
3835						   R500_ALU_RGB_B_SWIZ_B_R |
3836						   R500_ALU_RGB_G_SWIZ_B_R));
3837	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
3838						   R500_ALPHA_ADDRD(1) |
3839						   R500_ALPHA_SWIZ_A_0 |
3840						   R500_ALPHA_SWIZ_B_0));
3841	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
3842						   R500_ALU_RGBA_ADDRD(1) |
3843						   R500_ALU_RGBA_SEL_C_SRC2 |
3844						   R500_ALU_RGBA_R_SWIZ_R |
3845						   R500_ALU_RGBA_G_SWIZ_G |
3846						   R500_ALU_RGBA_B_SWIZ_B |
3847						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
3848						   R500_ALU_RGBA_A_SWIZ_1));
3849	}
3850
3851	/* Shader constants. */
3852	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
3853
3854	/* constant 0: off, yco */
3855	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[0]);
3856	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[1]);
3857	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[2]);
3858	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, yco);
3859	/* constant 1: uco */
3860	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[0]);
3861	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[1]);
3862	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[2]);
3863	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, gamma);
3864	/* constant 2: vco */
3865	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[0]);
3866	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[1]);
3867	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[2]);
3868	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0.0);
3869
3870	FINISH_ACCEL();
3871    }
3872
3873    BEGIN_ACCEL_RELOC(6, 2);
3874    OUT_ACCEL_REG(R300_TX_INVALTAGS, 0);
3875    OUT_ACCEL_REG(R300_TX_ENABLE, txenable);
3876
3877    EMIT_WRITE_OFFSET(R300_RB3D_COLOROFFSET0, 0, pPixmap);
3878    EMIT_COLORPITCH(R300_RB3D_COLORPITCH0, colorpitch, pPixmap);
3879
3880    /* no need to enable blending */
3881    OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
3882
3883    OUT_ACCEL_REG(R300_VAP_VTX_SIZE, vtx_count);
3884    FINISH_ACCEL();
3885
3886    if (pPriv->vsync) {
3887	xf86CrtcPtr crtc;
3888	if (pPriv->desired_crtc)
3889	    crtc = pPriv->desired_crtc;
3890	else
3891	    crtc = radeon_pick_best_crtc(pScrn,
3892					 pPriv->drw_x,
3893					 pPriv->drw_x + pPriv->dst_w,
3894					 pPriv->drw_y,
3895					 pPriv->drw_y + pPriv->dst_h);
3896	if (crtc)
3897	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
3898					  crtc,
3899					  pPriv->drw_y - crtc->y,
3900					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
3901    }
3902    /*
3903     * Rendering of the actual polygon is done in two different
3904     * ways depending on chip generation:
3905     *
3906     * < R300:
3907     *
3908     *     These chips can render a rectangle in one pass, so
3909     *     handling is pretty straight-forward.
3910     *
3911     * >= R300:
3912     *
3913     *     These chips can accept a quad, but will render it as
3914     *     two triangles, which results in a diagonal tear. Instead,
3915     *     we render a single, large triangle and use the scissor
3916     *     functionality to restrict it to the desired rectangle.
3917     *     Due to guardband limits on r3xx/r4xx, we can only use
3918     *     the single triangle up to 2880 pixels; above that we
3919     *     render as a quad.
3920     */
3921
3922    while (nBox--) {
3923	int srcX, srcY, srcw, srch;
3924	int dstX, dstY, dstw, dsth;
3925	dstX = pBox->x1 + dstxoff;
3926	dstY = pBox->y1 + dstyoff;
3927	dstw = pBox->x2 - pBox->x1;
3928	dsth = pBox->y2 - pBox->y1;
3929
3930	srcX = pPriv->src_x;
3931	srcX += ((pBox->x1 - pPriv->drw_x) *
3932		 pPriv->src_w) / pPriv->dst_w;
3933	srcY = pPriv->src_y;
3934	srcY += ((pBox->y1 - pPriv->drw_y) *
3935		 pPriv->src_h) / pPriv->dst_h;
3936
3937	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
3938	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
3939
3940	BEGIN_ACCEL(2);
3941	OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX) << R300_SCISSOR_X_SHIFT) |
3942					 ((dstY) << R300_SCISSOR_Y_SHIFT)));
3943	OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw - 1) << R300_SCISSOR_X_SHIFT) |
3944					 ((dstY + dsth - 1) << R300_SCISSOR_Y_SHIFT)));
3945	FINISH_ACCEL();
3946
3947#ifdef ACCEL_CP
3948	BEGIN_RING(3 * vtx_count + 4);
3949	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
3950			    3 * vtx_count));
3951	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
3952		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
3953		 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
3954#else /* ACCEL_CP */
3955	BEGIN_ACCEL(2 + vtx_count * 3);
3956	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST |
3957					  RADEON_VF_PRIM_WALK_DATA |
3958					  (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
3959#endif
3960	if (pPriv->bicubic_enabled) {
3961	    VTX_OUT_6((float)dstX,            (float)dstY,
3962		      (float)srcX / pPriv->w, (float)srcY / pPriv->h,
3963		      (float)srcX + 0.5,      (float)srcY + 0.5);
3964	    VTX_OUT_6((float)dstX,            (float)(dstY + dstw + dsth),
3965		      (float)srcX / pPriv->w, ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h,
3966		      (float)srcX + 0.5,      (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
3967	    VTX_OUT_6((float)(dstX + dstw + dsth),                       (float)dstY,
3968		      ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
3969		      (float)srcY / pPriv->h,
3970		      (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
3971		      (float)srcY + 0.5);
3972	} else {
3973	    /*
3974	     * Render a big, scissored triangle. This means
3975	     * increasing the triangle size and adjusting
3976	     * texture coordinates.
3977	     */
3978	    VTX_OUT_4((float)dstX,            (float)dstY,
3979		      (float)srcX / pPriv->w, (float)srcY / pPriv->h);
3980	    VTX_OUT_4((float)dstX,                              (float)(dstY + dsth + dstw),
3981		      (float)srcX / pPriv->w, ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / pPriv->h);
3982	    VTX_OUT_4((float)(dstX + dstw + dsth),              (float)dstY,
3983		      ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / pPriv->w,
3984		      (float)srcY / pPriv->h);
3985	}
3986
3987	/* flushing is pipelined, free/finish is not */
3988	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
3989
3990#ifdef ACCEL_CP
3991	ADVANCE_RING();
3992#else
3993	FINISH_ACCEL();
3994#endif /* !ACCEL_CP */
3995
3996	pBox++;
3997    }
3998
3999    BEGIN_ACCEL(3);
4000    OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA);
4001    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
4002    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
4003    FINISH_ACCEL();
4004
4005    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
4006}
4007
/*
 * Drop the helper macros used by the code above: the vertex emission
 * macros (VTX_OUT_4, VTX_OUT_6) and FUNC_NAME, which appends the MMIO or
 * CP suffix to function names depending on the ACCEL_* macro in effect.
 * NOTE(review): presumably this allows the file to be included again with
 * the other acceleration type defined -- confirm against the including file.
 */
4008#undef VTX_OUT_4
4009#undef VTX_OUT_6
4010#undef FUNC_NAME
4011