radeon_textured_videofuncs.c revision b7e1c893
1/*
2 * Copyright 2008 Alex Deucher
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 *
24 * Based on radeon_exa_render.c and kdrive ati_video.c by Eric Anholt, et al.
25 *
26 */
27
/* Only one acceleration back end (MMIO or CP) may be selected per build. */
#if defined(ACCEL_CP) && defined(ACCEL_MMIO)
#error Cannot define both MMIO and CP acceleration!
#endif

/*
 * FUNC_NAME_CAT(head, tail) glues two tokens into a single identifier.
 *
 * The ANSI paste operator is used unless UNIXCPP is defined without ANSICPP;
 * in that one case the empty-comment form is used instead (presumably for
 * pre-ANSI preprocessors that drop the comment and join the neighbouring
 * tokens -- confirm against the supported toolchains).
 */
#if defined(UNIXCPP) && !defined(ANSICPP)
#define FUNC_NAME_CAT(head,tail) head/**/tail
#else
#define FUNC_NAME_CAT(head,tail) head##tail
#endif
37
38#ifdef ACCEL_MMIO
39#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,MMIO)
40#else
41#ifdef ACCEL_CP
42#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,CP)
43#else
44#error No accel type defined!
45#endif
46#endif
47
/*
 * Vertex emission helpers.
 *
 * VTX_OUT writes one vertex as four floats, in this order:
 *   destination X, destination Y, source X, source Y.
 * VTX_OUT_FILTER writes the same four floats followed by two more
 * (the extra X/Y pair), i.e. six floats per vertex.
 *
 * Without ACCEL_CP every float is written with OUT_ACCEL_REG_F() to
 * RADEON_SE_PORT_DATA0; with ACCEL_CP every float is pushed with
 * OUT_RING_F().  Both variants are wrapped in do { } while (0) so they
 * behave as a single statement at the call site.
 */
#ifndef ACCEL_CP

#define VTX_OUT_FILTER(dx, dy, sx, sy, mx, my)			\
do {								\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, dx);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, dy);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, sx);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, sy);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, mx);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, my);			\
} while (0)

#define VTX_OUT(dx, dy, sx, sy)					\
do {								\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, dx);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, dy);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, sx);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, sy);			\
} while (0)

#else /* ACCEL_CP */

#define VTX_OUT_FILTER(dx, dy, sx, sy, mx, my)			\
do {								\
    OUT_RING_F(dx);						\
    OUT_RING_F(dy);						\
    OUT_RING_F(sx);						\
    OUT_RING_F(sy);						\
    OUT_RING_F(mx);						\
    OUT_RING_F(my);						\
} while (0)

#define VTX_OUT(dx, dy, sx, sy)					\
do {								\
    OUT_RING_F(dx);						\
    OUT_RING_F(dy);						\
    OUT_RING_F(sx);						\
    OUT_RING_F(sy);						\
} while (0)

#endif /* ACCEL_CP */
89
90static void
91FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
92{
93    RADEONInfoPtr info = RADEONPTR(pScrn);
94    PixmapPtr pPixmap = pPriv->pPixmap;
95    uint32_t txformat;
96    uint32_t txfilter, txformat0, txformat1, txoffset, txpitch;
97    uint32_t dst_offset, dst_pitch, dst_format;
98    uint32_t txenable, colorpitch;
99    uint32_t blendcntl;
100    Bool isplanar = FALSE;
101    int dstxoff, dstyoff, pixel_shift, vtx_count;
102    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
103    int nBox = REGION_NUM_RECTS(&pPriv->clip);
104    ACCEL_PREAMBLE();
105
106    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
107
108#ifdef USE_EXA
109    if (info->useEXA) {
110	dst_offset = exaGetPixmapOffset(pPixmap) + info->fbLocation + pScrn->fbOffset;
111	dst_pitch = exaGetPixmapPitch(pPixmap);
112    } else
113#endif
114	{
115	    dst_offset = (pPixmap->devPrivate.ptr - info->FB) +
116		info->fbLocation + pScrn->fbOffset;
117	    dst_pitch = pPixmap->devKind;
118	}
119
120#ifdef COMPOSITE
121    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
122    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
123#else
124    dstxoff = 0;
125    dstyoff = 0;
126#endif
127
128#ifdef USE_EXA
129    if (info->useEXA) {
130	RADEON_SWITCH_TO_3D();
131    } else
132#endif
133	{
134	    BEGIN_ACCEL(2);
135	    if (IS_R300_3D || IS_R500_3D)
136		OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
137	    else
138		OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
139	    /* We must wait for 3d to idle, in case source was just written as a dest. */
140	    OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
141			  RADEON_WAIT_HOST_IDLECLEAN |
142			  RADEON_WAIT_2D_IDLECLEAN |
143			  RADEON_WAIT_3D_IDLECLEAN |
144			  RADEON_WAIT_DMA_GUI_IDLE);
145	    FINISH_ACCEL();
146
147	    if (!info->accel_state->XInited3D)
148		RADEONInit3DEngine(pScrn);
149	}
150
151    if (pPriv->bicubic_enabled)
152	vtx_count = 6;
153    else
154	vtx_count = 4;
155
156    if (IS_R300_3D || IS_R500_3D) {
157	uint32_t output_fmt;
158
159	switch (pPixmap->drawable.bitsPerPixel) {
160	case 16:
161	    if (pPixmap->drawable.depth == 15)
162		dst_format = R300_COLORFORMAT_ARGB1555;
163	    else
164		dst_format = R300_COLORFORMAT_RGB565;
165	    break;
166	case 32:
167	    dst_format = R300_COLORFORMAT_ARGB8888;
168	    break;
169	default:
170	    return;
171	}
172
173	output_fmt = (R300_OUT_FMT_C4_8 |
174		      R300_OUT_FMT_C0_SEL_BLUE |
175		      R300_OUT_FMT_C1_SEL_GREEN |
176		      R300_OUT_FMT_C2_SEL_RED |
177		      R300_OUT_FMT_C3_SEL_ALPHA);
178
179	colorpitch = dst_pitch >> pixel_shift;
180	colorpitch |= dst_format;
181
182	if (RADEONTilingEnabled(pScrn, pPixmap))
183	    colorpitch |= R300_COLORTILE;
184
185	if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
186	    isplanar = TRUE;
187	}
188
189	if (isplanar) {
190	    txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
191	    txpitch = pPriv->src_pitch;
192	} else {
193	    if (pPriv->id == FOURCC_UYVY)
194		txformat1 = R300_TX_FORMAT_YVYU422;
195	    else
196		txformat1 = R300_TX_FORMAT_VYUY422;
197
198	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
199
200	    /* pitch is in pixels */
201	    txpitch = pPriv->src_pitch / 2;
202	}
203	txpitch -= 1;
204
205	txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
206		    (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
207		    R300_TXPITCH_EN);
208
209	info->accel_state->texW[0] = pPriv->w;
210	info->accel_state->texH[0] = pPriv->h;
211
212	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
213		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
214		    R300_TX_MAG_FILTER_LINEAR |
215		    R300_TX_MIN_FILTER_LINEAR |
216		    (0 << R300_TX_ID_SHIFT));
217
218
219	if (IS_R500_3D && ((pPriv->w - 1) & 0x800))
220	    txpitch |= R500_TXWIDTH_11;
221
222	if (IS_R500_3D && ((pPriv->h - 1) & 0x800))
223	    txpitch |= R500_TXHEIGHT_11;
224
225	txoffset = pPriv->src_offset;
226
227	BEGIN_ACCEL(6);
228	OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter);
229	OUT_ACCEL_REG(R300_TX_FILTER1_0, 0);
230	OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0);
231	OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
232	OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch);
233	OUT_ACCEL_REG(R300_TX_OFFSET_0, txoffset);
234	FINISH_ACCEL();
235
236	txenable = R300_TEX_0_ENABLE;
237
238	if (isplanar) {
239	    txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
240			(((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
241			R300_TXPITCH_EN);
242	    txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
243	    txpitch -= 1;
244	    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
245		        R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
246			R300_TX_MIN_FILTER_LINEAR |
247			R300_TX_MAG_FILTER_LINEAR);
248
249		BEGIN_ACCEL(12);
250		OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
251		OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
252		OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
253		OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2);
254		OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
255		OUT_ACCEL_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset);
256		OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
257		OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
258		OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
259		OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3);
260		OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
261		OUT_ACCEL_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset);
262		FINISH_ACCEL();
263		txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
264	}
265
266	if (pPriv->bicubic_enabled) {
267		/* Size is 128x1 */
268		txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
269			     (0x0 << R300_TXHEIGHT_SHIFT) |
270			     R300_TXPITCH_EN);
271		/* Format is 32-bit floats, 4bpp */
272		txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
273		/* Pitch is 127 (128-1) */
274		txpitch = 0x7f;
275		/* Tex filter */
276		txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
277			    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
278			    R300_TX_MIN_FILTER_NEAREST |
279			    R300_TX_MAG_FILTER_NEAREST |
280			    (1 << R300_TX_ID_SHIFT));
281
282		BEGIN_ACCEL(6);
283		OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter);
284		OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
285		OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
286		OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1);
287		OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
288		OUT_ACCEL_REG(R300_TX_OFFSET_1, pPriv->bicubic_src_offset);
289		FINISH_ACCEL();
290
291		/* Enable tex 1 */
292		txenable |= R300_TEX_1_ENABLE;
293	}
294
295	/* setup the VAP */
296	if (info->accel_state->has_tcl) {
297	    if (pPriv->bicubic_enabled)
298		BEGIN_ACCEL(7);
299	    else
300		BEGIN_ACCEL(6);
301	} else {
302	    if (pPriv->bicubic_enabled)
303		BEGIN_ACCEL(5);
304	    else
305		BEGIN_ACCEL(4);
306	}
307
308	/* These registers define the number, type, and location of data submitted
309	 * to the PVS unit of GA input (when PVS is disabled)
310	 * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
311	 * enabled.  This memory provides the inputs to the vertex shader program
312	 * and ordering is not important.  When PVS/TCL is disabled, this field maps
313	 * directly to the GA input memory and the order is significant.  In
314	 * PVS_BYPASS mode the order is as follows:
315	 * Position
316	 * Point Size
317	 * Color 0-3
318	 * Textures 0-7
319	 * Fog
320	 */
321	if (pPriv->bicubic_enabled) {
322	    OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
323			  ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
324			   (0 << R300_SKIP_DWORDS_0_SHIFT) |
325			   (0 << R300_DST_VEC_LOC_0_SHIFT) |
326			   R300_SIGNED_0 |
327			   (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
328			   (0 << R300_SKIP_DWORDS_1_SHIFT) |
329			   (6 << R300_DST_VEC_LOC_1_SHIFT) |
330			   R300_SIGNED_1));
331	    OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1,
332			  ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
333			   (0 << R300_SKIP_DWORDS_2_SHIFT) |
334			   (7 << R300_DST_VEC_LOC_2_SHIFT) |
335			   R300_LAST_VEC_2 |
336			   R300_SIGNED_2));
337	} else {
338	    OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
339			  ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
340			   (0 << R300_SKIP_DWORDS_0_SHIFT) |
341			   (0 << R300_DST_VEC_LOC_0_SHIFT) |
342			   R300_SIGNED_0 |
343			   (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
344			   (0 << R300_SKIP_DWORDS_1_SHIFT) |
345			   (6 << R300_DST_VEC_LOC_1_SHIFT) |
346			   R300_LAST_VEC_1 |
347			   R300_SIGNED_1));
348	}
349
350	/* load the vertex shader
351	 * We pre-load vertex programs in RADEONInit3DEngine():
352	 * - exa mask/Xv bicubic
353	 * - exa no mask
354	 * - Xv
355	 * Here we select the offset of the vertex program we want to use
356	 */
357	if (info->accel_state->has_tcl) {
358	    if (pPriv->bicubic_enabled) {
359		OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
360			      ((0 << R300_PVS_FIRST_INST_SHIFT) |
361			       (2 << R300_PVS_XYZW_VALID_INST_SHIFT) |
362			       (2 << R300_PVS_LAST_INST_SHIFT)));
363		OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
364			      (2 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
365	    } else {
366		OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
367			      ((5 << R300_PVS_FIRST_INST_SHIFT) |
368			       (6 << R300_PVS_XYZW_VALID_INST_SHIFT) |
369			       (6 << R300_PVS_LAST_INST_SHIFT)));
370		OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
371			      (6 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
372	    }
373	}
374
375	/* Position and one set of 2 texture coordinates */
376	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
377	if (pPriv->bicubic_enabled)
378	    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
379						   (2 << R300_TEX_1_COMP_CNT_SHIFT)));
380	else
381	    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
382
383	OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt);
384	FINISH_ACCEL();
385
386	/* setup pixel shader */
387	if (IS_R300_3D) {
388	    if (pPriv->bicubic_enabled) {
389		BEGIN_ACCEL(79);
390
391		/* 4 components: 2 for tex0 and 2 for tex1 */
392		OUT_ACCEL_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
393						   R300_RS_COUNT_HIRES_EN));
394
395		/* R300_INST_COUNT_RS - highest RS instruction used */
396		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));
397
398		/* Pixel stack frame size. */
399		OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
400
401		/* Indirection levels */
402		OUT_ACCEL_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) |
403							R300_FIRST_TEX));
404
405		/* Set nodes. */
406		OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
407							R300_ALU_CODE_SIZE(14) |
408							R300_TEX_CODE_OFFSET(0) |
409							R300_TEX_CODE_SIZE(6)));
410
411		/* Nodes are allocated highest first, but executed lowest first */
412		OUT_ACCEL_REG(R300_US_CODE_ADDR_0, 0);
413		OUT_ACCEL_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) |
414							R300_ALU_SIZE(0) |
415							R300_TEX_START(0) |
416							R300_TEX_SIZE(0)));
417		OUT_ACCEL_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) |
418							R300_ALU_SIZE(9) |
419							R300_TEX_START(1) |
420							R300_TEX_SIZE(0)));
421		OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) |
422							R300_ALU_SIZE(2) |
423							R300_TEX_START(2) |
424							R300_TEX_SIZE(3) |
425							R300_RGBA_OUT));
426
427		/* ** BICUBIC FP ** */
428
429		/* texcoord0 => temp0
430		 * texcoord1 => temp1 */
431
432		// first node
433		/* TEX temp2, temp1.rrr0, tex1, 1D */
434		OUT_ACCEL_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) |
435						   R300_TEX_ID(1) |
436						   R300_TEX_SRC_ADDR(1) |
437						   R300_TEX_DST_ADDR(2)));
438
439		/* MOV temp1.r, temp1.ggg0 */
440		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
441						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
442						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
443						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
444		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) |
445						   R300_ALU_RGB_ADDRD(1) |
446						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
447		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
448						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
449						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
450						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
451		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) |
452						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
453
454
455		// second node
456		/* TEX temp1, temp1, tex1, 1D */
457		OUT_ACCEL_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) |
458						   R300_TEX_ID(1) |
459						   R300_TEX_SRC_ADDR(1) |
460						   R300_TEX_DST_ADDR(1)));
461
462		/* MUL temp3.rg, temp2.ggg0, const0.rgb0 */
463		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
464						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
465						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
466						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
467		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) |
468						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
469						   R300_ALU_RGB_ADDRD(3) |
470						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
471		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
472						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
473						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
474						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
475		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) |
476						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
477
478
479		/* MUL temp2.rg, temp2.rrr0, const0.rgb */
480		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
481						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
482						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
483						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
484		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) |
485						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
486						   R300_ALU_RGB_ADDRD(2) |
487						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
488		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
489						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
490						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
491						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
492		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) |
493						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
494
495		/* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */
496		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
497						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
498						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
499						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
500		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) |
501						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
502						   R300_ALU_RGB_ADDR2(3) |
503						   R300_ALU_RGB_ADDRD(4) |
504						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
505		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
506						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
507						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
508						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
509		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) |
510						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
511
512		/* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */
513		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
514						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
515						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
516						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
517		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) |
518						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
519						   R300_ALU_RGB_ADDR2(2) |
520						   R300_ALU_RGB_ADDRD(5) |
521						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
522		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
523						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
524						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
525						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
526		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) |
527						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
528
529		/* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */
530		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
531						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
532						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
533						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
534		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) |
535						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
536						   R300_ALU_RGB_ADDR2(3) |
537						   R300_ALU_RGB_ADDRD(3) |
538						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
539		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
540						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
541						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
542						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
543		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) |
544						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
545
546		/* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */
547		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
548						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
549						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
550						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
551		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) |
552						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
553						   R300_ALU_RGB_ADDR2(2) |
554						   R300_ALU_RGB_ADDRD(1) |
555						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
556		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
557						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
558						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
559						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
560		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) |
561						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
562
563		/* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */
564		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
565						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
566						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
567						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
568		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
569						   R300_ALU_RGB_ADDR2(1) |
570						   R300_ALU_RGB_ADDRD(1) |
571						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
572		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
573						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
574						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
575						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
576		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) |
577						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
578
579		/* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */
580		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
581						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
582						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
583						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
584		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
585						   R300_ALU_RGB_ADDR2(3) |
586						   R300_ALU_RGB_ADDRD(2) |
587						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
588		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
589						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
590						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
591						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
592		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) |
593						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
594
595		/* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */
596		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
597						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
598						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
599						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
600		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
601						   R300_ALU_RGB_ADDR2(5) |
602						   R300_ALU_RGB_ADDRD(3) |
603						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
604		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
605						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
606						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
607						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
608		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) |
609						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
610
611		/* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */
612		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
613						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
614						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
615						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
616		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) |
617						   R300_ALU_RGB_ADDR2(4) |
618						   R300_ALU_RGB_ADDRD(0) |
619						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
620		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
621						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
622						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
623						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
624		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) |
625						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
626
627
628		// third node
629		/* TEX temp4, temp1.rg--, tex0, 1D */
630		OUT_ACCEL_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) |
631						   R300_TEX_ID(0) |
632						   R300_TEX_SRC_ADDR(1) |
633						   R300_TEX_DST_ADDR(4)));
634
635		/* TEX temp3, temp3.rg--, tex0, 1D */
636		OUT_ACCEL_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) |
637						   R300_TEX_ID(0) |
638						   R300_TEX_SRC_ADDR(3) |
639						   R300_TEX_DST_ADDR(3)));
640
641		/* TEX temp5, temp2.rg--, tex0, 1D */
642		OUT_ACCEL_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) |
643						   R300_TEX_ID(0) |
644						   R300_TEX_SRC_ADDR(2) |
645						   R300_TEX_DST_ADDR(5)));
646
647		/* TEX temp0, temp0.rg--, tex0, 1D */
648		OUT_ACCEL_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) |
649						   R300_TEX_ID(0) |
650						   R300_TEX_SRC_ADDR(0) |
651						   R300_TEX_DST_ADDR(0)));
652
653		/* LRP temp3, temp1.bbbb, temp4, temp3 ->
654		 * - PRESUB temps, temp4 - temp3
655		 * - MAD temp3, temp1.bbbb, temps, temp3 */
656		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
657						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
658						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
659						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
660						   R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
661		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) |
662						   R300_ALU_RGB_ADDR1(4) |
663						   R300_ALU_RGB_ADDR2(1) |
664						   R300_ALU_RGB_ADDRD(3) |
665						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
666		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
667						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
668						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
669						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
670		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) |
671						   R300_ALU_ALPHA_ADDR1(4) |
672						   R300_ALU_ALPHA_ADDR2(1) |
673						   R300_ALU_ALPHA_ADDRD(3) |
674						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
675
676		/* LRP temp0, temp1.bbbb, temp5, temp0 ->
677		 * - PRESUB temps, temp5 - temp0
678		 * - MAD temp0, temp1.bbbb, temps, temp0 */
679		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
680						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
681						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
682						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
683						   R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) |
684						   R300_ALU_RGB_INSERT_NOP));
685		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) |
686						   R300_ALU_RGB_ADDR1(5) |
687						   R300_ALU_RGB_ADDR2(1) |
688						   R300_ALU_RGB_ADDRD(0) |
689						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
690		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
691						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
692						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
693						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
694		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) |
695						   R300_ALU_ALPHA_ADDR1(5) |
696						   R300_ALU_ALPHA_ADDR2(1) |
697						   R300_ALU_ALPHA_ADDRD(0) |
698						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
699
700		/* LRP output, temp2.bbbb, temp3, temp0 ->
701		 * - PRESUB temps, temp3 - temp0
702		 * - MAD output, temp2.bbbb, temps, temp0 */
703		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
704						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
705						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
706						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
707						   R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
708		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) |
709						   R300_ALU_RGB_ADDR1(3) |
710						   R300_ALU_RGB_ADDR2(2) |
711						   R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
712		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
713						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
714						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
715						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
716		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) |
717						   R300_ALU_ALPHA_ADDR1(3) |
718						   R300_ALU_ALPHA_ADDR2(2) |
719						   R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A)));
720
721		/* Shader constants. */
722		OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w));
723		OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), 0);
724		OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), 0);
725		OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), 0);
726
727		OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), 0);
728		OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h));
729		OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), 0);
730		OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), 0);
731
732		FINISH_ACCEL();
733	    } else if (isplanar) {
734	    /*
735	     * y' = y - .0625
736	     * u' = u - .5
737	     * v' = v - .5;
738	     *
739	     * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
740	     * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
741	     * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
742	     *
743	     * DP3 might look like the straightforward solution
744	     * but we'd need to move the texture yuv values in
745	     * the same reg for this to work. Therefore use MADs.
746	     * Without changing the shader at all (only the constants)
747	     * could also provide hue/saturation/brightness/contrast control.
748	     *
749	     * yco = 1.1643
750	     * uco = 0, -0.39173, 2.017
751	     * vco = 1.5958, -0.8129, 0
752	     * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
753	     *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
754	     *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
755	     *
756	     * temp = MAD(yco, yuv.yyyy, off)
757	     * temp = MAD(uco, yuv.uuuu, temp)
758	     * result = MAD(vco, yuv.vvvv, temp)
759	     */
760		float yco = 1.1643;
761		float uco[3] = {0.0, -0.39173, 2.018};
762		float vco[3] = {1.5958, -0.8129, 0.0};
763		float off[3] = {-0.0625 * yco + -0.5 * uco[0] + -0.5 * vco[0],
764				-0.0625 * yco + -0.5 * uco[1] + -0.5 * vco[1],
765				-0.0625 * yco + -0.5 * uco[2] + -0.5 * vco[2]};
766
767		BEGIN_ACCEL(33);
768		/* 2 components: same 2 for tex0/1/2 */
769		OUT_ACCEL_REG(R300_RS_COUNT,
770			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
771			   R300_RS_COUNT_HIRES_EN));
772		/* R300_INST_COUNT_RS - highest RS instruction used */
773		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
774
775		OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
776
777		/* Indirection levels */
778		OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
779							R300_FIRST_TEX));
780
781		OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
782						   R300_ALU_CODE_SIZE(3) |
783						   R300_TEX_CODE_OFFSET(0) |
784						   R300_TEX_CODE_SIZE(3)));
785
786		OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
787						   R300_ALU_SIZE(2) |
788						   R300_TEX_START(0) |
789						   R300_TEX_SIZE(2) |
790						   R300_RGBA_OUT));
791
792		/* tex inst */
793		OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
794						  R300_TEX_DST_ADDR(0) |
795						  R300_TEX_ID(0) |
796						  R300_TEX_INST(R300_TEX_INST_LD)));
797		OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
798						  R300_TEX_DST_ADDR(1) |
799						  R300_TEX_ID(1) |
800						  R300_TEX_INST(R300_TEX_INST_LD)));
801		OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
802						  R300_TEX_DST_ADDR(2) |
803						  R300_TEX_ID(2) |
804						  R300_TEX_INST(R300_TEX_INST_LD)));
805
806		/* ALU inst */
807		/* MAD temp0, const0.a, temp0, const0.rgb */
808		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
809						   R300_ALU_RGB_ADDR1(0) |
810						   R300_ALU_RGB_ADDR2(0) |
811						   R300_ALU_RGB_ADDRD(0) |
812						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
813		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
814						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
815						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
816						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
817						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
818						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
819						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
820						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
821		/* alpha nop, but need to set up alpha source for rgb usage */
822		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
823						   R300_ALU_ALPHA_ADDR1(0) |
824						   R300_ALU_ALPHA_ADDR2(0) |
825						   R300_ALU_ALPHA_ADDRD(0) |
826						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
827		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
828						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
829						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
830						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
831
832		/* MAD temp0, const1, temp1, temp0 */
833		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
834						   R300_ALU_RGB_ADDR1(1) |
835						   R300_ALU_RGB_ADDR2(0) |
836						   R300_ALU_RGB_ADDRD(0) |
837						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
838		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
839						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
840						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
841						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
842						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
843						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
844						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
845						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
846		/* alpha nop */
847		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
848						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
849		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
850						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
851						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
852						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
853
854		/* MAD result, const2, temp2, temp0 */
855		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
856						   R300_ALU_RGB_ADDR1(2) |
857						   R300_ALU_RGB_ADDR2(0) |
858						   R300_ALU_RGB_ADDRD(0) |
859						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
860						   R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
861		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
862						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
863						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
864						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
865						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
866						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
867						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
868						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
869						   R300_ALU_RGB_CLAMP));
870		/* write alpha 1 (MAD 0 * 0 + 1); review: alpha slot 4 here vs. RGB slot 2 above -- confirm index */
871		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
872						   R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
873						   R300_ALU_ALPHA_TARGET_A));
874		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
875						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
876						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
877						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
878
879		/* Shader constants. */
880		/* constant 0: off, yco */
881		OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
882		OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1]));
883		OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2]));
884		OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco));
885		/* constant 1: uco */
886		OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
887		OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
888		OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
889		OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(0.0));
890		/* constant 2: vco */
891		OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
892		OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
893		OUT_ACCEL_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2]));
894		OUT_ACCEL_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0));
895
896		FINISH_ACCEL();
897
898	    } else {
899		BEGIN_ACCEL(11);
900		/* 2 components: 2 for tex0 */
901		OUT_ACCEL_REG(R300_RS_COUNT,
902			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
903			   R300_RS_COUNT_HIRES_EN));
904		/* R300_INST_COUNT_RS - highest RS instruction used */
905		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
906
907		OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
908
909		/* Indirection levels */
910		OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
911							R300_FIRST_TEX));
912
913		OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
914						   R300_ALU_CODE_SIZE(1) |
915						   R300_TEX_CODE_OFFSET(0) |
916						   R300_TEX_CODE_SIZE(1)));
917
918		OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
919						   R300_ALU_SIZE(0) |
920						   R300_TEX_START(0) |
921						   R300_TEX_SIZE(0) |
922						   R300_RGBA_OUT));
923
924		/* tex inst */
925		OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
926						  R300_TEX_DST_ADDR(0) |
927						  R300_TEX_ID(0) |
928						  R300_TEX_INST(R300_TEX_INST_LD)));
929
930		/* ALU inst */
931		/* RGB */
932		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) |
933						   R300_ALU_RGB_ADDR1(0) |
934						   R300_ALU_RGB_ADDR2(0) |
935						   R300_ALU_RGB_ADDRD(0) |
936						   R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R |
937						   R300_ALU_RGB_MASK_G |
938						   R300_ALU_RGB_MASK_B)) |
939						   R300_ALU_RGB_TARGET_A));
940		OUT_ACCEL_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
941						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
942						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
943						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
944						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
945						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
946						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
947						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
948						   R300_ALU_RGB_CLAMP));
949		/* Alpha */
950		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) |
951						   R300_ALU_ALPHA_ADDR1(0) |
952						   R300_ALU_ALPHA_ADDR2(0) |
953						   R300_ALU_ALPHA_ADDRD(0) |
954						   R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
955						   R300_ALU_ALPHA_TARGET_A |
956						   R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE)));
957		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) |
958						   R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) |
959						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) |
960						   R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) |
961						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) |
962						   R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) |
963						   R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
964						   R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) |
965						   R300_ALU_ALPHA_CLAMP));
966		FINISH_ACCEL();
967	    }
968	} else {
969	    if (pPriv->bicubic_enabled) {
970		BEGIN_ACCEL(7);
971
972		/* 4 components: 2 for tex0 and 2 for tex1 */
973		OUT_ACCEL_REG(R300_RS_COUNT,
974			      ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
975			       R300_RS_COUNT_HIRES_EN));
976
977		/* R300_INST_COUNT_RS - highest RS instruction used */
978		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));
979
980		/* Pixel stack frame size. */
981		OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
982
983		/* FP length. */
984		OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
985						  R500_US_CODE_END_ADDR(13)));
986		OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
987						   R500_US_CODE_RANGE_SIZE(13)));
988
989		/* Prepare for FP emission. */
990		OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
991		OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
992		FINISH_ACCEL();
993
994		BEGIN_ACCEL(89);
995		/* Pixel shader.
996		 * I've gone ahead and annotated each instruction, since this
997		 * thing is MASSIVE. :3
998		 * Note: In order to avoid buggies with temps and multiple
999		 * inputs, all temps are offset by 2. temp0 -> register2. */
1000
1001		/* TEX temp2, input1.xxxx, tex1, 1D */
1002		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
1003						       R500_INST_RGB_WMASK_R |
1004						       R500_INST_RGB_WMASK_G |
1005						       R500_INST_RGB_WMASK_B));
1006		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
1007						       R500_TEX_INST_LD |
1008						       R500_TEX_IGNORE_UNCOVERED));
1009		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
1010						       R500_TEX_SRC_S_SWIZ_R |
1011						       R500_TEX_SRC_T_SWIZ_R |
1012						       R500_TEX_SRC_R_SWIZ_R |
1013						       R500_TEX_SRC_Q_SWIZ_R |
1014						       R500_TEX_DST_ADDR(2) |
1015						       R500_TEX_DST_R_SWIZ_R |
1016						       R500_TEX_DST_G_SWIZ_G |
1017						       R500_TEX_DST_B_SWIZ_B |
1018						       R500_TEX_DST_A_SWIZ_A));
1019		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1020		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1021		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1022
1023		/* TEX temp5, input1.yyyy, tex1, 1D */
1024		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
1025						       R500_INST_TEX_SEM_WAIT |
1026						       R500_INST_RGB_WMASK_R |
1027						       R500_INST_RGB_WMASK_G |
1028						       R500_INST_RGB_WMASK_B));
1029		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
1030						       R500_TEX_INST_LD |
1031						       R500_TEX_SEM_ACQUIRE |
1032						       R500_TEX_IGNORE_UNCOVERED));
1033		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
1034						       R500_TEX_SRC_S_SWIZ_G |
1035						       R500_TEX_SRC_T_SWIZ_G |
1036						       R500_TEX_SRC_R_SWIZ_G |
1037						       R500_TEX_SRC_Q_SWIZ_G |
1038						       R500_TEX_DST_ADDR(5) |
1039						       R500_TEX_DST_R_SWIZ_R |
1040						       R500_TEX_DST_G_SWIZ_G |
1041						       R500_TEX_DST_B_SWIZ_B |
1042						       R500_TEX_DST_A_SWIZ_A));
1043		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1044		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1045		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1046
1047		/* MUL temp4, const0.x0x0, temp2.yyxx */
1048		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
1049						       R500_INST_TEX_SEM_WAIT |
1050						       R500_INST_RGB_WMASK_R |
1051						       R500_INST_RGB_WMASK_G |
1052						       R500_INST_RGB_WMASK_B |
1053						       R500_INST_ALPHA_WMASK));
1054		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
1055						       R500_RGB_ADDR0_CONST |
1056						       R500_RGB_ADDR1(2)));
1057		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
1058						       R500_ALPHA_ADDR0_CONST |
1059						       R500_ALPHA_ADDR1(2)));
1060		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
1061						       R500_ALU_RGB_R_SWIZ_A_R |
1062						       R500_ALU_RGB_G_SWIZ_A_0 |
1063						       R500_ALU_RGB_B_SWIZ_A_R |
1064						       R500_ALU_RGB_SEL_B_SRC1 |
1065						       R500_ALU_RGB_R_SWIZ_B_G |
1066						       R500_ALU_RGB_G_SWIZ_B_G |
1067						       R500_ALU_RGB_B_SWIZ_B_R));
1068		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
1069						       R500_ALPHA_OP_MAD |
1070						       R500_ALPHA_SEL_A_SRC0 |
1071						       R500_ALPHA_SWIZ_A_0 |
1072						       R500_ALPHA_SEL_B_SRC1 |
1073						       R500_ALPHA_SWIZ_B_R));
1074		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
1075						       R500_ALU_RGBA_OP_MAD |
1076						       R500_ALU_RGBA_R_SWIZ_0 |
1077						       R500_ALU_RGBA_G_SWIZ_0 |
1078						       R500_ALU_RGBA_B_SWIZ_0 |
1079						       R500_ALU_RGBA_A_SWIZ_0));
1080
1081		/* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */
1082		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
1083						       R500_INST_RGB_WMASK_R |
1084						       R500_INST_RGB_WMASK_G |
1085						       R500_INST_RGB_WMASK_B |
1086						       R500_INST_ALPHA_WMASK));
1087		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
1088						       R500_RGB_ADDR0_CONST |
1089						       R500_RGB_ADDR1(5) |
1090						       R500_RGB_ADDR2(4)));
1091		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
1092						       R500_ALPHA_ADDR0_CONST |
1093						       R500_ALPHA_ADDR1(5) |
1094						       R500_ALPHA_ADDR2(4)));
1095		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
1096						       R500_ALU_RGB_R_SWIZ_A_0 |
1097						       R500_ALU_RGB_G_SWIZ_A_G |
1098						       R500_ALU_RGB_B_SWIZ_A_0 |
1099						       R500_ALU_RGB_SEL_B_SRC1 |
1100						       R500_ALU_RGB_R_SWIZ_B_R |
1101						       R500_ALU_RGB_G_SWIZ_B_R |
1102						       R500_ALU_RGB_B_SWIZ_B_R));
1103		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
1104						       R500_ALPHA_OP_MAD |
1105						       R500_ALPHA_SEL_A_SRC0 |
1106						       R500_ALPHA_SWIZ_A_G |
1107						       R500_ALPHA_SEL_B_SRC1 |
1108						       R500_ALPHA_SWIZ_B_R));
1109		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
1110						       R500_ALU_RGBA_OP_MAD |
1111						       R500_ALU_RGBA_SEL_C_SRC2 |
1112						       R500_ALU_RGBA_R_SWIZ_R |
1113						       R500_ALU_RGBA_G_SWIZ_G |
1114						       R500_ALU_RGBA_B_SWIZ_B |
1115						       R500_ALU_RGBA_A_SWIZ_A));
1116
1117		/* ADD temp3, temp3, input0.xyxy */
1118		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
1119						       R500_INST_RGB_WMASK_R |
1120						       R500_INST_RGB_WMASK_G |
1121						       R500_INST_RGB_WMASK_B |
1122						       R500_INST_ALPHA_WMASK));
1123		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) |
1124						       R500_RGB_ADDR2(0)));
1125		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) |
1126						       R500_ALPHA_ADDR2(0)));
1127		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
1128						       R500_ALU_RGB_G_SWIZ_A_1 |
1129						       R500_ALU_RGB_B_SWIZ_A_1 |
1130						       R500_ALU_RGB_SEL_B_SRC1 |
1131						       R500_ALU_RGB_R_SWIZ_B_R |
1132						       R500_ALU_RGB_G_SWIZ_B_G |
1133						       R500_ALU_RGB_B_SWIZ_B_B));
1134		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
1135						       R500_ALPHA_OP_MAD |
1136						       R500_ALPHA_SWIZ_A_1 |
1137						       R500_ALPHA_SEL_B_SRC1 |
1138						       R500_ALPHA_SWIZ_B_A));
1139		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
1140						       R500_ALU_RGBA_OP_MAD |
1141						       R500_ALU_RGBA_SEL_C_SRC2 |
1142						       R500_ALU_RGBA_R_SWIZ_R |
1143						       R500_ALU_RGBA_G_SWIZ_G |
1144						       R500_ALU_RGBA_B_SWIZ_R |
1145						       R500_ALU_RGBA_A_SWIZ_G));
1146
1147		/* TEX temp1, temp3.zwxy, tex0, 2D */
1148		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
1149						       R500_INST_RGB_WMASK_R |
1150						       R500_INST_RGB_WMASK_G |
1151						       R500_INST_RGB_WMASK_B |
1152						       R500_INST_ALPHA_WMASK));
1153		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
1154						       R500_TEX_INST_LD |
1155						       R500_TEX_IGNORE_UNCOVERED));
1156		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
1157						       R500_TEX_SRC_S_SWIZ_B |
1158						       R500_TEX_SRC_T_SWIZ_A |
1159						       R500_TEX_SRC_R_SWIZ_R |
1160						       R500_TEX_SRC_Q_SWIZ_G |
1161						       R500_TEX_DST_ADDR(1) |
1162						       R500_TEX_DST_R_SWIZ_R |
1163						       R500_TEX_DST_G_SWIZ_G |
1164						       R500_TEX_DST_B_SWIZ_B |
1165						       R500_TEX_DST_A_SWIZ_A));
1166		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1167		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1168		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1169
1170		/* TEX temp3, temp3.xyzw, tex0, 2D */
1171		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
1172						       R500_INST_TEX_SEM_WAIT |
1173						       R500_INST_RGB_WMASK_R |
1174						       R500_INST_RGB_WMASK_G |
1175						       R500_INST_RGB_WMASK_B |
1176						       R500_INST_ALPHA_WMASK));
1177		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
1178						       R500_TEX_INST_LD |
1179						       R500_TEX_SEM_ACQUIRE |
1180						       R500_TEX_IGNORE_UNCOVERED));
1181		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
1182						       R500_TEX_SRC_S_SWIZ_R |
1183						       R500_TEX_SRC_T_SWIZ_G |
1184						       R500_TEX_SRC_R_SWIZ_B |
1185						       R500_TEX_SRC_Q_SWIZ_A |
1186						       R500_TEX_DST_ADDR(3) |
1187						       R500_TEX_DST_R_SWIZ_R |
1188						       R500_TEX_DST_G_SWIZ_G |
1189						       R500_TEX_DST_B_SWIZ_B |
1190						       R500_TEX_DST_A_SWIZ_A));
1191		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1192		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1193		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1194
1195		/* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */
1196		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
1197						       R500_INST_RGB_WMASK_R |
1198						       R500_INST_RGB_WMASK_G |
1199						       R500_INST_RGB_WMASK_B |
1200						       R500_INST_ALPHA_WMASK));
1201		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
1202						       R500_RGB_ADDR0_CONST |
1203						       R500_RGB_ADDR1(5) |
1204						       R500_RGB_ADDR2(4)));
1205		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
1206						       R500_ALPHA_ADDR0_CONST |
1207						       R500_ALPHA_ADDR1(5) |
1208						       R500_ALPHA_ADDR2(4)));
1209		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
1210						       R500_ALU_RGB_R_SWIZ_A_0 |
1211						       R500_ALU_RGB_G_SWIZ_A_G |
1212						       R500_ALU_RGB_B_SWIZ_A_0 |
1213						       R500_ALU_RGB_SEL_B_SRC1 |
1214						       R500_ALU_RGB_R_SWIZ_B_G |
1215						       R500_ALU_RGB_G_SWIZ_B_G |
1216						       R500_ALU_RGB_B_SWIZ_B_G));
1217		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
1218						       R500_ALPHA_OP_MAD |
1219						       R500_ALPHA_SEL_A_SRC0 |
1220						       R500_ALPHA_SWIZ_A_G |
1221						       R500_ALPHA_SEL_B_SRC1 |
1222						       R500_ALPHA_SWIZ_B_G));
1223		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
1224						       R500_ALU_RGBA_OP_MAD |
1225						       R500_ALU_RGBA_SEL_C_SRC2 |
1226						       R500_ALU_RGBA_R_SWIZ_R |
1227						       R500_ALU_RGBA_G_SWIZ_G |
1228						       R500_ALU_RGBA_B_SWIZ_B |
1229						       R500_ALU_RGBA_A_SWIZ_A));
1230
1231		/* ADD temp0, temp4, input0.xyxy */
1232		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
1233						       R500_INST_RGB_WMASK_R |
1234						       R500_INST_RGB_WMASK_G |
1235						       R500_INST_RGB_WMASK_B |
1236						       R500_INST_ALPHA_WMASK));
1237		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) |
1238						       R500_RGB_ADDR2(0)));
1239		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) |
1240						       R500_ALPHA_ADDR2(0)));
1241		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
1242						       R500_ALU_RGB_G_SWIZ_A_1 |
1243						       R500_ALU_RGB_B_SWIZ_A_1 |
1244						       R500_ALU_RGB_SEL_B_SRC1 |
1245						       R500_ALU_RGB_R_SWIZ_B_R |
1246						       R500_ALU_RGB_G_SWIZ_B_G |
1247						       R500_ALU_RGB_B_SWIZ_B_B));
1248		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
1249						       R500_ALPHA_OP_MAD |
1250						       R500_ALPHA_SWIZ_A_1 |
1251						       R500_ALPHA_SEL_B_SRC1 |
1252						       R500_ALPHA_SWIZ_B_A));
1253		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
1254						       R500_ALU_RGBA_OP_MAD |
1255						       R500_ALU_RGBA_SEL_C_SRC2 |
1256						       R500_ALU_RGBA_R_SWIZ_R |
1257						       R500_ALU_RGBA_G_SWIZ_G |
1258						       R500_ALU_RGBA_B_SWIZ_R |
1259						       R500_ALU_RGBA_A_SWIZ_G));
1260
1261		/* TEX temp4, temp0.zwzw, tex0, 2D */
1262		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
1263						       R500_INST_TEX_SEM_WAIT |
1264						       R500_INST_RGB_WMASK_R |
1265						       R500_INST_RGB_WMASK_G |
1266						       R500_INST_RGB_WMASK_B |
1267						       R500_INST_ALPHA_WMASK));
1268		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
1269						       R500_TEX_INST_LD |
1270						       R500_TEX_IGNORE_UNCOVERED));
1271		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
1272						       R500_TEX_SRC_S_SWIZ_B |
1273						       R500_TEX_SRC_T_SWIZ_A |
1274						       R500_TEX_SRC_R_SWIZ_B |
1275						       R500_TEX_SRC_Q_SWIZ_A |
1276						       R500_TEX_DST_ADDR(4) |
1277						       R500_TEX_DST_R_SWIZ_R |
1278						       R500_TEX_DST_G_SWIZ_G |
1279						       R500_TEX_DST_B_SWIZ_B |
1280						       R500_TEX_DST_A_SWIZ_A));
1281		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1282		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1283		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1284
1285		/* TEX temp0, temp0.xyzw, tex0, 2D */
1286		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
1287						       R500_INST_TEX_SEM_WAIT |
1288						       R500_INST_RGB_WMASK_R |
1289						       R500_INST_RGB_WMASK_G |
1290						       R500_INST_RGB_WMASK_B |
1291						   R500_INST_ALPHA_WMASK));
1292		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
1293						       R500_TEX_INST_LD |
1294						       R500_TEX_SEM_ACQUIRE |
1295						       R500_TEX_IGNORE_UNCOVERED));
1296		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
1297						       R500_TEX_SRC_S_SWIZ_R |
1298						       R500_TEX_SRC_T_SWIZ_G |
1299						       R500_TEX_SRC_R_SWIZ_B |
1300						       R500_TEX_SRC_Q_SWIZ_A |
1301						       R500_TEX_DST_ADDR(0) |
1302						       R500_TEX_DST_R_SWIZ_R |
1303						       R500_TEX_DST_G_SWIZ_G |
1304						       R500_TEX_DST_B_SWIZ_B |
1305						       R500_TEX_DST_A_SWIZ_A));
1306		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1307		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1308		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1309
1310		/* LRP temp3, temp2.zzzz, temp1, temp3 ->
1311		 * - PRESUB temps, temp1 - temp3
1312		 * - MAD temp2.zzzz, temps, temp3 */
1313		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
1314						       R500_INST_RGB_WMASK_R |
1315						       R500_INST_RGB_WMASK_G |
1316						       R500_INST_RGB_WMASK_B |
1317						       R500_INST_ALPHA_WMASK));
1318		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) |
1319						       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
1320						       R500_RGB_ADDR1(1) |
1321						       R500_RGB_ADDR2(2)));
1322		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) |
1323						       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
1324						       R500_ALPHA_ADDR1(1) |
1325						       R500_ALPHA_ADDR2(2)));
1326		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
1327						       R500_ALU_RGB_R_SWIZ_A_B |
1328						       R500_ALU_RGB_G_SWIZ_A_B |
1329						       R500_ALU_RGB_B_SWIZ_A_B |
1330						       R500_ALU_RGB_SEL_B_SRCP |
1331						       R500_ALU_RGB_R_SWIZ_B_R |
1332						       R500_ALU_RGB_G_SWIZ_B_G |
1333						       R500_ALU_RGB_B_SWIZ_B_B));
1334		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
1335						       R500_ALPHA_OP_MAD |
1336						       R500_ALPHA_SEL_A_SRC2 |
1337						       R500_ALPHA_SWIZ_A_B |
1338						       R500_ALPHA_SEL_B_SRCP |
1339						       R500_ALPHA_SWIZ_B_A));
1340		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
1341						       R500_ALU_RGBA_OP_MAD |
1342						       R500_ALU_RGBA_SEL_C_SRC0 |
1343						       R500_ALU_RGBA_R_SWIZ_R |
1344						       R500_ALU_RGBA_G_SWIZ_G |
1345						       R500_ALU_RGBA_B_SWIZ_B |
1346						       R500_ALU_RGBA_A_SWIZ_A));
1347
1348		/* LRP temp0, temp2.zzzz, temp4, temp0 ->
1349		 * - PRESUB temps, temp4 - temp0
1350		 * - MAD temp2.zzzz, temps, temp0 */
1351		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
1352						       R500_INST_TEX_SEM_WAIT |
1353						       R500_INST_RGB_WMASK_R |
1354						       R500_INST_RGB_WMASK_G |
1355						       R500_INST_RGB_WMASK_B |
1356						       R500_INST_ALPHA_WMASK));
1357		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
1358						       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
1359						       R500_RGB_ADDR1(4) |
1360						       R500_RGB_ADDR2(2)));
1361		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
1362						       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
1363						       R500_ALPHA_ADDR1(4) |
1364						       R500_ALPHA_ADDR2(2)));
1365		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
1366						       R500_ALU_RGB_R_SWIZ_A_B |
1367						       R500_ALU_RGB_G_SWIZ_A_B |
1368						       R500_ALU_RGB_B_SWIZ_A_B |
1369						       R500_ALU_RGB_SEL_B_SRCP |
1370						       R500_ALU_RGB_R_SWIZ_B_R |
1371						       R500_ALU_RGB_G_SWIZ_B_G |
1372						       R500_ALU_RGB_B_SWIZ_B_B));
1373		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
1374						       R500_ALPHA_OP_MAD |
1375						       R500_ALPHA_SEL_A_SRC2 |
1376						       R500_ALPHA_SWIZ_A_B |
1377						       R500_ALPHA_SEL_B_SRCP |
1378						       R500_ALPHA_SWIZ_B_A));
1379		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
1380						       R500_ALU_RGBA_OP_MAD |
1381						       R500_ALU_RGBA_SEL_C_SRC0 |
1382						       R500_ALU_RGBA_R_SWIZ_R |
1383						       R500_ALU_RGBA_G_SWIZ_G |
1384						       R500_ALU_RGBA_B_SWIZ_B |
1385						       R500_ALU_RGBA_A_SWIZ_A));
1386
1387		/* LRP output, temp5.zzzz, temp3, temp0 ->
1388		 * - PRESUB temps, temp3 - temp0
1389		 * - MAD temp5.zzzz, temps, temp0 */
1390		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
1391						       R500_INST_LAST |
1392						       R500_INST_TEX_SEM_WAIT |
1393						       R500_INST_RGB_WMASK_R |
1394						       R500_INST_RGB_WMASK_G |
1395						       R500_INST_RGB_WMASK_B |
1396						       R500_INST_ALPHA_WMASK |
1397						       R500_INST_RGB_OMASK_R |
1398						       R500_INST_RGB_OMASK_G |
1399						       R500_INST_RGB_OMASK_B |
1400						       R500_INST_ALPHA_OMASK));
1401		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
1402						       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
1403						       R500_RGB_ADDR1(3) |
1404						       R500_RGB_ADDR2(5)));
1405		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
1406						       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
1407						       R500_ALPHA_ADDR1(3) |
1408						       R500_ALPHA_ADDR2(5)));
1409		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
1410						       R500_ALU_RGB_R_SWIZ_A_B |
1411						       R500_ALU_RGB_G_SWIZ_A_B |
1412						       R500_ALU_RGB_B_SWIZ_A_B |
1413						       R500_ALU_RGB_SEL_B_SRCP |
1414						       R500_ALU_RGB_R_SWIZ_B_R |
1415						       R500_ALU_RGB_G_SWIZ_B_G |
1416						       R500_ALU_RGB_B_SWIZ_B_B));
1417		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
1418						       R500_ALPHA_OP_MAD |
1419						       R500_ALPHA_SEL_A_SRC2 |
1420						       R500_ALPHA_SWIZ_A_B |
1421						       R500_ALPHA_SEL_B_SRCP |
1422						       R500_ALPHA_SWIZ_B_A));
1423		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
1424						       R500_ALU_RGBA_OP_MAD |
1425						       R500_ALU_RGBA_SEL_C_SRC0 |
1426						       R500_ALU_RGBA_R_SWIZ_R |
1427						       R500_ALU_RGBA_G_SWIZ_G |
1428						       R500_ALU_RGBA_B_SWIZ_B |
1429						       R500_ALU_RGBA_A_SWIZ_A));
1430
1431		/* Shader constants. */
1432		OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
1433
1434		/* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */
1435		OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w));
1436		OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h));
1437		OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
1438		OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
1439
1440		FINISH_ACCEL();
1441
1442	    } else {
1443		BEGIN_ACCEL(19);
1444		/* 2 components: 2 for tex0 */
1445		OUT_ACCEL_REG(R300_RS_COUNT,
1446			      ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
1447			       R300_RS_COUNT_HIRES_EN));
1448
1449		/* R300_INST_COUNT_RS - highest RS instruction used */
1450		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
1451
1452		/* Pixel stack frame size. */
1453		OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
1454
1455		/* FP length. */
1456		OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
1457						  R500_US_CODE_END_ADDR(1)));
1458		OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
1459						   R500_US_CODE_RANGE_SIZE(1)));
1460
1461		/* Prepare for FP emission. */
1462		OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
1463		OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
1464
1465		/* tex inst */
1466		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
1467						       R500_INST_TEX_SEM_WAIT |
1468						       R500_INST_RGB_WMASK_R |
1469						       R500_INST_RGB_WMASK_G |
1470						       R500_INST_RGB_WMASK_B |
1471						       R500_INST_ALPHA_WMASK |
1472						       R500_INST_RGB_CLAMP |
1473						       R500_INST_ALPHA_CLAMP));
1474		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
1475						       R500_TEX_INST_LD |
1476						       R500_TEX_SEM_ACQUIRE |
1477						       R500_TEX_IGNORE_UNCOVERED));
1478		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
1479						       R500_TEX_SRC_S_SWIZ_R |
1480						       R500_TEX_SRC_T_SWIZ_G |
1481						       R500_TEX_DST_ADDR(0) |
1482						       R500_TEX_DST_R_SWIZ_R |
1483						       R500_TEX_DST_G_SWIZ_G |
1484						       R500_TEX_DST_B_SWIZ_B |
1485						       R500_TEX_DST_A_SWIZ_A));
1486		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
1487						       R500_DX_S_SWIZ_R |
1488						       R500_DX_T_SWIZ_R |
1489						       R500_DX_R_SWIZ_R |
1490						       R500_DX_Q_SWIZ_R |
1491						       R500_DY_ADDR(0) |
1492						       R500_DY_S_SWIZ_R |
1493						       R500_DY_T_SWIZ_R |
1494						       R500_DY_R_SWIZ_R |
1495						       R500_DY_Q_SWIZ_R));
1496		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1497		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
1498
1499		/* ALU inst */
1500		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
1501						       R500_INST_TEX_SEM_WAIT |
1502						       R500_INST_LAST |
1503						       R500_INST_RGB_OMASK_R |
1504						       R500_INST_RGB_OMASK_G |
1505						       R500_INST_RGB_OMASK_B |
1506						       R500_INST_ALPHA_OMASK |
1507						       R500_INST_RGB_CLAMP |
1508						       R500_INST_ALPHA_CLAMP));
1509		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
1510						       R500_RGB_ADDR1(0) |
1511						       R500_RGB_ADDR1_CONST |
1512						       R500_RGB_ADDR2(0) |
1513						       R500_RGB_ADDR2_CONST));
1514		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
1515						       R500_ALPHA_ADDR1(0) |
1516						       R500_ALPHA_ADDR1_CONST |
1517						       R500_ALPHA_ADDR2(0) |
1518						       R500_ALPHA_ADDR2_CONST));
1519		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
1520						       R500_ALU_RGB_R_SWIZ_A_R |
1521						       R500_ALU_RGB_G_SWIZ_A_G |
1522						       R500_ALU_RGB_B_SWIZ_A_B |
1523						       R500_ALU_RGB_SEL_B_SRC0 |
1524						       R500_ALU_RGB_R_SWIZ_B_1 |
1525						       R500_ALU_RGB_B_SWIZ_B_1 |
1526						       R500_ALU_RGB_G_SWIZ_B_1));
1527		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
1528						       R500_ALPHA_SWIZ_A_A |
1529						       R500_ALPHA_SWIZ_B_1));
1530		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
1531						       R500_ALU_RGBA_R_SWIZ_0 |
1532						       R500_ALU_RGBA_G_SWIZ_0 |
1533						       R500_ALU_RGBA_B_SWIZ_0 |
1534						       R500_ALU_RGBA_A_SWIZ_0));
1535		FINISH_ACCEL();
1536	    }
1537	}
1538
1539	BEGIN_ACCEL(6);
1540	OUT_ACCEL_REG(R300_TX_INVALTAGS, 0);
1541	OUT_ACCEL_REG(R300_TX_ENABLE, txenable);
1542
1543	OUT_ACCEL_REG(R300_RB3D_COLOROFFSET0, dst_offset);
1544	OUT_ACCEL_REG(R300_RB3D_COLORPITCH0, colorpitch);
1545
1546	blendcntl = RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO;
1547	/* no need to enable blending */
1548	OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, blendcntl);
1549
1550	OUT_ACCEL_REG(R300_VAP_VTX_SIZE, vtx_count);
1551	FINISH_ACCEL();
1552
1553    } else {
1554
1555	/* Same for R100/R200 */
1556	switch (pPixmap->drawable.bitsPerPixel) {
1557	case 16:
1558	    if (pPixmap->drawable.depth == 15)
1559		dst_format = RADEON_COLOR_FORMAT_ARGB1555;
1560	    else
1561		dst_format = RADEON_COLOR_FORMAT_RGB565;
1562	    break;
1563	case 32:
1564	    dst_format = RADEON_COLOR_FORMAT_ARGB8888;
1565	    break;
1566	default:
1567	    return;
1568	}
1569
1570	if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
1571	    isplanar = TRUE;
1572	}
1573
1574	if (isplanar) {
1575	    txformat = RADEON_TXFORMAT_I8;
1576	} else {
1577	    if (pPriv->id == FOURCC_UYVY)
1578		txformat = RADEON_TXFORMAT_YVYU422;
1579	    else
1580		txformat = RADEON_TXFORMAT_VYUY422;
1581	}
1582
1583	txformat |= RADEON_TXFORMAT_NON_POWER2;
1584
1585	colorpitch = dst_pitch >> pixel_shift;
1586
1587	if (RADEONTilingEnabled(pScrn, pPixmap))
1588	    colorpitch |= RADEON_COLOR_TILE_ENABLE;
1589
1590	BEGIN_ACCEL(4);
1591
1592	OUT_ACCEL_REG(RADEON_RB3D_CNTL,
1593		      dst_format /*| RADEON_ALPHA_BLEND_ENABLE*/);
1594	OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, dst_offset);
1595
1596	OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, colorpitch);
1597
1598	OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
1599		      RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
1600
1601	FINISH_ACCEL();
1602
1603
1604	if ((info->ChipFamily == CHIP_FAMILY_RV250) ||
1605	    (info->ChipFamily == CHIP_FAMILY_RV280) ||
1606	    (info->ChipFamily == CHIP_FAMILY_RS300) ||
1607	    (info->ChipFamily == CHIP_FAMILY_R200)) {
1608
1609	    info->accel_state->texW[0] = pPriv->w;
1610	    info->accel_state->texH[0] = pPriv->h;
1611
1612	    if (isplanar) {
1613		/* note: in contrast to r300, use input biasing on uv components */
1614		float yco = 1.1643;
1615		float yoff = -0.0625 * yco;
1616		float uco[3] = {0.0, -0.39173, 2.018};
1617		float vco[3] = {1.5958, -0.8129, 0.0};
1618
1619		/* need 2 texcoord sets (even though they are identical) due
1620		   to denormalization! hw apparently can't premultiply
1621		   same coord set by different texture size */
1622		vtx_count = 6;
1623
1624		txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
1625			    (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
1626		txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
1627		txpitch -= 32;
1628		txfilter =  R200_MAG_FILTER_LINEAR |
1629			    R200_MIN_FILTER_LINEAR |
1630			    R200_CLAMP_S_CLAMP_LAST |
1631			    R200_CLAMP_T_CLAMP_LAST;
1632
1633		BEGIN_ACCEL(36);
1634
1635		OUT_ACCEL_REG(RADEON_PP_CNTL,
1636			      RADEON_TEX_0_ENABLE | RADEON_TEX_1_ENABLE | RADEON_TEX_2_ENABLE |
1637			      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
1638			      RADEON_TEX_BLEND_2_ENABLE);
1639
1640		OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
1641		OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
1642			      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT) |
1643			      (2 << R200_VTX_TEX1_COMP_CNT_SHIFT));
1644
1645		OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
1646		OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
1647		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
1648		OUT_ACCEL_REG(R200_PP_TXSIZE_0,
1649			      (pPriv->w - 1) |
1650			      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
1651		OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
1652		OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
1653
1654		OUT_ACCEL_REG(R200_PP_TXFILTER_1, txfilter);
1655		OUT_ACCEL_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
1656		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_1, 0);
1657		OUT_ACCEL_REG(R200_PP_TXSIZE_1, txformat0);
1658		OUT_ACCEL_REG(R200_PP_TXPITCH_1, txpitch);
1659		OUT_ACCEL_REG(R200_PP_TXOFFSET_1, pPriv->src_offset + pPriv->planeu_offset);
1660
1661		OUT_ACCEL_REG(R200_PP_TXFILTER_2, txfilter);
1662		OUT_ACCEL_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
1663		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_2, 0);
1664		OUT_ACCEL_REG(R200_PP_TXSIZE_2, txformat0);
1665		OUT_ACCEL_REG(R200_PP_TXPITCH_2, txpitch);
1666		OUT_ACCEL_REG(R200_PP_TXOFFSET_2, pPriv->src_offset + pPriv->planev_offset);
1667
1668		/* similar to r300 code. Note the big problem is that hardware constants
1669		 * are 8 bits only, representing 0.0-1.0. We can get that up (using bias
1670		 * + scale) to -1.0-1.0 (but precision will suffer). AFAIK the hw actually
1671		 * has 12 bits fractional precision (plus 1 sign bit, 3 range bits) but
1672		 * the constants do not. To get a larger range one can use output scale, but for
1673		 * that 2.018 value we need a total scale by 8, which means the constants
1674		 * really have no accuracy whatsoever (5 fractional bits only).
1675		 * The only direct way to get high  precision "constants" into the fragment
1676		 * pipe I know of is to use the texcoord interpolator (not color, this one
1677		 * is 8 bit only too), which seems a bit expensive. We're lucky though it
1678		 * seems the values we need seem to fit better than worst case (get about
1679		 * 6 fractional bits for this instead of 5, at least when not correcting for
1680		 * hue/saturation/contrast/brightness, which is the same as for vco - yco and
1681		 * yoff get 8 fractional bits).
1682		 *
1683		 * A higher precision (8 fractional bits) version might just put uco into
1684		 * a texcoord, and calculate a new vcoconst in the shader, like so:
1685		 * cohelper = {1.0, 0.0, 0.0} - shouldn't use 0.5 since not exactly representable
1686		 * vco = {1.5958 - 1.0, -0.8129 + 1.0, 1.0}
1687		 * vcocalc = ADD temp, bias/scale(cohelper), vco
1688		 * would in total use 4 tex units, 4 instructions which seems fairly
1689		 * balanced for this architecture (instead of 3 + 3 for the solution here)
1690		 *
1691		 * temp = MAD(yco, yuv.yyyy, yoff)
1692		 * temp = MAD(uco, yuv.uuuu, temp)
1693		 * result = MAD(vco, yuv.vvvv, temp)
1694		 *
1695		 * note first mad produces actually scalar, hence we transform
1696		 * it into a dp2a to get 8 bit precision of yco instead of 7 -
1697		 * That's assuming hw correctly expands consts to internal precision.
1698		 * (y * 1 + y * (yco - 1) + yoff)
1699		 * temp = DP2A / 2 (yco, yuv.yyyy, yoff)
1700		 * temp = MAD (uco / 4, yuv.uuuu * 2, temp)
1701		 * result = MAD x2 (vco / 2, yuv.vvvv, temp)
1702		 *
1703		 * vco, uco need bias (and hence scale too)
1704		 *
1705		 */
1706
1707		/* MAD temp0 / 2, const0.a * 2, temp0, -const0.rgb */
1708		OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
1709			      R200_TXC_ARG_A_TFACTOR_COLOR |
1710			      R200_TXC_ARG_B_R0_COLOR |
1711			      R200_TXC_ARG_C_TFACTOR_COLOR |
1712			      R200_TXC_NEG_ARG_C |
1713			      R200_TXC_OP_DOT2_ADD);
1714		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
1715			      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
1716			      R200_TXC_SCALE_INV2 |
1717			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
1718		OUT_ACCEL_REG(R200_PP_TXABLEND_0,
1719			      R200_TXA_ARG_A_ZERO |
1720			      R200_TXA_ARG_B_ZERO |
1721			      R200_TXA_ARG_C_ZERO |
1722			      R200_TXA_OP_MADD);
1723		OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
1724			      R200_TXA_OUTPUT_REG_NONE);
1725
1726		/* MAD temp0, (const1 - 0.5) * 2, (temp1 - 0.5) * 2, temp0 */
1727		OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
1728			      R200_TXC_ARG_A_TFACTOR_COLOR |
1729			      R200_TXC_BIAS_ARG_A |
1730			      R200_TXC_SCALE_ARG_A |
1731			      R200_TXC_ARG_B_R1_COLOR |
1732			      R200_TXC_BIAS_ARG_B |
1733			      R200_TXC_SCALE_ARG_B |
1734			      R200_TXC_ARG_C_R0_COLOR |
1735			      R200_TXC_OP_MADD);
1736		OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
1737			      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
1738			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
1739		OUT_ACCEL_REG(R200_PP_TXABLEND_1,
1740			      R200_TXA_ARG_A_ZERO |
1741			      R200_TXA_ARG_B_ZERO |
1742			      R200_TXA_ARG_C_ZERO |
1743			      R200_TXA_OP_MADD);
1744		OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
1745			      R200_TXA_OUTPUT_REG_NONE);
1746
1747		/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp2 - 0.5), temp0 */
1748		OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
1749			      R200_TXC_ARG_A_TFACTOR_COLOR |
1750			      R200_TXC_BIAS_ARG_A |
1751			      R200_TXC_SCALE_ARG_A |
1752			      R200_TXC_ARG_B_R2_COLOR |
1753			      R200_TXC_BIAS_ARG_B |
1754			      R200_TXC_ARG_C_R0_COLOR |
1755			      R200_TXC_OP_MADD);
1756		OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
1757			      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
1758			      R200_TXC_SCALE_2X |
1759			      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
1760		OUT_ACCEL_REG(R200_PP_TXABLEND_2,
1761			      R200_TXA_ARG_A_ZERO |
1762			      R200_TXA_ARG_B_ZERO |
1763			      R200_TXA_ARG_C_ZERO |
1764			      R200_TXA_COMP_ARG_C |
1765			      R200_TXA_OP_MADD);
1766		OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
1767			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
1768
1769		/* shader constants */
1770		OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(1.0, /* src range [1, 2] */
1771							      yco - 1.0,
1772							      -yoff, /* range [-1, 0] */
1773							      0.0));
1774		OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * 0.125 + 0.5, /* range [-4, 4] */
1775							      uco[1] * 0.125 + 0.5,
1776							      uco[2] * 0.125 + 0.5,
1777							      0.0));
1778		OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * 0.25 + 0.5, /* range [-2, 2] */
1779							      vco[1] * 0.25 + 0.5,
1780							      vco[2] * 0.25 + 0.5,
1781							      0.0));
1782
1783		FINISH_ACCEL();
1784	    }
1785	    else if (info->ChipFamily == CHIP_FAMILY_RV250) {
1786		/* fix up broken packed yuv - shader same as above except
1787		   yuv components are all in same reg */
1788		float yco = 1.1643;
1789		float yoff = -0.0625 * yco;
1790		float uco[3] = {0.0, -0.39173, 2.018};
1791		float vco[3] = {1.5958, -0.8129, 0.0};
1792
1793		txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
1794			    (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
1795		txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
1796		txpitch -= 32;
1797		txfilter =  R200_MAG_FILTER_LINEAR |
1798			    R200_MIN_FILTER_LINEAR |
1799			    R200_CLAMP_S_CLAMP_LAST |
1800			    R200_CLAMP_T_CLAMP_LAST;
1801
1802		BEGIN_ACCEL(24);
1803
1804		OUT_ACCEL_REG(RADEON_PP_CNTL,
1805			      RADEON_TEX_0_ENABLE |
1806			      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
1807			      RADEON_TEX_BLEND_2_ENABLE);
1808
1809		OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
1810		OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
1811			      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
1812
1813		OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
1814		OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
1815		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
1816		OUT_ACCEL_REG(R200_PP_TXSIZE_0,
1817			      (pPriv->w - 1) |
1818			      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
1819		OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
1820		OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
1821
1822		/* MAD temp1 / 2, const0.a * 2, temp0.ggg, -const0.rgb */
1823		OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
1824			      R200_TXC_ARG_A_TFACTOR_COLOR |
1825			      R200_TXC_ARG_B_R0_COLOR |
1826			      R200_TXC_ARG_C_TFACTOR_COLOR |
1827			      R200_TXC_NEG_ARG_C |
1828			      R200_TXC_OP_DOT2_ADD);
1829		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
1830			      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
1831			      R200_TXC_SCALE_INV2 |
1832			      (R200_TXC_REPL_GREEN << R200_TXC_REPL_ARG_B_SHIFT) |
1833			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
1834		OUT_ACCEL_REG(R200_PP_TXABLEND_0,
1835			      R200_TXA_ARG_A_ZERO |
1836			      R200_TXA_ARG_B_ZERO |
1837			      R200_TXA_ARG_C_ZERO |
1838			      R200_TXA_OP_MADD);
1839		OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
1840			      R200_TXA_OUTPUT_REG_NONE);
1841
1842		/* MAD temp1, (const1 - 0.5) * 2, (temp0.rrr - 0.5) * 2, temp1 */
1843		OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
1844			      R200_TXC_ARG_A_TFACTOR_COLOR |
1845			      R200_TXC_BIAS_ARG_A |
1846			      R200_TXC_SCALE_ARG_A |
1847			      R200_TXC_ARG_B_R0_COLOR |
1848			      R200_TXC_BIAS_ARG_B |
1849			      R200_TXC_SCALE_ARG_B |
1850			      R200_TXC_ARG_C_R1_COLOR |
1851			      R200_TXC_OP_MADD);
1852		OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
1853			      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
1854			      (R200_TXC_REPL_BLUE << R200_TXC_REPL_ARG_B_SHIFT) |
1855			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
1856		OUT_ACCEL_REG(R200_PP_TXABLEND_1,
1857			      R200_TXA_ARG_A_ZERO |
1858			      R200_TXA_ARG_B_ZERO |
1859			      R200_TXA_ARG_C_ZERO |
1860			      R200_TXA_OP_MADD);
1861		OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
1862			      R200_TXA_OUTPUT_REG_NONE);
1863
1864		/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp0.bbb - 0.5), temp1 */
1865		OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
1866			      R200_TXC_ARG_A_TFACTOR_COLOR |
1867			      R200_TXC_BIAS_ARG_A |
1868			      R200_TXC_SCALE_ARG_A |
1869			      R200_TXC_ARG_B_R0_COLOR |
1870			      R200_TXC_BIAS_ARG_B |
1871			      R200_TXC_ARG_C_R1_COLOR |
1872			      R200_TXC_OP_MADD);
1873		OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
1874			      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
1875			      R200_TXC_SCALE_2X |
1876			      (R200_TXC_REPL_RED << R200_TXC_REPL_ARG_B_SHIFT) |
1877			      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
1878		OUT_ACCEL_REG(R200_PP_TXABLEND_2,
1879			      R200_TXA_ARG_A_ZERO |
1880			      R200_TXA_ARG_B_ZERO |
1881			      R200_TXA_ARG_C_ZERO |
1882			      R200_TXA_COMP_ARG_C |
1883			      R200_TXA_OP_MADD);
1884		OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
1885			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
1886
1887		/* shader constants */
1888		OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(1.0, /* src range [1, 2] */
1889							      yco - 1.0,
1890							      -yoff, /* range [-1, 0] */
1891							      0.0));
1892		OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * 0.125 + 0.5, /* range [-4, 4] */
1893							      uco[1] * 0.125 + 0.5,
1894							      uco[2] * 0.125 + 0.5,
1895							      0.0));
1896		OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * 0.25 + 0.5, /* range [-2, 2] */
1897							      vco[1] * 0.25 + 0.5,
1898							      vco[2] * 0.25 + 0.5,
1899							      0.0));
1900
1901		FINISH_ACCEL();
1902	    }
1903	    else {
1904		BEGIN_ACCEL(13);
1905		OUT_ACCEL_REG(RADEON_PP_CNTL,
1906			      RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
1907
1908		OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
1909		OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
1910			      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
1911
1912		OUT_ACCEL_REG(R200_PP_TXFILTER_0,
1913			      R200_MAG_FILTER_LINEAR |
1914			      R200_MIN_FILTER_LINEAR |
1915			      R200_CLAMP_S_CLAMP_LAST |
1916			      R200_CLAMP_T_CLAMP_LAST |
1917			      R200_YUV_TO_RGB);
1918		OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
1919		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
1920		OUT_ACCEL_REG(R200_PP_TXSIZE_0,
1921			      (pPriv->w - 1) |
1922			      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
1923		OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
1924
1925		OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
1926
1927		OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
1928			      R200_TXC_ARG_A_ZERO |
1929			      R200_TXC_ARG_B_ZERO |
1930			      R200_TXC_ARG_C_R0_COLOR |
1931			      R200_TXC_OP_MADD);
1932		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
1933			      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
1934		OUT_ACCEL_REG(R200_PP_TXABLEND_0,
1935			      R200_TXA_ARG_A_ZERO |
1936			      R200_TXA_ARG_B_ZERO |
1937			      R200_TXA_ARG_C_R0_ALPHA |
1938			      R200_TXA_OP_MADD);
1939		OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
1940			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
1941		FINISH_ACCEL();
1942	    }
1943	} else {
1944
1945	    info->accel_state->texW[0] = 1;
1946	    info->accel_state->texH[0] = 1;
1947
1948	    BEGIN_ACCEL(9);
1949
1950	    OUT_ACCEL_REG(RADEON_PP_CNTL,
1951			  RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
1952
1953	    OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
1954					      RADEON_SE_VTX_FMT_ST0));
1955
1956	    OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
1957			  RADEON_MAG_FILTER_LINEAR |
1958			  RADEON_MIN_FILTER_LINEAR |
1959			  RADEON_CLAMP_S_CLAMP_LAST |
1960			  RADEON_CLAMP_T_CLAMP_LAST |
1961			  RADEON_YUV_TO_RGB);
1962	    OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat);
1963	    OUT_ACCEL_REG(RADEON_PP_TXOFFSET_0, pPriv->src_offset);
1964	    OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
1965			  RADEON_COLOR_ARG_A_ZERO |
1966			  RADEON_COLOR_ARG_B_ZERO |
1967			  RADEON_COLOR_ARG_C_T0_COLOR |
1968			  RADEON_BLEND_CTL_ADD |
1969			  RADEON_CLAMP_TX);
1970	    OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
1971			  RADEON_ALPHA_ARG_A_ZERO |
1972			  RADEON_ALPHA_ARG_B_ZERO |
1973			  RADEON_ALPHA_ARG_C_T0_ALPHA |
1974			  RADEON_BLEND_CTL_ADD |
1975			  RADEON_CLAMP_TX);
1976
1977	    OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
1978			  (pPriv->w - 1) |
1979			  ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
1980	    OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
1981			  pPriv->src_pitch - 32);
1982	    FINISH_ACCEL();
1983	}
1984    }
1985
1986    if (pPriv->vsync) {
1987	xf86CrtcPtr crtc = radeon_xv_pick_best_crtc(pScrn,
1988						    pPriv->drw_x,
1989						    pPriv->drw_x + pPriv->dst_w,
1990						    pPriv->drw_y,
1991						    pPriv->drw_y + pPriv->dst_h);
1992	if (crtc) {
1993	    RADEONCrtcPrivatePtr radeon_crtc = crtc->driver_private;
1994
1995	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
1996					  radeon_crtc->crtc_id,
1997					  pPriv->drw_y - crtc->y,
1998					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
1999	}
2000    }
2001    /*
2002     * Rendering of the actual polygon is done in two different
2003     * ways depending on chip generation:
2004     *
2005     * < R300:
2006     *
2007     *     These chips can render a rectangle in one pass, so
2008     *     handling is pretty straight-forward.
2009     *
2010     * >= R300:
2011     *
2012     *     These chips can accept a quad, but will render it as
2013     *     two triangles which results in a diagonal tear. Instead
2014     *     We render a single, large triangle and use the scissor
2015     *     functionality to restrict it to the desired rectangle.
2016     *     Due to guardband limits on r3xx/r4xx, we can only use
2017     *     the single triangle up to 2880 pixels; above that we
2018     *     render as a quad.
2019     */
2020
2021    while (nBox--) {
2022	int srcX, srcY, srcw, srch;
2023	int dstX, dstY, dstw, dsth;
2024	Bool use_quad = FALSE;
2025	dstX = pBox->x1 + dstxoff;
2026	dstY = pBox->y1 + dstyoff;
2027	dstw = pBox->x2 - pBox->x1;
2028	dsth = pBox->y2 - pBox->y1;
2029
2030	srcX = ((pBox->x1 - pPriv->drw_x) *
2031		pPriv->src_w) / pPriv->dst_w;
2032	srcY = ((pBox->y1 - pPriv->drw_y) *
2033		pPriv->src_h) / pPriv->dst_h;
2034
2035	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
2036	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
2037
2038#if 0
2039	ErrorF("dst: %d, %d, %d, %d\n", dstX, dstY, dstw, dsth);
2040	ErrorF("src: %d, %d, %d, %d\n", srcX, srcY, srcw, srch);
2041#endif
2042
2043	if (IS_R300_3D || IS_R500_3D) {
2044	    if (IS_R300_3D && ((dstw+dsth) > 2880))
2045		use_quad = TRUE;
2046	    /*
2047	     * Set up the scissor area to that of the output size.
2048	     */
2049	    BEGIN_ACCEL(2);
2050	    if (IS_R300_3D) {
2051		/* R300 has an offset */
2052		OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX + 1088) << R300_SCISSOR_X_SHIFT) |
2053						 ((dstY + 1088) << R300_SCISSOR_Y_SHIFT)));
2054		OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw + 1088 - 1) << R300_SCISSOR_X_SHIFT) |
2055						 ((dstY + dsth + 1088 - 1) << R300_SCISSOR_Y_SHIFT)));
2056	    } else {
2057		OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX) << R300_SCISSOR_X_SHIFT) |
2058						 ((dstY) << R300_SCISSOR_Y_SHIFT)));
2059		OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw - 1) << R300_SCISSOR_X_SHIFT) |
2060						 ((dstY + dsth - 1) << R300_SCISSOR_Y_SHIFT)));
2061	    }
2062	    FINISH_ACCEL();
2063	}
2064
2065#ifdef ACCEL_CP
2066	if (info->ChipFamily < CHIP_FAMILY_R200) {
2067	    BEGIN_RING(3 * vtx_count + 3);
2068	    OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD,
2069				3 * vtx_count + 1));
2070	    OUT_RING(RADEON_CP_VC_FRMT_XY |
2071		     RADEON_CP_VC_FRMT_ST0);
2072	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
2073		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2074		     RADEON_CP_VC_CNTL_MAOS_ENABLE |
2075		     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
2076		     (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2077	} else if (IS_R300_3D || IS_R500_3D) {
2078	    if (use_quad) {
2079		BEGIN_RING(4 * vtx_count + 4);
2080		OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
2081				    4 * vtx_count));
2082		OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_QUAD_LIST |
2083			 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2084			 (4 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2085	    } else {
2086		BEGIN_RING(3 * vtx_count + 4);
2087		OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
2088				    3 * vtx_count));
2089		OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
2090			 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2091			 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2092	    }
2093	} else {
2094	    BEGIN_RING(3 * vtx_count + 2);
2095	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
2096				3 * vtx_count));
2097	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
2098		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
2099		     (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
2100	}
2101#else /* ACCEL_CP */
2102	if (IS_R300_3D || IS_R500_3D) {
2103	    if (use_quad)
2104		BEGIN_ACCEL(2 + vtx_count * 4);
2105	    else
2106		BEGIN_ACCEL(2 + vtx_count * 3);
2107	} else
2108	    BEGIN_ACCEL(1 + vtx_count * 3);
2109
2110	if (info->ChipFamily < CHIP_FAMILY_R200)
2111	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
2112					      RADEON_VF_PRIM_WALK_DATA |
2113					      RADEON_VF_RADEON_MODE |
2114					      (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
2115	else if (IS_R300_3D || IS_R500_3D) {
2116	    if (use_quad)
2117		OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_QUAD_LIST |
2118						  RADEON_VF_PRIM_WALK_DATA |
2119						  (4 << RADEON_VF_NUM_VERTICES_SHIFT)));
2120	    else
2121		OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST |
2122						  RADEON_VF_PRIM_WALK_DATA |
2123						  (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
2124	} else
2125	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
2126					      RADEON_VF_PRIM_WALK_DATA |
2127					      (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
2128
2129#endif
2130	if (pPriv->bicubic_enabled) {
2131		/*
2132		 * This code is only executed on >= R300, so we don't
2133		 * have to deal with the legacy handling.
2134		 */
2135	    if (use_quad) {
2136		VTX_OUT_FILTER((float)dstX,                                       (float)dstY,
2137			       (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0],
2138			       (float)srcX + 0.5,                                 (float)srcY + 0.5);
2139		VTX_OUT_FILTER((float)dstX,                                       (float)(dstY + dsth),
2140			       (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
2141			       (float)srcX + 0.5,                                 (float)(srcY + srch) + 0.5);
2142		VTX_OUT_FILTER((float)(dstX + dstw),                              (float)(dstY + dsth),
2143			       (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0],
2144			       (float)(srcX + srcw) + 0.5,                        (float)(srcY + srch) + 0.5);
2145		VTX_OUT_FILTER((float)(dstX + dstw),                              (float)dstY,
2146			       (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0],
2147			       (float)(srcX + srcw) + 0.5,                        (float)srcY + 0.5);
2148	    } else {
2149		VTX_OUT_FILTER((float)dstX,                                       (float)dstY,
2150			       (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0],
2151			       (float)srcX + 0.5,                                 (float)srcY + 0.5);
2152		VTX_OUT_FILTER((float)dstX,                                       (float)(dstY + dstw + dsth),
2153			       (float)srcX / info->accel_state->texW[0],          ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0],
2154			       (float)srcX + 0.5,                                 (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
2155		VTX_OUT_FILTER((float)(dstX + dstw + dsth),                       (float)dstY,
2156			       ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
2157			                                                          (float)srcY / info->accel_state->texH[0],
2158			       (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
2159			                                                          (float)srcY + 0.5);
2160	    }
2161	} else {
2162	    if (IS_R300_3D || IS_R500_3D) {
2163		if (use_quad) {
2164		    VTX_OUT((float)dstX,                                       (float)dstY,
2165			    (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0]);
2166		    VTX_OUT((float)dstX,                                       (float)(dstY + dsth),
2167			    (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
2168		    VTX_OUT((float)(dstX + dstw),                              (float)(dstY + dsth),
2169			    (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
2170		    VTX_OUT((float)(dstX + dstw),                              (float)dstY,
2171			    (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
2172		} else {
2173		    /*
2174		     * Render a big, scissored triangle. This means
2175		     * increasing the triangle size and adjusting
2176		     * texture coordinates.
2177		     */
2178		    VTX_OUT((float)dstX,                              (float)dstY,
2179			    (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
2180		    VTX_OUT((float)dstX,                              (float)(dstY + dsth + dstw),
2181			    (float)srcX / info->accel_state->texW[0], ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0]);
2182
2183		    VTX_OUT((float)(dstX + dstw + dsth),              (float)dstY,
2184			    ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
2185			                                              (float)srcY / info->accel_state->texH[0]);
2186		}
2187	    } else if (isplanar) {
2188		/*
2189		 * Just render a rect (using three coords).
2190		 * Filter is a bit of a misnomer, it's just texcoords...
2191		 */
2192		VTX_OUT_FILTER((float)dstX,                                (float)(dstY + dsth),
2193			(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
2194			(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
2195		VTX_OUT_FILTER((float)(dstX + dstw),                       (float)(dstY + dsth),
2196			(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0],
2197			(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
2198		VTX_OUT_FILTER((float)(dstX + dstw),                       (float)dstY,
2199			(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0],
2200			(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
2201	    } else {
2202		/*
2203		 * Just render a rect (using three coords).
2204		 */
2205		VTX_OUT((float)dstX,                                       (float)(dstY + dsth),
2206			(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
2207		VTX_OUT((float)(dstX + dstw),                              (float)(dstY + dsth),
2208			(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
2209		VTX_OUT((float)(dstX + dstw),                              (float)dstY,
2210			(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
2211	    }
2212	}
2213
2214	if (IS_R300_3D || IS_R500_3D)
2215	    /* flushing is pipelined, free/finish is not */
2216	    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
2217
2218#ifdef ACCEL_CP
2219	ADVANCE_RING();
2220#else
2221	FINISH_ACCEL();
2222#endif /* !ACCEL_CP */
2223
2224	pBox++;
2225    }
2226
2227    if (IS_R300_3D || IS_R500_3D) {
2228	BEGIN_ACCEL(3);
2229	OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA);
2230	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
2231    } else
2232	BEGIN_ACCEL(1);
2233    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
2234    FINISH_ACCEL();
2235
2236    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
2237}
2238
/*
 * Remove the helper macros used by this file so they do not stay defined
 * after it. FUNC_NAME is chosen at the top of the file from ACCEL_MMIO /
 * ACCEL_CP, and VTX_OUT_FILTER is defined there as well.
 * NOTE(review): presumably this allows the file to be included once per
 * acceleration type with a different FUNC_NAME suffix each time - confirm
 * against the including source file.
 */
2239#undef VTX_OUT
2240#undef VTX_OUT_FILTER
2241#undef FUNC_NAME
2242