1/*
2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22
23#include "radeon_compiler.h"
24
25#include <stdio.h>
26
27#include "r300_reg.h"
28
29#include "radeon_compiler_util.h"
30#include "radeon_dataflow.h"
31#include "radeon_program.h"
32#include "radeon_program_alu.h"
33#include "radeon_swizzle.h"
34#include "radeon_emulate_branches.h"
35#include "radeon_emulate_loops.h"
36#include "radeon_remove_constants.h"
37
/*
 * Build a source operand that reads a constant ZERO or ONE.
 *
 * The operand borrows the register index, register class and
 * relative-addressing bit of an already-setup and valid source
 * (vpi->SrcReg[x]) and replaces all four swizzle components with the
 * constant selector y.
 *
 * Expands in terms of `vp` and `vpi`, which must be in scope at the
 * point of use.
 */
#define __CONST(x, y) \
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \
			 t_swizzle(y), t_swizzle(y), \
			 t_swizzle(y), t_swizzle(y), \
			 t_src_class(vpi->SrcReg[x].File), \
			 RC_MASK_NONE) | \
	 (vpi->SrcReg[x].RelAddr << 4))
50
51
52static unsigned long t_dst_mask(unsigned int mask)
53{
54	/* RC_MASK_* is equivalent to VSF_FLAG_* */
55	return mask & RC_MASK_XYZW;
56}
57
58static unsigned long t_dst_class(rc_register_file file)
59{
60	switch (file) {
61	default:
62		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
63		/* fall-through */
64	case RC_FILE_TEMPORARY:
65		return PVS_DST_REG_TEMPORARY;
66	case RC_FILE_OUTPUT:
67		return PVS_DST_REG_OUT;
68	case RC_FILE_ADDRESS:
69		return PVS_DST_REG_A0;
70	}
71}
72
73static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
74				 struct rc_dst_register *dst)
75{
76	if (dst->File == RC_FILE_OUTPUT)
77		return vp->outputs[dst->Index];
78
79	return dst->Index;
80}
81
82static unsigned long t_src_class(rc_register_file file)
83{
84	switch (file) {
85	default:
86		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
87		/* fall-through */
88	case RC_FILE_NONE:
89	case RC_FILE_TEMPORARY:
90		return PVS_SRC_REG_TEMPORARY;
91	case RC_FILE_INPUT:
92		return PVS_SRC_REG_INPUT;
93	case RC_FILE_CONSTANT:
94		return PVS_SRC_REG_CONSTANT;
95	}
96}
97
98static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
99{
100	unsigned long aclass = t_src_class(a.File);
101	unsigned long bclass = t_src_class(b.File);
102
103	if (aclass != bclass)
104		return 0;
105	if (aclass == PVS_SRC_REG_TEMPORARY)
106		return 0;
107
108	if (a.RelAddr || b.RelAddr)
109		return 1;
110	if (a.Index != b.Index)
111		return 1;
112
113	return 0;
114}
115
/**
 * Translate a swizzle selector to its hardware encoding.
 *
 * This is in fact a NOP, as the Mesa RC_SWIZZLE_* values are all
 * identical to VSF_IN_COMPONENT_*.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	const unsigned long hw_swizzle = swizzle;

	return hw_swizzle;
}
121
122static unsigned long t_src_index(struct r300_vertex_program_code *vp,
123				 struct rc_src_register *src)
124{
125	if (src->File == RC_FILE_INPUT) {
126		assert(vp->inputs[src->Index] != -1);
127		return vp->inputs[src->Index];
128	} else {
129		if (src->Index < 0) {
130			fprintf(stderr,
131				"negative offsets for indirect addressing do not work.\n");
132			return 0;
133		}
134		return src->Index;
135	}
136}
137
138/* these two functions should probably be merged... */
139
140static unsigned long t_src(struct r300_vertex_program_code *vp,
141			   struct rc_src_register *src)
142{
143	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
144	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
145	 */
146	return PVS_SRC_OPERAND(t_src_index(vp, src),
147			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
148			       t_swizzle(GET_SWZ(src->Swizzle, 1)),
149			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
150			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
151			       t_src_class(src->File),
152			       src->Negate) |
153	       (src->RelAddr << 4) | (src->Abs << 3);
154}
155
156static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
157				  struct rc_src_register *src)
158{
159	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
160	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
161	 */
162	unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
163
164	return PVS_SRC_OPERAND(t_src_index(vp, src),
165			       t_swizzle(swz),
166			       t_swizzle(swz),
167			       t_swizzle(swz),
168			       t_swizzle(swz),
169			       t_src_class(src->File),
170			       src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
171	       (src->RelAddr << 4) | (src->Abs << 3);
172}
173
174static int valid_dst(struct r300_vertex_program_code *vp,
175			   struct rc_dst_register *dst)
176{
177	if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
178		return 0;
179	} else if (dst->File == RC_FILE_ADDRESS) {
180		assert(dst->Index == 0);
181	}
182
183	return 1;
184}
185
186static void ei_vector1(struct r300_vertex_program_code *vp,
187				unsigned int hw_opcode,
188				struct rc_sub_instruction *vpi,
189				unsigned int * inst)
190{
191	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
192				     0,
193				     0,
194				     t_dst_index(vp, &vpi->DstReg),
195				     t_dst_mask(vpi->DstReg.WriteMask),
196				     t_dst_class(vpi->DstReg.File),
197                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
198	inst[1] = t_src(vp, &vpi->SrcReg[0]);
199	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
200	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
201}
202
203static void ei_vector2(struct r300_vertex_program_code *vp,
204				unsigned int hw_opcode,
205				struct rc_sub_instruction *vpi,
206				unsigned int * inst)
207{
208	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
209				     0,
210				     0,
211				     t_dst_index(vp, &vpi->DstReg),
212				     t_dst_mask(vpi->DstReg.WriteMask),
213				     t_dst_class(vpi->DstReg.File),
214                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
215	inst[1] = t_src(vp, &vpi->SrcReg[0]);
216	inst[2] = t_src(vp, &vpi->SrcReg[1]);
217	inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
218}
219
220static void ei_math1(struct r300_vertex_program_code *vp,
221				unsigned int hw_opcode,
222				struct rc_sub_instruction *vpi,
223				unsigned int * inst)
224{
225	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
226				     1,
227				     0,
228				     t_dst_index(vp, &vpi->DstReg),
229				     t_dst_mask(vpi->DstReg.WriteMask),
230				     t_dst_class(vpi->DstReg.File),
231                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
232	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
233	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
234	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
235}
236
237static void ei_lit(struct r300_vertex_program_code *vp,
238				      struct rc_sub_instruction *vpi,
239				      unsigned int * inst)
240{
241	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
242
243	inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
244				     1,
245				     0,
246				     t_dst_index(vp, &vpi->DstReg),
247				     t_dst_mask(vpi->DstReg.WriteMask),
248				     t_dst_class(vpi->DstReg.File),
249                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
250	/* NOTE: Users swizzling might not work. */
251	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
252				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
253				  PVS_SRC_SELECT_FORCE_0,	// Z
254				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
255				  t_src_class(vpi->SrcReg[0].File),
256				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
257	    (vpi->SrcReg[0].RelAddr << 4);
258	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
259				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
260				  PVS_SRC_SELECT_FORCE_0,	// Z
261				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
262				  t_src_class(vpi->SrcReg[0].File),
263				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
264	    (vpi->SrcReg[0].RelAddr << 4);
265	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
266				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
267				  PVS_SRC_SELECT_FORCE_0,	// Z
268				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
269				  t_src_class(vpi->SrcReg[0].File),
270				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
271	    (vpi->SrcReg[0].RelAddr << 4);
272}
273
274static void ei_mad(struct r300_vertex_program_code *vp,
275				      struct rc_sub_instruction *vpi,
276				      unsigned int * inst)
277{
278	unsigned int i;
279	/* Remarks about hardware limitations of MAD
280	 * (please preserve this comment, as this information is _NOT_
281	 * in the documentation provided by AMD).
282	 *
283	 * As described in the documentation, MAD with three unique temporary
284	 * source registers requires the use of the macro version.
285	 *
286	 * However (and this is not mentioned in the documentation), apparently
287	 * the macro version is _NOT_ a full superset of the normal version.
288	 * In particular, the macro version does not always work when relative
289	 * addressing is used in the source operands.
290	 *
291	 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
292	 * assembly shader path when using medium quality animations
293	 * (i.e. animations with matrix blending instead of quaternion blending).
294	 *
295	 * Unfortunately, I (nha) have been unable to extract a Piglit regression
296	 * test for this issue - for some reason, it is possible to have vertex
297	 * programs whose prefix is *exactly* the same as the prefix of the
298	 * offending program in Sauerbraten up to the offending instruction
299	 * without causing any trouble.
300	 *
301	 * Bottom line: Only use the macro version only when really necessary;
302	 * according to AMD docs, this should improve performance by one clock
303	 * as a nice side bonus.
304	 */
305	if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
306	    vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
307	    vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
308	    vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
309	    vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
310	    vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
311		inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
312				0,
313				1,
314				t_dst_index(vp, &vpi->DstReg),
315				t_dst_mask(vpi->DstReg.WriteMask),
316				t_dst_class(vpi->DstReg.File),
317                                vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
318	} else {
319		inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
320				0,
321				0,
322				t_dst_index(vp, &vpi->DstReg),
323				t_dst_mask(vpi->DstReg.WriteMask),
324				t_dst_class(vpi->DstReg.File),
325                                vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
326
327		/* Arguments with constant swizzles still count as a unique
328		 * temporary, so we should make sure these arguments share a
329		 * register index with one of the other arguments. */
330		for (i = 0; i < 3; i++) {
331			unsigned int j;
332			if (vpi->SrcReg[i].File != RC_FILE_NONE)
333				continue;
334
335			for (j = 0; j < 3; j++) {
336				if (i != j) {
337					vpi->SrcReg[i].Index =
338						vpi->SrcReg[j].Index;
339					break;
340				}
341			}
342		}
343	}
344	inst[1] = t_src(vp, &vpi->SrcReg[0]);
345	inst[2] = t_src(vp, &vpi->SrcReg[1]);
346	inst[3] = t_src(vp, &vpi->SrcReg[2]);
347}
348
349static void ei_pow(struct r300_vertex_program_code *vp,
350				      struct rc_sub_instruction *vpi,
351				      unsigned int * inst)
352{
353	inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
354				     1,
355				     0,
356				     t_dst_index(vp, &vpi->DstReg),
357				     t_dst_mask(vpi->DstReg.WriteMask),
358				     t_dst_class(vpi->DstReg.File),
359                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
360	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
361	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
362	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
363}
364
365static void translate_vertex_program(struct radeon_compiler *c, void *user)
366{
367	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
368	struct rc_instruction *rci;
369
370	unsigned loops[R500_PVS_MAX_LOOP_DEPTH];
371	unsigned loop_depth = 0;
372
373	compiler->code->pos_end = 0;	/* Not supported yet */
374	compiler->code->length = 0;
375	compiler->code->num_temporaries = 0;
376
377	compiler->SetHwInputOutput(compiler);
378
379	for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
380		struct rc_sub_instruction *vpi = &rci->U.I;
381		unsigned int *inst = compiler->code->body.d + compiler->code->length;
382		const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
383
384		/* Skip instructions writing to non-existing destination */
385		if (!valid_dst(compiler->code, &vpi->DstReg))
386			continue;
387
388		if (info->HasDstReg) {
389			/* Neither is Saturate. */
390			if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
391				rc_error(&compiler->Base, "Vertex program does not support the Saturate "
392					 "modifier (yet).\n");
393			}
394		}
395
396		if (compiler->code->length >= c->max_alu_insts * 4) {
397			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
398			return;
399		}
400
401		assert(compiler->Base.is_r500 ||
402		       (vpi->Opcode != RC_OPCODE_SEQ &&
403			vpi->Opcode != RC_OPCODE_SNE));
404
405		switch (vpi->Opcode) {
406		case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
407		case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
408		case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
409		case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
410		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
411		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
412		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
413		case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
414		case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
415		case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
416		case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
417		case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
418		case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
419		case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
420		case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
421		case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
422		case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
423		case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
424		case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
425		case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
426		case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
427		case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
428		case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
429		case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
430		case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
431		case RC_OPCODE_BGNLOOP:
432		{
433			if ((!compiler->Base.is_r500
434				&& loop_depth >= R300_VS_MAX_LOOP_DEPTH)
435				|| loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
436				rc_error(&compiler->Base,
437						"Loops are nested too deep.");
438				return;
439			}
440			loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
441			break;
442		}
443		case RC_OPCODE_ENDLOOP:
444		{
445			unsigned int act_addr;
446			unsigned int last_addr;
447			unsigned int ret_addr;
448
449			ret_addr = loops[--loop_depth];
450			act_addr = ret_addr - 1;
451			last_addr = (compiler->code->length / 4) - 1;
452
453			if (loop_depth >= R300_VS_MAX_FC_OPS) {
454				rc_error(&compiler->Base,
455					"Too many flow control instructions.");
456				return;
457			}
458			if (compiler->Base.is_r500) {
459				compiler->code->fc_op_addrs.r500
460					[compiler->code->num_fc_ops].lw =
461					R500_PVS_FC_ACT_ADRS(act_addr)
462					| R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
463					;
464				compiler->code->fc_op_addrs.r500
465					[compiler->code->num_fc_ops].uw =
466					R500_PVS_FC_LAST_INST(last_addr)
467					| R500_PVS_FC_RTN_INST(ret_addr)
468					;
469			} else {
470				compiler->code->fc_op_addrs.r300
471					[compiler->code->num_fc_ops] =
472					R300_PVS_FC_ACT_ADRS(act_addr)
473					| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
474					| R300_PVS_FC_LAST_INST(last_addr)
475					| R300_PVS_FC_RTN_INST(ret_addr)
476					;
477			}
478			compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
479				R300_PVS_FC_LOOP_INIT_VAL(0x0)
480				| R300_PVS_FC_LOOP_STEP_VAL(0x1)
481				;
482			compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
483						compiler->code->num_fc_ops);
484			compiler->code->num_fc_ops++;
485
486			break;
487		}
488
489		case RC_ME_PRED_SET_CLR:
490			ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
491			break;
492
493		case RC_ME_PRED_SET_INV:
494			ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
495			break;
496
497		case RC_ME_PRED_SET_POP:
498			ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
499			break;
500
501		case RC_ME_PRED_SET_RESTORE:
502			ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
503			break;
504
505		case RC_ME_PRED_SEQ:
506			ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
507			break;
508
509		case RC_ME_PRED_SNEQ:
510			ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
511			break;
512
513		case RC_VE_PRED_SNEQ_PUSH:
514			ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
515								vpi, inst);
516			break;
517
518		default:
519			rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
520			return;
521		}
522
523		if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
524			inst[0] |= (PVS_DST_PRED_ENABLE_MASK
525						<< PVS_DST_PRED_ENABLE_SHIFT);
526			if (vpi->DstReg.Pred == RC_PRED_SET) {
527				inst[0] |= (PVS_DST_PRED_SENSE_MASK
528						<< PVS_DST_PRED_SENSE_SHIFT);
529			}
530		}
531
532		/* Update the number of temporaries. */
533		if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
534		    vpi->DstReg.Index >= compiler->code->num_temporaries)
535			compiler->code->num_temporaries = vpi->DstReg.Index + 1;
536
537		for (unsigned i = 0; i < info->NumSrcRegs; i++)
538			if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
539			    vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
540				compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
541
542		if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
543			rc_error(&compiler->Base, "Too many temporaries.\n");
544			return;
545		}
546
547		compiler->code->length += 4;
548
549		if (compiler->Base.Error)
550			return;
551	}
552}
553
/**
 * Per-temporary bookkeeping used by allocate_temporary_registers().
 */
struct temporary_allocation {
	/* Set once a hardware temporary has been assigned. */
	unsigned int Allocated:1;
	/* Index of the assigned hardware temporary. */
	unsigned int HwTemp:15;
	/* Last instruction that reads this temporary; for reads inside a
	 * loop this is the ENDLOOP of the outermost enclosing loop. */
	struct rc_instruction *LastRead;
};
559
560static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
561{
562	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
563	struct rc_instruction *inst;
564	struct rc_instruction *end_loop = NULL;
565	unsigned int num_orig_temps = 0;
566	char hwtemps[RC_REGISTER_MAX_INDEX];
567	struct temporary_allocation * ta;
568	unsigned int i, j;
569
570	memset(hwtemps, 0, sizeof(hwtemps));
571
572	rc_recompute_ips(c);
573
574	/* Pass 1: Count original temporaries. */
575	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
576		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
577
578		for (i = 0; i < opcode->NumSrcRegs; ++i) {
579			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
580				if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
581					num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
582			}
583		}
584
585		if (opcode->HasDstReg) {
586			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
587				if (inst->U.I.DstReg.Index >= num_orig_temps)
588					num_orig_temps = inst->U.I.DstReg.Index + 1;
589			}
590		}
591	}
592
593	ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
594			sizeof(struct temporary_allocation) * num_orig_temps);
595	memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
596
597	/* Pass 2: Determine original temporary lifetimes */
598	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
599		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
600		/* Instructions inside of loops need to use the ENDLOOP
601		 * instruction as their LastRead. */
602		if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
603			int endloops = 1;
604			struct rc_instruction * ptr;
605			for(ptr = inst->Next;
606				ptr != &compiler->Base.Program.Instructions;
607							ptr = ptr->Next){
608				if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
609					endloops++;
610				} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
611					endloops--;
612					if (endloops <= 0) {
613						end_loop = ptr;
614						break;
615					}
616				}
617			}
618		}
619
620		if (inst == end_loop) {
621			end_loop = NULL;
622			continue;
623		}
624
625		for (i = 0; i < opcode->NumSrcRegs; ++i) {
626			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
627				ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
628			}
629		}
630	}
631
632	/* Pass 3: Register allocation */
633	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
634		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
635
636		for (i = 0; i < opcode->NumSrcRegs; ++i) {
637			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
638				unsigned int orig = inst->U.I.SrcReg[i].Index;
639				inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;
640
641				if (ta[orig].Allocated && inst == ta[orig].LastRead)
642					hwtemps[ta[orig].HwTemp] = 0;
643			}
644		}
645
646		if (opcode->HasDstReg) {
647			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
648				unsigned int orig = inst->U.I.DstReg.Index;
649
650				if (!ta[orig].Allocated) {
651					for(j = 0; j < c->max_temp_regs; ++j) {
652						if (!hwtemps[j])
653							break;
654					}
655					ta[orig].Allocated = 1;
656					ta[orig].HwTemp = j;
657					hwtemps[ta[orig].HwTemp] = 1;
658				}
659
660				inst->U.I.DstReg.Index = ta[orig].HwTemp;
661			}
662		}
663	}
664}
665
666/**
667 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
668 * and the Saturate opcode modifier. Only Absolute is currently transformed.
669 */
670static int transform_nonnative_modifiers(
671	struct radeon_compiler *c,
672	struct rc_instruction *inst,
673	void* unused)
674{
675	const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
676	unsigned i;
677
678	/* Transform ABS(a) to MAX(a, -a). */
679	for (i = 0; i < opcode->NumSrcRegs; i++) {
680		if (inst->U.I.SrcReg[i].Abs) {
681			struct rc_instruction *new_inst;
682			unsigned temp;
683
684			inst->U.I.SrcReg[i].Abs = 0;
685
686			temp = rc_find_free_temporary(c);
687
688			new_inst = rc_insert_new_instruction(c, inst->Prev);
689			new_inst->U.I.Opcode = RC_OPCODE_MAX;
690			new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
691			new_inst->U.I.DstReg.Index = temp;
692			new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
693			new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
694			new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
695
696			memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
697			inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
698			inst->U.I.SrcReg[i].Index = temp;
699			inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
700		}
701	}
702	return 1;
703}
704
705/**
706 * Vertex engine cannot read two inputs or two constants at the same time.
707 * Introduce intermediate MOVs to temporary registers to account for this.
708 */
709static int transform_source_conflicts(
710	struct radeon_compiler *c,
711	struct rc_instruction* inst,
712	void* unused)
713{
714	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
715
716	if (opcode->NumSrcRegs == 3) {
717		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
718		    || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
719			int tmpreg = rc_find_free_temporary(c);
720			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
721			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
722			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
723			inst_mov->U.I.DstReg.Index = tmpreg;
724			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
725
726			reset_srcreg(&inst->U.I.SrcReg[2]);
727			inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
728			inst->U.I.SrcReg[2].Index = tmpreg;
729		}
730	}
731
732	if (opcode->NumSrcRegs >= 2) {
733		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
734			int tmpreg = rc_find_free_temporary(c);
735			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
736			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
737			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
738			inst_mov->U.I.DstReg.Index = tmpreg;
739			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
740
741			reset_srcreg(&inst->U.I.SrcReg[1]);
742			inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
743			inst->U.I.SrcReg[1].Index = tmpreg;
744		}
745	}
746
747	return 1;
748}
749
750static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
751{
752	struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
753	int i;
754
755	for(i = 0; i < 32; ++i) {
756		if ((compiler->RequiredOutputs & (1 << i)) &&
757		    !(compiler->Base.Program.OutputsWritten & (1 << i))) {
758			struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
759			inst->U.I.Opcode = RC_OPCODE_MOV;
760
761			inst->U.I.DstReg.File = RC_FILE_OUTPUT;
762			inst->U.I.DstReg.Index = i;
763			inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
764
765			inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
766			inst->U.I.SrcReg[0].Index = 0;
767			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
768
769			compiler->Base.Program.OutputsWritten |= 1 << i;
770		}
771	}
772}
773
774static void dataflow_outputs_mark_used(void * userdata, void * data,
775		void (*callback)(void *, unsigned int, unsigned int))
776{
777	struct r300_vertex_program_compiler * c = userdata;
778	int i;
779
780	for(i = 0; i < 32; ++i) {
781		if (c->RequiredOutputs & (1 << i))
782			callback(data, i, RC_MASK_XYZW);
783	}
784}
785
786static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
787{
788	(void) opcode;
789	(void) reg;
790
791	return 1;
792}
793
794static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
795					  struct rc_instruction *arl,
796					  struct rc_instruction *end,
797					  int min_offset)
798{
799	struct rc_instruction *inst, *add;
800	unsigned const_swizzle;
801
802	/* Transform ARL/ARR */
803	add = rc_insert_new_instruction(&c->Base, arl->Prev);
804	add->U.I.Opcode = RC_OPCODE_ADD;
805	add->U.I.DstReg.File = RC_FILE_TEMPORARY;
806	add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
807	add->U.I.DstReg.WriteMask = RC_MASK_X;
808	add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
809	add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
810	add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
811								     min_offset, &const_swizzle);
812	add->U.I.SrcReg[1].Swizzle = const_swizzle;
813
814	arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
815	arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
816	arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
817
818	/* Rewrite offsets up to and excluding inst. */
819	for (inst = arl->Next; inst != end; inst = inst->Next) {
820		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
821
822		for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
823			if (inst->U.I.SrcReg[i].RelAddr)
824				inst->U.I.SrcReg[i].Index -= min_offset;
825	}
826}
827
828static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
829{
830	struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
831	struct rc_instruction *inst, *lastARL = NULL;
832	int min_offset = 0;
833
834	for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
835		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
836
837		if (inst->U.I.Opcode == RC_OPCODE_ARL || inst->U.I.Opcode == RC_OPCODE_ARR) {
838			if (lastARL != NULL && min_offset < 0)
839				transform_negative_addressing(c, lastARL, inst, min_offset);
840
841			lastARL = inst;
842			min_offset = 0;
843			continue;
844		}
845
846		for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
847			if (inst->U.I.SrcReg[i].RelAddr &&
848			    inst->U.I.SrcReg[i].Index < 0) {
849				/* ARL must precede any indirect addressing. */
850				if (!lastARL) {
851					rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL/ARR.");
852					return;
853				}
854
855				if (inst->U.I.SrcReg[i].Index < min_offset)
856					min_offset = inst->U.I.SrcReg[i].Index;
857			}
858		}
859	}
860
861	if (lastARL != NULL && min_offset < 0)
862		transform_negative_addressing(c, lastARL, inst, min_offset);
863}
864
865struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
866	.IsNative = &swizzle_is_native,
867	.Split = 0 /* should never be called */
868};
869
870void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
871{
872	int is_r500 = c->Base.is_r500;
873	int opt = !c->Base.disable_optimizations;
874
875	/* Lists of instruction transformations. */
876	struct radeon_program_transformation alu_rewrite_r500[] = {
877		{ &r300_transform_vertex_alu, 0 },
878		{ &r300_transform_trig_scale_vertex, 0 },
879		{ 0, 0 }
880	};
881
882	struct radeon_program_transformation alu_rewrite_r300[] = {
883		{ &r300_transform_vertex_alu, 0 },
884		{ &r300_transform_trig_simple, 0 },
885		{ 0, 0 }
886	};
887
888	/* Note: These passes have to be done seperately from ALU rewrite,
889	 * otherwise non-native ALU instructions with source conflits
890	 * or non-native modifiers will not be treated properly.
891	 */
892	struct radeon_program_transformation emulate_modifiers[] = {
893		{ &transform_nonnative_modifiers, 0 },
894		{ 0, 0 }
895	};
896
897	struct radeon_program_transformation resolve_src_conflicts[] = {
898		{ &transform_source_conflicts, 0 },
899		{ 0, 0 }
900	};
901
902	/* List of compiler passes. */
903	struct radeon_compiler_pass vs_list[] = {
904		/* NAME				DUMP PREDICATE	FUNCTION			PARAM */
905		{"add artificial outputs",	0, 1,		rc_vs_add_artificial_outputs,	NULL},
906		{"emulate branches",		1, !is_r500,	rc_emulate_branches,		NULL},
907		{"emulate negative addressing", 1, 1,		rc_emulate_negative_addressing,	NULL},
908		{"native rewrite",		1, is_r500,	rc_local_transform,		alu_rewrite_r500},
909		{"native rewrite",		1, !is_r500,	rc_local_transform,		alu_rewrite_r300},
910		{"emulate modifiers",		1, !is_r500,	rc_local_transform,		emulate_modifiers},
911		{"deadcode",			1, opt,		rc_dataflow_deadcode,		dataflow_outputs_mark_used},
912		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
913		/* This pass must be done after optimizations. */
914		{"source conflict resolve",	1, 1,		rc_local_transform,		resolve_src_conflicts},
915		{"register allocation",		1, opt,		allocate_temporary_registers,	NULL},
916		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
917		{"lower control flow opcodes",	1, is_r500,	rc_vert_fc,			NULL},
918		{"final code validation",	0, 1,		rc_validate_final_shader,	NULL},
919		{"machine code generation",	0, 1,		translate_vertex_program,	NULL},
920		{"dump machine code",		0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,	NULL},
921		{NULL, 0, 0, NULL, NULL}
922	};
923
924	c->Base.type = RC_VERTEX_PROGRAM;
925	c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
926
927	rc_run_compiler(&c->Base, vs_list);
928
929	c->code->InputsRead = c->Base.Program.InputsRead;
930	c->code->OutputsWritten = c->Base.Program.OutputsWritten;
931	rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
932}
933