1/*
2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22
23#include "radeon_compiler.h"
24
25#include <stdbool.h>
26#include <stdio.h>
27
28#include "r300_reg.h"
29
30#include "radeon_compiler_util.h"
31#include "radeon_dataflow.h"
32#include "radeon_program.h"
33#include "radeon_program_alu.h"
34#include "radeon_swizzle.h"
35#include "radeon_emulate_branches.h"
36#include "radeon_emulate_loops.h"
37#include "radeon_remove_constants.h"
38
39#include "util/compiler.h"
40
/*
 * Build a source operand that reads a constant ZERO or ONE.
 *
 * The operand reuses the register index, register class and relative
 * addressing bit of the already set-up, valid source vpi->SrcReg[x]; only
 * the swizzle is replaced, with all four channels selecting `y`, and no
 * channel is negated.
 *
 * The macro expects `vp` and `vpi` to be in scope where it is expanded.
 */
#define __CONST(x, y) \
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \
			 t_swizzle(y), t_swizzle(y), \
			 t_swizzle(y), t_swizzle(y), \
			 t_src_class(vpi->SrcReg[x].File), \
			 RC_MASK_NONE) | \
	 (vpi->SrcReg[x].RelAddr << 4))
53
54
55static unsigned long t_dst_mask(unsigned int mask)
56{
57	/* RC_MASK_* is equivalent to VSF_FLAG_* */
58	return mask & RC_MASK_XYZW;
59}
60
61static unsigned long t_dst_class(rc_register_file file)
62{
63	switch (file) {
64	default:
65		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
66		FALLTHROUGH;
67	case RC_FILE_TEMPORARY:
68		return PVS_DST_REG_TEMPORARY;
69	case RC_FILE_OUTPUT:
70		return PVS_DST_REG_OUT;
71	case RC_FILE_ADDRESS:
72		return PVS_DST_REG_A0;
73	}
74}
75
76static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
77				 struct rc_dst_register *dst)
78{
79	if (dst->File == RC_FILE_OUTPUT)
80		return vp->outputs[dst->Index];
81
82	return dst->Index;
83}
84
85static unsigned long t_src_class(rc_register_file file)
86{
87	switch (file) {
88	default:
89		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
90		FALLTHROUGH;
91	case RC_FILE_NONE:
92	case RC_FILE_TEMPORARY:
93		return PVS_SRC_REG_TEMPORARY;
94	case RC_FILE_INPUT:
95		return PVS_SRC_REG_INPUT;
96	case RC_FILE_CONSTANT:
97		return PVS_SRC_REG_CONSTANT;
98	}
99}
100
101static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
102{
103	unsigned long aclass = t_src_class(a.File);
104	unsigned long bclass = t_src_class(b.File);
105
106	if (aclass != bclass)
107		return 0;
108	if (aclass == PVS_SRC_REG_TEMPORARY)
109		return 0;
110
111	if (a.RelAddr || b.RelAddr)
112		return 1;
113	if (a.Index != b.Index)
114		return 1;
115
116	return 0;
117}
118
/**
 * Translate a compiler swizzle selector into the hardware selector.
 *
 * This is a no-op: the Mesa RC_SWIZZLE_* values are all identical to the
 * VSF_IN_COMPONENT_* values, so the selector is passed through unchanged.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	const unsigned long hw_select = swizzle;

	return hw_select;
}
124
125static unsigned long t_src_index(struct r300_vertex_program_code *vp,
126				 struct rc_src_register *src)
127{
128	if (src->File == RC_FILE_INPUT) {
129		assert(vp->inputs[src->Index] != -1);
130		return vp->inputs[src->Index];
131	} else {
132		if (src->Index < 0) {
133			fprintf(stderr,
134				"negative offsets for indirect addressing do not work.\n");
135			return 0;
136		}
137		return src->Index;
138	}
139}
140
141/* these two functions should probably be merged... */
142
143static unsigned long t_src(struct r300_vertex_program_code *vp,
144			   struct rc_src_register *src)
145{
146	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
147	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
148	 */
149	return PVS_SRC_OPERAND(t_src_index(vp, src),
150			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
151			       t_swizzle(GET_SWZ(src->Swizzle, 1)),
152			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
153			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
154			       t_src_class(src->File),
155			       src->Negate) |
156	       (src->RelAddr << 4) | (src->Abs << 3);
157}
158
159static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
160				  struct rc_src_register *src)
161{
162	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
163	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
164	 */
165	unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
166
167	return PVS_SRC_OPERAND(t_src_index(vp, src),
168			       t_swizzle(swz),
169			       t_swizzle(swz),
170			       t_swizzle(swz),
171			       t_swizzle(swz),
172			       t_src_class(src->File),
173			       src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
174	       (src->RelAddr << 4) | (src->Abs << 3);
175}
176
177static int valid_dst(struct r300_vertex_program_code *vp,
178			   struct rc_dst_register *dst)
179{
180	if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
181		return 0;
182	} else if (dst->File == RC_FILE_ADDRESS) {
183		assert(dst->Index == 0);
184	}
185
186	return 1;
187}
188
189static void ei_vector1(struct r300_vertex_program_code *vp,
190				unsigned int hw_opcode,
191				struct rc_sub_instruction *vpi,
192				unsigned int * inst)
193{
194	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
195				     0,
196				     0,
197				     t_dst_index(vp, &vpi->DstReg),
198				     t_dst_mask(vpi->DstReg.WriteMask),
199				     t_dst_class(vpi->DstReg.File),
200                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
201	inst[1] = t_src(vp, &vpi->SrcReg[0]);
202	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
203	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
204}
205
206static void ei_vector2(struct r300_vertex_program_code *vp,
207				unsigned int hw_opcode,
208				struct rc_sub_instruction *vpi,
209				unsigned int * inst)
210{
211	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
212				     0,
213				     0,
214				     t_dst_index(vp, &vpi->DstReg),
215				     t_dst_mask(vpi->DstReg.WriteMask),
216				     t_dst_class(vpi->DstReg.File),
217                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
218	inst[1] = t_src(vp, &vpi->SrcReg[0]);
219	inst[2] = t_src(vp, &vpi->SrcReg[1]);
220	inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
221}
222
223static void ei_math1(struct r300_vertex_program_code *vp,
224				unsigned int hw_opcode,
225				struct rc_sub_instruction *vpi,
226				unsigned int * inst)
227{
228	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
229				     1,
230				     0,
231				     t_dst_index(vp, &vpi->DstReg),
232				     t_dst_mask(vpi->DstReg.WriteMask),
233				     t_dst_class(vpi->DstReg.File),
234                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
235	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
236	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
237	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
238}
239
240static void ei_lit(struct r300_vertex_program_code *vp,
241				      struct rc_sub_instruction *vpi,
242				      unsigned int * inst)
243{
244	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
245
246	inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
247				     1,
248				     0,
249				     t_dst_index(vp, &vpi->DstReg),
250				     t_dst_mask(vpi->DstReg.WriteMask),
251				     t_dst_class(vpi->DstReg.File),
252                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
253	/* NOTE: Users swizzling might not work. */
254	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
255				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
256				  PVS_SRC_SELECT_FORCE_0,	// Z
257				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
258				  t_src_class(vpi->SrcReg[0].File),
259				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
260	    (vpi->SrcReg[0].RelAddr << 4);
261	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
262				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
263				  PVS_SRC_SELECT_FORCE_0,	// Z
264				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
265				  t_src_class(vpi->SrcReg[0].File),
266				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
267	    (vpi->SrcReg[0].RelAddr << 4);
268	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
269				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
270				  PVS_SRC_SELECT_FORCE_0,	// Z
271				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
272				  t_src_class(vpi->SrcReg[0].File),
273				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
274	    (vpi->SrcReg[0].RelAddr << 4);
275}
276
277static void ei_mad(struct r300_vertex_program_code *vp,
278				      struct rc_sub_instruction *vpi,
279				      unsigned int * inst)
280{
281	unsigned int i;
282	/* Remarks about hardware limitations of MAD
283	 * (please preserve this comment, as this information is _NOT_
284	 * in the documentation provided by AMD).
285	 *
286	 * As described in the documentation, MAD with three unique temporary
287	 * source registers requires the use of the macro version.
288	 *
289	 * However (and this is not mentioned in the documentation), apparently
290	 * the macro version is _NOT_ a full superset of the normal version.
291	 * In particular, the macro version does not always work when relative
292	 * addressing is used in the source operands.
293	 *
294	 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
295	 * assembly shader path when using medium quality animations
296	 * (i.e. animations with matrix blending instead of quaternion blending).
297	 *
298	 * Unfortunately, I (nha) have been unable to extract a Piglit regression
299	 * test for this issue - for some reason, it is possible to have vertex
300	 * programs whose prefix is *exactly* the same as the prefix of the
301	 * offending program in Sauerbraten up to the offending instruction
302	 * without causing any trouble.
303	 *
304	 * Bottom line: Only use the macro version only when really necessary;
305	 * according to AMD docs, this should improve performance by one clock
306	 * as a nice side bonus.
307	 */
308	if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
309	    vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
310	    vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
311	    vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
312	    vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
313	    vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
314		inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
315				0,
316				1,
317				t_dst_index(vp, &vpi->DstReg),
318				t_dst_mask(vpi->DstReg.WriteMask),
319				t_dst_class(vpi->DstReg.File),
320                                vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
321	} else {
322		inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
323				0,
324				0,
325				t_dst_index(vp, &vpi->DstReg),
326				t_dst_mask(vpi->DstReg.WriteMask),
327				t_dst_class(vpi->DstReg.File),
328                                vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
329
330		/* Arguments with constant swizzles still count as a unique
331		 * temporary, so we should make sure these arguments share a
332		 * register index with one of the other arguments. */
333		for (i = 0; i < 3; i++) {
334			unsigned int j;
335			if (vpi->SrcReg[i].File != RC_FILE_NONE)
336				continue;
337
338			for (j = 0; j < 3; j++) {
339				if (i != j) {
340					vpi->SrcReg[i].Index =
341						vpi->SrcReg[j].Index;
342					break;
343				}
344			}
345		}
346	}
347	inst[1] = t_src(vp, &vpi->SrcReg[0]);
348	inst[2] = t_src(vp, &vpi->SrcReg[1]);
349	inst[3] = t_src(vp, &vpi->SrcReg[2]);
350}
351
352static void ei_pow(struct r300_vertex_program_code *vp,
353				      struct rc_sub_instruction *vpi,
354				      unsigned int * inst)
355{
356	inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
357				     1,
358				     0,
359				     t_dst_index(vp, &vpi->DstReg),
360				     t_dst_mask(vpi->DstReg.WriteMask),
361				     t_dst_class(vpi->DstReg.File),
362                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
363	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
364	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
365	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
366}
367
368static void translate_vertex_program(struct radeon_compiler *c, void *user)
369{
370	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
371	struct rc_instruction *rci;
372
373	unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {};
374	unsigned loop_depth = 0;
375
376	compiler->code->pos_end = 0;	/* Not supported yet */
377	compiler->code->length = 0;
378	compiler->code->num_temporaries = 0;
379
380	compiler->SetHwInputOutput(compiler);
381
382	for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
383		struct rc_sub_instruction *vpi = &rci->U.I;
384		unsigned int *inst = compiler->code->body.d + compiler->code->length;
385		const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
386
387		/* Skip instructions writing to non-existing destination */
388		if (!valid_dst(compiler->code, &vpi->DstReg))
389			continue;
390
391		if (info->HasDstReg) {
392			/* Neither is Saturate. */
393			if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
394				rc_error(&compiler->Base, "Vertex program does not support the Saturate "
395					 "modifier (yet).\n");
396			}
397		}
398
399		if (compiler->code->length >= c->max_alu_insts * 4) {
400			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
401			return;
402		}
403
404		assert(compiler->Base.is_r500 ||
405		       (vpi->Opcode != RC_OPCODE_SEQ &&
406			vpi->Opcode != RC_OPCODE_SNE));
407
408		switch (vpi->Opcode) {
409		case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
410		case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
411		case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
412		case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
413		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
414		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
415		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
416		case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
417		case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
418		case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
419		case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
420		case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
421		case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
422		case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
423		case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
424		case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
425		case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
426		case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
427		case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
428		case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
429		case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
430		case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
431		case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
432		case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
433		case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
434		case RC_OPCODE_BGNLOOP:
435		{
436			if ((!compiler->Base.is_r500
437				&& loop_depth >= R300_VS_MAX_LOOP_DEPTH)
438				|| loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
439				rc_error(&compiler->Base,
440						"Loops are nested too deep.");
441				return;
442			}
443			loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
444			break;
445		}
446		case RC_OPCODE_ENDLOOP:
447		{
448			unsigned int act_addr;
449			unsigned int last_addr;
450			unsigned int ret_addr;
451
452			ret_addr = loops[--loop_depth];
453			act_addr = ret_addr - 1;
454			last_addr = (compiler->code->length / 4) - 1;
455
456			if (loop_depth >= R300_VS_MAX_FC_OPS) {
457				rc_error(&compiler->Base,
458					"Too many flow control instructions.");
459				return;
460			}
461			if (compiler->Base.is_r500) {
462				compiler->code->fc_op_addrs.r500
463					[compiler->code->num_fc_ops].lw =
464					R500_PVS_FC_ACT_ADRS(act_addr)
465					| R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
466					;
467				compiler->code->fc_op_addrs.r500
468					[compiler->code->num_fc_ops].uw =
469					R500_PVS_FC_LAST_INST(last_addr)
470					| R500_PVS_FC_RTN_INST(ret_addr)
471					;
472			} else {
473				compiler->code->fc_op_addrs.r300
474					[compiler->code->num_fc_ops] =
475					R300_PVS_FC_ACT_ADRS(act_addr)
476					| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
477					| R300_PVS_FC_LAST_INST(last_addr)
478					| R300_PVS_FC_RTN_INST(ret_addr)
479					;
480			}
481			compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
482				R300_PVS_FC_LOOP_INIT_VAL(0x0)
483				| R300_PVS_FC_LOOP_STEP_VAL(0x1)
484				;
485			compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
486						compiler->code->num_fc_ops);
487			compiler->code->num_fc_ops++;
488
489			break;
490		}
491
492		case RC_ME_PRED_SET_CLR:
493			ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
494			break;
495
496		case RC_ME_PRED_SET_INV:
497			ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
498			break;
499
500		case RC_ME_PRED_SET_POP:
501			ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
502			break;
503
504		case RC_ME_PRED_SET_RESTORE:
505			ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
506			break;
507
508		case RC_ME_PRED_SEQ:
509			ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
510			break;
511
512		case RC_ME_PRED_SNEQ:
513			ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
514			break;
515
516		case RC_VE_PRED_SNEQ_PUSH:
517			ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
518								vpi, inst);
519			break;
520
521		default:
522			rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
523			return;
524		}
525
526		if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
527			inst[0] |= (PVS_DST_PRED_ENABLE_MASK
528						<< PVS_DST_PRED_ENABLE_SHIFT);
529			if (vpi->DstReg.Pred == RC_PRED_SET) {
530				inst[0] |= (PVS_DST_PRED_SENSE_MASK
531						<< PVS_DST_PRED_SENSE_SHIFT);
532			}
533		}
534
535		/* Update the number of temporaries. */
536		if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
537		    vpi->DstReg.Index >= compiler->code->num_temporaries)
538			compiler->code->num_temporaries = vpi->DstReg.Index + 1;
539
540		for (unsigned i = 0; i < info->NumSrcRegs; i++)
541			if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
542			    vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
543				compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
544
545		if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
546			rc_error(&compiler->Base, "Too many temporaries.\n");
547			return;
548		}
549
550		compiler->code->length += 4;
551
552		if (compiler->Base.Error)
553			return;
554	}
555}
556
/**
 * Allocation state of one original (pre-allocation) temporary register.
 */
struct temporary_allocation {
	/** Set once a hardware temporary has been assigned. */
	unsigned int Allocated:1;
	/** Index of the assigned hardware temporary. */
	unsigned int HwTemp:15;
	/** Instruction after which the register is no longer read. */
	struct rc_instruction *LastRead;
};
562
563static int get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps,
564                   unsigned int orig)
565{
566    if (!ta[orig].Allocated) {
567        int j;
568        for (j = 0; j < c->max_temp_regs; ++j)
569        {
570            if (!hwtemps[j])
571                break;
572        }
573        ta[orig].Allocated = 1;
574        ta[orig].HwTemp = j;
575        hwtemps[ta[orig].HwTemp] = true;
576    }
577
578    return ta[orig].HwTemp;
579}
580
581static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
582{
583	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
584	struct rc_instruction *inst;
585	struct rc_instruction *end_loop = NULL;
586	unsigned int num_orig_temps = 0;
587	bool hwtemps[RC_REGISTER_MAX_INDEX];
588	struct temporary_allocation * ta;
589	unsigned int i;
590
591	memset(hwtemps, 0, sizeof(hwtemps));
592
593	rc_recompute_ips(c);
594
595	/* Pass 1: Count original temporaries. */
596	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
597		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
598
599		for (i = 0; i < opcode->NumSrcRegs; ++i) {
600			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
601				if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
602					num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
603			}
604		}
605
606		if (opcode->HasDstReg) {
607			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
608				if (inst->U.I.DstReg.Index >= num_orig_temps)
609					num_orig_temps = inst->U.I.DstReg.Index + 1;
610			}
611		}
612	}
613
614	ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
615			sizeof(struct temporary_allocation) * num_orig_temps);
616	memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
617
618	/* Pass 2: Determine original temporary lifetimes */
619	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
620		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
621		/* Instructions inside of loops need to use the ENDLOOP
622		 * instruction as their LastRead. */
623		if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
624			int endloops = 1;
625			struct rc_instruction * ptr;
626			for(ptr = inst->Next;
627				ptr != &compiler->Base.Program.Instructions;
628							ptr = ptr->Next){
629				if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
630					endloops++;
631				} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
632					endloops--;
633					if (endloops <= 0) {
634						end_loop = ptr;
635						break;
636					}
637				}
638			}
639		}
640
641		if (inst == end_loop) {
642			end_loop = NULL;
643			continue;
644		}
645
646		for (i = 0; i < opcode->NumSrcRegs; ++i) {
647			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
648				ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
649			}
650		}
651	}
652
653	/* Pass 3: Register allocation */
654	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
655		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
656
657		for (i = 0; i < opcode->NumSrcRegs; ++i) {
658			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
659				unsigned int orig = inst->U.I.SrcReg[i].Index;
660				inst->U.I.SrcReg[i].Index = get_reg(c, ta, hwtemps, orig);
661
662				if (ta[orig].Allocated && inst == ta[orig].LastRead)
663					hwtemps[ta[orig].HwTemp] = false;
664			}
665		}
666
667		if (opcode->HasDstReg) {
668			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
669				unsigned int orig = inst->U.I.DstReg.Index;
670				inst->U.I.DstReg.Index = get_reg(c, ta, hwtemps, orig);
671			}
672		}
673	}
674}
675
676/**
677 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
678 * and the Saturate opcode modifier. Only Absolute is currently transformed.
679 */
680static int transform_nonnative_modifiers(
681	struct radeon_compiler *c,
682	struct rc_instruction *inst,
683	void* unused)
684{
685	const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
686	unsigned i;
687
688	/* Transform ABS(a) to MAX(a, -a). */
689	for (i = 0; i < opcode->NumSrcRegs; i++) {
690		if (inst->U.I.SrcReg[i].Abs) {
691			struct rc_instruction *new_inst;
692			unsigned temp;
693
694			inst->U.I.SrcReg[i].Abs = 0;
695
696			temp = rc_find_free_temporary(c);
697
698			new_inst = rc_insert_new_instruction(c, inst->Prev);
699			new_inst->U.I.Opcode = RC_OPCODE_MAX;
700			new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
701			new_inst->U.I.DstReg.Index = temp;
702			new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
703			new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
704			new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
705
706			inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
707			inst->U.I.SrcReg[i].Index = temp;
708			inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
709			inst->U.I.SrcReg[i].RelAddr = 0;
710		}
711	}
712	return 1;
713}
714
715/**
716 * Vertex engine cannot read two inputs or two constants at the same time.
717 * Introduce intermediate MOVs to temporary registers to account for this.
718 */
719static int transform_source_conflicts(
720	struct radeon_compiler *c,
721	struct rc_instruction* inst,
722	void* unused)
723{
724	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
725
726	if (opcode->NumSrcRegs == 3) {
727		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
728		    || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
729			int tmpreg = rc_find_free_temporary(c);
730			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
731			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
732			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
733			inst_mov->U.I.DstReg.Index = tmpreg;
734			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
735			inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
736			inst_mov->U.I.SrcReg[0].Negate = 0;
737			inst_mov->U.I.SrcReg[0].Abs = 0;
738
739			inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
740			inst->U.I.SrcReg[2].Index = tmpreg;
741			inst->U.I.SrcReg[2].RelAddr = false;
742		}
743	}
744
745	if (opcode->NumSrcRegs >= 2) {
746		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
747			int tmpreg = rc_find_free_temporary(c);
748			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
749			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
750			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
751			inst_mov->U.I.DstReg.Index = tmpreg;
752			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
753			inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
754			inst_mov->U.I.SrcReg[0].Negate = 0;
755			inst_mov->U.I.SrcReg[0].Abs = 0;
756
757			inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
758			inst->U.I.SrcReg[1].Index = tmpreg;
759			inst->U.I.SrcReg[1].RelAddr = false;
760		}
761	}
762
763	return 1;
764}
765
766static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
767{
768	struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
769	int i;
770
771	for(i = 0; i < 32; ++i) {
772		if ((compiler->RequiredOutputs & (1U << i)) &&
773		    !(compiler->Base.Program.OutputsWritten & (1U << i))) {
774			struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
775			inst->U.I.Opcode = RC_OPCODE_MOV;
776
777			inst->U.I.DstReg.File = RC_FILE_OUTPUT;
778			inst->U.I.DstReg.Index = i;
779			inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
780
781			inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
782			inst->U.I.SrcReg[0].Index = 0;
783			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
784
785			compiler->Base.Program.OutputsWritten |= 1U << i;
786		}
787	}
788}
789
790static void dataflow_outputs_mark_used(void * userdata, void * data,
791		void (*callback)(void *, unsigned int, unsigned int))
792{
793	struct r300_vertex_program_compiler * c = userdata;
794	int i;
795
796	for(i = 0; i < 32; ++i) {
797		if (c->RequiredOutputs & (1U << i))
798			callback(data, i, RC_MASK_XYZW);
799	}
800}
801
802static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
803{
804	(void) opcode;
805	(void) reg;
806
807	return 1;
808}
809
810static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
811					  struct rc_instruction *arl,
812					  struct rc_instruction *end,
813					  int min_offset)
814{
815	struct rc_instruction *inst, *add;
816	unsigned const_swizzle;
817
818	/* Transform ARL/ARR */
819	add = rc_insert_new_instruction(&c->Base, arl->Prev);
820	add->U.I.Opcode = RC_OPCODE_ADD;
821	add->U.I.DstReg.File = RC_FILE_TEMPORARY;
822	add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
823	add->U.I.DstReg.WriteMask = RC_MASK_X;
824	add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
825	add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
826	add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
827								     min_offset, &const_swizzle);
828	add->U.I.SrcReg[1].Swizzle = const_swizzle;
829
830	arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
831	arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
832	arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
833
834	/* Rewrite offsets up to and excluding inst. */
835	for (inst = arl->Next; inst != end; inst = inst->Next) {
836		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
837
838		for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
839			if (inst->U.I.SrcReg[i].RelAddr)
840				inst->U.I.SrcReg[i].Index -= min_offset;
841	}
842}
843
844static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
845{
846	struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
847	struct rc_instruction *inst, *lastARL = NULL;
848	int min_offset = 0;
849
850	for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
851		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
852
853		if (inst->U.I.Opcode == RC_OPCODE_ARL || inst->U.I.Opcode == RC_OPCODE_ARR) {
854			if (lastARL != NULL && min_offset < 0)
855				transform_negative_addressing(c, lastARL, inst, min_offset);
856
857			lastARL = inst;
858			min_offset = 0;
859			continue;
860		}
861
862		for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
863			if (inst->U.I.SrcReg[i].RelAddr &&
864			    inst->U.I.SrcReg[i].Index < 0) {
865				/* ARL must precede any indirect addressing. */
866				if (!lastARL) {
867					rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL/ARR.");
868					return;
869				}
870
871				if (inst->U.I.SrcReg[i].Index < min_offset)
872					min_offset = inst->U.I.SrcReg[i].Index;
873			}
874		}
875	}
876
877	if (lastARL != NULL && min_offset < 0)
878		transform_negative_addressing(c, lastARL, inst, min_offset);
879}
880
881const struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
882	.IsNative = &swizzle_is_native,
883	.Split = 0 /* should never be called */
884};
885
886void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
887{
888	int is_r500 = c->Base.is_r500;
889	int opt = !c->Base.disable_optimizations;
890
891	/* Lists of instruction transformations. */
892	struct radeon_program_transformation alu_rewrite_r500[] = {
893		{ &r300_transform_vertex_alu, 0 },
894		{ &r300_transform_trig_scale_vertex, 0 },
895		{ 0, 0 }
896	};
897
898	struct radeon_program_transformation alu_rewrite_r300[] = {
899		{ &r300_transform_vertex_alu, 0 },
900		{ &r300_transform_trig_simple, 0 },
901		{ 0, 0 }
902	};
903
904	/* Note: These passes have to be done seperately from ALU rewrite,
905	 * otherwise non-native ALU instructions with source conflits
906	 * or non-native modifiers will not be treated properly.
907	 */
908	struct radeon_program_transformation emulate_modifiers[] = {
909		{ &transform_nonnative_modifiers, 0 },
910		{ 0, 0 }
911	};
912
913	struct radeon_program_transformation resolve_src_conflicts[] = {
914		{ &transform_source_conflicts, 0 },
915		{ 0, 0 }
916	};
917
918	/* List of compiler passes. */
919	struct radeon_compiler_pass vs_list[] = {
920		/* NAME				DUMP PREDICATE	FUNCTION			PARAM */
921		{"add artificial outputs",	0, 1,		rc_vs_add_artificial_outputs,	NULL},
922		{"emulate branches",		1, !is_r500,	rc_emulate_branches,		NULL},
923		{"emulate negative addressing", 1, 1,		rc_emulate_negative_addressing,	NULL},
924		{"native rewrite",		1, is_r500,	rc_local_transform,		alu_rewrite_r500},
925		{"native rewrite",		1, !is_r500,	rc_local_transform,		alu_rewrite_r300},
926		{"emulate modifiers",		1, !is_r500,	rc_local_transform,		emulate_modifiers},
927		{"deadcode",			1, opt,		rc_dataflow_deadcode,		dataflow_outputs_mark_used},
928		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
929		/* This pass must be done after optimizations. */
930		{"source conflict resolve",	1, 1,		rc_local_transform,		resolve_src_conflicts},
931		{"register allocation",		1, opt,		allocate_temporary_registers,	NULL},
932		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
933		{"lower control flow opcodes",	1, is_r500,	rc_vert_fc,			NULL},
934		{"final code validation",	0, 1,		rc_validate_final_shader,	NULL},
935		{"machine code generation",	0, 1,		translate_vertex_program,	NULL},
936		{"dump machine code",		0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,	NULL},
937		{NULL, 0, 0, NULL, NULL}
938	};
939
940	c->Base.type = RC_VERTEX_PROGRAM;
941	c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
942
943	rc_run_compiler(&c->Base, vs_list);
944
945	c->code->InputsRead = c->Base.Program.InputsRead;
946	c->code->OutputsWritten = c->Base.Program.OutputsWritten;
947	rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
948}
949