1/* 2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */ 22 23#include "radeon_compiler.h" 24 25#include <stdbool.h> 26#include <stdio.h> 27 28#include "r300_reg.h" 29 30#include "radeon_compiler_util.h" 31#include "radeon_dataflow.h" 32#include "radeon_program.h" 33#include "radeon_program_alu.h" 34#include "radeon_swizzle.h" 35#include "radeon_emulate_branches.h" 36#include "radeon_emulate_loops.h" 37#include "radeon_remove_constants.h" 38 39#include "util/compiler.h" 40 41/* 42 * Take an already-setup and valid source then swizzle it appropriately to 43 * obtain a constant ZERO or ONE source. 44 */ 45#define __CONST(x, y) \ 46 (PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \ 47 t_swizzle(y), \ 48 t_swizzle(y), \ 49 t_swizzle(y), \ 50 t_swizzle(y), \ 51 t_src_class(vpi->SrcReg[x].File), \ 52 RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4)) 53 54 55static unsigned long t_dst_mask(unsigned int mask) 56{ 57 /* RC_MASK_* is equivalent to VSF_FLAG_* */ 58 return mask & RC_MASK_XYZW; 59} 60 61static unsigned long t_dst_class(rc_register_file file) 62{ 63 switch (file) { 64 default: 65 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file); 66 FALLTHROUGH; 67 case RC_FILE_TEMPORARY: 68 return PVS_DST_REG_TEMPORARY; 69 case RC_FILE_OUTPUT: 70 return PVS_DST_REG_OUT; 71 case RC_FILE_ADDRESS: 72 return PVS_DST_REG_A0; 73 } 74} 75 76static unsigned long t_dst_index(struct r300_vertex_program_code *vp, 77 struct rc_dst_register *dst) 78{ 79 if (dst->File == RC_FILE_OUTPUT) 80 return vp->outputs[dst->Index]; 81 82 return dst->Index; 83} 84 85static unsigned long t_src_class(rc_register_file file) 86{ 87 switch (file) { 88 default: 89 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file); 90 FALLTHROUGH; 91 case RC_FILE_NONE: 92 case RC_FILE_TEMPORARY: 93 return PVS_SRC_REG_TEMPORARY; 94 case RC_FILE_INPUT: 95 return PVS_SRC_REG_INPUT; 96 case RC_FILE_CONSTANT: 97 return PVS_SRC_REG_CONSTANT; 98 } 99} 100 101static int t_src_conflict(struct rc_src_register a, struct rc_src_register b) 102{ 103 unsigned long aclass = t_src_class(a.File); 104 unsigned long bclass = t_src_class(b.File); 105 106 if (aclass != bclass) 107 return 0; 108 if (aclass == PVS_SRC_REG_TEMPORARY) 109 return 0; 110 111 if (a.RelAddr || b.RelAddr) 112 return 1; 113 if (a.Index != b.Index) 114 return 1; 115 116 return 0; 117} 118 119static inline unsigned long t_swizzle(unsigned int swizzle) 120{ 121 /* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */ 122 return swizzle; 123} 124 125static unsigned long t_src_index(struct r300_vertex_program_code *vp, 126 struct rc_src_register *src) 127{ 128 if (src->File == RC_FILE_INPUT) { 129 assert(vp->inputs[src->Index] != -1); 130 return vp->inputs[src->Index]; 131 } else { 132 if (src->Index < 0) { 133 fprintf(stderr, 134 "negative offsets for indirect addressing do not work.\n"); 135 return 0; 136 } 137 return src->Index; 138 } 139} 140 141/* these two functions should probably be merged... */ 142 143static unsigned long t_src(struct r300_vertex_program_code *vp, 144 struct rc_src_register *src) 145{ 146 /* src->Negate uses the RC_MASK_ flags from program_instruction.h, 147 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here. 148 */ 149 return PVS_SRC_OPERAND(t_src_index(vp, src), 150 t_swizzle(GET_SWZ(src->Swizzle, 0)), 151 t_swizzle(GET_SWZ(src->Swizzle, 1)), 152 t_swizzle(GET_SWZ(src->Swizzle, 2)), 153 t_swizzle(GET_SWZ(src->Swizzle, 3)), 154 t_src_class(src->File), 155 src->Negate) | 156 (src->RelAddr << 4) | (src->Abs << 3); 157} 158 159static unsigned long t_src_scalar(struct r300_vertex_program_code *vp, 160 struct rc_src_register *src) 161{ 162 /* src->Negate uses the RC_MASK_ flags from program_instruction.h, 163 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here. 164 */ 165 unsigned int swz = rc_get_scalar_src_swz(src->Swizzle); 166 167 return PVS_SRC_OPERAND(t_src_index(vp, src), 168 t_swizzle(swz), 169 t_swizzle(swz), 170 t_swizzle(swz), 171 t_swizzle(swz), 172 t_src_class(src->File), 173 src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 174 (src->RelAddr << 4) | (src->Abs << 3); 175} 176 177static int valid_dst(struct r300_vertex_program_code *vp, 178 struct rc_dst_register *dst) 179{ 180 if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) { 181 return 0; 182 } else if (dst->File == RC_FILE_ADDRESS) { 183 assert(dst->Index == 0); 184 } 185 186 return 1; 187} 188 189static void ei_vector1(struct r300_vertex_program_code *vp, 190 unsigned int hw_opcode, 191 struct rc_sub_instruction *vpi, 192 unsigned int * inst) 193{ 194 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 195 0, 196 0, 197 t_dst_index(vp, &vpi->DstReg), 198 t_dst_mask(vpi->DstReg.WriteMask), 199 t_dst_class(vpi->DstReg.File), 200 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 201 inst[1] = t_src(vp, &vpi->SrcReg[0]); 202 inst[2] = __CONST(0, RC_SWIZZLE_ZERO); 203 inst[3] = __CONST(0, RC_SWIZZLE_ZERO); 204} 205 206static void ei_vector2(struct r300_vertex_program_code *vp, 207 unsigned int hw_opcode, 208 struct rc_sub_instruction *vpi, 209 unsigned int * inst) 210{ 211 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 212 0, 213 0, 214 t_dst_index(vp, &vpi->DstReg), 215 t_dst_mask(vpi->DstReg.WriteMask), 216 t_dst_class(vpi->DstReg.File), 217 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 218 inst[1] = t_src(vp, &vpi->SrcReg[0]); 219 inst[2] = t_src(vp, &vpi->SrcReg[1]); 220 inst[3] = __CONST(1, RC_SWIZZLE_ZERO); 221} 222 223static void ei_math1(struct r300_vertex_program_code *vp, 224 unsigned int hw_opcode, 225 struct rc_sub_instruction *vpi, 226 unsigned int * inst) 227{ 228 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 229 1, 230 0, 231 t_dst_index(vp, &vpi->DstReg), 232 t_dst_mask(vpi->DstReg.WriteMask), 233 t_dst_class(vpi->DstReg.File), 234 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 235 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]); 236 inst[2] = __CONST(0, RC_SWIZZLE_ZERO); 237 inst[3] = __CONST(0, RC_SWIZZLE_ZERO); 238} 239 240static void ei_lit(struct r300_vertex_program_code *vp, 241 struct rc_sub_instruction *vpi, 242 unsigned int * inst) 243{ 244 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W} 245 246 inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX, 247 1, 248 0, 249 t_dst_index(vp, &vpi->DstReg), 250 t_dst_mask(vpi->DstReg.WriteMask), 251 t_dst_class(vpi->DstReg.File), 252 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 253 /* NOTE: Users swizzling might not work. */ 254 inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X 255 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W 256 PVS_SRC_SELECT_FORCE_0, // Z 257 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y 258 t_src_class(vpi->SrcReg[0].File), 259 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 260 (vpi->SrcReg[0].RelAddr << 4); 261 inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y 262 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W 263 PVS_SRC_SELECT_FORCE_0, // Z 264 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X 265 t_src_class(vpi->SrcReg[0].File), 266 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 267 (vpi->SrcReg[0].RelAddr << 4); 268 inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y 269 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X 270 PVS_SRC_SELECT_FORCE_0, // Z 271 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W 272 t_src_class(vpi->SrcReg[0].File), 273 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 274 (vpi->SrcReg[0].RelAddr << 4); 275} 276 277static void ei_mad(struct r300_vertex_program_code *vp, 278 struct rc_sub_instruction *vpi, 279 unsigned int * inst) 280{ 281 unsigned int i; 282 /* Remarks about hardware limitations of MAD 283 * (please preserve this comment, as this information is _NOT_ 284 * in the documentation provided by AMD). 285 * 286 * As described in the documentation, MAD with three unique temporary 287 * source registers requires the use of the macro version. 288 * 289 * However (and this is not mentioned in the documentation), apparently 290 * the macro version is _NOT_ a full superset of the normal version. 291 * In particular, the macro version does not always work when relative 292 * addressing is used in the source operands. 293 * 294 * This limitation caused incorrect rendering in Sauerbraten's OpenGL 295 * assembly shader path when using medium quality animations 296 * (i.e. animations with matrix blending instead of quaternion blending). 297 * 298 * Unfortunately, I (nha) have been unable to extract a Piglit regression 299 * test for this issue - for some reason, it is possible to have vertex 300 * programs whose prefix is *exactly* the same as the prefix of the 301 * offending program in Sauerbraten up to the offending instruction 302 * without causing any trouble. 303 * 304 * Bottom line: Only use the macro version only when really necessary; 305 * according to AMD docs, this should improve performance by one clock 306 * as a nice side bonus. 307 */ 308 if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY && 309 vpi->SrcReg[1].File == RC_FILE_TEMPORARY && 310 vpi->SrcReg[2].File == RC_FILE_TEMPORARY && 311 vpi->SrcReg[0].Index != vpi->SrcReg[1].Index && 312 vpi->SrcReg[0].Index != vpi->SrcReg[2].Index && 313 vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) { 314 inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD, 315 0, 316 1, 317 t_dst_index(vp, &vpi->DstReg), 318 t_dst_mask(vpi->DstReg.WriteMask), 319 t_dst_class(vpi->DstReg.File), 320 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 321 } else { 322 inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD, 323 0, 324 0, 325 t_dst_index(vp, &vpi->DstReg), 326 t_dst_mask(vpi->DstReg.WriteMask), 327 t_dst_class(vpi->DstReg.File), 328 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 329 330 /* Arguments with constant swizzles still count as a unique 331 * temporary, so we should make sure these arguments share a 332 * register index with one of the other arguments. */ 333 for (i = 0; i < 3; i++) { 334 unsigned int j; 335 if (vpi->SrcReg[i].File != RC_FILE_NONE) 336 continue; 337 338 for (j = 0; j < 3; j++) { 339 if (i != j) { 340 vpi->SrcReg[i].Index = 341 vpi->SrcReg[j].Index; 342 break; 343 } 344 } 345 } 346 } 347 inst[1] = t_src(vp, &vpi->SrcReg[0]); 348 inst[2] = t_src(vp, &vpi->SrcReg[1]); 349 inst[3] = t_src(vp, &vpi->SrcReg[2]); 350} 351 352static void ei_pow(struct r300_vertex_program_code *vp, 353 struct rc_sub_instruction *vpi, 354 unsigned int * inst) 355{ 356 inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF, 357 1, 358 0, 359 t_dst_index(vp, &vpi->DstReg), 360 t_dst_mask(vpi->DstReg.WriteMask), 361 t_dst_class(vpi->DstReg.File), 362 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 363 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]); 364 inst[2] = __CONST(0, RC_SWIZZLE_ZERO); 365 inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]); 366} 367 368static void translate_vertex_program(struct radeon_compiler *c, void *user) 369{ 370 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c; 371 struct rc_instruction *rci; 372 373 unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {}; 374 unsigned loop_depth = 0; 375 376 compiler->code->pos_end = 0; /* Not supported yet */ 377 compiler->code->length = 0; 378 compiler->code->num_temporaries = 0; 379 380 compiler->SetHwInputOutput(compiler); 381 382 for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) { 383 struct rc_sub_instruction *vpi = &rci->U.I; 384 unsigned int *inst = compiler->code->body.d + compiler->code->length; 385 const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode); 386 387 /* Skip instructions writing to non-existing destination */ 388 if (!valid_dst(compiler->code, &vpi->DstReg)) 389 continue; 390 391 if (info->HasDstReg) { 392 /* Neither is Saturate. */ 393 if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) { 394 rc_error(&compiler->Base, "Vertex program does not support the Saturate " 395 "modifier (yet).\n"); 396 } 397 } 398 399 if (compiler->code->length >= c->max_alu_insts * 4) { 400 rc_error(&compiler->Base, "Vertex program has too many instructions\n"); 401 return; 402 } 403 404 assert(compiler->Base.is_r500 || 405 (vpi->Opcode != RC_OPCODE_SEQ && 406 vpi->Opcode != RC_OPCODE_SNE)); 407 408 switch (vpi->Opcode) { 409 case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break; 410 case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break; 411 case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break; 412 case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break; 413 case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break; 414 case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break; 415 case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break; 416 case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break; 417 case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break; 418 case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break; 419 case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break; 420 case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break; 421 case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break; 422 case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break; 423 case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break; 424 case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break; 425 case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break; 426 case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break; 427 case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break; 428 case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break; 429 case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break; 430 case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break; 431 case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break; 432 case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break; 433 case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break; 434 case RC_OPCODE_BGNLOOP: 435 { 436 if ((!compiler->Base.is_r500 437 && loop_depth >= R300_VS_MAX_LOOP_DEPTH) 438 || loop_depth >= R500_PVS_MAX_LOOP_DEPTH) { 439 rc_error(&compiler->Base, 440 "Loops are nested too deep."); 441 return; 442 } 443 loops[loop_depth++] = ((compiler->code->length)/ 4) + 1; 444 break; 445 } 446 case RC_OPCODE_ENDLOOP: 447 { 448 unsigned int act_addr; 449 unsigned int last_addr; 450 unsigned int ret_addr; 451 452 ret_addr = loops[--loop_depth]; 453 act_addr = ret_addr - 1; 454 last_addr = (compiler->code->length / 4) - 1; 455 456 if (loop_depth >= R300_VS_MAX_FC_OPS) { 457 rc_error(&compiler->Base, 458 "Too many flow control instructions."); 459 return; 460 } 461 if (compiler->Base.is_r500) { 462 compiler->code->fc_op_addrs.r500 463 [compiler->code->num_fc_ops].lw = 464 R500_PVS_FC_ACT_ADRS(act_addr) 465 | R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff) 466 ; 467 compiler->code->fc_op_addrs.r500 468 [compiler->code->num_fc_ops].uw = 469 R500_PVS_FC_LAST_INST(last_addr) 470 | R500_PVS_FC_RTN_INST(ret_addr) 471 ; 472 } else { 473 compiler->code->fc_op_addrs.r300 474 [compiler->code->num_fc_ops] = 475 R300_PVS_FC_ACT_ADRS(act_addr) 476 | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff) 477 | R300_PVS_FC_LAST_INST(last_addr) 478 | R300_PVS_FC_RTN_INST(ret_addr) 479 ; 480 } 481 compiler->code->fc_loop_index[compiler->code->num_fc_ops] = 482 R300_PVS_FC_LOOP_INIT_VAL(0x0) 483 | R300_PVS_FC_LOOP_STEP_VAL(0x1) 484 ; 485 compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP( 486 compiler->code->num_fc_ops); 487 compiler->code->num_fc_ops++; 488 489 break; 490 } 491 492 case RC_ME_PRED_SET_CLR: 493 ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst); 494 break; 495 496 case RC_ME_PRED_SET_INV: 497 ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst); 498 break; 499 500 case RC_ME_PRED_SET_POP: 501 ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst); 502 break; 503 504 case RC_ME_PRED_SET_RESTORE: 505 ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst); 506 break; 507 508 case RC_ME_PRED_SEQ: 509 ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst); 510 break; 511 512 case RC_ME_PRED_SNEQ: 513 ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst); 514 break; 515 516 case RC_VE_PRED_SNEQ_PUSH: 517 ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH, 518 vpi, inst); 519 break; 520 521 default: 522 rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name); 523 return; 524 } 525 526 if (vpi->DstReg.Pred != RC_PRED_DISABLED) { 527 inst[0] |= (PVS_DST_PRED_ENABLE_MASK 528 << PVS_DST_PRED_ENABLE_SHIFT); 529 if (vpi->DstReg.Pred == RC_PRED_SET) { 530 inst[0] |= (PVS_DST_PRED_SENSE_MASK 531 << PVS_DST_PRED_SENSE_SHIFT); 532 } 533 } 534 535 /* Update the number of temporaries. */ 536 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY && 537 vpi->DstReg.Index >= compiler->code->num_temporaries) 538 compiler->code->num_temporaries = vpi->DstReg.Index + 1; 539 540 for (unsigned i = 0; i < info->NumSrcRegs; i++) 541 if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY && 542 vpi->SrcReg[i].Index >= compiler->code->num_temporaries) 543 compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1; 544 545 if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) { 546 rc_error(&compiler->Base, "Too many temporaries.\n"); 547 return; 548 } 549 550 compiler->code->length += 4; 551 552 if (compiler->Base.Error) 553 return; 554 } 555} 556 557struct temporary_allocation { 558 unsigned int Allocated:1; 559 unsigned int HwTemp:15; 560 struct rc_instruction * LastRead; 561}; 562 563static int get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps, 564 unsigned int orig) 565{ 566 if (!ta[orig].Allocated) { 567 int j; 568 for (j = 0; j < c->max_temp_regs; ++j) 569 { 570 if (!hwtemps[j]) 571 break; 572 } 573 ta[orig].Allocated = 1; 574 ta[orig].HwTemp = j; 575 hwtemps[ta[orig].HwTemp] = true; 576 } 577 578 return ta[orig].HwTemp; 579} 580 581static void allocate_temporary_registers(struct radeon_compiler *c, void *user) 582{ 583 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c; 584 struct rc_instruction *inst; 585 struct rc_instruction *end_loop = NULL; 586 unsigned int num_orig_temps = 0; 587 bool hwtemps[RC_REGISTER_MAX_INDEX]; 588 struct temporary_allocation * ta; 589 unsigned int i; 590 591 memset(hwtemps, 0, sizeof(hwtemps)); 592 593 rc_recompute_ips(c); 594 595 /* Pass 1: Count original temporaries. */ 596 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { 597 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 598 599 for (i = 0; i < opcode->NumSrcRegs; ++i) { 600 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { 601 if (inst->U.I.SrcReg[i].Index >= num_orig_temps) 602 num_orig_temps = inst->U.I.SrcReg[i].Index + 1; 603 } 604 } 605 606 if (opcode->HasDstReg) { 607 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) { 608 if (inst->U.I.DstReg.Index >= num_orig_temps) 609 num_orig_temps = inst->U.I.DstReg.Index + 1; 610 } 611 } 612 } 613 614 ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool, 615 sizeof(struct temporary_allocation) * num_orig_temps); 616 memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps); 617 618 /* Pass 2: Determine original temporary lifetimes */ 619 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { 620 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 621 /* Instructions inside of loops need to use the ENDLOOP 622 * instruction as their LastRead. */ 623 if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) { 624 int endloops = 1; 625 struct rc_instruction * ptr; 626 for(ptr = inst->Next; 627 ptr != &compiler->Base.Program.Instructions; 628 ptr = ptr->Next){ 629 if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) { 630 endloops++; 631 } else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) { 632 endloops--; 633 if (endloops <= 0) { 634 end_loop = ptr; 635 break; 636 } 637 } 638 } 639 } 640 641 if (inst == end_loop) { 642 end_loop = NULL; 643 continue; 644 } 645 646 for (i = 0; i < opcode->NumSrcRegs; ++i) { 647 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { 648 ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst; 649 } 650 } 651 } 652 653 /* Pass 3: Register allocation */ 654 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { 655 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 656 657 for (i = 0; i < opcode->NumSrcRegs; ++i) { 658 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { 659 unsigned int orig = inst->U.I.SrcReg[i].Index; 660 inst->U.I.SrcReg[i].Index = get_reg(c, ta, hwtemps, orig); 661 662 if (ta[orig].Allocated && inst == ta[orig].LastRead) 663 hwtemps[ta[orig].HwTemp] = false; 664 } 665 } 666 667 if (opcode->HasDstReg) { 668 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) { 669 unsigned int orig = inst->U.I.DstReg.Index; 670 inst->U.I.DstReg.Index = get_reg(c, ta, hwtemps, orig); 671 } 672 } 673 } 674} 675 676/** 677 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier 678 * and the Saturate opcode modifier. Only Absolute is currently transformed. 679 */ 680static int transform_nonnative_modifiers( 681 struct radeon_compiler *c, 682 struct rc_instruction *inst, 683 void* unused) 684{ 685 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode); 686 unsigned i; 687 688 /* Transform ABS(a) to MAX(a, -a). */ 689 for (i = 0; i < opcode->NumSrcRegs; i++) { 690 if (inst->U.I.SrcReg[i].Abs) { 691 struct rc_instruction *new_inst; 692 unsigned temp; 693 694 inst->U.I.SrcReg[i].Abs = 0; 695 696 temp = rc_find_free_temporary(c); 697 698 new_inst = rc_insert_new_instruction(c, inst->Prev); 699 new_inst->U.I.Opcode = RC_OPCODE_MAX; 700 new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY; 701 new_inst->U.I.DstReg.Index = temp; 702 new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i]; 703 new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i]; 704 new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW; 705 706 inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY; 707 inst->U.I.SrcReg[i].Index = temp; 708 inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW; 709 inst->U.I.SrcReg[i].RelAddr = 0; 710 } 711 } 712 return 1; 713} 714 715/** 716 * Vertex engine cannot read two inputs or two constants at the same time. 717 * Introduce intermediate MOVs to temporary registers to account for this. 718 */ 719static int transform_source_conflicts( 720 struct radeon_compiler *c, 721 struct rc_instruction* inst, 722 void* unused) 723{ 724 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 725 726 if (opcode->NumSrcRegs == 3) { 727 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]) 728 || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) { 729 int tmpreg = rc_find_free_temporary(c); 730 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev); 731 inst_mov->U.I.Opcode = RC_OPCODE_MOV; 732 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY; 733 inst_mov->U.I.DstReg.Index = tmpreg; 734 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2]; 735 inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW; 736 inst_mov->U.I.SrcReg[0].Negate = 0; 737 inst_mov->U.I.SrcReg[0].Abs = 0; 738 739 inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY; 740 inst->U.I.SrcReg[2].Index = tmpreg; 741 inst->U.I.SrcReg[2].RelAddr = false; 742 } 743 } 744 745 if (opcode->NumSrcRegs >= 2) { 746 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) { 747 int tmpreg = rc_find_free_temporary(c); 748 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev); 749 inst_mov->U.I.Opcode = RC_OPCODE_MOV; 750 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY; 751 inst_mov->U.I.DstReg.Index = tmpreg; 752 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1]; 753 inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW; 754 inst_mov->U.I.SrcReg[0].Negate = 0; 755 inst_mov->U.I.SrcReg[0].Abs = 0; 756 757 inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY; 758 inst->U.I.SrcReg[1].Index = tmpreg; 759 inst->U.I.SrcReg[1].RelAddr = false; 760 } 761 } 762 763 return 1; 764} 765 766static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user) 767{ 768 struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c; 769 int i; 770 771 for(i = 0; i < 32; ++i) { 772 if ((compiler->RequiredOutputs & (1U << i)) && 773 !(compiler->Base.Program.OutputsWritten & (1U << i))) { 774 struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev); 775 inst->U.I.Opcode = RC_OPCODE_MOV; 776 777 inst->U.I.DstReg.File = RC_FILE_OUTPUT; 778 inst->U.I.DstReg.Index = i; 779 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW; 780 781 inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT; 782 inst->U.I.SrcReg[0].Index = 0; 783 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW; 784 785 compiler->Base.Program.OutputsWritten |= 1U << i; 786 } 787 } 788} 789 790static void dataflow_outputs_mark_used(void * userdata, void * data, 791 void (*callback)(void *, unsigned int, unsigned int)) 792{ 793 struct r300_vertex_program_compiler * c = userdata; 794 int i; 795 796 for(i = 0; i < 32; ++i) { 797 if (c->RequiredOutputs & (1U << i)) 798 callback(data, i, RC_MASK_XYZW); 799 } 800} 801 802static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg) 803{ 804 (void) opcode; 805 (void) reg; 806 807 return 1; 808} 809 810static void transform_negative_addressing(struct r300_vertex_program_compiler *c, 811 struct rc_instruction *arl, 812 struct rc_instruction *end, 813 int min_offset) 814{ 815 struct rc_instruction *inst, *add; 816 unsigned const_swizzle; 817 818 /* Transform ARL/ARR */ 819 add = rc_insert_new_instruction(&c->Base, arl->Prev); 820 add->U.I.Opcode = RC_OPCODE_ADD; 821 add->U.I.DstReg.File = RC_FILE_TEMPORARY; 822 add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base); 823 add->U.I.DstReg.WriteMask = RC_MASK_X; 824 add->U.I.SrcReg[0] = arl->U.I.SrcReg[0]; 825 add->U.I.SrcReg[1].File = RC_FILE_CONSTANT; 826 add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants, 827 min_offset, &const_swizzle); 828 add->U.I.SrcReg[1].Swizzle = const_swizzle; 829 830 arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY; 831 arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index; 832 arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX; 833 834 /* Rewrite offsets up to and excluding inst. */ 835 for (inst = arl->Next; inst != end; inst = inst->Next) { 836 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 837 838 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) 839 if (inst->U.I.SrcReg[i].RelAddr) 840 inst->U.I.SrcReg[i].Index -= min_offset; 841 } 842} 843 844static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user) 845{ 846 struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler; 847 struct rc_instruction *inst, *lastARL = NULL; 848 int min_offset = 0; 849 850 for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) { 851 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 852 853 if (inst->U.I.Opcode == RC_OPCODE_ARL || inst->U.I.Opcode == RC_OPCODE_ARR) { 854 if (lastARL != NULL && min_offset < 0) 855 transform_negative_addressing(c, lastARL, inst, min_offset); 856 857 lastARL = inst; 858 min_offset = 0; 859 continue; 860 } 861 862 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) { 863 if (inst->U.I.SrcReg[i].RelAddr && 864 inst->U.I.SrcReg[i].Index < 0) { 865 /* ARL must precede any indirect addressing. */ 866 if (!lastARL) { 867 rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL/ARR."); 868 return; 869 } 870 871 if (inst->U.I.SrcReg[i].Index < min_offset) 872 min_offset = inst->U.I.SrcReg[i].Index; 873 } 874 } 875 } 876 877 if (lastARL != NULL && min_offset < 0) 878 transform_negative_addressing(c, lastARL, inst, min_offset); 879} 880 881const struct rc_swizzle_caps r300_vertprog_swizzle_caps = { 882 .IsNative = &swizzle_is_native, 883 .Split = 0 /* should never be called */ 884}; 885 886void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c) 887{ 888 int is_r500 = c->Base.is_r500; 889 int opt = !c->Base.disable_optimizations; 890 891 /* Lists of instruction transformations. */ 892 struct radeon_program_transformation alu_rewrite_r500[] = { 893 { &r300_transform_vertex_alu, 0 }, 894 { &r300_transform_trig_scale_vertex, 0 }, 895 { 0, 0 } 896 }; 897 898 struct radeon_program_transformation alu_rewrite_r300[] = { 899 { &r300_transform_vertex_alu, 0 }, 900 { &r300_transform_trig_simple, 0 }, 901 { 0, 0 } 902 }; 903 904 /* Note: These passes have to be done seperately from ALU rewrite, 905 * otherwise non-native ALU instructions with source conflits 906 * or non-native modifiers will not be treated properly. 907 */ 908 struct radeon_program_transformation emulate_modifiers[] = { 909 { &transform_nonnative_modifiers, 0 }, 910 { 0, 0 } 911 }; 912 913 struct radeon_program_transformation resolve_src_conflicts[] = { 914 { &transform_source_conflicts, 0 }, 915 { 0, 0 } 916 }; 917 918 /* List of compiler passes. */ 919 struct radeon_compiler_pass vs_list[] = { 920 /* NAME DUMP PREDICATE FUNCTION PARAM */ 921 {"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL}, 922 {"emulate branches", 1, !is_r500, rc_emulate_branches, NULL}, 923 {"emulate negative addressing", 1, 1, rc_emulate_negative_addressing, NULL}, 924 {"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500}, 925 {"native rewrite", 1, !is_r500, rc_local_transform, alu_rewrite_r300}, 926 {"emulate modifiers", 1, !is_r500, rc_local_transform, emulate_modifiers}, 927 {"deadcode", 1, opt, rc_dataflow_deadcode, dataflow_outputs_mark_used}, 928 {"dataflow optimize", 1, opt, rc_optimize, NULL}, 929 /* This pass must be done after optimizations. */ 930 {"source conflict resolve", 1, 1, rc_local_transform, resolve_src_conflicts}, 931 {"register allocation", 1, opt, allocate_temporary_registers, NULL}, 932 {"dead constants", 1, 1, rc_remove_unused_constants, &c->code->constants_remap_table}, 933 {"lower control flow opcodes", 1, is_r500, rc_vert_fc, NULL}, 934 {"final code validation", 0, 1, rc_validate_final_shader, NULL}, 935 {"machine code generation", 0, 1, translate_vertex_program, NULL}, 936 {"dump machine code", 0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump, NULL}, 937 {NULL, 0, 0, NULL, NULL} 938 }; 939 940 c->Base.type = RC_VERTEX_PROGRAM; 941 c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps; 942 943 rc_run_compiler(&c->Base, vs_list); 944 945 c->code->InputsRead = c->Base.Program.InputsRead; 946 c->code->OutputsWritten = c->Base.Program.OutputsWritten; 947 rc_constants_copy(&c->code->constants, &c->Base.Program.Constants); 948} 949