1/* 2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 22 */ 23#include "r600_sq.h" 24#include "r600_formats.h" 25#include "r600_opcodes.h" 26#include "r600_shader.h" 27#include "r600_dump.h" 28#include "r600d.h" 29#include "sfn/sfn_nir.h" 30 31#include "sb/sb_public.h" 32 33#include "pipe/p_shader_tokens.h" 34#include "tgsi/tgsi_info.h" 35#include "tgsi/tgsi_parse.h" 36#include "tgsi/tgsi_scan.h" 37#include "tgsi/tgsi_dump.h" 38#include "tgsi/tgsi_from_mesa.h" 39#include "nir/tgsi_to_nir.h" 40#include "nir/nir_to_tgsi_info.h" 41#include "compiler/nir/nir.h" 42#include "util/u_bitcast.h" 43#include "util/u_memory.h" 44#include "util/u_math.h" 45#include <stdio.h> 46#include <errno.h> 47 48/* CAYMAN notes 49Why CAYMAN got loops for lots of instructions is explained here. 
50 51-These 8xx t-slot only ops are implemented in all vector slots. 52MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT 53These 8xx t-slot only opcodes become vector ops, with all four 54slots expecting the arguments on sources a and b. Result is 55broadcast to all channels. 56MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64 57These 8xx t-slot only opcodes become vector ops in the z, y, and 58x slots. 59EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64 60RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64 61SQRT_IEEE/_64 62SIN/COS 63The w slot may have an independent co-issued operation, or if the 64result is required to be in the w slot, the opcode above may be 65issued in the w slot as well. 66The compiler must issue the source argument to slots z, y, and x 67*/ 68 69/* Contents of r0 on entry to various shaders 70 71 VS - .x = VertexID 72 .y = RelVertexID (??) 73 .w = InstanceID 74 75 GS - r0.xyw, r1.xyz = per-vertex offsets 76 r0.z = PrimitiveID 77 78 TCS - .x = PatchID 79 .y = RelPatchID (??) 80 .z = InvocationID 81 .w = tess factor base. 82 83 TES - .x = TessCoord.x 84 - .y = TessCoord.y 85 - .z = RelPatchID (??) 
86 - .w = PrimitiveID 87 88 PS - face_gpr.z = SampleMask 89 face_gpr.w = SampleID 90*/ 91#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16) 92static int r600_shader_from_tgsi(struct r600_context *rctx, 93 struct r600_pipe_shader *pipeshader, 94 union r600_shader_key key); 95 96static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr, 97 int size, unsigned comp_mask) { 98 99 if (!size) 100 return; 101 102 if (ps->num_arrays == ps->max_arrays) { 103 ps->max_arrays += 64; 104 ps->arrays = realloc(ps->arrays, ps->max_arrays * 105 sizeof(struct r600_shader_array)); 106 } 107 108 int n = ps->num_arrays; 109 ++ps->num_arrays; 110 111 ps->arrays[n].comp_mask = comp_mask; 112 ps->arrays[n].gpr_start = start_gpr; 113 ps->arrays[n].gpr_count = size; 114} 115 116static void r600_dump_streamout(struct pipe_stream_output_info *so) 117{ 118 unsigned i; 119 120 fprintf(stderr, "STREAMOUT\n"); 121 for (i = 0; i < so->num_outputs; i++) { 122 unsigned mask = ((1 << so->output[i].num_components) - 1) << 123 so->output[i].start_component; 124 fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n", 125 i, 126 so->output[i].stream, 127 so->output[i].output_buffer, 128 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, 129 so->output[i].register_index, 130 mask & 1 ? "x" : "", 131 mask & 2 ? "y" : "", 132 mask & 4 ? "z" : "", 133 mask & 8 ? "w" : "", 134 so->output[i].dst_offset < so->output[i].start_component ? 
" (will lower)" : ""); 135 } 136} 137 138static int store_shader(struct pipe_context *ctx, 139 struct r600_pipe_shader *shader) 140{ 141 struct r600_context *rctx = (struct r600_context *)ctx; 142 uint32_t *ptr, i; 143 144 if (shader->bo == NULL) { 145 shader->bo = (struct r600_resource*) 146 pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4); 147 if (shader->bo == NULL) { 148 return -ENOMEM; 149 } 150 ptr = r600_buffer_map_sync_with_rings( 151 &rctx->b, shader->bo, 152 PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY); 153 if (R600_BIG_ENDIAN) { 154 for (i = 0; i < shader->shader.bc.ndw; ++i) { 155 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]); 156 } 157 } else { 158 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr)); 159 } 160 rctx->b.ws->buffer_unmap(rctx->b.ws, shader->bo->buf); 161 } 162 163 return 0; 164} 165 166extern const struct nir_shader_compiler_options r600_nir_options; 167static int nshader = 0; 168int r600_pipe_shader_create(struct pipe_context *ctx, 169 struct r600_pipe_shader *shader, 170 union r600_shader_key key) 171{ 172 struct r600_context *rctx = (struct r600_context *)ctx; 173 struct r600_pipe_shader_selector *sel = shader->selector; 174 int r; 175 struct r600_screen *rscreen = (struct r600_screen *)ctx->screen; 176 177 int processor = sel->ir_type == PIPE_SHADER_IR_TGSI ? 
178 tgsi_get_processor_type(sel->tokens): 179 pipe_shader_type_from_mesa(sel->nir->info.stage); 180 181 bool dump = r600_can_dump_shader(&rctx->screen->b, processor); 182 unsigned use_sb = !(rctx->screen->b.debug_flags & (DBG_NO_SB | DBG_NIR)) || 183 (rctx->screen->b.debug_flags & DBG_NIR_SB); 184 unsigned sb_disasm; 185 unsigned export_shader; 186 187 shader->shader.bc.isa = rctx->isa; 188 189 if (!(rscreen->b.debug_flags & DBG_NIR_PREFERRED)) { 190 assert(sel->ir_type == PIPE_SHADER_IR_TGSI); 191 r = r600_shader_from_tgsi(rctx, shader, key); 192 if (r) { 193 R600_ERR("translation from TGSI failed !\n"); 194 goto error; 195 } 196 } else { 197 if (sel->ir_type == PIPE_SHADER_IR_TGSI) { 198 sel->nir = tgsi_to_nir(sel->tokens, ctx->screen, true); 199 const nir_shader_compiler_options *nir_options = 200 (const nir_shader_compiler_options *) 201 ctx->screen->get_compiler_options(ctx->screen, 202 PIPE_SHADER_IR_NIR, 203 shader->shader.processor_type); 204 /* Lower int64 ops because we have some r600 build-in shaders that use it */ 205 if (nir_options->lower_int64_options) { 206 NIR_PASS_V(sel->nir, nir_lower_regs_to_ssa); 207 NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar, NULL, NULL); 208 NIR_PASS_V(sel->nir, nir_lower_int64); 209 NIR_PASS_V(sel->nir, nir_opt_vectorize, NULL, NULL); 210 } 211 NIR_PASS_V(sel->nir, nir_lower_flrp, ~0, false); 212 } 213 nir_tgsi_scan_shader(sel->nir, &sel->info, true); 214 215 r = r600_shader_from_nir(rctx, shader, &key); 216 if (r) { 217 fprintf(stderr, "--Failed shader--------------------------------------------------\n"); 218 219 if (sel->ir_type == PIPE_SHADER_IR_TGSI) { 220 fprintf(stderr, "--TGSI--------------------------------------------------------\n"); 221 tgsi_dump(sel->tokens, 0); 222 } 223 224 if (rscreen->b.debug_flags & (DBG_NIR_PREFERRED)) { 225 fprintf(stderr, "--NIR --------------------------------------------------------\n"); 226 nir_print_shader(sel->nir, stderr); 227 } 228 229 R600_ERR("translation from NIR failed 
!\n"); 230 goto error; 231 } 232 } 233 234 if (dump) { 235 if (sel->ir_type == PIPE_SHADER_IR_TGSI) { 236 fprintf(stderr, "--TGSI--------------------------------------------------------\n"); 237 tgsi_dump(sel->tokens, 0); 238 } 239 240 if (sel->so.num_outputs) { 241 r600_dump_streamout(&sel->so); 242 } 243 } 244 245 if (shader->shader.processor_type == PIPE_SHADER_VERTEX) { 246 /* only disable for vertex shaders in tess paths */ 247 if (key.vs.as_ls) 248 use_sb = 0; 249 } 250 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL); 251 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL); 252 use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE); 253 254 /* disable SB for shaders using doubles */ 255 use_sb &= !shader->shader.uses_doubles; 256 257 use_sb &= !shader->shader.uses_atomics; 258 use_sb &= !shader->shader.uses_images; 259 use_sb &= !shader->shader.uses_helper_invocation; 260 261 /* Check if the bytecode has already been built. */ 262 if (!shader->shader.bc.bytecode) { 263 r = r600_bytecode_build(&shader->shader.bc); 264 if (r) { 265 R600_ERR("building bytecode failed !\n"); 266 goto error; 267 } 268 } 269 270 sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM); 271 if (dump && !sb_disasm) { 272 fprintf(stderr, "--------------------------------------------------------------\n"); 273 r600_bytecode_disasm(&shader->shader.bc); 274 fprintf(stderr, "______________________________________________________________\n"); 275 } else if ((dump && sb_disasm) || use_sb) { 276 r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader, 277 dump, use_sb); 278 if (r) { 279 R600_ERR("r600_sb_bytecode_process failed !\n"); 280 goto error; 281 } 282 } 283 284 if (dump) { 285 FILE *f; 286 char fname[1024]; 287 snprintf(fname, 1024, "shader_from_%s_%d.cpp", 288 (sel->ir_type == PIPE_SHADER_IR_TGSI ? 289 (rscreen->b.debug_flags & DBG_NIR_PREFERRED ? 
"tgsi-nir" : "tgsi") 290 : "nir"), nshader); 291 f = fopen(fname, "w"); 292 print_shader_info(f, nshader++, &shader->shader); 293 print_shader_info(stderr, nshader++, &shader->shader); 294 print_pipe_info(stderr, &sel->info); 295 if (sel->ir_type == PIPE_SHADER_IR_TGSI) { 296 fprintf(f, "/****TGSI**********************************\n"); 297 tgsi_dump_to_file(sel->tokens, 0, f); 298 } 299 300 if (rscreen->b.debug_flags & DBG_NIR_PREFERRED){ 301 fprintf(f, "/****NIR **********************************\n"); 302 nir_print_shader(sel->nir, f); 303 } 304 fprintf(f, "******************************************/\n"); 305 fclose(f); 306 } 307 308 if (shader->gs_copy_shader) { 309 if (dump) { 310 // dump copy shader 311 r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc, 312 &shader->gs_copy_shader->shader, dump, 0); 313 if (r) 314 goto error; 315 } 316 317 if ((r = store_shader(ctx, shader->gs_copy_shader))) 318 goto error; 319 } 320 321 /* Store the shader in a buffer. */ 322 if ((r = store_shader(ctx, shader))) 323 goto error; 324 325 /* Build state. 
*/ 326 switch (shader->shader.processor_type) { 327 case PIPE_SHADER_TESS_CTRL: 328 evergreen_update_hs_state(ctx, shader); 329 break; 330 case PIPE_SHADER_TESS_EVAL: 331 if (key.tes.as_es) 332 evergreen_update_es_state(ctx, shader); 333 else 334 evergreen_update_vs_state(ctx, shader); 335 break; 336 case PIPE_SHADER_GEOMETRY: 337 if (rctx->b.chip_class >= EVERGREEN) { 338 evergreen_update_gs_state(ctx, shader); 339 evergreen_update_vs_state(ctx, shader->gs_copy_shader); 340 } else { 341 r600_update_gs_state(ctx, shader); 342 r600_update_vs_state(ctx, shader->gs_copy_shader); 343 } 344 break; 345 case PIPE_SHADER_VERTEX: 346 export_shader = key.vs.as_es; 347 if (rctx->b.chip_class >= EVERGREEN) { 348 if (key.vs.as_ls) 349 evergreen_update_ls_state(ctx, shader); 350 else if (key.vs.as_es) 351 evergreen_update_es_state(ctx, shader); 352 else 353 evergreen_update_vs_state(ctx, shader); 354 } else { 355 if (export_shader) 356 r600_update_es_state(ctx, shader); 357 else 358 r600_update_vs_state(ctx, shader); 359 } 360 break; 361 case PIPE_SHADER_FRAGMENT: 362 if (rctx->b.chip_class >= EVERGREEN) { 363 evergreen_update_ps_state(ctx, shader); 364 } else { 365 r600_update_ps_state(ctx, shader); 366 } 367 break; 368 case PIPE_SHADER_COMPUTE: 369 evergreen_update_ls_state(ctx, shader); 370 break; 371 default: 372 r = -EINVAL; 373 goto error; 374 } 375 return 0; 376 377error: 378 r600_pipe_shader_destroy(ctx, shader); 379 return r; 380} 381 382void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader) 383{ 384 r600_resource_reference(&shader->bo, NULL); 385 if (list_is_linked(&shader->shader.bc.cf)) 386 r600_bytecode_clear(&shader->shader.bc); 387 r600_release_command_buffer(&shader->command_buffer); 388} 389 390/* 391 * tgsi -> r600 shader 392 */ 393struct r600_shader_tgsi_instruction; 394 395struct r600_shader_src { 396 unsigned sel; 397 unsigned swizzle[4]; 398 unsigned neg; 399 unsigned abs; 400 unsigned rel; 401 unsigned kc_bank; 402 
/*
 * Working state of the tgsi -> r600 translation of a single shader.
 * It is passed to every tgsi_* / helper routine in this file.
 */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_array_info			*array_infos;
	/* flag for each tgsi temp array if its been spilled or not */
	bool					*spilled_arrays;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	/* shader stage, compared against the PIPE_SHADER_* values */
	unsigned				type;
	/* per TGSI file base that is added to a tgsi register index to get the gpr */
	unsigned				file_offset[TGSI_FILE_COUNT];
	/* first driver temporary; see max_driver_temp_used */
	unsigned				temp_reg;
	/* table entry (hw opcode + handler) of the instruction being translated */
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];
	uint32_t				*literals;
	uint32_t				nliterals;
	/* number of driver temporaries handed out so far by r600_get_temp(),
	 * which returns temp_reg + max_driver_temp_used++ */
	uint32_t				max_driver_temp_used;
	/* needed for evergreen interpolation */
	/* indexed by Persp/Linear * 3 + sample/center/centroid */
	struct eg_interp		eg_interpolators[6];
	/* evergreen/cayman also store sample mask in face register */
	/* -1 while no gpr has been assigned to the face input yet */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	/* incremented for every TGSI_SEMANTIC_COLOR fragment shader input declared */
	int					colors_used;
	boolean                 clip_vertex_write;
	/* index into shader->output[] of the CLIPVERTEX output */
	unsigned                cv_output;
	/* index into shader->output[] of the EDGEFLAG output */
	unsigned		edgeflag_output;
	int			helper_invoc_reg;
	int                     cs_block_size_reg;
	int                     cs_grid_size_reg;
	bool			cs_block_size_loaded, cs_grid_size_loaded;
	/* index into shader->input[] of the fragment shader POSITION input */
	int					fragcoord_input;
	/* ring offset given to the next geometry shader input; advanced by 16 per input */
	int					next_ring_offset;
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	int					gs_rotated_input[2];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned                                tess_input_info; /* temp with tess input offsets */
	unsigned                                tess_output_info; /* temp with tess output offsets */
	unsigned                                thread_id_gpr; /* temp with thread id calculated for images */
};
/*
 * Return the index (0..3) of the highest channel enabled in the low four
 * bits of writemask.  An empty mask yields 0, the same as a mask that only
 * enables channel x.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* Scan from w down to y; x (or nothing at all) falls through to 0. */
	for (chan = 3; chan > 0; chan--) {
		if ((writemask >> chan) & 1u)
			return chan;
	}

	return 0;
}
511 if (i->Instruction.Label) { 512 R600_ERR("label unsupported\n"); 513 return -EINVAL; 514 } 515#endif 516 for (j = 0; j < i->Instruction.NumSrcRegs; j++) { 517 if (i->Src[j].Register.Dimension) { 518 switch (i->Src[j].Register.File) { 519 case TGSI_FILE_CONSTANT: 520 case TGSI_FILE_HW_ATOMIC: 521 break; 522 case TGSI_FILE_INPUT: 523 if (ctx->type == PIPE_SHADER_GEOMETRY || 524 ctx->type == PIPE_SHADER_TESS_CTRL || 525 ctx->type == PIPE_SHADER_TESS_EVAL) 526 break; 527 FALLTHROUGH; 528 case TGSI_FILE_OUTPUT: 529 if (ctx->type == PIPE_SHADER_TESS_CTRL) 530 break; 531 FALLTHROUGH; 532 default: 533 R600_ERR("unsupported src %d (file %d, dimension %d)\n", j, 534 i->Src[j].Register.File, 535 i->Src[j].Register.Dimension); 536 return -EINVAL; 537 } 538 } 539 } 540 for (j = 0; j < i->Instruction.NumDstRegs; j++) { 541 if (i->Dst[j].Register.Dimension) { 542 if (ctx->type == PIPE_SHADER_TESS_CTRL) 543 continue; 544 R600_ERR("unsupported dst (dimension)\n"); 545 return -EINVAL; 546 } 547 } 548 return 0; 549} 550 551int eg_get_interpolator_index(unsigned interpolate, unsigned location) 552{ 553 if (interpolate == TGSI_INTERPOLATE_COLOR || 554 interpolate == TGSI_INTERPOLATE_LINEAR || 555 interpolate == TGSI_INTERPOLATE_PERSPECTIVE) 556 { 557 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR; 558 int loc; 559 560 switch(location) { 561 case TGSI_INTERPOLATE_LOC_CENTER: 562 loc = 1; 563 break; 564 case TGSI_INTERPOLATE_LOC_CENTROID: 565 loc = 2; 566 break; 567 case TGSI_INTERPOLATE_LOC_SAMPLE: 568 default: 569 loc = 0; break; 570 } 571 572 return is_linear * 3 + loc; 573 } 574 575 return -1; 576} 577 578static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx, 579 int input) 580{ 581 int i = eg_get_interpolator_index( 582 ctx->shader->input[input].interpolate, 583 ctx->shader->input[input].interpolate_location); 584 assert(i >= 0); 585 ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index; 586} 587 588static int 
evergreen_interp_alu(struct r600_shader_ctx *ctx, int input) 589{ 590 int i, r; 591 struct r600_bytecode_alu alu; 592 int gpr = 0, base_chan = 0; 593 int ij_index = ctx->shader->input[input].ij_index; 594 595 /* work out gpr and base_chan from index */ 596 gpr = ij_index / 2; 597 base_chan = (2 * (ij_index % 2)) + 1; 598 599 for (i = 0; i < 8; i++) { 600 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 601 602 if (i < 4) 603 alu.op = ALU_OP2_INTERP_ZW; 604 else 605 alu.op = ALU_OP2_INTERP_XY; 606 607 if ((i > 1) && (i < 6)) { 608 alu.dst.sel = ctx->shader->input[input].gpr; 609 alu.dst.write = 1; 610 } 611 612 alu.dst.chan = i % 4; 613 614 alu.src[0].sel = gpr; 615 alu.src[0].chan = (base_chan - (i % 2)); 616 617 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos; 618 619 alu.bank_swizzle_force = SQ_ALU_VEC_210; 620 if ((i % 4) == 3) 621 alu.last = 1; 622 r = r600_bytecode_add_alu(ctx->bc, &alu); 623 if (r) 624 return r; 625 } 626 return 0; 627} 628 629static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input) 630{ 631 int i, r; 632 struct r600_bytecode_alu alu; 633 634 for (i = 0; i < 4; i++) { 635 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 636 637 alu.op = ALU_OP1_INTERP_LOAD_P0; 638 639 alu.dst.sel = ctx->shader->input[input].gpr; 640 alu.dst.write = 1; 641 642 alu.dst.chan = i; 643 644 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos; 645 alu.src[0].chan = i; 646 647 if (i == 3) 648 alu.last = 1; 649 r = r600_bytecode_add_alu(ctx->bc, &alu); 650 if (r) 651 return r; 652 } 653 return 0; 654} 655 656/* 657 * Special export handling in shaders 658 * 659 * shader export ARRAY_BASE for EXPORT_POS: 660 * 60 is position 661 * 61 is misc vector 662 * 62, 63 are clip distance vectors 663 * 664 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL: 665 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61 666 * USE_VTX_POINT_SIZE - point size in the X 
channel of export 61 667 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61 668 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61 669 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61 670 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually 671 * exclusive from render target index) 672 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors 673 * 674 * 675 * shader export ARRAY_BASE for EXPORT_PIXEL: 676 * 0-7 CB targets 677 * 61 computed Z vector 678 * 679 * The use of the values exported in the computed Z vector are controlled 680 * by DB_SHADER_CONTROL: 681 * Z_EXPORT_ENABLE - Z as a float in RED 682 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN 683 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA 684 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE 685 * DB_SOURCE_FORMAT - export control restrictions 686 * 687 */ 688 689 690/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */ 691static int r600_spi_sid(struct r600_shader_io * io) 692{ 693 int index, name = io->name; 694 695 /* These params are handled differently, they don't need 696 * semantic indices, so we'll use 0 for them. 697 */ 698 if (name == TGSI_SEMANTIC_POSITION || 699 name == TGSI_SEMANTIC_PSIZE || 700 name == TGSI_SEMANTIC_EDGEFLAG || 701 name == TGSI_SEMANTIC_FACE || 702 name == TGSI_SEMANTIC_SAMPLEMASK) 703 index = 0; 704 else { 705 if (name == TGSI_SEMANTIC_GENERIC) { 706 /* For generic params simply use sid from tgsi */ 707 index = 9 + io->sid; 708 } else if (name == TGSI_SEMANTIC_TEXCOORD) { 709 index = io->sid; 710 } else { 711 /* For non-generic params - pack name and sid into 8 bits */ 712 index = 0x80 | (name<<3) | (io->sid); 713 } 714 715 /* Make sure that all really used indices have nonzero value, so 716 * we can just compare it to 0 later instead of comparing the name 717 * with different values to detect special cases. 
*/ 718 index++; 719 } 720 721 return index; 722}; 723 724/* we need this to get a common lds index for vs/tcs/tes input/outputs */ 725int r600_get_lds_unique_index(unsigned semantic_name, unsigned index) 726{ 727 switch (semantic_name) { 728 case TGSI_SEMANTIC_POSITION: 729 return 0; 730 case TGSI_SEMANTIC_PSIZE: 731 return 1; 732 case TGSI_SEMANTIC_CLIPDIST: 733 assert(index <= 1); 734 return 2 + index; 735 case TGSI_SEMANTIC_TEXCOORD: 736 return 4 + index; 737 case TGSI_SEMANTIC_GENERIC: 738 if (index <= 63-4) 739 return 4 + index; 740 else 741 /* same explanation as in the default statement, 742 * the only user hitting this is st/nine. 743 */ 744 return 0; 745 746 /* patch indices are completely separate and thus start from 0 */ 747 case TGSI_SEMANTIC_TESSOUTER: 748 return 0; 749 case TGSI_SEMANTIC_TESSINNER: 750 return 1; 751 case TGSI_SEMANTIC_PATCH: 752 return 2 + index; 753 754 default: 755 /* Don't fail here. The result of this function is only used 756 * for LS, TCS, TES, and GS, where legacy GL semantics can't 757 * occur, but this function is called for all vertex shaders 758 * before it's known whether LS will be compiled or not. 
759 */ 760 return 0; 761 } 762} 763 764/* turn input into interpolate on EG */ 765static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index) 766{ 767 int r = 0; 768 769 if (ctx->shader->input[index].spi_sid) { 770 ctx->shader->input[index].lds_pos = ctx->shader->nlds++; 771 if (ctx->shader->input[index].interpolate > 0) { 772 evergreen_interp_assign_ij_index(ctx, index); 773 r = evergreen_interp_alu(ctx, index); 774 } else { 775 r = evergreen_interp_flat(ctx, index); 776 } 777 } 778 return r; 779} 780 781static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back) 782{ 783 struct r600_bytecode_alu alu; 784 int i, r; 785 int gpr_front = ctx->shader->input[front].gpr; 786 int gpr_back = ctx->shader->input[back].gpr; 787 788 for (i = 0; i < 4; i++) { 789 memset(&alu, 0, sizeof(alu)); 790 alu.op = ALU_OP3_CNDGT; 791 alu.is_op3 = 1; 792 alu.dst.write = 1; 793 alu.dst.sel = gpr_front; 794 alu.src[0].sel = ctx->face_gpr; 795 alu.src[1].sel = gpr_front; 796 alu.src[2].sel = gpr_back; 797 798 alu.dst.chan = i; 799 alu.src[1].chan = i; 800 alu.src[2].chan = i; 801 alu.last = (i==3); 802 803 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 804 return r; 805 } 806 807 return 0; 808} 809 810/* execute a single slot ALU calculation */ 811static int single_alu_op2(struct r600_shader_ctx *ctx, int op, 812 int dst_sel, int dst_chan, 813 int src0_sel, unsigned src0_chan_val, 814 int src1_sel, unsigned src1_chan_val) 815{ 816 struct r600_bytecode_alu alu; 817 int r, i; 818 819 if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) { 820 for (i = 0; i < 4; i++) { 821 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 822 alu.op = op; 823 alu.src[0].sel = src0_sel; 824 if (src0_sel == V_SQ_ALU_SRC_LITERAL) 825 alu.src[0].value = src0_chan_val; 826 else 827 alu.src[0].chan = src0_chan_val; 828 alu.src[1].sel = src1_sel; 829 if (src1_sel == V_SQ_ALU_SRC_LITERAL) 830 alu.src[1].value = src1_chan_val; 831 else 832 alu.src[1].chan = src1_chan_val; 
833 alu.dst.sel = dst_sel; 834 alu.dst.chan = i; 835 alu.dst.write = i == dst_chan; 836 alu.last = (i == 3); 837 r = r600_bytecode_add_alu(ctx->bc, &alu); 838 if (r) 839 return r; 840 } 841 return 0; 842 } 843 844 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 845 alu.op = op; 846 alu.src[0].sel = src0_sel; 847 if (src0_sel == V_SQ_ALU_SRC_LITERAL) 848 alu.src[0].value = src0_chan_val; 849 else 850 alu.src[0].chan = src0_chan_val; 851 alu.src[1].sel = src1_sel; 852 if (src1_sel == V_SQ_ALU_SRC_LITERAL) 853 alu.src[1].value = src1_chan_val; 854 else 855 alu.src[1].chan = src1_chan_val; 856 alu.dst.sel = dst_sel; 857 alu.dst.chan = dst_chan; 858 alu.dst.write = 1; 859 alu.last = 1; 860 r = r600_bytecode_add_alu(ctx->bc, &alu); 861 if (r) 862 return r; 863 return 0; 864} 865 866/* execute a single slot ALU calculation */ 867static int single_alu_op3(struct r600_shader_ctx *ctx, int op, 868 int dst_sel, int dst_chan, 869 int src0_sel, unsigned src0_chan_val, 870 int src1_sel, unsigned src1_chan_val, 871 int src2_sel, unsigned src2_chan_val) 872{ 873 struct r600_bytecode_alu alu; 874 int r; 875 876 /* validate this for other ops */ 877 assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT); 878 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 879 alu.op = op; 880 alu.src[0].sel = src0_sel; 881 if (src0_sel == V_SQ_ALU_SRC_LITERAL) 882 alu.src[0].value = src0_chan_val; 883 else 884 alu.src[0].chan = src0_chan_val; 885 alu.src[1].sel = src1_sel; 886 if (src1_sel == V_SQ_ALU_SRC_LITERAL) 887 alu.src[1].value = src1_chan_val; 888 else 889 alu.src[1].chan = src1_chan_val; 890 alu.src[2].sel = src2_sel; 891 if (src2_sel == V_SQ_ALU_SRC_LITERAL) 892 alu.src[2].value = src2_chan_val; 893 else 894 alu.src[2].chan = src2_chan_val; 895 alu.dst.sel = dst_sel; 896 alu.dst.chan = dst_chan; 897 alu.is_op3 = 1; 898 alu.last = 1; 899 r = r600_bytecode_add_alu(ctx->bc, &alu); 900 if (r) 901 return r; 902 return 0; 903} 904 905/* put it in temp_reg.x 
*/ 906static int get_lds_offset0(struct r600_shader_ctx *ctx, 907 int rel_patch_chan, 908 int temp_reg, bool is_patch_var) 909{ 910 int r; 911 912 /* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */ 913 /* ADD 914 Dimension - patch0_offset (input_vals.z), 915 Non-dim - patch0_data_offset (input_vals.w) 916 */ 917 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, 918 temp_reg, 0, 919 ctx->tess_output_info, 0, 920 0, rel_patch_chan, 921 ctx->tess_output_info, is_patch_var ? 3 : 2); 922 if (r) 923 return r; 924 return 0; 925} 926 927static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index) 928{ 929 return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg; 930} 931 932static int r600_get_temp(struct r600_shader_ctx *ctx) 933{ 934 return ctx->temp_reg + ctx->max_driver_temp_used++; 935} 936 937static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid) 938{ 939 int i; 940 i = ctx->shader->noutput++; 941 ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID; 942 ctx->shader->output[i].sid = 0; 943 ctx->shader->output[i].gpr = 0; 944 ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT; 945 ctx->shader->output[i].write_mask = 0x4; 946 ctx->shader->output[i].spi_sid = prim_id_sid; 947 948 return 0; 949} 950 951static int tgsi_barrier(struct r600_shader_ctx *ctx) 952{ 953 struct r600_bytecode_alu alu; 954 int r; 955 956 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 957 alu.op = ctx->inst_info->op; 958 alu.last = 1; 959 960 r = r600_bytecode_add_alu(ctx->bc, &alu); 961 if (r) 962 return r; 963 return 0; 964} 965 966static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed) 967{ 968 // pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays 969 unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY]; 970 unsigned narrays_left = n; 971 bool *spilled = ctx->spilled_arrays; // assumed calloc:ed 972 973 
*scratch_space_needed = 0; 974 while (*regno > 124 && narrays_left) { 975 unsigned i; 976 unsigned largest = 0; 977 unsigned largest_index = 0; 978 979 for (i = 0; i < n; i++) { 980 unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1; 981 if (!spilled[i] && size > largest) { 982 largest = size; 983 largest_index = i; 984 } 985 } 986 987 spilled[largest_index] = true; 988 *regno -= largest; 989 *scratch_space_needed += largest; 990 991 narrays_left --; 992 } 993 994 if (narrays_left == 0) { 995 ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY); 996 } 997} 998 999/* Take spilled temp arrays into account when translating tgsi register 1000 * indexes into r600 gprs if spilled is false, or scratch array offset if 1001 * spilled is true */ 1002static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled) 1003{ 1004 unsigned i; 1005 unsigned spilled_size = 0; 1006 1007 for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) { 1008 if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) { 1009 if (ctx->spilled_arrays[i]) { 1010 /* vec4 index into spilled scratch memory */ 1011 *spilled = true; 1012 return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size; 1013 } 1014 else { 1015 /* regular GPR array */ 1016 *spilled = false; 1017 return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY]; 1018 } 1019 } 1020 1021 if (tgsi_reg_index < ctx->array_infos[i].range.First) 1022 break; 1023 if (ctx->spilled_arrays[i]) { 1024 spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1; 1025 } 1026 } 1027 1028 /* regular GPR index, minus the holes from spilled arrays */ 1029 *spilled = false; 1030 1031 return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY]; 1032} 1033 1034/* look up spill area base offset and array size for a spilled temp array */ 1035static void 
get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, 1036 unsigned *array_base, unsigned *array_size) 1037{ 1038 unsigned i; 1039 unsigned offset = 0; 1040 1041 for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) { 1042 if (ctx->spilled_arrays[i]) { 1043 unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1; 1044 1045 if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) { 1046 *array_base = offset; 1047 *array_size = size - 1; /* hw counts from 1 */ 1048 1049 return; 1050 } 1051 1052 offset += size; 1053 } 1054 } 1055} 1056 1057static int tgsi_declaration(struct r600_shader_ctx *ctx) 1058{ 1059 struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration; 1060 int r, i, j, count = d->Range.Last - d->Range.First + 1; 1061 1062 switch (d->Declaration.File) { 1063 case TGSI_FILE_INPUT: 1064 for (j = 0; j < count; j++) { 1065 i = ctx->shader->ninput + j; 1066 assert(i < ARRAY_SIZE(ctx->shader->input)); 1067 ctx->shader->input[i].name = d->Semantic.Name; 1068 ctx->shader->input[i].sid = d->Semantic.Index + j; 1069 ctx->shader->input[i].interpolate = d->Interp.Interpolate; 1070 ctx->shader->input[i].interpolate_location = d->Interp.Location; 1071 ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j; 1072 if (ctx->type == PIPE_SHADER_FRAGMENT) { 1073 ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]); 1074 switch (ctx->shader->input[i].name) { 1075 case TGSI_SEMANTIC_FACE: 1076 if (ctx->face_gpr != -1) 1077 ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */ 1078 else 1079 ctx->face_gpr = ctx->shader->input[i].gpr; 1080 break; 1081 case TGSI_SEMANTIC_COLOR: 1082 ctx->colors_used++; 1083 break; 1084 case TGSI_SEMANTIC_POSITION: 1085 ctx->fragcoord_input = i; 1086 break; 1087 case TGSI_SEMANTIC_PRIMID: 1088 /* set this for now */ 1089 
ctx->shader->gs_prim_id_input = true; 1090 ctx->shader->ps_prim_id_input = i; 1091 break; 1092 } 1093 if (ctx->bc->chip_class >= EVERGREEN) { 1094 if ((r = evergreen_interp_input(ctx, i))) 1095 return r; 1096 } 1097 } else if (ctx->type == PIPE_SHADER_GEOMETRY) { 1098 /* FIXME probably skip inputs if they aren't passed in the ring */ 1099 ctx->shader->input[i].ring_offset = ctx->next_ring_offset; 1100 ctx->next_ring_offset += 16; 1101 if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID) 1102 ctx->shader->gs_prim_id_input = true; 1103 } 1104 } 1105 ctx->shader->ninput += count; 1106 break; 1107 case TGSI_FILE_OUTPUT: 1108 for (j = 0; j < count; j++) { 1109 i = ctx->shader->noutput + j; 1110 assert(i < ARRAY_SIZE(ctx->shader->output)); 1111 ctx->shader->output[i].name = d->Semantic.Name; 1112 ctx->shader->output[i].sid = d->Semantic.Index + j; 1113 ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j; 1114 ctx->shader->output[i].interpolate = d->Interp.Interpolate; 1115 ctx->shader->output[i].write_mask = d->Declaration.UsageMask; 1116 if (ctx->type == PIPE_SHADER_VERTEX || 1117 ctx->type == PIPE_SHADER_GEOMETRY || 1118 ctx->type == PIPE_SHADER_TESS_EVAL) { 1119 ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]); 1120 switch (d->Semantic.Name) { 1121 case TGSI_SEMANTIC_CLIPDIST: 1122 break; 1123 case TGSI_SEMANTIC_PSIZE: 1124 ctx->shader->vs_out_misc_write = 1; 1125 ctx->shader->vs_out_point_size = 1; 1126 break; 1127 case TGSI_SEMANTIC_EDGEFLAG: 1128 ctx->shader->vs_out_misc_write = 1; 1129 ctx->shader->vs_out_edgeflag = 1; 1130 ctx->edgeflag_output = i; 1131 break; 1132 case TGSI_SEMANTIC_VIEWPORT_INDEX: 1133 ctx->shader->vs_out_misc_write = 1; 1134 ctx->shader->vs_out_viewport = 1; 1135 break; 1136 case TGSI_SEMANTIC_LAYER: 1137 ctx->shader->vs_out_misc_write = 1; 1138 ctx->shader->vs_out_layer = 1; 1139 break; 1140 case TGSI_SEMANTIC_CLIPVERTEX: 1141 ctx->clip_vertex_write = TRUE; 1142 ctx->cv_output = i; 
1143 break; 1144 } 1145 if (ctx->type == PIPE_SHADER_GEOMETRY) { 1146 ctx->gs_out_ring_offset += 16; 1147 } 1148 } else if (ctx->type == PIPE_SHADER_FRAGMENT) { 1149 switch (d->Semantic.Name) { 1150 case TGSI_SEMANTIC_COLOR: 1151 ctx->shader->nr_ps_max_color_exports++; 1152 break; 1153 } 1154 } 1155 } 1156 ctx->shader->noutput += count; 1157 break; 1158 case TGSI_FILE_TEMPORARY: 1159 if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) { 1160 if (d->Array.ArrayID) { 1161 bool spilled; 1162 unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx, 1163 d->Range.First, 1164 &spilled); 1165 1166 if (!spilled) { 1167 r600_add_gpr_array(ctx->shader, idx, 1168 d->Range.Last - d->Range.First + 1, 0x0F); 1169 } 1170 } 1171 } 1172 break; 1173 1174 case TGSI_FILE_CONSTANT: 1175 case TGSI_FILE_SAMPLER: 1176 case TGSI_FILE_SAMPLER_VIEW: 1177 case TGSI_FILE_ADDRESS: 1178 case TGSI_FILE_BUFFER: 1179 case TGSI_FILE_IMAGE: 1180 case TGSI_FILE_MEMORY: 1181 break; 1182 1183 case TGSI_FILE_HW_ATOMIC: 1184 i = ctx->shader->nhwatomic_ranges; 1185 ctx->shader->atomics[i].start = d->Range.First; 1186 ctx->shader->atomics[i].end = d->Range.Last; 1187 ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic; 1188 ctx->shader->atomics[i].array_id = d->Array.ArrayID; 1189 ctx->shader->atomics[i].buffer_id = d->Dim.Index2D; 1190 ctx->shader->nhwatomic_ranges++; 1191 ctx->shader->nhwatomic += count; 1192 break; 1193 1194 case TGSI_FILE_SYSTEM_VALUE: 1195 if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK || 1196 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID || 1197 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) { 1198 break; /* Already handled from allocate_system_value_inputs */ 1199 } else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) { 1200 break; 1201 } else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID) 1202 break; 1203 else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID) 1204 break; 1205 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER || 1206 
d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) { 1207 int param = r600_get_lds_unique_index(d->Semantic.Name, 0); 1208 int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2; 1209 unsigned temp_reg = r600_get_temp(ctx); 1210 1211 r = get_lds_offset0(ctx, 2, temp_reg, true); 1212 if (r) 1213 return r; 1214 1215 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 1216 temp_reg, 0, 1217 temp_reg, 0, 1218 V_SQ_ALU_SRC_LITERAL, param * 16); 1219 if (r) 1220 return r; 1221 1222 do_lds_fetch_values(ctx, temp_reg, dreg, 0xf); 1223 } 1224 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) { 1225 /* MOV r1.x, r0.x; 1226 MOV r1.y, r0.y; 1227 */ 1228 for (i = 0; i < 2; i++) { 1229 struct r600_bytecode_alu alu; 1230 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1231 alu.op = ALU_OP1_MOV; 1232 alu.src[0].sel = 0; 1233 alu.src[0].chan = 0 + i; 1234 alu.dst.sel = 1; 1235 alu.dst.chan = 0 + i; 1236 alu.dst.write = 1; 1237 alu.last = (i == 1) ? 1 : 0; 1238 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 1239 return r; 1240 } 1241 /* ADD r1.z, 1.0f, -r0.x */ 1242 struct r600_bytecode_alu alu; 1243 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1244 alu.op = ALU_OP2_ADD; 1245 alu.src[0].sel = V_SQ_ALU_SRC_1; 1246 alu.src[1].sel = 1; 1247 alu.src[1].chan = 0; 1248 alu.src[1].neg = 1; 1249 alu.dst.sel = 1; 1250 alu.dst.chan = 2; 1251 alu.dst.write = 1; 1252 alu.last = 1; 1253 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 1254 return r; 1255 1256 /* ADD r1.z, r1.z, -r1.y */ 1257 alu.op = ALU_OP2_ADD; 1258 alu.src[0].sel = 1; 1259 alu.src[0].chan = 2; 1260 alu.src[1].sel = 1; 1261 alu.src[1].chan = 1; 1262 alu.src[1].neg = 1; 1263 alu.dst.sel = 1; 1264 alu.dst.chan = 2; 1265 alu.dst.write = 1; 1266 alu.last = 1; 1267 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 1268 return r; 1269 break; 1270 } 1271 break; 1272 default: 1273 R600_ERR("unsupported file %d declaration\n", d->Declaration.File); 1274 return -EINVAL; 1275 } 1276 return 0; 1277} 1278 1279static int 
allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset) 1280{ 1281 struct tgsi_parse_context parse; 1282 struct { 1283 boolean enabled; 1284 int *reg; 1285 unsigned name, alternate_name; 1286 } inputs[2] = { 1287 { false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */ 1288 1289 { false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */ 1290 }; 1291 int num_regs = 0; 1292 unsigned k, i; 1293 1294 if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) { 1295 return 0; 1296 } 1297 1298 /* need to scan shader for system values and interpolateAtSample/Offset/Centroid */ 1299 while (!tgsi_parse_end_of_tokens(&parse)) { 1300 tgsi_parse_token(&parse); 1301 1302 if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) { 1303 const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction; 1304 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE || 1305 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 1306 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID) 1307 { 1308 int interpolate, location, k; 1309 1310 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 1311 location = TGSI_INTERPOLATE_LOC_CENTER; 1312 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) { 1313 location = TGSI_INTERPOLATE_LOC_CENTER; 1314 /* Needs sample positions, currently those are always available */ 1315 } else { 1316 location = TGSI_INTERPOLATE_LOC_CENTROID; 1317 } 1318 1319 interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index]; 1320 k = eg_get_interpolator_index(interpolate, location); 1321 if (k >= 0) 1322 ctx->eg_interpolators[k].enabled = true; 1323 } 1324 } else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) { 1325 struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration; 1326 if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) { 1327 for (k = 0; k < 
ARRAY_SIZE(inputs); k++) { 1328 if (d->Semantic.Name == inputs[k].name || 1329 d->Semantic.Name == inputs[k].alternate_name) { 1330 inputs[k].enabled = true; 1331 } 1332 } 1333 } 1334 } 1335 } 1336 1337 tgsi_parse_free(&parse); 1338 1339 if (ctx->info.reads_samplemask && 1340 (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) { 1341 inputs[1].enabled = true; 1342 } 1343 1344 if (ctx->bc->chip_class >= EVERGREEN) { 1345 int num_baryc = 0; 1346 /* assign gpr to each interpolator according to priority */ 1347 for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) { 1348 if (ctx->eg_interpolators[i].enabled) { 1349 ctx->eg_interpolators[i].ij_index = num_baryc; 1350 num_baryc++; 1351 } 1352 } 1353 num_baryc = (num_baryc + 1) >> 1; 1354 gpr_offset += num_baryc; 1355 } 1356 1357 for (i = 0; i < ARRAY_SIZE(inputs); i++) { 1358 boolean enabled = inputs[i].enabled; 1359 int *reg = inputs[i].reg; 1360 unsigned name = inputs[i].name; 1361 1362 if (enabled) { 1363 int gpr = gpr_offset + num_regs++; 1364 ctx->shader->nsys_inputs++; 1365 1366 // add to inputs, allocate a gpr 1367 k = ctx->shader->ninput++; 1368 ctx->shader->input[k].name = name; 1369 ctx->shader->input[k].sid = 0; 1370 ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT; 1371 ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER; 1372 *reg = ctx->shader->input[k].gpr = gpr; 1373 } 1374 } 1375 1376 return gpr_offset + num_regs; 1377} 1378 1379/* 1380 * for evergreen we need to scan the shader to find the number of GPRs we need to 1381 * reserve for interpolation and system values 1382 * 1383 * we need to know if we are going to emit any sample or centroid inputs 1384 * if perspective and linear are required 1385*/ 1386static int evergreen_gpr_count(struct r600_shader_ctx *ctx) 1387{ 1388 unsigned i; 1389 1390 memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators)); 1391 1392 /* 1393 * Could get this information from the shader info. 
But right now 1394 * we interpolate all declared inputs, whereas the shader info will 1395 * only contain the bits if the inputs are actually used, so it might 1396 * not be safe... 1397 */ 1398 for (i = 0; i < ctx->info.num_inputs; i++) { 1399 int k; 1400 /* skip position/face/mask/sampleid */ 1401 if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION || 1402 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE || 1403 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK || 1404 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID) 1405 continue; 1406 1407 k = eg_get_interpolator_index( 1408 ctx->info.input_interpolate[i], 1409 ctx->info.input_interpolate_loc[i]); 1410 if (k >= 0) 1411 ctx->eg_interpolators[k].enabled = TRUE; 1412 } 1413 1414 /* XXX PULL MODEL and LINE STIPPLE */ 1415 1416 return allocate_system_value_inputs(ctx, 0); 1417} 1418 1419/* sample_id_sel == NULL means fetch for current sample */ 1420static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel) 1421{ 1422 struct r600_bytecode_vtx vtx; 1423 int r, t1; 1424 1425 t1 = r600_get_temp(ctx); 1426 1427 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 1428 vtx.op = FETCH_OP_VFETCH; 1429 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER; 1430 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 1431 if (sample_id == NULL) { 1432 assert(ctx->fixed_pt_position_gpr != -1); 1433 1434 vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w; 1435 vtx.src_sel_x = 3; 1436 } 1437 else { 1438 struct r600_bytecode_alu alu; 1439 1440 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1441 alu.op = ALU_OP1_MOV; 1442 r600_bytecode_src(&alu.src[0], sample_id, chan_sel); 1443 alu.dst.sel = t1; 1444 alu.dst.write = 1; 1445 alu.last = 1; 1446 r = r600_bytecode_add_alu(ctx->bc, &alu); 1447 if (r) 1448 return r; 1449 1450 vtx.src_gpr = t1; 1451 vtx.src_sel_x = 0; 1452 } 1453 vtx.mega_fetch_count = 16; 1454 vtx.dst_gpr = t1; 1455 vtx.dst_sel_x = 
0; 1456 vtx.dst_sel_y = 1; 1457 vtx.dst_sel_z = 2; 1458 vtx.dst_sel_w = 3; 1459 vtx.data_format = FMT_32_32_32_32_FLOAT; 1460 vtx.num_format_all = 2; 1461 vtx.format_comp_all = 1; 1462 vtx.use_const_fields = 0; 1463 vtx.offset = 0; 1464 vtx.endian = r600_endian_swap(32); 1465 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */ 1466 1467 r = r600_bytecode_add_vtx(ctx->bc, &vtx); 1468 if (r) 1469 return r; 1470 1471 return t1; 1472} 1473 1474static int eg_load_helper_invocation(struct r600_shader_ctx *ctx) 1475{ 1476 int r; 1477 struct r600_bytecode_alu alu; 1478 1479 /* do a vtx fetch with wqm set on the vtx fetch */ 1480 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1481 alu.op = ALU_OP1_MOV; 1482 alu.dst.sel = ctx->helper_invoc_reg; 1483 alu.dst.chan = 0; 1484 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 1485 alu.src[0].value = 0xffffffff; 1486 alu.dst.write = 1; 1487 alu.last = 1; 1488 r = r600_bytecode_add_alu(ctx->bc, &alu); 1489 if (r) 1490 return r; 1491 1492 /* do a vtx fetch in VPM mode */ 1493 struct r600_bytecode_vtx vtx; 1494 memset(&vtx, 0, sizeof(vtx)); 1495 vtx.op = FETCH_OP_GET_BUFFER_RESINFO; 1496 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER; 1497 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 1498 vtx.src_gpr = 0; 1499 vtx.mega_fetch_count = 16; /* no idea here really... 
*/ 1500 vtx.dst_gpr = ctx->helper_invoc_reg; 1501 vtx.dst_sel_x = 4; 1502 vtx.dst_sel_y = 7; /* SEL_Y */ 1503 vtx.dst_sel_z = 7; /* SEL_Z */ 1504 vtx.dst_sel_w = 7; /* SEL_W */ 1505 vtx.data_format = FMT_32; 1506 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx))) 1507 return r; 1508 ctx->bc->cf_last->vpm = 1; 1509 return 0; 1510} 1511 1512static int cm_load_helper_invocation(struct r600_shader_ctx *ctx) 1513{ 1514 int r; 1515 struct r600_bytecode_alu alu; 1516 1517 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1518 alu.op = ALU_OP1_MOV; 1519 alu.dst.sel = ctx->helper_invoc_reg; 1520 alu.dst.chan = 0; 1521 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 1522 alu.src[0].value = 0xffffffff; 1523 alu.dst.write = 1; 1524 alu.last = 1; 1525 r = r600_bytecode_add_alu(ctx->bc, &alu); 1526 if (r) 1527 return r; 1528 1529 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1530 alu.op = ALU_OP1_MOV; 1531 alu.dst.sel = ctx->helper_invoc_reg; 1532 alu.dst.chan = 0; 1533 alu.src[0].sel = V_SQ_ALU_SRC_0; 1534 alu.dst.write = 1; 1535 alu.last = 1; 1536 r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE); 1537 if (r) 1538 return r; 1539 1540 return ctx->helper_invoc_reg; 1541} 1542 1543static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block) 1544{ 1545 struct r600_bytecode_vtx vtx; 1546 int r, t1; 1547 1548 if (ctx->cs_block_size_loaded) 1549 return ctx->cs_block_size_reg; 1550 if (ctx->cs_grid_size_loaded) 1551 return ctx->cs_grid_size_reg; 1552 1553 t1 = load_block ? 
ctx->cs_block_size_reg : ctx->cs_grid_size_reg; 1554 struct r600_bytecode_alu alu; 1555 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1556 alu.op = ALU_OP1_MOV; 1557 alu.src[0].sel = V_SQ_ALU_SRC_0; 1558 alu.dst.sel = t1; 1559 alu.dst.write = 1; 1560 alu.last = 1; 1561 r = r600_bytecode_add_alu(ctx->bc, &alu); 1562 if (r) 1563 return r; 1564 1565 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 1566 vtx.op = FETCH_OP_VFETCH; 1567 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER; 1568 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 1569 vtx.src_gpr = t1; 1570 vtx.src_sel_x = 0; 1571 1572 vtx.mega_fetch_count = 16; 1573 vtx.dst_gpr = t1; 1574 vtx.dst_sel_x = 0; 1575 vtx.dst_sel_y = 1; 1576 vtx.dst_sel_z = 2; 1577 vtx.dst_sel_w = 7; 1578 vtx.data_format = FMT_32_32_32_32; 1579 vtx.num_format_all = 1; 1580 vtx.format_comp_all = 0; 1581 vtx.use_const_fields = 0; 1582 vtx.offset = load_block ? 0 : 16; // first element is size of buffer 1583 vtx.endian = r600_endian_swap(32); 1584 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */ 1585 1586 r = r600_bytecode_add_vtx(ctx->bc, &vtx); 1587 if (r) 1588 return r; 1589 1590 if (load_block) 1591 ctx->cs_block_size_loaded = true; 1592 else 1593 ctx->cs_grid_size_loaded = true; 1594 return t1; 1595} 1596 1597static void tgsi_src(struct r600_shader_ctx *ctx, 1598 const struct tgsi_full_src_register *tgsi_src, 1599 struct r600_shader_src *r600_src) 1600{ 1601 memset(r600_src, 0, sizeof(*r600_src)); 1602 r600_src->swizzle[0] = tgsi_src->Register.SwizzleX; 1603 r600_src->swizzle[1] = tgsi_src->Register.SwizzleY; 1604 r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ; 1605 r600_src->swizzle[3] = tgsi_src->Register.SwizzleW; 1606 r600_src->neg = tgsi_src->Register.Negate; 1607 r600_src->abs = tgsi_src->Register.Absolute; 1608 1609 if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) { 1610 bool spilled; 1611 unsigned idx; 1612 1613 idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled); 1614 1615 if (spilled) { 1616 
int reg = r600_get_temp(ctx); 1617 int r; 1618 1619 r600_src->sel = reg; 1620 1621 if (ctx->bc->chip_class < R700) { 1622 struct r600_bytecode_output cf; 1623 1624 memset(&cf, 0, sizeof(struct r600_bytecode_output)); 1625 cf.op = CF_OP_MEM_SCRATCH; 1626 cf.elem_size = 3; 1627 cf.gpr = reg; 1628 cf.comp_mask = 0xF; 1629 cf.swizzle_x = 0; 1630 cf.swizzle_y = 1; 1631 cf.swizzle_z = 2; 1632 cf.swizzle_w = 3; 1633 cf.burst_count = 1; 1634 1635 get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index, 1636 &cf.array_base, &cf.array_size); 1637 1638 if (tgsi_src->Register.Indirect) { 1639 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND; 1640 cf.index_gpr = ctx->bc->ar_reg; 1641 } 1642 else { 1643 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ; 1644 cf.array_base += idx; 1645 cf.array_size = 0; 1646 } 1647 1648 r = r600_bytecode_add_output(ctx->bc, &cf); 1649 } 1650 else { 1651 struct r600_bytecode_vtx vtx; 1652 1653 if (r600_bytecode_get_need_wait_ack(ctx->bc)) { 1654 r600_bytecode_need_wait_ack(ctx->bc, false); 1655 r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK); 1656 } 1657 1658 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 1659 vtx.op = FETCH_OP_READ_SCRATCH; 1660 vtx.dst_gpr = reg; 1661 vtx.uncached = 1; // Must bypass cache since prior spill written in same invocation 1662 vtx.elem_size = 3; 1663 vtx.data_format = FMT_32_32_32_32; 1664 vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT; 1665 vtx.dst_sel_x = tgsi_src->Register.SwizzleX; 1666 vtx.dst_sel_y = tgsi_src->Register.SwizzleY; 1667 vtx.dst_sel_z = tgsi_src->Register.SwizzleZ; 1668 vtx.dst_sel_w = tgsi_src->Register.SwizzleW; 1669 1670 get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index, 1671 &vtx.array_base, &vtx.array_size); 1672 1673 if (tgsi_src->Register.Indirect) { 1674 vtx.indexed = 1; 1675 vtx.src_gpr = ctx->bc->ar_reg; 1676 } 1677 else { 1678 vtx.array_base += idx; 1679 vtx.array_size = 0; 1680 } 1681 1682 r = r600_bytecode_add_vtx(ctx->bc, &vtx); 1683 } 1684 1685 
if (r) 1686 return; 1687 } 1688 else { 1689 if (tgsi_src->Register.Indirect) 1690 r600_src->rel = V_SQ_REL_RELATIVE; 1691 1692 r600_src->sel = idx; 1693 } 1694 1695 return; 1696 } 1697 1698 if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) { 1699 int index; 1700 if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) && 1701 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) && 1702 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) { 1703 1704 index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX; 1705 r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel); 1706 if (r600_src->sel != V_SQ_ALU_SRC_LITERAL) 1707 return; 1708 } 1709 index = tgsi_src->Register.Index; 1710 r600_src->sel = V_SQ_ALU_SRC_LITERAL; 1711 memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value)); 1712 } else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) { 1713 if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) { 1714 r600_src->swizzle[0] = 2; // Z value 1715 r600_src->swizzle[1] = 2; 1716 r600_src->swizzle[2] = 2; 1717 r600_src->swizzle[3] = 2; 1718 r600_src->sel = ctx->face_gpr; 1719 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) { 1720 r600_src->swizzle[0] = 3; // W value 1721 r600_src->swizzle[1] = 3; 1722 r600_src->swizzle[2] = 3; 1723 r600_src->swizzle[3] = 3; 1724 r600_src->sel = ctx->fixed_pt_position_gpr; 1725 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) { 1726 r600_src->swizzle[0] = 0; 1727 r600_src->swizzle[1] = 1; 1728 r600_src->swizzle[2] = 4; 1729 r600_src->swizzle[3] = 4; 1730 r600_src->sel = load_sample_position(ctx, NULL, -1); 1731 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) { 1732 r600_src->swizzle[0] = 3; 1733 r600_src->swizzle[1] = 3; 1734 r600_src->swizzle[2] = 3; 1735 
r600_src->swizzle[3] = 3; 1736 r600_src->sel = 0; 1737 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) { 1738 r600_src->swizzle[0] = 0; 1739 r600_src->swizzle[1] = 0; 1740 r600_src->swizzle[2] = 0; 1741 r600_src->swizzle[3] = 0; 1742 r600_src->sel = 0; 1743 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) { 1744 r600_src->sel = 0; 1745 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) { 1746 r600_src->sel = 1; 1747 } else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) { 1748 r600_src->swizzle[0] = 3; 1749 r600_src->swizzle[1] = 3; 1750 r600_src->swizzle[2] = 3; 1751 r600_src->swizzle[3] = 3; 1752 r600_src->sel = 1; 1753 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) { 1754 r600_src->swizzle[0] = 2; 1755 r600_src->swizzle[1] = 2; 1756 r600_src->swizzle[2] = 2; 1757 r600_src->swizzle[3] = 2; 1758 r600_src->sel = 0; 1759 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) { 1760 r600_src->sel = 1; 1761 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) { 1762 r600_src->sel = 3; 1763 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) { 1764 r600_src->sel = 2; 1765 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) { 1766 r600_src->sel = ctx->tess_input_info; 1767 r600_src->swizzle[0] = 2; 1768 r600_src->swizzle[1] = 2; 1769 r600_src->swizzle[2] = 2; 1770 r600_src->swizzle[3] = 2; 1771 } else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) { 1772 r600_src->sel = 0; 1773 r600_src->swizzle[0] 
= 0; 1774 r600_src->swizzle[1] = 0; 1775 r600_src->swizzle[2] = 0; 1776 r600_src->swizzle[3] = 0; 1777 } else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) { 1778 r600_src->sel = 0; 1779 r600_src->swizzle[0] = 3; 1780 r600_src->swizzle[1] = 3; 1781 r600_src->swizzle[2] = 3; 1782 r600_src->swizzle[3] = 3; 1783 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) { 1784 r600_src->sel = load_block_grid_size(ctx, false); 1785 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) { 1786 r600_src->sel = load_block_grid_size(ctx, true); 1787 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) { 1788 r600_src->sel = ctx->helper_invoc_reg; 1789 r600_src->swizzle[0] = 0; 1790 r600_src->swizzle[1] = 0; 1791 r600_src->swizzle[2] = 0; 1792 r600_src->swizzle[3] = 0; 1793 } 1794 } else { 1795 if (tgsi_src->Register.Indirect) 1796 r600_src->rel = V_SQ_REL_RELATIVE; 1797 r600_src->sel = tgsi_src->Register.Index; 1798 r600_src->sel += ctx->file_offset[tgsi_src->Register.File]; 1799 } 1800 if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) { 1801 if (tgsi_src->Register.Dimension) { 1802 r600_src->kc_bank = tgsi_src->Dimension.Index; 1803 if (tgsi_src->Dimension.Indirect) { 1804 r600_src->kc_rel = 1; 1805 } 1806 } 1807 } 1808} 1809 1810static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, 1811 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan, 1812 unsigned int dst_reg) 1813{ 1814 struct r600_bytecode_vtx vtx; 1815 unsigned int ar_reg; 1816 int r; 1817 1818 if (offset) { 1819 struct r600_bytecode_alu alu; 1820 1821 memset(&alu, 0, sizeof(alu)); 1822 1823 alu.op = ALU_OP2_ADD_INT; 1824 alu.src[0].sel = ctx->bc->ar_reg; 1825 alu.src[0].chan = ar_chan; 1826 1827 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 1828 
alu.src[1].value = offset; 1829 1830 alu.dst.sel = dst_reg; 1831 alu.dst.chan = ar_chan; 1832 alu.dst.write = 1; 1833 alu.last = 1; 1834 1835 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 1836 return r; 1837 1838 ar_reg = dst_reg; 1839 } else { 1840 ar_reg = ctx->bc->ar_reg; 1841 } 1842 1843 memset(&vtx, 0, sizeof(vtx)); 1844 vtx.buffer_id = cb_idx; 1845 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 1846 vtx.src_gpr = ar_reg; 1847 vtx.src_sel_x = ar_chan; 1848 vtx.mega_fetch_count = 16; 1849 vtx.dst_gpr = dst_reg; 1850 vtx.dst_sel_x = 0; /* SEL_X */ 1851 vtx.dst_sel_y = 1; /* SEL_Y */ 1852 vtx.dst_sel_z = 2; /* SEL_Z */ 1853 vtx.dst_sel_w = 3; /* SEL_W */ 1854 vtx.data_format = FMT_32_32_32_32_FLOAT; 1855 vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */ 1856 vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */ 1857 vtx.endian = r600_endian_swap(32); 1858 vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE; 1859 1860 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx))) 1861 return r; 1862 1863 return 0; 1864} 1865 1866static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg) 1867{ 1868 struct r600_bytecode_vtx vtx; 1869 int r; 1870 unsigned index = src->Register.Index; 1871 unsigned vtx_id = src->Dimension.Index; 1872 int offset_reg = ctx->gs_rotated_input[vtx_id / 3]; 1873 int offset_chan = vtx_id % 3; 1874 int t2 = 0; 1875 1876 /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y, 1877 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */ 1878 1879 if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2) 1880 offset_chan = 3; 1881 1882 if (src->Dimension.Indirect || src->Register.Indirect) 1883 t2 = r600_get_temp(ctx); 1884 1885 if (src->Dimension.Indirect) { 1886 int treg[3]; 1887 struct r600_bytecode_alu alu; 1888 int r, i; 1889 unsigned addr_reg; 1890 addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index); 1891 if (src->DimIndirect.Index 
> 0) { 1892 r = single_alu_op2(ctx, ALU_OP1_MOV, 1893 ctx->bc->ar_reg, 0, 1894 addr_reg, 0, 1895 0, 0); 1896 if (r) 1897 return r; 1898 } 1899 /* 1900 we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt. 1901 at least this is what fglrx seems to do. */ 1902 for (i = 0; i < 3; i++) { 1903 treg[i] = r600_get_temp(ctx); 1904 } 1905 r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F); 1906 1907 for (i = 0; i < 3; i++) { 1908 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1909 alu.op = ALU_OP1_MOV; 1910 alu.src[0].sel = ctx->gs_rotated_input[0]; 1911 alu.src[0].chan = i == 2 ? 3 : i; 1912 alu.dst.sel = treg[i]; 1913 alu.dst.chan = 0; 1914 alu.dst.write = 1; 1915 alu.last = 1; 1916 r = r600_bytecode_add_alu(ctx->bc, &alu); 1917 if (r) 1918 return r; 1919 } 1920 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1921 alu.op = ALU_OP1_MOV; 1922 alu.src[0].sel = treg[0]; 1923 alu.src[0].rel = 1; 1924 alu.dst.sel = t2; 1925 alu.dst.write = 1; 1926 alu.last = 1; 1927 r = r600_bytecode_add_alu(ctx->bc, &alu); 1928 if (r) 1929 return r; 1930 offset_reg = t2; 1931 offset_chan = 0; 1932 } 1933 1934 if (src->Register.Indirect) { 1935 int addr_reg; 1936 unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID]; 1937 1938 addr_reg = get_address_file_reg(ctx, src->Indirect.Index); 1939 1940 /* pull the value from index_reg */ 1941 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 1942 t2, 1, 1943 addr_reg, 0, 1944 V_SQ_ALU_SRC_LITERAL, first); 1945 if (r) 1946 return r; 1947 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, 1948 t2, 0, 1949 t2, 1, 1950 V_SQ_ALU_SRC_LITERAL, 4, 1951 offset_reg, offset_chan); 1952 if (r) 1953 return r; 1954 offset_reg = t2; 1955 offset_chan = 0; 1956 index = src->Register.Index - first; 1957 } 1958 1959 memset(&vtx, 0, sizeof(vtx)); 1960 vtx.buffer_id = R600_GS_RING_CONST_BUFFER; 1961 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 1962 vtx.src_gpr = offset_reg; 1963 vtx.src_sel_x = offset_chan; 1964 vtx.offset = index * 16; 
/*bytes*/ 1965 vtx.mega_fetch_count = 16; 1966 vtx.dst_gpr = dst_reg; 1967 vtx.dst_sel_x = 0; /* SEL_X */ 1968 vtx.dst_sel_y = 1; /* SEL_Y */ 1969 vtx.dst_sel_z = 2; /* SEL_Z */ 1970 vtx.dst_sel_w = 3; /* SEL_W */ 1971 if (ctx->bc->chip_class >= EVERGREEN) { 1972 vtx.use_const_fields = 1; 1973 } else { 1974 vtx.data_format = FMT_32_32_32_32_FLOAT; 1975 } 1976 1977 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx))) 1978 return r; 1979 1980 return 0; 1981} 1982 1983static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx) 1984{ 1985 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 1986 unsigned i; 1987 1988 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 1989 struct tgsi_full_src_register *src = &inst->Src[i]; 1990 1991 if (src->Register.File == TGSI_FILE_INPUT) { 1992 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) { 1993 /* primitive id is in R0.z */ 1994 ctx->src[i].sel = 0; 1995 ctx->src[i].swizzle[0] = 2; 1996 } 1997 } 1998 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) { 1999 int treg = r600_get_temp(ctx); 2000 2001 fetch_gs_input(ctx, src, treg); 2002 ctx->src[i].sel = treg; 2003 ctx->src[i].rel = 0; 2004 } 2005 } 2006 return 0; 2007} 2008 2009 2010/* Tessellation shaders pass outputs to the next shader using LDS. 2011 * 2012 * LS outputs = TCS(HS) inputs 2013 * TCS(HS) outputs = TES(DS) inputs 2014 * 2015 * The LDS layout is: 2016 * - TCS inputs for patch 0 2017 * - TCS inputs for patch 1 2018 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2) 2019 * - ... 
2020 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset 2021 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset 2022 * - TCS outputs for patch 1 2023 * - Per-patch TCS outputs for patch 1 2024 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2) 2025 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2) 2026 * - ... 2027 * 2028 * All three shaders VS(LS), TCS, TES share the same LDS space. 2029 */ 2030/* this will return with the dw address in temp_reg.x */ 2031static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg, 2032 const struct tgsi_full_dst_register *dst, 2033 const struct tgsi_full_src_register *src, 2034 int stride_bytes_reg, int stride_bytes_chan) 2035{ 2036 struct tgsi_full_dst_register reg; 2037 ubyte *name, *index, *array_first; 2038 int r; 2039 int param; 2040 struct tgsi_shader_info *info = &ctx->info; 2041 /* Set the register description. The address computation is the same 2042 * for sources and destinations. */ 2043 if (src) { 2044 reg.Register.File = src->Register.File; 2045 reg.Register.Index = src->Register.Index; 2046 reg.Register.Indirect = src->Register.Indirect; 2047 reg.Register.Dimension = src->Register.Dimension; 2048 reg.Indirect = src->Indirect; 2049 reg.Dimension = src->Dimension; 2050 reg.DimIndirect = src->DimIndirect; 2051 } else 2052 reg = *dst; 2053 2054 /* If the register is 2-dimensional (e.g. an array of vertices 2055 * in a primitive), calculate the base address of the vertex. 
*/ 2056 if (reg.Register.Dimension) { 2057 int sel, chan; 2058 if (reg.Dimension.Indirect) { 2059 unsigned addr_reg; 2060 assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS); 2061 2062 addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index); 2063 /* pull the value from index_reg */ 2064 sel = addr_reg; 2065 chan = 0; 2066 } else { 2067 sel = V_SQ_ALU_SRC_LITERAL; 2068 chan = reg.Dimension.Index; 2069 } 2070 2071 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, 2072 temp_reg, 0, 2073 stride_bytes_reg, stride_bytes_chan, 2074 sel, chan, 2075 temp_reg, 0); 2076 if (r) 2077 return r; 2078 } 2079 2080 if (reg.Register.File == TGSI_FILE_INPUT) { 2081 name = info->input_semantic_name; 2082 index = info->input_semantic_index; 2083 array_first = info->input_array_first; 2084 } else if (reg.Register.File == TGSI_FILE_OUTPUT) { 2085 name = info->output_semantic_name; 2086 index = info->output_semantic_index; 2087 array_first = info->output_array_first; 2088 } else { 2089 assert(0); 2090 return -1; 2091 } 2092 if (reg.Register.Indirect) { 2093 int addr_reg; 2094 int first; 2095 /* Add the relative address of the element. 
*/ 2096 if (reg.Indirect.ArrayID) 2097 first = array_first[reg.Indirect.ArrayID]; 2098 else 2099 first = reg.Register.Index; 2100 2101 addr_reg = get_address_file_reg(ctx, reg.Indirect.Index); 2102 2103 /* pull the value from index_reg */ 2104 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, 2105 temp_reg, 0, 2106 V_SQ_ALU_SRC_LITERAL, 16, 2107 addr_reg, 0, 2108 temp_reg, 0); 2109 if (r) 2110 return r; 2111 2112 param = r600_get_lds_unique_index(name[first], 2113 index[first]); 2114 2115 } else { 2116 param = r600_get_lds_unique_index(name[reg.Register.Index], 2117 index[reg.Register.Index]); 2118 } 2119 2120 /* add to base_addr - passed in temp_reg.x */ 2121 if (param) { 2122 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 2123 temp_reg, 0, 2124 temp_reg, 0, 2125 V_SQ_ALU_SRC_LITERAL, param * 16); 2126 if (r) 2127 return r; 2128 2129 } 2130 return 0; 2131} 2132 2133static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg, 2134 unsigned dst_reg, unsigned mask) 2135{ 2136 struct r600_bytecode_alu alu; 2137 int r, i, lasti; 2138 2139 if ((ctx->bc->cf_last->ndw>>1) >= 0x60) 2140 ctx->bc->force_add_cf = 1; 2141 2142 lasti = tgsi_last_instruction(mask); 2143 for (i = 1; i <= lasti; i++) { 2144 if (!(mask & (1 << i))) 2145 continue; 2146 2147 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 2148 temp_reg, i, 2149 temp_reg, 0, 2150 V_SQ_ALU_SRC_LITERAL, 4 * i); 2151 if (r) 2152 return r; 2153 } 2154 for (i = 0; i <= lasti; i++) { 2155 if (!(mask & (1 << i))) 2156 continue; 2157 2158 /* emit an LDS_READ_RET */ 2159 memset(&alu, 0, sizeof(alu)); 2160 alu.op = LDS_OP1_LDS_READ_RET; 2161 alu.src[0].sel = temp_reg; 2162 alu.src[0].chan = i; 2163 alu.src[1].sel = V_SQ_ALU_SRC_0; 2164 alu.src[2].sel = V_SQ_ALU_SRC_0; 2165 alu.dst.chan = 0; 2166 alu.is_lds_idx_op = true; 2167 alu.last = 1; 2168 r = r600_bytecode_add_alu(ctx->bc, &alu); 2169 if (r) 2170 return r; 2171 } 2172 for (i = 0; i <= lasti; i++) { 2173 if (!(mask & (1 << i))) 2174 continue; 2175 2176 /* then read from 
LDS_OQ_A_POP */ 2177 memset(&alu, 0, sizeof(alu)); 2178 2179 alu.op = ALU_OP1_MOV; 2180 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP; 2181 alu.src[0].chan = 0; 2182 alu.dst.sel = dst_reg; 2183 alu.dst.chan = i; 2184 alu.dst.write = 1; 2185 alu.last = 1; 2186 r = r600_bytecode_add_alu(ctx->bc, &alu); 2187 if (r) 2188 return r; 2189 } 2190 return 0; 2191} 2192 2193static int fetch_mask(struct tgsi_src_register *reg) 2194{ 2195 int mask = 0; 2196 mask |= 1 << reg->SwizzleX; 2197 mask |= 1 << reg->SwizzleY; 2198 mask |= 1 << reg->SwizzleZ; 2199 mask |= 1 << reg->SwizzleW; 2200 return mask; 2201} 2202 2203static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg) 2204{ 2205 int r; 2206 unsigned temp_reg = r600_get_temp(ctx); 2207 2208 r = get_lds_offset0(ctx, 2, temp_reg, 2209 src->Register.Dimension ? false : true); 2210 if (r) 2211 return r; 2212 2213 /* the base address is now in temp.x */ 2214 r = r600_get_byte_address(ctx, temp_reg, 2215 NULL, src, ctx->tess_output_info, 1); 2216 if (r) 2217 return r; 2218 2219 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register)); 2220 if (r) 2221 return r; 2222 return 0; 2223} 2224 2225static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg) 2226{ 2227 int r; 2228 unsigned temp_reg = r600_get_temp(ctx); 2229 2230 /* t.x = ips * r0.y */ 2231 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24, 2232 temp_reg, 0, 2233 ctx->tess_input_info, 0, 2234 0, 1); 2235 2236 if (r) 2237 return r; 2238 2239 /* the base address is now in temp.x */ 2240 r = r600_get_byte_address(ctx, temp_reg, 2241 NULL, src, ctx->tess_input_info, 1); 2242 if (r) 2243 return r; 2244 2245 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register)); 2246 if (r) 2247 return r; 2248 return 0; 2249} 2250 2251static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg) 2252{ 
2253 int r; 2254 unsigned temp_reg = r600_get_temp(ctx); 2255 2256 r = get_lds_offset0(ctx, 1, temp_reg, 2257 src->Register.Dimension ? false : true); 2258 if (r) 2259 return r; 2260 /* the base address is now in temp.x */ 2261 r = r600_get_byte_address(ctx, temp_reg, 2262 NULL, src, 2263 ctx->tess_output_info, 1); 2264 if (r) 2265 return r; 2266 2267 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register)); 2268 if (r) 2269 return r; 2270 return 0; 2271} 2272 2273static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx) 2274{ 2275 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 2276 unsigned i; 2277 2278 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 2279 struct tgsi_full_src_register *src = &inst->Src[i]; 2280 2281 if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) { 2282 int treg = r600_get_temp(ctx); 2283 fetch_tes_input(ctx, src, treg); 2284 ctx->src[i].sel = treg; 2285 ctx->src[i].rel = 0; 2286 } 2287 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) { 2288 int treg = r600_get_temp(ctx); 2289 fetch_tcs_input(ctx, src, treg); 2290 ctx->src[i].sel = treg; 2291 ctx->src[i].rel = 0; 2292 } 2293 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) { 2294 int treg = r600_get_temp(ctx); 2295 fetch_tcs_output(ctx, src, treg); 2296 ctx->src[i].sel = treg; 2297 ctx->src[i].rel = 0; 2298 } 2299 } 2300 return 0; 2301} 2302 2303static int tgsi_split_constant(struct r600_shader_ctx *ctx) 2304{ 2305 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 2306 struct r600_bytecode_alu alu; 2307 int i, j, k, nconst, r; 2308 2309 for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) { 2310 if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) { 2311 nconst++; 2312 } 2313 tgsi_src(ctx, &inst->Src[i], &ctx->src[i]); 2314 } 2315 for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) { 2316 if 
(inst->Src[i].Register.File != TGSI_FILE_CONSTANT) { 2317 continue; 2318 } 2319 2320 if (ctx->src[i].rel) { 2321 int chan = inst->Src[i].Indirect.Swizzle; 2322 int treg = r600_get_temp(ctx); 2323 if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg))) 2324 return r; 2325 2326 ctx->src[i].kc_bank = 0; 2327 ctx->src[i].kc_rel = 0; 2328 ctx->src[i].sel = treg; 2329 ctx->src[i].rel = 0; 2330 j--; 2331 } else if (j > 0) { 2332 int treg = r600_get_temp(ctx); 2333 for (k = 0; k < 4; k++) { 2334 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2335 alu.op = ALU_OP1_MOV; 2336 alu.src[0].sel = ctx->src[i].sel; 2337 alu.src[0].chan = k; 2338 alu.src[0].rel = ctx->src[i].rel; 2339 alu.src[0].kc_bank = ctx->src[i].kc_bank; 2340 alu.src[0].kc_rel = ctx->src[i].kc_rel; 2341 alu.dst.sel = treg; 2342 alu.dst.chan = k; 2343 alu.dst.write = 1; 2344 if (k == 3) 2345 alu.last = 1; 2346 r = r600_bytecode_add_alu(ctx->bc, &alu); 2347 if (r) 2348 return r; 2349 } 2350 ctx->src[i].sel = treg; 2351 ctx->src[i].rel =0; 2352 j--; 2353 } 2354 } 2355 return 0; 2356} 2357 2358/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */ 2359static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx) 2360{ 2361 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 2362 struct r600_bytecode_alu alu; 2363 int i, j, k, nliteral, r; 2364 2365 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) { 2366 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 2367 nliteral++; 2368 } 2369 } 2370 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) { 2371 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 2372 int treg = r600_get_temp(ctx); 2373 for (k = 0; k < 4; k++) { 2374 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2375 alu.op = ALU_OP1_MOV; 2376 alu.src[0].sel = ctx->src[i].sel; 2377 alu.src[0].chan = k; 2378 alu.src[0].value = 
ctx->src[i].value[k]; 2379 alu.dst.sel = treg; 2380 alu.dst.chan = k; 2381 alu.dst.write = 1; 2382 if (k == 3) 2383 alu.last = 1; 2384 r = r600_bytecode_add_alu(ctx->bc, &alu); 2385 if (r) 2386 return r; 2387 } 2388 ctx->src[i].sel = treg; 2389 j--; 2390 } 2391 } 2392 return 0; 2393} 2394 2395static int process_twoside_color_inputs(struct r600_shader_ctx *ctx) 2396{ 2397 int i, r, count = ctx->shader->ninput; 2398 2399 for (i = 0; i < count; i++) { 2400 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) { 2401 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input); 2402 if (r) 2403 return r; 2404 } 2405 } 2406 return 0; 2407} 2408 2409static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so, 2410 int stream, unsigned *stream_item_size UNUSED) 2411{ 2412 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS]; 2413 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS]; 2414 int j, r; 2415 unsigned i; 2416 2417 /* Sanity checking. */ 2418 if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) { 2419 R600_ERR("Too many stream outputs: %d\n", so->num_outputs); 2420 r = -EINVAL; 2421 goto out_err; 2422 } 2423 for (i = 0; i < so->num_outputs; i++) { 2424 if (so->output[i].output_buffer >= 4) { 2425 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n", 2426 so->output[i].output_buffer); 2427 r = -EINVAL; 2428 goto out_err; 2429 } 2430 } 2431 2432 /* Initialize locations where the outputs are stored. */ 2433 for (i = 0; i < so->num_outputs; i++) { 2434 2435 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr; 2436 start_comp[i] = so->output[i].start_component; 2437 /* Lower outputs with dst_offset < start_component. 2438 * 2439 * We can only output 4D vectors with a write mask, e.g. we can 2440 * only output the W component at offset 3, etc. If we want 2441 * to store Y, Z, or W at buffer offset 0, we need to use MOV 2442 * to move it to X and output X. 
*/ 2443 if (so->output[i].dst_offset < so->output[i].start_component) { 2444 unsigned tmp = r600_get_temp(ctx); 2445 2446 for (j = 0; j < so->output[i].num_components; j++) { 2447 struct r600_bytecode_alu alu; 2448 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2449 alu.op = ALU_OP1_MOV; 2450 alu.src[0].sel = so_gpr[i]; 2451 alu.src[0].chan = so->output[i].start_component + j; 2452 2453 alu.dst.sel = tmp; 2454 alu.dst.chan = j; 2455 alu.dst.write = 1; 2456 if (j == so->output[i].num_components - 1) 2457 alu.last = 1; 2458 r = r600_bytecode_add_alu(ctx->bc, &alu); 2459 if (r) 2460 return r; 2461 } 2462 start_comp[i] = 0; 2463 so_gpr[i] = tmp; 2464 } 2465 } 2466 2467 /* Write outputs to buffers. */ 2468 for (i = 0; i < so->num_outputs; i++) { 2469 struct r600_bytecode_output output; 2470 2471 if (stream != -1 && stream != so->output[i].stream) 2472 continue; 2473 2474 memset(&output, 0, sizeof(struct r600_bytecode_output)); 2475 output.gpr = so_gpr[i]; 2476 output.elem_size = so->output[i].num_components - 1; 2477 if (output.elem_size == 2) 2478 output.elem_size = 3; // 3 not supported, write 4 with junk at end 2479 output.array_base = so->output[i].dst_offset - start_comp[i]; 2480 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; 2481 output.burst_count = 1; 2482 /* array_size is an upper limit for the burst_count 2483 * with MEM_STREAM instructions */ 2484 output.array_size = 0xFFF; 2485 output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i]; 2486 2487 if (ctx->bc->chip_class >= EVERGREEN) { 2488 switch (so->output[i].output_buffer) { 2489 case 0: 2490 output.op = CF_OP_MEM_STREAM0_BUF0; 2491 break; 2492 case 1: 2493 output.op = CF_OP_MEM_STREAM0_BUF1; 2494 break; 2495 case 2: 2496 output.op = CF_OP_MEM_STREAM0_BUF2; 2497 break; 2498 case 3: 2499 output.op = CF_OP_MEM_STREAM0_BUF3; 2500 break; 2501 } 2502 output.op += so->output[i].stream * 4; 2503 assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3); 
2504 ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4; 2505 } else { 2506 switch (so->output[i].output_buffer) { 2507 case 0: 2508 output.op = CF_OP_MEM_STREAM0; 2509 break; 2510 case 1: 2511 output.op = CF_OP_MEM_STREAM1; 2512 break; 2513 case 2: 2514 output.op = CF_OP_MEM_STREAM2; 2515 break; 2516 case 3: 2517 output.op = CF_OP_MEM_STREAM3; 2518 break; 2519 } 2520 ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer; 2521 } 2522 r = r600_bytecode_add_output(ctx->bc, &output); 2523 if (r) 2524 goto out_err; 2525 } 2526 return 0; 2527out_err: 2528 return r; 2529} 2530 2531static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx) 2532{ 2533 struct r600_bytecode_alu alu; 2534 unsigned reg; 2535 2536 if (!ctx->shader->vs_out_edgeflag) 2537 return; 2538 2539 reg = ctx->shader->output[ctx->edgeflag_output].gpr; 2540 2541 /* clamp(x, 0, 1) */ 2542 memset(&alu, 0, sizeof(alu)); 2543 alu.op = ALU_OP1_MOV; 2544 alu.src[0].sel = reg; 2545 alu.dst.sel = reg; 2546 alu.dst.write = 1; 2547 alu.dst.clamp = 1; 2548 alu.last = 1; 2549 r600_bytecode_add_alu(ctx->bc, &alu); 2550 2551 memset(&alu, 0, sizeof(alu)); 2552 alu.op = ALU_OP1_FLT_TO_INT; 2553 alu.src[0].sel = reg; 2554 alu.dst.sel = reg; 2555 alu.dst.write = 1; 2556 alu.last = 1; 2557 r600_bytecode_add_alu(ctx->bc, &alu); 2558} 2559 2560int generate_gs_copy_shader(struct r600_context *rctx, 2561 struct r600_pipe_shader *gs, 2562 struct pipe_stream_output_info *so) 2563{ 2564 struct r600_shader_ctx ctx = {}; 2565 struct r600_shader *gs_shader = &gs->shader; 2566 struct r600_pipe_shader *cshader; 2567 unsigned ocnt = gs_shader->noutput; 2568 struct r600_bytecode_alu alu; 2569 struct r600_bytecode_vtx vtx; 2570 struct r600_bytecode_output output; 2571 struct r600_bytecode_cf *cf_jump, *cf_pop, 2572 *last_exp_pos = NULL, *last_exp_param = NULL; 2573 int next_clip_pos = 61, next_param = 0; 2574 unsigned i, j; 2575 int ring; 2576 bool only_ring_0 = true; 2577 
cshader = calloc(1, sizeof(struct r600_pipe_shader)); 2578 if (!cshader) 2579 return 0; 2580 2581 memcpy(cshader->shader.output, gs_shader->output, ocnt * 2582 sizeof(struct r600_shader_io)); 2583 2584 cshader->shader.noutput = ocnt; 2585 2586 ctx.shader = &cshader->shader; 2587 ctx.bc = &ctx.shader->bc; 2588 ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX; 2589 2590 r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family, 2591 rctx->screen->has_compressed_msaa_texturing); 2592 2593 ctx.bc->isa = rctx->isa; 2594 2595 cf_jump = NULL; 2596 memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes)); 2597 2598 /* R0.x = R0.x & 0x3fffffff */ 2599 memset(&alu, 0, sizeof(alu)); 2600 alu.op = ALU_OP2_AND_INT; 2601 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2602 alu.src[1].value = 0x3fffffff; 2603 alu.dst.write = 1; 2604 r600_bytecode_add_alu(ctx.bc, &alu); 2605 2606 /* R0.y = R0.x >> 30 */ 2607 memset(&alu, 0, sizeof(alu)); 2608 alu.op = ALU_OP2_LSHR_INT; 2609 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2610 alu.src[1].value = 0x1e; 2611 alu.dst.chan = 1; 2612 alu.dst.write = 1; 2613 alu.last = 1; 2614 r600_bytecode_add_alu(ctx.bc, &alu); 2615 2616 /* fetch vertex data from GSVS ring */ 2617 for (i = 0; i < ocnt; ++i) { 2618 struct r600_shader_io *out = &ctx.shader->output[i]; 2619 2620 out->gpr = i + 1; 2621 out->ring_offset = i * 16; 2622 2623 memset(&vtx, 0, sizeof(vtx)); 2624 vtx.op = FETCH_OP_VFETCH; 2625 vtx.buffer_id = R600_GS_RING_CONST_BUFFER; 2626 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 2627 vtx.mega_fetch_count = 16; 2628 vtx.offset = out->ring_offset; 2629 vtx.dst_gpr = out->gpr; 2630 vtx.src_gpr = 0; 2631 vtx.dst_sel_x = 0; 2632 vtx.dst_sel_y = 1; 2633 vtx.dst_sel_z = 2; 2634 vtx.dst_sel_w = 3; 2635 if (rctx->b.chip_class >= EVERGREEN) { 2636 vtx.use_const_fields = 1; 2637 } else { 2638 vtx.data_format = FMT_32_32_32_32_FLOAT; 2639 } 2640 2641 r600_bytecode_add_vtx(ctx.bc, &vtx); 2642 } 2643 ctx.temp_reg = i + 1; 2644 for (ring = 3; 
ring >= 0; --ring) { 2645 bool enabled = false; 2646 for (i = 0; i < so->num_outputs; i++) { 2647 if (so->output[i].stream == ring) { 2648 enabled = true; 2649 if (ring > 0) 2650 only_ring_0 = false; 2651 break; 2652 } 2653 } 2654 if (ring != 0 && !enabled) { 2655 cshader->shader.ring_item_sizes[ring] = 0; 2656 continue; 2657 } 2658 2659 if (cf_jump) { 2660 // Patch up jump label 2661 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP); 2662 cf_pop = ctx.bc->cf_last; 2663 2664 cf_jump->cf_addr = cf_pop->id + 2; 2665 cf_jump->pop_count = 1; 2666 cf_pop->cf_addr = cf_pop->id + 2; 2667 cf_pop->pop_count = 1; 2668 } 2669 2670 /* PRED_SETE_INT __, R0.y, ring */ 2671 memset(&alu, 0, sizeof(alu)); 2672 alu.op = ALU_OP2_PRED_SETE_INT; 2673 alu.src[0].chan = 1; 2674 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2675 alu.src[1].value = ring; 2676 alu.execute_mask = 1; 2677 alu.update_pred = 1; 2678 alu.last = 1; 2679 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE); 2680 2681 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP); 2682 cf_jump = ctx.bc->cf_last; 2683 2684 if (enabled) 2685 emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]); 2686 cshader->shader.ring_item_sizes[ring] = ocnt * 16; 2687 } 2688 2689 /* bc adds nops - copy it */ 2690 if (ctx.bc->chip_class == R600) { 2691 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2692 alu.op = ALU_OP0_NOP; 2693 alu.last = 1; 2694 r600_bytecode_add_alu(ctx.bc, &alu); 2695 2696 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); 2697 } 2698 2699 /* export vertex data */ 2700 /* XXX factor out common code with r600_shader_from_tgsi ? 
*/ 2701 for (i = 0; i < ocnt; ++i) { 2702 struct r600_shader_io *out = &ctx.shader->output[i]; 2703 bool instream0 = true; 2704 if (out->name == TGSI_SEMANTIC_CLIPVERTEX) 2705 continue; 2706 2707 for (j = 0; j < so->num_outputs; j++) { 2708 if (so->output[j].register_index == i) { 2709 if (so->output[j].stream == 0) 2710 break; 2711 if (so->output[j].stream > 0) 2712 instream0 = false; 2713 } 2714 } 2715 if (!instream0) 2716 continue; 2717 memset(&output, 0, sizeof(output)); 2718 output.gpr = out->gpr; 2719 output.elem_size = 3; 2720 output.swizzle_x = 0; 2721 output.swizzle_y = 1; 2722 output.swizzle_z = 2; 2723 output.swizzle_w = 3; 2724 output.burst_count = 1; 2725 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 2726 output.op = CF_OP_EXPORT; 2727 switch (out->name) { 2728 case TGSI_SEMANTIC_POSITION: 2729 output.array_base = 60; 2730 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 2731 break; 2732 2733 case TGSI_SEMANTIC_PSIZE: 2734 output.array_base = 61; 2735 if (next_clip_pos == 61) 2736 next_clip_pos = 62; 2737 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 2738 output.swizzle_y = 7; 2739 output.swizzle_z = 7; 2740 output.swizzle_w = 7; 2741 ctx.shader->vs_out_misc_write = 1; 2742 ctx.shader->vs_out_point_size = 1; 2743 break; 2744 case TGSI_SEMANTIC_LAYER: 2745 if (out->spi_sid) { 2746 /* duplicate it as PARAM to pass to the pixel shader */ 2747 output.array_base = next_param++; 2748 r600_bytecode_add_output(ctx.bc, &output); 2749 last_exp_param = ctx.bc->cf_last; 2750 } 2751 output.array_base = 61; 2752 if (next_clip_pos == 61) 2753 next_clip_pos = 62; 2754 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 2755 output.swizzle_x = 7; 2756 output.swizzle_y = 7; 2757 output.swizzle_z = 0; 2758 output.swizzle_w = 7; 2759 ctx.shader->vs_out_misc_write = 1; 2760 ctx.shader->vs_out_layer = 1; 2761 break; 2762 case TGSI_SEMANTIC_VIEWPORT_INDEX: 2763 if (out->spi_sid) { 2764 /* duplicate it as PARAM to pass to the pixel shader */ 
2765 output.array_base = next_param++; 2766 r600_bytecode_add_output(ctx.bc, &output); 2767 last_exp_param = ctx.bc->cf_last; 2768 } 2769 output.array_base = 61; 2770 if (next_clip_pos == 61) 2771 next_clip_pos = 62; 2772 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 2773 ctx.shader->vs_out_misc_write = 1; 2774 ctx.shader->vs_out_viewport = 1; 2775 output.swizzle_x = 7; 2776 output.swizzle_y = 7; 2777 output.swizzle_z = 7; 2778 output.swizzle_w = 0; 2779 break; 2780 case TGSI_SEMANTIC_CLIPDIST: 2781 /* spi_sid is 0 for clipdistance outputs that were generated 2782 * for clipvertex - we don't need to pass them to PS */ 2783 ctx.shader->clip_dist_write = gs->shader.clip_dist_write; 2784 ctx.shader->cull_dist_write = gs->shader.cull_dist_write; 2785 ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask; 2786 if (out->spi_sid) { 2787 /* duplicate it as PARAM to pass to the pixel shader */ 2788 output.array_base = next_param++; 2789 r600_bytecode_add_output(ctx.bc, &output); 2790 last_exp_param = ctx.bc->cf_last; 2791 } 2792 output.array_base = next_clip_pos++; 2793 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 2794 break; 2795 case TGSI_SEMANTIC_FOG: 2796 output.swizzle_y = 4; /* 0 */ 2797 output.swizzle_z = 4; /* 0 */ 2798 output.swizzle_w = 5; /* 1 */ 2799 break; 2800 default: 2801 output.array_base = next_param++; 2802 break; 2803 } 2804 r600_bytecode_add_output(ctx.bc, &output); 2805 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM) 2806 last_exp_param = ctx.bc->cf_last; 2807 else 2808 last_exp_pos = ctx.bc->cf_last; 2809 } 2810 2811 if (!last_exp_pos) { 2812 memset(&output, 0, sizeof(output)); 2813 output.gpr = 0; 2814 output.elem_size = 3; 2815 output.swizzle_x = 7; 2816 output.swizzle_y = 7; 2817 output.swizzle_z = 7; 2818 output.swizzle_w = 7; 2819 output.burst_count = 1; 2820 output.type = 2; 2821 output.op = CF_OP_EXPORT; 2822 output.array_base = 60; 2823 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 2824 
r600_bytecode_add_output(ctx.bc, &output); 2825 last_exp_pos = ctx.bc->cf_last; 2826 } 2827 2828 if (!last_exp_param) { 2829 memset(&output, 0, sizeof(output)); 2830 output.gpr = 0; 2831 output.elem_size = 3; 2832 output.swizzle_x = 7; 2833 output.swizzle_y = 7; 2834 output.swizzle_z = 7; 2835 output.swizzle_w = 7; 2836 output.burst_count = 1; 2837 output.type = 2; 2838 output.op = CF_OP_EXPORT; 2839 output.array_base = next_param++; 2840 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 2841 r600_bytecode_add_output(ctx.bc, &output); 2842 last_exp_param = ctx.bc->cf_last; 2843 } 2844 2845 last_exp_pos->op = CF_OP_EXPORT_DONE; 2846 last_exp_param->op = CF_OP_EXPORT_DONE; 2847 2848 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP); 2849 cf_pop = ctx.bc->cf_last; 2850 2851 cf_jump->cf_addr = cf_pop->id + 2; 2852 cf_jump->pop_count = 1; 2853 cf_pop->cf_addr = cf_pop->id + 2; 2854 cf_pop->pop_count = 1; 2855 2856 if (ctx.bc->chip_class == CAYMAN) 2857 cm_bytecode_add_cf_end(ctx.bc); 2858 else { 2859 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); 2860 ctx.bc->cf_last->end_of_program = 1; 2861 } 2862 2863 gs->gs_copy_shader = cshader; 2864 cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask; 2865 2866 ctx.bc->nstack = 1; 2867 2868 return r600_bytecode_build(ctx.bc); 2869} 2870 2871static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind) 2872{ 2873 if (ind) { 2874 struct r600_bytecode_alu alu; 2875 int r; 2876 2877 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2878 alu.op = ALU_OP2_ADD_INT; 2879 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx]; 2880 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2881 alu.src[1].value = ctx->gs_out_ring_offset >> 4; 2882 alu.dst.sel = ctx->gs_export_gpr_tregs[idx]; 2883 alu.dst.write = 1; 2884 alu.last = 1; 2885 r = r600_bytecode_add_alu(ctx->bc, &alu); 2886 if (r) 2887 return r; 2888 } 2889 return 0; 2890} 2891 2892static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct 
pipe_stream_output_info *so UNUSED, int stream, bool ind) 2893{ 2894 struct r600_bytecode_output output; 2895 int ring_offset; 2896 unsigned i, k; 2897 int effective_stream = stream == -1 ? 0 : stream; 2898 int idx = 0; 2899 2900 for (i = 0; i < ctx->shader->noutput; i++) { 2901 if (ctx->gs_for_vs) { 2902 /* for ES we need to lookup corresponding ring offset expected by GS 2903 * (map this output to GS input by name and sid) */ 2904 /* FIXME precompute offsets */ 2905 ring_offset = -1; 2906 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) { 2907 struct r600_shader_io *in = &ctx->gs_for_vs->input[k]; 2908 struct r600_shader_io *out = &ctx->shader->output[i]; 2909 if (in->name == out->name && in->sid == out->sid) 2910 ring_offset = in->ring_offset; 2911 } 2912 2913 if (ring_offset == -1) 2914 continue; 2915 } else { 2916 ring_offset = idx * 16; 2917 idx++; 2918 } 2919 2920 if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION) 2921 continue; 2922 /* next_ring_offset after parsing input decls contains total size of 2923 * single vertex data, gs_next_vertex - current vertex index */ 2924 if (!ind) 2925 ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex; 2926 2927 memset(&output, 0, sizeof(struct r600_bytecode_output)); 2928 output.gpr = ctx->shader->output[i].gpr; 2929 output.elem_size = 3; 2930 output.comp_mask = 0xF; 2931 output.burst_count = 1; 2932 2933 if (ind) 2934 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; 2935 else 2936 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; 2937 2938 switch (stream) { 2939 default: 2940 case 0: 2941 output.op = CF_OP_MEM_RING; break; 2942 case 1: 2943 output.op = CF_OP_MEM_RING1; break; 2944 case 2: 2945 output.op = CF_OP_MEM_RING2; break; 2946 case 3: 2947 output.op = CF_OP_MEM_RING3; break; 2948 } 2949 2950 if (ind) { 2951 output.array_base = ring_offset >> 2; /* in dwords */ 2952 output.array_size = 0xfff; 2953 output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream]; 2954 } 
else 2955 output.array_base = ring_offset >> 2; /* in dwords */ 2956 r600_bytecode_add_output(ctx->bc, &output); 2957 } 2958 2959 ++ctx->gs_next_vertex; 2960 return 0; 2961} 2962 2963 2964static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx) 2965{ 2966 int r; 2967 struct r600_bytecode_vtx vtx; 2968 int temp_val = ctx->temp_reg; 2969 /* need to store the TCS output somewhere */ 2970 r = single_alu_op2(ctx, ALU_OP1_MOV, 2971 temp_val, 0, 2972 V_SQ_ALU_SRC_LITERAL, 0, 2973 0, 0); 2974 if (r) 2975 return r; 2976 2977 /* used by VS/TCS */ 2978 if (ctx->tess_input_info) { 2979 /* fetch tcs input values into resv space */ 2980 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 2981 vtx.op = FETCH_OP_VFETCH; 2982 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER; 2983 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 2984 vtx.mega_fetch_count = 16; 2985 vtx.data_format = FMT_32_32_32_32; 2986 vtx.num_format_all = 2; 2987 vtx.format_comp_all = 1; 2988 vtx.use_const_fields = 0; 2989 vtx.endian = r600_endian_swap(32); 2990 vtx.srf_mode_all = 1; 2991 vtx.offset = 0; 2992 vtx.dst_gpr = ctx->tess_input_info; 2993 vtx.dst_sel_x = 0; 2994 vtx.dst_sel_y = 1; 2995 vtx.dst_sel_z = 2; 2996 vtx.dst_sel_w = 3; 2997 vtx.src_gpr = temp_val; 2998 vtx.src_sel_x = 0; 2999 3000 r = r600_bytecode_add_vtx(ctx->bc, &vtx); 3001 if (r) 3002 return r; 3003 } 3004 3005 /* used by TCS/TES */ 3006 if (ctx->tess_output_info) { 3007 /* fetch tcs output values into resv space */ 3008 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 3009 vtx.op = FETCH_OP_VFETCH; 3010 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER; 3011 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 3012 vtx.mega_fetch_count = 16; 3013 vtx.data_format = FMT_32_32_32_32; 3014 vtx.num_format_all = 2; 3015 vtx.format_comp_all = 1; 3016 vtx.use_const_fields = 0; 3017 vtx.endian = r600_endian_swap(32); 3018 vtx.srf_mode_all = 1; 3019 vtx.offset = 16; 3020 vtx.dst_gpr = ctx->tess_output_info; 3021 vtx.dst_sel_x = 0; 3022 vtx.dst_sel_y = 1; 3023 
vtx.dst_sel_z = 2; 3024 vtx.dst_sel_w = 3; 3025 vtx.src_gpr = temp_val; 3026 vtx.src_sel_x = 0; 3027 3028 r = r600_bytecode_add_vtx(ctx->bc, &vtx); 3029 if (r) 3030 return r; 3031 } 3032 return 0; 3033} 3034 3035static int emit_lds_vs_writes(struct r600_shader_ctx *ctx) 3036{ 3037 int j, r; 3038 int temp_reg; 3039 unsigned i; 3040 3041 /* fetch tcs input values into input_vals */ 3042 ctx->tess_input_info = r600_get_temp(ctx); 3043 ctx->tess_output_info = 0; 3044 r = r600_fetch_tess_io_info(ctx); 3045 if (r) 3046 return r; 3047 3048 temp_reg = r600_get_temp(ctx); 3049 /* dst reg contains LDS address stride * idx */ 3050 /* MUL vertexID, vertex_dw_stride */ 3051 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24, 3052 temp_reg, 0, 3053 ctx->tess_input_info, 1, 3054 0, 1); /* rel id in r0.y? */ 3055 if (r) 3056 return r; 3057 3058 for (i = 0; i < ctx->shader->noutput; i++) { 3059 struct r600_bytecode_alu alu; 3060 int param = r600_get_lds_unique_index(ctx->shader->output[i].name, 3061 ctx->shader->output[i].sid); 3062 3063 if (param) { 3064 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 3065 temp_reg, 1, 3066 temp_reg, 0, 3067 V_SQ_ALU_SRC_LITERAL, param * 16); 3068 if (r) 3069 return r; 3070 } 3071 3072 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 3073 temp_reg, 2, 3074 temp_reg, param ? 1 : 0, 3075 V_SQ_ALU_SRC_LITERAL, 8); 3076 if (r) 3077 return r; 3078 3079 3080 for (j = 0; j < 2; j++) { 3081 int chan = (j == 1) ? 2 : (param ? 
1 : 0); 3082 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3083 alu.op = LDS_OP3_LDS_WRITE_REL; 3084 alu.src[0].sel = temp_reg; 3085 alu.src[0].chan = chan; 3086 alu.src[1].sel = ctx->shader->output[i].gpr; 3087 alu.src[1].chan = j * 2; 3088 alu.src[2].sel = ctx->shader->output[i].gpr; 3089 alu.src[2].chan = (j * 2) + 1; 3090 alu.last = 1; 3091 alu.dst.chan = 0; 3092 alu.lds_idx = 1; 3093 alu.is_lds_idx_op = true; 3094 r = r600_bytecode_add_alu(ctx->bc, &alu); 3095 if (r) 3096 return r; 3097 } 3098 } 3099 return 0; 3100} 3101 3102static int r600_store_tcs_output(struct r600_shader_ctx *ctx) 3103{ 3104 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 3105 const struct tgsi_full_dst_register *dst = &inst->Dst[0]; 3106 int i, r, lasti; 3107 int temp_reg = r600_get_temp(ctx); 3108 struct r600_bytecode_alu alu; 3109 unsigned write_mask = dst->Register.WriteMask; 3110 3111 if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT) 3112 return 0; 3113 3114 r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? 
false : true); 3115 if (r) 3116 return r; 3117 3118 /* the base address is now in temp.x */ 3119 r = r600_get_byte_address(ctx, temp_reg, 3120 &inst->Dst[0], NULL, ctx->tess_output_info, 1); 3121 if (r) 3122 return r; 3123 3124 /* LDS write */ 3125 lasti = tgsi_last_instruction(write_mask); 3126 for (i = 1; i <= lasti; i++) { 3127 3128 if (!(write_mask & (1 << i))) 3129 continue; 3130 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 3131 temp_reg, i, 3132 temp_reg, 0, 3133 V_SQ_ALU_SRC_LITERAL, 4 * i); 3134 if (r) 3135 return r; 3136 } 3137 3138 for (i = 0; i <= lasti; i++) { 3139 if (!(write_mask & (1 << i))) 3140 continue; 3141 3142 if ((i == 0 && ((write_mask & 3) == 3)) || 3143 (i == 2 && ((write_mask & 0xc) == 0xc))) { 3144 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3145 alu.op = LDS_OP3_LDS_WRITE_REL; 3146 alu.src[0].sel = temp_reg; 3147 alu.src[0].chan = i; 3148 3149 alu.src[1].sel = dst->Register.Index; 3150 alu.src[1].sel += ctx->file_offset[dst->Register.File]; 3151 alu.src[1].chan = i; 3152 3153 alu.src[2].sel = dst->Register.Index; 3154 alu.src[2].sel += ctx->file_offset[dst->Register.File]; 3155 alu.src[2].chan = i + 1; 3156 alu.lds_idx = 1; 3157 alu.dst.chan = 0; 3158 alu.last = 1; 3159 alu.is_lds_idx_op = true; 3160 r = r600_bytecode_add_alu(ctx->bc, &alu); 3161 if (r) 3162 return r; 3163 i += 1; 3164 continue; 3165 } 3166 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3167 alu.op = LDS_OP2_LDS_WRITE; 3168 alu.src[0].sel = temp_reg; 3169 alu.src[0].chan = i; 3170 3171 alu.src[1].sel = dst->Register.Index; 3172 alu.src[1].sel += ctx->file_offset[dst->Register.File]; 3173 alu.src[1].chan = i; 3174 3175 alu.src[2].sel = V_SQ_ALU_SRC_0; 3176 alu.dst.chan = 0; 3177 alu.last = 1; 3178 alu.is_lds_idx_op = true; 3179 r = r600_bytecode_add_alu(ctx->bc, &alu); 3180 if (r) 3181 return r; 3182 } 3183 return 0; 3184} 3185 3186static int r600_tess_factor_read(struct r600_shader_ctx *ctx, 3187 int output_idx, int nc) 3188{ 3189 int param; 3190 unsigned temp_reg 
= r600_get_temp(ctx); 3191 unsigned name = ctx->shader->output[output_idx].name; 3192 int dreg = ctx->shader->output[output_idx].gpr; 3193 int r; 3194 3195 param = r600_get_lds_unique_index(name, 0); 3196 r = get_lds_offset0(ctx, 1, temp_reg, true); 3197 if (r) 3198 return r; 3199 3200 if (param) { 3201 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 3202 temp_reg, 0, 3203 temp_reg, 0, 3204 V_SQ_ALU_SRC_LITERAL, param * 16); 3205 if (r) 3206 return r; 3207 } 3208 3209 do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1)); 3210 return 0; 3211} 3212 3213static int r600_emit_tess_factor(struct r600_shader_ctx *ctx) 3214{ 3215 int stride, outer_comps, inner_comps; 3216 int tessinner_idx = -1, tessouter_idx = -1; 3217 int i, r; 3218 unsigned j; 3219 int temp_reg = r600_get_temp(ctx); 3220 int treg[3] = {-1, -1, -1}; 3221 struct r600_bytecode_alu alu; 3222 struct r600_bytecode_cf *cf_jump, *cf_pop; 3223 3224 /* only execute factor emission for invocation 0 */ 3225 /* PRED_SETE_INT __, R0.x, 0 */ 3226 memset(&alu, 0, sizeof(alu)); 3227 alu.op = ALU_OP2_PRED_SETE_INT; 3228 alu.src[0].chan = 2; 3229 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 3230 alu.execute_mask = 1; 3231 alu.update_pred = 1; 3232 alu.last = 1; 3233 r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE); 3234 3235 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP); 3236 cf_jump = ctx->bc->cf_last; 3237 3238 treg[0] = r600_get_temp(ctx); 3239 switch (ctx->shader->tcs_prim_mode) { 3240 case PIPE_PRIM_LINES: 3241 stride = 8; /* 2 dwords, 1 vec2 store */ 3242 outer_comps = 2; 3243 inner_comps = 0; 3244 break; 3245 case PIPE_PRIM_TRIANGLES: 3246 stride = 16; /* 4 dwords, 1 vec4 store */ 3247 outer_comps = 3; 3248 inner_comps = 1; 3249 treg[1] = r600_get_temp(ctx); 3250 break; 3251 case PIPE_PRIM_QUADS: 3252 stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */ 3253 outer_comps = 4; 3254 inner_comps = 2; 3255 treg[1] = r600_get_temp(ctx); 3256 treg[2] = r600_get_temp(ctx); 3257 break; 3258 default: 3259 assert(0); 
3260 return -1; 3261 } 3262 3263 /* R0 is InvocationID, RelPatchID, PatchID, tf_base */ 3264 /* TF_WRITE takes index in R.x, value in R.y */ 3265 for (j = 0; j < ctx->shader->noutput; j++) { 3266 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER) 3267 tessinner_idx = j; 3268 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER) 3269 tessouter_idx = j; 3270 } 3271 3272 if (tessouter_idx == -1) 3273 return -1; 3274 3275 if (tessinner_idx == -1 && inner_comps) 3276 return -1; 3277 3278 if (tessouter_idx != -1) { 3279 r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps); 3280 if (r) 3281 return r; 3282 } 3283 3284 if (tessinner_idx != -1) { 3285 r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps); 3286 if (r) 3287 return r; 3288 } 3289 3290 /* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */ 3291 /* r.x = relpatchid(r0.y) * tf_stride */ 3292 3293 /* multiply incoming r0.y * stride - t.x = r0.y * stride */ 3294 /* add incoming r0.w to it: t.x = t.x + r0.w */ 3295 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, 3296 temp_reg, 0, 3297 0, 1, 3298 V_SQ_ALU_SRC_LITERAL, stride, 3299 0, 3); 3300 if (r) 3301 return r; 3302 3303 for (i = 0; i < outer_comps + inner_comps; i++) { 3304 int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx; 3305 int out_comp = i >= outer_comps ? 
i - outer_comps : i; 3306 3307 if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) { 3308 if (out_comp == 1) 3309 out_comp = 0; 3310 else if (out_comp == 0) 3311 out_comp = 1; 3312 } 3313 3314 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 3315 treg[i / 2], (2 * (i % 2)), 3316 temp_reg, 0, 3317 V_SQ_ALU_SRC_LITERAL, 4 * i); 3318 if (r) 3319 return r; 3320 r = single_alu_op2(ctx, ALU_OP1_MOV, 3321 treg[i / 2], 1 + (2 * (i%2)), 3322 ctx->shader->output[out_idx].gpr, out_comp, 3323 0, 0); 3324 if (r) 3325 return r; 3326 } 3327 for (i = 0; i < outer_comps + inner_comps; i++) { 3328 struct r600_bytecode_gds gds; 3329 3330 memset(&gds, 0, sizeof(struct r600_bytecode_gds)); 3331 gds.src_gpr = treg[i / 2]; 3332 gds.src_sel_x = 2 * (i % 2); 3333 gds.src_sel_y = 1 + (2 * (i % 2)); 3334 gds.src_sel_z = 4; 3335 gds.dst_sel_x = 7; 3336 gds.dst_sel_y = 7; 3337 gds.dst_sel_z = 7; 3338 gds.dst_sel_w = 7; 3339 gds.op = FETCH_OP_TF_WRITE; 3340 r = r600_bytecode_add_gds(ctx->bc, &gds); 3341 if (r) 3342 return r; 3343 } 3344 3345 // Patch up jump label 3346 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP); 3347 cf_pop = ctx->bc->cf_last; 3348 3349 cf_jump->cf_addr = cf_pop->id + 2; 3350 cf_jump->pop_count = 1; 3351 cf_pop->cf_addr = cf_pop->id + 2; 3352 cf_pop->pop_count = 1; 3353 3354 return 0; 3355} 3356 3357/* 3358 * We have to work out the thread ID for load and atomic 3359 * operations, which store the returned value to an index 3360 * in an intermediate buffer. 3361 * The index is calculated by taking the thread id, 3362 * calculated from the MBCNT instructions. 3363 * Then the shader engine ID is multiplied by 256, 3364 * and the wave id is added. 3365 * Then the result is multipled by 64 and thread id is 3366 * added. 
3367 */ 3368static int load_thread_id_gpr(struct r600_shader_ctx *ctx) 3369{ 3370 struct r600_bytecode_alu alu; 3371 int r; 3372 3373 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3374 alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT; 3375 alu.dst.sel = ctx->temp_reg; 3376 alu.dst.chan = 0; 3377 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 3378 alu.src[0].value = 0xffffffff; 3379 alu.dst.write = 1; 3380 r = r600_bytecode_add_alu(ctx->bc, &alu); 3381 if (r) 3382 return r; 3383 3384 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3385 alu.op = ALU_OP1_MBCNT_32HI_INT; 3386 alu.dst.sel = ctx->temp_reg; 3387 alu.dst.chan = 1; 3388 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 3389 alu.src[0].value = 0xffffffff; 3390 alu.dst.write = 1; 3391 r = r600_bytecode_add_alu(ctx->bc, &alu); 3392 if (r) 3393 return r; 3394 3395 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3396 alu.op = ALU_OP3_MULADD_UINT24; 3397 alu.dst.sel = ctx->temp_reg; 3398 alu.dst.chan = 2; 3399 alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID; 3400 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 3401 alu.src[1].value = 256; 3402 alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID; 3403 alu.dst.write = 1; 3404 alu.is_op3 = 1; 3405 alu.last = 1; 3406 r = r600_bytecode_add_alu(ctx->bc, &alu); 3407 if (r) 3408 return r; 3409 3410 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, 3411 ctx->thread_id_gpr, 1, 3412 ctx->temp_reg, 2, 3413 V_SQ_ALU_SRC_LITERAL, 0x40, 3414 ctx->temp_reg, 0); 3415 if (r) 3416 return r; 3417 return 0; 3418} 3419 3420static int r600_shader_from_tgsi(struct r600_context *rctx, 3421 struct r600_pipe_shader *pipeshader, 3422 union r600_shader_key key) 3423{ 3424 struct r600_screen *rscreen = rctx->screen; 3425 struct r600_shader *shader = &pipeshader->shader; 3426 struct tgsi_token *tokens = pipeshader->selector->tokens; 3427 struct pipe_stream_output_info so = pipeshader->selector->so; 3428 struct tgsi_full_immediate *immediate; 3429 struct r600_shader_ctx ctx; 3430 struct r600_bytecode_output 
output[ARRAY_SIZE(shader->output)]; 3431 unsigned output_done, noutput; 3432 unsigned opcode; 3433 int j, k, r = 0; 3434 unsigned i; 3435 int next_param_base = 0, next_clip_base; 3436 int max_color_exports = MAX2(key.ps.nr_cbufs, 1); 3437 bool indirect_gprs; 3438 bool ring_outputs = false; 3439 bool lds_outputs = false; 3440 bool lds_inputs = false; 3441 bool pos_emitted = false; 3442 3443 ctx.bc = &shader->bc; 3444 ctx.shader = shader; 3445 3446 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family, 3447 rscreen->has_compressed_msaa_texturing); 3448 ctx.tokens = tokens; 3449 tgsi_scan_shader(tokens, &ctx.info); 3450 shader->indirect_files = ctx.info.indirect_files; 3451 3452 int narrays = ctx.info.array_max[TGSI_FILE_TEMPORARY]; 3453 ctx.array_infos = calloc(narrays, sizeof(*ctx.array_infos)); 3454 ctx.spilled_arrays = calloc(narrays, sizeof(bool)); 3455 tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, narrays, ctx.array_infos); 3456 3457 shader->uses_helper_invocation = false; 3458 shader->uses_doubles = ctx.info.uses_doubles; 3459 shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC]; 3460 shader->nsys_inputs = 0; 3461 3462 shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 || 3463 ctx.info.file_count[TGSI_FILE_BUFFER] > 0; 3464 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER)); 3465 tgsi_parse_init(&ctx.parse, tokens); 3466 ctx.type = ctx.info.processor; 3467 shader->processor_type = ctx.type; 3468 ctx.bc->type = shader->processor_type; 3469 3470 switch (ctx.type) { 3471 case PIPE_SHADER_VERTEX: 3472 shader->vs_as_gs_a = key.vs.as_gs_a; 3473 shader->vs_as_es = key.vs.as_es; 3474 shader->vs_as_ls = key.vs.as_ls; 3475 shader->atomic_base = key.vs.first_atomic_counter; 3476 if (shader->vs_as_es) 3477 ring_outputs = true; 3478 if (shader->vs_as_ls) 3479 lds_outputs = true; 3480 break; 3481 case PIPE_SHADER_GEOMETRY: 3482 ring_outputs = true; 3483 shader->atomic_base = 
key.gs.first_atomic_counter; 3484 shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix; 3485 break; 3486 case PIPE_SHADER_TESS_CTRL: 3487 shader->tcs_prim_mode = key.tcs.prim_mode; 3488 shader->atomic_base = key.tcs.first_atomic_counter; 3489 lds_outputs = true; 3490 lds_inputs = true; 3491 break; 3492 case PIPE_SHADER_TESS_EVAL: 3493 shader->tes_as_es = key.tes.as_es; 3494 shader->atomic_base = key.tes.first_atomic_counter; 3495 lds_inputs = true; 3496 if (shader->tes_as_es) 3497 ring_outputs = true; 3498 break; 3499 case PIPE_SHADER_FRAGMENT: 3500 shader->two_side = key.ps.color_two_side; 3501 shader->atomic_base = key.ps.first_atomic_counter; 3502 shader->rat_base = key.ps.nr_cbufs; 3503 shader->image_size_const_offset = key.ps.image_size_const_offset; 3504 break; 3505 case PIPE_SHADER_COMPUTE: 3506 shader->rat_base = 0; 3507 shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER]; 3508 break; 3509 default: 3510 break; 3511 } 3512 3513 if (shader->vs_as_es || shader->tes_as_es) { 3514 ctx.gs_for_vs = &rctx->gs_shader->current->shader; 3515 } else { 3516 ctx.gs_for_vs = NULL; 3517 } 3518 3519 ctx.next_ring_offset = 0; 3520 ctx.gs_out_ring_offset = 0; 3521 ctx.gs_next_vertex = 0; 3522 ctx.gs_stream_output_info = &so; 3523 3524 ctx.thread_id_gpr = -1; 3525 ctx.face_gpr = -1; 3526 ctx.fixed_pt_position_gpr = -1; 3527 ctx.fragcoord_input = -1; 3528 ctx.colors_used = 0; 3529 ctx.clip_vertex_write = 0; 3530 3531 ctx.helper_invoc_reg = -1; 3532 ctx.cs_block_size_reg = -1; 3533 ctx.cs_grid_size_reg = -1; 3534 ctx.cs_block_size_loaded = false; 3535 ctx.cs_grid_size_loaded = false; 3536 3537 shader->nr_ps_color_exports = 0; 3538 shader->nr_ps_max_color_exports = 0; 3539 3540 3541 /* register allocations */ 3542 /* Values [0,127] correspond to GPR[0..127]. 3543 * Values [128,159] correspond to constant buffer bank 0 3544 * Values [160,191] correspond to constant buffer bank 1 3545 * Values [256,511] correspond to cfile constants c[0..255]. 
(Gone on EG) 3546 * Values [256,287] correspond to constant buffer bank 2 (EG) 3547 * Values [288,319] correspond to constant buffer bank 3 (EG) 3548 * Other special values are shown in the list below. 3549 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+) 3550 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+) 3551 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+) 3552 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+) 3553 * 248 SQ_ALU_SRC_0: special constant 0.0. 3554 * 249 SQ_ALU_SRC_1: special constant 1.0 float. 3555 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer. 3556 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer. 3557 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float. 3558 * 253 SQ_ALU_SRC_LITERAL: literal constant. 3559 * 254 SQ_ALU_SRC_PV: previous vector result. 3560 * 255 SQ_ALU_SRC_PS: previous scalar result. 3561 */ 3562 for (i = 0; i < TGSI_FILE_COUNT; i++) { 3563 ctx.file_offset[i] = 0; 3564 } 3565 3566 if (ctx.type == PIPE_SHADER_VERTEX) { 3567 3568 ctx.file_offset[TGSI_FILE_INPUT] = 1; 3569 if (ctx.info.num_inputs) 3570 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS); 3571 } 3572 if (ctx.type == PIPE_SHADER_FRAGMENT) { 3573 if (ctx.bc->chip_class >= EVERGREEN) 3574 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx); 3575 else 3576 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]); 3577 3578 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { 3579 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) { 3580 ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++; 3581 shader->uses_helper_invocation = true; 3582 } 3583 } 3584 } 3585 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3586 /* FIXME 1 would be enough in some cases (3 or less input vertices) */ 3587 ctx.file_offset[TGSI_FILE_INPUT] = 2; 3588 } 3589 if (ctx.type == PIPE_SHADER_TESS_CTRL) 3590 
ctx.file_offset[TGSI_FILE_INPUT] = 1; 3591 if (ctx.type == PIPE_SHADER_TESS_EVAL) { 3592 bool add_tesscoord = false, add_tess_inout = false; 3593 ctx.file_offset[TGSI_FILE_INPUT] = 1; 3594 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { 3595 /* if we have tesscoord save one reg */ 3596 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD) 3597 add_tesscoord = true; 3598 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER || 3599 ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER) 3600 add_tess_inout = true; 3601 } 3602 if (add_tesscoord || add_tess_inout) 3603 ctx.file_offset[TGSI_FILE_INPUT]++; 3604 if (add_tess_inout) 3605 ctx.file_offset[TGSI_FILE_INPUT]+=2; 3606 } 3607 if (ctx.type == PIPE_SHADER_COMPUTE) { 3608 ctx.file_offset[TGSI_FILE_INPUT] = 2; 3609 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { 3610 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE) 3611 ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++; 3612 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE) 3613 ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++; 3614 } 3615 } 3616 3617 ctx.file_offset[TGSI_FILE_OUTPUT] = 3618 ctx.file_offset[TGSI_FILE_INPUT] + 3619 ctx.info.file_max[TGSI_FILE_INPUT] + 1; 3620 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] + 3621 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1; 3622 3623 /* Outside the GPR range. This will be translated to one of the 3624 * kcache banks later. 
*/ 3625 ctx.file_offset[TGSI_FILE_CONSTANT] = 512; 3626 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL; 3627 3628 pipeshader->scratch_space_needed = 0; 3629 int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] + 3630 ctx.info.file_max[TGSI_FILE_TEMPORARY]; 3631 if (regno > 124) { 3632 choose_spill_arrays(&ctx, ®no, &pipeshader->scratch_space_needed); 3633 shader->indirect_files = ctx.info.indirect_files; 3634 } 3635 shader->needs_scratch_space = pipeshader->scratch_space_needed != 0; 3636 3637 ctx.bc->ar_reg = ++regno; 3638 ctx.bc->index_reg[0] = ++regno; 3639 ctx.bc->index_reg[1] = ++regno; 3640 3641 if (ctx.type == PIPE_SHADER_TESS_CTRL) { 3642 ctx.tess_input_info = ++regno; 3643 ctx.tess_output_info = ++regno; 3644 } else if (ctx.type == PIPE_SHADER_TESS_EVAL) { 3645 ctx.tess_input_info = ++regno; 3646 ctx.tess_output_info = ++regno; 3647 } else if (ctx.type == PIPE_SHADER_GEOMETRY) { 3648 ctx.gs_export_gpr_tregs[0] = ++regno; 3649 ctx.gs_export_gpr_tregs[1] = ++regno; 3650 ctx.gs_export_gpr_tregs[2] = ++regno; 3651 ctx.gs_export_gpr_tregs[3] = ++regno; 3652 if (ctx.shader->gs_tri_strip_adj_fix) { 3653 ctx.gs_rotated_input[0] = ++regno; 3654 ctx.gs_rotated_input[1] = ++regno; 3655 } else { 3656 ctx.gs_rotated_input[0] = 0; 3657 ctx.gs_rotated_input[1] = 1; 3658 } 3659 } 3660 3661 if (shader->uses_images) { 3662 ctx.thread_id_gpr = ++regno; 3663 } 3664 ctx.temp_reg = ++regno; 3665 3666 shader->max_arrays = 0; 3667 shader->num_arrays = 0; 3668 if (indirect_gprs) { 3669 3670 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) { 3671 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT], 3672 ctx.file_offset[TGSI_FILE_OUTPUT] - 3673 ctx.file_offset[TGSI_FILE_INPUT], 3674 0x0F); 3675 } 3676 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) { 3677 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT], 3678 ctx.file_offset[TGSI_FILE_TEMPORARY] - 3679 ctx.file_offset[TGSI_FILE_OUTPUT], 3680 0x0F); 3681 } 3682 } 3683 3684 ctx.nliterals = 0; 
3685 ctx.literals = NULL; 3686 ctx.max_driver_temp_used = 0; 3687 3688 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && 3689 ctx.info.colors_written == 1; 3690 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; 3691 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]; 3692 3693 if (ctx.type == PIPE_SHADER_VERTEX || 3694 ctx.type == PIPE_SHADER_GEOMETRY || 3695 ctx.type == PIPE_SHADER_TESS_EVAL) { 3696 shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] + 3697 ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1; 3698 shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1; 3699 shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]; 3700 } 3701 3702 if (shader->vs_as_gs_a) 3703 vs_add_primid_output(&ctx, key.vs.prim_id_out); 3704 3705 if (ctx.thread_id_gpr != -1) { 3706 r = load_thread_id_gpr(&ctx); 3707 if (r) 3708 return r; 3709 } 3710 3711 if (ctx.type == PIPE_SHADER_TESS_EVAL) 3712 r600_fetch_tess_io_info(&ctx); 3713 3714 while (!tgsi_parse_end_of_tokens(&ctx.parse)) { 3715 tgsi_parse_token(&ctx.parse); 3716 switch (ctx.parse.FullToken.Token.Type) { 3717 case TGSI_TOKEN_TYPE_IMMEDIATE: 3718 immediate = &ctx.parse.FullToken.FullImmediate; 3719 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16); 3720 if(ctx.literals == NULL) { 3721 r = -ENOMEM; 3722 goto out_err; 3723 } 3724 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint; 3725 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint; 3726 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint; 3727 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint; 3728 ctx.nliterals++; 3729 break; 3730 case TGSI_TOKEN_TYPE_DECLARATION: 3731 r = tgsi_declaration(&ctx); 3732 if (r) 3733 goto 
out_err; 3734 break; 3735 case TGSI_TOKEN_TYPE_INSTRUCTION: 3736 case TGSI_TOKEN_TYPE_PROPERTY: 3737 break; 3738 default: 3739 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type); 3740 r = -EINVAL; 3741 goto out_err; 3742 } 3743 } 3744 3745 shader->ring_item_sizes[0] = ctx.next_ring_offset; 3746 shader->ring_item_sizes[1] = 0; 3747 shader->ring_item_sizes[2] = 0; 3748 shader->ring_item_sizes[3] = 0; 3749 3750 /* Process two side if needed */ 3751 if (shader->two_side && ctx.colors_used) { 3752 int i, count = ctx.shader->ninput; 3753 unsigned next_lds_loc = ctx.shader->nlds; 3754 3755 /* additional inputs will be allocated right after the existing inputs, 3756 * we won't need them after the color selection, so we don't need to 3757 * reserve these gprs for the rest of the shader code and to adjust 3758 * output offsets etc. */ 3759 int gpr = ctx.file_offset[TGSI_FILE_INPUT] + 3760 ctx.info.file_max[TGSI_FILE_INPUT] + 1; 3761 3762 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */ 3763 if (ctx.face_gpr == -1) { 3764 i = ctx.shader->ninput++; 3765 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE; 3766 ctx.shader->input[i].spi_sid = 0; 3767 ctx.shader->input[i].gpr = gpr++; 3768 ctx.face_gpr = ctx.shader->input[i].gpr; 3769 } 3770 3771 for (i = 0; i < count; i++) { 3772 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) { 3773 int ni = ctx.shader->ninput++; 3774 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io)); 3775 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR; 3776 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]); 3777 ctx.shader->input[ni].gpr = gpr++; 3778 // TGSI to LLVM needs to know the lds position of inputs. 
3779 // Non LLVM path computes it later (in process_twoside_color) 3780 ctx.shader->input[ni].lds_pos = next_lds_loc++; 3781 ctx.shader->input[i].back_color_input = ni; 3782 if (ctx.bc->chip_class >= EVERGREEN) { 3783 if ((r = evergreen_interp_input(&ctx, ni))) 3784 return r; 3785 } 3786 } 3787 } 3788 } 3789 3790 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN) 3791 shader->nr_ps_max_color_exports = 8; 3792 3793 if (ctx.shader->uses_helper_invocation) { 3794 if (ctx.bc->chip_class == CAYMAN) 3795 r = cm_load_helper_invocation(&ctx); 3796 else 3797 r = eg_load_helper_invocation(&ctx); 3798 if (r) 3799 return r; 3800 } 3801 3802 /* 3803 * XXX this relies on fixed_pt_position_gpr only being present when 3804 * this shader should be executed per sample. Should be the case for now... 3805 */ 3806 if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) { 3807 /* 3808 * Fix up sample mask. The hw always gives us coverage mask for 3809 * the pixel. However, for per-sample shading, we need the 3810 * coverage for the shader invocation only. 3811 * Also, with disabled msaa, only the first bit should be set 3812 * (luckily the same fixup works for both problems). 3813 * For now, we can only do it if we know this shader is always 3814 * executed per sample (due to usage of bits in the shader 3815 * forcing per-sample execution). 3816 * If the fb is not multisampled, we'd do unnecessary work but 3817 * it should still be correct. 3818 * It will however do nothing for sample shading according 3819 * to MinSampleShading. 
3820 */ 3821 struct r600_bytecode_alu alu; 3822 int tmp = r600_get_temp(&ctx); 3823 assert(ctx.face_gpr != -1); 3824 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3825 3826 alu.op = ALU_OP2_LSHL_INT; 3827 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 3828 alu.src[0].value = 0x1; 3829 alu.src[1].sel = ctx.fixed_pt_position_gpr; 3830 alu.src[1].chan = 3; 3831 alu.dst.sel = tmp; 3832 alu.dst.chan = 0; 3833 alu.dst.write = 1; 3834 alu.last = 1; 3835 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3836 return r; 3837 3838 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3839 alu.op = ALU_OP2_AND_INT; 3840 alu.src[0].sel = tmp; 3841 alu.src[1].sel = ctx.face_gpr; 3842 alu.src[1].chan = 2; 3843 alu.dst.sel = ctx.face_gpr; 3844 alu.dst.chan = 2; 3845 alu.dst.write = 1; 3846 alu.last = 1; 3847 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3848 return r; 3849 } 3850 3851 if (ctx.fragcoord_input >= 0) { 3852 if (ctx.bc->chip_class == CAYMAN) { 3853 for (j = 0 ; j < 4; j++) { 3854 struct r600_bytecode_alu alu; 3855 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3856 alu.op = ALU_OP1_RECIP_IEEE; 3857 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; 3858 alu.src[0].chan = 3; 3859 3860 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; 3861 alu.dst.chan = j; 3862 alu.dst.write = (j == 3); 3863 alu.last = (j == 3); 3864 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3865 return r; 3866 } 3867 } else { 3868 struct r600_bytecode_alu alu; 3869 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3870 alu.op = ALU_OP1_RECIP_IEEE; 3871 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; 3872 alu.src[0].chan = 3; 3873 3874 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; 3875 alu.dst.chan = 3; 3876 alu.dst.write = 1; 3877 alu.last = 1; 3878 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3879 return r; 3880 } 3881 } 3882 3883 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3884 struct r600_bytecode_alu alu; 3885 int r; 3886 3887 /* GS thread with no output workaround - 
emit a cut at start of GS */ 3888 if (ctx.bc->chip_class == R600) 3889 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX); 3890 3891 for (j = 0; j < 4; j++) { 3892 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3893 alu.op = ALU_OP1_MOV; 3894 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 3895 alu.src[0].value = 0; 3896 alu.dst.sel = ctx.gs_export_gpr_tregs[j]; 3897 alu.dst.write = 1; 3898 alu.last = 1; 3899 r = r600_bytecode_add_alu(ctx.bc, &alu); 3900 if (r) 3901 return r; 3902 } 3903 3904 if (ctx.shader->gs_tri_strip_adj_fix) { 3905 r = single_alu_op2(&ctx, ALU_OP2_AND_INT, 3906 ctx.gs_rotated_input[0], 2, 3907 0, 2, 3908 V_SQ_ALU_SRC_LITERAL, 1); 3909 if (r) 3910 return r; 3911 3912 for (i = 0; i < 6; i++) { 3913 int rotated = (i + 4) % 6; 3914 int offset_reg = i / 3; 3915 int offset_chan = i % 3; 3916 int rotated_offset_reg = rotated / 3; 3917 int rotated_offset_chan = rotated % 3; 3918 3919 if (offset_reg == 0 && offset_chan == 2) 3920 offset_chan = 3; 3921 if (rotated_offset_reg == 0 && rotated_offset_chan == 2) 3922 rotated_offset_chan = 3; 3923 3924 r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT, 3925 ctx.gs_rotated_input[offset_reg], offset_chan, 3926 ctx.gs_rotated_input[0], 2, 3927 offset_reg, offset_chan, 3928 rotated_offset_reg, rotated_offset_chan); 3929 if (r) 3930 return r; 3931 } 3932 } 3933 } 3934 3935 if (ctx.type == PIPE_SHADER_TESS_CTRL) 3936 r600_fetch_tess_io_info(&ctx); 3937 3938 if (shader->two_side && ctx.colors_used) { 3939 if ((r = process_twoside_color_inputs(&ctx))) 3940 return r; 3941 } 3942 3943 tgsi_parse_init(&ctx.parse, tokens); 3944 while (!tgsi_parse_end_of_tokens(&ctx.parse)) { 3945 tgsi_parse_token(&ctx.parse); 3946 switch (ctx.parse.FullToken.Token.Type) { 3947 case TGSI_TOKEN_TYPE_INSTRUCTION: 3948 r = tgsi_is_supported(&ctx); 3949 if (r) 3950 goto out_err; 3951 ctx.max_driver_temp_used = 0; 3952 /* reserve first tmp for everyone */ 3953 r600_get_temp(&ctx); 3954 3955 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode; 
3956 if ((r = tgsi_split_constant(&ctx))) 3957 goto out_err; 3958 if ((r = tgsi_split_literal_constant(&ctx))) 3959 goto out_err; 3960 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3961 if ((r = tgsi_split_gs_inputs(&ctx))) 3962 goto out_err; 3963 } else if (lds_inputs) { 3964 if ((r = tgsi_split_lds_inputs(&ctx))) 3965 goto out_err; 3966 } 3967 if (ctx.bc->chip_class == CAYMAN) 3968 ctx.inst_info = &cm_shader_tgsi_instruction[opcode]; 3969 else if (ctx.bc->chip_class >= EVERGREEN) 3970 ctx.inst_info = &eg_shader_tgsi_instruction[opcode]; 3971 else 3972 ctx.inst_info = &r600_shader_tgsi_instruction[opcode]; 3973 3974 ctx.bc->precise |= ctx.parse.FullToken.FullInstruction.Instruction.Precise; 3975 3976 r = ctx.inst_info->process(&ctx); 3977 if (r) 3978 goto out_err; 3979 3980 if (ctx.type == PIPE_SHADER_TESS_CTRL) { 3981 r = r600_store_tcs_output(&ctx); 3982 if (r) 3983 goto out_err; 3984 } 3985 break; 3986 default: 3987 break; 3988 } 3989 } 3990 3991 /* Reset the temporary register counter. */ 3992 ctx.max_driver_temp_used = 0; 3993 3994 noutput = shader->noutput; 3995 3996 if (!ring_outputs && ctx.clip_vertex_write) { 3997 unsigned clipdist_temp[2]; 3998 3999 clipdist_temp[0] = r600_get_temp(&ctx); 4000 clipdist_temp[1] = r600_get_temp(&ctx); 4001 4002 /* need to convert a clipvertex write into clipdistance writes and not export 4003 the clip vertex anymore */ 4004 4005 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io)); 4006 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST; 4007 shader->output[noutput].gpr = clipdist_temp[0]; 4008 noutput++; 4009 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST; 4010 shader->output[noutput].gpr = clipdist_temp[1]; 4011 noutput++; 4012 4013 /* reset spi_sid for clipvertex output to avoid confusing spi */ 4014 shader->output[ctx.cv_output].spi_sid = 0; 4015 4016 shader->clip_dist_write = 0xFF; 4017 shader->cc_dist_mask = 0xFF; 4018 4019 for (i = 0; i < 8; i++) { 4020 int oreg = i >> 2; 4021 int ochan = i & 3; 
4022 4023 for (j = 0; j < 4; j++) { 4024 struct r600_bytecode_alu alu; 4025 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4026 alu.op = ALU_OP2_DOT4; 4027 alu.src[0].sel = shader->output[ctx.cv_output].gpr; 4028 alu.src[0].chan = j; 4029 4030 alu.src[1].sel = 512 + i; 4031 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 4032 alu.src[1].chan = j; 4033 4034 alu.dst.sel = clipdist_temp[oreg]; 4035 alu.dst.chan = j; 4036 alu.dst.write = (j == ochan); 4037 if (j == 3) 4038 alu.last = 1; 4039 r = r600_bytecode_add_alu(ctx.bc, &alu); 4040 if (r) 4041 return r; 4042 } 4043 } 4044 } 4045 4046 /* Add stream outputs. */ 4047 if (so.num_outputs) { 4048 bool emit = false; 4049 if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX) 4050 emit = true; 4051 if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL) 4052 emit = true; 4053 if (emit) 4054 emit_streamout(&ctx, &so, -1, NULL); 4055 } 4056 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask; 4057 convert_edgeflag_to_int(&ctx); 4058 4059 if (ctx.type == PIPE_SHADER_TESS_CTRL) 4060 r600_emit_tess_factor(&ctx); 4061 4062 if (lds_outputs) { 4063 if (ctx.type == PIPE_SHADER_VERTEX) { 4064 if (ctx.shader->noutput) 4065 emit_lds_vs_writes(&ctx); 4066 } 4067 } else if (ring_outputs) { 4068 if (shader->vs_as_es || shader->tes_as_es) { 4069 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx); 4070 ctx.gs_export_gpr_tregs[1] = -1; 4071 ctx.gs_export_gpr_tregs[2] = -1; 4072 ctx.gs_export_gpr_tregs[3] = -1; 4073 4074 emit_gs_ring_writes(&ctx, &so, -1, FALSE); 4075 } 4076 } else { 4077 /* Export output */ 4078 next_clip_base = shader->vs_out_misc_write ? 
62 : 61; 4079 4080 for (i = 0, j = 0; i < noutput; i++, j++) { 4081 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4082 output[j].gpr = shader->output[i].gpr; 4083 output[j].elem_size = 3; 4084 output[j].swizzle_x = 0; 4085 output[j].swizzle_y = 1; 4086 output[j].swizzle_z = 2; 4087 output[j].swizzle_w = 3; 4088 output[j].burst_count = 1; 4089 output[j].type = 0xffffffff; 4090 output[j].op = CF_OP_EXPORT; 4091 switch (ctx.type) { 4092 case PIPE_SHADER_VERTEX: 4093 case PIPE_SHADER_TESS_EVAL: 4094 switch (shader->output[i].name) { 4095 case TGSI_SEMANTIC_POSITION: 4096 output[j].array_base = 60; 4097 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4098 pos_emitted = true; 4099 break; 4100 4101 case TGSI_SEMANTIC_PSIZE: 4102 output[j].array_base = 61; 4103 output[j].swizzle_y = 7; 4104 output[j].swizzle_z = 7; 4105 output[j].swizzle_w = 7; 4106 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4107 pos_emitted = true; 4108 break; 4109 case TGSI_SEMANTIC_EDGEFLAG: 4110 output[j].array_base = 61; 4111 output[j].swizzle_x = 7; 4112 output[j].swizzle_y = 0; 4113 output[j].swizzle_z = 7; 4114 output[j].swizzle_w = 7; 4115 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4116 pos_emitted = true; 4117 break; 4118 case TGSI_SEMANTIC_LAYER: 4119 /* spi_sid is 0 for outputs that are 4120 * not consumed by PS */ 4121 if (shader->output[i].spi_sid) { 4122 output[j].array_base = next_param_base++; 4123 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4124 j++; 4125 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 4126 } 4127 output[j].array_base = 61; 4128 output[j].swizzle_x = 7; 4129 output[j].swizzle_y = 7; 4130 output[j].swizzle_z = 0; 4131 output[j].swizzle_w = 7; 4132 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4133 pos_emitted = true; 4134 break; 4135 case TGSI_SEMANTIC_VIEWPORT_INDEX: 4136 /* spi_sid is 0 for outputs that are 4137 * not consumed by PS */ 4138 if 
(shader->output[i].spi_sid) { 4139 output[j].array_base = next_param_base++; 4140 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4141 j++; 4142 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 4143 } 4144 output[j].array_base = 61; 4145 output[j].swizzle_x = 7; 4146 output[j].swizzle_y = 7; 4147 output[j].swizzle_z = 7; 4148 output[j].swizzle_w = 0; 4149 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4150 pos_emitted = true; 4151 break; 4152 case TGSI_SEMANTIC_CLIPVERTEX: 4153 j--; 4154 break; 4155 case TGSI_SEMANTIC_CLIPDIST: 4156 output[j].array_base = next_clip_base++; 4157 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4158 pos_emitted = true; 4159 /* spi_sid is 0 for clipdistance outputs that were generated 4160 * for clipvertex - we don't need to pass them to PS */ 4161 if (shader->output[i].spi_sid) { 4162 j++; 4163 /* duplicate it as PARAM to pass to the pixel shader */ 4164 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 4165 output[j].array_base = next_param_base++; 4166 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4167 } 4168 break; 4169 case TGSI_SEMANTIC_FOG: 4170 output[j].swizzle_y = 4; /* 0 */ 4171 output[j].swizzle_z = 4; /* 0 */ 4172 output[j].swizzle_w = 5; /* 1 */ 4173 break; 4174 case TGSI_SEMANTIC_PRIMID: 4175 output[j].swizzle_x = 2; 4176 output[j].swizzle_y = 4; /* 0 */ 4177 output[j].swizzle_z = 4; /* 0 */ 4178 output[j].swizzle_w = 4; /* 0 */ 4179 break; 4180 } 4181 4182 break; 4183 case PIPE_SHADER_FRAGMENT: 4184 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) { 4185 /* never export more colors than the number of CBs */ 4186 if (shader->output[i].sid >= max_color_exports) { 4187 /* skip export */ 4188 j--; 4189 continue; 4190 } 4191 output[j].swizzle_w = key.ps.alpha_to_one ? 
5 : 3; 4192 output[j].array_base = shader->output[i].sid; 4193 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4194 shader->nr_ps_color_exports++; 4195 shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4)); 4196 4197 /* If the i-th target format is set, all previous target formats must 4198 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well. 4199 */ 4200 if (shader->output[i].sid > 0) 4201 for (unsigned x = 0; x < shader->output[i].sid; x++) 4202 shader->ps_color_export_mask |= (1 << (x*4)); 4203 4204 if (shader->output[i].sid > shader->ps_export_highest) 4205 shader->ps_export_highest = shader->output[i].sid; 4206 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) { 4207 for (k = 1; k < max_color_exports; k++) { 4208 j++; 4209 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4210 output[j].gpr = shader->output[i].gpr; 4211 output[j].elem_size = 3; 4212 output[j].swizzle_x = 0; 4213 output[j].swizzle_y = 1; 4214 output[j].swizzle_z = 2; 4215 output[j].swizzle_w = key.ps.alpha_to_one ? 
5 : 3; 4216 output[j].burst_count = 1; 4217 output[j].array_base = k; 4218 output[j].op = CF_OP_EXPORT; 4219 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4220 shader->nr_ps_color_exports++; 4221 if (k > shader->ps_export_highest) 4222 shader->ps_export_highest = k; 4223 shader->ps_color_export_mask |= (0xf << (j * 4)); 4224 } 4225 } 4226 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) { 4227 output[j].array_base = 61; 4228 output[j].swizzle_x = 2; 4229 output[j].swizzle_y = 7; 4230 output[j].swizzle_z = output[j].swizzle_w = 7; 4231 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4232 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) { 4233 output[j].array_base = 61; 4234 output[j].swizzle_x = 7; 4235 output[j].swizzle_y = 1; 4236 output[j].swizzle_z = output[j].swizzle_w = 7; 4237 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4238 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) { 4239 output[j].array_base = 61; 4240 output[j].swizzle_x = 7; 4241 output[j].swizzle_y = 7; 4242 output[j].swizzle_z = 0; 4243 output[j].swizzle_w = 7; 4244 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4245 } else { 4246 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name); 4247 r = -EINVAL; 4248 goto out_err; 4249 } 4250 break; 4251 case PIPE_SHADER_TESS_CTRL: 4252 break; 4253 default: 4254 R600_ERR("unsupported processor type %d\n", ctx.type); 4255 r = -EINVAL; 4256 goto out_err; 4257 } 4258 4259 if (output[j].type == 0xffffffff) { 4260 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4261 output[j].array_base = next_param_base++; 4262 } 4263 } 4264 4265 /* add fake position export */ 4266 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) { 4267 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4268 output[j].gpr = 0; 4269 output[j].elem_size = 3; 4270 output[j].swizzle_x = 7; 4271 
output[j].swizzle_y = 7; 4272 output[j].swizzle_z = 7; 4273 output[j].swizzle_w = 7; 4274 output[j].burst_count = 1; 4275 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4276 output[j].array_base = 60; 4277 output[j].op = CF_OP_EXPORT; 4278 j++; 4279 } 4280 4281 /* add fake param output for vertex shader if no param is exported */ 4282 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) { 4283 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4284 output[j].gpr = 0; 4285 output[j].elem_size = 3; 4286 output[j].swizzle_x = 7; 4287 output[j].swizzle_y = 7; 4288 output[j].swizzle_z = 7; 4289 output[j].swizzle_w = 7; 4290 output[j].burst_count = 1; 4291 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4292 output[j].array_base = 0; 4293 output[j].op = CF_OP_EXPORT; 4294 j++; 4295 } 4296 4297 /* add fake pixel export */ 4298 if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) { 4299 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4300 output[j].gpr = 0; 4301 output[j].elem_size = 3; 4302 output[j].swizzle_x = 7; 4303 output[j].swizzle_y = 7; 4304 output[j].swizzle_z = 7; 4305 output[j].swizzle_w = 7; 4306 output[j].burst_count = 1; 4307 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4308 output[j].array_base = 0; 4309 output[j].op = CF_OP_EXPORT; 4310 j++; 4311 shader->nr_ps_color_exports++; 4312 shader->ps_color_export_mask = 0xf; 4313 } 4314 4315 noutput = j; 4316 4317 /* set export done on last export of each type */ 4318 for (k = noutput - 1, output_done = 0; k >= 0; k--) { 4319 if (!(output_done & (1 << output[k].type))) { 4320 output_done |= (1 << output[k].type); 4321 output[k].op = CF_OP_EXPORT_DONE; 4322 } 4323 } 4324 /* add output to bytecode */ 4325 for (i = 0; i < noutput; i++) { 4326 r = r600_bytecode_add_output(ctx.bc, &output[i]); 4327 if (r) 4328 goto out_err; 4329 } 4330 } 4331 4332 /* add program end */ 4333 if 
(ctx.bc->chip_class == CAYMAN) 4334 cm_bytecode_add_cf_end(ctx.bc); 4335 else { 4336 const struct cf_op_info *last = NULL; 4337 4338 if (ctx.bc->cf_last) 4339 last = r600_isa_cf(ctx.bc->cf_last->op); 4340 4341 /* alu clause instructions don't have EOP bit, so add NOP */ 4342 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP) 4343 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); 4344 4345 ctx.bc->cf_last->end_of_program = 1; 4346 } 4347 4348 /* check GPR limit - we have 124 = 128 - 4 4349 * (4 are reserved as alu clause temporary registers) */ 4350 if (ctx.bc->ngpr > 124) { 4351 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr); 4352 r = -ENOMEM; 4353 goto out_err; 4354 } 4355 4356 if (ctx.type == PIPE_SHADER_GEOMETRY) { 4357 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so))) 4358 return r; 4359 } 4360 4361 free(ctx.spilled_arrays); 4362 free(ctx.array_infos); 4363 free(ctx.literals); 4364 tgsi_parse_free(&ctx.parse); 4365 return 0; 4366out_err: 4367 free(ctx.spilled_arrays); 4368 free(ctx.array_infos); 4369 free(ctx.literals); 4370 tgsi_parse_free(&ctx.parse); 4371 return r; 4372} 4373 4374static int tgsi_unsupported(struct r600_shader_ctx *ctx) 4375{ 4376 const unsigned tgsi_opcode = 4377 ctx->parse.FullToken.FullInstruction.Instruction.Opcode; 4378 R600_ERR("%s tgsi opcode unsupported\n", 4379 tgsi_get_opcode_name(tgsi_opcode)); 4380 return -EINVAL; 4381} 4382 4383static int tgsi_end(struct r600_shader_ctx *ctx UNUSED) 4384{ 4385 return 0; 4386} 4387 4388static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src, 4389 const struct r600_shader_src *shader_src, 4390 unsigned chan) 4391{ 4392 bc_src->sel = shader_src->sel; 4393 bc_src->chan = shader_src->swizzle[chan]; 4394 bc_src->neg = shader_src->neg; 4395 bc_src->abs = shader_src->abs; 4396 bc_src->rel = shader_src->rel; 4397 bc_src->value = shader_src->value[bc_src->chan]; 4398 bc_src->kc_bank = 
shader_src->kc_bank; 4399 bc_src->kc_rel = shader_src->kc_rel; 4400} 4401 4402static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src) 4403{ 4404 bc_src->abs = 1; 4405 bc_src->neg = 0; 4406} 4407 4408static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src) 4409{ 4410 bc_src->neg = !bc_src->neg; 4411} 4412 4413static void tgsi_dst(struct r600_shader_ctx *ctx, 4414 const struct tgsi_full_dst_register *tgsi_dst, 4415 unsigned swizzle, 4416 struct r600_bytecode_alu_dst *r600_dst) 4417{ 4418 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4419 4420 if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) { 4421 bool spilled; 4422 unsigned idx; 4423 4424 idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_dst->Register.Index, &spilled); 4425 4426 if (spilled) { 4427 struct r600_bytecode_output cf; 4428 int reg = 0; 4429 int r; 4430 bool add_pending_output = true; 4431 4432 memset(&cf, 0, sizeof(struct r600_bytecode_output)); 4433 get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index, 4434 &cf.array_base, &cf.array_size); 4435 4436 /* If no component has spilled, reserve a register and add the spill code 4437 * ctx->bc->n_pending_outputs is cleared after each instruction group */ 4438 if (ctx->bc->n_pending_outputs == 0) { 4439 reg = r600_get_temp(ctx); 4440 } else { 4441 /* If we are already spilling and the output address is the same like 4442 * before then just reuse the same slot */ 4443 struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1]; 4444 if ((cf.array_base + idx == tmpl->array_base) || 4445 (cf.array_base == tmpl->array_base && 4446 tmpl->index_gpr == ctx->bc->ar_reg && 4447 tgsi_dst->Register.Indirect)) { 4448 reg = ctx->bc->pending_outputs[0].gpr; 4449 add_pending_output = false; 4450 } else { 4451 reg = r600_get_temp(ctx); 4452 } 4453 } 4454 4455 r600_dst->sel = reg; 4456 r600_dst->chan = swizzle; 4457 r600_dst->write = 1; 4458 if 
(inst->Instruction.Saturate) { 4459 r600_dst->clamp = 1; 4460 } 4461 4462 /* Add new outputs as pending */ 4463 if (add_pending_output) { 4464 cf.op = CF_OP_MEM_SCRATCH; 4465 cf.elem_size = 3; 4466 cf.gpr = reg; 4467 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; 4468 cf.mark = 1; 4469 cf.comp_mask = inst->Dst[0].Register.WriteMask; 4470 cf.swizzle_x = 0; 4471 cf.swizzle_y = 1; 4472 cf.swizzle_z = 2; 4473 cf.swizzle_w = 3; 4474 cf.burst_count = 1; 4475 4476 if (tgsi_dst->Register.Indirect) { 4477 if (ctx->bc->chip_class < R700) 4478 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; 4479 else 4480 cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK; 4481 cf.index_gpr = ctx->bc->ar_reg; 4482 } 4483 else { 4484 cf.array_base += idx; 4485 cf.array_size = 0; 4486 } 4487 4488 r = r600_bytecode_add_pending_output(ctx->bc, &cf); 4489 if (r) 4490 return; 4491 4492 if (ctx->bc->chip_class >= R700) 4493 r600_bytecode_need_wait_ack(ctx->bc, true); 4494 } 4495 return; 4496 } 4497 else { 4498 r600_dst->sel = idx; 4499 } 4500 } 4501 else { 4502 r600_dst->sel = tgsi_dst->Register.Index; 4503 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File]; 4504 } 4505 r600_dst->chan = swizzle; 4506 r600_dst->write = 1; 4507 if (inst->Instruction.Saturate) { 4508 r600_dst->clamp = 1; 4509 } 4510 if (ctx->type == PIPE_SHADER_TESS_CTRL) { 4511 if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) { 4512 return; 4513 } 4514 } 4515 if (tgsi_dst->Register.Indirect) 4516 r600_dst->rel = V_SQ_REL_RELATIVE; 4517 4518} 4519 4520static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override) 4521{ 4522 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4523 unsigned write_mask = inst->Dst[0].Register.WriteMask; 4524 struct r600_bytecode_alu alu; 4525 int i, j, r, lasti = tgsi_last_instruction(write_mask); 4526 int use_tmp = 0; 4527 int swizzle_x = inst->Src[0].Register.SwizzleX; 4528 4529 if 
(singledest) { 4530 switch (write_mask) { 4531 case 0x1: 4532 if (swizzle_x == 2) { 4533 write_mask = 0xc; 4534 use_tmp = 3; 4535 } else 4536 write_mask = 0x3; 4537 break; 4538 case 0x2: 4539 if (swizzle_x == 2) { 4540 write_mask = 0xc; 4541 use_tmp = 3; 4542 } else { 4543 write_mask = 0x3; 4544 use_tmp = 1; 4545 } 4546 break; 4547 case 0x4: 4548 if (swizzle_x == 0) { 4549 write_mask = 0x3; 4550 use_tmp = 1; 4551 } else 4552 write_mask = 0xc; 4553 break; 4554 case 0x8: 4555 if (swizzle_x == 0) { 4556 write_mask = 0x3; 4557 use_tmp = 1; 4558 } else { 4559 write_mask = 0xc; 4560 use_tmp = 3; 4561 } 4562 break; 4563 } 4564 } 4565 4566 lasti = tgsi_last_instruction(write_mask); 4567 for (i = 0; i <= lasti; i++) { 4568 4569 if (!(write_mask & (1 << i))) 4570 continue; 4571 4572 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4573 4574 if (singledest) { 4575 if (use_tmp || dest_temp) { 4576 alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp; 4577 alu.dst.chan = i; 4578 alu.dst.write = 1; 4579 } else { 4580 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4581 } 4582 if (i == 1 || i == 3) 4583 alu.dst.write = 0; 4584 } else 4585 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4586 4587 alu.op = op_override ? 
op_override : ctx->inst_info->op; 4588 if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) { 4589 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4590 } else if (!swap) { 4591 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 4592 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i)); 4593 } 4594 } else { 4595 r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i)); 4596 r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i)); 4597 } 4598 4599 /* handle some special cases */ 4600 if (i == 1 || i == 3) { 4601 switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) { 4602 case TGSI_OPCODE_DABS: 4603 r600_bytecode_src_set_abs(&alu.src[0]); 4604 break; 4605 default: 4606 break; 4607 } 4608 } 4609 if (i == lasti) { 4610 alu.last = 1; 4611 } 4612 r = r600_bytecode_add_alu(ctx->bc, &alu); 4613 if (r) 4614 return r; 4615 } 4616 4617 if (use_tmp) { 4618 write_mask = inst->Dst[0].Register.WriteMask; 4619 4620 lasti = tgsi_last_instruction(write_mask); 4621 /* move result from temp to dst */ 4622 for (i = 0; i <= lasti; i++) { 4623 if (!(write_mask & (1 << i))) 4624 continue; 4625 4626 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4627 alu.op = ALU_OP1_MOV; 4628 4629 if (dest_temp) { 4630 alu.dst.sel = dest_temp; 4631 alu.dst.chan = i; 4632 alu.dst.write = 1; 4633 } else 4634 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4635 alu.src[0].sel = ctx->temp_reg; 4636 alu.src[0].chan = use_tmp - 1; 4637 alu.last = (i == lasti); 4638 4639 r = r600_bytecode_add_alu(ctx->bc, &alu); 4640 if (r) 4641 return r; 4642 } 4643 } 4644 return 0; 4645} 4646 4647static int tgsi_op2_64(struct r600_shader_ctx *ctx) 4648{ 4649 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4650 unsigned write_mask = inst->Dst[0].Register.WriteMask; 4651 /* confirm writemasking */ 4652 if ((write_mask & 0x3) != 0x3 && 4653 (write_mask & 0xc) != 0xc) { 4654 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask); 
4655 return -1; 4656 } 4657 return tgsi_op2_64_params(ctx, false, false, 0, 0); 4658} 4659 4660static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx) 4661{ 4662 return tgsi_op2_64_params(ctx, true, false, 0, 0); 4663} 4664 4665static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx) 4666{ 4667 return tgsi_op2_64_params(ctx, true, true, 0, 0); 4668} 4669 4670static int tgsi_op3_64(struct r600_shader_ctx *ctx) 4671{ 4672 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4673 struct r600_bytecode_alu alu; 4674 int i, j, r; 4675 int lasti = 3; 4676 int tmp = r600_get_temp(ctx); 4677 4678 for (i = 0; i < lasti + 1; i++) { 4679 4680 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4681 alu.op = ctx->inst_info->op; 4682 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 4683 r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1); 4684 } 4685 4686 if (inst->Dst[0].Register.WriteMask & (1 << i)) 4687 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4688 else 4689 alu.dst.sel = tmp; 4690 4691 alu.dst.chan = i; 4692 alu.is_op3 = 1; 4693 if (i == lasti) { 4694 alu.last = 1; 4695 } 4696 r = r600_bytecode_add_alu(ctx->bc, &alu); 4697 if (r) 4698 return r; 4699 } 4700 return 0; 4701} 4702 4703static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only) 4704{ 4705 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4706 struct r600_bytecode_alu alu; 4707 unsigned write_mask = inst->Dst[0].Register.WriteMask; 4708 int i, j, r, lasti = tgsi_last_instruction(write_mask); 4709 /* use temp register if trans_only and more than one dst component */ 4710 int use_tmp = trans_only && (write_mask ^ (1 << lasti)); 4711 unsigned op = ctx->inst_info->op; 4712 4713 if (op == ALU_OP2_MUL_IEEE && 4714 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS]) 4715 op = ALU_OP2_MUL; 4716 4717 /* nir_to_tgsi lowers nir_op_isub to UADD + negate, since r600 doesn't support 4718 * source modifiers with integer ops we switch 
back to SUB_INT */ 4719 bool src1_neg = ctx->src[1].neg; 4720 if (op == ALU_OP2_ADD_INT && src1_neg) { 4721 src1_neg = false; 4722 op = ALU_OP2_SUB_INT; 4723 } 4724 4725 for (i = 0; i <= lasti; i++) { 4726 if (!(write_mask & (1 << i))) 4727 continue; 4728 4729 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4730 if (use_tmp) { 4731 alu.dst.sel = ctx->temp_reg; 4732 alu.dst.chan = i; 4733 alu.dst.write = 1; 4734 } else 4735 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4736 4737 alu.op = op; 4738 if (!swap) { 4739 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 4740 r600_bytecode_src(&alu.src[j], &ctx->src[j], i); 4741 } 4742 alu.src[1].neg = src1_neg; 4743 } else { 4744 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 4745 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 4746 } 4747 if (i == lasti || trans_only) { 4748 alu.last = 1; 4749 } 4750 r = r600_bytecode_add_alu(ctx->bc, &alu); 4751 if (r) 4752 return r; 4753 } 4754 4755 if (use_tmp) { 4756 /* move result from temp to dst */ 4757 for (i = 0; i <= lasti; i++) { 4758 if (!(write_mask & (1 << i))) 4759 continue; 4760 4761 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4762 alu.op = ALU_OP1_MOV; 4763 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4764 alu.src[0].sel = ctx->temp_reg; 4765 alu.src[0].chan = i; 4766 alu.last = (i == lasti); 4767 4768 r = r600_bytecode_add_alu(ctx->bc, &alu); 4769 if (r) 4770 return r; 4771 } 4772 } 4773 return 0; 4774} 4775 4776static int tgsi_op2(struct r600_shader_ctx *ctx) 4777{ 4778 return tgsi_op2_s(ctx, 0, 0); 4779} 4780 4781static int tgsi_op2_swap(struct r600_shader_ctx *ctx) 4782{ 4783 return tgsi_op2_s(ctx, 1, 0); 4784} 4785 4786static int tgsi_op2_trans(struct r600_shader_ctx *ctx) 4787{ 4788 return tgsi_op2_s(ctx, 0, 1); 4789} 4790 4791static int tgsi_ineg(struct r600_shader_ctx *ctx) 4792{ 4793 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4794 struct r600_bytecode_alu alu; 4795 int i, r; 4796 int lasti = 
tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 4797 4798 for (i = 0; i < lasti + 1; i++) { 4799 4800 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 4801 continue; 4802 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4803 alu.op = ctx->inst_info->op; 4804 4805 alu.src[0].sel = V_SQ_ALU_SRC_0; 4806 4807 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 4808 4809 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4810 4811 if (i == lasti) { 4812 alu.last = 1; 4813 } 4814 r = r600_bytecode_add_alu(ctx->bc, &alu); 4815 if (r) 4816 return r; 4817 } 4818 return 0; 4819 4820} 4821 4822static int tgsi_dneg(struct r600_shader_ctx *ctx) 4823{ 4824 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4825 struct r600_bytecode_alu alu; 4826 int i, r; 4827 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 4828 4829 for (i = 0; i < lasti + 1; i++) { 4830 4831 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 4832 continue; 4833 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4834 alu.op = ALU_OP1_MOV; 4835 4836 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4837 4838 if (i == 1 || i == 3) 4839 r600_bytecode_src_toggle_neg(&alu.src[0]); 4840 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4841 4842 if (i == lasti) { 4843 alu.last = 1; 4844 } 4845 r = r600_bytecode_add_alu(ctx->bc, &alu); 4846 if (r) 4847 return r; 4848 } 4849 return 0; 4850 4851} 4852 4853static int tgsi_dfracexp(struct r600_shader_ctx *ctx) 4854{ 4855 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4856 struct r600_bytecode_alu alu; 4857 unsigned write_mask = inst->Dst[0].Register.WriteMask; 4858 int i, j, r; 4859 4860 for (i = 0; i <= 3; i++) { 4861 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4862 alu.op = ctx->inst_info->op; 4863 4864 alu.dst.sel = ctx->temp_reg; 4865 alu.dst.chan = i; 4866 alu.dst.write = 1; 4867 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 4868 r600_bytecode_src(&alu.src[j], &ctx->src[j], 
fp64_switch(i)); 4869 } 4870 4871 if (i == 3) 4872 alu.last = 1; 4873 4874 r = r600_bytecode_add_alu(ctx->bc, &alu); 4875 if (r) 4876 return r; 4877 } 4878 4879 /* Replicate significand result across channels. */ 4880 for (i = 0; i <= 3; i++) { 4881 if (!(write_mask & (1 << i))) 4882 continue; 4883 4884 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4885 alu.op = ALU_OP1_MOV; 4886 alu.src[0].chan = (i & 1) + 2; 4887 alu.src[0].sel = ctx->temp_reg; 4888 4889 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4890 alu.dst.write = 1; 4891 alu.last = 1; 4892 r = r600_bytecode_add_alu(ctx->bc, &alu); 4893 if (r) 4894 return r; 4895 } 4896 4897 for (i = 0; i <= 3; i++) { 4898 if (inst->Dst[1].Register.WriteMask & (1 << i)) { 4899 /* MOV third channels to writemask dst1 */ 4900 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4901 alu.op = ALU_OP1_MOV; 4902 alu.src[0].chan = 1; 4903 alu.src[0].sel = ctx->temp_reg; 4904 4905 tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst); 4906 alu.last = 1; 4907 r = r600_bytecode_add_alu(ctx->bc, &alu); 4908 if (r) 4909 return r; 4910 break; 4911 } 4912 } 4913 return 0; 4914} 4915 4916 4917static int egcm_int_to_double(struct r600_shader_ctx *ctx) 4918{ 4919 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4920 struct r600_bytecode_alu alu; 4921 int i, c, r; 4922 int write_mask = inst->Dst[0].Register.WriteMask; 4923 int temp_reg = r600_get_temp(ctx); 4924 4925 assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D || 4926 inst->Instruction.Opcode == TGSI_OPCODE_U2D); 4927 4928 for (c = 0; c < 2; c++) { 4929 int dchan = c * 2; 4930 if (write_mask & (0x3 << dchan)) { 4931 /* split into 24-bit int and 8-bit int */ 4932 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4933 alu.op = ALU_OP2_AND_INT; 4934 alu.dst.sel = temp_reg; 4935 alu.dst.chan = dchan; 4936 r600_bytecode_src(&alu.src[0], &ctx->src[0], c); 4937 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4938 alu.src[1].value = 0xffffff00; 4939 alu.dst.write = 1; 4940 r = 
r600_bytecode_add_alu(ctx->bc, &alu); 4941 if (r) 4942 return r; 4943 4944 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4945 alu.op = ALU_OP2_AND_INT; 4946 alu.dst.sel = temp_reg; 4947 alu.dst.chan = dchan + 1; 4948 r600_bytecode_src(&alu.src[0], &ctx->src[0], c); 4949 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4950 alu.src[1].value = 0xff; 4951 alu.dst.write = 1; 4952 alu.last = 1; 4953 r = r600_bytecode_add_alu(ctx->bc, &alu); 4954 if (r) 4955 return r; 4956 } 4957 } 4958 4959 for (c = 0; c < 2; c++) { 4960 int dchan = c * 2; 4961 if (write_mask & (0x3 << dchan)) { 4962 for (i = dchan; i <= dchan + 1; i++) { 4963 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4964 alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT; 4965 4966 alu.src[0].sel = temp_reg; 4967 alu.src[0].chan = i; 4968 alu.dst.sel = temp_reg; 4969 alu.dst.chan = i; 4970 alu.dst.write = 1; 4971 if (ctx->bc->chip_class == CAYMAN) 4972 alu.last = i == dchan + 1; 4973 else 4974 alu.last = 1; /* trans only ops on evergreen */ 4975 4976 r = r600_bytecode_add_alu(ctx->bc, &alu); 4977 if (r) 4978 return r; 4979 } 4980 } 4981 } 4982 4983 for (c = 0; c < 2; c++) { 4984 int dchan = c * 2; 4985 if (write_mask & (0x3 << dchan)) { 4986 for (i = 0; i < 4; i++) { 4987 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4988 alu.op = ALU_OP1_FLT32_TO_FLT64; 4989 4990 alu.src[0].chan = dchan + (i / 2); 4991 if (i == 0 || i == 2) 4992 alu.src[0].sel = temp_reg; 4993 else { 4994 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 4995 alu.src[0].value = 0x0; 4996 } 4997 alu.dst.sel = ctx->temp_reg; 4998 alu.dst.chan = i; 4999 alu.last = i == 3; 5000 alu.dst.write = 1; 5001 5002 r = r600_bytecode_add_alu(ctx->bc, &alu); 5003 if (r) 5004 return r; 5005 } 5006 5007 for (i = 0; i <= 1; i++) { 5008 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5009 alu.op = ALU_OP2_ADD_64; 5010 5011 alu.src[0].chan = fp64_switch(i); 5012 alu.src[0].sel = ctx->temp_reg; 5013 5014 alu.src[1].chan = fp64_switch(i + 2); 5015 
alu.src[1].sel = ctx->temp_reg; 5016 tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst); 5017 alu.last = i == 1; 5018 5019 r = r600_bytecode_add_alu(ctx->bc, &alu); 5020 if (r) 5021 return r; 5022 } 5023 } 5024 } 5025 5026 return 0; 5027} 5028 5029static int egcm_double_to_int(struct r600_shader_ctx *ctx) 5030{ 5031 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5032 struct r600_bytecode_alu alu; 5033 int i, r; 5034 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 5035 int treg = r600_get_temp(ctx); 5036 assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I || 5037 inst->Instruction.Opcode == TGSI_OPCODE_D2U); 5038 5039 /* do a 64->32 into a temp register */ 5040 r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32); 5041 if (r) 5042 return r; 5043 5044 for (i = 0; i <= lasti; i++) { 5045 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 5046 continue; 5047 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5048 alu.op = ctx->inst_info->op; 5049 5050 alu.src[0].chan = i; 5051 alu.src[0].sel = treg; 5052 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5053 alu.last = (i == lasti); 5054 5055 r = r600_bytecode_add_alu(ctx->bc, &alu); 5056 if (r) 5057 return r; 5058 } 5059 5060 return 0; 5061} 5062 5063static int cayman_emit_unary_double_raw(struct r600_bytecode *bc, 5064 unsigned op, 5065 int dst_reg, 5066 struct r600_shader_src *src, 5067 bool abs) 5068{ 5069 struct r600_bytecode_alu alu; 5070 const int last_slot = 3; 5071 int r; 5072 5073 /* these have to write the result to X/Y by the looks of it */ 5074 for (int i = 0 ; i < last_slot; i++) { 5075 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5076 alu.op = op; 5077 5078 r600_bytecode_src(&alu.src[0], src, 1); 5079 r600_bytecode_src(&alu.src[1], src, 0); 5080 5081 if (abs) 5082 r600_bytecode_src_set_abs(&alu.src[1]); 5083 5084 alu.dst.sel = dst_reg; 5085 alu.dst.chan = i; 5086 alu.dst.write = (i == 0 || i == 1); 5087 5088 if (bc->chip_class != CAYMAN || i 
== last_slot - 1) 5089 alu.last = 1; 5090 r = r600_bytecode_add_alu(bc, &alu); 5091 if (r) 5092 return r; 5093 } 5094 5095 return 0; 5096} 5097 5098static int cayman_emit_double_instr(struct r600_shader_ctx *ctx) 5099{ 5100 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5101 int i, r; 5102 struct r600_bytecode_alu alu; 5103 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 5104 int t1 = ctx->temp_reg; 5105 5106 /* should only be one src regs */ 5107 assert(inst->Instruction.NumSrcRegs == 1); 5108 5109 /* only support one double at a time */ 5110 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY || 5111 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW); 5112 5113 r = cayman_emit_unary_double_raw( 5114 ctx->bc, ctx->inst_info->op, t1, 5115 &ctx->src[0], 5116 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ || 5117 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT); 5118 if (r) 5119 return r; 5120 5121 for (i = 0 ; i <= lasti; i++) { 5122 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 5123 continue; 5124 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5125 alu.op = ALU_OP1_MOV; 5126 alu.src[0].sel = t1; 5127 alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1; 5128 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5129 alu.dst.write = 1; 5130 if (i == lasti) 5131 alu.last = 1; 5132 r = r600_bytecode_add_alu(ctx->bc, &alu); 5133 if (r) 5134 return r; 5135 } 5136 return 0; 5137} 5138 5139static int cayman_emit_float_instr(struct r600_shader_ctx *ctx) 5140{ 5141 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5142 int i, j, r; 5143 struct r600_bytecode_alu alu; 5144 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 
4 : 3; 5145 5146 for (i = 0 ; i < last_slot; i++) { 5147 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5148 alu.op = ctx->inst_info->op; 5149 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 5150 r600_bytecode_src(&alu.src[j], &ctx->src[j], 0); 5151 5152 /* RSQ should take the absolute value of src */ 5153 if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) { 5154 r600_bytecode_src_set_abs(&alu.src[j]); 5155 } 5156 } 5157 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5158 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; 5159 5160 if (i == last_slot - 1) 5161 alu.last = 1; 5162 r = r600_bytecode_add_alu(ctx->bc, &alu); 5163 if (r) 5164 return r; 5165 } 5166 return 0; 5167} 5168 5169static int cayman_mul_int_instr(struct r600_shader_ctx *ctx) 5170{ 5171 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5172 int i, j, k, r; 5173 struct r600_bytecode_alu alu; 5174 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 5175 int t1 = ctx->temp_reg; 5176 5177 for (k = 0; k <= lasti; k++) { 5178 if (!(inst->Dst[0].Register.WriteMask & (1 << k))) 5179 continue; 5180 5181 for (i = 0 ; i < 4; i++) { 5182 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5183 alu.op = ctx->inst_info->op; 5184 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 5185 r600_bytecode_src(&alu.src[j], &ctx->src[j], k); 5186 } 5187 alu.dst.sel = t1; 5188 alu.dst.chan = i; 5189 alu.dst.write = (i == k); 5190 if (i == 3) 5191 alu.last = 1; 5192 r = r600_bytecode_add_alu(ctx->bc, &alu); 5193 if (r) 5194 return r; 5195 } 5196 } 5197 5198 for (i = 0 ; i <= lasti; i++) { 5199 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 5200 continue; 5201 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5202 alu.op = ALU_OP1_MOV; 5203 alu.src[0].sel = t1; 5204 alu.src[0].chan = i; 5205 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5206 alu.dst.write = 1; 5207 if (i == lasti) 5208 alu.last = 1; 5209 r = r600_bytecode_add_alu(ctx->bc, &alu); 5210 if (r) 5211 return 
r; 5212 } 5213 5214 return 0; 5215} 5216 5217 5218static int cayman_mul_double_instr(struct r600_shader_ctx *ctx) 5219{ 5220 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5221 int i, j, k, r; 5222 struct r600_bytecode_alu alu; 5223 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 5224 int t1 = ctx->temp_reg; 5225 5226 /* t1 would get overwritten below if we actually tried to 5227 * multiply two pairs of doubles at a time. */ 5228 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY || 5229 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW); 5230 5231 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1; 5232 5233 for (i = 0; i < 4; i++) { 5234 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5235 alu.op = ctx->inst_info->op; 5236 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 5237 r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1)); 5238 } 5239 alu.dst.sel = t1; 5240 alu.dst.chan = i; 5241 alu.dst.write = 1; 5242 if (i == 3) 5243 alu.last = 1; 5244 r = r600_bytecode_add_alu(ctx->bc, &alu); 5245 if (r) 5246 return r; 5247 } 5248 5249 for (i = 0; i <= lasti; i++) { 5250 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 5251 continue; 5252 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5253 alu.op = ALU_OP1_MOV; 5254 alu.src[0].sel = t1; 5255 alu.src[0].chan = i; 5256 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5257 alu.dst.write = 1; 5258 if (i == lasti) 5259 alu.last = 1; 5260 r = r600_bytecode_add_alu(ctx->bc, &alu); 5261 if (r) 5262 return r; 5263 } 5264 5265 return 0; 5266} 5267 5268/* 5269 * Emit RECIP_64 + MUL_64 to implement division. 5270 */ 5271static int cayman_ddiv_instr(struct r600_shader_ctx *ctx) 5272{ 5273 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5274 int r; 5275 struct r600_bytecode_alu alu; 5276 int t1 = ctx->temp_reg; 5277 int k; 5278 5279 /* Only support one double at a time. 
This is the same constraint as 5280 * in DMUL lowering. */ 5281 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY || 5282 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW); 5283 5284 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1; 5285 5286 r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false); 5287 if (r) 5288 return r; 5289 5290 for (int i = 0; i < 4; i++) { 5291 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5292 alu.op = ALU_OP2_MUL_64; 5293 5294 r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1)); 5295 5296 alu.src[1].sel = t1; 5297 alu.src[1].chan = (i == 3) ? 0 : 1; 5298 5299 alu.dst.sel = t1; 5300 alu.dst.chan = i; 5301 alu.dst.write = 1; 5302 if (i == 3) 5303 alu.last = 1; 5304 r = r600_bytecode_add_alu(ctx->bc, &alu); 5305 if (r) 5306 return r; 5307 } 5308 5309 for (int i = 0; i < 2; i++) { 5310 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5311 alu.op = ALU_OP1_MOV; 5312 alu.src[0].sel = t1; 5313 alu.src[0].chan = i; 5314 tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst); 5315 alu.dst.write = 1; 5316 if (i == 1) 5317 alu.last = 1; 5318 r = r600_bytecode_add_alu(ctx->bc, &alu); 5319 if (r) 5320 return r; 5321 } 5322 return 0; 5323}

/*
 * Range-reduce the x channel of source 0 ahead of a SIN/COS opcode, leaving
 * the result in ctx->temp_reg.x.  Three ALU groups are emitted:
 *
 *   tmp.x = src0.x * (0.5 / PI) + V_SQ_ALU_SRC_0_5
 *   tmp.x = FRACT(tmp.x)
 *   R600:        tmp.x = tmp.x * (2 * PI) + (-PI)      (-PI..PI range)
 *   other chips: tmp.x = tmp.x * V_SQ_ALU_SRC_1 - V_SQ_ALU_SRC_0_5
 *                (normalized by 2*PI instead)
 *
 * see fdo bug 27901
 *
 * Returns 0 on success, otherwise the non-zero value returned by
 * r600_bytecode_add_alu().
 */
static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
{
	const int tmp = ctx->temp_reg;
	struct r600_bytecode_alu alu;
	int err;

	/* tmp.x = src0.x * 1/(2*PI) + 0.5-constant */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;
	alu.dst.sel = tmp;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
	alu.src[2].chan = 0;
	alu.last = 1;
	err = r600_bytecode_add_alu(ctx->bc, &alu);
	if (err)
		return err;

	/* tmp.x = fract(tmp.x) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP1_FRACT;
	alu.dst.sel = tmp;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	alu.src[0].sel = tmp;
	alu.src[0].chan = 0;
	alu.last = 1;
	err = r600_bytecode_add_alu(ctx->bc, &alu);
	if (err)
		return err;

	/* tmp.x = tmp.x * scale + bias, with chip-dependent scale and bias */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;
	alu.dst.sel = tmp;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	alu.src[0].sel = tmp;
	alu.src[0].chan = 0;
	alu.src[1].chan = 0;
	alu.src[2].chan = 0;

	if (ctx->bc->chip_class == R600) {
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[2].value = u_bitcast_f2u(-M_PI);
	} else {
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
		alu.src[2].neg = 1;
	}

	alu.last = 1;
	return r600_bytecode_add_alu(ctx->bc, &alu);
}

5401static int cayman_trig(struct r600_shader_ctx *ctx) 5402{ 5403 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5404 struct r600_bytecode_alu alu; 5405 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 
4 : 3; 5406 int i, r; 5407 5408 r = tgsi_setup_trig(ctx); 5409 if (r) 5410 return r; 5411 5412 5413 for (i = 0; i < last_slot; i++) { 5414 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5415 alu.op = ctx->inst_info->op; 5416 alu.dst.chan = i; 5417 5418 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5419 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; 5420 5421 alu.src[0].sel = ctx->temp_reg; 5422 alu.src[0].chan = 0; 5423 if (i == last_slot - 1) 5424 alu.last = 1; 5425 r = r600_bytecode_add_alu(ctx->bc, &alu); 5426 if (r) 5427 return r; 5428 } 5429 return 0; 5430} 5431 5432static int tgsi_trig(struct r600_shader_ctx *ctx) 5433{ 5434 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5435 struct r600_bytecode_alu alu; 5436 int i, r; 5437 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 5438 5439 r = tgsi_setup_trig(ctx); 5440 if (r) 5441 return r; 5442 5443 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5444 alu.op = ctx->inst_info->op; 5445 alu.dst.chan = 0; 5446 alu.dst.sel = ctx->temp_reg; 5447 alu.dst.write = 1; 5448 5449 alu.src[0].sel = ctx->temp_reg; 5450 alu.src[0].chan = 0; 5451 alu.last = 1; 5452 r = r600_bytecode_add_alu(ctx->bc, &alu); 5453 if (r) 5454 return r; 5455 5456 /* replicate result */ 5457 for (i = 0; i < lasti + 1; i++) { 5458 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 5459 continue; 5460 5461 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5462 alu.op = ALU_OP1_MOV; 5463 5464 alu.src[0].sel = ctx->temp_reg; 5465 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5466 if (i == lasti) 5467 alu.last = 1; 5468 r = r600_bytecode_add_alu(ctx->bc, &alu); 5469 if (r) 5470 return r; 5471 } 5472 return 0; 5473} 5474 5475static int tgsi_kill(struct r600_shader_ctx *ctx) 5476{ 5477 const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5478 struct r600_bytecode_alu alu; 5479 int i, r; 5480 5481 for (i = 0; i < 4; i++) { 5482 memset(&alu, 0, sizeof(struct 
r600_bytecode_alu)); 5483 alu.op = ctx->inst_info->op; 5484 5485 alu.dst.chan = i; 5486 5487 alu.src[0].sel = V_SQ_ALU_SRC_0; 5488 5489 if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) { 5490 alu.src[1].sel = V_SQ_ALU_SRC_1; 5491 alu.src[1].neg = 1; 5492 } else { 5493 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 5494 } 5495 if (i == 3) { 5496 alu.last = 1; 5497 } 5498 r = r600_bytecode_add_alu(ctx->bc, &alu); 5499 if (r) 5500 return r; 5501 } 5502 5503 /* kill must be last in ALU */ 5504 ctx->bc->force_add_cf = 1; 5505 ctx->shader->uses_kill = TRUE; 5506 return 0; 5507} 5508 5509static int tgsi_lit(struct r600_shader_ctx *ctx) 5510{ 5511 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5512 struct r600_bytecode_alu alu; 5513 int r; 5514 5515 /* tmp.x = max(src.y, 0.0) */ 5516 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5517 alu.op = ALU_OP2_MAX; 5518 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); 5519 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/ 5520 alu.src[1].chan = 1; 5521 5522 alu.dst.sel = ctx->temp_reg; 5523 alu.dst.chan = 0; 5524 alu.dst.write = 1; 5525 5526 alu.last = 1; 5527 r = r600_bytecode_add_alu(ctx->bc, &alu); 5528 if (r) 5529 return r; 5530 5531 if (inst->Dst[0].Register.WriteMask & (1 << 2)) 5532 { 5533 int chan; 5534 int sel; 5535 unsigned i; 5536 5537 if (ctx->bc->chip_class == CAYMAN) { 5538 for (i = 0; i < 3; i++) { 5539 /* tmp.z = log(tmp.x) */ 5540 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5541 alu.op = ALU_OP1_LOG_CLAMPED; 5542 alu.src[0].sel = ctx->temp_reg; 5543 alu.src[0].chan = 0; 5544 alu.dst.sel = ctx->temp_reg; 5545 alu.dst.chan = i; 5546 if (i == 2) { 5547 alu.dst.write = 1; 5548 alu.last = 1; 5549 } else 5550 alu.dst.write = 0; 5551 5552 r = r600_bytecode_add_alu(ctx->bc, &alu); 5553 if (r) 5554 return r; 5555 } 5556 } else { 5557 /* tmp.z = log(tmp.x) */ 5558 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5559 alu.op = ALU_OP1_LOG_CLAMPED; 5560 alu.src[0].sel = ctx->temp_reg; 5561 
alu.src[0].chan = 0; 5562 alu.dst.sel = ctx->temp_reg; 5563 alu.dst.chan = 2; 5564 alu.dst.write = 1; 5565 alu.last = 1; 5566 r = r600_bytecode_add_alu(ctx->bc, &alu); 5567 if (r) 5568 return r; 5569 } 5570 5571 chan = alu.dst.chan; 5572 sel = alu.dst.sel; 5573 5574 /* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */ 5575 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5576 alu.op = ALU_OP3_MUL_LIT; 5577 alu.src[0].sel = sel; 5578 alu.src[0].chan = chan; 5579 r600_bytecode_src(&alu.src[1], &ctx->src[0], 3); 5580 r600_bytecode_src(&alu.src[2], &ctx->src[0], 0); 5581 alu.dst.sel = ctx->temp_reg; 5582 alu.dst.chan = 0; 5583 alu.dst.write = 1; 5584 alu.is_op3 = 1; 5585 alu.last = 1; 5586 r = r600_bytecode_add_alu(ctx->bc, &alu); 5587 if (r) 5588 return r; 5589 5590 if (ctx->bc->chip_class == CAYMAN) { 5591 for (i = 0; i < 3; i++) { 5592 /* dst.z = exp(tmp.x) */ 5593 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5594 alu.op = ALU_OP1_EXP_IEEE; 5595 alu.src[0].sel = ctx->temp_reg; 5596 alu.src[0].chan = 0; 5597 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5598 if (i == 2) { 5599 alu.dst.write = 1; 5600 alu.last = 1; 5601 } else 5602 alu.dst.write = 0; 5603 r = r600_bytecode_add_alu(ctx->bc, &alu); 5604 if (r) 5605 return r; 5606 } 5607 } else { 5608 /* dst.z = exp(tmp.x) */ 5609 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5610 alu.op = ALU_OP1_EXP_IEEE; 5611 alu.src[0].sel = ctx->temp_reg; 5612 alu.src[0].chan = 0; 5613 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst); 5614 alu.last = 1; 5615 r = r600_bytecode_add_alu(ctx->bc, &alu); 5616 if (r) 5617 return r; 5618 } 5619 } 5620 5621 /* dst.x, <- 1.0 */ 5622 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5623 alu.op = ALU_OP1_MOV; 5624 alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/ 5625 alu.src[0].chan = 0; 5626 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 5627 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1; 5628 r = r600_bytecode_add_alu(ctx->bc, &alu); 5629 if (r) 5630 return r; 5631 5632 /* dst.y = 
max(src.x, 0.0) */ 5633 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5634 alu.op = ALU_OP2_MAX; 5635 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5636 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/ 5637 alu.src[1].chan = 0; 5638 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst); 5639 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1; 5640 r = r600_bytecode_add_alu(ctx->bc, &alu); 5641 if (r) 5642 return r; 5643 5644 /* dst.w, <- 1.0 */ 5645 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5646 alu.op = ALU_OP1_MOV; 5647 alu.src[0].sel = V_SQ_ALU_SRC_1; 5648 alu.src[0].chan = 0; 5649 tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst); 5650 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1; 5651 alu.last = 1; 5652 r = r600_bytecode_add_alu(ctx->bc, &alu); 5653 if (r) 5654 return r; 5655 5656 return 0; 5657} 5658 5659static int tgsi_rsq(struct r600_shader_ctx *ctx) 5660{ 5661 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5662 struct r600_bytecode_alu alu; 5663 int i, r; 5664 5665 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5666 5667 alu.op = ALU_OP1_RECIPSQRT_IEEE; 5668 5669 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 5670 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0); 5671 r600_bytecode_src_set_abs(&alu.src[i]); 5672 } 5673 alu.dst.sel = ctx->temp_reg; 5674 alu.dst.write = 1; 5675 alu.last = 1; 5676 r = r600_bytecode_add_alu(ctx->bc, &alu); 5677 if (r) 5678 return r; 5679 /* replicate result */ 5680 return tgsi_helper_tempx_replicate(ctx); 5681} 5682 5683static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx) 5684{ 5685 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5686 struct r600_bytecode_alu alu; 5687 int i, r; 5688 5689 for (i = 0; i < 4; i++) { 5690 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5691 alu.src[0].sel = ctx->temp_reg; 5692 alu.op = ALU_OP1_MOV; 5693 alu.dst.chan = i; 5694 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5695 alu.dst.write = 
(inst->Dst[0].Register.WriteMask >> i) & 1; 5696 if (i == 3) 5697 alu.last = 1; 5698 r = r600_bytecode_add_alu(ctx->bc, &alu); 5699 if (r) 5700 return r; 5701 } 5702 return 0; 5703} 5704 5705static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx) 5706{ 5707 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5708 struct r600_bytecode_alu alu; 5709 int i, r; 5710 5711 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5712 alu.op = ctx->inst_info->op; 5713 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 5714 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0); 5715 } 5716 alu.dst.sel = ctx->temp_reg; 5717 alu.dst.write = 1; 5718 alu.last = 1; 5719 r = r600_bytecode_add_alu(ctx->bc, &alu); 5720 if (r) 5721 return r; 5722 /* replicate result */ 5723 return tgsi_helper_tempx_replicate(ctx); 5724} 5725 5726static int cayman_pow(struct r600_shader_ctx *ctx) 5727{ 5728 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5729 int i, r; 5730 struct r600_bytecode_alu alu; 5731 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 
4 : 3; 5732 5733 for (i = 0; i < 3; i++) { 5734 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5735 alu.op = ALU_OP1_LOG_IEEE; 5736 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5737 alu.dst.sel = ctx->temp_reg; 5738 alu.dst.chan = i; 5739 alu.dst.write = 1; 5740 if (i == 2) 5741 alu.last = 1; 5742 r = r600_bytecode_add_alu(ctx->bc, &alu); 5743 if (r) 5744 return r; 5745 } 5746 5747 /* b * LOG2(a) */ 5748 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5749 alu.op = ALU_OP2_MUL; 5750 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 5751 alu.src[1].sel = ctx->temp_reg; 5752 alu.dst.sel = ctx->temp_reg; 5753 alu.dst.write = 1; 5754 alu.last = 1; 5755 r = r600_bytecode_add_alu(ctx->bc, &alu); 5756 if (r) 5757 return r; 5758 5759 for (i = 0; i < last_slot; i++) { 5760 /* POW(a,b) = EXP2(b * LOG2(a))*/ 5761 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5762 alu.op = ALU_OP1_EXP_IEEE; 5763 alu.src[0].sel = ctx->temp_reg; 5764 5765 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5766 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; 5767 if (i == last_slot - 1) 5768 alu.last = 1; 5769 r = r600_bytecode_add_alu(ctx->bc, &alu); 5770 if (r) 5771 return r; 5772 } 5773 return 0; 5774} 5775 5776static int tgsi_pow(struct r600_shader_ctx *ctx) 5777{ 5778 struct r600_bytecode_alu alu; 5779 int r; 5780 5781 /* LOG2(a) */ 5782 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5783 alu.op = ALU_OP1_LOG_IEEE; 5784 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5785 alu.dst.sel = ctx->temp_reg; 5786 alu.dst.write = 1; 5787 alu.last = 1; 5788 r = r600_bytecode_add_alu(ctx->bc, &alu); 5789 if (r) 5790 return r; 5791 /* b * LOG2(a) */ 5792 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5793 alu.op = ALU_OP2_MUL; 5794 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 5795 alu.src[1].sel = ctx->temp_reg; 5796 alu.dst.sel = ctx->temp_reg; 5797 alu.dst.write = 1; 5798 alu.last = 1; 5799 r = r600_bytecode_add_alu(ctx->bc, &alu); 5800 if (r) 5801 return r; 5802 /* 
POW(a,b) = EXP2(b * LOG2(a))*/ 5803 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5804 alu.op = ALU_OP1_EXP_IEEE; 5805 alu.src[0].sel = ctx->temp_reg; 5806 alu.dst.sel = ctx->temp_reg; 5807 alu.dst.write = 1; 5808 alu.last = 1; 5809 r = r600_bytecode_add_alu(ctx->bc, &alu); 5810 if (r) 5811 return r; 5812 return tgsi_helper_tempx_replicate(ctx); 5813} 5814 5815static int emit_mul_int_op(struct r600_bytecode *bc, 5816 struct r600_bytecode_alu *alu_src) 5817{ 5818 struct r600_bytecode_alu alu; 5819 int i, r; 5820 alu = *alu_src; 5821 if (bc->chip_class == CAYMAN) { 5822 for (i = 0; i < 4; i++) { 5823 alu.dst.chan = i; 5824 alu.dst.write = (i == alu_src->dst.chan); 5825 alu.last = (i == 3); 5826 5827 r = r600_bytecode_add_alu(bc, &alu); 5828 if (r) 5829 return r; 5830 } 5831 } else { 5832 alu.last = 1; 5833 r = r600_bytecode_add_alu(bc, &alu); 5834 if (r) 5835 return r; 5836 } 5837 return 0; 5838} 5839 5840static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op) 5841{ 5842 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5843 struct r600_bytecode_alu alu; 5844 int i, r, j; 5845 unsigned write_mask = inst->Dst[0].Register.WriteMask; 5846 int lasti = tgsi_last_instruction(write_mask); 5847 int tmp0 = ctx->temp_reg; 5848 int tmp1 = r600_get_temp(ctx); 5849 int tmp2 = r600_get_temp(ctx); 5850 int tmp3 = r600_get_temp(ctx); 5851 int tmp4 = 0; 5852 5853 /* Use additional temp if dst register and src register are the same */ 5854 if (inst->Src[0].Register.Index == inst->Dst[0].Register.Index || 5855 inst->Src[1].Register.Index == inst->Dst[0].Register.Index) { 5856 tmp4 = r600_get_temp(ctx); 5857 } 5858 5859 /* Unsigned path: 5860 * 5861 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder 5862 * 5863 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error 5864 * 2. tmp0.z = lo (tmp0.x * src2) 5865 * 3. tmp0.w = -tmp0.z 5866 * 4. tmp0.y = hi (tmp0.x * src2) 5867 * 5. 
tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2)) 5868 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error 5869 * 7. tmp1.x = tmp0.x - tmp0.w 5870 * 8. tmp1.y = tmp0.x + tmp0.w 5871 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) 5872 * 10. tmp0.z = hi(tmp0.x * src1) = q 5873 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r 5874 * 5875 * 12. tmp0.w = src1 - tmp0.y = r 5876 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison) 5877 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison) 5878 * 5879 * if DIV 5880 * 5881 * 15. tmp1.z = tmp0.z + 1 = q + 1 5882 * 16. tmp1.w = tmp0.z - 1 = q - 1 5883 * 5884 * else MOD 5885 * 5886 * 15. tmp1.z = tmp0.w - src2 = r - src2 5887 * 16. tmp1.w = tmp0.w + src2 = r + src2 5888 * 5889 * endif 5890 * 5891 * 17. tmp1.x = tmp1.x & tmp1.y 5892 * 5893 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z 5894 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z 5895 * 5896 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z 5897 * 20. dst = src2==0 ? MAX_UINT : tmp0.z 5898 * 5899 * Signed path: 5900 * 5901 * Same as unsigned, using abs values of the operands, 5902 * and fixing the sign of the result in the end. 
5903 */ 5904 5905 /* One complete divide/modulo sequence is emitted per enabled destination channel. */ for (i = 0; i < 4; i++) { 5906 if (!(write_mask & (1<<i))) 5907 continue; 5908 5909 if (signed_op) { 5910 5911 /* Signed prologue: compute the operand magnitudes into tmp2.x / tmp2.y. */ /* tmp2.x = -src0 */ 5912 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5913 alu.op = ALU_OP2_SUB_INT; 5914 5915 alu.dst.sel = tmp2; 5916 alu.dst.chan = 0; 5917 alu.dst.write = 1; 5918 5919 alu.src[0].sel = V_SQ_ALU_SRC_0; 5920 5921 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 5922 5923 alu.last = 1; 5924 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5925 return r; 5926 5927 /* tmp2.y = -src1 */ 5928 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5929 alu.op = ALU_OP2_SUB_INT; 5930 5931 alu.dst.sel = tmp2; 5932 alu.dst.chan = 1; 5933 alu.dst.write = 1; 5934 5935 alu.src[0].sel = V_SQ_ALU_SRC_0; 5936 5937 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5938 5939 alu.last = 1; 5940 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5941 return r; 5942 5943 /* tmp2.z = src0 ^ src1: its sign bit is set if src0 and src1 signs are different */ 5944 /* it will be the sign of the quotient, so it is only computed for DIV (!mod) */ 5945 if (!mod) { 5946 5947 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5948 alu.op = ALU_OP2_XOR_INT; 5949 5950 alu.dst.sel = tmp2; 5951 alu.dst.chan = 2; 5952 alu.dst.write = 1; 5953 5954 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 5955 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5956 5957 alu.last = 1; 5958 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5959 return r; 5960 } 5961 5962 /* tmp2.x = |src0|, i.e. src0>=0 ? src0 : tmp2.x (tmp2.x holds -src0 here) */ 5963 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5964 alu.op = ALU_OP3_CNDGE_INT; 5965 alu.is_op3 = 1; 5966 5967 alu.dst.sel = tmp2; 5968 alu.dst.chan = 0; 5969 alu.dst.write = 1; 5970 5971 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 5972 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 5973 alu.src[2].sel = tmp2; 5974 alu.src[2].chan = 0; 5975 5976 alu.last = 1; 5977 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5978 return r; 5979 5980 /* tmp2.y = |src1|, i.e. src1>=0 ? src1 : tmp2.y (tmp2.y holds -src1 here) */ 5981 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5982 alu.op = 
ALU_OP3_CNDGE_INT; 5983 alu.is_op3 = 1; 5984 5985 alu.dst.sel = tmp2; 5986 alu.dst.chan = 1; 5987 alu.dst.write = 1; 5988 5989 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 5990 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5991 alu.src[2].sel = tmp2; 5992 alu.src[2].chan = 1; 5993 5994 alu.last = 1; 5995 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5996 return r; 5997 5998 } 5999 6000 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */ 6001 if (ctx->bc->chip_class == CAYMAN) { 6002 /* tmp3.x = u2f(src2) */ 6003 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6004 alu.op = ALU_OP1_UINT_TO_FLT; 6005 6006 alu.dst.sel = tmp3; 6007 alu.dst.chan = 0; 6008 alu.dst.write = 1; 6009 6010 if (signed_op) { 6011 alu.src[0].sel = tmp2; 6012 alu.src[0].chan = 1; 6013 } else { 6014 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 6015 } 6016 6017 alu.last = 1; 6018 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6019 return r; 6020 6021 /* tmp0.x = recip(tmp3.x) */ 6022 for (j = 0 ; j < 3; j++) { 6023 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6024 alu.op = ALU_OP1_RECIP_IEEE; 6025 6026 alu.dst.sel = tmp0; 6027 alu.dst.chan = j; 6028 alu.dst.write = (j == 0); 6029 6030 alu.src[0].sel = tmp3; 6031 alu.src[0].chan = 0; 6032 6033 if (j == 2) 6034 alu.last = 1; 6035 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6036 return r; 6037 } 6038 6039 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6040 alu.op = ALU_OP2_MUL; 6041 6042 alu.src[0].sel = tmp0; 6043 alu.src[0].chan = 0; 6044 6045 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 6046 alu.src[1].value = 0x4f800000; 6047 6048 alu.dst.sel = tmp3; 6049 alu.dst.write = 1; 6050 alu.last = 1; 6051 r = r600_bytecode_add_alu(ctx->bc, &alu); 6052 if (r) 6053 return r; 6054 6055 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6056 alu.op = ALU_OP1_FLT_TO_UINT; 6057 6058 alu.dst.sel = tmp0; 6059 alu.dst.chan = 0; 6060 alu.dst.write = 1; 6061 6062 alu.src[0].sel = tmp3; 6063 alu.src[0].chan = 0; 6064 
6065 alu.last = 1; 6066 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6067 return r; 6068 6069 } else { 6070 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6071 alu.op = ALU_OP1_RECIP_UINT; 6072 6073 alu.dst.sel = tmp0; 6074 alu.dst.chan = 0; 6075 alu.dst.write = 1; 6076 6077 if (signed_op) { 6078 alu.src[0].sel = tmp2; 6079 alu.src[0].chan = 1; 6080 } else { 6081 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 6082 } 6083 6084 alu.last = 1; 6085 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6086 return r; 6087 } 6088 6089 /* 2. tmp0.z = lo (tmp0.x * src2) */ 6090 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6091 alu.op = ALU_OP2_MULLO_UINT; 6092 6093 alu.dst.sel = tmp0; 6094 alu.dst.chan = 2; 6095 alu.dst.write = 1; 6096 6097 alu.src[0].sel = tmp0; 6098 alu.src[0].chan = 0; 6099 if (signed_op) { 6100 alu.src[1].sel = tmp2; 6101 alu.src[1].chan = 1; 6102 } else { 6103 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 6104 } 6105 6106 if ((r = emit_mul_int_op(ctx->bc, &alu))) 6107 return r; 6108 6109 /* 3. tmp0.w = -tmp0.z */ 6110 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6111 alu.op = ALU_OP2_SUB_INT; 6112 6113 alu.dst.sel = tmp0; 6114 alu.dst.chan = 3; 6115 alu.dst.write = 1; 6116 6117 alu.src[0].sel = V_SQ_ALU_SRC_0; 6118 alu.src[1].sel = tmp0; 6119 alu.src[1].chan = 2; 6120 6121 alu.last = 1; 6122 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6123 return r; 6124 6125 /* 4. tmp0.y = hi (tmp0.x * src2) */ 6126 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6127 alu.op = ALU_OP2_MULHI_UINT; 6128 6129 alu.dst.sel = tmp0; 6130 alu.dst.chan = 1; 6131 alu.dst.write = 1; 6132 6133 alu.src[0].sel = tmp0; 6134 alu.src[0].chan = 0; 6135 6136 if (signed_op) { 6137 alu.src[1].sel = tmp2; 6138 alu.src[1].chan = 1; 6139 } else { 6140 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 6141 } 6142 6143 if ((r = emit_mul_int_op(ctx->bc, &alu))) 6144 return r; 6145 6146 /* 5. tmp0.z = (tmp0.y == 0 ? 
tmp0.w : tmp0.z) = abs(lo(rcp*src)) */ 6147 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6148 alu.op = ALU_OP3_CNDE_INT; 6149 alu.is_op3 = 1; 6150 6151 alu.dst.sel = tmp0; 6152 alu.dst.chan = 2; 6153 alu.dst.write = 1; 6154 6155 alu.src[0].sel = tmp0; 6156 alu.src[0].chan = 1; 6157 alu.src[1].sel = tmp0; 6158 alu.src[1].chan = 3; 6159 alu.src[2].sel = tmp0; 6160 alu.src[2].chan = 2; 6161 6162 alu.last = 1; 6163 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6164 return r; 6165 6166 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */ 6167 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6168 alu.op = ALU_OP2_MULHI_UINT; 6169 6170 alu.dst.sel = tmp0; 6171 alu.dst.chan = 3; 6172 alu.dst.write = 1; 6173 6174 alu.src[0].sel = tmp0; 6175 alu.src[0].chan = 2; 6176 6177 alu.src[1].sel = tmp0; 6178 alu.src[1].chan = 0; 6179 6180 if ((r = emit_mul_int_op(ctx->bc, &alu))) 6181 return r; 6182 6183 /* 7. tmp1.x = tmp0.x - tmp0.w */ 6184 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6185 alu.op = ALU_OP2_SUB_INT; 6186 6187 alu.dst.sel = tmp1; 6188 alu.dst.chan = 0; 6189 alu.dst.write = 1; 6190 6191 alu.src[0].sel = tmp0; 6192 alu.src[0].chan = 0; 6193 alu.src[1].sel = tmp0; 6194 alu.src[1].chan = 3; 6195 6196 alu.last = 1; 6197 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6198 return r; 6199 6200 /* 8. tmp1.y = tmp0.x + tmp0.w */ 6201 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6202 alu.op = ALU_OP2_ADD_INT; 6203 6204 alu.dst.sel = tmp1; 6205 alu.dst.chan = 1; 6206 alu.dst.write = 1; 6207 6208 alu.src[0].sel = tmp0; 6209 alu.src[0].chan = 0; 6210 alu.src[1].sel = tmp0; 6211 alu.src[1].chan = 3; 6212 6213 alu.last = 1; 6214 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6215 return r; 6216 6217 /* 9. tmp0.x = (tmp0.y == 0 ? 
tmp1.y : tmp1.x) */ 6218 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6219 alu.op = ALU_OP3_CNDE_INT; 6220 alu.is_op3 = 1; 6221 6222 alu.dst.sel = tmp0; 6223 alu.dst.chan = 0; 6224 alu.dst.write = 1; 6225 6226 alu.src[0].sel = tmp0; 6227 alu.src[0].chan = 1; 6228 alu.src[1].sel = tmp1; 6229 alu.src[1].chan = 1; 6230 alu.src[2].sel = tmp1; 6231 alu.src[2].chan = 0; 6232 6233 alu.last = 1; 6234 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6235 return r; 6236 6237 /* 10. tmp0.z = hi(tmp0.x * src1) = q */ 6238 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6239 alu.op = ALU_OP2_MULHI_UINT; 6240 6241 alu.dst.sel = tmp0; 6242 alu.dst.chan = 2; 6243 alu.dst.write = 1; 6244 6245 alu.src[0].sel = tmp0; 6246 alu.src[0].chan = 0; 6247 6248 if (signed_op) { 6249 alu.src[1].sel = tmp2; 6250 alu.src[1].chan = 0; 6251 } else { 6252 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 6253 } 6254 6255 if ((r = emit_mul_int_op(ctx->bc, &alu))) 6256 return r; 6257 6258 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */ 6259 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6260 alu.op = ALU_OP2_MULLO_UINT; 6261 6262 alu.dst.sel = tmp0; 6263 alu.dst.chan = 1; 6264 alu.dst.write = 1; 6265 6266 if (signed_op) { 6267 alu.src[0].sel = tmp2; 6268 alu.src[0].chan = 1; 6269 } else { 6270 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 6271 } 6272 6273 alu.src[1].sel = tmp0; 6274 alu.src[1].chan = 2; 6275 6276 if ((r = emit_mul_int_op(ctx->bc, &alu))) 6277 return r; 6278 6279 /* 12. 
tmp0.w = src1 - tmp0.y = r */ 6280 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6281 alu.op = ALU_OP2_SUB_INT; 6282 6283 alu.dst.sel = tmp0; 6284 alu.dst.chan = 3; 6285 alu.dst.write = 1; 6286 6287 if (signed_op) { 6288 alu.src[0].sel = tmp2; 6289 alu.src[0].chan = 0; 6290 } else { 6291 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6292 } 6293 6294 alu.src[1].sel = tmp0; 6295 alu.src[1].chan = 1; 6296 6297 alu.last = 1; 6298 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6299 return r; 6300 6301 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */ 6302 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6303 alu.op = ALU_OP2_SETGE_UINT; 6304 6305 alu.dst.sel = tmp1; 6306 alu.dst.chan = 0; 6307 alu.dst.write = 1; 6308 6309 alu.src[0].sel = tmp0; 6310 alu.src[0].chan = 3; 6311 if (signed_op) { 6312 alu.src[1].sel = tmp2; 6313 alu.src[1].chan = 1; 6314 } else { 6315 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 6316 } 6317 6318 alu.last = 1; 6319 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6320 return r; 6321 6322 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */ 6323 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6324 alu.op = ALU_OP2_SETGE_UINT; 6325 6326 alu.dst.sel = tmp1; 6327 alu.dst.chan = 1; 6328 alu.dst.write = 1; 6329 6330 if (signed_op) { 6331 alu.src[0].sel = tmp2; 6332 alu.src[0].chan = 0; 6333 } else { 6334 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6335 } 6336 6337 alu.src[1].sel = tmp0; 6338 alu.src[1].chan = 1; 6339 6340 alu.last = 1; 6341 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6342 return r; 6343 6344 if (mod) { /* UMOD */ 6345 6346 /* 15. 
tmp1.z = tmp0.w - src2 = r - src2 */ 6347 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6348 alu.op = ALU_OP2_SUB_INT; 6349 6350 alu.dst.sel = tmp1; 6351 alu.dst.chan = 2; 6352 alu.dst.write = 1; 6353 6354 alu.src[0].sel = tmp0; 6355 alu.src[0].chan = 3; 6356 6357 if (signed_op) { 6358 alu.src[1].sel = tmp2; 6359 alu.src[1].chan = 1; 6360 } else { 6361 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 6362 } 6363 6364 alu.last = 1; 6365 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6366 return r; 6367 6368 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */ 6369 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6370 alu.op = ALU_OP2_ADD_INT; 6371 6372 alu.dst.sel = tmp1; 6373 alu.dst.chan = 3; 6374 alu.dst.write = 1; 6375 6376 alu.src[0].sel = tmp0; 6377 alu.src[0].chan = 3; 6378 if (signed_op) { 6379 alu.src[1].sel = tmp2; 6380 alu.src[1].chan = 1; 6381 } else { 6382 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 6383 } 6384 6385 alu.last = 1; 6386 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6387 return r; 6388 6389 } else { /* UDIV */ 6390 6391 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */ 6392 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6393 alu.op = ALU_OP2_ADD_INT; 6394 6395 alu.dst.sel = tmp1; 6396 alu.dst.chan = 2; 6397 alu.dst.write = 1; 6398 6399 alu.src[0].sel = tmp0; 6400 alu.src[0].chan = 2; 6401 alu.src[1].sel = V_SQ_ALU_SRC_1_INT; 6402 6403 alu.last = 1; 6404 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6405 return r; 6406 6407 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */ 6408 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6409 alu.op = ALU_OP2_ADD_INT; 6410 6411 alu.dst.sel = tmp1; 6412 alu.dst.chan = 3; 6413 alu.dst.write = 1; 6414 6415 alu.src[0].sel = tmp0; 6416 alu.src[0].chan = 2; 6417 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT; 6418 6419 alu.last = 1; 6420 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6421 return r; 6422 6423 } 6424 6425 /* 17. 
tmp1.x = tmp1.x & tmp1.y */ 6426 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6427 alu.op = ALU_OP2_AND_INT; 6428 6429 alu.dst.sel = tmp1; 6430 alu.dst.chan = 0; 6431 alu.dst.write = 1; 6432 6433 alu.src[0].sel = tmp1; 6434 alu.src[0].chan = 0; 6435 alu.src[1].sel = tmp1; 6436 alu.src[1].chan = 1; 6437 6438 alu.last = 1; 6439 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6440 return r; 6441 6442 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */ 6443 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */ 6444 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6445 alu.op = ALU_OP3_CNDE_INT; 6446 alu.is_op3 = 1; 6447 6448 alu.dst.sel = tmp0; 6449 alu.dst.chan = 2; 6450 alu.dst.write = 1; 6451 6452 alu.src[0].sel = tmp1; 6453 alu.src[0].chan = 0; 6454 alu.src[1].sel = tmp0; 6455 alu.src[1].chan = mod ? 3 : 2; 6456 alu.src[2].sel = tmp1; 6457 alu.src[2].chan = 2; 6458 6459 alu.last = 1; 6460 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6461 return r; 6462 6463 /* 19. tmp0.z = tmp1.y==0 ? 
tmp1.w : tmp0.z */ 6464 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6465 alu.op = ALU_OP3_CNDE_INT; 6466 alu.is_op3 = 1; 6467 6468 if (signed_op) { 6469 alu.dst.sel = tmp0; 6470 alu.dst.chan = 2; 6471 alu.dst.write = 1; 6472 } else { 6473 if (tmp4 > 0) { 6474 alu.dst.sel = tmp4; 6475 alu.dst.chan = i; 6476 alu.dst.write = 1; 6477 } else { 6478 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6479 } 6480 } 6481 6482 alu.src[0].sel = tmp1; 6483 alu.src[0].chan = 1; 6484 alu.src[1].sel = tmp1; 6485 alu.src[1].chan = 3; 6486 alu.src[2].sel = tmp0; 6487 alu.src[2].chan = 2; 6488 6489 alu.last = 1; 6490 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6491 return r; 6492 6493 if (signed_op) { 6494 6495 /* fix the sign of the result */ 6496 6497 if (mod) { 6498 6499 /* tmp0.x = -tmp0.z */ 6500 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6501 alu.op = ALU_OP2_SUB_INT; 6502 6503 alu.dst.sel = tmp0; 6504 alu.dst.chan = 0; 6505 alu.dst.write = 1; 6506 6507 alu.src[0].sel = V_SQ_ALU_SRC_0; 6508 alu.src[1].sel = tmp0; 6509 alu.src[1].chan = 2; 6510 6511 alu.last = 1; 6512 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6513 return r; 6514 6515 /* sign of the remainder is the same as the sign of src0 */ 6516 /* tmp0.x = src0>=0 ? 
tmp0.z : tmp0.x */ 6517 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6518 alu.op = ALU_OP3_CNDGE_INT; 6519 alu.is_op3 = 1; 6520 6521 if (tmp4 > 0) { 6522 alu.dst.sel = tmp4; 6523 alu.dst.chan = i; 6524 alu.dst.write = 1; 6525 } else { 6526 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6527 } 6528 6529 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6530 alu.src[1].sel = tmp0; 6531 alu.src[1].chan = 2; 6532 alu.src[2].sel = tmp0; 6533 alu.src[2].chan = 0; 6534 6535 alu.last = 1; 6536 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6537 return r; 6538 6539 } else { 6540 6541 /* tmp0.x = -tmp0.z */ 6542 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6543 alu.op = ALU_OP2_SUB_INT; 6544 6545 alu.dst.sel = tmp0; 6546 alu.dst.chan = 0; 6547 alu.dst.write = 1; 6548 6549 alu.src[0].sel = V_SQ_ALU_SRC_0; 6550 alu.src[1].sel = tmp0; 6551 alu.src[1].chan = 2; 6552 6553 alu.last = 1; 6554 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6555 return r; 6556 6557 /* fix the quotient sign (same as the sign of src0*src1) */ 6558 /* tmp0.x = tmp2.z>=0 ? 
/*
 * Thin entry points for the four integer divide/modulo opcodes.  Each one
 * forwards to tgsi_divmod(); judging by the wrapper names, the second
 * argument selects remainder (1) versus quotient (0) and the third selects
 * signed (1) versus unsigned (0) arithmetic.
 */

/* Unsigned quotient. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}

/* Unsigned remainder. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}

/* Signed quotient. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}

/* Signed remainder. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
alu.dst.chan = i; 6644 alu.dst.write = 1; 6645 6646 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6647 if (i == last_inst) 6648 alu.last = 1; 6649 r = r600_bytecode_add_alu(ctx->bc, &alu); 6650 if (r) 6651 return r; 6652 } 6653 6654 for (i = 0; i < 4; i++) { 6655 if (!(write_mask & (1<<i))) 6656 continue; 6657 6658 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6659 alu.op = ctx->inst_info->op; 6660 6661 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6662 6663 alu.src[0].sel = ctx->temp_reg; 6664 alu.src[0].chan = i; 6665 6666 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT) 6667 alu.last = 1; 6668 r = r600_bytecode_add_alu(ctx->bc, &alu); 6669 if (r) 6670 return r; 6671 } 6672 6673 return 0; 6674} 6675 6676static int tgsi_iabs(struct r600_shader_ctx *ctx) 6677{ 6678 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 6679 struct r600_bytecode_alu alu; 6680 int i, r; 6681 unsigned write_mask = inst->Dst[0].Register.WriteMask; 6682 int last_inst = tgsi_last_instruction(write_mask); 6683 6684 /* tmp = -src */ 6685 for (i = 0; i < 4; i++) { 6686 if (!(write_mask & (1<<i))) 6687 continue; 6688 6689 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6690 alu.op = ALU_OP2_SUB_INT; 6691 6692 alu.dst.sel = ctx->temp_reg; 6693 alu.dst.chan = i; 6694 alu.dst.write = 1; 6695 6696 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 6697 alu.src[0].sel = V_SQ_ALU_SRC_0; 6698 6699 if (i == last_inst) 6700 alu.last = 1; 6701 r = r600_bytecode_add_alu(ctx->bc, &alu); 6702 if (r) 6703 return r; 6704 } 6705 6706 /* dst = (src >= 0 ? 
src : tmp) */ 6707 for (i = 0; i < 4; i++) { 6708 if (!(write_mask & (1<<i))) 6709 continue; 6710 6711 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6712 alu.op = ALU_OP3_CNDGE_INT; 6713 alu.is_op3 = 1; 6714 alu.dst.write = 1; 6715 6716 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6717 6718 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6719 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 6720 alu.src[2].sel = ctx->temp_reg; 6721 alu.src[2].chan = i; 6722 6723 if (i == last_inst) 6724 alu.last = 1; 6725 r = r600_bytecode_add_alu(ctx->bc, &alu); 6726 if (r) 6727 return r; 6728 } 6729 return 0; 6730} 6731 6732static int tgsi_issg(struct r600_shader_ctx *ctx) 6733{ 6734 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 6735 struct r600_bytecode_alu alu; 6736 int i, r; 6737 unsigned write_mask = inst->Dst[0].Register.WriteMask; 6738 int last_inst = tgsi_last_instruction(write_mask); 6739 6740 /* tmp = (src >= 0 ? src : -1) */ 6741 for (i = 0; i < 4; i++) { 6742 if (!(write_mask & (1<<i))) 6743 continue; 6744 6745 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6746 alu.op = ALU_OP3_CNDGE_INT; 6747 alu.is_op3 = 1; 6748 6749 alu.dst.sel = ctx->temp_reg; 6750 alu.dst.chan = i; 6751 alu.dst.write = 1; 6752 6753 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6754 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 6755 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT; 6756 6757 if (i == last_inst) 6758 alu.last = 1; 6759 r = r600_bytecode_add_alu(ctx->bc, &alu); 6760 if (r) 6761 return r; 6762 } 6763 6764 /* dst = (tmp > 0 ? 
1 : tmp) */ 6765 for (i = 0; i < 4; i++) { 6766 if (!(write_mask & (1<<i))) 6767 continue; 6768 6769 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6770 alu.op = ALU_OP3_CNDGT_INT; 6771 alu.is_op3 = 1; 6772 alu.dst.write = 1; 6773 6774 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6775 6776 alu.src[0].sel = ctx->temp_reg; 6777 alu.src[0].chan = i; 6778 6779 alu.src[1].sel = V_SQ_ALU_SRC_1_INT; 6780 6781 alu.src[2].sel = ctx->temp_reg; 6782 alu.src[2].chan = i; 6783 6784 if (i == last_inst) 6785 alu.last = 1; 6786 r = r600_bytecode_add_alu(ctx->bc, &alu); 6787 if (r) 6788 return r; 6789 } 6790 return 0; 6791} 6792 6793 6794 6795static int tgsi_ssg(struct r600_shader_ctx *ctx) 6796{ 6797 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 6798 unsigned write_mask = inst->Dst[0].Register.WriteMask; 6799 int last_inst = tgsi_last_instruction(write_mask); 6800 struct r600_bytecode_alu alu; 6801 int i, r; 6802 6803 /* tmp = (src > 0 ? 1 : src) */ 6804 for (i = 0; i <= last_inst; i++) { 6805 if (!(write_mask & (1 << i))) 6806 continue; 6807 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6808 alu.op = ALU_OP3_CNDGT; 6809 alu.is_op3 = 1; 6810 6811 alu.dst.sel = ctx->temp_reg; 6812 alu.dst.chan = i; 6813 6814 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6815 alu.src[1].sel = V_SQ_ALU_SRC_1; 6816 r600_bytecode_src(&alu.src[2], &ctx->src[0], i); 6817 6818 if (i == last_inst) 6819 alu.last = 1; 6820 r = r600_bytecode_add_alu(ctx->bc, &alu); 6821 if (r) 6822 return r; 6823 } 6824 6825 /* dst = (-tmp > 0 ? 
-1 : tmp) */ 6826 for (i = 0; i <= last_inst; i++) { 6827 if (!(write_mask & (1 << i))) 6828 continue; 6829 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6830 alu.op = ALU_OP3_CNDGT; 6831 alu.is_op3 = 1; 6832 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6833 6834 alu.src[0].sel = ctx->temp_reg; 6835 alu.src[0].chan = i; 6836 alu.src[0].neg = 1; 6837 6838 alu.src[1].sel = V_SQ_ALU_SRC_1; 6839 alu.src[1].neg = 1; 6840 6841 alu.src[2].sel = ctx->temp_reg; 6842 alu.src[2].chan = i; 6843 6844 if (i == last_inst) 6845 alu.last = 1; 6846 r = r600_bytecode_add_alu(ctx->bc, &alu); 6847 if (r) 6848 return r; 6849 } 6850 return 0; 6851} 6852 6853static int tgsi_bfi(struct r600_shader_ctx *ctx) 6854{ 6855 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 6856 struct r600_bytecode_alu alu; 6857 int i, r, t1, t2; 6858 6859 unsigned write_mask = inst->Dst[0].Register.WriteMask; 6860 int last_inst = tgsi_last_instruction(write_mask); 6861 6862 t1 = r600_get_temp(ctx); 6863 6864 for (i = 0; i < 4; i++) { 6865 if (!(write_mask & (1<<i))) 6866 continue; 6867 6868 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6869 alu.op = ALU_OP2_SETGE_INT; 6870 r600_bytecode_src(&alu.src[0], &ctx->src[3], i); 6871 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 6872 alu.src[1].value = 32; 6873 alu.dst.sel = ctx->temp_reg; 6874 alu.dst.chan = i; 6875 alu.dst.write = 1; 6876 alu.last = i == last_inst; 6877 r = r600_bytecode_add_alu(ctx->bc, &alu); 6878 if (r) 6879 return r; 6880 } 6881 6882 for (i = 0; i < 4; i++) { 6883 if (!(write_mask & (1<<i))) 6884 continue; 6885 6886 /* create mask tmp */ 6887 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6888 alu.op = ALU_OP2_BFM_INT; 6889 alu.dst.sel = t1; 6890 alu.dst.chan = i; 6891 alu.dst.write = 1; 6892 alu.last = i == last_inst; 6893 6894 r600_bytecode_src(&alu.src[0], &ctx->src[3], i); 6895 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 6896 6897 r = r600_bytecode_add_alu(ctx->bc, &alu); 6898 if (r) 6899 return r; 6900 } 
6901 6902 t2 = r600_get_temp(ctx); 6903 6904 for (i = 0; i < 4; i++) { 6905 if (!(write_mask & (1<<i))) 6906 continue; 6907 6908 /* shift insert left */ 6909 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6910 alu.op = ALU_OP2_LSHL_INT; 6911 alu.dst.sel = t2; 6912 alu.dst.chan = i; 6913 alu.dst.write = 1; 6914 alu.last = i == last_inst; 6915 6916 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 6917 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 6918 6919 r = r600_bytecode_add_alu(ctx->bc, &alu); 6920 if (r) 6921 return r; 6922 } 6923 6924 for (i = 0; i < 4; i++) { 6925 if (!(write_mask & (1<<i))) 6926 continue; 6927 6928 /* actual bitfield insert */ 6929 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6930 alu.op = ALU_OP3_BFI_INT; 6931 alu.is_op3 = 1; 6932 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6933 alu.dst.chan = i; 6934 alu.dst.write = 1; 6935 alu.last = i == last_inst; 6936 6937 alu.src[0].sel = t1; 6938 alu.src[0].chan = i; 6939 alu.src[1].sel = t2; 6940 alu.src[1].chan = i; 6941 r600_bytecode_src(&alu.src[2], &ctx->src[0], i); 6942 6943 r = r600_bytecode_add_alu(ctx->bc, &alu); 6944 if (r) 6945 return r; 6946 } 6947 6948 for (i = 0; i < 4; i++) { 6949 if (!(write_mask & (1<<i))) 6950 continue; 6951 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6952 alu.op = ALU_OP3_CNDE_INT; 6953 alu.is_op3 = 1; 6954 alu.src[0].sel = ctx->temp_reg; 6955 alu.src[0].chan = i; 6956 r600_bytecode_src(&alu.src[2], &ctx->src[1], i); 6957 6958 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6959 6960 alu.src[1].sel = alu.dst.sel; 6961 alu.src[1].chan = i; 6962 6963 alu.last = i == last_inst; 6964 r = r600_bytecode_add_alu(ctx->bc, &alu); 6965 if (r) 6966 return r; 6967 } 6968 return 0; 6969} 6970 6971static int tgsi_msb(struct r600_shader_ctx *ctx) 6972{ 6973 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 6974 struct r600_bytecode_alu alu; 6975 int i, r, t1, t2; 6976 6977 unsigned write_mask = inst->Dst[0].Register.WriteMask; 6978 int 
last_inst = tgsi_last_instruction(write_mask); 6979 6980 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT || 6981 ctx->inst_info->op == ALU_OP1_FFBH_UINT); 6982 6983 t1 = ctx->temp_reg; 6984 6985 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */ 6986 for (i = 0; i < 4; i++) { 6987 if (!(write_mask & (1<<i))) 6988 continue; 6989 6990 /* t1 = FFBH_INT / FFBH_UINT */ 6991 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6992 alu.op = ctx->inst_info->op; 6993 alu.dst.sel = t1; 6994 alu.dst.chan = i; 6995 alu.dst.write = 1; 6996 alu.last = i == last_inst; 6997 6998 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6999 7000 r = r600_bytecode_add_alu(ctx->bc, &alu); 7001 if (r) 7002 return r; 7003 } 7004 7005 t2 = r600_get_temp(ctx); 7006 7007 for (i = 0; i < 4; i++) { 7008 if (!(write_mask & (1<<i))) 7009 continue; 7010 7011 /* t2 = 31 - t1 */ 7012 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7013 alu.op = ALU_OP2_SUB_INT; 7014 alu.dst.sel = t2; 7015 alu.dst.chan = i; 7016 alu.dst.write = 1; 7017 alu.last = i == last_inst; 7018 7019 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 7020 alu.src[0].value = 31; 7021 alu.src[1].sel = t1; 7022 alu.src[1].chan = i; 7023 7024 r = r600_bytecode_add_alu(ctx->bc, &alu); 7025 if (r) 7026 return r; 7027 } 7028 7029 for (i = 0; i < 4; i++) { 7030 if (!(write_mask & (1<<i))) 7031 continue; 7032 7033 /* result = t1 >= 0 ? 
t2 : t1 */ 7034 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7035 alu.op = ALU_OP3_CNDGE_INT; 7036 alu.is_op3 = 1; 7037 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 7038 alu.dst.chan = i; 7039 alu.dst.write = 1; 7040 alu.last = i == last_inst; 7041 7042 alu.src[0].sel = t1; 7043 alu.src[0].chan = i; 7044 alu.src[1].sel = t2; 7045 alu.src[1].chan = i; 7046 alu.src[2].sel = t1; 7047 alu.src[2].chan = i; 7048 7049 r = r600_bytecode_add_alu(ctx->bc, &alu); 7050 if (r) 7051 return r; 7052 } 7053 7054 return 0; 7055} 7056 7057static int tgsi_interp_egcm(struct r600_shader_ctx *ctx) 7058{ 7059 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7060 struct r600_bytecode_alu alu; 7061 int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti; 7062 unsigned location; 7063 const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs; 7064 7065 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT); 7066 7067 /* Interpolators have been marked for use already by allocate_system_value_inputs */ 7068 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 7069 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 7070 location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */ 7071 } 7072 else { 7073 location = TGSI_INTERPOLATE_LOC_CENTROID; 7074 ctx->shader->input[input].uses_interpolate_at_centroid = 1; 7075 } 7076 7077 k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location); 7078 if (k < 0) 7079 k = 0; 7080 interp_gpr = ctx->eg_interpolators[k].ij_index / 2; 7081 interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2); 7082 7083 /* NOTE: currently offset is not perspective correct */ 7084 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 7085 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 7086 int sample_gpr = -1; 7087 int gradientsH, gradientsV; 7088 struct r600_bytecode_tex tex; 7089 7090 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 7091 
sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]); 7092 } 7093 7094 gradientsH = r600_get_temp(ctx); 7095 gradientsV = r600_get_temp(ctx); 7096 for (i = 0; i < 2; i++) { 7097 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 7098 tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V; 7099 tex.src_gpr = interp_gpr; 7100 tex.src_sel_x = interp_base_chan + 0; 7101 tex.src_sel_y = interp_base_chan + 1; 7102 tex.src_sel_z = 0; 7103 tex.src_sel_w = 0; 7104 tex.dst_gpr = i == 0 ? gradientsH : gradientsV; 7105 tex.dst_sel_x = 0; 7106 tex.dst_sel_y = 1; 7107 tex.dst_sel_z = 7; 7108 tex.dst_sel_w = 7; 7109 tex.inst_mod = 1; // Use per pixel gradient calculation 7110 tex.sampler_id = 0; 7111 tex.resource_id = tex.sampler_id; 7112 r = r600_bytecode_add_tex(ctx->bc, &tex); 7113 if (r) 7114 return r; 7115 } 7116 7117 for (i = 0; i < 2; i++) { 7118 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7119 alu.op = ALU_OP3_MULADD; 7120 alu.is_op3 = 1; 7121 alu.src[0].sel = gradientsH; 7122 alu.src[0].chan = i; 7123 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 7124 alu.src[1].sel = sample_gpr; 7125 alu.src[1].chan = 2; 7126 } 7127 else { 7128 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); 7129 } 7130 alu.src[2].sel = interp_gpr; 7131 alu.src[2].chan = interp_base_chan + i; 7132 alu.dst.sel = ctx->temp_reg; 7133 alu.dst.chan = i; 7134 alu.last = i == 1; 7135 7136 r = r600_bytecode_add_alu(ctx->bc, &alu); 7137 if (r) 7138 return r; 7139 } 7140 7141 for (i = 0; i < 2; i++) { 7142 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7143 alu.op = ALU_OP3_MULADD; 7144 alu.is_op3 = 1; 7145 alu.src[0].sel = gradientsV; 7146 alu.src[0].chan = i; 7147 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 7148 alu.src[1].sel = sample_gpr; 7149 alu.src[1].chan = 3; 7150 } 7151 else { 7152 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1); 7153 } 7154 alu.src[2].sel = ctx->temp_reg; 7155 alu.src[2].chan = i; 7156 alu.dst.sel = 
ctx->temp_reg; 7157 alu.dst.chan = i; 7158 alu.last = i == 1; 7159 7160 r = r600_bytecode_add_alu(ctx->bc, &alu); 7161 if (r) 7162 return r; 7163 } 7164 } 7165 7166 tmp = r600_get_temp(ctx); 7167 for (i = 0; i < 8; i++) { 7168 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7169 alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY; 7170 7171 alu.dst.sel = tmp; 7172 if ((i > 1 && i < 6)) { 7173 alu.dst.write = 1; 7174 } 7175 else { 7176 alu.dst.write = 0; 7177 } 7178 alu.dst.chan = i % 4; 7179 7180 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 7181 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 7182 alu.src[0].sel = ctx->temp_reg; 7183 alu.src[0].chan = 1 - (i % 2); 7184 } else { 7185 alu.src[0].sel = interp_gpr; 7186 alu.src[0].chan = interp_base_chan + 1 - (i % 2); 7187 } 7188 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos; 7189 alu.src[1].chan = 0; 7190 7191 alu.last = i % 4 == 3; 7192 alu.bank_swizzle_force = SQ_ALU_VEC_210; 7193 7194 r = r600_bytecode_add_alu(ctx->bc, &alu); 7195 if (r) 7196 return r; 7197 } 7198 7199 // INTERP can't swizzle dst 7200 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 7201 for (i = 0; i <= lasti; i++) { 7202 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 7203 continue; 7204 7205 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7206 alu.op = ALU_OP1_MOV; 7207 alu.src[0].sel = tmp; 7208 alu.src[0].chan = ctx->src[0].swizzle[i]; 7209 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 7210 alu.dst.write = 1; 7211 alu.last = i == lasti; 7212 r = r600_bytecode_add_alu(ctx->bc, &alu); 7213 if (r) 7214 return r; 7215 } 7216 7217 return 0; 7218} 7219 7220 7221static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst) 7222{ 7223 struct r600_bytecode_alu alu; 7224 int i, r; 7225 7226 for (i = 0; i < 4; i++) { 7227 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7228 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) { 7229 alu.op = 
ALU_OP0_NOP; 7230 alu.dst.chan = i; 7231 } else { 7232 alu.op = ALU_OP1_MOV; 7233 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 7234 alu.src[0].sel = ctx->temp_reg; 7235 alu.src[0].chan = i; 7236 } 7237 if (i == 3) { 7238 alu.last = 1; 7239 } 7240 r = r600_bytecode_add_alu(ctx->bc, &alu); 7241 if (r) 7242 return r; 7243 } 7244 return 0; 7245} 7246 7247static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx, 7248 unsigned writemask, 7249 struct r600_bytecode_alu_src *bc_src, 7250 const struct r600_shader_src *shader_src) 7251{ 7252 struct r600_bytecode_alu alu; 7253 int i, r; 7254 int lasti = tgsi_last_instruction(writemask); 7255 int temp_reg = 0; 7256 7257 r600_bytecode_src(&bc_src[0], shader_src, 0); 7258 r600_bytecode_src(&bc_src[1], shader_src, 1); 7259 r600_bytecode_src(&bc_src[2], shader_src, 2); 7260 r600_bytecode_src(&bc_src[3], shader_src, 3); 7261 7262 if (bc_src->abs) { 7263 temp_reg = r600_get_temp(ctx); 7264 7265 for (i = 0; i < lasti + 1; i++) { 7266 if (!(writemask & (1 << i))) 7267 continue; 7268 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7269 alu.op = ALU_OP1_MOV; 7270 alu.dst.sel = temp_reg; 7271 alu.dst.chan = i; 7272 alu.dst.write = 1; 7273 alu.src[0] = bc_src[i]; 7274 if (i == lasti) { 7275 alu.last = 1; 7276 } 7277 r = r600_bytecode_add_alu(ctx->bc, &alu); 7278 if (r) 7279 return r; 7280 memset(&bc_src[i], 0, sizeof(*bc_src)); 7281 bc_src[i].sel = temp_reg; 7282 bc_src[i].chan = i; 7283 } 7284 } 7285 return 0; 7286} 7287 7288static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst) 7289{ 7290 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7291 struct r600_bytecode_alu alu; 7292 struct r600_bytecode_alu_src srcs[4][4]; 7293 int i, j, r; 7294 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 7295 unsigned op = ctx->inst_info->op; 7296 7297 if (op == ALU_OP3_MULADD_IEEE && 7298 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS]) 7299 op = ALU_OP3_MULADD; 7300 7301 for (j = 0; j < 
inst->Instruction.NumSrcRegs; j++) { 7302 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask, 7303 srcs[j], &ctx->src[j]); 7304 if (r) 7305 return r; 7306 } 7307 7308 for (i = 0; i < lasti + 1; i++) { 7309 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 7310 continue; 7311 7312 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7313 alu.op = op; 7314 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 7315 alu.src[j] = srcs[j][i]; 7316 } 7317 7318 if (dst == -1) { 7319 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 7320 } else { 7321 alu.dst.sel = dst; 7322 } 7323 alu.dst.chan = i; 7324 alu.dst.write = 1; 7325 alu.is_op3 = 1; 7326 if (i == lasti) { 7327 alu.last = 1; 7328 } 7329 r = r600_bytecode_add_alu(ctx->bc, &alu); 7330 if (r) 7331 return r; 7332 } 7333 return 0; 7334} 7335 7336static int tgsi_op3(struct r600_shader_ctx *ctx) 7337{ 7338 return tgsi_op3_dst(ctx, -1); 7339} 7340 7341static int tgsi_dp(struct r600_shader_ctx *ctx) 7342{ 7343 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7344 struct r600_bytecode_alu alu; 7345 int i, j, r; 7346 unsigned op = ctx->inst_info->op; 7347 if (op == ALU_OP2_DOT4_IEEE && 7348 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS]) 7349 op = ALU_OP2_DOT4; 7350 7351 for (i = 0; i < 4; i++) { 7352 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7353 alu.op = op; 7354 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 7355 r600_bytecode_src(&alu.src[j], &ctx->src[j], i); 7356 } 7357 7358 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 7359 alu.dst.chan = i; 7360 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; 7361 /* handle some special cases */ 7362 switch (inst->Instruction.Opcode) { 7363 case TGSI_OPCODE_DP2: 7364 if (i > 1) { 7365 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0; 7366 alu.src[0].chan = alu.src[1].chan = 0; 7367 } 7368 break; 7369 case TGSI_OPCODE_DP3: 7370 if (i > 2) { 7371 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0; 7372 alu.src[0].chan = 
alu.src[1].chan = 0; 7373 } 7374 break; 7375 default: 7376 break; 7377 } 7378 if (i == 3) { 7379 alu.last = 1; 7380 } 7381 r = r600_bytecode_add_alu(ctx->bc, &alu); 7382 if (r) 7383 return r; 7384 } 7385 return 0; 7386} 7387 7388static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx, 7389 unsigned index) 7390{ 7391 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7392 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY && 7393 inst->Src[index].Register.File != TGSI_FILE_INPUT && 7394 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) || 7395 ctx->src[index].neg || ctx->src[index].abs || 7396 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY); 7397} 7398 7399static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx, 7400 unsigned index) 7401{ 7402 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7403 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index; 7404} 7405 7406static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading) 7407{ 7408 struct r600_bytecode_vtx vtx; 7409 struct r600_bytecode_alu alu; 7410 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7411 int src_gpr, r, i; 7412 int id = tgsi_tex_get_src_gpr(ctx, 1); 7413 int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 7414 7415 src_gpr = tgsi_tex_get_src_gpr(ctx, 0); 7416 if (src_requires_loading) { 7417 for (i = 0; i < 4; i++) { 7418 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7419 alu.op = ALU_OP1_MOV; 7420 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 7421 alu.dst.sel = ctx->temp_reg; 7422 alu.dst.chan = i; 7423 if (i == 3) 7424 alu.last = 1; 7425 alu.dst.write = 1; 7426 r = r600_bytecode_add_alu(ctx->bc, &alu); 7427 if (r) 7428 return r; 7429 } 7430 src_gpr = ctx->temp_reg; 7431 } 7432 7433 memset(&vtx, 0, sizeof(vtx)); 7434 vtx.op = FETCH_OP_VFETCH; 7435 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS; 7436 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 7437 vtx.src_gpr = src_gpr; 7438 vtx.mega_fetch_count = 16; 7439 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 7440 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */ 7441 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */ 7442 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */ 7443 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
3 : 7; /* SEL_W */ 7444 vtx.use_const_fields = 1; 7445 vtx.buffer_index_mode = sampler_index_mode; 7446 7447 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx))) 7448 return r; 7449 7450 if (ctx->bc->chip_class >= EVERGREEN) 7451 return 0; 7452 7453 for (i = 0; i < 4; i++) { 7454 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 7455 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 7456 continue; 7457 7458 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7459 alu.op = ALU_OP2_AND_INT; 7460 7461 alu.dst.chan = i; 7462 alu.dst.sel = vtx.dst_gpr; 7463 alu.dst.write = 1; 7464 7465 alu.src[0].sel = vtx.dst_gpr; 7466 alu.src[0].chan = i; 7467 7468 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL; 7469 alu.src[1].sel += (id * 2); 7470 alu.src[1].chan = i % 4; 7471 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 7472 7473 if (i == lasti) 7474 alu.last = 1; 7475 r = r600_bytecode_add_alu(ctx->bc, &alu); 7476 if (r) 7477 return r; 7478 } 7479 7480 if (inst->Dst[0].Register.WriteMask & 3) { 7481 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7482 alu.op = ALU_OP2_OR_INT; 7483 7484 alu.dst.chan = 3; 7485 alu.dst.sel = vtx.dst_gpr; 7486 alu.dst.write = 1; 7487 7488 alu.src[0].sel = vtx.dst_gpr; 7489 alu.src[0].chan = 3; 7490 7491 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1; 7492 alu.src[1].chan = 0; 7493 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 7494 7495 alu.last = 1; 7496 r = r600_bytecode_add_alu(ctx->bc, &alu); 7497 if (r) 7498 return r; 7499 } 7500 return 0; 7501} 7502 7503static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base) 7504{ 7505 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7506 int r; 7507 int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset; 7508 int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 7509 7510 if (ctx->bc->chip_class < EVERGREEN) { 7511 struct r600_bytecode_alu alu; 7512 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7513 alu.op = ALU_OP1_MOV; 7514 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; 7515 /* r600 we have them at channel 2 of the second dword */ 7516 alu.src[0].sel += (id * 2) + 1; 7517 alu.src[0].chan = 1; 7518 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 7519 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 7520 alu.last = 1; 7521 r = r600_bytecode_add_alu(ctx->bc, &alu); 7522 if (r) 7523 return r; 7524 return 0; 7525 } else { 7526 struct r600_bytecode_vtx vtx; 7527 memset(&vtx, 0, sizeof(vtx)); 7528 vtx.op = FETCH_OP_GET_BUFFER_RESINFO; 7529 vtx.buffer_id = id + eg_buffer_base; 7530 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 7531 vtx.src_gpr = 0; 7532 vtx.mega_fetch_count = 16; /* no idea here really... */ 7533 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 7534 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */ 7535 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7; /* SEL_Y */ 7536 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7; /* SEL_Z */ 7537 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
4 : 7; /* SEL_W */ 7538 vtx.data_format = FMT_32_32_32_32; 7539 vtx.buffer_index_mode = sampler_index_mode; 7540 7541 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx))) 7542 return r; 7543 return 0; 7544 } 7545} 7546 7547 7548static int tgsi_tex(struct r600_shader_ctx *ctx) 7549{ 7550 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7551 struct r600_bytecode_tex tex; 7552 struct r600_bytecode_tex grad_offs[3]; 7553 struct r600_bytecode_alu alu; 7554 unsigned src_gpr; 7555 int r, i, j, n_grad_offs = 0; 7556 int opcode; 7557 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing && 7558 inst->Instruction.Opcode == TGSI_OPCODE_TXF && 7559 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA || 7560 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA); 7561 7562 bool txf_add_offsets = inst->Texture.NumOffsets && 7563 inst->Instruction.Opcode == TGSI_OPCODE_TXF && 7564 inst->Texture.Texture != TGSI_TEXTURE_BUFFER; 7565 7566 /* Texture fetch instructions can only use gprs as source. 
7567 * Also they cannot negate the source or take the absolute value */ 7568 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS && 7569 tgsi_tex_src_requires_loading(ctx, 0)) || 7570 read_compressed_msaa || txf_add_offsets; 7571 7572 boolean src_loaded = FALSE; 7573 unsigned sampler_src_reg = 1; 7574 int8_t offset_x = 0, offset_y = 0, offset_z = 0; 7575 boolean has_txq_cube_array_z = false; 7576 unsigned sampler_index_mode; 7577 int array_index_offset_channel = -1; 7578 7579 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ && 7580 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 7581 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY))) 7582 if (inst->Dst[0].Register.WriteMask & 4) { 7583 ctx->shader->has_txq_cube_array_z_comp = true; 7584 has_txq_cube_array_z = true; 7585 } 7586 7587 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 || 7588 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7589 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 || 7590 inst->Instruction.Opcode == TGSI_OPCODE_TG4) 7591 sampler_src_reg = 2; 7592 7593 /* TGSI moves the sampler to src reg 3 for TXD */ 7594 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) 7595 sampler_src_reg = 3; 7596 7597 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 7598 7599 src_gpr = tgsi_tex_get_src_gpr(ctx, 0); 7600 7601 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) { 7602 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) { 7603 if (ctx->bc->chip_class < EVERGREEN) 7604 ctx->shader->uses_tex_buffers = true; 7605 return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS); 7606 } 7607 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) { 7608 if (ctx->bc->chip_class < EVERGREEN) 7609 ctx->shader->uses_tex_buffers = true; 7610 return do_vtx_fetch_inst(ctx, src_requires_loading); 7611 } 7612 } 7613 7614 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) { 7615 int out_chan; 7616 /* Add perspective divide */ 7617 if (ctx->bc->chip_class == CAYMAN) { 7618 out_chan = 2; 7619 for (i = 0; i < 3; i++) { 7620 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7621 alu.op = ALU_OP1_RECIP_IEEE; 7622 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7623 7624 alu.dst.sel = ctx->temp_reg; 7625 alu.dst.chan = i; 7626 if (i == 2) 7627 alu.last = 1; 7628 if (out_chan == i) 7629 alu.dst.write = 1; 7630 r = r600_bytecode_add_alu(ctx->bc, &alu); 7631 if (r) 7632 return r; 7633 } 7634 7635 } else { 7636 out_chan = 3; 7637 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7638 alu.op = ALU_OP1_RECIP_IEEE; 7639 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7640 7641 alu.dst.sel = ctx->temp_reg; 7642 alu.dst.chan = out_chan; 7643 alu.last = 1; 7644 alu.dst.write = 1; 7645 r = r600_bytecode_add_alu(ctx->bc, &alu); 7646 if (r) 7647 return r; 7648 } 7649 7650 for (i = 0; i < 3; i++) { 7651 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7652 alu.op = ALU_OP2_MUL; 7653 alu.src[0].sel = ctx->temp_reg; 7654 alu.src[0].chan = out_chan; 7655 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 7656 alu.dst.sel = ctx->temp_reg; 7657 alu.dst.chan = i; 7658 alu.dst.write = 1; 7659 r = r600_bytecode_add_alu(ctx->bc, &alu); 7660 if (r) 7661 return r; 7662 } 7663 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7664 
alu.op = ALU_OP1_MOV; 7665 alu.src[0].sel = V_SQ_ALU_SRC_1; 7666 alu.src[0].chan = 0; 7667 alu.dst.sel = ctx->temp_reg; 7668 alu.dst.chan = 3; 7669 alu.last = 1; 7670 alu.dst.write = 1; 7671 r = r600_bytecode_add_alu(ctx->bc, &alu); 7672 if (r) 7673 return r; 7674 src_loaded = TRUE; 7675 src_gpr = ctx->temp_reg; 7676 } 7677 7678 7679 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || 7680 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 7681 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 7682 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && 7683 inst->Instruction.Opcode != TGSI_OPCODE_TXQ) { 7684 7685 static const unsigned src0_swizzle[] = {2, 2, 0, 1}; 7686 static const unsigned src1_swizzle[] = {1, 0, 2, 2}; 7687 7688 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */ 7689 for (i = 0; i < 4; i++) { 7690 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7691 alu.op = ALU_OP2_CUBE; 7692 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]); 7693 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]); 7694 alu.dst.sel = ctx->temp_reg; 7695 alu.dst.chan = i; 7696 if (i == 3) 7697 alu.last = 1; 7698 alu.dst.write = 1; 7699 r = r600_bytecode_add_alu(ctx->bc, &alu); 7700 if (r) 7701 return r; 7702 } 7703 7704 /* tmp1.z = RCP_e(|tmp1.z|) */ 7705 if (ctx->bc->chip_class == CAYMAN) { 7706 for (i = 0; i < 3; i++) { 7707 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7708 alu.op = ALU_OP1_RECIP_IEEE; 7709 alu.src[0].sel = ctx->temp_reg; 7710 alu.src[0].chan = 2; 7711 alu.src[0].abs = 1; 7712 alu.dst.sel = ctx->temp_reg; 7713 alu.dst.chan = i; 7714 if (i == 2) 7715 alu.dst.write = 1; 7716 if (i == 2) 7717 alu.last = 1; 7718 r = r600_bytecode_add_alu(ctx->bc, &alu); 7719 if (r) 7720 return r; 7721 } 7722 } else { 7723 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7724 alu.op = ALU_OP1_RECIP_IEEE; 7725 alu.src[0].sel = ctx->temp_reg; 7726 alu.src[0].chan = 2; 7727 alu.src[0].abs = 1; 7728 alu.dst.sel = ctx->temp_reg; 7729 alu.dst.chan = 2; 7730 
alu.dst.write = 1; 7731 alu.last = 1; 7732 r = r600_bytecode_add_alu(ctx->bc, &alu); 7733 if (r) 7734 return r; 7735 } 7736 7737 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x 7738 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x 7739 * muladd has no writemask, have to use another temp 7740 */ 7741 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7742 alu.op = ALU_OP3_MULADD; 7743 alu.is_op3 = 1; 7744 7745 alu.src[0].sel = ctx->temp_reg; 7746 alu.src[0].chan = 0; 7747 alu.src[1].sel = ctx->temp_reg; 7748 alu.src[1].chan = 2; 7749 7750 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 7751 alu.src[2].chan = 0; 7752 alu.src[2].value = u_bitcast_f2u(1.5f); 7753 7754 alu.dst.sel = ctx->temp_reg; 7755 alu.dst.chan = 0; 7756 alu.dst.write = 1; 7757 7758 r = r600_bytecode_add_alu(ctx->bc, &alu); 7759 if (r) 7760 return r; 7761 7762 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7763 alu.op = ALU_OP3_MULADD; 7764 alu.is_op3 = 1; 7765 7766 alu.src[0].sel = ctx->temp_reg; 7767 alu.src[0].chan = 1; 7768 alu.src[1].sel = ctx->temp_reg; 7769 alu.src[1].chan = 2; 7770 7771 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 7772 alu.src[2].chan = 0; 7773 alu.src[2].value = u_bitcast_f2u(1.5f); 7774 7775 alu.dst.sel = ctx->temp_reg; 7776 alu.dst.chan = 1; 7777 alu.dst.write = 1; 7778 7779 alu.last = 1; 7780 r = r600_bytecode_add_alu(ctx->bc, &alu); 7781 if (r) 7782 return r; 7783 /* write initial compare value into Z component 7784 - W src 0 for shadow cube 7785 - X src 1 for shadow cube array */ 7786 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 7787 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 7788 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7789 alu.op = ALU_OP1_MOV; 7790 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) 7791 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 7792 else 7793 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7794 alu.dst.sel = ctx->temp_reg; 7795 alu.dst.chan = 2; 7796 alu.dst.write = 1; 7797 alu.last = 1; 7798 r = 
r600_bytecode_add_alu(ctx->bc, &alu); 7799 if (r) 7800 return r; 7801 } 7802 7803 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 7804 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 7805 if (ctx->bc->chip_class >= EVERGREEN) { 7806 int mytmp = r600_get_temp(ctx); 7807 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7808 alu.op = ALU_OP1_MOV; 7809 alu.src[0].sel = ctx->temp_reg; 7810 alu.src[0].chan = 3; 7811 alu.dst.sel = mytmp; 7812 alu.dst.chan = 0; 7813 alu.dst.write = 1; 7814 alu.last = 1; 7815 r = r600_bytecode_add_alu(ctx->bc, &alu); 7816 if (r) 7817 return r; 7818 7819 /* Evaluate the array index according to floor(idx + 0.5). This 7820 * needs to be done before merging the face select value, because 7821 * otherwise the fractional part of the array index will interfere 7822 * with the face select value */ 7823 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7824 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7825 alu.op = ALU_OP1_RNDNE; 7826 alu.dst.sel = ctx->temp_reg; 7827 alu.dst.chan = 3; 7828 alu.dst.write = 1; 7829 alu.last = 1; 7830 r = r600_bytecode_add_alu(ctx->bc, &alu); 7831 if (r) 7832 return r; 7833 7834 /* Because the array slice index and the cube face index are merged 7835 * into one value we have to make sure the array slice index is >= 0, 7836 * otherwise the face selection will fail */ 7837 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7838 alu.op = ALU_OP2_MAX; 7839 alu.src[0].sel = ctx->temp_reg; 7840 alu.src[0].chan = 3; 7841 alu.src[1].sel = V_SQ_ALU_SRC_0; 7842 alu.dst.sel = ctx->temp_reg; 7843 alu.dst.chan = 3; 7844 alu.dst.write = 1; 7845 alu.last = 1; 7846 r = r600_bytecode_add_alu(ctx->bc, &alu); 7847 if (r) 7848 return r; 7849 7850 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */ 7851 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7852 alu.op = ALU_OP3_MULADD; 7853 alu.is_op3 = 1; 7854 alu.src[0].sel = ctx->temp_reg; 7855 alu.src[0].chan = 3; 7856 alu.src[1].sel = 
V_SQ_ALU_SRC_LITERAL; 7857 alu.src[1].chan = 0; 7858 alu.src[1].value = u_bitcast_f2u(8.0f); 7859 alu.src[2].sel = mytmp; 7860 alu.src[2].chan = 0; 7861 alu.dst.sel = ctx->temp_reg; 7862 alu.dst.chan = 3; 7863 alu.dst.write = 1; 7864 alu.last = 1; 7865 r = r600_bytecode_add_alu(ctx->bc, &alu); 7866 if (r) 7867 return r; 7868 } else if (ctx->bc->chip_class < EVERGREEN) { 7869 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 7870 tex.op = FETCH_OP_SET_CUBEMAP_INDEX; 7871 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7872 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 7873 tex.src_gpr = r600_get_temp(ctx); 7874 tex.src_sel_x = 0; 7875 tex.src_sel_y = 0; 7876 tex.src_sel_z = 0; 7877 tex.src_sel_w = 0; 7878 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7; 7879 tex.coord_type_x = 1; 7880 tex.coord_type_y = 1; 7881 tex.coord_type_z = 1; 7882 tex.coord_type_w = 1; 7883 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7884 alu.op = ALU_OP1_MOV; 7885 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7886 alu.dst.sel = tex.src_gpr; 7887 alu.dst.chan = 0; 7888 alu.last = 1; 7889 alu.dst.write = 1; 7890 r = r600_bytecode_add_alu(ctx->bc, &alu); 7891 if (r) 7892 return r; 7893 7894 r = r600_bytecode_add_tex(ctx->bc, &tex); 7895 if (r) 7896 return r; 7897 } 7898 7899 } 7900 7901 /* for cube forms of lod and bias we need to route things */ 7902 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB || 7903 inst->Instruction.Opcode == TGSI_OPCODE_TXL || 7904 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7905 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) { 7906 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7907 alu.op = ALU_OP1_MOV; 7908 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7909 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) 7910 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 7911 else 7912 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7913 alu.dst.sel = ctx->temp_reg; 7914 alu.dst.chan = 2; 7915 alu.last = 1; 7916 
alu.dst.write = 1; 7917 r = r600_bytecode_add_alu(ctx->bc, &alu); 7918 if (r) 7919 return r; 7920 } 7921 7922 src_loaded = TRUE; 7923 src_gpr = ctx->temp_reg; 7924 } 7925 7926 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) { 7927 int temp_h = 0, temp_v = 0; 7928 int start_val = 0; 7929 7930 /* if we've already loaded the src (i.e. CUBE don't reload it). */ 7931 if (src_loaded == TRUE) 7932 start_val = 1; 7933 else 7934 src_loaded = TRUE; 7935 for (i = start_val; i < 3; i++) { 7936 int treg = r600_get_temp(ctx); 7937 7938 if (i == 0) 7939 src_gpr = treg; 7940 else if (i == 1) 7941 temp_h = treg; 7942 else 7943 temp_v = treg; 7944 7945 for (j = 0; j < 4; j++) { 7946 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7947 alu.op = ALU_OP1_MOV; 7948 r600_bytecode_src(&alu.src[0], &ctx->src[i], j); 7949 alu.dst.sel = treg; 7950 alu.dst.chan = j; 7951 if (j == 3) 7952 alu.last = 1; 7953 alu.dst.write = 1; 7954 r = r600_bytecode_add_alu(ctx->bc, &alu); 7955 if (r) 7956 return r; 7957 } 7958 } 7959 for (i = 1; i < 3; i++) { 7960 /* set gradients h/v */ 7961 struct r600_bytecode_tex *t = &grad_offs[n_grad_offs++]; 7962 memset(t, 0, sizeof(struct r600_bytecode_tex)); 7963 t->op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H : 7964 FETCH_OP_SET_GRADIENTS_V; 7965 t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7966 t->sampler_index_mode = sampler_index_mode; 7967 t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS; 7968 t->resource_index_mode = sampler_index_mode; 7969 7970 t->src_gpr = (i == 1) ? 
temp_h : temp_v; 7971 t->src_sel_x = 0; 7972 t->src_sel_y = 1; 7973 t->src_sel_z = 2; 7974 t->src_sel_w = 3; 7975 7976 t->dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */ 7977 t->dst_sel_x = t->dst_sel_y = t->dst_sel_z = t->dst_sel_w = 7; 7978 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) { 7979 t->coord_type_x = 1; 7980 t->coord_type_y = 1; 7981 t->coord_type_z = 1; 7982 t->coord_type_w = 1; 7983 } 7984 } 7985 } 7986 7987 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) { 7988 /* Gather4 should follow the same rules as bilinear filtering, but the hardware 7989 * incorrectly forces nearest filtering if the texture format is integer. 7990 * The only effect it has on Gather4, which always returns 4 texels for 7991 * bilinear filtering, is that the final coordinates are off by 0.5 of 7992 * the texel size. 7993 * 7994 * The workaround is to subtract 0.5 from the unnormalized coordinates, 7995 * or (0.5 / size) from the normalized coordinates. 7996 */ 7997 if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT || 7998 inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) { 7999 int treg = r600_get_temp(ctx); 8000 8001 /* mov array and comparison oordinate to temp_reg if needed */ 8002 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 8003 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 8004 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) { 8005 int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 
3 : 2; 8006 for (i = 2; i <= end; i++) { 8007 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8008 alu.op = ALU_OP1_MOV; 8009 alu.dst.sel = ctx->temp_reg; 8010 alu.dst.chan = i; 8011 alu.dst.write = 1; 8012 alu.last = (i == end); 8013 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 8014 r = r600_bytecode_add_alu(ctx->bc, &alu); 8015 if (r) 8016 return r; 8017 } 8018 } 8019 8020 if (inst->Texture.Texture == TGSI_TEXTURE_RECT || 8021 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) { 8022 for (i = 0; i < 2; i++) { 8023 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8024 alu.op = ALU_OP2_ADD; 8025 alu.dst.sel = ctx->temp_reg; 8026 alu.dst.chan = i; 8027 alu.dst.write = 1; 8028 alu.last = i == 1; 8029 if (src_loaded) { 8030 alu.src[0].sel = ctx->temp_reg; 8031 alu.src[0].chan = i; 8032 } else 8033 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 8034 alu.src[1].sel = V_SQ_ALU_SRC_0_5; 8035 alu.src[1].neg = 1; 8036 r = r600_bytecode_add_alu(ctx->bc, &alu); 8037 if (r) 8038 return r; 8039 } 8040 } else { 8041 /* execute a TXQ */ 8042 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 8043 tex.op = FETCH_OP_GET_TEXTURE_RESINFO; 8044 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 8045 tex.sampler_index_mode = sampler_index_mode; 8046 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 8047 tex.resource_index_mode = sampler_index_mode; 8048 tex.dst_gpr = treg; 8049 tex.src_sel_x = 4; 8050 tex.src_sel_y = 4; 8051 tex.src_sel_z = 4; 8052 tex.src_sel_w = 4; 8053 tex.dst_sel_x = 0; 8054 tex.dst_sel_y = 1; 8055 tex.dst_sel_z = 7; 8056 tex.dst_sel_w = 7; 8057 r = r600_bytecode_add_tex(ctx->bc, &tex); 8058 if (r) 8059 return r; 8060 8061 /* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */ 8062 if (ctx->bc->chip_class == CAYMAN) { 8063 /* */ 8064 for (i = 0; i < 2; i++) { 8065 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8066 alu.op = ALU_OP1_INT_TO_FLT; 8067 alu.dst.sel = treg; 8068 alu.dst.chan = i; 8069 alu.dst.write = 1; 8070 
alu.src[0].sel = treg; 8071 alu.src[0].chan = i; 8072 alu.last = (i == 1) ? 1 : 0; 8073 r = r600_bytecode_add_alu(ctx->bc, &alu); 8074 if (r) 8075 return r; 8076 } 8077 for (j = 0; j < 2; j++) { 8078 for (i = 0; i < 3; i++) { 8079 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8080 alu.op = ALU_OP1_RECIP_IEEE; 8081 alu.src[0].sel = treg; 8082 alu.src[0].chan = j; 8083 alu.dst.sel = treg; 8084 alu.dst.chan = i; 8085 if (i == 2) 8086 alu.last = 1; 8087 if (i == j) 8088 alu.dst.write = 1; 8089 r = r600_bytecode_add_alu(ctx->bc, &alu); 8090 if (r) 8091 return r; 8092 } 8093 } 8094 } else { 8095 for (i = 0; i < 2; i++) { 8096 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8097 alu.op = ALU_OP1_INT_TO_FLT; 8098 alu.dst.sel = treg; 8099 alu.dst.chan = i; 8100 alu.dst.write = 1; 8101 alu.src[0].sel = treg; 8102 alu.src[0].chan = i; 8103 alu.last = 1; 8104 r = r600_bytecode_add_alu(ctx->bc, &alu); 8105 if (r) 8106 return r; 8107 } 8108 for (i = 0; i < 2; i++) { 8109 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8110 alu.op = ALU_OP1_RECIP_IEEE; 8111 alu.src[0].sel = treg; 8112 alu.src[0].chan = i; 8113 alu.dst.sel = treg; 8114 alu.dst.chan = i; 8115 alu.last = 1; 8116 alu.dst.write = 1; 8117 r = r600_bytecode_add_alu(ctx->bc, &alu); 8118 if (r) 8119 return r; 8120 } 8121 } 8122 for (i = 0; i < 2; i++) { 8123 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8124 alu.op = ALU_OP3_MULADD; 8125 alu.is_op3 = 1; 8126 alu.dst.sel = ctx->temp_reg; 8127 alu.dst.chan = i; 8128 alu.dst.write = 1; 8129 alu.last = i == 1; 8130 alu.src[0].sel = treg; 8131 alu.src[0].chan = i; 8132 alu.src[1].sel = V_SQ_ALU_SRC_0_5; 8133 alu.src[1].neg = 1; 8134 if (src_loaded) { 8135 alu.src[2].sel = ctx->temp_reg; 8136 alu.src[2].chan = i; 8137 } else 8138 r600_bytecode_src(&alu.src[2], &ctx->src[0], i); 8139 r = r600_bytecode_add_alu(ctx->bc, &alu); 8140 if (r) 8141 return r; 8142 } 8143 } 8144 src_loaded = TRUE; 8145 src_gpr = ctx->temp_reg; 8146 } 8147 } 8148 8149 if 
(src_requires_loading && !src_loaded) { 8150 for (i = 0; i < 4; i++) { 8151 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8152 alu.op = ALU_OP1_MOV; 8153 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 8154 alu.dst.sel = ctx->temp_reg; 8155 alu.dst.chan = i; 8156 if (i == 3) 8157 alu.last = 1; 8158 alu.dst.write = 1; 8159 r = r600_bytecode_add_alu(ctx->bc, &alu); 8160 if (r) 8161 return r; 8162 } 8163 src_loaded = TRUE; 8164 src_gpr = ctx->temp_reg; 8165 } 8166 8167 /* get offset values */ 8168 if (inst->Texture.NumOffsets) { 8169 assert(inst->Texture.NumOffsets == 1); 8170 8171 /* The texture offset feature doesn't work with the TXF instruction 8172 * and must be emulated by adding the offset to the texture coordinates. */ 8173 if (txf_add_offsets) { 8174 const struct tgsi_texture_offset *off = inst->TexOffsets; 8175 8176 switch (inst->Texture.Texture) { 8177 case TGSI_TEXTURE_3D: 8178 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8179 alu.op = ALU_OP2_ADD_INT; 8180 alu.src[0].sel = src_gpr; 8181 alu.src[0].chan = 2; 8182 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8183 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ]; 8184 alu.dst.sel = src_gpr; 8185 alu.dst.chan = 2; 8186 alu.dst.write = 1; 8187 alu.last = 1; 8188 r = r600_bytecode_add_alu(ctx->bc, &alu); 8189 if (r) 8190 return r; 8191 FALLTHROUGH; 8192 8193 case TGSI_TEXTURE_2D: 8194 case TGSI_TEXTURE_SHADOW2D: 8195 case TGSI_TEXTURE_RECT: 8196 case TGSI_TEXTURE_SHADOWRECT: 8197 case TGSI_TEXTURE_2D_ARRAY: 8198 case TGSI_TEXTURE_SHADOW2D_ARRAY: 8199 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8200 alu.op = ALU_OP2_ADD_INT; 8201 alu.src[0].sel = src_gpr; 8202 alu.src[0].chan = 1; 8203 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8204 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY]; 8205 alu.dst.sel = src_gpr; 8206 alu.dst.chan = 1; 8207 alu.dst.write = 1; 8208 alu.last = 1; 8209 r = r600_bytecode_add_alu(ctx->bc, &alu); 8210 if (r) 8211 return r; 8212 FALLTHROUGH; 
8213 8214 case TGSI_TEXTURE_1D: 8215 case TGSI_TEXTURE_SHADOW1D: 8216 case TGSI_TEXTURE_1D_ARRAY: 8217 case TGSI_TEXTURE_SHADOW1D_ARRAY: 8218 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8219 alu.op = ALU_OP2_ADD_INT; 8220 alu.src[0].sel = src_gpr; 8221 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8222 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX]; 8223 alu.dst.sel = src_gpr; 8224 alu.dst.write = 1; 8225 alu.last = 1; 8226 r = r600_bytecode_add_alu(ctx->bc, &alu); 8227 if (r) 8228 return r; 8229 break; 8230 /* texture offsets do not apply to other texture targets */ 8231 } 8232 } else { 8233 switch (inst->Texture.Texture) { 8234 case TGSI_TEXTURE_3D: 8235 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1; 8236 FALLTHROUGH; 8237 case TGSI_TEXTURE_2D: 8238 case TGSI_TEXTURE_SHADOW2D: 8239 case TGSI_TEXTURE_RECT: 8240 case TGSI_TEXTURE_SHADOWRECT: 8241 case TGSI_TEXTURE_2D_ARRAY: 8242 case TGSI_TEXTURE_SHADOW2D_ARRAY: 8243 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1; 8244 FALLTHROUGH; 8245 case TGSI_TEXTURE_1D: 8246 case TGSI_TEXTURE_SHADOW1D: 8247 case TGSI_TEXTURE_1D_ARRAY: 8248 case TGSI_TEXTURE_SHADOW1D_ARRAY: 8249 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1; 8250 } 8251 } 8252 } 8253 8254 /* Obtain the sample index for reading a compressed MSAA color texture. 8255 * To read the FMASK, we use the ldfptr instruction, which tells us 8256 * where the samples are stored. 8257 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210, 8258 * which is the identity mapping. Each nibble says which physical sample 8259 * should be fetched to get that sample. 8260 * 8261 * Assume src.z contains the sample index. It should be modified like this: 8262 * src.z = (ldfptr() >> (src.z * 4)) & 0xF; 8263 * Then fetch the texel with src. 
8264 */ 8265 if (read_compressed_msaa) { 8266 unsigned sample_chan = 3; 8267 unsigned temp = r600_get_temp(ctx); 8268 assert(src_loaded); 8269 8270 /* temp.w = ldfptr() */ 8271 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 8272 tex.op = FETCH_OP_LD; 8273 tex.inst_mod = 1; /* to indicate this is ldfptr */ 8274 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 8275 tex.sampler_index_mode = sampler_index_mode; 8276 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 8277 tex.resource_index_mode = sampler_index_mode; 8278 tex.src_gpr = src_gpr; 8279 tex.dst_gpr = temp; 8280 tex.dst_sel_x = 7; /* mask out these components */ 8281 tex.dst_sel_y = 7; 8282 tex.dst_sel_z = 7; 8283 tex.dst_sel_w = 0; /* store X */ 8284 tex.src_sel_x = 0; 8285 tex.src_sel_y = 1; 8286 tex.src_sel_z = 2; 8287 tex.src_sel_w = 3; 8288 tex.offset_x = offset_x; 8289 tex.offset_y = offset_y; 8290 tex.offset_z = offset_z; 8291 r = r600_bytecode_add_tex(ctx->bc, &tex); 8292 if (r) 8293 return r; 8294 8295 /* temp.x = sample_index*4 */ 8296 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8297 alu.op = ALU_OP2_MULLO_INT; 8298 alu.src[0].sel = src_gpr; 8299 alu.src[0].chan = sample_chan; 8300 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8301 alu.src[1].value = 4; 8302 alu.dst.sel = temp; 8303 alu.dst.chan = 0; 8304 alu.dst.write = 1; 8305 r = emit_mul_int_op(ctx->bc, &alu); 8306 if (r) 8307 return r; 8308 8309 /* sample_index = temp.w >> temp.x */ 8310 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8311 alu.op = ALU_OP2_LSHR_INT; 8312 alu.src[0].sel = temp; 8313 alu.src[0].chan = 3; 8314 alu.src[1].sel = temp; 8315 alu.src[1].chan = 0; 8316 alu.dst.sel = src_gpr; 8317 alu.dst.chan = sample_chan; 8318 alu.dst.write = 1; 8319 alu.last = 1; 8320 r = r600_bytecode_add_alu(ctx->bc, &alu); 8321 if (r) 8322 return r; 8323 8324 /* sample_index & 0xF */ 8325 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8326 alu.op = ALU_OP2_AND_INT; 8327 alu.src[0].sel = src_gpr; 8328 alu.src[0].chan 
= sample_chan; 8329 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8330 alu.src[1].value = 0xF; 8331 alu.dst.sel = src_gpr; 8332 alu.dst.chan = sample_chan; 8333 alu.dst.write = 1; 8334 alu.last = 1; 8335 r = r600_bytecode_add_alu(ctx->bc, &alu); 8336 if (r) 8337 return r; 8338#if 0 8339 /* visualize the FMASK */ 8340 for (i = 0; i < 4; i++) { 8341 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8342 alu.op = ALU_OP1_INT_TO_FLT; 8343 alu.src[0].sel = src_gpr; 8344 alu.src[0].chan = sample_chan; 8345 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 8346 alu.dst.chan = i; 8347 alu.dst.write = 1; 8348 alu.last = 1; 8349 r = r600_bytecode_add_alu(ctx->bc, &alu); 8350 if (r) 8351 return r; 8352 } 8353 return 0; 8354#endif 8355 } 8356 8357 /* does this shader want a num layers from TXQ for a cube array? */ 8358 if (has_txq_cube_array_z) { 8359 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 8360 8361 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8362 alu.op = ALU_OP1_MOV; 8363 8364 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; 8365 if (ctx->bc->chip_class >= EVERGREEN) { 8366 /* with eg each dword is number of cubes */ 8367 alu.src[0].sel += id / 4; 8368 alu.src[0].chan = id % 4; 8369 } else { 8370 /* r600 we have them at channel 2 of the second dword */ 8371 alu.src[0].sel += (id * 2) + 1; 8372 alu.src[0].chan = 2; 8373 } 8374 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 8375 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst); 8376 alu.last = 1; 8377 r = r600_bytecode_add_alu(ctx->bc, &alu); 8378 if (r) 8379 return r; 8380 /* disable writemask from texture instruction */ 8381 inst->Dst[0].Register.WriteMask &= ~4; 8382 } 8383 8384 opcode = ctx->inst_info->op; 8385 if (opcode == FETCH_OP_GATHER4 && 8386 inst->TexOffsets[0].File != TGSI_FILE_NULL && 8387 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) { 8388 struct r600_bytecode_tex *t; 8389 opcode = FETCH_OP_GATHER4_O; 8390 8391 /* GATHER4_O/GATHER4_C_O use offset values 
loaded by 8392 SET_TEXTURE_OFFSETS instruction. The immediate offset values 8393 encoded in the instruction are ignored. */ 8394 t = &grad_offs[n_grad_offs++]; 8395 memset(t, 0, sizeof(struct r600_bytecode_tex)); 8396 t->op = FETCH_OP_SET_TEXTURE_OFFSETS; 8397 t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 8398 t->sampler_index_mode = sampler_index_mode; 8399 t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS; 8400 t->resource_index_mode = sampler_index_mode; 8401 8402 t->src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index; 8403 t->src_sel_x = inst->TexOffsets[0].SwizzleX; 8404 t->src_sel_y = inst->TexOffsets[0].SwizzleY; 8405 if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 8406 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) 8407 /* make sure array index selector is 0, this is just a safety 8408 * precausion because TGSI seems to emit something strange here */ 8409 t->src_sel_z = 4; 8410 else 8411 t->src_sel_z = inst->TexOffsets[0].SwizzleZ; 8412 8413 t->src_sel_w = 4; 8414 8415 t->dst_sel_x = 7; 8416 t->dst_sel_y = 7; 8417 t->dst_sel_z = 7; 8418 t->dst_sel_w = 7; 8419 } 8420 8421 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || 8422 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 8423 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT || 8424 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 8425 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY || 8426 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY || 8427 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 8428 switch (opcode) { 8429 case FETCH_OP_SAMPLE: 8430 opcode = FETCH_OP_SAMPLE_C; 8431 break; 8432 case FETCH_OP_SAMPLE_L: 8433 opcode = FETCH_OP_SAMPLE_C_L; 8434 break; 8435 case FETCH_OP_SAMPLE_LB: 8436 opcode = FETCH_OP_SAMPLE_C_LB; 8437 break; 8438 case FETCH_OP_SAMPLE_G: 8439 opcode = FETCH_OP_SAMPLE_C_G; 8440 break; 8441 /* Texture gather variants */ 8442 case FETCH_OP_GATHER4: 8443 opcode = FETCH_OP_GATHER4_C; 8444 break; 
8445 case FETCH_OP_GATHER4_O: 8446 opcode = FETCH_OP_GATHER4_C_O; 8447 break; 8448 } 8449 } 8450 8451 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 8452 tex.op = opcode; 8453 8454 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 8455 tex.sampler_index_mode = sampler_index_mode; 8456 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 8457 tex.resource_index_mode = sampler_index_mode; 8458 tex.src_gpr = src_gpr; 8459 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 8460 8461 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE || 8462 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) { 8463 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */ 8464 } 8465 8466 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) { 8467 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX]; 8468 tex.inst_mod = texture_component_select; 8469 8470 if (ctx->bc->chip_class == CAYMAN) { 8471 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; 8472 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 8473 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; 8474 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; 8475 } else { 8476 /* GATHER4 result order is different from TGSI TG4 */ 8477 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7; 8478 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7; 8479 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7; 8480 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; 8481 } 8482 } 8483 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) { 8484 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 8485 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 
0 : 7; 8486 tex.dst_sel_z = 7; 8487 tex.dst_sel_w = 7; 8488 } 8489 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) { 8490 tex.dst_sel_x = 3; 8491 tex.dst_sel_y = 7; 8492 tex.dst_sel_z = 7; 8493 tex.dst_sel_w = 7; 8494 } 8495 else { 8496 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; 8497 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 8498 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; 8499 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; 8500 } 8501 8502 8503 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) { 8504 tex.src_sel_x = 4; 8505 tex.src_sel_y = 4; 8506 tex.src_sel_z = 4; 8507 tex.src_sel_w = 4; 8508 } else if (src_loaded) { 8509 tex.src_sel_x = 0; 8510 tex.src_sel_y = 1; 8511 tex.src_sel_z = 2; 8512 tex.src_sel_w = 3; 8513 } else { 8514 tex.src_sel_x = ctx->src[0].swizzle[0]; 8515 tex.src_sel_y = ctx->src[0].swizzle[1]; 8516 tex.src_sel_z = ctx->src[0].swizzle[2]; 8517 tex.src_sel_w = ctx->src[0].swizzle[3]; 8518 tex.src_rel = ctx->src[0].rel; 8519 } 8520 8521 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE || 8522 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 8523 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 8524 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 8525 tex.src_sel_x = 1; 8526 tex.src_sel_y = 0; 8527 tex.src_sel_z = 3; 8528 tex.src_sel_w = 2; /* route Z compare or Lod value into W */ 8529 } 8530 8531 if (inst->Texture.Texture != TGSI_TEXTURE_RECT && 8532 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) { 8533 tex.coord_type_x = 1; 8534 tex.coord_type_y = 1; 8535 } 8536 tex.coord_type_z = 1; 8537 tex.coord_type_w = 1; 8538 8539 tex.offset_x = offset_x; 8540 tex.offset_y = offset_y; 8541 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 && 8542 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 8543 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) { 8544 tex.offset_z = 0; 8545 } 8546 else { 8547 tex.offset_z = offset_z; 8548 } 8549 8550 /* Put the 
depth for comparison in W. 8551 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W. 8552 * Some instructions expect the depth in Z. */ 8553 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || 8554 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 8555 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT || 8556 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) && 8557 opcode != FETCH_OP_SAMPLE_C_L && 8558 opcode != FETCH_OP_SAMPLE_C_LB) { 8559 tex.src_sel_w = tex.src_sel_z; 8560 } 8561 8562 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY || 8563 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) { 8564 if (opcode == FETCH_OP_SAMPLE_C_L || 8565 opcode == FETCH_OP_SAMPLE_C_LB) { 8566 /* the array index is read from Y */ 8567 tex.coord_type_y = 0; 8568 array_index_offset_channel = tex.src_sel_y; 8569 } else { 8570 /* the array index is read from Z */ 8571 tex.coord_type_z = 0; 8572 tex.src_sel_z = tex.src_sel_y; 8573 array_index_offset_channel = tex.src_sel_z; 8574 } 8575 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 8576 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) { 8577 tex.coord_type_z = 0; 8578 array_index_offset_channel = tex.src_sel_z; 8579 } else if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 8580 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && 8581 (ctx->bc->chip_class >= EVERGREEN)) 8582 /* the array index is read from Z, coordinate will be corrected elsewhere */ 8583 tex.coord_type_z = 0; 8584 8585 /* We have array access to 1D or 2D ARRAY, the coordinates are not int -> 8586 * evaluate the array index */ 8587 if (array_index_offset_channel >= 0 && 8588 opcode != FETCH_OP_LD && 8589 opcode != FETCH_OP_GET_TEXTURE_RESINFO) { 8590 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8591 alu.src[0].sel = tex.src_gpr; 8592 alu.src[0].chan = array_index_offset_channel; 8593 alu.src[0].rel = tex.src_rel; 8594 alu.op = ALU_OP1_RNDNE; 8595 alu.dst.sel = tex.src_gpr; 8596 alu.dst.chan = 
array_index_offset_channel; 8597 alu.dst.rel = tex.src_rel; 8598 alu.dst.write = 1; 8599 alu.last = 1; 8600 r = r600_bytecode_add_alu(ctx->bc, &alu); 8601 if (r) 8602 return r; 8603 } 8604 8605 /* mask unused source components */ 8606 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) { 8607 switch (inst->Texture.Texture) { 8608 case TGSI_TEXTURE_2D: 8609 case TGSI_TEXTURE_RECT: 8610 tex.src_sel_z = 7; 8611 tex.src_sel_w = 7; 8612 break; 8613 case TGSI_TEXTURE_1D_ARRAY: 8614 tex.src_sel_y = 7; 8615 tex.src_sel_w = 7; 8616 break; 8617 case TGSI_TEXTURE_1D: 8618 tex.src_sel_y = 7; 8619 tex.src_sel_z = 7; 8620 tex.src_sel_w = 7; 8621 break; 8622 } 8623 } 8624 8625 /* Emit set gradient and offset instructions. */ 8626 for (i = 0; i < n_grad_offs; ++i) { 8627 r = r600_bytecode_add_tex(ctx->bc, &grad_offs[i]); 8628 if (r) 8629 return r; 8630 } 8631 8632 r = r600_bytecode_add_tex(ctx->bc, &tex); 8633 if (r) 8634 return r; 8635 8636 /* add shadow ambient support - gallium doesn't do it yet */ 8637 return 0; 8638} 8639 8640static int find_hw_atomic_counter(struct r600_shader_ctx *ctx, 8641 struct tgsi_full_src_register *src) 8642{ 8643 unsigned i; 8644 8645 if (src->Register.Indirect) { 8646 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) { 8647 if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id) 8648 return ctx->shader->atomics[i].hw_idx; 8649 } 8650 } else { 8651 uint32_t index = src->Register.Index; 8652 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) { 8653 if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index) 8654 continue; 8655 if (index > ctx->shader->atomics[i].end) 8656 continue; 8657 if (index < ctx->shader->atomics[i].start) 8658 continue; 8659 uint32_t offset = (index - ctx->shader->atomics[i].start); 8660 return ctx->shader->atomics[i].hw_idx + offset; 8661 } 8662 } 8663 assert(0); 8664 return -1; 8665} 8666 8667static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx, 8668 int *uav_id_p, int *uav_index_mode_p) 
8669{ 8670 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8671 int uav_id, uav_index_mode = 0; 8672 int r; 8673 bool is_cm = (ctx->bc->chip_class == CAYMAN); 8674 8675 uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]); 8676 8677 if (inst->Src[0].Register.Indirect) { 8678 if (is_cm) { 8679 struct r600_bytecode_alu alu; 8680 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8681 alu.op = ALU_OP2_LSHL_INT; 8682 alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index); 8683 alu.src[0].chan = 0; 8684 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8685 alu.src[1].value = 2; 8686 alu.dst.sel = ctx->temp_reg; 8687 alu.dst.chan = 0; 8688 alu.dst.write = 1; 8689 alu.last = 1; 8690 r = r600_bytecode_add_alu(ctx->bc, &alu); 8691 if (r) 8692 return r; 8693 8694 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 8695 ctx->temp_reg, 0, 8696 ctx->temp_reg, 0, 8697 V_SQ_ALU_SRC_LITERAL, uav_id * 4); 8698 if (r) 8699 return r; 8700 } else 8701 uav_index_mode = 2; 8702 } else if (is_cm) { 8703 r = single_alu_op2(ctx, ALU_OP1_MOV, 8704 ctx->temp_reg, 0, 8705 V_SQ_ALU_SRC_LITERAL, uav_id * 4, 8706 0, 0); 8707 if (r) 8708 return r; 8709 } 8710 *uav_id_p = uav_id; 8711 *uav_index_mode_p = uav_index_mode; 8712 return 0; 8713} 8714 8715static int tgsi_load_gds(struct r600_shader_ctx *ctx) 8716{ 8717 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8718 int r; 8719 struct r600_bytecode_gds gds; 8720 int uav_id = 0; 8721 int uav_index_mode = 0; 8722 bool is_cm = (ctx->bc->chip_class == CAYMAN); 8723 8724 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode); 8725 if (r) 8726 return r; 8727 8728 memset(&gds, 0, sizeof(struct r600_bytecode_gds)); 8729 gds.op = FETCH_OP_GDS_READ_RET; 8730 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 8731 gds.uav_id = is_cm ? 0 : uav_id; 8732 gds.uav_index_mode = is_cm ? 0 : uav_index_mode; 8733 gds.src_gpr = ctx->temp_reg; 8734 gds.src_sel_x = (is_cm) ? 
0 : 4; 8735 gds.src_sel_y = 4; 8736 gds.src_sel_z = 4; 8737 gds.dst_sel_x = 0; 8738 gds.dst_sel_y = 7; 8739 gds.dst_sel_z = 7; 8740 gds.dst_sel_w = 7; 8741 gds.src_gpr2 = 0; 8742 gds.alloc_consume = !is_cm; 8743 r = r600_bytecode_add_gds(ctx->bc, &gds); 8744 if (r) 8745 return r; 8746 8747 ctx->bc->cf_last->vpm = 1; 8748 return 0; 8749} 8750 8751/* this fixes up 1D arrays properly */ 8752static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr) 8753{ 8754 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8755 int r, i; 8756 struct r600_bytecode_alu alu; 8757 int temp_reg = r600_get_temp(ctx); 8758 8759 for (i = 0; i < 4; i++) { 8760 bool def_val = true, write_zero = false; 8761 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8762 alu.op = ALU_OP1_MOV; 8763 alu.dst.sel = temp_reg; 8764 alu.dst.chan = i; 8765 8766 switch (inst->Memory.Texture) { 8767 case TGSI_TEXTURE_BUFFER: 8768 case TGSI_TEXTURE_1D: 8769 if (i == 1 || i == 2 || i == 3) { 8770 write_zero = true; 8771 } 8772 break; 8773 case TGSI_TEXTURE_1D_ARRAY: 8774 if (i == 1 || i == 3) 8775 write_zero = true; 8776 else if (i == 2) { 8777 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1); 8778 def_val = false; 8779 } 8780 break; 8781 case TGSI_TEXTURE_2D: 8782 if (i == 2 || i == 3) 8783 write_zero = true; 8784 break; 8785 default: 8786 if (i == 3) 8787 write_zero = true; 8788 break; 8789 } 8790 8791 if (write_zero) { 8792 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 8793 alu.src[0].value = 0; 8794 } else if (def_val) { 8795 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i); 8796 } 8797 8798 if (i == 3) 8799 alu.last = 1; 8800 alu.dst.write = 1; 8801 r = r600_bytecode_add_alu(ctx->bc, &alu); 8802 if (r) 8803 return r; 8804 } 8805 *idx_gpr = temp_reg; 8806 return 0; 8807} 8808 8809static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx, 8810 int temp_reg) 8811{ 8812 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 
8813 int r; 8814 if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) { 8815 int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]); 8816 r = single_alu_op2(ctx, ALU_OP1_MOV, 8817 temp_reg, 0, 8818 V_SQ_ALU_SRC_LITERAL, value >> 2, 8819 0, 0); 8820 if (r) 8821 return r; 8822 } else { 8823 struct r600_bytecode_alu alu; 8824 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8825 alu.op = ALU_OP2_LSHR_INT; 8826 r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0); 8827 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8828 alu.src[1].value = 2; 8829 alu.dst.sel = temp_reg; 8830 alu.dst.write = 1; 8831 alu.last = 1; 8832 r = r600_bytecode_add_alu(ctx->bc, &alu); 8833 if (r) 8834 return r; 8835 } 8836 return 0; 8837} 8838 8839static int tgsi_load_buffer(struct r600_shader_ctx *ctx) 8840{ 8841 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8842 /* have to work out the offset into the RAT immediate return buffer */ 8843 struct r600_bytecode_vtx vtx; 8844 struct r600_bytecode_cf *cf; 8845 int r; 8846 int temp_reg = r600_get_temp(ctx); 8847 unsigned rat_index_mode; 8848 unsigned base; 8849 8850 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 8851 base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE]; 8852 8853 r = load_buffer_coord(ctx, 1, temp_reg); 8854 if (r) 8855 return r; 8856 ctx->bc->cf_last->barrier = 1; 8857 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 8858 vtx.op = FETCH_OP_VFETCH; 8859 vtx.buffer_id = inst->Src[0].Register.Index + base; 8860 vtx.buffer_index_mode = rat_index_mode; 8861 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 8862 vtx.src_gpr = temp_reg; 8863 vtx.src_sel_x = 0; 8864 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 8865 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */ 8866 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 
1 : 7; /* SEL_Y */ 8867 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */ 8868 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */ 8869 vtx.num_format_all = 1; 8870 vtx.format_comp_all = 1; 8871 vtx.srf_mode_all = 0; 8872 8873 if (inst->Dst[0].Register.WriteMask & 8) { 8874 vtx.data_format = FMT_32_32_32_32; 8875 vtx.use_const_fields = 0; 8876 } else if (inst->Dst[0].Register.WriteMask & 4) { 8877 vtx.data_format = FMT_32_32_32; 8878 vtx.use_const_fields = 0; 8879 } else if (inst->Dst[0].Register.WriteMask & 2) { 8880 vtx.data_format = FMT_32_32; 8881 vtx.use_const_fields = 0; 8882 } else { 8883 vtx.data_format = FMT_32; 8884 vtx.use_const_fields = 0; 8885 } 8886 8887 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx); 8888 if (r) 8889 return r; 8890 cf = ctx->bc->cf_last; 8891 cf->barrier = 1; 8892 return 0; 8893} 8894 8895static int tgsi_load_rat(struct r600_shader_ctx *ctx) 8896{ 8897 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8898 /* have to work out the offset into the RAT immediate return buffer */ 8899 struct r600_bytecode_vtx vtx; 8900 struct r600_bytecode_cf *cf; 8901 int r; 8902 int idx_gpr; 8903 unsigned format, num_format, format_comp, endian; 8904 const struct util_format_description *desc; 8905 unsigned rat_index_mode; 8906 unsigned immed_base; 8907 8908 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 8909 8910 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET; 8911 r = load_index_src(ctx, 1, &idx_gpr); 8912 if (r) 8913 return r; 8914 8915 if (rat_index_mode) 8916 egcm_load_index_reg(ctx->bc, 1, false); 8917 8918 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT); 8919 cf = ctx->bc->cf_last; 8920 8921 cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index; 8922 cf->rat.inst = V_RAT_INST_NOP_RTN; 8923 cf->rat.index_mode = rat_index_mode; 8924 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND; 8925 cf->output.gpr = ctx->thread_id_gpr; 8926 cf->output.index_gpr = idx_gpr; 8927 cf->output.comp_mask = 0xf; 8928 cf->output.burst_count = 1; 8929 cf->vpm = 1; 8930 cf->barrier = 1; 8931 cf->mark = 1; 8932 cf->output.elem_size = 0; 8933 8934 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK); 8935 cf = ctx->bc->cf_last; 8936 cf->barrier = 1; 8937 8938 desc = util_format_description(inst->Memory.Format); 8939 r600_vertex_data_type(inst->Memory.Format, 8940 &format, &num_format, &format_comp, &endian); 8941 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 8942 vtx.op = FETCH_OP_VFETCH; 8943 vtx.buffer_id = immed_base + inst->Src[0].Register.Index; 8944 vtx.buffer_index_mode = rat_index_mode; 8945 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 8946 vtx.src_gpr = ctx->thread_id_gpr; 8947 vtx.src_sel_x = 1; 8948 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 8949 vtx.dst_sel_x = desc->swizzle[0]; 8950 vtx.dst_sel_y = desc->swizzle[1]; 8951 vtx.dst_sel_z = desc->swizzle[2]; 8952 vtx.dst_sel_w = desc->swizzle[3]; 8953 vtx.srf_mode_all = 1; 8954 vtx.data_format = format; 8955 vtx.num_format_all = num_format; 8956 vtx.format_comp_all = format_comp; 8957 vtx.endian = endian; 8958 vtx.offset = 0; 8959 vtx.mega_fetch_count = 3; 8960 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx); 8961 if (r) 8962 return r; 8963 cf = ctx->bc->cf_last; 8964 cf->barrier = 1; 8965 return 0; 8966} 8967 
8968static int tgsi_load_lds(struct r600_shader_ctx *ctx) 8969{ 8970 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8971 struct r600_bytecode_alu alu; 8972 int r; 8973 int temp_reg = r600_get_temp(ctx); 8974 8975 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8976 alu.op = ALU_OP1_MOV; 8977 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 8978 alu.dst.sel = temp_reg; 8979 alu.dst.write = 1; 8980 alu.last = 1; 8981 r = r600_bytecode_add_alu(ctx->bc, &alu); 8982 if (r) 8983 return r; 8984 8985 r = do_lds_fetch_values(ctx, temp_reg, 8986 ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask); 8987 if (r) 8988 return r; 8989 return 0; 8990} 8991 8992static int tgsi_load(struct r600_shader_ctx *ctx) 8993{ 8994 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8995 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) 8996 return tgsi_load_rat(ctx); 8997 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC) 8998 return tgsi_load_gds(ctx); 8999 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) 9000 return tgsi_load_buffer(ctx); 9001 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) 9002 return tgsi_load_lds(ctx); 9003 return 0; 9004} 9005 9006static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx) 9007{ 9008 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9009 struct r600_bytecode_cf *cf; 9010 int r, i; 9011 unsigned rat_index_mode; 9012 int lasti; 9013 int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx); 9014 9015 r = load_buffer_coord(ctx, 0, treg2); 9016 if (r) 9017 return r; 9018 9019 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 9020 if (rat_index_mode) 9021 egcm_load_index_reg(ctx->bc, 1, false); 9022 9023 for (i = 0; i <= 3; i++) { 9024 struct r600_bytecode_alu alu; 9025 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9026 alu.op = ALU_OP1_MOV; 9027 alu.dst.sel = temp_reg; 9028 alu.dst.chan = i; 9029 alu.src[0].sel = V_SQ_ALU_SRC_0; 9030 alu.last = (i == 3); 9031 alu.dst.write = 1; 9032 r = r600_bytecode_add_alu(ctx->bc, &alu); 9033 if (r) 9034 return r; 9035 } 9036 9037 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 9038 for (i = 0; i <= lasti; i++) { 9039 struct r600_bytecode_alu alu; 9040 if (!((1 << i) & inst->Dst[0].Register.WriteMask)) 9041 continue; 9042 9043 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 9044 temp_reg, 0, 9045 treg2, 0, 9046 V_SQ_ALU_SRC_LITERAL, i); 9047 if (r) 9048 return r; 9049 9050 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9051 alu.op = ALU_OP1_MOV; 9052 alu.dst.sel = ctx->temp_reg; 9053 alu.dst.chan = 0; 9054 9055 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 9056 alu.last = 1; 9057 alu.dst.write = 1; 9058 r = r600_bytecode_add_alu(ctx->bc, &alu); 9059 if (r) 9060 return r; 9061 9062 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT); 9063 cf = ctx->bc->cf_last; 9064 9065 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE]; 9066 cf->rat.inst = V_RAT_INST_STORE_TYPED; 9067 cf->rat.index_mode = rat_index_mode; 9068 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; 9069 cf->output.gpr = ctx->temp_reg; 9070 cf->output.index_gpr = temp_reg; 9071 cf->output.comp_mask = 1; 9072 cf->output.burst_count = 1; 9073 cf->vpm = 1; 9074 cf->barrier = 1; 9075 cf->output.elem_size = 0; 9076 } 9077 return 0; 9078} 9079 9080static int tgsi_store_rat(struct r600_shader_ctx *ctx) 9081{ 9082 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9083 struct r600_bytecode_cf *cf; 9084 bool src_requires_loading = false; 9085 int 
val_gpr, idx_gpr; 9086 int r, i; 9087 unsigned rat_index_mode; 9088 9089 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 9090 9091 r = load_index_src(ctx, 0, &idx_gpr); 9092 if (r) 9093 return r; 9094 9095 if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY) 9096 src_requires_loading = true; 9097 9098 if (src_requires_loading) { 9099 struct r600_bytecode_alu alu; 9100 for (i = 0; i < 4; i++) { 9101 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9102 alu.op = ALU_OP1_MOV; 9103 alu.dst.sel = ctx->temp_reg; 9104 alu.dst.chan = i; 9105 9106 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 9107 if (i == 3) 9108 alu.last = 1; 9109 alu.dst.write = 1; 9110 r = r600_bytecode_add_alu(ctx->bc, &alu); 9111 if (r) 9112 return r; 9113 } 9114 val_gpr = ctx->temp_reg; 9115 } else 9116 val_gpr = tgsi_tex_get_src_gpr(ctx, 1); 9117 if (rat_index_mode) 9118 egcm_load_index_reg(ctx->bc, 1, false); 9119 9120 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT); 9121 cf = ctx->bc->cf_last; 9122 9123 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index; 9124 cf->rat.inst = V_RAT_INST_STORE_TYPED; 9125 cf->rat.index_mode = rat_index_mode; 9126 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; 9127 cf->output.gpr = val_gpr; 9128 cf->output.index_gpr = idx_gpr; 9129 cf->output.comp_mask = 0xf; 9130 cf->output.burst_count = 1; 9131 cf->vpm = 1; 9132 cf->barrier = 1; 9133 cf->output.elem_size = 0; 9134 return 0; 9135} 9136 9137static int tgsi_store_lds(struct r600_shader_ctx *ctx) 9138{ 9139 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9140 struct r600_bytecode_alu alu; 9141 int r, i, lasti; 9142 int write_mask = inst->Dst[0].Register.WriteMask; 9143 int temp_reg = r600_get_temp(ctx); 9144 9145 /* LDS write */ 9146 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9147 alu.op = ALU_OP1_MOV; 9148 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9149 alu.dst.sel = temp_reg; 9150 alu.dst.write = 
1; 9151 alu.last = 1; 9152 r = r600_bytecode_add_alu(ctx->bc, &alu); 9153 if (r) 9154 return r; 9155 9156 lasti = tgsi_last_instruction(write_mask); 9157 for (i = 1; i <= lasti; i++) { 9158 if (!(write_mask & (1 << i))) 9159 continue; 9160 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 9161 temp_reg, i, 9162 temp_reg, 0, 9163 V_SQ_ALU_SRC_LITERAL, 4 * i); 9164 if (r) 9165 return r; 9166 } 9167 for (i = 0; i <= lasti; i++) { 9168 if (!(write_mask & (1 << i))) 9169 continue; 9170 9171 if ((i == 0 && ((write_mask & 3) == 3)) || 9172 (i == 2 && ((write_mask & 0xc) == 0xc))) { 9173 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9174 alu.op = LDS_OP3_LDS_WRITE_REL; 9175 9176 alu.src[0].sel = temp_reg; 9177 alu.src[0].chan = i; 9178 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 9179 r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1); 9180 alu.last = 1; 9181 alu.is_lds_idx_op = true; 9182 alu.lds_idx = 1; 9183 r = r600_bytecode_add_alu(ctx->bc, &alu); 9184 if (r) 9185 return r; 9186 i += 1; 9187 continue; 9188 } 9189 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9190 alu.op = LDS_OP2_LDS_WRITE; 9191 9192 alu.src[0].sel = temp_reg; 9193 alu.src[0].chan = i; 9194 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 9195 9196 alu.last = 1; 9197 alu.is_lds_idx_op = true; 9198 9199 r = r600_bytecode_add_alu(ctx->bc, &alu); 9200 if (r) 9201 return r; 9202 } 9203 return 0; 9204} 9205 9206static int tgsi_store(struct r600_shader_ctx *ctx) 9207{ 9208 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9209 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) 9210 return tgsi_store_buffer_rat(ctx); 9211 else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) 9212 return tgsi_store_lds(ctx); 9213 else 9214 return tgsi_store_rat(ctx); 9215} 9216 9217static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx) 9218{ 9219 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9220 /* have to work out the offset into the RAT immediate return 
buffer */ 9221 struct r600_bytecode_alu alu; 9222 struct r600_bytecode_vtx vtx; 9223 struct r600_bytecode_cf *cf; 9224 int r; 9225 int idx_gpr; 9226 unsigned format, num_format, format_comp, endian; 9227 const struct util_format_description *desc; 9228 unsigned rat_index_mode; 9229 unsigned immed_base; 9230 unsigned rat_base; 9231 9232 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET; 9233 rat_base = ctx->shader->rat_base; 9234 9235 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { 9236 immed_base += ctx->info.file_count[TGSI_FILE_IMAGE]; 9237 rat_base += ctx->info.file_count[TGSI_FILE_IMAGE]; 9238 9239 r = load_buffer_coord(ctx, 1, ctx->temp_reg); 9240 if (r) 9241 return r; 9242 idx_gpr = ctx->temp_reg; 9243 } else { 9244 r = load_index_src(ctx, 1, &idx_gpr); 9245 if (r) 9246 return r; 9247 } 9248 9249 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 9250 9251 if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) { 9252 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9253 alu.op = ALU_OP1_MOV; 9254 alu.dst.sel = ctx->thread_id_gpr; 9255 alu.dst.chan = 0; 9256 alu.dst.write = 1; 9257 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0); 9258 alu.last = 1; 9259 r = r600_bytecode_add_alu(ctx->bc, &alu); 9260 if (r) 9261 return r; 9262 9263 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9264 alu.op = ALU_OP1_MOV; 9265 alu.dst.sel = ctx->thread_id_gpr; 9266 if (ctx->bc->chip_class == CAYMAN) 9267 alu.dst.chan = 2; 9268 else 9269 alu.dst.chan = 3; 9270 alu.dst.write = 1; 9271 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0); 9272 alu.last = 1; 9273 r = r600_bytecode_add_alu(ctx->bc, &alu); 9274 if (r) 9275 return r; 9276 } else { 9277 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9278 alu.op = ALU_OP1_MOV; 9279 alu.dst.sel = ctx->thread_id_gpr; 9280 alu.dst.chan = 0; 9281 alu.dst.write = 1; 9282 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0); 9283 alu.last = 1; 9284 r = r600_bytecode_add_alu(ctx->bc, &alu); 9285 if (r) 
9286 return r; 9287 } 9288 9289 if (rat_index_mode) 9290 egcm_load_index_reg(ctx->bc, 1, false); 9291 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT); 9292 cf = ctx->bc->cf_last; 9293 9294 cf->rat.id = rat_base + inst->Src[0].Register.Index; 9295 cf->rat.inst = ctx->inst_info->op; 9296 cf->rat.index_mode = rat_index_mode; 9297 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND; 9298 cf->output.gpr = ctx->thread_id_gpr; 9299 cf->output.index_gpr = idx_gpr; 9300 cf->output.comp_mask = 0xf; 9301 cf->output.burst_count = 1; 9302 cf->vpm = 1; 9303 cf->barrier = 1; 9304 cf->mark = 1; 9305 cf->output.elem_size = 0; 9306 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK); 9307 cf = ctx->bc->cf_last; 9308 cf->barrier = 1; 9309 cf->cf_addr = 1; 9310 9311 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 9312 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) { 9313 desc = util_format_description(inst->Memory.Format); 9314 r600_vertex_data_type(inst->Memory.Format, 9315 &format, &num_format, &format_comp, &endian); 9316 vtx.dst_sel_x = desc->swizzle[0]; 9317 } else { 9318 format = FMT_32; 9319 num_format = 1; 9320 format_comp = 0; 9321 endian = 0; 9322 vtx.dst_sel_x = 0; 9323 } 9324 vtx.op = FETCH_OP_VFETCH; 9325 vtx.buffer_id = immed_base + inst->Src[0].Register.Index; 9326 vtx.buffer_index_mode = rat_index_mode; 9327 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 9328 vtx.src_gpr = ctx->thread_id_gpr; 9329 vtx.src_sel_x = 1; 9330 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 9331 vtx.dst_sel_y = 7; 9332 vtx.dst_sel_z = 7; 9333 vtx.dst_sel_w = 7; 9334 vtx.use_const_fields = 0; 9335 vtx.srf_mode_all = 1; 9336 vtx.data_format = format; 9337 vtx.num_format_all = num_format; 9338 vtx.format_comp_all = format_comp; 9339 vtx.endian = endian; 9340 vtx.offset = 0; 9341 vtx.mega_fetch_count = 0xf; 9342 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx); 9343 if (r) 9344 return r; 9345 cf = ctx->bc->cf_last; 9346 cf->vpm = 1; 9347 
cf->barrier = 1; 9348 return 0; 9349} 9350 9351static int get_gds_op(int opcode) 9352{ 9353 switch (opcode) { 9354 case TGSI_OPCODE_ATOMUADD: 9355 return FETCH_OP_GDS_ADD_RET; 9356 case TGSI_OPCODE_ATOMAND: 9357 return FETCH_OP_GDS_AND_RET; 9358 case TGSI_OPCODE_ATOMOR: 9359 return FETCH_OP_GDS_OR_RET; 9360 case TGSI_OPCODE_ATOMXOR: 9361 return FETCH_OP_GDS_XOR_RET; 9362 case TGSI_OPCODE_ATOMUMIN: 9363 return FETCH_OP_GDS_MIN_UINT_RET; 9364 case TGSI_OPCODE_ATOMUMAX: 9365 return FETCH_OP_GDS_MAX_UINT_RET; 9366 case TGSI_OPCODE_ATOMXCHG: 9367 return FETCH_OP_GDS_XCHG_RET; 9368 case TGSI_OPCODE_ATOMCAS: 9369 return FETCH_OP_GDS_CMP_XCHG_RET; 9370 default: 9371 return -1; 9372 } 9373} 9374 9375static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx) 9376{ 9377 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9378 struct r600_bytecode_gds gds; 9379 struct r600_bytecode_alu alu; 9380 int gds_op = get_gds_op(inst->Instruction.Opcode); 9381 int r; 9382 int uav_id = 0; 9383 int uav_index_mode = 0; 9384 bool is_cm = (ctx->bc->chip_class == CAYMAN); 9385 9386 if (gds_op == -1) { 9387 fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode); 9388 return -1; 9389 } 9390 9391 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode); 9392 if (r) 9393 return r; 9394 9395 if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) { 9396 if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) { 9397 int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]); 9398 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9399 alu.op = ALU_OP1_MOV; 9400 alu.dst.sel = ctx->temp_reg; 9401 alu.dst.chan = is_cm ? 
2 : 1; 9402 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 9403 alu.src[0].value = value; 9404 alu.last = 1; 9405 alu.dst.write = 1; 9406 r = r600_bytecode_add_alu(ctx->bc, &alu); 9407 if (r) 9408 return r; 9409 } else { 9410 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9411 alu.op = ALU_OP1_MOV; 9412 alu.dst.sel = ctx->temp_reg; 9413 alu.dst.chan = is_cm ? 2 : 1; 9414 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0); 9415 alu.last = 1; 9416 alu.dst.write = 1; 9417 r = r600_bytecode_add_alu(ctx->bc, &alu); 9418 if (r) 9419 return r; 9420 } 9421 } 9422 if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) { 9423 int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]); 9424 int abs_value = abs(value); 9425 if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET) 9426 gds_op = FETCH_OP_GDS_SUB_RET; 9427 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9428 alu.op = ALU_OP1_MOV; 9429 alu.dst.sel = ctx->temp_reg; 9430 alu.dst.chan = is_cm ? 1 : 0; 9431 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 9432 alu.src[0].value = abs_value; 9433 alu.last = 1; 9434 alu.dst.write = 1; 9435 r = r600_bytecode_add_alu(ctx->bc, &alu); 9436 if (r) 9437 return r; 9438 } else { 9439 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9440 alu.op = ALU_OP1_MOV; 9441 alu.dst.sel = ctx->temp_reg; 9442 alu.dst.chan = is_cm ? 1 : 0; 9443 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0); 9444 alu.last = 1; 9445 alu.dst.write = 1; 9446 r = r600_bytecode_add_alu(ctx->bc, &alu); 9447 if (r) 9448 return r; 9449 } 9450 9451 9452 memset(&gds, 0, sizeof(struct r600_bytecode_gds)); 9453 gds.op = gds_op; 9454 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 9455 gds.uav_id = is_cm ? 0 : uav_id; 9456 gds.uav_index_mode = is_cm ? 0 : uav_index_mode; 9457 gds.src_gpr = ctx->temp_reg; 9458 gds.src_gpr2 = 0; 9459 gds.src_sel_x = is_cm ? 0 : 4; 9460 gds.src_sel_y = is_cm ? 
1 : 0; 9461 if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) 9462 gds.src_sel_z = is_cm ? 2 : 1; 9463 else 9464 gds.src_sel_z = 7; 9465 gds.dst_sel_x = 0; 9466 gds.dst_sel_y = 7; 9467 gds.dst_sel_z = 7; 9468 gds.dst_sel_w = 7; 9469 gds.alloc_consume = !is_cm; 9470 9471 r = r600_bytecode_add_gds(ctx->bc, &gds); 9472 if (r) 9473 return r; 9474 ctx->bc->cf_last->vpm = 1; 9475 return 0; 9476} 9477 9478static int get_lds_op(int opcode) 9479{ 9480 switch (opcode) { 9481 case TGSI_OPCODE_ATOMUADD: 9482 return LDS_OP2_LDS_ADD_RET; 9483 case TGSI_OPCODE_ATOMAND: 9484 return LDS_OP2_LDS_AND_RET; 9485 case TGSI_OPCODE_ATOMOR: 9486 return LDS_OP2_LDS_OR_RET; 9487 case TGSI_OPCODE_ATOMXOR: 9488 return LDS_OP2_LDS_XOR_RET; 9489 case TGSI_OPCODE_ATOMUMIN: 9490 return LDS_OP2_LDS_MIN_UINT_RET; 9491 case TGSI_OPCODE_ATOMUMAX: 9492 return LDS_OP2_LDS_MAX_UINT_RET; 9493 case TGSI_OPCODE_ATOMIMIN: 9494 return LDS_OP2_LDS_MIN_INT_RET; 9495 case TGSI_OPCODE_ATOMIMAX: 9496 return LDS_OP2_LDS_MAX_INT_RET; 9497 case TGSI_OPCODE_ATOMXCHG: 9498 return LDS_OP2_LDS_XCHG_RET; 9499 case TGSI_OPCODE_ATOMCAS: 9500 return LDS_OP3_LDS_CMP_XCHG_RET; 9501 default: 9502 return -1; 9503 } 9504} 9505 9506static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx) 9507{ 9508 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9509 int lds_op = get_lds_op(inst->Instruction.Opcode); 9510 int r; 9511 9512 struct r600_bytecode_alu alu; 9513 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9514 alu.op = lds_op; 9515 alu.is_lds_idx_op = true; 9516 alu.last = 1; 9517 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 9518 r600_bytecode_src(&alu.src[1], &ctx->src[2], 0); 9519 if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET) 9520 r600_bytecode_src(&alu.src[2], &ctx->src[3], 0); 9521 else 9522 alu.src[2].sel = V_SQ_ALU_SRC_0; 9523 r = r600_bytecode_add_alu(ctx->bc, &alu); 9524 if (r) 9525 return r; 9526 9527 /* then read from LDS_OQ_A_POP */ 9528 memset(&alu, 0, sizeof(alu)); 9529 9530 alu.op = ALU_OP1_MOV; 
9531 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP; 9532 alu.src[0].chan = 0; 9533 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 9534 alu.dst.write = 1; 9535 alu.last = 1; 9536 r = r600_bytecode_add_alu(ctx->bc, &alu); 9537 if (r) 9538 return r; 9539 9540 return 0; 9541} 9542 9543static int tgsi_atomic_op(struct r600_shader_ctx *ctx) 9544{ 9545 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9546 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) 9547 return tgsi_atomic_op_rat(ctx); 9548 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC) 9549 return tgsi_atomic_op_gds(ctx); 9550 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) 9551 return tgsi_atomic_op_rat(ctx); 9552 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) 9553 return tgsi_atomic_op_lds(ctx); 9554 return 0; 9555} 9556 9557static int tgsi_resq(struct r600_shader_ctx *ctx) 9558{ 9559 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9560 unsigned sampler_index_mode; 9561 struct r600_bytecode_tex tex; 9562 int r; 9563 boolean has_txq_cube_array_z = false; 9564 9565 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER || 9566 (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) { 9567 if (ctx->bc->chip_class < EVERGREEN) 9568 ctx->shader->uses_tex_buffers = true; 9569 unsigned eg_buffer_base = 0; 9570 eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET; 9571 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) 9572 eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE]; 9573 return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base); 9574 } 9575 9576 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY && 9577 inst->Dst[0].Register.WriteMask & 4) { 9578 ctx->shader->has_txq_cube_array_z_comp = true; 9579 has_txq_cube_array_z = true; 9580 } 9581 9582 sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 9583 if (sampler_index_mode) 9584 egcm_load_index_reg(ctx->bc, 1, false); 9585 9586 9587 /* does this shader want a num layers from TXQ for a cube array? */ 9588 if (has_txq_cube_array_z) { 9589 int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset; 9590 struct r600_bytecode_alu alu; 9591 9592 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9593 alu.op = ALU_OP1_MOV; 9594 9595 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; 9596 /* with eg each dword is either number of cubes */ 9597 alu.src[0].sel += id / 4; 9598 alu.src[0].chan = id % 4; 9599 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 9600 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst); 9601 alu.last = 1; 9602 r = r600_bytecode_add_alu(ctx->bc, &alu); 9603 if (r) 9604 return r; 9605 /* disable writemask from texture instruction */ 9606 inst->Dst[0].Register.WriteMask &= ~4; 9607 } 9608 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 9609 tex.op = ctx->inst_info->op; 9610 tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index; 9611 tex.sampler_index_mode = sampler_index_mode; 9612 tex.resource_id = tex.sampler_id; 9613 tex.resource_index_mode = sampler_index_mode; 9614 tex.src_sel_x = 4; 9615 tex.src_sel_y = 4; 9616 tex.src_sel_z = 4; 9617 tex.src_sel_w = 4; 9618 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; 9619 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 9620 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; 9621 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
3 : 7; 9622 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 9623 r = r600_bytecode_add_tex(ctx->bc, &tex); 9624 if (r) 9625 return r; 9626 9627 return 0; 9628} 9629 9630static int tgsi_lrp(struct r600_shader_ctx *ctx) 9631{ 9632 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9633 struct r600_bytecode_alu alu; 9634 unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 9635 struct r600_bytecode_alu_src srcs[2][4]; 9636 unsigned i; 9637 int r; 9638 9639 /* optimize if it's just an equal balance */ 9640 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) { 9641 for (i = 0; i < lasti + 1; i++) { 9642 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9643 continue; 9644 9645 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9646 alu.op = ALU_OP2_ADD; 9647 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 9648 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 9649 alu.omod = 3; 9650 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 9651 alu.dst.chan = i; 9652 if (i == lasti) { 9653 alu.last = 1; 9654 } 9655 r = r600_bytecode_add_alu(ctx->bc, &alu); 9656 if (r) 9657 return r; 9658 } 9659 return 0; 9660 } 9661 9662 /* 1 - src0 */ 9663 for (i = 0; i < lasti + 1; i++) { 9664 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9665 continue; 9666 9667 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9668 alu.op = ALU_OP2_ADD; 9669 alu.src[0].sel = V_SQ_ALU_SRC_1; 9670 alu.src[0].chan = 0; 9671 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 9672 r600_bytecode_src_toggle_neg(&alu.src[1]); 9673 alu.dst.sel = ctx->temp_reg; 9674 alu.dst.chan = i; 9675 if (i == lasti) { 9676 alu.last = 1; 9677 } 9678 alu.dst.write = 1; 9679 r = r600_bytecode_add_alu(ctx->bc, &alu); 9680 if (r) 9681 return r; 9682 } 9683 9684 /* (1 - src0) * src2 */ 9685 for (i = 0; i < lasti + 1; i++) { 9686 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9687 continue; 9688 9689 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9690 
alu.op = ALU_OP2_MUL; 9691 alu.src[0].sel = ctx->temp_reg; 9692 alu.src[0].chan = i; 9693 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 9694 alu.dst.sel = ctx->temp_reg; 9695 alu.dst.chan = i; 9696 if (i == lasti) { 9697 alu.last = 1; 9698 } 9699 alu.dst.write = 1; 9700 r = r600_bytecode_add_alu(ctx->bc, &alu); 9701 if (r) 9702 return r; 9703 } 9704 9705 /* src0 * src1 + (1 - src0) * src2 */ 9706 9707 for (i = 0; i < 2; i++) { 9708 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask, 9709 srcs[i], &ctx->src[i]); 9710 if (r) 9711 return r; 9712 } 9713 9714 for (i = 0; i < lasti + 1; i++) { 9715 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9716 continue; 9717 9718 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9719 alu.op = ALU_OP3_MULADD; 9720 alu.is_op3 = 1; 9721 alu.src[0] = srcs[0][i]; 9722 alu.src[1] = srcs[1][i]; 9723 alu.src[2].sel = ctx->temp_reg; 9724 alu.src[2].chan = i; 9725 9726 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 9727 alu.dst.chan = i; 9728 if (i == lasti) { 9729 alu.last = 1; 9730 } 9731 r = r600_bytecode_add_alu(ctx->bc, &alu); 9732 if (r) 9733 return r; 9734 } 9735 return 0; 9736} 9737 9738static int tgsi_cmp(struct r600_shader_ctx *ctx) 9739{ 9740 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9741 struct r600_bytecode_alu alu; 9742 int i, r, j; 9743 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 9744 struct r600_bytecode_alu_src srcs[3][4]; 9745 9746 unsigned op; 9747 9748 if (ctx->src[0].abs && ctx->src[0].neg) { 9749 op = ALU_OP3_CNDE; 9750 ctx->src[0].abs = 0; 9751 ctx->src[0].neg = 0; 9752 } else { 9753 op = ALU_OP3_CNDGE; 9754 } 9755 9756 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 9757 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask, 9758 srcs[j], &ctx->src[j]); 9759 if (r) 9760 return r; 9761 } 9762 9763 for (i = 0; i < lasti + 1; i++) { 9764 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9765 continue; 9766 9767 memset(&alu, 0, 
sizeof(struct r600_bytecode_alu)); 9768 alu.op = op; 9769 alu.src[0] = srcs[0][i]; 9770 alu.src[1] = srcs[2][i]; 9771 alu.src[2] = srcs[1][i]; 9772 9773 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 9774 alu.dst.chan = i; 9775 alu.dst.write = 1; 9776 alu.is_op3 = 1; 9777 if (i == lasti) 9778 alu.last = 1; 9779 r = r600_bytecode_add_alu(ctx->bc, &alu); 9780 if (r) 9781 return r; 9782 } 9783 return 0; 9784} 9785 9786static int tgsi_ucmp(struct r600_shader_ctx *ctx) 9787{ 9788 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9789 struct r600_bytecode_alu alu; 9790 int i, r; 9791 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 9792 9793 for (i = 0; i < lasti + 1; i++) { 9794 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9795 continue; 9796 9797 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9798 alu.op = ALU_OP3_CNDE_INT; 9799 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 9800 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 9801 r600_bytecode_src(&alu.src[2], &ctx->src[1], i); 9802 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 9803 alu.dst.chan = i; 9804 alu.dst.write = 1; 9805 alu.is_op3 = 1; 9806 if (i == lasti) 9807 alu.last = 1; 9808 r = r600_bytecode_add_alu(ctx->bc, &alu); 9809 if (r) 9810 return r; 9811 } 9812 return 0; 9813} 9814 9815static int tgsi_exp(struct r600_shader_ctx *ctx) 9816{ 9817 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9818 struct r600_bytecode_alu alu; 9819 int r; 9820 unsigned i; 9821 9822 /* result.x = 2^floor(src); */ 9823 if (inst->Dst[0].Register.WriteMask & 1) { 9824 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9825 9826 alu.op = ALU_OP1_FLOOR; 9827 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9828 9829 alu.dst.sel = ctx->temp_reg; 9830 alu.dst.chan = 0; 9831 alu.dst.write = 1; 9832 alu.last = 1; 9833 r = r600_bytecode_add_alu(ctx->bc, &alu); 9834 if (r) 9835 return r; 9836 9837 if (ctx->bc->chip_class == CAYMAN) { 9838 for (i = 0; i < 3; i++) 
{ 9839 alu.op = ALU_OP1_EXP_IEEE; 9840 alu.src[0].sel = ctx->temp_reg; 9841 alu.src[0].chan = 0; 9842 9843 alu.dst.sel = ctx->temp_reg; 9844 alu.dst.chan = i; 9845 alu.dst.write = i == 0; 9846 alu.last = i == 2; 9847 r = r600_bytecode_add_alu(ctx->bc, &alu); 9848 if (r) 9849 return r; 9850 } 9851 } else { 9852 alu.op = ALU_OP1_EXP_IEEE; 9853 alu.src[0].sel = ctx->temp_reg; 9854 alu.src[0].chan = 0; 9855 9856 alu.dst.sel = ctx->temp_reg; 9857 alu.dst.chan = 0; 9858 alu.dst.write = 1; 9859 alu.last = 1; 9860 r = r600_bytecode_add_alu(ctx->bc, &alu); 9861 if (r) 9862 return r; 9863 } 9864 } 9865 9866 /* result.y = tmp - floor(tmp); */ 9867 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) { 9868 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9869 9870 alu.op = ALU_OP1_FRACT; 9871 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9872 9873 alu.dst.sel = ctx->temp_reg; 9874#if 0 9875 r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 9876 if (r) 9877 return r; 9878#endif 9879 alu.dst.write = 1; 9880 alu.dst.chan = 1; 9881 9882 alu.last = 1; 9883 9884 r = r600_bytecode_add_alu(ctx->bc, &alu); 9885 if (r) 9886 return r; 9887 } 9888 9889 /* result.z = RoughApprox2ToX(tmp);*/ 9890 if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) { 9891 if (ctx->bc->chip_class == CAYMAN) { 9892 for (i = 0; i < 3; i++) { 9893 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9894 alu.op = ALU_OP1_EXP_IEEE; 9895 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9896 9897 alu.dst.sel = ctx->temp_reg; 9898 alu.dst.chan = i; 9899 if (i == 2) { 9900 alu.dst.write = 1; 9901 alu.last = 1; 9902 } 9903 9904 r = r600_bytecode_add_alu(ctx->bc, &alu); 9905 if (r) 9906 return r; 9907 } 9908 } else { 9909 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9910 alu.op = ALU_OP1_EXP_IEEE; 9911 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9912 9913 alu.dst.sel = ctx->temp_reg; 9914 alu.dst.write = 1; 9915 alu.dst.chan = 2; 9916 9917 alu.last = 1; 9918 9919 r = r600_bytecode_add_alu(ctx->bc, &alu); 9920 if 
(r) 9921 return r; 9922 } 9923 } 9924 9925 /* result.w = 1.0;*/ 9926 if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) { 9927 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9928 9929 alu.op = ALU_OP1_MOV; 9930 alu.src[0].sel = V_SQ_ALU_SRC_1; 9931 alu.src[0].chan = 0; 9932 9933 alu.dst.sel = ctx->temp_reg; 9934 alu.dst.chan = 3; 9935 alu.dst.write = 1; 9936 alu.last = 1; 9937 r = r600_bytecode_add_alu(ctx->bc, &alu); 9938 if (r) 9939 return r; 9940 } 9941 return tgsi_helper_copy(ctx, inst); 9942} 9943 9944static int tgsi_log(struct r600_shader_ctx *ctx) 9945{ 9946 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9947 struct r600_bytecode_alu alu; 9948 int r; 9949 unsigned i; 9950 9951 /* result.x = floor(log2(|src|)); */ 9952 if (inst->Dst[0].Register.WriteMask & 1) { 9953 if (ctx->bc->chip_class == CAYMAN) { 9954 for (i = 0; i < 3; i++) { 9955 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9956 9957 alu.op = ALU_OP1_LOG_IEEE; 9958 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9959 r600_bytecode_src_set_abs(&alu.src[0]); 9960 9961 alu.dst.sel = ctx->temp_reg; 9962 alu.dst.chan = i; 9963 if (i == 0) 9964 alu.dst.write = 1; 9965 if (i == 2) 9966 alu.last = 1; 9967 r = r600_bytecode_add_alu(ctx->bc, &alu); 9968 if (r) 9969 return r; 9970 } 9971 9972 } else { 9973 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9974 9975 alu.op = ALU_OP1_LOG_IEEE; 9976 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9977 r600_bytecode_src_set_abs(&alu.src[0]); 9978 9979 alu.dst.sel = ctx->temp_reg; 9980 alu.dst.chan = 0; 9981 alu.dst.write = 1; 9982 alu.last = 1; 9983 r = r600_bytecode_add_alu(ctx->bc, &alu); 9984 if (r) 9985 return r; 9986 } 9987 9988 alu.op = ALU_OP1_FLOOR; 9989 alu.src[0].sel = ctx->temp_reg; 9990 alu.src[0].chan = 0; 9991 9992 alu.dst.sel = ctx->temp_reg; 9993 alu.dst.chan = 0; 9994 alu.dst.write = 1; 9995 alu.last = 1; 9996 9997 r = r600_bytecode_add_alu(ctx->bc, &alu); 9998 if (r) 9999 return r; 10000 } 10001 10002 /* 
result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */ 10003 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) { 10004 10005 if (ctx->bc->chip_class == CAYMAN) { 10006 for (i = 0; i < 3; i++) { 10007 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10008 10009 alu.op = ALU_OP1_LOG_IEEE; 10010 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10011 r600_bytecode_src_set_abs(&alu.src[0]); 10012 10013 alu.dst.sel = ctx->temp_reg; 10014 alu.dst.chan = i; 10015 if (i == 1) 10016 alu.dst.write = 1; 10017 if (i == 2) 10018 alu.last = 1; 10019 10020 r = r600_bytecode_add_alu(ctx->bc, &alu); 10021 if (r) 10022 return r; 10023 } 10024 } else { 10025 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10026 10027 alu.op = ALU_OP1_LOG_IEEE; 10028 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10029 r600_bytecode_src_set_abs(&alu.src[0]); 10030 10031 alu.dst.sel = ctx->temp_reg; 10032 alu.dst.chan = 1; 10033 alu.dst.write = 1; 10034 alu.last = 1; 10035 10036 r = r600_bytecode_add_alu(ctx->bc, &alu); 10037 if (r) 10038 return r; 10039 } 10040 10041 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10042 10043 alu.op = ALU_OP1_FLOOR; 10044 alu.src[0].sel = ctx->temp_reg; 10045 alu.src[0].chan = 1; 10046 10047 alu.dst.sel = ctx->temp_reg; 10048 alu.dst.chan = 1; 10049 alu.dst.write = 1; 10050 alu.last = 1; 10051 10052 r = r600_bytecode_add_alu(ctx->bc, &alu); 10053 if (r) 10054 return r; 10055 10056 if (ctx->bc->chip_class == CAYMAN) { 10057 for (i = 0; i < 3; i++) { 10058 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10059 alu.op = ALU_OP1_EXP_IEEE; 10060 alu.src[0].sel = ctx->temp_reg; 10061 alu.src[0].chan = 1; 10062 10063 alu.dst.sel = ctx->temp_reg; 10064 alu.dst.chan = i; 10065 if (i == 1) 10066 alu.dst.write = 1; 10067 if (i == 2) 10068 alu.last = 1; 10069 10070 r = r600_bytecode_add_alu(ctx->bc, &alu); 10071 if (r) 10072 return r; 10073 } 10074 } else { 10075 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10076 alu.op = ALU_OP1_EXP_IEEE; 10077 alu.src[0].sel = 
ctx->temp_reg; 10078 alu.src[0].chan = 1; 10079 10080 alu.dst.sel = ctx->temp_reg; 10081 alu.dst.chan = 1; 10082 alu.dst.write = 1; 10083 alu.last = 1; 10084 10085 r = r600_bytecode_add_alu(ctx->bc, &alu); 10086 if (r) 10087 return r; 10088 } 10089 10090 if (ctx->bc->chip_class == CAYMAN) { 10091 for (i = 0; i < 3; i++) { 10092 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10093 alu.op = ALU_OP1_RECIP_IEEE; 10094 alu.src[0].sel = ctx->temp_reg; 10095 alu.src[0].chan = 1; 10096 10097 alu.dst.sel = ctx->temp_reg; 10098 alu.dst.chan = i; 10099 if (i == 1) 10100 alu.dst.write = 1; 10101 if (i == 2) 10102 alu.last = 1; 10103 10104 r = r600_bytecode_add_alu(ctx->bc, &alu); 10105 if (r) 10106 return r; 10107 } 10108 } else { 10109 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10110 alu.op = ALU_OP1_RECIP_IEEE; 10111 alu.src[0].sel = ctx->temp_reg; 10112 alu.src[0].chan = 1; 10113 10114 alu.dst.sel = ctx->temp_reg; 10115 alu.dst.chan = 1; 10116 alu.dst.write = 1; 10117 alu.last = 1; 10118 10119 r = r600_bytecode_add_alu(ctx->bc, &alu); 10120 if (r) 10121 return r; 10122 } 10123 10124 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10125 10126 alu.op = ALU_OP2_MUL; 10127 10128 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10129 r600_bytecode_src_set_abs(&alu.src[0]); 10130 10131 alu.src[1].sel = ctx->temp_reg; 10132 alu.src[1].chan = 1; 10133 10134 alu.dst.sel = ctx->temp_reg; 10135 alu.dst.chan = 1; 10136 alu.dst.write = 1; 10137 alu.last = 1; 10138 10139 r = r600_bytecode_add_alu(ctx->bc, &alu); 10140 if (r) 10141 return r; 10142 } 10143 10144 /* result.z = log2(|src|);*/ 10145 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) { 10146 if (ctx->bc->chip_class == CAYMAN) { 10147 for (i = 0; i < 3; i++) { 10148 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10149 10150 alu.op = ALU_OP1_LOG_IEEE; 10151 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10152 r600_bytecode_src_set_abs(&alu.src[0]); 10153 10154 alu.dst.sel = ctx->temp_reg; 10155 if (i == 2) 
10156 alu.dst.write = 1; 10157 alu.dst.chan = i; 10158 if (i == 2) 10159 alu.last = 1; 10160 10161 r = r600_bytecode_add_alu(ctx->bc, &alu); 10162 if (r) 10163 return r; 10164 } 10165 } else { 10166 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10167 10168 alu.op = ALU_OP1_LOG_IEEE; 10169 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10170 r600_bytecode_src_set_abs(&alu.src[0]); 10171 10172 alu.dst.sel = ctx->temp_reg; 10173 alu.dst.write = 1; 10174 alu.dst.chan = 2; 10175 alu.last = 1; 10176 10177 r = r600_bytecode_add_alu(ctx->bc, &alu); 10178 if (r) 10179 return r; 10180 } 10181 } 10182 10183 /* result.w = 1.0; */ 10184 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) { 10185 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10186 10187 alu.op = ALU_OP1_MOV; 10188 alu.src[0].sel = V_SQ_ALU_SRC_1; 10189 alu.src[0].chan = 0; 10190 10191 alu.dst.sel = ctx->temp_reg; 10192 alu.dst.chan = 3; 10193 alu.dst.write = 1; 10194 alu.last = 1; 10195 10196 r = r600_bytecode_add_alu(ctx->bc, &alu); 10197 if (r) 10198 return r; 10199 } 10200 10201 return tgsi_helper_copy(ctx, inst); 10202} 10203 10204static int tgsi_eg_arl(struct r600_shader_ctx *ctx) 10205{ 10206 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10207 struct r600_bytecode_alu alu; 10208 int r; 10209 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 10210 unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index); 10211 10212 assert(inst->Dst[0].Register.Index < 3); 10213 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10214 10215 switch (inst->Instruction.Opcode) { 10216 case TGSI_OPCODE_ARL: 10217 alu.op = ALU_OP1_FLT_TO_INT_FLOOR; 10218 break; 10219 case TGSI_OPCODE_ARR: 10220 alu.op = ALU_OP1_FLT_TO_INT; 10221 break; 10222 case TGSI_OPCODE_UARL: 10223 alu.op = ALU_OP1_MOV; 10224 break; 10225 default: 10226 assert(0); 10227 return -1; 10228 } 10229 10230 for (i = 0; i <= lasti; ++i) { 10231 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 
10232 continue; 10233 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10234 alu.last = i == lasti; 10235 alu.dst.sel = reg; 10236 alu.dst.chan = i; 10237 alu.dst.write = 1; 10238 r = r600_bytecode_add_alu(ctx->bc, &alu); 10239 if (r) 10240 return r; 10241 } 10242 10243 if (inst->Dst[0].Register.Index > 0) 10244 ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0; 10245 else 10246 ctx->bc->ar_loaded = 0; 10247 10248 return 0; 10249} 10250static int tgsi_r600_arl(struct r600_shader_ctx *ctx) 10251{ 10252 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10253 struct r600_bytecode_alu alu; 10254 int r; 10255 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 10256 10257 switch (inst->Instruction.Opcode) { 10258 case TGSI_OPCODE_ARL: 10259 memset(&alu, 0, sizeof(alu)); 10260 alu.op = ALU_OP1_FLOOR; 10261 alu.dst.sel = ctx->bc->ar_reg; 10262 alu.dst.write = 1; 10263 for (i = 0; i <= lasti; ++i) { 10264 if (inst->Dst[0].Register.WriteMask & (1 << i)) { 10265 alu.dst.chan = i; 10266 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10267 alu.last = i == lasti; 10268 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 10269 return r; 10270 } 10271 } 10272 10273 memset(&alu, 0, sizeof(alu)); 10274 alu.op = ALU_OP1_FLT_TO_INT; 10275 alu.src[0].sel = ctx->bc->ar_reg; 10276 alu.dst.sel = ctx->bc->ar_reg; 10277 alu.dst.write = 1; 10278 /* FLT_TO_INT is trans-only on r600/r700 */ 10279 alu.last = TRUE; 10280 for (i = 0; i <= lasti; ++i) { 10281 alu.dst.chan = i; 10282 alu.src[0].chan = i; 10283 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 10284 return r; 10285 } 10286 break; 10287 case TGSI_OPCODE_ARR: 10288 memset(&alu, 0, sizeof(alu)); 10289 alu.op = ALU_OP1_FLT_TO_INT; 10290 alu.dst.sel = ctx->bc->ar_reg; 10291 alu.dst.write = 1; 10292 /* FLT_TO_INT is trans-only on r600/r700 */ 10293 alu.last = TRUE; 10294 for (i = 0; i <= lasti; ++i) { 10295 if (inst->Dst[0].Register.WriteMask & (1 << i)) { 10296 alu.dst.chan = i; 10297 
r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10298 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 10299 return r; 10300 } 10301 } 10302 break; 10303 case TGSI_OPCODE_UARL: 10304 memset(&alu, 0, sizeof(alu)); 10305 alu.op = ALU_OP1_MOV; 10306 alu.dst.sel = ctx->bc->ar_reg; 10307 alu.dst.write = 1; 10308 for (i = 0; i <= lasti; ++i) { 10309 if (inst->Dst[0].Register.WriteMask & (1 << i)) { 10310 alu.dst.chan = i; 10311 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10312 alu.last = i == lasti; 10313 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 10314 return r; 10315 } 10316 } 10317 break; 10318 default: 10319 assert(0); 10320 return -1; 10321 } 10322 10323 ctx->bc->ar_loaded = 0; 10324 return 0; 10325} 10326 10327static int tgsi_opdst(struct r600_shader_ctx *ctx) 10328{ 10329 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10330 struct r600_bytecode_alu alu; 10331 int i, r = 0; 10332 10333 for (i = 0; i < 4; i++) { 10334 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10335 10336 alu.op = ALU_OP2_MUL; 10337 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 10338 10339 if (i == 0 || i == 3) { 10340 alu.src[0].sel = V_SQ_ALU_SRC_1; 10341 } else { 10342 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10343 } 10344 10345 if (i == 0 || i == 2) { 10346 alu.src[1].sel = V_SQ_ALU_SRC_1; 10347 } else { 10348 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 10349 } 10350 if (i == 3) 10351 alu.last = 1; 10352 r = r600_bytecode_add_alu(ctx->bc, &alu); 10353 if (r) 10354 return r; 10355 } 10356 return 0; 10357} 10358 10359static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type, 10360 struct r600_bytecode_alu_src *src) 10361{ 10362 struct r600_bytecode_alu alu; 10363 int r; 10364 10365 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10366 alu.op = opcode; 10367 alu.execute_mask = 1; 10368 alu.update_pred = 1; 10369 10370 alu.dst.sel = ctx->temp_reg; 10371 alu.dst.write = 1; 10372 alu.dst.chan = 0; 10373 10374 alu.src[0] 
= *src; 10375 alu.src[1].sel = V_SQ_ALU_SRC_0; 10376 alu.src[1].chan = 0; 10377 10378 alu.last = 1; 10379 10380 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type); 10381 if (r) 10382 return r; 10383 return 0; 10384} 10385 10386static int pops(struct r600_shader_ctx *ctx, int pops) 10387{ 10388 unsigned force_pop = ctx->bc->force_add_cf; 10389 10390 if (!force_pop) { 10391 int alu_pop = 3; 10392 if (ctx->bc->cf_last) { 10393 if (ctx->bc->cf_last->op == CF_OP_ALU) 10394 alu_pop = 0; 10395 else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER) 10396 alu_pop = 1; 10397 } 10398 alu_pop += pops; 10399 if (alu_pop == 1) { 10400 ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER; 10401 ctx->bc->force_add_cf = 1; 10402 } else if (alu_pop == 2) { 10403 ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER; 10404 ctx->bc->force_add_cf = 1; 10405 } else { 10406 force_pop = 1; 10407 } 10408 } 10409 10410 if (force_pop) { 10411 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP); 10412 ctx->bc->cf_last->pop_count = pops; 10413 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2; 10414 } 10415 10416 return 0; 10417} 10418 10419static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx, 10420 unsigned reason) 10421{ 10422 struct r600_stack_info *stack = &ctx->bc->stack; 10423 unsigned elements; 10424 int entries; 10425 10426 unsigned entry_size = stack->entry_size; 10427 10428 elements = (stack->loop + stack->push_wqm ) * entry_size; 10429 elements += stack->push; 10430 10431 switch (ctx->bc->chip_class) { 10432 case R600: 10433 case R700: 10434 /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on 10435 * the stack must be reserved to hold the current active/continue 10436 * masks */ 10437 if (reason == FC_PUSH_VPM || stack->push > 0) { 10438 elements += 2; 10439 } 10440 break; 10441 10442 case CAYMAN: 10443 /* r9xx: any stack operation on empty stack consumes 2 additional 10444 * elements */ 10445 elements += 2; 10446 10447 FALLTHROUGH; 10448 /* FIXME: do the two 
elements added above cover the cases for the 10449 * r8xx+ below? */ 10450 10451 case EVERGREEN: 10452 /* r8xx+: 2 extra elements are not always required, but one extra 10453 * element must be added for each of the following cases: 10454 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest 10455 * stack usage. 10456 * (Currently we don't use ALU_ELSE_AFTER.) 10457 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM 10458 * PUSH instruction executed. 10459 * 10460 * NOTE: it seems we also need to reserve additional element in some 10461 * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader, 10462 * then STACK_SIZE should be 2 instead of 1 */ 10463 if (reason == FC_PUSH_VPM || stack->push > 0) { 10464 elements += 1; 10465 } 10466 break; 10467 10468 default: 10469 assert(0); 10470 break; 10471 } 10472 10473 /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4 10474 * for all chips, so we use 4 in the final formula, not the real entry_size 10475 * for the chip */ 10476 entry_size = 4; 10477 10478 entries = (elements + (entry_size - 1)) / entry_size; 10479 10480 if (entries > stack->max_entries) 10481 stack->max_entries = entries; 10482 return elements; 10483} 10484 10485static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason) 10486{ 10487 switch(reason) { 10488 case FC_PUSH_VPM: 10489 --ctx->bc->stack.push; 10490 assert(ctx->bc->stack.push >= 0); 10491 break; 10492 case FC_PUSH_WQM: 10493 --ctx->bc->stack.push_wqm; 10494 assert(ctx->bc->stack.push_wqm >= 0); 10495 break; 10496 case FC_LOOP: 10497 --ctx->bc->stack.loop; 10498 assert(ctx->bc->stack.loop >= 0); 10499 break; 10500 default: 10501 assert(0); 10502 break; 10503 } 10504} 10505 10506static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason) 10507{ 10508 switch (reason) { 10509 case FC_PUSH_VPM: 10510 ++ctx->bc->stack.push; 10511 break; 10512 case FC_PUSH_WQM: 10513 ++ctx->bc->stack.push_wqm; 10514 
break; 10515 case FC_LOOP: 10516 ++ctx->bc->stack.loop; 10517 break; 10518 default: 10519 assert(0); 10520 } 10521 10522 return callstack_update_max_depth(ctx, reason); 10523} 10524 10525static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp) 10526{ 10527 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp]; 10528 10529 sp->mid = realloc((void *)sp->mid, 10530 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1)); 10531 sp->mid[sp->num_mid] = ctx->bc->cf_last; 10532 sp->num_mid++; 10533} 10534 10535static void fc_pushlevel(struct r600_shader_ctx *ctx, int type) 10536{ 10537 assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack)); 10538 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type; 10539 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last; 10540 ctx->bc->fc_sp++; 10541} 10542 10543static void fc_poplevel(struct r600_shader_ctx *ctx) 10544{ 10545 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1]; 10546 free(sp->mid); 10547 sp->mid = NULL; 10548 sp->num_mid = 0; 10549 sp->start = NULL; 10550 sp->type = 0; 10551 ctx->bc->fc_sp--; 10552} 10553 10554#if 0 10555static int emit_return(struct r600_shader_ctx *ctx) 10556{ 10557 r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN)); 10558 return 0; 10559} 10560 10561static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset) 10562{ 10563 10564 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP)); 10565 ctx->bc->cf_last->pop_count = pops; 10566 /* XXX work out offset */ 10567 return 0; 10568} 10569 10570static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value) 10571{ 10572 return 0; 10573} 10574 10575static void emit_testflag(struct r600_shader_ctx *ctx) 10576{ 10577 10578} 10579 10580static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx) 10581{ 10582 emit_testflag(ctx); 10583 emit_jump_to_offset(ctx, 1, 4); 10584 emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0); 10585 pops(ctx, ifidx + 1); 10586 emit_return(ctx); 10587} 10588 
10589static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp) 10590{ 10591 emit_testflag(ctx); 10592 10593 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op); 10594 ctx->bc->cf_last->pop_count = 1; 10595 10596 fc_set_mid(ctx, fc_sp); 10597 10598 pops(ctx, 1); 10599} 10600#endif 10601 10602static int emit_if(struct r600_shader_ctx *ctx, int opcode, 10603 struct r600_bytecode_alu_src *src) 10604{ 10605 int alu_type = CF_OP_ALU_PUSH_BEFORE; 10606 bool needs_workaround = false; 10607 int elems = callstack_push(ctx, FC_PUSH_VPM); 10608 10609 if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) 10610 needs_workaround = true; 10611 10612 if (ctx->bc->chip_class == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) { 10613 unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size; 10614 unsigned dmod2 = (elems) % ctx->bc->stack.entry_size; 10615 10616 if (elems && (!dmod1 || !dmod2)) 10617 needs_workaround = true; 10618 } 10619 10620 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by 10621 * LOOP_STARTxxx for nested loops may put the branch stack into a state 10622 * such that ALU_PUSH_BEFORE doesn't work as expected. 
Workaround this 10623 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */ 10624 if (needs_workaround) { 10625 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH); 10626 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2; 10627 alu_type = CF_OP_ALU; 10628 } 10629 10630 emit_logic_pred(ctx, opcode, alu_type, src); 10631 10632 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP); 10633 10634 fc_pushlevel(ctx, FC_IF); 10635 10636 return 0; 10637} 10638 10639static int tgsi_if(struct r600_shader_ctx *ctx) 10640{ 10641 struct r600_bytecode_alu_src alu_src; 10642 r600_bytecode_src(&alu_src, &ctx->src[0], 0); 10643 10644 return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src); 10645} 10646 10647static int tgsi_uif(struct r600_shader_ctx *ctx) 10648{ 10649 struct r600_bytecode_alu_src alu_src; 10650 r600_bytecode_src(&alu_src, &ctx->src[0], 0); 10651 return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src); 10652} 10653 10654static int tgsi_else(struct r600_shader_ctx *ctx) 10655{ 10656 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE); 10657 ctx->bc->cf_last->pop_count = 1; 10658 10659 fc_set_mid(ctx, ctx->bc->fc_sp - 1); 10660 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id; 10661 return 0; 10662} 10663 10664static int tgsi_endif(struct r600_shader_ctx *ctx) 10665{ 10666 int offset = 2; 10667 pops(ctx, 1); 10668 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) { 10669 R600_ERR("if/endif unbalanced in shader\n"); 10670 return -1; 10671 } 10672 10673 /* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */ 10674 if (ctx->bc->cf_last->eg_alu_extended) 10675 offset += 2; 10676 10677 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) { 10678 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset; 10679 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1; 10680 } else { 10681 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset; 10682 } 10683 
fc_poplevel(ctx); 10684 10685 callstack_pop(ctx, FC_PUSH_VPM); 10686 return 0; 10687} 10688 10689static int tgsi_bgnloop(struct r600_shader_ctx *ctx) 10690{ 10691 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not 10692 * limited to 4096 iterations, like the other LOOP_* instructions. */ 10693 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10); 10694 10695 fc_pushlevel(ctx, FC_LOOP); 10696 10697 /* check stack depth */ 10698 callstack_push(ctx, FC_LOOP); 10699 return 0; 10700} 10701 10702static int tgsi_endloop(struct r600_shader_ctx *ctx) 10703{ 10704 int i; 10705 10706 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END); 10707 10708 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) { 10709 R600_ERR("loop/endloop in shader code are not paired.\n"); 10710 return -EINVAL; 10711 } 10712 10713 /* fixup loop pointers - from r600isa 10714 LOOP END points to CF after LOOP START, 10715 LOOP START point to CF after LOOP END 10716 BRK/CONT point to LOOP END CF 10717 */ 10718 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2; 10719 10720 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2; 10721 10722 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) { 10723 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id; 10724 } 10725 /* XXX add LOOPRET support */ 10726 fc_poplevel(ctx); 10727 callstack_pop(ctx, FC_LOOP); 10728 return 0; 10729} 10730 10731static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx) 10732{ 10733 unsigned int fscp; 10734 10735 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--) 10736 { 10737 if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type) 10738 break; 10739 } 10740 10741 if (fscp == 0) { 10742 R600_ERR("Break not inside loop/endloop pair\n"); 10743 return -EINVAL; 10744 } 10745 10746 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op); 10747 10748 fc_set_mid(ctx, fscp - 1); 10749 10750 return 0; 10751} 10752 10753static int 
tgsi_gs_emit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* The stream number is read from the literal pool entry selected by Src[0]. */
	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
	int r;

	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);

	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	if (!r) {
		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
			return emit_inc_ring_offset(ctx, stream, TRUE);
	}
	return r;
}

/*
 * Emit dst = src0 * src1 + src2 for every channel enabled in the
 * destination write mask.
 *
 * The product is produced with MULLO_UINT into ctx->temp_reg (through
 * emit_mul_int_op()), then a second pass adds src2 with ADD_INT and writes
 * the real destination.
 *
 * Returns 0 on success, or the first non-zero value returned by an emit
 * helper.
 */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.dst.chan = i;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.write = 1;

		alu.op = ALU_OP2_MULLO_UINT;
		for (j = 0; j < 2; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
		}

		/* every multiply is flagged as the last instruction of its group */
		alu.last = 1;
		r = emit_mul_int_op(ctx->bc, &alu);
		if (r)
			return r;
	}


	/* temp + src2 -> dst; only the highest enabled channel carries `last` */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/*
 * Pack two 32-bit floats into one dword of two 16-bit floats.
 *
 * src.x and src.y are converted with FLT32_TO_FLT16 into temp.x / temp.y,
 * then temp.y * 0x10000 + temp.x (MULADD_UINT24) is written to every
 * destination channel enabled in the write mask.
 *
 * Returns 0 on success, or the first non-zero value returned by
 * r600_bytecode_add_alu().
 */
static int tgsi_pk2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.xy = f32_to_f16(src) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FLT32_TO_FLT16;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* the same alu struct is reused for the .y conversion: only the
	 * destination channel, the source and `last` are changed */
	alu.dst.chan = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.y * 0x10000 + temp.x */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD_UINT24;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 0x10000;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = 0;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/*
 * Unpack two 16-bit floats stored in src.x into 32-bit floats.
 *
 * temp.x receives src.x unchanged and temp.y receives src.x >> 16; each
 * enabled destination channel i is then written with
 * FLT16_TO_FLT32(temp[i % 2]).
 *
 * Returns 0 on success, or the first non-zero value returned by
 * r600_bytecode_add_alu().
 */
static int tgsi_up2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.x = src.x */
	/* note: no need to mask out the high bits */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = src.x >> 16 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.dst.chan = 1;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 16;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.zw = dst.xy = f16_to_f32(temp.xy): channel i reads temp[i % 2] */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.op = ALU_OP1_FLT16_TO_FLT32;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i % 2;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/*
 * Emit a bit-field extract followed by a fix-up select.
 *
 * The three-operand op itself is emitted by tgsi_op3_dst(). When the
 * destination register has the same file and index as src0 or src2, a fresh
 * temp is allocated and passed as the op3 destination so those sources can
 * still be read afterwards; otherwise -1 is passed (presumably meaning
 * "write the real destination" - confirm against tgsi_op3_dst()).
 *
 * Afterwards, for every channel 0..lasti (the write mask is not re-checked
 * in these loops):
 *   temp[i] = SETGE_INT(src2[i], 32)
 *   dst[i]  = CNDE_INT(temp[i], op3 result[i], src0[i])
 * NOTE(review): this looks like it substitutes the unmodified src0 when
 * src2 (presumably the field width) is >= 32 - confirm against the ISA
 * definition of CNDE_INT.
 *
 * Returns 0 on success, or the first non-zero value returned by a helper.
 */
static int tgsi_bfe(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int r, i;
	int dst = -1;

	if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
	    (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
		dst = r600_get_temp(ctx);

	r = tgsi_op3_dst(ctx, dst);
	if (r)
		return r;

	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		/* src[1] reads back the op3 result: from the temp when one was
		 * allocated, otherwise from the destination register that
		 * tgsi_dst() just resolved */
		if (dst != -1)
			alu.src[1].sel = dst;
		else
			alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/*
 * Emit two MOVs that copy the EG_V_SQ_ALU_SRC_TIME_LO / TIME_HI special
 * sources into destination channels 0 and 1 respectively.
 *
 * Returns 0 on success, or the first non-zero value returned by
 * r600_bytecode_add_alu().
 */
static int tgsi_clock(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/*
 * Emit a 64-bit add or subtract between two register pairs, leaving the
 * result in treg.xy.
 *
 * ctx        - shader translation context
 * op         - ALU_OP2_ADD_INT selects ADDC_UINT as the carry op; any other
 *              value selects SUBB_UINT (callers in this file pass
 *              ALU_OP2_SUB_INT)
 * treg       - register receiving the result; channel 2 is used as scratch
 * src0_sel, src0_chan - register and channel of the low dword of operand 0
 *              (the high dword is read from src0_chan + 1)
 * src1_sel, src1_chan - same for operand 1
 *
 * Sequence: treg.x = lo op lo, treg.y = hi op hi, treg.z = carry/borrow of
 * the low dwords (closing the first group), then treg.y = treg.y op treg.z.
 *
 * Returns 0 on success, or the first non-zero value returned by
 * r600_bytecode_add_alu().
 */
static int emit_u64add(struct r600_shader_ctx *ctx, int op,
		       int treg,
		       int src0_sel, int src0_chan,
		       int src1_sel, int src1_chan)
{
	struct r600_bytecode_alu alu;
	int r;
	int opc;

	if (op == ALU_OP2_ADD_INT)
		opc = ALU_OP2_ADDC_UINT;
	else
		opc = ALU_OP2_SUBB_UINT;

	/* treg.x = src0.lo op src1.lo */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op; ;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	alu.src[0].sel = src0_sel;
	alu.src[0].chan = src0_chan + 0;
	alu.src[1].sel = src1_sel;
	alu.src[1].chan = src1_chan + 0;
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* treg.y = src0.hi op src1.hi */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = src0_sel;
	alu.src[0].chan = src0_chan + 1;
	alu.src[1].sel = src1_sel;
	alu.src[1].chan = src1_chan + 1;
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* treg.z = carry/borrow out of the low dwords */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = opc;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.last = 1;
	alu.src[0].sel = src0_sel;
	alu.src[0].chan = src0_chan + 0;
	alu.src[1].sel = src1_sel;
	alu.src[1].chan = src1_chan + 0;
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* treg.y = treg.y op treg.z */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/*
 * Translate a 64-bit integer add of ctx->src[0] and ctx->src[1].
 *
 * When ctx->src[1].neg is set the operation is emitted as a subtract
 * (SUB_INT / SUBB_UINT) and the neg modifier is cleared on every emitted
 * source. Intermediate values live in ctx->temp_reg (x = low result,
 * y = high partial, z = carry/borrow). The high dword is written to
 * destination channel 1 and the low dword is then moved to channel 0.
 *
 * Returns 0 on success, or the first non-zero value returned by
 * r600_bytecode_add_alu().
 */
static int egcm_u64add(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;
	int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT;

	if (ctx->src[1].neg) {
		op = ALU_OP2_SUB_INT;
		opc = ALU_OP2_SUBB_UINT;
	}
	/* treg.x = src0.x op src1.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op; ;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* treg.y = src0.y op src1.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* treg.z = carry/borrow of the low dwords */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = opc ;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = treg.y op treg.z */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* dst.x = treg.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}


/*
 * Translate a 64-bit integer negate as 0 - src0.
 *
 * Uses the same SUB_INT / SUBB_UINT sequence as the 64-bit subtract, with
 * V_SQ_ALU_SRC_0 as the first operand. Intermediate values live in
 * ctx->temp_reg (x = low result, y = high partial, z = borrow). The high
 * dword is written to destination channel 1 and the low dword is then moved
 * to channel 0.
 *
 * Returns 0 on success, or the first non-zero value returned by
 * r600_bytecode_add_alu().
 */
static int egcm_i64neg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;
	const int op = ALU_OP2_SUB_INT;
	const int opc = ALU_OP2_SUBB_UINT;

	/* treg.x = 0 - src0.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op; ;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* treg.y = 0 - src0.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	r600_bytecode_src(&alu.src[1], &ctx->src[0], 1);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* treg.z = borrow of (0 - src0.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = opc ;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.last = 1;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = treg.y - treg.z */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* dst.x = treg.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* result.y = mul_high a, b
   result.x = mul a,b
   result.y += a.x * b.y + a.y * b.x;
*/
/*
 * Translate a 64-bit unsigned multiply of ctx->src[0] (a) and ctx->src[1]
 * (b), keeping the low 64 bits of the product.
 *
 * All four partial products go through emit_mul_int_op() into the channels
 * of ctx->temp_reg; the two cross terms are summed into the high dword and
 * the result is moved to destination channels 0 (low) and 1 (high).
 *
 * Returns 0 on success, or the first non-zero value returned by an emit
 * helper.
 */
static int egcm_u64mul(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

	/* temp.x = mul_lo a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = mul_hi a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULHI_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = mul a.x, b.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.w = mul a.y, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 3;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = temp.z + temp.w */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 2;
	alu.src[1].sel = treg;
	alu.src[1].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = temp.y + temp.z */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = temp.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}

/*
 * Emit an unsigned 64-bit "src0 >= src1" comparison.
 *
 * ctx   - shader translation context
 * treg  - register receiving the result in channel 0; channels 1 and 2 are
 *         overwritten as scratch
 * src0_sel, src0_base_chan - register and channel of the low dword of
 *         operand 0 (the high dword is read from src0_base_chan + 1)
 * src1_sel, src1_base_chan - same for operand 1
 *
 * Returns 0 on success, or the first non-zero value returned by
 * single_alu_op2().
 */
static int emit_u64sge(struct r600_shader_ctx *ctx,
		       int treg,
		       int src0_sel, int src0_base_chan,
		       int src1_sel, int src1_base_chan)
{
	int r;
	/* for 64-bit sge */
	/* result = (src0.y > src1.y) || ((src0.y == src1.y) && src0.x >= src1.x)) */
	/* treg.y = src0.hi > src1.hi */
	r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
			   treg, 1,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	/* treg.x = src0.lo >= src1.lo */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 0,
			   src0_sel, src0_base_chan,
			   src1_sel, src1_base_chan);
	if (r)
		return r;

	/* treg.z = src0.hi == src1.hi */
	r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
			   treg, 2,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	/* treg.x = treg.x & treg.z */
	r = single_alu_op2(ctx, ALU_OP2_AND_INT,
			   treg, 0,
			   treg, 0,
			   treg, 2);
	if (r)
		return r;

	/* treg.x = treg.x | treg.y */
	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   treg, 0,
			   treg, 0,
			   treg, 1);
	if (r)
		return r;
	return 0;
}

/* this isn't a complete div it's just enough for qbo shader to work */
/*
 * Translate a restricted 64-bit unsigned divide.
 *
 * Only the following form is accepted; anything else returns -1:
 *   - ctx->src[1] (the denominator) is a literal,
 *   - the literal's second (high) dword is 0,
 *   - the destination write mask is exactly 0x3 (a single 64-bit result).
 *
 * The division is emitted as two fully unrolled compare/subtract passes
 * (31 loop iterations plus one peeled iteration each), every step wrapped in
 * an emit_if()/tgsi_endif() pair. Shifted denominators are computed at
 * compile time and emitted as literals. Three fresh temps are used:
 *   treg    - scratch for comparison results and shifted denominators
 *   tmp_num - .xy holds the running numerator, .zw accumulates the quotient
 *   sub_tmp - scratch for the 64-bit compare and subtract helpers
 * The final MOVs copy tmp_num.z to destination channel 0 and tmp_num.w to
 * destination channel 1.
 *
 * Returns 0 on success, -1 for an unsupported form, or the first non-zero
 * value returned by an emit helper.
 */
static int egcm_u64div(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
	int r, i;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	/* make sure we are dividing by a const with 0 in the high bits */
	if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
		return -1;
	if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
		return -1;
	/* make sure we are doing one division */
	if (inst->Dst[0].Register.WriteMask != 0x3)
		return -1;

	/* emit_if uses ctx->temp_reg so we can't */
	int treg = r600_get_temp(ctx);
	int tmp_num = r600_get_temp(ctx);
	int sub_tmp = r600_get_temp(ctx);

	/* tmp quot are tmp_num.zw */
	r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
	r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
	r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
	r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);

	/* MOV tmp_num.xy, numerator */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 0,
			   alu_num_lo.sel, alu_num_lo.chan,
			   0, 0);
	if (r)
		return r;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   0, 0);
	if (r)
		return r;

	/* clear the quotient accumulator tmp_num.zw */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	/* treg 0 is log2_denom */
	/* normally this gets the MSB for the denom high value
	   - however we know this will always be 0 here. */
	r = single_alu_op2(ctx,
			   ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, 32,
			   0, 0);
	if (r)
		return r;

	/* normally check denom hi for 0, but we know it is already */
	/* t0.z = num_hi >= denom_lo */
	/* NOTE(review): the call below targets channel 1 of treg, not
	 * channel 2 as the "t0.z" wording suggests - confirm which is meant */
	r = single_alu_op2(ctx,
			   ALU_OP2_SETGE_UINT,
			   treg, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	/* outer conditional: closed by the second tgsi_endif() after the
	 * first peeled iteration below */
	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	/* for loops in here */
	/* get msb t0.x = msb(src[1].x) first */
	int msb_lo = util_last_bit(alu_denom_lo.value);
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, msb_lo,
			   0, 0);
	if (r)
		return r;

	/* unroll the asm here */
	/* first pass: works on the high dword of the numerator (tmp_num.y)
	 * and accumulates quotient bits into tmp_num.w */
	for (i = 0; i < 31; i++) {
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, i,
				   treg, 0);
		if (r)
			return r;

		/* we can do this on the CPU */
		uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
		/* t0.z = tmp_num.y >= t0.z */
		/* NOTE(review): as emitted, this writes treg channel 1 and
		 * compares against the literal denom_lo_shl */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   treg, 2);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
				   tmp_num, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 3,
				   tmp_num, 3,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 31, so manually peel the last loop
	 * iteration.
	 */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
			   tmp_num, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 3,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	/* closes the peeled iteration's conditional */
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* closes the outer conditional opened before the first pass */
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* onto the second loop to unroll */
	/* second pass: works on the full 64-bit numerator (tmp_num.xy) and
	 * accumulates quotient bits into tmp_num.z */
	for (i = 0; i < 31; i++) {
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),
				   treg, 0);
		if (r)
			return r;

		/* 64-bit shifted denominator, split into treg.z (low) and
		 * treg.w (high) */
		uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 3,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
				   0, 0);
		if (r)
			return r;

		r = emit_u64sge(ctx, sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   sub_tmp, 0);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;


		/* sub_tmp.xy = tmp_num.xy - treg.zw, then copy back */
		r = emit_u64add(ctx, ALU_OP2_SUB_INT,
				sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 0,
				   sub_tmp, 0,
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 1,
				   sub_tmp, 1,
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 2,
				   tmp_num, 2,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 63, so manually peel the last loop
	 * iteration.
	 */
	uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 2,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 3,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
			   0, 0);
	if (r)
		return r;

	r = emit_u64sge(ctx, sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = sub_tmp;
	alu_src.chan = 0;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	/* the difference is left in sub_tmp and not copied back to tmp_num
	 * in this final step; only the quotient bit below is kept */
	r = emit_u64add(ctx, ALU_OP2_SUB_INT,
			sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 2,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* dst.x = tmp_num.z */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 2;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = tmp_num.w */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/*
 * Translate a 64-bit "not equal" comparison of ctx->src[0] and ctx->src[1].
 *
 * SETNE_INT is emitted for the low dwords (into temp.x) and the high dwords
 * (into temp.y); the two results are OR-ed into destination channel 0.
 *
 * Returns 0 on success, or the first non-zero value returned by
 * r600_bytecode_add_alu().
 */
static int egcm_u64sne(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

	/* temp.x = src0.x != src1.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_SETNE_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = src0.y != src1.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_SETNE_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.x | temp.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_OR_INT;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	alu.src[1].sel = treg;
	alu.src[1].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/*
 * Dispatch table indexed by TGSI opcode (designated initializers). Each
 * entry is { op, handler }: `op` is the value handlers such as
 * tgsi_gs_emit() read back through ctx->inst_info->op, and `handler` is the
 * function that emits the bytecode. Opcodes mapped to tgsi_unsupported have
 * no translation in this table; bare numeric indices fill slots that have
 * no named TGSI opcode here.
 * NOTE(review): presumably this is the table used for the R600/R700
 * generation - the code selecting between the tables is outside this view.
 */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]	= { ALU_OP0_NOP, tgsi_unsupported},
	[23]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
	[32]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK]	= { ALU_OP0_NOP, tgsi_unsupported},
	[34]	= { ALU_OP0_NOP, tgsi_unsupported},
	[35]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]	= { ALU_OP0_NOP, tgsi_unsupported},
	[60]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[81]	= { ALU_OP0_NOP, tgsi_unsupported},
	[82]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[106]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]	= { ALU_OP0_NOP, tgsi_unsupported},
	[115]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[TGSI_OPCODE_DFMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};

static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]	= { ALU_OP0_NOP, tgsi_unsupported},
	[23]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]	=
{ ALU_OP0_NOP, tgsi_unsupported}, 11981 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2}, 11982 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2}, 11983 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate}, 11984 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate}, 11985 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow}, 11986 [31] = { ALU_OP0_NOP, tgsi_unsupported}, 11987 [32] = { ALU_OP0_NOP, tgsi_unsupported}, 11988 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock}, 11989 [34] = { ALU_OP0_NOP, tgsi_unsupported}, 11990 [35] = { ALU_OP0_NOP, tgsi_unsupported}, 11991 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig}, 11992 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 11993 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 11994 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */ 11995 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h}, 11996 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported}, 11997 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported}, 11998 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 11999 [44] = { ALU_OP0_NOP, tgsi_unsupported}, 12000 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2}, 12001 [46] = { ALU_OP0_NOP, tgsi_unsupported}, 12002 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2}, 12003 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig}, 12004 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap}, 12005 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2}, 12006 [51] = { ALU_OP0_NOP, tgsi_unsupported}, 12007 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex}, 12008 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex}, 12009 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex}, 12010 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h}, 12011 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported}, 12012 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported}, 12013 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 12014 [59] = { ALU_OP0_NOP, tgsi_unsupported}, 12015 [60] = { ALU_OP0_NOP, 
tgsi_unsupported}, 12016 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl}, 12017 [62] = { ALU_OP0_NOP, tgsi_unsupported}, 12018 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported}, 12019 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported}, 12020 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg}, 12021 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp}, 12022 [67] = { ALU_OP0_NOP, tgsi_unsupported}, 12023 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 12024 [69] = { ALU_OP0_NOP, tgsi_unsupported}, 12025 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported}, 12026 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 12027 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex}, 12028 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont}, 12029 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if}, 12030 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif}, 12031 [76] = { ALU_OP0_NOP, tgsi_unsupported}, 12032 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else}, 12033 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif}, 12034 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 12035 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 12036 [82] = { ALU_OP0_NOP, tgsi_unsupported}, 12037 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2}, 12038 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans}, 12039 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2}, 12040 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2}, 12041 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2}, 12042 [88] = { ALU_OP0_NOP, tgsi_unsupported}, 12043 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2}, 12044 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2}, 12045 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod}, 12046 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2}, 12047 [93] = { ALU_OP0_NOP, tgsi_unsupported}, 12048 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex}, 12049 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 12050 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont}, 12051 
[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit}, 12052 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit}, 12053 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop}, 12054 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 12055 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop}, 12056 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 12057 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 12058 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex}, 12059 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq}, 12060 [106] = { ALU_OP0_NOP, tgsi_unsupported}, 12061 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported}, 12062 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2}, 12063 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2}, 12064 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap}, 12065 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap}, 12066 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 12067 [113] = { ALU_OP0_NOP, tgsi_unsupported}, 12068 [114] = { ALU_OP0_NOP, tgsi_unsupported}, 12069 [115] = { ALU_OP0_NOP, tgsi_unsupported}, 12070 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */ 12071 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */ 12072 /* Refer below for TGSI_OPCODE_DFMA */ 12073 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i}, 12074 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv}, 12075 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2}, 12076 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2}, 12077 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg}, 12078 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2}, 12079 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2}, 12080 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap}, 12081 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i}, 12082 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans}, 12083 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2}, 12084 
[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv}, 12085 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad}, 12086 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2}, 12087 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2}, 12088 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod}, 12089 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans}, 12090 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2}, 12091 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2}, 12092 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2}, 12093 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap}, 12094 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2}, 12095 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported}, 12096 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported}, 12097 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported}, 12098 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported}, 12099 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported}, 12100 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported}, 12101 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported}, 12102 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported}, 12103 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported}, 12104 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported}, 12105 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported}, 12106 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported}, 12107 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported}, 12108 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported}, 12109 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported}, 12110 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported}, 12111 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl}, 12112 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp}, 12113 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs}, 12114 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg}, 12115 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load}, 12116 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store}, 12117 [163] = { ALU_OP0_NOP, tgsi_unsupported}, 12118 [164] = { ALU_OP0_NOP, tgsi_unsupported}, 12119 [165] = { ALU_OP0_NOP, 
tgsi_unsupported}, 12120 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 12121 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op}, 12122 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op}, 12123 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op}, 12124 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op}, 12125 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op}, 12126 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op}, 12127 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op}, 12128 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op}, 12129 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op}, 12130 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op}, 12131 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex}, 12132 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 12133 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex}, 12134 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans}, 12135 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans}, 12136 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex}, 12137 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex}, 12138 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe}, 12139 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe}, 12140 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi}, 12141 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2}, 12142 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2}, 12143 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2}, 12144 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb}, 12145 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb}, 12146 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm}, 12147 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm}, 12148 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm}, 12149 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64}, 12150 [TGSI_OPCODE_D2F] = 
{ ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest}, 12151 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64}, 12152 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg}, 12153 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64}, 12154 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr}, 12155 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr }, 12156 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64}, 12157 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64}, 12158 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s}, 12159 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest}, 12160 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest}, 12161 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest}, 12162 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr}, 12163 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr}, 12164 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64}, 12165 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64}, 12166 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64}, 12167 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64}, 12168 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp}, 12169 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int}, 12170 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double}, 12171 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int}, 12172 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double}, 12173 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr}, 12174 [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne }, 12175 [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add }, 12176 [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul }, 12177 [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div }, 12178 [TGSI_OPCODE_I64NEG] = { ALU_OP0_NOP, egcm_i64neg }, 12179 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported}, 12180}; 12181 12182static const struct r600_shader_tgsi_instruction 
cm_shader_tgsi_instruction[] = { 12183 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl}, 12184 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2}, 12185 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit}, 12186 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr}, 12187 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr}, 12188 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp}, 12189 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log}, 12190 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2}, 12191 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2}, 12192 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 12193 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 12194 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, 12195 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, 12196 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, 12197 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, 12198 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, 12199 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, 12200 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp}, 12201 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3}, 12202 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr}, 12203 [21] = { ALU_OP0_NOP, tgsi_unsupported}, 12204 [22] = { ALU_OP0_NOP, tgsi_unsupported}, 12205 [23] = { ALU_OP0_NOP, tgsi_unsupported}, 12206 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2}, 12207 [25] = { ALU_OP0_NOP, tgsi_unsupported}, 12208 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2}, 12209 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2}, 12210 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr}, 12211 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr}, 12212 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow}, 12213 [31] = { ALU_OP0_NOP, tgsi_unsupported}, 12214 [32] = { ALU_OP0_NOP, tgsi_unsupported}, 12215 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock}, 12216 [34] = { ALU_OP0_NOP, tgsi_unsupported}, 12217 [35] = { ALU_OP0_NOP, tgsi_unsupported}, 12218 
[TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig}, 12219 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 12220 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 12221 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */ 12222 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h}, 12223 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported}, 12224 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported}, 12225 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 12226 [44] = { ALU_OP0_NOP, tgsi_unsupported}, 12227 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2}, 12228 [46] = { ALU_OP0_NOP, tgsi_unsupported}, 12229 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2}, 12230 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig}, 12231 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap}, 12232 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2}, 12233 [51] = { ALU_OP0_NOP, tgsi_unsupported}, 12234 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex}, 12235 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex}, 12236 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex}, 12237 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h}, 12238 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported}, 12239 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported}, 12240 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 12241 [59] = { ALU_OP0_NOP, tgsi_unsupported}, 12242 [60] = { ALU_OP0_NOP, tgsi_unsupported}, 12243 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl}, 12244 [62] = { ALU_OP0_NOP, tgsi_unsupported}, 12245 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported}, 12246 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported}, 12247 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg}, 12248 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp}, 12249 [67] = { ALU_OP0_NOP, tgsi_unsupported}, 12250 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 12251 [69] = { ALU_OP0_NOP, tgsi_unsupported}, 12252 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported}, 12253 [TGSI_OPCODE_DP2] = { 
ALU_OP2_DOT4_IEEE, tgsi_dp}, 12254 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex}, 12255 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont}, 12256 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if}, 12257 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif}, 12258 [76] = { ALU_OP0_NOP, tgsi_unsupported}, 12259 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else}, 12260 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif}, 12261 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 12262 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 12263 [82] = { ALU_OP0_NOP, tgsi_unsupported}, 12264 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2}, 12265 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2}, 12266 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2}, 12267 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2}, 12268 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2}, 12269 [88] = { ALU_OP0_NOP, tgsi_unsupported}, 12270 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2}, 12271 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2}, 12272 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod}, 12273 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2}, 12274 [93] = { ALU_OP0_NOP, tgsi_unsupported}, 12275 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex}, 12276 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 12277 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont}, 12278 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit}, 12279 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit}, 12280 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop}, 12281 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 12282 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop}, 12283 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 12284 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 12285 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex}, 12286 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq}, 12287 [106] = { ALU_OP0_NOP, 
tgsi_unsupported}, 12288 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported}, 12289 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2}, 12290 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2}, 12291 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap}, 12292 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap}, 12293 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 12294 [113] = { ALU_OP0_NOP, tgsi_unsupported}, 12295 [114] = { ALU_OP0_NOP, tgsi_unsupported}, 12296 [115] = { ALU_OP0_NOP, tgsi_unsupported}, 12297 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */ 12298 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */ 12299 /* Refer below for TGSI_OPCODE_DFMA */ 12300 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2}, 12301 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv}, 12302 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2}, 12303 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2}, 12304 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg}, 12305 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2}, 12306 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2}, 12307 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap}, 12308 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2}, 12309 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2}, 12310 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2}, 12311 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv}, 12312 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad}, 12313 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2}, 12314 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2}, 12315 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod}, 12316 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr}, 12317 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2}, 12318 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2}, 12319 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2}, 12320 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap}, 12321 [TGSI_OPCODE_USNE] = { 
ALU_OP2_SETNE_INT, tgsi_op2}, 12322 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported}, 12323 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported}, 12324 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported}, 12325 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported}, 12326 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported}, 12327 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported}, 12328 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported}, 12329 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported}, 12330 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported}, 12331 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported}, 12332 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported}, 12333 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported}, 12334 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported}, 12335 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported}, 12336 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported}, 12337 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported}, 12338 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl}, 12339 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp}, 12340 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs}, 12341 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg}, 12342 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load}, 12343 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store}, 12344 [163] = { ALU_OP0_NOP, tgsi_unsupported}, 12345 [164] = { ALU_OP0_NOP, tgsi_unsupported}, 12346 [165] = { ALU_OP0_NOP, tgsi_unsupported}, 12347 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 12348 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op}, 12349 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op}, 12350 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op}, 12351 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op}, 12352 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op}, 12353 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op}, 12354 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op}, 12355 
[TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op}, 12356 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op}, 12357 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op}, 12358 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex}, 12359 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 12360 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex}, 12361 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr}, 12362 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr}, 12363 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex}, 12364 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex}, 12365 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe}, 12366 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe}, 12367 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi}, 12368 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2}, 12369 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2}, 12370 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2}, 12371 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb}, 12372 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb}, 12373 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm}, 12374 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm}, 12375 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm}, 12376 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64}, 12377 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest}, 12378 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64}, 12379 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg}, 12380 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64}, 12381 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr}, 12382 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr }, 12383 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64}, 12384 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64}, 12385 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s}, 12386 [TGSI_OPCODE_DSGE] = { 
ALU_OP2_SETGE_64, tgsi_op2_64_single_dest}, 12387 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest}, 12388 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest}, 12389 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr}, 12390 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr}, 12391 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64}, 12392 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64}, 12393 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64}, 12394 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64}, 12395 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp}, 12396 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int}, 12397 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double}, 12398 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int}, 12399 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double}, 12400 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr}, 12401 [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne }, 12402 [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add }, 12403 [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul }, 12404 [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div }, 12405 [TGSI_OPCODE_I64NEG] = { ALU_OP0_NOP, egcm_i64neg }, 12406 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported}, 12407}; 12408