1/* 2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 22 */ 23#include "r600_sq.h" 24#include "r600_formats.h" 25#include "r600_opcodes.h" 26#include "r600_shader.h" 27#include "r600d.h" 28 29#include "sb/sb_public.h" 30 31#include "pipe/p_shader_tokens.h" 32#include "tgsi/tgsi_info.h" 33#include "tgsi/tgsi_parse.h" 34#include "tgsi/tgsi_scan.h" 35#include "tgsi/tgsi_dump.h" 36#include "util/u_bitcast.h" 37#include "util/u_memory.h" 38#include "util/u_math.h" 39#include <stdio.h> 40#include <errno.h> 41 42/* CAYMAN notes 43Why CAYMAN got loops for lots of instructions is explained here. 44 45-These 8xx t-slot only ops are implemented in all vector slots. 46MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT 47These 8xx t-slot only opcodes become vector ops, with all four 48slots expecting the arguments on sources a and b. 
Result is 49broadcast to all channels. 50MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64 51These 8xx t-slot only opcodes become vector ops in the z, y, and 52x slots. 53EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64 54RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64 55SQRT_IEEE/_64 56SIN/COS 57The w slot may have an independent co-issued operation, or if the 58result is required to be in the w slot, the opcode above may be 59issued in the w slot as well. 60The compiler must issue the source argument to slots z, y, and x 61*/ 62 63/* Contents of r0 on entry to various shaders 64 65 VS - .x = VertexID 66 .y = RelVertexID (??) 67 .w = InstanceID 68 69 GS - r0.xyw, r1.xyz = per-vertex offsets 70 r0.z = PrimitiveID 71 72 TCS - .x = PatchID 73 .y = RelPatchID (??) 74 .z = InvocationID 75 .w = tess factor base. 76 77 TES - .x = TessCoord.x 78 - .y = TessCoord.y 79 - .z = RelPatchID (??) 80 - .w = PrimitiveID 81 82 PS - face_gpr.z = SampleMask 83 face_gpr.w = SampleID 84*/ 85#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16) 86static int r600_shader_from_tgsi(struct r600_context *rctx, 87 struct r600_pipe_shader *pipeshader, 88 union r600_shader_key key); 89 90static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr, 91 int size, unsigned comp_mask) { 92 93 if (!size) 94 return; 95 96 if (ps->num_arrays == ps->max_arrays) { 97 ps->max_arrays += 64; 98 ps->arrays = realloc(ps->arrays, ps->max_arrays * 99 sizeof(struct r600_shader_array)); 100 } 101 102 int n = ps->num_arrays; 103 ++ps->num_arrays; 104 105 ps->arrays[n].comp_mask = comp_mask; 106 ps->arrays[n].gpr_start = start_gpr; 107 ps->arrays[n].gpr_count = size; 108} 109 110static void r600_dump_streamout(struct pipe_stream_output_info *so) 111{ 112 unsigned i; 113 114 fprintf(stderr, "STREAMOUT\n"); 115 for (i = 0; i < so->num_outputs; i++) { 116 unsigned mask = ((1 << so->output[i].num_components) - 1) << 117 so->output[i].start_component; 118 fprintf(stderr, 
" %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n", 119 i, 120 so->output[i].stream, 121 so->output[i].output_buffer, 122 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, 123 so->output[i].register_index, 124 mask & 1 ? "x" : "", 125 mask & 2 ? "y" : "", 126 mask & 4 ? "z" : "", 127 mask & 8 ? "w" : "", 128 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : ""); 129 } 130} 131 132static int store_shader(struct pipe_context *ctx, 133 struct r600_pipe_shader *shader) 134{ 135 struct r600_context *rctx = (struct r600_context *)ctx; 136 uint32_t *ptr, i; 137 138 if (shader->bo == NULL) { 139 shader->bo = (struct r600_resource*) 140 pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4); 141 if (shader->bo == NULL) { 142 return -ENOMEM; 143 } 144 ptr = r600_buffer_map_sync_with_rings( 145 &rctx->b, shader->bo, 146 PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY); 147 if (R600_BIG_ENDIAN) { 148 for (i = 0; i < shader->shader.bc.ndw; ++i) { 149 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]); 150 } 151 } else { 152 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr)); 153 } 154 rctx->b.ws->buffer_unmap(shader->bo->buf); 155 } 156 157 return 0; 158} 159 160int r600_pipe_shader_create(struct pipe_context *ctx, 161 struct r600_pipe_shader *shader, 162 union r600_shader_key key) 163{ 164 struct r600_context *rctx = (struct r600_context *)ctx; 165 struct r600_pipe_shader_selector *sel = shader->selector; 166 int r; 167 bool dump = r600_can_dump_shader(&rctx->screen->b, 168 tgsi_get_processor_type(sel->tokens)); 169 unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB); 170 unsigned sb_disasm; 171 unsigned export_shader; 172 173 shader->shader.bc.isa = rctx->isa; 174 175 if (dump) { 176 fprintf(stderr, "--------------------------------------------------------------\n"); 177 tgsi_dump(sel->tokens, 0); 178 179 if (sel->so.num_outputs) { 180 
r600_dump_streamout(&sel->so); 181 } 182 } 183 r = r600_shader_from_tgsi(rctx, shader, key); 184 if (r) { 185 R600_ERR("translation from TGSI failed !\n"); 186 goto error; 187 } 188 if (shader->shader.processor_type == PIPE_SHADER_VERTEX) { 189 /* only disable for vertex shaders in tess paths */ 190 if (key.vs.as_ls) 191 use_sb = 0; 192 } 193 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL); 194 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL); 195 use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE); 196 197 /* disable SB for shaders using doubles */ 198 use_sb &= !shader->shader.uses_doubles; 199 200 use_sb &= !shader->shader.uses_atomics; 201 use_sb &= !shader->shader.uses_images; 202 use_sb &= !shader->shader.uses_helper_invocation; 203 204 /* Check if the bytecode has already been built. */ 205 if (!shader->shader.bc.bytecode) { 206 r = r600_bytecode_build(&shader->shader.bc); 207 if (r) { 208 R600_ERR("building bytecode failed !\n"); 209 goto error; 210 } 211 } 212 213 sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM); 214 if (dump && !sb_disasm) { 215 fprintf(stderr, "--------------------------------------------------------------\n"); 216 r600_bytecode_disasm(&shader->shader.bc); 217 fprintf(stderr, "______________________________________________________________\n"); 218 } else if ((dump && sb_disasm) || use_sb) { 219 r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader, 220 dump, use_sb); 221 if (r) { 222 R600_ERR("r600_sb_bytecode_process failed !\n"); 223 goto error; 224 } 225 } 226 227 if (shader->gs_copy_shader) { 228 if (dump) { 229 // dump copy shader 230 r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc, 231 &shader->gs_copy_shader->shader, dump, 0); 232 if (r) 233 goto error; 234 } 235 236 if ((r = store_shader(ctx, shader->gs_copy_shader))) 237 goto error; 238 } 239 240 /* Store the shader in a buffer. 
*/ 241 if ((r = store_shader(ctx, shader))) 242 goto error; 243 244 /* Build state. */ 245 switch (shader->shader.processor_type) { 246 case PIPE_SHADER_TESS_CTRL: 247 evergreen_update_hs_state(ctx, shader); 248 break; 249 case PIPE_SHADER_TESS_EVAL: 250 if (key.tes.as_es) 251 evergreen_update_es_state(ctx, shader); 252 else 253 evergreen_update_vs_state(ctx, shader); 254 break; 255 case PIPE_SHADER_GEOMETRY: 256 if (rctx->b.chip_class >= EVERGREEN) { 257 evergreen_update_gs_state(ctx, shader); 258 evergreen_update_vs_state(ctx, shader->gs_copy_shader); 259 } else { 260 r600_update_gs_state(ctx, shader); 261 r600_update_vs_state(ctx, shader->gs_copy_shader); 262 } 263 break; 264 case PIPE_SHADER_VERTEX: 265 export_shader = key.vs.as_es; 266 if (rctx->b.chip_class >= EVERGREEN) { 267 if (key.vs.as_ls) 268 evergreen_update_ls_state(ctx, shader); 269 else if (key.vs.as_es) 270 evergreen_update_es_state(ctx, shader); 271 else 272 evergreen_update_vs_state(ctx, shader); 273 } else { 274 if (export_shader) 275 r600_update_es_state(ctx, shader); 276 else 277 r600_update_vs_state(ctx, shader); 278 } 279 break; 280 case PIPE_SHADER_FRAGMENT: 281 if (rctx->b.chip_class >= EVERGREEN) { 282 evergreen_update_ps_state(ctx, shader); 283 } else { 284 r600_update_ps_state(ctx, shader); 285 } 286 break; 287 case PIPE_SHADER_COMPUTE: 288 evergreen_update_ls_state(ctx, shader); 289 break; 290 default: 291 r = -EINVAL; 292 goto error; 293 } 294 return 0; 295 296error: 297 r600_pipe_shader_destroy(ctx, shader); 298 return r; 299} 300 301void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader) 302{ 303 r600_resource_reference(&shader->bo, NULL); 304 r600_bytecode_clear(&shader->shader.bc); 305 r600_release_command_buffer(&shader->command_buffer); 306} 307 308/* 309 * tgsi -> r600 shader 310 */ 311struct r600_shader_tgsi_instruction; 312 313struct r600_shader_src { 314 unsigned sel; 315 unsigned swizzle[4]; 316 unsigned neg; 317 unsigned abs; 318 
unsigned rel; 319 unsigned kc_bank; 320 boolean kc_rel; /* true if cache bank is indexed */ 321 uint32_t value[4]; 322}; 323 324struct eg_interp { 325 boolean enabled; 326 unsigned ij_index; 327}; 328 329struct r600_shader_ctx { 330 struct tgsi_shader_info info; 331 struct tgsi_array_info *array_infos; 332 /* flag for each tgsi temp array if its been spilled or not */ 333 bool *spilled_arrays; 334 struct tgsi_parse_context parse; 335 const struct tgsi_token *tokens; 336 unsigned type; 337 unsigned file_offset[TGSI_FILE_COUNT]; 338 unsigned temp_reg; 339 const struct r600_shader_tgsi_instruction *inst_info; 340 struct r600_bytecode *bc; 341 struct r600_shader *shader; 342 struct r600_shader_src src[4]; 343 uint32_t *literals; 344 uint32_t nliterals; 345 uint32_t max_driver_temp_used; 346 /* needed for evergreen interpolation */ 347 struct eg_interp eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid 348 /* evergreen/cayman also store sample mask in face register */ 349 int face_gpr; 350 /* sample id is .w component stored in fixed point position register */ 351 int fixed_pt_position_gpr; 352 int colors_used; 353 boolean clip_vertex_write; 354 unsigned cv_output; 355 unsigned edgeflag_output; 356 int helper_invoc_reg; 357 int cs_block_size_reg; 358 int cs_grid_size_reg; 359 bool cs_block_size_loaded, cs_grid_size_loaded; 360 int fragcoord_input; 361 int next_ring_offset; 362 int gs_out_ring_offset; 363 int gs_next_vertex; 364 struct r600_shader *gs_for_vs; 365 int gs_export_gpr_tregs[4]; 366 int gs_rotated_input[2]; 367 const struct pipe_stream_output_info *gs_stream_output_info; 368 unsigned enabled_stream_buffers_mask; 369 unsigned tess_input_info; /* temp with tess input offsets */ 370 unsigned tess_output_info; /* temp with tess input offsets */ 371 unsigned thread_id_gpr; /* temp with thread id calculated for images */ 372}; 373 374struct r600_shader_tgsi_instruction { 375 unsigned op; 376 int (*process)(struct r600_shader_ctx *ctx); 
377}; 378 379static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind); 380static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[]; 381static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx); 382static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason); 383static void fc_pushlevel(struct r600_shader_ctx *ctx, int type); 384static int tgsi_else(struct r600_shader_ctx *ctx); 385static int tgsi_endif(struct r600_shader_ctx *ctx); 386static int tgsi_bgnloop(struct r600_shader_ctx *ctx); 387static int tgsi_endloop(struct r600_shader_ctx *ctx); 388static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx); 389static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, 390 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan, 391 unsigned int dst_reg); 392static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src, 393 const struct r600_shader_src *shader_src, 394 unsigned chan); 395static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg, 396 unsigned dst_reg, unsigned mask); 397 398static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx) 399{ 400 if (ctx->bc->family == CHIP_HEMLOCK || 401 ctx->bc->family == CHIP_CYPRESS || 402 ctx->bc->family == CHIP_JUNIPER) 403 return false; 404 return true; 405} 406 407static int tgsi_last_instruction(unsigned writemask) 408{ 409 int i, lasti = 0; 410 411 for (i = 0; i < 4; i++) { 412 if (writemask & (1 << i)) { 413 lasti = i; 414 } 415 } 416 return lasti; 417} 418 419static int tgsi_is_supported(struct r600_shader_ctx *ctx) 420{ 421 struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction; 422 unsigned j; 423 424 if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) { 425 R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs); 426 return 
-EINVAL; 427 } 428#if 0 429 if (i->Instruction.Label) { 430 R600_ERR("label unsupported\n"); 431 return -EINVAL; 432 } 433#endif 434 for (j = 0; j < i->Instruction.NumSrcRegs; j++) { 435 if (i->Src[j].Register.Dimension) { 436 switch (i->Src[j].Register.File) { 437 case TGSI_FILE_CONSTANT: 438 case TGSI_FILE_HW_ATOMIC: 439 break; 440 case TGSI_FILE_INPUT: 441 if (ctx->type == PIPE_SHADER_GEOMETRY || 442 ctx->type == PIPE_SHADER_TESS_CTRL || 443 ctx->type == PIPE_SHADER_TESS_EVAL) 444 break; 445 case TGSI_FILE_OUTPUT: 446 if (ctx->type == PIPE_SHADER_TESS_CTRL) 447 break; 448 default: 449 R600_ERR("unsupported src %d (file %d, dimension %d)\n", j, 450 i->Src[j].Register.File, 451 i->Src[j].Register.Dimension); 452 return -EINVAL; 453 } 454 } 455 } 456 for (j = 0; j < i->Instruction.NumDstRegs; j++) { 457 if (i->Dst[j].Register.Dimension) { 458 if (ctx->type == PIPE_SHADER_TESS_CTRL) 459 continue; 460 R600_ERR("unsupported dst (dimension)\n"); 461 return -EINVAL; 462 } 463 } 464 return 0; 465} 466 467int eg_get_interpolator_index(unsigned interpolate, unsigned location) 468{ 469 if (interpolate == TGSI_INTERPOLATE_COLOR || 470 interpolate == TGSI_INTERPOLATE_LINEAR || 471 interpolate == TGSI_INTERPOLATE_PERSPECTIVE) 472 { 473 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR; 474 int loc; 475 476 switch(location) { 477 case TGSI_INTERPOLATE_LOC_CENTER: 478 loc = 1; 479 break; 480 case TGSI_INTERPOLATE_LOC_CENTROID: 481 loc = 2; 482 break; 483 case TGSI_INTERPOLATE_LOC_SAMPLE: 484 default: 485 loc = 0; break; 486 } 487 488 return is_linear * 3 + loc; 489 } 490 491 return -1; 492} 493 494static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx, 495 int input) 496{ 497 int i = eg_get_interpolator_index( 498 ctx->shader->input[input].interpolate, 499 ctx->shader->input[input].interpolate_location); 500 assert(i >= 0); 501 ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index; 502} 503 504static int evergreen_interp_alu(struct 
r600_shader_ctx *ctx, int input) 505{ 506 int i, r; 507 struct r600_bytecode_alu alu; 508 int gpr = 0, base_chan = 0; 509 int ij_index = ctx->shader->input[input].ij_index; 510 511 /* work out gpr and base_chan from index */ 512 gpr = ij_index / 2; 513 base_chan = (2 * (ij_index % 2)) + 1; 514 515 for (i = 0; i < 8; i++) { 516 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 517 518 if (i < 4) 519 alu.op = ALU_OP2_INTERP_ZW; 520 else 521 alu.op = ALU_OP2_INTERP_XY; 522 523 if ((i > 1) && (i < 6)) { 524 alu.dst.sel = ctx->shader->input[input].gpr; 525 alu.dst.write = 1; 526 } 527 528 alu.dst.chan = i % 4; 529 530 alu.src[0].sel = gpr; 531 alu.src[0].chan = (base_chan - (i % 2)); 532 533 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos; 534 535 alu.bank_swizzle_force = SQ_ALU_VEC_210; 536 if ((i % 4) == 3) 537 alu.last = 1; 538 r = r600_bytecode_add_alu(ctx->bc, &alu); 539 if (r) 540 return r; 541 } 542 return 0; 543} 544 545static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input) 546{ 547 int i, r; 548 struct r600_bytecode_alu alu; 549 550 for (i = 0; i < 4; i++) { 551 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 552 553 alu.op = ALU_OP1_INTERP_LOAD_P0; 554 555 alu.dst.sel = ctx->shader->input[input].gpr; 556 alu.dst.write = 1; 557 558 alu.dst.chan = i; 559 560 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos; 561 alu.src[0].chan = i; 562 563 if (i == 3) 564 alu.last = 1; 565 r = r600_bytecode_add_alu(ctx->bc, &alu); 566 if (r) 567 return r; 568 } 569 return 0; 570} 571 572/* 573 * Special export handling in shaders 574 * 575 * shader export ARRAY_BASE for EXPORT_POS: 576 * 60 is position 577 * 61 is misc vector 578 * 62, 63 are clip distance vectors 579 * 580 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL: 581 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61 582 * USE_VTX_POINT_SIZE - point size in the X channel of export 61 583 * 
USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61 584 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61 585 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61 586 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually 587 * exclusive from render target index) 588 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors 589 * 590 * 591 * shader export ARRAY_BASE for EXPORT_PIXEL: 592 * 0-7 CB targets 593 * 61 computed Z vector 594 * 595 * The use of the values exported in the computed Z vector are controlled 596 * by DB_SHADER_CONTROL: 597 * Z_EXPORT_ENABLE - Z as a float in RED 598 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN 599 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA 600 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE 601 * DB_SOURCE_FORMAT - export control restrictions 602 * 603 */ 604 605 606/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */ 607static int r600_spi_sid(struct r600_shader_io * io) 608{ 609 int index, name = io->name; 610 611 /* These params are handled differently, they don't need 612 * semantic indices, so we'll use 0 for them. 613 */ 614 if (name == TGSI_SEMANTIC_POSITION || 615 name == TGSI_SEMANTIC_PSIZE || 616 name == TGSI_SEMANTIC_EDGEFLAG || 617 name == TGSI_SEMANTIC_FACE || 618 name == TGSI_SEMANTIC_SAMPLEMASK) 619 index = 0; 620 else { 621 if (name == TGSI_SEMANTIC_GENERIC) { 622 /* For generic params simply use sid from tgsi */ 623 index = io->sid; 624 } else { 625 /* For non-generic params - pack name and sid into 8 bits */ 626 index = 0x80 | (name<<3) | (io->sid); 627 } 628 629 /* Make sure that all really used indices have nonzero value, so 630 * we can just compare it to 0 later instead of comparing the name 631 * with different values to detect special cases. 
*/ 632 index++; 633 } 634 635 return index; 636}; 637 638/* we need this to get a common lds index for vs/tcs/tes input/outputs */ 639int r600_get_lds_unique_index(unsigned semantic_name, unsigned index) 640{ 641 switch (semantic_name) { 642 case TGSI_SEMANTIC_POSITION: 643 return 0; 644 case TGSI_SEMANTIC_PSIZE: 645 return 1; 646 case TGSI_SEMANTIC_CLIPDIST: 647 assert(index <= 1); 648 return 2 + index; 649 case TGSI_SEMANTIC_GENERIC: 650 if (index <= 63-4) 651 return 4 + index - 9; 652 else 653 /* same explanation as in the default statement, 654 * the only user hitting this is st/nine. 655 */ 656 return 0; 657 658 /* patch indices are completely separate and thus start from 0 */ 659 case TGSI_SEMANTIC_TESSOUTER: 660 return 0; 661 case TGSI_SEMANTIC_TESSINNER: 662 return 1; 663 case TGSI_SEMANTIC_PATCH: 664 return 2 + index; 665 666 default: 667 /* Don't fail here. The result of this function is only used 668 * for LS, TCS, TES, and GS, where legacy GL semantics can't 669 * occur, but this function is called for all vertex shaders 670 * before it's known whether LS will be compiled or not. 
671 */ 672 return 0; 673 } 674} 675 676/* turn input into interpolate on EG */ 677static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index) 678{ 679 int r = 0; 680 681 if (ctx->shader->input[index].spi_sid) { 682 ctx->shader->input[index].lds_pos = ctx->shader->nlds++; 683 if (ctx->shader->input[index].interpolate > 0) { 684 evergreen_interp_assign_ij_index(ctx, index); 685 r = evergreen_interp_alu(ctx, index); 686 } else { 687 r = evergreen_interp_flat(ctx, index); 688 } 689 } 690 return r; 691} 692 693static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back) 694{ 695 struct r600_bytecode_alu alu; 696 int i, r; 697 int gpr_front = ctx->shader->input[front].gpr; 698 int gpr_back = ctx->shader->input[back].gpr; 699 700 for (i = 0; i < 4; i++) { 701 memset(&alu, 0, sizeof(alu)); 702 alu.op = ALU_OP3_CNDGT; 703 alu.is_op3 = 1; 704 alu.dst.write = 1; 705 alu.dst.sel = gpr_front; 706 alu.src[0].sel = ctx->face_gpr; 707 alu.src[1].sel = gpr_front; 708 alu.src[2].sel = gpr_back; 709 710 alu.dst.chan = i; 711 alu.src[1].chan = i; 712 alu.src[2].chan = i; 713 alu.last = (i==3); 714 715 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 716 return r; 717 } 718 719 return 0; 720} 721 722/* execute a single slot ALU calculation */ 723static int single_alu_op2(struct r600_shader_ctx *ctx, int op, 724 int dst_sel, int dst_chan, 725 int src0_sel, unsigned src0_chan_val, 726 int src1_sel, unsigned src1_chan_val) 727{ 728 struct r600_bytecode_alu alu; 729 int r, i; 730 731 if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) { 732 for (i = 0; i < 4; i++) { 733 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 734 alu.op = op; 735 alu.src[0].sel = src0_sel; 736 if (src0_sel == V_SQ_ALU_SRC_LITERAL) 737 alu.src[0].value = src0_chan_val; 738 else 739 alu.src[0].chan = src0_chan_val; 740 alu.src[1].sel = src1_sel; 741 if (src1_sel == V_SQ_ALU_SRC_LITERAL) 742 alu.src[1].value = src1_chan_val; 743 else 744 alu.src[1].chan = src1_chan_val; 
745 alu.dst.sel = dst_sel; 746 alu.dst.chan = i; 747 alu.dst.write = i == dst_chan; 748 alu.last = (i == 3); 749 r = r600_bytecode_add_alu(ctx->bc, &alu); 750 if (r) 751 return r; 752 } 753 return 0; 754 } 755 756 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 757 alu.op = op; 758 alu.src[0].sel = src0_sel; 759 if (src0_sel == V_SQ_ALU_SRC_LITERAL) 760 alu.src[0].value = src0_chan_val; 761 else 762 alu.src[0].chan = src0_chan_val; 763 alu.src[1].sel = src1_sel; 764 if (src1_sel == V_SQ_ALU_SRC_LITERAL) 765 alu.src[1].value = src1_chan_val; 766 else 767 alu.src[1].chan = src1_chan_val; 768 alu.dst.sel = dst_sel; 769 alu.dst.chan = dst_chan; 770 alu.dst.write = 1; 771 alu.last = 1; 772 r = r600_bytecode_add_alu(ctx->bc, &alu); 773 if (r) 774 return r; 775 return 0; 776} 777 778/* execute a single slot ALU calculation */ 779static int single_alu_op3(struct r600_shader_ctx *ctx, int op, 780 int dst_sel, int dst_chan, 781 int src0_sel, unsigned src0_chan_val, 782 int src1_sel, unsigned src1_chan_val, 783 int src2_sel, unsigned src2_chan_val) 784{ 785 struct r600_bytecode_alu alu; 786 int r; 787 788 /* validate this for other ops */ 789 assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT); 790 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 791 alu.op = op; 792 alu.src[0].sel = src0_sel; 793 if (src0_sel == V_SQ_ALU_SRC_LITERAL) 794 alu.src[0].value = src0_chan_val; 795 else 796 alu.src[0].chan = src0_chan_val; 797 alu.src[1].sel = src1_sel; 798 if (src1_sel == V_SQ_ALU_SRC_LITERAL) 799 alu.src[1].value = src1_chan_val; 800 else 801 alu.src[1].chan = src1_chan_val; 802 alu.src[2].sel = src2_sel; 803 if (src2_sel == V_SQ_ALU_SRC_LITERAL) 804 alu.src[2].value = src2_chan_val; 805 else 806 alu.src[2].chan = src2_chan_val; 807 alu.dst.sel = dst_sel; 808 alu.dst.chan = dst_chan; 809 alu.is_op3 = 1; 810 alu.last = 1; 811 r = r600_bytecode_add_alu(ctx->bc, &alu); 812 if (r) 813 return r; 814 return 0; 815} 816 817/* put it in temp_reg.x 
*/ 818static int get_lds_offset0(struct r600_shader_ctx *ctx, 819 int rel_patch_chan, 820 int temp_reg, bool is_patch_var) 821{ 822 int r; 823 824 /* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */ 825 /* ADD 826 Dimension - patch0_offset (input_vals.z), 827 Non-dim - patch0_data_offset (input_vals.w) 828 */ 829 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, 830 temp_reg, 0, 831 ctx->tess_output_info, 0, 832 0, rel_patch_chan, 833 ctx->tess_output_info, is_patch_var ? 3 : 2); 834 if (r) 835 return r; 836 return 0; 837} 838 839static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index) 840{ 841 return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg; 842} 843 844static int r600_get_temp(struct r600_shader_ctx *ctx) 845{ 846 return ctx->temp_reg + ctx->max_driver_temp_used++; 847} 848 849static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid) 850{ 851 int i; 852 i = ctx->shader->noutput++; 853 ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID; 854 ctx->shader->output[i].sid = 0; 855 ctx->shader->output[i].gpr = 0; 856 ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT; 857 ctx->shader->output[i].write_mask = 0x4; 858 ctx->shader->output[i].spi_sid = prim_id_sid; 859 860 return 0; 861} 862 863static int tgsi_barrier(struct r600_shader_ctx *ctx) 864{ 865 struct r600_bytecode_alu alu; 866 int r; 867 868 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 869 alu.op = ctx->inst_info->op; 870 alu.last = 1; 871 872 r = r600_bytecode_add_alu(ctx->bc, &alu); 873 if (r) 874 return r; 875 return 0; 876} 877 878static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed) 879{ 880 // pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays 881 unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY]; 882 unsigned narrays_left = n; 883 bool *spilled = ctx->spilled_arrays; // assumed calloc:ed 884 885 
*scratch_space_needed = 0; 886 while (*regno > 124 && narrays_left) { 887 unsigned i; 888 unsigned largest = 0; 889 unsigned largest_index = 0; 890 891 for (i = 0; i < n; i++) { 892 unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1; 893 if (!spilled[i] && size > largest) { 894 largest = size; 895 largest_index = i; 896 } 897 } 898 899 spilled[largest_index] = true; 900 *regno -= largest; 901 *scratch_space_needed += largest; 902 903 narrays_left --; 904 } 905 906 if (narrays_left == 0) { 907 ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY); 908 } 909} 910 911/* Take spilled temp arrays into account when translating tgsi register 912 * indexes into r600 gprs if spilled is false, or scratch array offset if 913 * spilled is true */ 914static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled) 915{ 916 unsigned i; 917 unsigned spilled_size = 0; 918 919 for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) { 920 if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) { 921 if (ctx->spilled_arrays[i]) { 922 /* vec4 index into spilled scratch memory */ 923 *spilled = true; 924 return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size; 925 } 926 else { 927 /* regular GPR array */ 928 *spilled = false; 929 return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY]; 930 } 931 } 932 933 if (tgsi_reg_index < ctx->array_infos[i].range.First) 934 break; 935 if (ctx->spilled_arrays[i]) { 936 spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1; 937 } 938 } 939 940 /* regular GPR index, minus the holes from spilled arrays */ 941 *spilled = false; 942 943 return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY]; 944} 945 946/* look up spill area base offset and array size for a spilled temp array */ 947static void 
get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, 948 unsigned *array_base, unsigned *array_size) 949{ 950 unsigned i; 951 unsigned offset = 0; 952 953 for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) { 954 if (ctx->spilled_arrays[i]) { 955 unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1; 956 957 if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) { 958 *array_base = offset; 959 *array_size = size - 1; /* hw counts from 1 */ 960 961 return; 962 } 963 964 offset += size; 965 } 966 } 967} 968 969static int tgsi_declaration(struct r600_shader_ctx *ctx) 970{ 971 struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration; 972 int r, i, j, count = d->Range.Last - d->Range.First + 1; 973 974 switch (d->Declaration.File) { 975 case TGSI_FILE_INPUT: 976 for (j = 0; j < count; j++) { 977 i = ctx->shader->ninput + j; 978 assert(i < ARRAY_SIZE(ctx->shader->input)); 979 ctx->shader->input[i].name = d->Semantic.Name; 980 ctx->shader->input[i].sid = d->Semantic.Index + j; 981 ctx->shader->input[i].interpolate = d->Interp.Interpolate; 982 ctx->shader->input[i].interpolate_location = d->Interp.Location; 983 ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j; 984 if (ctx->type == PIPE_SHADER_FRAGMENT) { 985 ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]); 986 switch (ctx->shader->input[i].name) { 987 case TGSI_SEMANTIC_FACE: 988 if (ctx->face_gpr != -1) 989 ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */ 990 else 991 ctx->face_gpr = ctx->shader->input[i].gpr; 992 break; 993 case TGSI_SEMANTIC_COLOR: 994 ctx->colors_used++; 995 break; 996 case TGSI_SEMANTIC_POSITION: 997 ctx->fragcoord_input = i; 998 break; 999 case TGSI_SEMANTIC_PRIMID: 1000 /* set this for now */ 1001 ctx->shader->gs_prim_id_input = true; 1002 
ctx->shader->ps_prim_id_input = i; 1003 break; 1004 } 1005 if (ctx->bc->chip_class >= EVERGREEN) { 1006 if ((r = evergreen_interp_input(ctx, i))) 1007 return r; 1008 } 1009 } else if (ctx->type == PIPE_SHADER_GEOMETRY) { 1010 /* FIXME probably skip inputs if they aren't passed in the ring */ 1011 ctx->shader->input[i].ring_offset = ctx->next_ring_offset; 1012 ctx->next_ring_offset += 16; 1013 if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID) 1014 ctx->shader->gs_prim_id_input = true; 1015 } 1016 } 1017 ctx->shader->ninput += count; 1018 break; 1019 case TGSI_FILE_OUTPUT: 1020 for (j = 0; j < count; j++) { 1021 i = ctx->shader->noutput + j; 1022 assert(i < ARRAY_SIZE(ctx->shader->output)); 1023 ctx->shader->output[i].name = d->Semantic.Name; 1024 ctx->shader->output[i].sid = d->Semantic.Index + j; 1025 ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j; 1026 ctx->shader->output[i].interpolate = d->Interp.Interpolate; 1027 ctx->shader->output[i].write_mask = d->Declaration.UsageMask; 1028 if (ctx->type == PIPE_SHADER_VERTEX || 1029 ctx->type == PIPE_SHADER_GEOMETRY || 1030 ctx->type == PIPE_SHADER_TESS_EVAL) { 1031 ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]); 1032 switch (d->Semantic.Name) { 1033 case TGSI_SEMANTIC_CLIPDIST: 1034 break; 1035 case TGSI_SEMANTIC_PSIZE: 1036 ctx->shader->vs_out_misc_write = 1; 1037 ctx->shader->vs_out_point_size = 1; 1038 break; 1039 case TGSI_SEMANTIC_EDGEFLAG: 1040 ctx->shader->vs_out_misc_write = 1; 1041 ctx->shader->vs_out_edgeflag = 1; 1042 ctx->edgeflag_output = i; 1043 break; 1044 case TGSI_SEMANTIC_VIEWPORT_INDEX: 1045 ctx->shader->vs_out_misc_write = 1; 1046 ctx->shader->vs_out_viewport = 1; 1047 break; 1048 case TGSI_SEMANTIC_LAYER: 1049 ctx->shader->vs_out_misc_write = 1; 1050 ctx->shader->vs_out_layer = 1; 1051 break; 1052 case TGSI_SEMANTIC_CLIPVERTEX: 1053 ctx->clip_vertex_write = TRUE; 1054 ctx->cv_output = i; 1055 break; 1056 } 1057 if (ctx->type == 
PIPE_SHADER_GEOMETRY) { 1058 ctx->gs_out_ring_offset += 16; 1059 } 1060 } else if (ctx->type == PIPE_SHADER_FRAGMENT) { 1061 switch (d->Semantic.Name) { 1062 case TGSI_SEMANTIC_COLOR: 1063 ctx->shader->nr_ps_max_color_exports++; 1064 break; 1065 } 1066 } 1067 } 1068 ctx->shader->noutput += count; 1069 break; 1070 case TGSI_FILE_TEMPORARY: 1071 if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) { 1072 if (d->Array.ArrayID) { 1073 bool spilled; 1074 unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx, 1075 d->Range.First, 1076 &spilled); 1077 1078 if (!spilled) { 1079 r600_add_gpr_array(ctx->shader, idx, 1080 d->Range.Last - d->Range.First + 1, 0x0F); 1081 } 1082 } 1083 } 1084 break; 1085 1086 case TGSI_FILE_CONSTANT: 1087 case TGSI_FILE_SAMPLER: 1088 case TGSI_FILE_SAMPLER_VIEW: 1089 case TGSI_FILE_ADDRESS: 1090 case TGSI_FILE_BUFFER: 1091 case TGSI_FILE_IMAGE: 1092 case TGSI_FILE_MEMORY: 1093 break; 1094 1095 case TGSI_FILE_HW_ATOMIC: 1096 i = ctx->shader->nhwatomic_ranges; 1097 ctx->shader->atomics[i].start = d->Range.First; 1098 ctx->shader->atomics[i].end = d->Range.Last; 1099 ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic; 1100 ctx->shader->atomics[i].array_id = d->Array.ArrayID; 1101 ctx->shader->atomics[i].buffer_id = d->Dim.Index2D; 1102 ctx->shader->nhwatomic_ranges++; 1103 ctx->shader->nhwatomic += count; 1104 break; 1105 1106 case TGSI_FILE_SYSTEM_VALUE: 1107 if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK || 1108 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID || 1109 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) { 1110 break; /* Already handled from allocate_system_value_inputs */ 1111 } else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) { 1112 break; 1113 } else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID) 1114 break; 1115 else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID) 1116 break; 1117 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER || 1118 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) { 1119 
int param = r600_get_lds_unique_index(d->Semantic.Name, 0); 1120 int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2; 1121 unsigned temp_reg = r600_get_temp(ctx); 1122 1123 r = get_lds_offset0(ctx, 2, temp_reg, true); 1124 if (r) 1125 return r; 1126 1127 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 1128 temp_reg, 0, 1129 temp_reg, 0, 1130 V_SQ_ALU_SRC_LITERAL, param * 16); 1131 if (r) 1132 return r; 1133 1134 do_lds_fetch_values(ctx, temp_reg, dreg, 0xf); 1135 } 1136 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) { 1137 /* MOV r1.x, r0.x; 1138 MOV r1.y, r0.y; 1139 */ 1140 for (i = 0; i < 2; i++) { 1141 struct r600_bytecode_alu alu; 1142 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1143 alu.op = ALU_OP1_MOV; 1144 alu.src[0].sel = 0; 1145 alu.src[0].chan = 0 + i; 1146 alu.dst.sel = 1; 1147 alu.dst.chan = 0 + i; 1148 alu.dst.write = 1; 1149 alu.last = (i == 1) ? 1 : 0; 1150 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 1151 return r; 1152 } 1153 /* ADD r1.z, 1.0f, -r0.x */ 1154 struct r600_bytecode_alu alu; 1155 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1156 alu.op = ALU_OP2_ADD; 1157 alu.src[0].sel = V_SQ_ALU_SRC_1; 1158 alu.src[1].sel = 1; 1159 alu.src[1].chan = 0; 1160 alu.src[1].neg = 1; 1161 alu.dst.sel = 1; 1162 alu.dst.chan = 2; 1163 alu.dst.write = 1; 1164 alu.last = 1; 1165 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 1166 return r; 1167 1168 /* ADD r1.z, r1.z, -r1.y */ 1169 alu.op = ALU_OP2_ADD; 1170 alu.src[0].sel = 1; 1171 alu.src[0].chan = 2; 1172 alu.src[1].sel = 1; 1173 alu.src[1].chan = 1; 1174 alu.src[1].neg = 1; 1175 alu.dst.sel = 1; 1176 alu.dst.chan = 2; 1177 alu.dst.write = 1; 1178 alu.last = 1; 1179 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 1180 return r; 1181 break; 1182 } 1183 break; 1184 default: 1185 R600_ERR("unsupported file %d declaration\n", d->Declaration.File); 1186 return -EINVAL; 1187 } 1188 return 0; 1189} 1190 1191static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int 
/* Scan the TGSI token stream for system values (SAMPLEMASK, SAMPLEID,
 * SAMPLEPOS) and interpolateAt* opcodes, and allocate GPRs for the system
 * values that are used.
 *
 * gpr_offset is the first free GPR; returns the next free GPR after the
 * ones allocated here.  As a side effect it fills in ctx->face_gpr /
 * ctx->fixed_pt_position_gpr, marks the required eg interpolators enabled,
 * and appends the system values to ctx->shader->input[].
 */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;                       /* where the allocated GPR index is stored */
		unsigned name, alternate_name;  /* TGSI semantics that trigger this input */
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int num_regs = 0;
	unsigned k, i;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				/* Map each interpolateAt* opcode to the
				 * interpolation location it needs. */
				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				/* Either semantic name (e.g. SAMPLEID or
				 * SAMPLEPOS) enables the same input slot. */
				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* NOTE(review): per-sample shading combined with a samplemask read
	 * also forces the fixed-point-position input on — presumably because
	 * the mask has to be related to the current sample; confirm. */
	if (ctx->info.reads_samplemask &&
	    (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
		inputs[1].enabled = true;
	}

	if (ctx->bc->chip_class >= EVERGREEN) {
		int num_baryc = 0;
		/* assign gpr to each interpolator according to priority */
		for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
			if (ctx->eg_interpolators[i].enabled) {
				ctx->eg_interpolators[i].ij_index = num_baryc;
				num_baryc++;
			}
		}
		/* Each pair of i/j barycentrics shares one GPR. */
		num_baryc = (num_baryc + 1) >> 1;
		gpr_offset += num_baryc;
	}

	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;
			ctx->shader->nsys_inputs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
/* sample_id == NULL means fetch for current sample: the sample index is
 * then taken from fixed_pt_position_gpr.w, which must already have been
 * allocated; otherwise chan_sel selects the channel of sample_id holding
 * the index.
 *
 * Emits a vertex fetch of the xyzw sample position from the
 * R600_BUFFER_INFO constant buffer and returns the temp GPR holding it
 * (or the nonzero error code from bytecode emission).
 */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		assert(ctx->fixed_pt_position_gpr != -1);

		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		/* Copy the selected channel of sample_id into t1.x so the
		 * vertex fetch can index with it. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	/* Fetch all four components into t1.xyzw as signed scaled floats. */
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 0;
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}
*/ 1412 vtx.dst_gpr = ctx->helper_invoc_reg; 1413 vtx.dst_sel_x = 4; 1414 vtx.dst_sel_y = 7; /* SEL_Y */ 1415 vtx.dst_sel_z = 7; /* SEL_Z */ 1416 vtx.dst_sel_w = 7; /* SEL_W */ 1417 vtx.data_format = FMT_32; 1418 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx))) 1419 return r; 1420 ctx->bc->cf_last->vpm = 1; 1421 return 0; 1422} 1423 1424static int cm_load_helper_invocation(struct r600_shader_ctx *ctx) 1425{ 1426 int r; 1427 struct r600_bytecode_alu alu; 1428 1429 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1430 alu.op = ALU_OP1_MOV; 1431 alu.dst.sel = ctx->helper_invoc_reg; 1432 alu.dst.chan = 0; 1433 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 1434 alu.src[0].value = 0xffffffff; 1435 alu.dst.write = 1; 1436 alu.last = 1; 1437 r = r600_bytecode_add_alu(ctx->bc, &alu); 1438 if (r) 1439 return r; 1440 1441 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1442 alu.op = ALU_OP1_MOV; 1443 alu.dst.sel = ctx->helper_invoc_reg; 1444 alu.dst.chan = 0; 1445 alu.src[0].sel = V_SQ_ALU_SRC_0; 1446 alu.dst.write = 1; 1447 alu.last = 1; 1448 r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE); 1449 if (r) 1450 return r; 1451 1452 return ctx->helper_invoc_reg; 1453} 1454 1455static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block) 1456{ 1457 struct r600_bytecode_vtx vtx; 1458 int r, t1; 1459 1460 if (ctx->cs_block_size_loaded) 1461 return ctx->cs_block_size_reg; 1462 if (ctx->cs_grid_size_loaded) 1463 return ctx->cs_grid_size_reg; 1464 1465 t1 = load_block ? 
ctx->cs_block_size_reg : ctx->cs_grid_size_reg; 1466 struct r600_bytecode_alu alu; 1467 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1468 alu.op = ALU_OP1_MOV; 1469 alu.src[0].sel = V_SQ_ALU_SRC_0; 1470 alu.dst.sel = t1; 1471 alu.dst.write = 1; 1472 alu.last = 1; 1473 r = r600_bytecode_add_alu(ctx->bc, &alu); 1474 if (r) 1475 return r; 1476 1477 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 1478 vtx.op = FETCH_OP_VFETCH; 1479 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER; 1480 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 1481 vtx.src_gpr = t1; 1482 vtx.src_sel_x = 0; 1483 1484 vtx.mega_fetch_count = 16; 1485 vtx.dst_gpr = t1; 1486 vtx.dst_sel_x = 0; 1487 vtx.dst_sel_y = 1; 1488 vtx.dst_sel_z = 2; 1489 vtx.dst_sel_w = 7; 1490 vtx.data_format = FMT_32_32_32_32; 1491 vtx.num_format_all = 1; 1492 vtx.format_comp_all = 0; 1493 vtx.use_const_fields = 0; 1494 vtx.offset = load_block ? 0 : 16; // first element is size of buffer 1495 vtx.endian = r600_endian_swap(32); 1496 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */ 1497 1498 r = r600_bytecode_add_vtx(ctx->bc, &vtx); 1499 if (r) 1500 return r; 1501 1502 if (load_block) 1503 ctx->cs_block_size_loaded = true; 1504 else 1505 ctx->cs_grid_size_loaded = true; 1506 return t1; 1507} 1508 1509static void tgsi_src(struct r600_shader_ctx *ctx, 1510 const struct tgsi_full_src_register *tgsi_src, 1511 struct r600_shader_src *r600_src) 1512{ 1513 memset(r600_src, 0, sizeof(*r600_src)); 1514 r600_src->swizzle[0] = tgsi_src->Register.SwizzleX; 1515 r600_src->swizzle[1] = tgsi_src->Register.SwizzleY; 1516 r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ; 1517 r600_src->swizzle[3] = tgsi_src->Register.SwizzleW; 1518 r600_src->neg = tgsi_src->Register.Negate; 1519 r600_src->abs = tgsi_src->Register.Absolute; 1520 1521 if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) { 1522 bool spilled; 1523 unsigned idx; 1524 1525 idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled); 1526 1527 if (spilled) { 1528 
/* Translate a TGSI source register into a r600_shader_src: copy the
 * swizzle/neg/abs modifiers, then resolve the register file —
 * temporaries (including spilled arrays read back from scratch),
 * immediates, system values, and plain file-offset registers.
 * Emits bytecode as needed (scratch reads, sample-position /
 * block-size fetches); errors from those emissions are silently
 * dropped since this function returns void.
 */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) {
		bool spilled;
		unsigned idx;

		idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled);

		if (spilled) {
			/* The temp array was spilled to scratch memory:
			 * read it back into a fresh temp GPR. */
			int reg = r600_get_temp(ctx);
			int r;

			r600_src->sel = reg;

			if (ctx->bc->chip_class < R700) {
				/* Pre-R700: scratch reads go through a
				 * MEM_SCRATCH CF export-read. */
				struct r600_bytecode_output cf;

				memset(&cf, 0, sizeof(struct r600_bytecode_output));
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.comp_mask = 0xF;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&cf.array_base, &cf.array_size);

				if (tgsi_src->Register.Indirect) {
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
					cf.index_gpr = ctx->bc->ar_reg;
				}
				else {
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ;
					cf.array_base += idx;
					cf.array_size = 0;
				}

				r = r600_bytecode_add_output(ctx->bc, &cf);
			}
			else {
				/* R700+: scratch reads use a dedicated
				 * READ_SCRATCH vertex fetch. */
				struct r600_bytecode_vtx vtx;

				/* Make sure any pending spill write has
				 * landed before reading it back. */
				if (r600_bytecode_get_need_wait_ack(ctx->bc)) {
					r600_bytecode_need_wait_ack(ctx->bc, false);
					r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
				}

				memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
				vtx.op = FETCH_OP_READ_SCRATCH;
				vtx.dst_gpr = reg;
				vtx.uncached = 1; // Must bypass cache since prior spill written in same invocation
				vtx.elem_size = 3;
				vtx.data_format = FMT_32_32_32_32;
				vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT;
				vtx.dst_sel_x = tgsi_src->Register.SwizzleX;
				vtx.dst_sel_y = tgsi_src->Register.SwizzleY;
				vtx.dst_sel_z = tgsi_src->Register.SwizzleZ;
				vtx.dst_sel_w = tgsi_src->Register.SwizzleW;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&vtx.array_base, &vtx.array_size);

				if (tgsi_src->Register.Indirect) {
					vtx.indexed = 1;
					vtx.src_gpr = ctx->bc->ar_reg;
				}
				else {
					vtx.array_base += idx;
					vtx.array_size = 0;
				}

				r = r600_bytecode_add_vtx(ctx->bc, &vtx);
			}

			if (r)
				return;
		}
		else {
			if (tgsi_src->Register.Indirect)
				r600_src->rel = V_SQ_REL_RELATIVE;

			r600_src->sel = idx;
		}

		return;
	}

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* A fully-replicated immediate may map to an inline
		 * hardware constant (0, 1, 0.5, ...) instead of a literal. */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		/* Map each system value to the GPR/channel the hardware (or
		 * earlier prolog code) placed it in — see the "Contents of
		 * r0 on entry" table at the top of this file. */
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
			r600_src->sel = 1;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* GS invocation id lives in r1.w ... */
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* ... while for TCS it is in r0.z (see the r0 table). */
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			r600_src->sel = ctx->tess_input_info;
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, false);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, true);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
			r600_src->sel = ctx->helper_invoc_reg;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		}
	} else {
		/* Any other file: plain register plus the file's GPR offset. */
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
/* Fetch a relatively-addressed constant (const buffer cb_idx, optionally
 * with an indexed buffer when cb_rel is set) into dst_reg.xyzw.
 *
 * The fetch index comes from AR-register channel ar_chan, plus an
 * optional immediate element offset.  Returns 0 on success or the
 * nonzero bytecode-emission error code.
 */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
		unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
		unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		struct r600_bytecode_alu alu;

		/* dst_reg.ar_chan = ar + offset, then use dst_reg as the
		 * index register for the fetch below. */
		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
/* Fetch one GS per-vertex input from the ESGS ring into dst_reg.xyzw.
 *
 * src->Dimension.Index selects the vertex; the ring offset for that
 * vertex was passed in by the hardware (see the comment below for the
 * register layout).  Handles both an indirect vertex index
 * (Dimension.Indirect) and an indirect attribute index
 * (Register.Indirect).  Returns 0 on success or the bytecode-emission
 * error code.
 */
static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
	int offset_chan = vtx_id % 3;
	int t2 = 0;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	/* Vertex 2's offset sits in R0.w, not R0.z (PrimitiveID lives there). */
	if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
		offset_chan = 3;

	if (src->Dimension.Indirect || src->Register.Indirect)
		t2 = r600_get_temp(ctx);

	if (src->Dimension.Indirect) {
		int treg[3];
		struct r600_bytecode_alu alu;
		int r, i;
		unsigned addr_reg;
		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
		/* Non-primary address registers must be moved into AR first. */
		if (src->DimIndirect.Index > 0) {
			r = single_alu_op2(ctx, ALU_OP1_MOV,
					   ctx->bc->ar_reg, 0,
					   addr_reg, 0,
					   0, 0);
			if (r)
				return r;
		}
		/*
		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
		   at least this is what fglrx seems to do. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = ctx->gs_rotated_input[0];
			alu.src[0].chan = i == 2 ? 3 : i;
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* Relative read treg[AR].x -> t2.x selects the offset for
		 * the dynamically-indexed vertex. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
	}

	if (src->Register.Indirect) {
		int addr_reg;
		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];

		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);

		/* pull the value from index_reg */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   t2, 1,
				   addr_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, first);
		if (r)
			return r;
		/* t2.x = (index + first) * 4 + base offset — element stride
		 * in the ring is 4 dwords (16 bytes). */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   t2, 0,
				   t2, 1,
				   V_SQ_ALU_SRC_LITERAL, 4,
				   offset_reg, offset_chan);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
		index = src->Register.Index - first;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	if (ctx->bc->chip_class >= EVERGREEN) {
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
vtx.offset = index * 16; /*bytes*/ 1877 vtx.mega_fetch_count = 16; 1878 vtx.dst_gpr = dst_reg; 1879 vtx.dst_sel_x = 0; /* SEL_X */ 1880 vtx.dst_sel_y = 1; /* SEL_Y */ 1881 vtx.dst_sel_z = 2; /* SEL_Z */ 1882 vtx.dst_sel_w = 3; /* SEL_W */ 1883 if (ctx->bc->chip_class >= EVERGREEN) { 1884 vtx.use_const_fields = 1; 1885 } else { 1886 vtx.data_format = FMT_32_32_32_32_FLOAT; 1887 } 1888 1889 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx))) 1890 return r; 1891 1892 return 0; 1893} 1894 1895static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx) 1896{ 1897 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 1898 unsigned i; 1899 1900 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 1901 struct tgsi_full_src_register *src = &inst->Src[i]; 1902 1903 if (src->Register.File == TGSI_FILE_INPUT) { 1904 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) { 1905 /* primitive id is in R0.z */ 1906 ctx->src[i].sel = 0; 1907 ctx->src[i].swizzle[0] = 2; 1908 } 1909 } 1910 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) { 1911 int treg = r600_get_temp(ctx); 1912 1913 fetch_gs_input(ctx, src, treg); 1914 ctx->src[i].sel = treg; 1915 ctx->src[i].rel = 0; 1916 } 1917 } 1918 return 0; 1919} 1920 1921 1922/* Tessellation shaders pass outputs to the next shader using LDS. 1923 * 1924 * LS outputs = TCS(HS) inputs 1925 * TCS(HS) outputs = TES(DS) inputs 1926 * 1927 * The LDS layout is: 1928 * - TCS inputs for patch 0 1929 * - TCS inputs for patch 1 1930 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2) 1931 * - ... 
/* Compute the LDS address of a TCS/TES input or output operand,
 * accumulating into temp_reg.x (which must already hold the base
 * address — the file comment above says "dw address", despite the
 * function name mentioning bytes; the units match whatever the caller
 * seeded temp_reg with).
 *
 * Exactly one of dst/src is non-NULL; stride_bytes_reg/chan give the
 * per-vertex stride used when the operand is two-dimensional.
 * Returns 0 on success or a bytecode-emission error code.
 */
static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
				 const struct tgsi_full_dst_register *dst,
				 const struct tgsi_full_src_register *src,
				 int stride_bytes_reg, int stride_bytes_chan)
{
	struct tgsi_full_dst_register reg;
	ubyte *name, *index, *array_first;
	int r;
	int param;
	struct tgsi_shader_info *info = &ctx->info;
	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		int sel, chan;
		if (reg.Dimension.Indirect) {
			unsigned addr_reg;
			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);

			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
			/* pull the value from index_reg */
			sel = addr_reg;
			chan = 0;
		} else {
			sel = V_SQ_ALU_SRC_LITERAL;
			chan = reg.Dimension.Index;
		}

		/* temp.x += stride * vertex_index */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   stride_bytes_reg, stride_bytes_chan,
				   sel, chan,
				   temp_reg, 0);
		if (r)
			return r;
	}

	/* Pick the semantic tables matching the operand's file. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return -1;
	}
	if (reg.Register.Indirect) {
		int addr_reg;
		int first;
		/* Add the relative address of the element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);

		/* pull the value from index_reg */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 16,
				   addr_reg, 0,
				   temp_reg, 0);
		if (r)
			return r;

		param = r600_get_lds_unique_index(name[first],
						  index[first]);

	} else {
		param = r600_get_lds_unique_index(name[reg.Register.Index],
						  index[reg.Register.Index]);
	}

	/* add to base_addr - passed in temp_reg.x */
	if (param) {
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;

	}
	return 0;
}
/* Read up to four dwords from LDS into dst_reg, one per channel set in
 * mask.  temp_reg.x holds the address of channel 0; addresses for the
 * other channels are derived at +4 bytes per channel.
 *
 * Reads are issued as LDS_READ_RET ops and then drained from the
 * LDS output queue (LDS_OQ_A_POP) in the same order.
 * Returns 0 on success or a bytecode-emission error code.
 */
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask)
{
	struct r600_bytecode_alu alu;
	int r, i, lasti;

	/* Start a new CF clause if the current one is close to full, so the
	 * read/pop pairs below stay together in one clause. */
	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
		ctx->bc->force_add_cf = 1;

	lasti = tgsi_last_instruction(mask);
	/* temp.i = temp.x + 4*i for every channel beyond x. */
	for (i = 1; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* emit an LDS_READ_RET */
		memset(&alu, 0, sizeof(alu));
		alu.op = LDS_OP1_LDS_READ_RET;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		alu.src[1].sel = V_SQ_ALU_SRC_0;
		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.is_lds_idx_op = true;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* then read from LDS_OQ_A_POP */
		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
		alu.src[0].chan = 0;
		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
LDS_OQ_A_POP */ 2089 memset(&alu, 0, sizeof(alu)); 2090 2091 alu.op = ALU_OP1_MOV; 2092 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP; 2093 alu.src[0].chan = 0; 2094 alu.dst.sel = dst_reg; 2095 alu.dst.chan = i; 2096 alu.dst.write = 1; 2097 alu.last = 1; 2098 r = r600_bytecode_add_alu(ctx->bc, &alu); 2099 if (r) 2100 return r; 2101 } 2102 return 0; 2103} 2104 2105static int fetch_mask(struct tgsi_src_register *reg) 2106{ 2107 int mask = 0; 2108 mask |= 1 << reg->SwizzleX; 2109 mask |= 1 << reg->SwizzleY; 2110 mask |= 1 << reg->SwizzleZ; 2111 mask |= 1 << reg->SwizzleW; 2112 return mask; 2113} 2114 2115static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg) 2116{ 2117 int r; 2118 unsigned temp_reg = r600_get_temp(ctx); 2119 2120 r = get_lds_offset0(ctx, 2, temp_reg, 2121 src->Register.Dimension ? false : true); 2122 if (r) 2123 return r; 2124 2125 /* the base address is now in temp.x */ 2126 r = r600_get_byte_address(ctx, temp_reg, 2127 NULL, src, ctx->tess_output_info, 1); 2128 if (r) 2129 return r; 2130 2131 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register)); 2132 if (r) 2133 return r; 2134 return 0; 2135} 2136 2137static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg) 2138{ 2139 int r; 2140 unsigned temp_reg = r600_get_temp(ctx); 2141 2142 /* t.x = ips * r0.y */ 2143 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24, 2144 temp_reg, 0, 2145 ctx->tess_input_info, 0, 2146 0, 1); 2147 2148 if (r) 2149 return r; 2150 2151 /* the base address is now in temp.x */ 2152 r = r600_get_byte_address(ctx, temp_reg, 2153 NULL, src, ctx->tess_input_info, 1); 2154 if (r) 2155 return r; 2156 2157 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register)); 2158 if (r) 2159 return r; 2160 return 0; 2161} 2162 2163static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg) 2164{ 
	int r;
	unsigned temp_reg = r600_get_temp(ctx);

	r = get_lds_offset0(ctx, 1, temp_reg,
			    src->Register.Dimension ? false : true);
	if (r)
		return r;
	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  NULL, src,
				  ctx->tess_output_info, 1);
	if (r)
		return r;

	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
	if (r)
		return r;
	return 0;
}

/* For the current instruction, replace every source operand that lives in
 * LDS (TES inputs, TCS inputs and TCS outputs) with a freshly fetched
 * temporary, rewriting ctx->src[i] to address it directly. */
static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned i;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		struct tgsi_full_src_register *src = &inst->Src[i];

		if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
			int treg = r600_get_temp(ctx);
			fetch_tes_input(ctx, src, treg);
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
		}
		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
			int treg = r600_get_temp(ctx);
			fetch_tcs_input(ctx, src, treg);
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
		}
		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
			int treg = r600_get_temp(ctx);
			fetch_tcs_output(ctx, src, treg);
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
		}
	}
	return 0;
}

/* Copy constant-file source operands into temporaries so that at most one
 * constant remains as a direct kcache operand per instruction
 * (NOTE(review): presumably an ALU-group kcache access limit — confirm).
 * Indirectly addressed constants are always fetched via
 * tgsi_fetch_rel_const(). */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* count constant sources while translating all sources */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* j counts the constants still to be split; the last one (j == 0)
	 * may stay in place */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			int treg = r600_get_temp(ctx);
			/* MOV all four channels into the temp */
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		}
	}
	return 0;
}

/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nliteral, r;

	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			nliteral++;
		}
	}
	/* like tgsi_split_constant(): the last literal (j == 0) may stay */
	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].value =
					ctx->src[i].value[k];
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			j--;
		}
	}
	return 0;
}

/* For every COLOR input of the shader, emit the front/back face select
 * against its paired back-color input (two-sided lighting). */
static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
{
	int i, r, count = ctx->shader->ninput;

	for (i = 0; i < count; i++) {
		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
			if (r)
				return r;
		}
	}
	return 0;
}

/* Emit MEM_STREAM exports that write the shader outputs listed in 'so' to
 * the bound stream-output (transform feedback) buffers.
 *
 * 'stream' restricts emission to one vertex stream (-1 = no filtering).
 * Returns 0 on success or a negative errno on invalid streamout state /
 * bytecode emission failure. */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
						  int stream, unsigned *stream_item_size UNUSED)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
	int j, r;
	unsigned i;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {

		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
		start_comp[i] = so->output[i].start_component;
		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = r600_get_temp(ctx);

			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			start_comp[i] = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		if (stream != -1 && stream != so->output[i].stream)
			continue;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components - 1;
		if (output.elem_size == 2)
			output.elem_size = 3; // 3 not supported, write 4 with junk at end
		output.array_base = so->output[i].dst_offset - start_comp[i];
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];

		if (ctx->bc->chip_class >= EVERGREEN) {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
			/* the per-stream opcodes are laid out in groups of 4
			 * buffers per stream */
			output.op += so->output[i].stream * 4;
			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
		} else {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
				break;
			}
			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}

/* Clamp the edge-flag output to [0,1] and convert it to an integer in
 * place (the VS writes it as a float; the hardware wants an int). */
static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	unsigned reg;

	if (!ctx->shader->vs_out_edgeflag)
		return;

	reg = ctx->shader->output[ctx->edgeflag_output].gpr;

	/* clamp(x, 0, 1) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = reg;
	alu.dst.sel = reg;
	alu.dst.write = 1;
	alu.dst.clamp = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx->bc, &alu);

	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP1_FLT_TO_INT;
	alu.src[0].sel = reg;
	alu.dst.sel = reg;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx->bc, &alu);
}

/* Build the VS-type "GS copy shader" that reads vertices the GS wrote to
 * the GSVS ring, performs stream-output, and exports position/parameters.
 * The generated shader is stored in gs->gs_copy_shader.
 * Returns the result of r600_bytecode_build(), or 0 if allocation of the
 * copy shader fails (NOTE(review): 0 == success is also returned then;
 * callers apparently do not distinguish — confirm). */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	unsigned ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int next_clip_pos = 61, next_param = 0;
	unsigned i, j;
	int ring;
	bool only_ring_0 =
		true;
	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x = R0.x & 0x3fffffff -- mask off the stream bits
	 * (co-issued with the shift below, hence no 'last' here) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30 -- extract the stream id */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	ctx.temp_reg = i + 1;
	/* one predicated block per active stream, highest ring first;
	 * stream 0 is always emitted */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				if (ring > 0)
					only_ring_0 = false;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* only export outputs written to stream 0 */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
			ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* the hardware requires at least one position export */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	/* ... and at least one parameter export */
	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}

/* Advance the per-stream GSVS ring write pointer by one vertex (only
 * needed for indirect ring writes). */
static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
{
	if (ind) {
		struct r600_bytecode_alu alu;
		int r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
		alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Write the current vertex's outputs to the ESGS/GSVS ring for 'stream'.
 * 'ind' selects indirect addressing via the export GPR; otherwise the
 * offset is computed from the current vertex index. */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct
			       pipe_stream_output_info *so UNUSED, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int ring_offset;
	unsigned i, k;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			if (ring_offset == -1)
				continue;
		} else {
			ring_offset = idx * 16;
			idx++;
		}

		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	++ctx->gs_next_vertex;
	return 0;
}


/* Fetch the tessellation layout constants (input/output vec4 descriptors
 * from R600_LDS_INFO_CONST_BUFFER) into the reserved tess_input_info /
 * tess_output_info GPRs via VFETCH, using temp_reg as a zero index. */
static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_vtx vtx;
	int temp_val = ctx->temp_reg;
	/* need to store the TCS output somewhere */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   temp_val, 0,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	/* used by VS/TCS */
	if (ctx->tess_input_info) {
		/* fetch tcs input values into resv space */
		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.data_format = FMT_32_32_32_32;
		vtx.num_format_all = 2;
		vtx.format_comp_all = 1;
		vtx.use_const_fields = 0;
		vtx.endian = r600_endian_swap(32);
		vtx.srf_mode_all = 1;
		vtx.offset = 0;
		vtx.dst_gpr = ctx->tess_input_info;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		vtx.src_gpr = temp_val;
		vtx.src_sel_x = 0;

		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
		if (r)
			return r;
	}

	/* used by TCS/TES */
	if (ctx->tess_output_info) {
		/* fetch tcs output values into resv space */
		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.data_format = FMT_32_32_32_32;
		vtx.num_format_all = 2;
		vtx.format_comp_all = 1;
		vtx.use_const_fields = 0;
		vtx.endian = r600_endian_swap(32);
		vtx.srf_mode_all = 1;
		vtx.offset = 16;
		vtx.dst_gpr = ctx->tess_output_info;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		vtx.src_gpr = temp_val;
		vtx.src_sel_x = 0;

		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
		if (r)
			return r;
	}
	return 0;
}

/* When the VS runs as LS, store all of its outputs into LDS so the TCS
 * can read them: address = vertexID * vertex_dw_stride (+ per-output
 * offset), written two dwords at a time with LDS_WRITE_REL. */
static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
{
	int j, r;
	int temp_reg;
	unsigned i;

	/* fetch tcs input values into input_vals */
	ctx->tess_input_info = r600_get_temp(ctx);
	ctx->tess_output_info = 0;
	r = r600_fetch_tess_io_info(ctx);
	if (r)
		return r;

	temp_reg = r600_get_temp(ctx);
	/* dst reg contains LDS address stride * idx */
	/* MUL vertexID, vertex_dw_stride */
	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
			   temp_reg, 0,
			   ctx->tess_input_info, 1,
			   0, 1); /* rel id in r0.y? */
	if (r)
		return r;

	for (i = 0; i < ctx->shader->noutput; i++) {
		struct r600_bytecode_alu alu;
		int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);

		/* temp.y = base (temp.x) + this output's byte offset */
		if (param) {
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 1,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;
		}

		/* temp.z = address of the second dword pair (+8 bytes) */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 2,
				   temp_reg, param ? 1 : 0,
				   V_SQ_ALU_SRC_LITERAL, 8);
		if (r)
			return r;


		/* two LDS_WRITE_REL ops: (x,y) then (z,w) */
		for (j = 0; j < 2; j++) {
			int chan = (j == 1) ? 2 : (param ?
						   1 : 0);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = chan;
			alu.src[1].sel = ctx->shader->output[i].gpr;
			alu.src[1].chan = j * 2;
			alu.src[2].sel = ctx->shader->output[i].gpr;
			alu.src[2].chan = (j * 2) + 1;
			alu.last = 1;
			alu.dst.chan = 0;
			alu.lds_idx = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}

/* Store the destination of the current TCS instruction into LDS.
 * Pairs of adjacent enabled channels (xy / zw) are written with one
 * LDS_WRITE_REL each; lone channels use a plain LDS_WRITE. */
static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
	int i, r, lasti;
	int temp_reg = r600_get_temp(ctx);
	struct r600_bytecode_alu alu;
	unsigned write_mask = dst->Register.WriteMask;

	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
		return 0;

	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
	if (r)
		return r;

	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
	if (r)
		return r;

	/* LDS write */
	lasti = tgsi_last_instruction(write_mask);
	/* per-channel addresses: temp.i = temp.x + 4 * i (chan 0 = base) */
	for (i = 1; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			/* xy or zw pair: one LDS_WRITE_REL covers both */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;

			alu.src[1].sel = dst->Register.Index;
			alu.src[1].sel += ctx->file_offset[dst->Register.File];
			alu.src[1].chan = i;

			alu.src[2].sel = dst->Register.Index;
			alu.src[2].sel += ctx->file_offset[dst->Register.File];
			alu.src[2].chan = i + 1;
			alu.lds_idx = 1;
			alu.dst.chan = 0;
			alu.last = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;
			continue;
		}
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;

		alu.src[1].sel = dst->Register.Index;
		alu.src[1].sel += ctx->file_offset[dst->Register.File];
		alu.src[1].chan = i;

		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.last = 1;
		alu.is_lds_idx_op = true;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Read 'nc' tess-factor components of output 'output_idx' (TESSINNER or
 * TESSOUTER) back from LDS into that output's GPR. */
static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
				 int output_idx, int nc)
{
	int param;
	unsigned temp_reg
		= r600_get_temp(ctx);
	unsigned name = ctx->shader->output[output_idx].name;
	int dreg = ctx->shader->output[output_idx].gpr;
	int r;

	param = r600_get_lds_unique_index(name, 0);
	r = get_lds_offset0(ctx, 1, temp_reg, true);
	if (r)
		return r;

	if (param) {
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;
	}

	do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
	return 0;
}

/* Emit the end-of-TCS tess-factor write: read TESSINNER/TESSOUTER back
 * from LDS and write them to the tess-factor buffer with TF_WRITE GDS
 * ops, executed by invocation 0 only (guarded by a predicated jump).
 * Returns 0 on success, -1 on missing required outputs or unknown
 * primitive mode. */
static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
{
	int stride, outer_comps, inner_comps;
	int tessinner_idx = -1, tessouter_idx = -1;
	int i, r;
	unsigned j;
	int temp_reg = r600_get_temp(ctx);
	int treg[3] = {-1, -1, -1};
	struct r600_bytecode_alu alu;
	struct r600_bytecode_cf *cf_jump, *cf_pop;

	/* only execute factor emission for invocation 0 */
	/* PRED_SETE_INT __, R0.z(InvocationID), 0 */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_PRED_SETE_INT;
	alu.src[0].chan = 2;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.execute_mask = 1;
	alu.update_pred = 1;
	alu.last = 1;
	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	cf_jump = ctx->bc->cf_last;

	treg[0] = r600_get_temp(ctx);
	switch (ctx->shader->tcs_prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 8; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 16; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		treg[1] = r600_get_temp(ctx);
		break;
	case PIPE_PRIM_QUADS:
		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		treg[1] = r600_get_temp(ctx);
		treg[2] = r600_get_temp(ctx);
		break;
	default:
		assert(0);
		return -1;
	}

	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
	/* TF_WRITE takes index in R.x, value in R.y */
	for (j = 0; j < ctx->shader->noutput; j++) {
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
			tessinner_idx = j;
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
			tessouter_idx = j;
	}

	if (tessouter_idx == -1)
		return -1;

	if (tessinner_idx == -1 && inner_comps)
		return -1;

	if (tessouter_idx != -1) {
		r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
		if (r)
			return r;
	}

	if (tessinner_idx != -1) {
		r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
		if (r)
			return r;
	}

	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
	/* r.x = relpatchid(r0.y) * tf_stride */

	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
	/* add incoming r0.w to it: t.x = t.x + r0.w */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   0, 1,
			   V_SQ_ALU_SRC_LITERAL, stride,
			   0, 3);
	if (r)
		return r;

	/* build (index, value) pairs: treg[i/2].{x,z} = tf buffer offset,
	 * treg[i/2].{y,w} = factor value */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
		int out_comp = i >= outer_comps ? i - outer_comps : i;

		/* isoline outer factors are stored swapped */
		if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
			if (out_comp == 1)
				out_comp = 0;
			else if (out_comp == 0)
				out_comp = 1;
		}

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   treg[i / 2], (2 * (i % 2)),
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg[i / 2], 1 + (2 * (i%2)),
				   ctx->shader->output[out_idx].gpr, out_comp,
				   0, 0);
		if (r)
			return r;
	}
	for (i = 0; i < outer_comps + inner_comps; i++) {
		struct r600_bytecode_gds gds;

		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
		gds.src_gpr = treg[i / 2];
		gds.src_sel_x = 2 * (i % 2);
		gds.src_sel_y = 1 + (2 * (i % 2));
		gds.src_sel_z = 4;
		gds.dst_sel_x = 7;
		gds.dst_sel_y = 7;
		gds.dst_sel_z = 7;
		gds.dst_sel_w = 7;
		gds.op = FETCH_OP_TF_WRITE;
		r = r600_bytecode_add_gds(ctx->bc, &gds);
		if (r)
			return r;
	}

	// Patch up jump label
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
	cf_pop = ctx->bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	return 0;
}

/*
 * We have to work out the thread ID for load and atomic
 * operations, which store the returned value to an index
 * in an intermediate buffer.
 * The index is calculated by taking the thread id,
 * calculated from the MBCNT instructions.
 * Then the shader engine ID is multiplied by 256,
 * and the wave id is added.
 * Then the result is multipled by 64 and thread id is
 * added.
 */
static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* temp.x = lane id within the wave (MBCNT of an all-ones mask) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MBCNT_32HI_INT;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 1;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = SE_ID * 256 + HW_WAVE_ID */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD_UINT24;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 2;
	alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 256;
	alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
	alu.dst.write = 1;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* thread_id_gpr.y = temp.z * 64 + temp.x (global thread id) */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   ctx->thread_id_gpr, 1,
			   ctx->temp_reg, 2,
			   V_SQ_ALU_SRC_LITERAL, 0x40,
			   ctx->temp_reg, 0);
	if (r)
		return r;
	return 0;
}

static int r600_shader_from_tgsi(struct r600_context *rctx,
				 struct r600_pipe_shader *pipeshader,
				 union r600_shader_key key)
{
	struct r600_screen *rscreen = rctx->screen;
	struct r600_shader *shader = &pipeshader->shader;
	struct tgsi_token *tokens = pipeshader->selector->tokens;
	struct pipe_stream_output_info so = pipeshader->selector->so;
	struct tgsi_full_immediate *immediate;
	struct r600_shader_ctx ctx;
	struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
	unsigned output_done, noutput;
	unsigned opcode;
	int j, k, r = 0;
	unsigned i;
	int next_param_base = 0, next_clip_base;
	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
	bool indirect_gprs;
	bool ring_outputs = false;
	bool lds_outputs = false;
	bool lds_inputs = false;
	bool pos_emitted = false;

	ctx.bc = &shader->bc;
	ctx.shader = shader;

	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
			   rscreen->has_compressed_msaa_texturing);
	ctx.tokens = tokens;
	tgsi_scan_shader(tokens, &ctx.info);
	shader->indirect_files = ctx.info.indirect_files;

	int narrays = ctx.info.array_max[TGSI_FILE_TEMPORARY];
	ctx.array_infos = calloc(narrays, sizeof(*ctx.array_infos));
	ctx.spilled_arrays = calloc(narrays, sizeof(bool));
	tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, narrays, ctx.array_infos);

	shader->uses_helper_invocation = false;
	shader->uses_doubles = ctx.info.uses_doubles;
	shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
	shader->nsys_inputs = 0;

	shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
		ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
	tgsi_parse_init(&ctx.parse, tokens);
	ctx.type = ctx.info.processor;
	shader->processor_type = ctx.type;
	ctx.bc->type = shader->processor_type;

	switch (ctx.type) {
	case PIPE_SHADER_VERTEX:
		shader->vs_as_gs_a = key.vs.as_gs_a;
		shader->vs_as_es = key.vs.as_es;
		shader->vs_as_ls = key.vs.as_ls;
		shader->atomic_base = key.vs.first_atomic_counter;
		if (shader->vs_as_es)
			ring_outputs = true;
		if (shader->vs_as_ls)
			lds_outputs = true;
		break;
	case PIPE_SHADER_GEOMETRY:
		ring_outputs = true;
		shader->atomic_base =
key.gs.first_atomic_counter; 3395 shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix; 3396 break; 3397 case PIPE_SHADER_TESS_CTRL: 3398 shader->tcs_prim_mode = key.tcs.prim_mode; 3399 shader->atomic_base = key.tcs.first_atomic_counter; 3400 lds_outputs = true; 3401 lds_inputs = true; 3402 break; 3403 case PIPE_SHADER_TESS_EVAL: 3404 shader->tes_as_es = key.tes.as_es; 3405 shader->atomic_base = key.tes.first_atomic_counter; 3406 lds_inputs = true; 3407 if (shader->tes_as_es) 3408 ring_outputs = true; 3409 break; 3410 case PIPE_SHADER_FRAGMENT: 3411 shader->two_side = key.ps.color_two_side; 3412 shader->atomic_base = key.ps.first_atomic_counter; 3413 shader->rat_base = key.ps.nr_cbufs; 3414 shader->image_size_const_offset = key.ps.image_size_const_offset; 3415 break; 3416 case PIPE_SHADER_COMPUTE: 3417 shader->rat_base = 0; 3418 shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER]; 3419 break; 3420 default: 3421 break; 3422 } 3423 3424 if (shader->vs_as_es || shader->tes_as_es) { 3425 ctx.gs_for_vs = &rctx->gs_shader->current->shader; 3426 } else { 3427 ctx.gs_for_vs = NULL; 3428 } 3429 3430 ctx.next_ring_offset = 0; 3431 ctx.gs_out_ring_offset = 0; 3432 ctx.gs_next_vertex = 0; 3433 ctx.gs_stream_output_info = &so; 3434 3435 ctx.thread_id_gpr = -1; 3436 ctx.face_gpr = -1; 3437 ctx.fixed_pt_position_gpr = -1; 3438 ctx.fragcoord_input = -1; 3439 ctx.colors_used = 0; 3440 ctx.clip_vertex_write = 0; 3441 3442 ctx.helper_invoc_reg = -1; 3443 ctx.cs_block_size_reg = -1; 3444 ctx.cs_grid_size_reg = -1; 3445 ctx.cs_block_size_loaded = false; 3446 ctx.cs_grid_size_loaded = false; 3447 3448 shader->nr_ps_color_exports = 0; 3449 shader->nr_ps_max_color_exports = 0; 3450 3451 3452 /* register allocations */ 3453 /* Values [0,127] correspond to GPR[0..127]. 3454 * Values [128,159] correspond to constant buffer bank 0 3455 * Values [160,191] correspond to constant buffer bank 1 3456 * Values [256,511] correspond to cfile constants c[0..255]. 
(Gone on EG) 3457 * Values [256,287] correspond to constant buffer bank 2 (EG) 3458 * Values [288,319] correspond to constant buffer bank 3 (EG) 3459 * Other special values are shown in the list below. 3460 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+) 3461 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+) 3462 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+) 3463 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+) 3464 * 248 SQ_ALU_SRC_0: special constant 0.0. 3465 * 249 SQ_ALU_SRC_1: special constant 1.0 float. 3466 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer. 3467 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer. 3468 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float. 3469 * 253 SQ_ALU_SRC_LITERAL: literal constant. 3470 * 254 SQ_ALU_SRC_PV: previous vector result. 3471 * 255 SQ_ALU_SRC_PS: previous scalar result. 3472 */ 3473 for (i = 0; i < TGSI_FILE_COUNT; i++) { 3474 ctx.file_offset[i] = 0; 3475 } 3476 3477 if (ctx.type == PIPE_SHADER_VERTEX) { 3478 3479 ctx.file_offset[TGSI_FILE_INPUT] = 1; 3480 if (ctx.info.num_inputs) 3481 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS); 3482 } 3483 if (ctx.type == PIPE_SHADER_FRAGMENT) { 3484 if (ctx.bc->chip_class >= EVERGREEN) 3485 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx); 3486 else 3487 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]); 3488 3489 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { 3490 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) { 3491 ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++; 3492 shader->uses_helper_invocation = true; 3493 } 3494 } 3495 } 3496 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3497 /* FIXME 1 would be enough in some cases (3 or less input vertices) */ 3498 ctx.file_offset[TGSI_FILE_INPUT] = 2; 3499 } 3500 if (ctx.type == PIPE_SHADER_TESS_CTRL) 3501 
ctx.file_offset[TGSI_FILE_INPUT] = 1; 3502 if (ctx.type == PIPE_SHADER_TESS_EVAL) { 3503 bool add_tesscoord = false, add_tess_inout = false; 3504 ctx.file_offset[TGSI_FILE_INPUT] = 1; 3505 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { 3506 /* if we have tesscoord save one reg */ 3507 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD) 3508 add_tesscoord = true; 3509 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER || 3510 ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER) 3511 add_tess_inout = true; 3512 } 3513 if (add_tesscoord || add_tess_inout) 3514 ctx.file_offset[TGSI_FILE_INPUT]++; 3515 if (add_tess_inout) 3516 ctx.file_offset[TGSI_FILE_INPUT]+=2; 3517 } 3518 if (ctx.type == PIPE_SHADER_COMPUTE) { 3519 ctx.file_offset[TGSI_FILE_INPUT] = 2; 3520 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { 3521 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE) 3522 ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++; 3523 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE) 3524 ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++; 3525 } 3526 } 3527 3528 ctx.file_offset[TGSI_FILE_OUTPUT] = 3529 ctx.file_offset[TGSI_FILE_INPUT] + 3530 ctx.info.file_max[TGSI_FILE_INPUT] + 1; 3531 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] + 3532 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1; 3533 3534 /* Outside the GPR range. This will be translated to one of the 3535 * kcache banks later. 
*/ 3536 ctx.file_offset[TGSI_FILE_CONSTANT] = 512; 3537 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL; 3538 3539 pipeshader->scratch_space_needed = 0; 3540 int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] + 3541 ctx.info.file_max[TGSI_FILE_TEMPORARY]; 3542 if (regno > 124) { 3543 choose_spill_arrays(&ctx, ®no, &pipeshader->scratch_space_needed); 3544 shader->indirect_files = ctx.info.indirect_files; 3545 } 3546 shader->needs_scratch_space = pipeshader->scratch_space_needed != 0; 3547 3548 ctx.bc->ar_reg = ++regno; 3549 ctx.bc->index_reg[0] = ++regno; 3550 ctx.bc->index_reg[1] = ++regno; 3551 3552 if (ctx.type == PIPE_SHADER_TESS_CTRL) { 3553 ctx.tess_input_info = ++regno; 3554 ctx.tess_output_info = ++regno; 3555 } else if (ctx.type == PIPE_SHADER_TESS_EVAL) { 3556 ctx.tess_input_info = ++regno; 3557 ctx.tess_output_info = ++regno; 3558 } else if (ctx.type == PIPE_SHADER_GEOMETRY) { 3559 ctx.gs_export_gpr_tregs[0] = ++regno; 3560 ctx.gs_export_gpr_tregs[1] = ++regno; 3561 ctx.gs_export_gpr_tregs[2] = ++regno; 3562 ctx.gs_export_gpr_tregs[3] = ++regno; 3563 if (ctx.shader->gs_tri_strip_adj_fix) { 3564 ctx.gs_rotated_input[0] = ++regno; 3565 ctx.gs_rotated_input[1] = ++regno; 3566 } else { 3567 ctx.gs_rotated_input[0] = 0; 3568 ctx.gs_rotated_input[1] = 1; 3569 } 3570 } 3571 3572 if (shader->uses_images) { 3573 ctx.thread_id_gpr = ++regno; 3574 } 3575 ctx.temp_reg = ++regno; 3576 3577 shader->max_arrays = 0; 3578 shader->num_arrays = 0; 3579 if (indirect_gprs) { 3580 3581 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) { 3582 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT], 3583 ctx.file_offset[TGSI_FILE_OUTPUT] - 3584 ctx.file_offset[TGSI_FILE_INPUT], 3585 0x0F); 3586 } 3587 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) { 3588 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT], 3589 ctx.file_offset[TGSI_FILE_TEMPORARY] - 3590 ctx.file_offset[TGSI_FILE_OUTPUT], 3591 0x0F); 3592 } 3593 } 3594 3595 ctx.nliterals = 0; 
3596 ctx.literals = NULL; 3597 ctx.max_driver_temp_used = 0; 3598 3599 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && 3600 ctx.info.colors_written == 1; 3601 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; 3602 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]; 3603 3604 if (ctx.type == PIPE_SHADER_VERTEX || 3605 ctx.type == PIPE_SHADER_GEOMETRY || 3606 ctx.type == PIPE_SHADER_TESS_EVAL) { 3607 shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] + 3608 ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1; 3609 shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1; 3610 shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]; 3611 } 3612 3613 if (shader->vs_as_gs_a) 3614 vs_add_primid_output(&ctx, key.vs.prim_id_out); 3615 3616 if (ctx.thread_id_gpr != -1) { 3617 r = load_thread_id_gpr(&ctx); 3618 if (r) 3619 return r; 3620 } 3621 3622 if (ctx.type == PIPE_SHADER_TESS_EVAL) 3623 r600_fetch_tess_io_info(&ctx); 3624 3625 while (!tgsi_parse_end_of_tokens(&ctx.parse)) { 3626 tgsi_parse_token(&ctx.parse); 3627 switch (ctx.parse.FullToken.Token.Type) { 3628 case TGSI_TOKEN_TYPE_IMMEDIATE: 3629 immediate = &ctx.parse.FullToken.FullImmediate; 3630 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16); 3631 if(ctx.literals == NULL) { 3632 r = -ENOMEM; 3633 goto out_err; 3634 } 3635 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint; 3636 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint; 3637 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint; 3638 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint; 3639 ctx.nliterals++; 3640 break; 3641 case TGSI_TOKEN_TYPE_DECLARATION: 3642 r = tgsi_declaration(&ctx); 3643 if (r) 3644 goto 
out_err; 3645 break; 3646 case TGSI_TOKEN_TYPE_INSTRUCTION: 3647 case TGSI_TOKEN_TYPE_PROPERTY: 3648 break; 3649 default: 3650 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type); 3651 r = -EINVAL; 3652 goto out_err; 3653 } 3654 } 3655 3656 shader->ring_item_sizes[0] = ctx.next_ring_offset; 3657 shader->ring_item_sizes[1] = 0; 3658 shader->ring_item_sizes[2] = 0; 3659 shader->ring_item_sizes[3] = 0; 3660 3661 /* Process two side if needed */ 3662 if (shader->two_side && ctx.colors_used) { 3663 int i, count = ctx.shader->ninput; 3664 unsigned next_lds_loc = ctx.shader->nlds; 3665 3666 /* additional inputs will be allocated right after the existing inputs, 3667 * we won't need them after the color selection, so we don't need to 3668 * reserve these gprs for the rest of the shader code and to adjust 3669 * output offsets etc. */ 3670 int gpr = ctx.file_offset[TGSI_FILE_INPUT] + 3671 ctx.info.file_max[TGSI_FILE_INPUT] + 1; 3672 3673 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */ 3674 if (ctx.face_gpr == -1) { 3675 i = ctx.shader->ninput++; 3676 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE; 3677 ctx.shader->input[i].spi_sid = 0; 3678 ctx.shader->input[i].gpr = gpr++; 3679 ctx.face_gpr = ctx.shader->input[i].gpr; 3680 } 3681 3682 for (i = 0; i < count; i++) { 3683 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) { 3684 int ni = ctx.shader->ninput++; 3685 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io)); 3686 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR; 3687 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]); 3688 ctx.shader->input[ni].gpr = gpr++; 3689 // TGSI to LLVM needs to know the lds position of inputs. 
3690 // Non LLVM path computes it later (in process_twoside_color) 3691 ctx.shader->input[ni].lds_pos = next_lds_loc++; 3692 ctx.shader->input[i].back_color_input = ni; 3693 if (ctx.bc->chip_class >= EVERGREEN) { 3694 if ((r = evergreen_interp_input(&ctx, ni))) 3695 return r; 3696 } 3697 } 3698 } 3699 } 3700 3701 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN) 3702 shader->nr_ps_max_color_exports = 8; 3703 3704 if (ctx.shader->uses_helper_invocation) { 3705 if (ctx.bc->chip_class == CAYMAN) 3706 r = cm_load_helper_invocation(&ctx); 3707 else 3708 r = eg_load_helper_invocation(&ctx); 3709 if (r) 3710 return r; 3711 } 3712 3713 /* 3714 * XXX this relies on fixed_pt_position_gpr only being present when 3715 * this shader should be executed per sample. Should be the case for now... 3716 */ 3717 if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) { 3718 /* 3719 * Fix up sample mask. The hw always gives us coverage mask for 3720 * the pixel. However, for per-sample shading, we need the 3721 * coverage for the shader invocation only. 3722 * Also, with disabled msaa, only the first bit should be set 3723 * (luckily the same fixup works for both problems). 3724 * For now, we can only do it if we know this shader is always 3725 * executed per sample (due to usage of bits in the shader 3726 * forcing per-sample execution). 3727 * If the fb is not multisampled, we'd do unnecessary work but 3728 * it should still be correct. 3729 * It will however do nothing for sample shading according 3730 * to MinSampleShading. 
3731 */ 3732 struct r600_bytecode_alu alu; 3733 int tmp = r600_get_temp(&ctx); 3734 assert(ctx.face_gpr != -1); 3735 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3736 3737 alu.op = ALU_OP2_LSHL_INT; 3738 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 3739 alu.src[0].value = 0x1; 3740 alu.src[1].sel = ctx.fixed_pt_position_gpr; 3741 alu.src[1].chan = 3; 3742 alu.dst.sel = tmp; 3743 alu.dst.chan = 0; 3744 alu.dst.write = 1; 3745 alu.last = 1; 3746 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3747 return r; 3748 3749 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3750 alu.op = ALU_OP2_AND_INT; 3751 alu.src[0].sel = tmp; 3752 alu.src[1].sel = ctx.face_gpr; 3753 alu.src[1].chan = 2; 3754 alu.dst.sel = ctx.face_gpr; 3755 alu.dst.chan = 2; 3756 alu.dst.write = 1; 3757 alu.last = 1; 3758 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3759 return r; 3760 } 3761 3762 if (ctx.fragcoord_input >= 0) { 3763 if (ctx.bc->chip_class == CAYMAN) { 3764 for (j = 0 ; j < 4; j++) { 3765 struct r600_bytecode_alu alu; 3766 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3767 alu.op = ALU_OP1_RECIP_IEEE; 3768 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; 3769 alu.src[0].chan = 3; 3770 3771 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; 3772 alu.dst.chan = j; 3773 alu.dst.write = (j == 3); 3774 alu.last = (j == 3); 3775 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3776 return r; 3777 } 3778 } else { 3779 struct r600_bytecode_alu alu; 3780 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3781 alu.op = ALU_OP1_RECIP_IEEE; 3782 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; 3783 alu.src[0].chan = 3; 3784 3785 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; 3786 alu.dst.chan = 3; 3787 alu.dst.write = 1; 3788 alu.last = 1; 3789 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3790 return r; 3791 } 3792 } 3793 3794 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3795 struct r600_bytecode_alu alu; 3796 int r; 3797 3798 /* GS thread with no output workaround - 
emit a cut at start of GS */ 3799 if (ctx.bc->chip_class == R600) 3800 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX); 3801 3802 for (j = 0; j < 4; j++) { 3803 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3804 alu.op = ALU_OP1_MOV; 3805 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 3806 alu.src[0].value = 0; 3807 alu.dst.sel = ctx.gs_export_gpr_tregs[j]; 3808 alu.dst.write = 1; 3809 alu.last = 1; 3810 r = r600_bytecode_add_alu(ctx.bc, &alu); 3811 if (r) 3812 return r; 3813 } 3814 3815 if (ctx.shader->gs_tri_strip_adj_fix) { 3816 r = single_alu_op2(&ctx, ALU_OP2_AND_INT, 3817 ctx.gs_rotated_input[0], 2, 3818 0, 2, 3819 V_SQ_ALU_SRC_LITERAL, 1); 3820 if (r) 3821 return r; 3822 3823 for (i = 0; i < 6; i++) { 3824 int rotated = (i + 4) % 6; 3825 int offset_reg = i / 3; 3826 int offset_chan = i % 3; 3827 int rotated_offset_reg = rotated / 3; 3828 int rotated_offset_chan = rotated % 3; 3829 3830 if (offset_reg == 0 && offset_chan == 2) 3831 offset_chan = 3; 3832 if (rotated_offset_reg == 0 && rotated_offset_chan == 2) 3833 rotated_offset_chan = 3; 3834 3835 r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT, 3836 ctx.gs_rotated_input[offset_reg], offset_chan, 3837 ctx.gs_rotated_input[0], 2, 3838 offset_reg, offset_chan, 3839 rotated_offset_reg, rotated_offset_chan); 3840 if (r) 3841 return r; 3842 } 3843 } 3844 } 3845 3846 if (ctx.type == PIPE_SHADER_TESS_CTRL) 3847 r600_fetch_tess_io_info(&ctx); 3848 3849 if (shader->two_side && ctx.colors_used) { 3850 if ((r = process_twoside_color_inputs(&ctx))) 3851 return r; 3852 } 3853 3854 tgsi_parse_init(&ctx.parse, tokens); 3855 while (!tgsi_parse_end_of_tokens(&ctx.parse)) { 3856 tgsi_parse_token(&ctx.parse); 3857 switch (ctx.parse.FullToken.Token.Type) { 3858 case TGSI_TOKEN_TYPE_INSTRUCTION: 3859 r = tgsi_is_supported(&ctx); 3860 if (r) 3861 goto out_err; 3862 ctx.max_driver_temp_used = 0; 3863 /* reserve first tmp for everyone */ 3864 r600_get_temp(&ctx); 3865 3866 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode; 
3867 if ((r = tgsi_split_constant(&ctx))) 3868 goto out_err; 3869 if ((r = tgsi_split_literal_constant(&ctx))) 3870 goto out_err; 3871 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3872 if ((r = tgsi_split_gs_inputs(&ctx))) 3873 goto out_err; 3874 } else if (lds_inputs) { 3875 if ((r = tgsi_split_lds_inputs(&ctx))) 3876 goto out_err; 3877 } 3878 if (ctx.bc->chip_class == CAYMAN) 3879 ctx.inst_info = &cm_shader_tgsi_instruction[opcode]; 3880 else if (ctx.bc->chip_class >= EVERGREEN) 3881 ctx.inst_info = &eg_shader_tgsi_instruction[opcode]; 3882 else 3883 ctx.inst_info = &r600_shader_tgsi_instruction[opcode]; 3884 3885 ctx.bc->precise |= ctx.parse.FullToken.FullInstruction.Instruction.Precise; 3886 3887 r = ctx.inst_info->process(&ctx); 3888 if (r) 3889 goto out_err; 3890 3891 if (ctx.type == PIPE_SHADER_TESS_CTRL) { 3892 r = r600_store_tcs_output(&ctx); 3893 if (r) 3894 goto out_err; 3895 } 3896 break; 3897 default: 3898 break; 3899 } 3900 } 3901 3902 /* Reset the temporary register counter. */ 3903 ctx.max_driver_temp_used = 0; 3904 3905 noutput = shader->noutput; 3906 3907 if (!ring_outputs && ctx.clip_vertex_write) { 3908 unsigned clipdist_temp[2]; 3909 3910 clipdist_temp[0] = r600_get_temp(&ctx); 3911 clipdist_temp[1] = r600_get_temp(&ctx); 3912 3913 /* need to convert a clipvertex write into clipdistance writes and not export 3914 the clip vertex anymore */ 3915 3916 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io)); 3917 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST; 3918 shader->output[noutput].gpr = clipdist_temp[0]; 3919 noutput++; 3920 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST; 3921 shader->output[noutput].gpr = clipdist_temp[1]; 3922 noutput++; 3923 3924 /* reset spi_sid for clipvertex output to avoid confusing spi */ 3925 shader->output[ctx.cv_output].spi_sid = 0; 3926 3927 shader->clip_dist_write = 0xFF; 3928 shader->cc_dist_mask = 0xFF; 3929 3930 for (i = 0; i < 8; i++) { 3931 int oreg = i >> 2; 3932 int ochan = i & 3; 
3933 3934 for (j = 0; j < 4; j++) { 3935 struct r600_bytecode_alu alu; 3936 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3937 alu.op = ALU_OP2_DOT4; 3938 alu.src[0].sel = shader->output[ctx.cv_output].gpr; 3939 alu.src[0].chan = j; 3940 3941 alu.src[1].sel = 512 + i; 3942 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 3943 alu.src[1].chan = j; 3944 3945 alu.dst.sel = clipdist_temp[oreg]; 3946 alu.dst.chan = j; 3947 alu.dst.write = (j == ochan); 3948 if (j == 3) 3949 alu.last = 1; 3950 r = r600_bytecode_add_alu(ctx.bc, &alu); 3951 if (r) 3952 return r; 3953 } 3954 } 3955 } 3956 3957 /* Add stream outputs. */ 3958 if (so.num_outputs) { 3959 bool emit = false; 3960 if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX) 3961 emit = true; 3962 if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL) 3963 emit = true; 3964 if (emit) 3965 emit_streamout(&ctx, &so, -1, NULL); 3966 } 3967 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask; 3968 convert_edgeflag_to_int(&ctx); 3969 3970 if (ctx.type == PIPE_SHADER_TESS_CTRL) 3971 r600_emit_tess_factor(&ctx); 3972 3973 if (lds_outputs) { 3974 if (ctx.type == PIPE_SHADER_VERTEX) { 3975 if (ctx.shader->noutput) 3976 emit_lds_vs_writes(&ctx); 3977 } 3978 } else if (ring_outputs) { 3979 if (shader->vs_as_es || shader->tes_as_es) { 3980 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx); 3981 ctx.gs_export_gpr_tregs[1] = -1; 3982 ctx.gs_export_gpr_tregs[2] = -1; 3983 ctx.gs_export_gpr_tregs[3] = -1; 3984 3985 emit_gs_ring_writes(&ctx, &so, -1, FALSE); 3986 } 3987 } else { 3988 /* Export output */ 3989 next_clip_base = shader->vs_out_misc_write ? 
62 : 61; 3990 3991 for (i = 0, j = 0; i < noutput; i++, j++) { 3992 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 3993 output[j].gpr = shader->output[i].gpr; 3994 output[j].elem_size = 3; 3995 output[j].swizzle_x = 0; 3996 output[j].swizzle_y = 1; 3997 output[j].swizzle_z = 2; 3998 output[j].swizzle_w = 3; 3999 output[j].burst_count = 1; 4000 output[j].type = 0xffffffff; 4001 output[j].op = CF_OP_EXPORT; 4002 switch (ctx.type) { 4003 case PIPE_SHADER_VERTEX: 4004 case PIPE_SHADER_TESS_EVAL: 4005 switch (shader->output[i].name) { 4006 case TGSI_SEMANTIC_POSITION: 4007 output[j].array_base = 60; 4008 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4009 pos_emitted = true; 4010 break; 4011 4012 case TGSI_SEMANTIC_PSIZE: 4013 output[j].array_base = 61; 4014 output[j].swizzle_y = 7; 4015 output[j].swizzle_z = 7; 4016 output[j].swizzle_w = 7; 4017 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4018 pos_emitted = true; 4019 break; 4020 case TGSI_SEMANTIC_EDGEFLAG: 4021 output[j].array_base = 61; 4022 output[j].swizzle_x = 7; 4023 output[j].swizzle_y = 0; 4024 output[j].swizzle_z = 7; 4025 output[j].swizzle_w = 7; 4026 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4027 pos_emitted = true; 4028 break; 4029 case TGSI_SEMANTIC_LAYER: 4030 /* spi_sid is 0 for outputs that are 4031 * not consumed by PS */ 4032 if (shader->output[i].spi_sid) { 4033 output[j].array_base = next_param_base++; 4034 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4035 j++; 4036 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 4037 } 4038 output[j].array_base = 61; 4039 output[j].swizzle_x = 7; 4040 output[j].swizzle_y = 7; 4041 output[j].swizzle_z = 0; 4042 output[j].swizzle_w = 7; 4043 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4044 pos_emitted = true; 4045 break; 4046 case TGSI_SEMANTIC_VIEWPORT_INDEX: 4047 /* spi_sid is 0 for outputs that are 4048 * not consumed by PS */ 4049 if 
(shader->output[i].spi_sid) { 4050 output[j].array_base = next_param_base++; 4051 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4052 j++; 4053 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 4054 } 4055 output[j].array_base = 61; 4056 output[j].swizzle_x = 7; 4057 output[j].swizzle_y = 7; 4058 output[j].swizzle_z = 7; 4059 output[j].swizzle_w = 0; 4060 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4061 pos_emitted = true; 4062 break; 4063 case TGSI_SEMANTIC_CLIPVERTEX: 4064 j--; 4065 break; 4066 case TGSI_SEMANTIC_CLIPDIST: 4067 output[j].array_base = next_clip_base++; 4068 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4069 pos_emitted = true; 4070 /* spi_sid is 0 for clipdistance outputs that were generated 4071 * for clipvertex - we don't need to pass them to PS */ 4072 if (shader->output[i].spi_sid) { 4073 j++; 4074 /* duplicate it as PARAM to pass to the pixel shader */ 4075 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 4076 output[j].array_base = next_param_base++; 4077 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4078 } 4079 break; 4080 case TGSI_SEMANTIC_FOG: 4081 output[j].swizzle_y = 4; /* 0 */ 4082 output[j].swizzle_z = 4; /* 0 */ 4083 output[j].swizzle_w = 5; /* 1 */ 4084 break; 4085 case TGSI_SEMANTIC_PRIMID: 4086 output[j].swizzle_x = 2; 4087 output[j].swizzle_y = 4; /* 0 */ 4088 output[j].swizzle_z = 4; /* 0 */ 4089 output[j].swizzle_w = 4; /* 0 */ 4090 break; 4091 } 4092 4093 break; 4094 case PIPE_SHADER_FRAGMENT: 4095 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) { 4096 /* never export more colors than the number of CBs */ 4097 if (shader->output[i].sid >= max_color_exports) { 4098 /* skip export */ 4099 j--; 4100 continue; 4101 } 4102 output[j].swizzle_w = key.ps.alpha_to_one ? 
5 : 3; 4103 output[j].array_base = shader->output[i].sid; 4104 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4105 shader->nr_ps_color_exports++; 4106 shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4)); 4107 4108 /* If the i-th target format is set, all previous target formats must 4109 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well. 4110 */ 4111 if (shader->output[i].sid > 0) 4112 for (unsigned x = 0; x < shader->output[i].sid; x++) 4113 shader->ps_color_export_mask |= (1 << (x*4)); 4114 4115 if (shader->output[i].sid > shader->ps_export_highest) 4116 shader->ps_export_highest = shader->output[i].sid; 4117 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) { 4118 for (k = 1; k < max_color_exports; k++) { 4119 j++; 4120 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4121 output[j].gpr = shader->output[i].gpr; 4122 output[j].elem_size = 3; 4123 output[j].swizzle_x = 0; 4124 output[j].swizzle_y = 1; 4125 output[j].swizzle_z = 2; 4126 output[j].swizzle_w = key.ps.alpha_to_one ? 
5 : 3; 4127 output[j].burst_count = 1; 4128 output[j].array_base = k; 4129 output[j].op = CF_OP_EXPORT; 4130 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4131 shader->nr_ps_color_exports++; 4132 if (k > shader->ps_export_highest) 4133 shader->ps_export_highest = k; 4134 shader->ps_color_export_mask |= (0xf << (j * 4)); 4135 } 4136 } 4137 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) { 4138 output[j].array_base = 61; 4139 output[j].swizzle_x = 2; 4140 output[j].swizzle_y = 7; 4141 output[j].swizzle_z = output[j].swizzle_w = 7; 4142 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4143 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) { 4144 output[j].array_base = 61; 4145 output[j].swizzle_x = 7; 4146 output[j].swizzle_y = 1; 4147 output[j].swizzle_z = output[j].swizzle_w = 7; 4148 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4149 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) { 4150 output[j].array_base = 61; 4151 output[j].swizzle_x = 7; 4152 output[j].swizzle_y = 7; 4153 output[j].swizzle_z = 0; 4154 output[j].swizzle_w = 7; 4155 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4156 } else { 4157 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name); 4158 r = -EINVAL; 4159 goto out_err; 4160 } 4161 break; 4162 case PIPE_SHADER_TESS_CTRL: 4163 break; 4164 default: 4165 R600_ERR("unsupported processor type %d\n", ctx.type); 4166 r = -EINVAL; 4167 goto out_err; 4168 } 4169 4170 if (output[j].type == 0xffffffff) { 4171 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4172 output[j].array_base = next_param_base++; 4173 } 4174 } 4175 4176 /* add fake position export */ 4177 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) { 4178 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4179 output[j].gpr = 0; 4180 output[j].elem_size = 3; 4181 output[j].swizzle_x = 7; 4182 
output[j].swizzle_y = 7; 4183 output[j].swizzle_z = 7; 4184 output[j].swizzle_w = 7; 4185 output[j].burst_count = 1; 4186 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4187 output[j].array_base = 60; 4188 output[j].op = CF_OP_EXPORT; 4189 j++; 4190 } 4191 4192 /* add fake param output for vertex shader if no param is exported */ 4193 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) { 4194 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4195 output[j].gpr = 0; 4196 output[j].elem_size = 3; 4197 output[j].swizzle_x = 7; 4198 output[j].swizzle_y = 7; 4199 output[j].swizzle_z = 7; 4200 output[j].swizzle_w = 7; 4201 output[j].burst_count = 1; 4202 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4203 output[j].array_base = 0; 4204 output[j].op = CF_OP_EXPORT; 4205 j++; 4206 } 4207 4208 /* add fake pixel export */ 4209 if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) { 4210 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4211 output[j].gpr = 0; 4212 output[j].elem_size = 3; 4213 output[j].swizzle_x = 7; 4214 output[j].swizzle_y = 7; 4215 output[j].swizzle_z = 7; 4216 output[j].swizzle_w = 7; 4217 output[j].burst_count = 1; 4218 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4219 output[j].array_base = 0; 4220 output[j].op = CF_OP_EXPORT; 4221 j++; 4222 shader->nr_ps_color_exports++; 4223 shader->ps_color_export_mask = 0xf; 4224 } 4225 4226 noutput = j; 4227 4228 /* set export done on last export of each type */ 4229 for (k = noutput - 1, output_done = 0; k >= 0; k--) { 4230 if (!(output_done & (1 << output[k].type))) { 4231 output_done |= (1 << output[k].type); 4232 output[k].op = CF_OP_EXPORT_DONE; 4233 } 4234 } 4235 /* add output to bytecode */ 4236 for (i = 0; i < noutput; i++) { 4237 r = r600_bytecode_add_output(ctx.bc, &output[i]); 4238 if (r) 4239 goto out_err; 4240 } 4241 } 4242 4243 /* add program end */ 4244 if 
(ctx.bc->chip_class == CAYMAN) 4245 cm_bytecode_add_cf_end(ctx.bc); 4246 else { 4247 const struct cf_op_info *last = NULL; 4248 4249 if (ctx.bc->cf_last) 4250 last = r600_isa_cf(ctx.bc->cf_last->op); 4251 4252 /* alu clause instructions don't have EOP bit, so add NOP */ 4253 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP) 4254 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); 4255 4256 ctx.bc->cf_last->end_of_program = 1; 4257 } 4258 4259 /* check GPR limit - we have 124 = 128 - 4 4260 * (4 are reserved as alu clause temporary registers) */ 4261 if (ctx.bc->ngpr > 124) { 4262 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr); 4263 r = -ENOMEM; 4264 goto out_err; 4265 } 4266 4267 if (ctx.type == PIPE_SHADER_GEOMETRY) { 4268 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so))) 4269 return r; 4270 } 4271 4272 free(ctx.spilled_arrays); 4273 free(ctx.array_infos); 4274 free(ctx.literals); 4275 tgsi_parse_free(&ctx.parse); 4276 return 0; 4277out_err: 4278 free(ctx.spilled_arrays); 4279 free(ctx.array_infos); 4280 free(ctx.literals); 4281 tgsi_parse_free(&ctx.parse); 4282 return r; 4283} 4284 4285static int tgsi_unsupported(struct r600_shader_ctx *ctx) 4286{ 4287 const unsigned tgsi_opcode = 4288 ctx->parse.FullToken.FullInstruction.Instruction.Opcode; 4289 R600_ERR("%s tgsi opcode unsupported\n", 4290 tgsi_get_opcode_name(tgsi_opcode)); 4291 return -EINVAL; 4292} 4293 4294static int tgsi_end(struct r600_shader_ctx *ctx UNUSED) 4295{ 4296 return 0; 4297} 4298 4299static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src, 4300 const struct r600_shader_src *shader_src, 4301 unsigned chan) 4302{ 4303 bc_src->sel = shader_src->sel; 4304 bc_src->chan = shader_src->swizzle[chan]; 4305 bc_src->neg = shader_src->neg; 4306 bc_src->abs = shader_src->abs; 4307 bc_src->rel = shader_src->rel; 4308 bc_src->value = shader_src->value[bc_src->chan]; 4309 bc_src->kc_bank = 
shader_src->kc_bank;
	bc_src->kc_rel = shader_src->kc_rel;
}

/* Force an ALU source operand to be read as an absolute value;
 * the negate modifier is cleared so the result is |src|, not -|src|. */
static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->abs = 1;
	bc_src->neg = 0;
}

/* Flip the negate modifier on an ALU source operand. */
static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->neg = !bc_src->neg;
}

/* Translate a TGSI destination register into an r600 ALU destination.
 *
 * Spilled TGSI temporaries are redirected to a (possibly shared) temporary
 * GPR and a CF_OP_MEM_SCRATCH store is queued as a pending output; other
 * files just get the per-file GPR offset applied.  The instruction's
 * Saturate flag is translated into the dst clamp bit. */
static void tgsi_dst(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_dst_register *tgsi_dst,
		     unsigned swizzle,
		     struct r600_bytecode_alu_dst *r600_dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) {
		bool spilled;
		unsigned idx;

		idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_dst->Register.Index, &spilled);

		if (spilled) {
			struct r600_bytecode_output cf;
			int reg = 0;
			int r;
			bool add_pending_output = true;

			memset(&cf, 0, sizeof(struct r600_bytecode_output));
			get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index,
				&cf.array_base, &cf.array_size);

			/* If no component has spilled, reserve a register and add the spill code
			 * ctx->bc->n_pending_outputs is cleared after each instruction group */
			if (ctx->bc->n_pending_outputs == 0) {
				reg = r600_get_temp(ctx);
			} else {
				/* If we are already spilling and the output address is the same like
				 * before then just reuse the same slot */
				struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1];
				if ((cf.array_base + idx == tmpl->array_base) ||
				    (cf.array_base == tmpl->array_base &&
				     tmpl->index_gpr == ctx->bc->ar_reg &&
				     tgsi_dst->Register.Indirect)) {
					reg = ctx->bc->pending_outputs[0].gpr;
					add_pending_output = false;
				} else {
					reg = r600_get_temp(ctx);
				}
			}

			r600_dst->sel = reg;
			r600_dst->chan = swizzle;
			r600_dst->write = 1;
			if (inst->Instruction.Saturate) {
				r600_dst->clamp = 1;
			}

			/* Add new outputs as pending */
			if (add_pending_output) {
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
				cf.mark = 1;
				cf.comp_mask = inst->Dst[0].Register.WriteMask;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				if (tgsi_dst->Register.Indirect) {
					/* Indirect spill store: the scratch address comes
					 * from the AR register; R700+ uses the ACKed
					 * variant so it can wait for completion below. */
					if (ctx->bc->chip_class < R700)
						cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
					else
						cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
					cf.index_gpr = ctx->bc->ar_reg;
				}
				else {
					/* Direct store: fold the register index into the
					 * base address and store a single element. */
					cf.array_base += idx;
					cf.array_size = 0;
				}

				r = r600_bytecode_add_pending_output(ctx->bc, &cf);
				if (r)
					return;

				if (ctx->bc->chip_class >= R700)
					r600_bytecode_need_wait_ack(ctx->bc, true);
			}
			return;
		}
		else {
			r600_dst->sel = idx;
		}
	}
	else {
		r600_dst->sel = tgsi_dst->Register.Index;
		r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
	}
	r600_dst->chan = swizzle;
	r600_dst->write = 1;
	if (inst->Instruction.Saturate) {
		r600_dst->clamp = 1;
	}
	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
		/* NOTE(review): TCS outputs return early here without setting
		 * the rel bit - presumably they are stored via a separate path;
		 * confirm against the TCS output store code. */
		if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
			return;
		}
	}
	if (tgsi_dst->Register.Indirect)
		r600_dst->rel = V_SQ_REL_RELATIVE;

}

/* Shared emitter for 64-bit (double) one/two-source ALU operations.
 *
 * singledest: the op produces a single double result (e.g. DSQRT-style);
 *             the write mask is remapped to the channel pair holding it.
 * swap:       exchange src[0] and src[1].
 * dest_temp:  when non-zero, write the result to this GPR instead of the
 *             TGSI destination (used by double->int conversion).
 * op_override: when non-zero, use this ALU opcode instead of the one from
 *             ctx->inst_info. */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;
	int swizzle_x = inst->Src[0].Register.SwizzleX;

	if
(singledest) {
		/* Remap the scalar write mask to the xy or zw channel pair that
		 * holds the double, based on which half the source swizzle
		 * selects.  use_tmp (1 => temp chan 0, 3 => temp chan 2)
		 * records which temp channel must be copied to the real
		 * destination afterwards. */
		switch (write_mask) {
		case 0x1:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else
				write_mask = 0x3;
			break;
		case 0x2:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else {
				write_mask = 0x3;
				use_tmp = 1;
			}
			break;
		case 0x4:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else
				write_mask = 0xc;
			break;
		case 0x8:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else {
				write_mask = 0xc;
				use_tmp = 3;
			}
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			if (use_tmp || dest_temp) {
				alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else {
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			}
			/* Only the low channel of each pair carries the scalar
			 * result; suppress the write on the high channel. */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op_override ? op_override : ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* fp64 operands are read with the low/high words
				 * exchanged (fp64_switch). */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_DABS:
				/* DABS clears the sign in the high dword only. */
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		write_mask = inst->Dst[0].Register.WriteMask;

		lasti = tgsi_last_instruction(write_mask);
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;

			if (dest_temp) {
				alu.dst.sel = dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			/* use_tmp - 1 is the temp channel holding the result. */
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}

/* Plain 64-bit two-source op: destination mask must cover whole
 * channel pairs (xy and/or zw), since each double occupies two
 * 32-bit channels. */
static int tgsi_op2_64(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	/* confirm writemasking */
	if ((write_mask & 0x3) != 0x3 &&
	    (write_mask & 0xc) != 0xc) {
		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
		return -1;
	}
	return tgsi_op2_64_params(ctx, false, false, 0, 0);
}

/* 64-bit two-source op producing a single scalar double result. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false, 0, 0);
}

/* Same as above with the two source operands swapped. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true, 0, 0);
}

/* Three-source 64-bit op: always emits all four slots; unwritten
 * channels are parked in a scratch temp register. */
static int tgsi_op3_64(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = 3;
	int tmp = r600_get_temp(ctx);

	for (i = 0; i < lasti + 1; i++) {

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			/* sources use high-word channel except the last slot */
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
		}

		if (inst->Dst[0].Register.WriteMask & (1 << i))
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		else
			alu.dst.sel = tmp;

		alu.dst.chan = i;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Generic per-channel two-source op emitter.
 *
 * swap:       exchange src[0] and src[1] (for reversed-operand opcodes).
 * trans_only: the op runs only in the transcendental slot, so each
 *             channel must close its own ALU group (alu.last on every
 *             instruction) and results are staged through temp_reg when
 *             more than one component is written. */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
	unsigned op = ctx->inst_info->op;

	/* MUL_ZERO_WINS shaders want 0*anything == 0, which plain MUL
	 * provides and MUL_IEEE does not. */
	if (op == ALU_OP2_MUL_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP2_MUL;

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}

/* Plain two-source op. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}

/* Two-source op with operands swapped. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}

/* Two-source op restricted to the transcendental slot. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}

/* Integer negate implemented as (0 - src) per written channel. */
static int tgsi_ineg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {

		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		r600_bytecode_src(&alu.src[1],
&ctx->src[0], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;

}

/* Double negate: copy the value, toggling the sign on the high dword
 * (channels 1 and 3) of each double. */
static int tgsi_dneg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {

		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		/* the sign bit of a double lives in the high dword */
		if (i == 1 || i == 3)
			r600_bytecode_src_toggle_neg(&alu.src[0]);
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;

}

/* DFRACEXP: split a double into fractional significand (Dst[0]) and
 * integer exponent (Dst[1]).  The raw op is run across all four slots
 * into temp_reg, then the pieces are copied out. */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;

	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Replicate significand result across channels. */
	for (i = 0; i <= 3; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		/* significand occupies temp channels 2/3 */
		alu.src[0].chan = (i & 1) + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}


/* I2D/U2D on Evergreen/Cayman: int -> double conversion.
 *
 * A 32-bit int cannot be converted in one step (the float path only
 * carries 24 bits of mantissa exactly), so each int is split into a
 * 24-bit and an 8-bit part, converted separately via float -> double,
 * and recombined with ADD_64. */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, c, r;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
	       inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			/* split into 24-bit int and 8-bit int */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_AND_INT;
			alu.dst.sel = temp_reg;
			alu.dst.chan = dchan;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0xffffff00;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_AND_INT;
			alu.dst.sel = temp_reg;
			alu.dst.chan = dchan + 1;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0xff;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			for (i = dchan; i <= dchan + 1; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				/* low part uses the signed/unsigned op from the
				 * instruction; the 8-bit remainder is always
				 * non-negative so UINT_TO_FLT suffices. */
				alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT;

				alu.src[0].sel = temp_reg;
				alu.src[0].chan = i;
				alu.dst.sel = temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
				if (ctx->bc->chip_class == CAYMAN)
					alu.last = i == dchan + 1;
				else
					alu.last = 1; /* trans only ops on evergreen */

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		}
	}

	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			for (i = 0; i < 4; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_FLT32_TO_FLT64;

				alu.src[0].chan = dchan + (i / 2);
				if (i == 0 || i == 2)
					alu.src[0].sel = temp_reg;
				else {
					alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
					alu.src[0].value = 0x0;
				}
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.last = i == 3;
				alu.dst.write = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

			for (i = 0; i <= 1; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_ADD_64;

				alu.src[0].chan = fp64_switch(i);
				alu.src[0].sel = ctx->temp_reg;

				alu.src[1].chan = fp64_switch(i + 2);
				alu.src[1].sel = ctx->temp_reg;
				tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst);
				alu.last = i == 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		}
	}

	return
0;
}

/* D2I/D2U on Evergreen/Cayman: narrow the double to a float first,
 * then convert to int with the opcode from inst_info. */
static int egcm_double_to_int(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int treg = r600_get_temp(ctx);
	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
	       inst->Instruction.Opcode == TGSI_OPCODE_D2U);

	/* do a 64->32 into a temp register */
	r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
	if (r)
		return r;

	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].chan = i;
		alu.src[0].sel = treg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = (i == lasti);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* Emit a unary double op (RECIP_64 family) raw into dst_reg.
 * src channel 1 (high dword) goes to src[0] and channel 0 (low dword)
 * to src[1]; abs optionally forces |src| (used for DRSQ/DSQRT). */
static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
					unsigned op,
					int dst_reg,
					struct r600_shader_src *src,
					bool abs)
{
	struct r600_bytecode_alu alu;
	const int last_slot = 3;
	int r;

	/* these have to write the result to X/Y by the looks of it */
	for (int i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;

		r600_bytecode_src(&alu.src[0], src, 1);
		r600_bytecode_src(&alu.src[1], src, 0);

		if (abs)
			r600_bytecode_src_set_abs(&alu.src[1]);

		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = (i == 0 || i == 1);

		/* on non-Cayman these are trans-only ops: one per group */
		if (bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* Unary double instruction (DRSQ/DSQRT/...): run the raw op into the
 * temp register, then copy the double result pair to the destination. */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* should only be one src regs */
	assert(inst->Instruction.NumSrcRegs == 1);

	/* only support one double at a time */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	r = cayman_emit_unary_double_raw(
		ctx->bc, ctx->inst_info->op, t1,
		&ctx->src[0],
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
	if (r)
		return r;

	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		/* result pair lives in t1.xy regardless of dst pair */
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Cayman scalar (transcendental) float op: issue the op in each vector
 * slot up to last_slot, sourcing channel 0, per the Cayman broadcast
 * scheme described at the top of this file. */
static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ?
4 : 3;

	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);

			/* RSQ should take the absolute value of src */
			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
				r600_bytecode_src_set_abs(&alu.src[j]);
			}
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Cayman integer multiply (MULLO/MULHI): for each written channel run
 * the op across all four slots (only slot k keeps its result), then
 * copy the staged results from the temp register to the destination. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}


/* Cayman MUL_64: multiply one double pair through all four slots into
 * the temp register, then copy the pair to the destination. */
static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* t1 would get overwritten below if we actually tried to
	 * multiply two pairs of doubles at a time. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			/* slots x/y/z read the high dword, slot w the low one */
			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/*
 * Emit RECIP_64 + MUL_64 to implement division.
 */
static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_alu alu;
	int t1 = ctx->temp_reg;
	int k;

	/* Only support one double at a time.
This is the same constraint as
	 * in DMUL lowering. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	/* t1.xy = 1.0 / src1 */
	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
	if (r)
		return r;

	/* t1 = src0 * (1.0 / src1), broadcast across the four slots */
	for (int i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL_64;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));

		alu.src[1].sel = t1;
		alu.src[1].chan = (i == 3) ? 0 : 1;

		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* copy the result pair to the destination channels */
	for (int i = 0; i < 2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
		alu.dst.write = 1;
		if (i == 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/*
 * r600 - trunc to -PI..PI range
 * r700 - normalize by dividing by 2PI
 * see fdo bug 27901
 */
static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* temp.x = src * (1/2pi) + 0.5 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
	alu.src[2].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.x = fract(temp.x), i.e. wrap into one period */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FRACT;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* rescale to the range the hardware SIN/COS expects:
	 * r600 wants radians in -PI..PI, r700+ wants the normalized value */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[2].chan = 0;

	if (ctx->bc->chip_class == R600) {
		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
		alu.src[2].value = u_bitcast_f2u(-M_PI);
	} else {
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
		alu.src[2].neg = 1;
	}

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* Cayman SIN/COS: argument prepared by tgsi_setup_trig in temp.x, then
 * the op is issued per vector slot (Cayman broadcast scheme). */
static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ?
4 : 3; 5308 int i, r; 5309 5310 r = tgsi_setup_trig(ctx); 5311 if (r) 5312 return r; 5313 5314 5315 for (i = 0; i < last_slot; i++) { 5316 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5317 alu.op = ctx->inst_info->op; 5318 alu.dst.chan = i; 5319 5320 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5321 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; 5322 5323 alu.src[0].sel = ctx->temp_reg; 5324 alu.src[0].chan = 0; 5325 if (i == last_slot - 1) 5326 alu.last = 1; 5327 r = r600_bytecode_add_alu(ctx->bc, &alu); 5328 if (r) 5329 return r; 5330 } 5331 return 0; 5332} 5333 5334static int tgsi_trig(struct r600_shader_ctx *ctx) 5335{ 5336 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5337 struct r600_bytecode_alu alu; 5338 int i, r; 5339 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 5340 5341 r = tgsi_setup_trig(ctx); 5342 if (r) 5343 return r; 5344 5345 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5346 alu.op = ctx->inst_info->op; 5347 alu.dst.chan = 0; 5348 alu.dst.sel = ctx->temp_reg; 5349 alu.dst.write = 1; 5350 5351 alu.src[0].sel = ctx->temp_reg; 5352 alu.src[0].chan = 0; 5353 alu.last = 1; 5354 r = r600_bytecode_add_alu(ctx->bc, &alu); 5355 if (r) 5356 return r; 5357 5358 /* replicate result */ 5359 for (i = 0; i < lasti + 1; i++) { 5360 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 5361 continue; 5362 5363 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5364 alu.op = ALU_OP1_MOV; 5365 5366 alu.src[0].sel = ctx->temp_reg; 5367 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5368 if (i == lasti) 5369 alu.last = 1; 5370 r = r600_bytecode_add_alu(ctx->bc, &alu); 5371 if (r) 5372 return r; 5373 } 5374 return 0; 5375} 5376 5377static int tgsi_kill(struct r600_shader_ctx *ctx) 5378{ 5379 const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5380 struct r600_bytecode_alu alu; 5381 int i, r; 5382 5383 for (i = 0; i < 4; i++) { 5384 memset(&alu, 0, sizeof(struct 
/* Translate TGSI LIT (classic fixed-function lighting coefficients).
 *
 * Produces, per the LIT spec:
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0.0)
 *   dst.z = src.x > 0 ? pow(max(src.y, 0), clamp(src.w)) : 0   (via MUL_LIT)
 *   dst.w = 1.0
 *
 * The expensive z component is only computed when its write-mask bit
 * is set.  On CAYMAN the transcendental LOG/EXP ops are t-slot-only
 * ops implemented across the vector slots (see the CAYMAN notes at the
 * top of this file), so they are issued three times with only the last
 * slot actually writing.
 *
 * Returns 0 on success, or the error from r600_bytecode_add_alu().
 */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		unsigned i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				/* only the final slot commits the result */
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the LOG result landed, whichever path ran */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	/* honour the write mask instead of skipping the instruction */
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
/* CAYMAN variant of POW: dst = pow(src0.x, src1.x) = exp2(src1.x * log2(src0.x)).
 *
 * On CAYMAN the transcendental LOG/EXP ops are implemented across the
 * vector slots and broadcast their result (see the CAYMAN notes at the
 * top of this file), so each one is issued in three slots.
 *
 * Returns 0 on success, or the error from r600_bytecode_add_alu().
 */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	/* emit the EXP in the w slot too only when dst.w is written */
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* tmp.xyz = LOG2(src0.x), issued across three slots */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		/* src chan stays 0 from the memset: always read tmp.x */
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
POW(a,b) = EXP2(b * LOG2(a))*/ 5705 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5706 alu.op = ALU_OP1_EXP_IEEE; 5707 alu.src[0].sel = ctx->temp_reg; 5708 alu.dst.sel = ctx->temp_reg; 5709 alu.dst.write = 1; 5710 alu.last = 1; 5711 r = r600_bytecode_add_alu(ctx->bc, &alu); 5712 if (r) 5713 return r; 5714 return tgsi_helper_tempx_replicate(ctx); 5715} 5716 5717static int emit_mul_int_op(struct r600_bytecode *bc, 5718 struct r600_bytecode_alu *alu_src) 5719{ 5720 struct r600_bytecode_alu alu; 5721 int i, r; 5722 alu = *alu_src; 5723 if (bc->chip_class == CAYMAN) { 5724 for (i = 0; i < 4; i++) { 5725 alu.dst.chan = i; 5726 alu.dst.write = (i == alu_src->dst.chan); 5727 alu.last = (i == 3); 5728 5729 r = r600_bytecode_add_alu(bc, &alu); 5730 if (r) 5731 return r; 5732 } 5733 } else { 5734 alu.last = 1; 5735 r = r600_bytecode_add_alu(bc, &alu); 5736 if (r) 5737 return r; 5738 } 5739 return 0; 5740} 5741 5742static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op) 5743{ 5744 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5745 struct r600_bytecode_alu alu; 5746 int i, r, j; 5747 unsigned write_mask = inst->Dst[0].Register.WriteMask; 5748 int lasti = tgsi_last_instruction(write_mask); 5749 int tmp0 = ctx->temp_reg; 5750 int tmp1 = r600_get_temp(ctx); 5751 int tmp2 = r600_get_temp(ctx); 5752 int tmp3 = r600_get_temp(ctx); 5753 int tmp4 = 0; 5754 5755 /* Use additional temp if dst register and src register are the same */ 5756 if (inst->Src[0].Register.Index == inst->Dst[0].Register.Index || 5757 inst->Src[1].Register.Index == inst->Dst[0].Register.Index) { 5758 tmp4 = r600_get_temp(ctx); 5759 } 5760 5761 /* Unsigned path: 5762 * 5763 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder 5764 * 5765 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error 5766 * 2. tmp0.z = lo (tmp0.x * src2) 5767 * 3. tmp0.w = -tmp0.z 5768 * 4. tmp0.y = hi (tmp0.x * src2) 5769 * 5. 
tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2)) 5770 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error 5771 * 7. tmp1.x = tmp0.x - tmp0.w 5772 * 8. tmp1.y = tmp0.x + tmp0.w 5773 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) 5774 * 10. tmp0.z = hi(tmp0.x * src1) = q 5775 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r 5776 * 5777 * 12. tmp0.w = src1 - tmp0.y = r 5778 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison) 5779 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison) 5780 * 5781 * if DIV 5782 * 5783 * 15. tmp1.z = tmp0.z + 1 = q + 1 5784 * 16. tmp1.w = tmp0.z - 1 = q - 1 5785 * 5786 * else MOD 5787 * 5788 * 15. tmp1.z = tmp0.w - src2 = r - src2 5789 * 16. tmp1.w = tmp0.w + src2 = r + src2 5790 * 5791 * endif 5792 * 5793 * 17. tmp1.x = tmp1.x & tmp1.y 5794 * 5795 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z 5796 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z 5797 * 5798 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z 5799 * 20. dst = src2==0 ? MAX_UINT : tmp0.z 5800 * 5801 * Signed path: 5802 * 5803 * Same as unsigned, using abs values of the operands, 5804 * and fixing the sign of the result in the end. 
5805 */ 5806 5807 for (i = 0; i < 4; i++) { 5808 if (!(write_mask & (1<<i))) 5809 continue; 5810 5811 if (signed_op) { 5812 5813 /* tmp2.x = -src0 */ 5814 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5815 alu.op = ALU_OP2_SUB_INT; 5816 5817 alu.dst.sel = tmp2; 5818 alu.dst.chan = 0; 5819 alu.dst.write = 1; 5820 5821 alu.src[0].sel = V_SQ_ALU_SRC_0; 5822 5823 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 5824 5825 alu.last = 1; 5826 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5827 return r; 5828 5829 /* tmp2.y = -src1 */ 5830 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5831 alu.op = ALU_OP2_SUB_INT; 5832 5833 alu.dst.sel = tmp2; 5834 alu.dst.chan = 1; 5835 alu.dst.write = 1; 5836 5837 alu.src[0].sel = V_SQ_ALU_SRC_0; 5838 5839 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5840 5841 alu.last = 1; 5842 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5843 return r; 5844 5845 /* tmp2.z sign bit is set if src0 and src2 signs are different */ 5846 /* it will be a sign of the quotient */ 5847 if (!mod) { 5848 5849 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5850 alu.op = ALU_OP2_XOR_INT; 5851 5852 alu.dst.sel = tmp2; 5853 alu.dst.chan = 2; 5854 alu.dst.write = 1; 5855 5856 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 5857 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5858 5859 alu.last = 1; 5860 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5861 return r; 5862 } 5863 5864 /* tmp2.x = |src0| */ 5865 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5866 alu.op = ALU_OP3_CNDGE_INT; 5867 alu.is_op3 = 1; 5868 5869 alu.dst.sel = tmp2; 5870 alu.dst.chan = 0; 5871 alu.dst.write = 1; 5872 5873 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 5874 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 5875 alu.src[2].sel = tmp2; 5876 alu.src[2].chan = 0; 5877 5878 alu.last = 1; 5879 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5880 return r; 5881 5882 /* tmp2.y = |src1| */ 5883 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5884 alu.op = 
ALU_OP3_CNDGE_INT; 5885 alu.is_op3 = 1; 5886 5887 alu.dst.sel = tmp2; 5888 alu.dst.chan = 1; 5889 alu.dst.write = 1; 5890 5891 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 5892 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5893 alu.src[2].sel = tmp2; 5894 alu.src[2].chan = 1; 5895 5896 alu.last = 1; 5897 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5898 return r; 5899 5900 } 5901 5902 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */ 5903 if (ctx->bc->chip_class == CAYMAN) { 5904 /* tmp3.x = u2f(src2) */ 5905 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5906 alu.op = ALU_OP1_UINT_TO_FLT; 5907 5908 alu.dst.sel = tmp3; 5909 alu.dst.chan = 0; 5910 alu.dst.write = 1; 5911 5912 if (signed_op) { 5913 alu.src[0].sel = tmp2; 5914 alu.src[0].chan = 1; 5915 } else { 5916 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 5917 } 5918 5919 alu.last = 1; 5920 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5921 return r; 5922 5923 /* tmp0.x = recip(tmp3.x) */ 5924 for (j = 0 ; j < 3; j++) { 5925 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5926 alu.op = ALU_OP1_RECIP_IEEE; 5927 5928 alu.dst.sel = tmp0; 5929 alu.dst.chan = j; 5930 alu.dst.write = (j == 0); 5931 5932 alu.src[0].sel = tmp3; 5933 alu.src[0].chan = 0; 5934 5935 if (j == 2) 5936 alu.last = 1; 5937 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5938 return r; 5939 } 5940 5941 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5942 alu.op = ALU_OP2_MUL; 5943 5944 alu.src[0].sel = tmp0; 5945 alu.src[0].chan = 0; 5946 5947 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 5948 alu.src[1].value = 0x4f800000; 5949 5950 alu.dst.sel = tmp3; 5951 alu.dst.write = 1; 5952 alu.last = 1; 5953 r = r600_bytecode_add_alu(ctx->bc, &alu); 5954 if (r) 5955 return r; 5956 5957 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5958 alu.op = ALU_OP1_FLT_TO_UINT; 5959 5960 alu.dst.sel = tmp0; 5961 alu.dst.chan = 0; 5962 alu.dst.write = 1; 5963 5964 alu.src[0].sel = tmp3; 5965 alu.src[0].chan = 0; 5966 
5967 alu.last = 1; 5968 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5969 return r; 5970 5971 } else { 5972 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5973 alu.op = ALU_OP1_RECIP_UINT; 5974 5975 alu.dst.sel = tmp0; 5976 alu.dst.chan = 0; 5977 alu.dst.write = 1; 5978 5979 if (signed_op) { 5980 alu.src[0].sel = tmp2; 5981 alu.src[0].chan = 1; 5982 } else { 5983 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 5984 } 5985 5986 alu.last = 1; 5987 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5988 return r; 5989 } 5990 5991 /* 2. tmp0.z = lo (tmp0.x * src2) */ 5992 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5993 alu.op = ALU_OP2_MULLO_UINT; 5994 5995 alu.dst.sel = tmp0; 5996 alu.dst.chan = 2; 5997 alu.dst.write = 1; 5998 5999 alu.src[0].sel = tmp0; 6000 alu.src[0].chan = 0; 6001 if (signed_op) { 6002 alu.src[1].sel = tmp2; 6003 alu.src[1].chan = 1; 6004 } else { 6005 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 6006 } 6007 6008 if ((r = emit_mul_int_op(ctx->bc, &alu))) 6009 return r; 6010 6011 /* 3. tmp0.w = -tmp0.z */ 6012 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6013 alu.op = ALU_OP2_SUB_INT; 6014 6015 alu.dst.sel = tmp0; 6016 alu.dst.chan = 3; 6017 alu.dst.write = 1; 6018 6019 alu.src[0].sel = V_SQ_ALU_SRC_0; 6020 alu.src[1].sel = tmp0; 6021 alu.src[1].chan = 2; 6022 6023 alu.last = 1; 6024 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6025 return r; 6026 6027 /* 4. tmp0.y = hi (tmp0.x * src2) */ 6028 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6029 alu.op = ALU_OP2_MULHI_UINT; 6030 6031 alu.dst.sel = tmp0; 6032 alu.dst.chan = 1; 6033 alu.dst.write = 1; 6034 6035 alu.src[0].sel = tmp0; 6036 alu.src[0].chan = 0; 6037 6038 if (signed_op) { 6039 alu.src[1].sel = tmp2; 6040 alu.src[1].chan = 1; 6041 } else { 6042 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 6043 } 6044 6045 if ((r = emit_mul_int_op(ctx->bc, &alu))) 6046 return r; 6047 6048 /* 5. tmp0.z = (tmp0.y == 0 ? 
tmp0.w : tmp0.z) = abs(lo(rcp*src)) */ 6049 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6050 alu.op = ALU_OP3_CNDE_INT; 6051 alu.is_op3 = 1; 6052 6053 alu.dst.sel = tmp0; 6054 alu.dst.chan = 2; 6055 alu.dst.write = 1; 6056 6057 alu.src[0].sel = tmp0; 6058 alu.src[0].chan = 1; 6059 alu.src[1].sel = tmp0; 6060 alu.src[1].chan = 3; 6061 alu.src[2].sel = tmp0; 6062 alu.src[2].chan = 2; 6063 6064 alu.last = 1; 6065 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6066 return r; 6067 6068 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */ 6069 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6070 alu.op = ALU_OP2_MULHI_UINT; 6071 6072 alu.dst.sel = tmp0; 6073 alu.dst.chan = 3; 6074 alu.dst.write = 1; 6075 6076 alu.src[0].sel = tmp0; 6077 alu.src[0].chan = 2; 6078 6079 alu.src[1].sel = tmp0; 6080 alu.src[1].chan = 0; 6081 6082 if ((r = emit_mul_int_op(ctx->bc, &alu))) 6083 return r; 6084 6085 /* 7. tmp1.x = tmp0.x - tmp0.w */ 6086 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6087 alu.op = ALU_OP2_SUB_INT; 6088 6089 alu.dst.sel = tmp1; 6090 alu.dst.chan = 0; 6091 alu.dst.write = 1; 6092 6093 alu.src[0].sel = tmp0; 6094 alu.src[0].chan = 0; 6095 alu.src[1].sel = tmp0; 6096 alu.src[1].chan = 3; 6097 6098 alu.last = 1; 6099 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6100 return r; 6101 6102 /* 8. tmp1.y = tmp0.x + tmp0.w */ 6103 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6104 alu.op = ALU_OP2_ADD_INT; 6105 6106 alu.dst.sel = tmp1; 6107 alu.dst.chan = 1; 6108 alu.dst.write = 1; 6109 6110 alu.src[0].sel = tmp0; 6111 alu.src[0].chan = 0; 6112 alu.src[1].sel = tmp0; 6113 alu.src[1].chan = 3; 6114 6115 alu.last = 1; 6116 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6117 return r; 6118 6119 /* 9. tmp0.x = (tmp0.y == 0 ? 
tmp1.y : tmp1.x) */ 6120 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6121 alu.op = ALU_OP3_CNDE_INT; 6122 alu.is_op3 = 1; 6123 6124 alu.dst.sel = tmp0; 6125 alu.dst.chan = 0; 6126 alu.dst.write = 1; 6127 6128 alu.src[0].sel = tmp0; 6129 alu.src[0].chan = 1; 6130 alu.src[1].sel = tmp1; 6131 alu.src[1].chan = 1; 6132 alu.src[2].sel = tmp1; 6133 alu.src[2].chan = 0; 6134 6135 alu.last = 1; 6136 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6137 return r; 6138 6139 /* 10. tmp0.z = hi(tmp0.x * src1) = q */ 6140 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6141 alu.op = ALU_OP2_MULHI_UINT; 6142 6143 alu.dst.sel = tmp0; 6144 alu.dst.chan = 2; 6145 alu.dst.write = 1; 6146 6147 alu.src[0].sel = tmp0; 6148 alu.src[0].chan = 0; 6149 6150 if (signed_op) { 6151 alu.src[1].sel = tmp2; 6152 alu.src[1].chan = 0; 6153 } else { 6154 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 6155 } 6156 6157 if ((r = emit_mul_int_op(ctx->bc, &alu))) 6158 return r; 6159 6160 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */ 6161 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6162 alu.op = ALU_OP2_MULLO_UINT; 6163 6164 alu.dst.sel = tmp0; 6165 alu.dst.chan = 1; 6166 alu.dst.write = 1; 6167 6168 if (signed_op) { 6169 alu.src[0].sel = tmp2; 6170 alu.src[0].chan = 1; 6171 } else { 6172 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 6173 } 6174 6175 alu.src[1].sel = tmp0; 6176 alu.src[1].chan = 2; 6177 6178 if ((r = emit_mul_int_op(ctx->bc, &alu))) 6179 return r; 6180 6181 /* 12. 
tmp0.w = src1 - tmp0.y = r */ 6182 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6183 alu.op = ALU_OP2_SUB_INT; 6184 6185 alu.dst.sel = tmp0; 6186 alu.dst.chan = 3; 6187 alu.dst.write = 1; 6188 6189 if (signed_op) { 6190 alu.src[0].sel = tmp2; 6191 alu.src[0].chan = 0; 6192 } else { 6193 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6194 } 6195 6196 alu.src[1].sel = tmp0; 6197 alu.src[1].chan = 1; 6198 6199 alu.last = 1; 6200 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6201 return r; 6202 6203 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */ 6204 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6205 alu.op = ALU_OP2_SETGE_UINT; 6206 6207 alu.dst.sel = tmp1; 6208 alu.dst.chan = 0; 6209 alu.dst.write = 1; 6210 6211 alu.src[0].sel = tmp0; 6212 alu.src[0].chan = 3; 6213 if (signed_op) { 6214 alu.src[1].sel = tmp2; 6215 alu.src[1].chan = 1; 6216 } else { 6217 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 6218 } 6219 6220 alu.last = 1; 6221 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6222 return r; 6223 6224 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */ 6225 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6226 alu.op = ALU_OP2_SETGE_UINT; 6227 6228 alu.dst.sel = tmp1; 6229 alu.dst.chan = 1; 6230 alu.dst.write = 1; 6231 6232 if (signed_op) { 6233 alu.src[0].sel = tmp2; 6234 alu.src[0].chan = 0; 6235 } else { 6236 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6237 } 6238 6239 alu.src[1].sel = tmp0; 6240 alu.src[1].chan = 1; 6241 6242 alu.last = 1; 6243 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6244 return r; 6245 6246 if (mod) { /* UMOD */ 6247 6248 /* 15. 
tmp1.z = tmp0.w - src2 = r - src2 */ 6249 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6250 alu.op = ALU_OP2_SUB_INT; 6251 6252 alu.dst.sel = tmp1; 6253 alu.dst.chan = 2; 6254 alu.dst.write = 1; 6255 6256 alu.src[0].sel = tmp0; 6257 alu.src[0].chan = 3; 6258 6259 if (signed_op) { 6260 alu.src[1].sel = tmp2; 6261 alu.src[1].chan = 1; 6262 } else { 6263 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 6264 } 6265 6266 alu.last = 1; 6267 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6268 return r; 6269 6270 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */ 6271 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6272 alu.op = ALU_OP2_ADD_INT; 6273 6274 alu.dst.sel = tmp1; 6275 alu.dst.chan = 3; 6276 alu.dst.write = 1; 6277 6278 alu.src[0].sel = tmp0; 6279 alu.src[0].chan = 3; 6280 if (signed_op) { 6281 alu.src[1].sel = tmp2; 6282 alu.src[1].chan = 1; 6283 } else { 6284 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 6285 } 6286 6287 alu.last = 1; 6288 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6289 return r; 6290 6291 } else { /* UDIV */ 6292 6293 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */ 6294 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6295 alu.op = ALU_OP2_ADD_INT; 6296 6297 alu.dst.sel = tmp1; 6298 alu.dst.chan = 2; 6299 alu.dst.write = 1; 6300 6301 alu.src[0].sel = tmp0; 6302 alu.src[0].chan = 2; 6303 alu.src[1].sel = V_SQ_ALU_SRC_1_INT; 6304 6305 alu.last = 1; 6306 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6307 return r; 6308 6309 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */ 6310 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6311 alu.op = ALU_OP2_ADD_INT; 6312 6313 alu.dst.sel = tmp1; 6314 alu.dst.chan = 3; 6315 alu.dst.write = 1; 6316 6317 alu.src[0].sel = tmp0; 6318 alu.src[0].chan = 2; 6319 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT; 6320 6321 alu.last = 1; 6322 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6323 return r; 6324 6325 } 6326 6327 /* 17. 
tmp1.x = tmp1.x & tmp1.y */ 6328 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6329 alu.op = ALU_OP2_AND_INT; 6330 6331 alu.dst.sel = tmp1; 6332 alu.dst.chan = 0; 6333 alu.dst.write = 1; 6334 6335 alu.src[0].sel = tmp1; 6336 alu.src[0].chan = 0; 6337 alu.src[1].sel = tmp1; 6338 alu.src[1].chan = 1; 6339 6340 alu.last = 1; 6341 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6342 return r; 6343 6344 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */ 6345 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */ 6346 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6347 alu.op = ALU_OP3_CNDE_INT; 6348 alu.is_op3 = 1; 6349 6350 alu.dst.sel = tmp0; 6351 alu.dst.chan = 2; 6352 alu.dst.write = 1; 6353 6354 alu.src[0].sel = tmp1; 6355 alu.src[0].chan = 0; 6356 alu.src[1].sel = tmp0; 6357 alu.src[1].chan = mod ? 3 : 2; 6358 alu.src[2].sel = tmp1; 6359 alu.src[2].chan = 2; 6360 6361 alu.last = 1; 6362 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6363 return r; 6364 6365 /* 19. tmp0.z = tmp1.y==0 ? 
tmp1.w : tmp0.z */ 6366 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6367 alu.op = ALU_OP3_CNDE_INT; 6368 alu.is_op3 = 1; 6369 6370 if (signed_op) { 6371 alu.dst.sel = tmp0; 6372 alu.dst.chan = 2; 6373 alu.dst.write = 1; 6374 } else { 6375 if (tmp4 > 0) { 6376 alu.dst.sel = tmp4; 6377 alu.dst.chan = i; 6378 alu.dst.write = 1; 6379 } else { 6380 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6381 } 6382 } 6383 6384 alu.src[0].sel = tmp1; 6385 alu.src[0].chan = 1; 6386 alu.src[1].sel = tmp1; 6387 alu.src[1].chan = 3; 6388 alu.src[2].sel = tmp0; 6389 alu.src[2].chan = 2; 6390 6391 alu.last = 1; 6392 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6393 return r; 6394 6395 if (signed_op) { 6396 6397 /* fix the sign of the result */ 6398 6399 if (mod) { 6400 6401 /* tmp0.x = -tmp0.z */ 6402 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6403 alu.op = ALU_OP2_SUB_INT; 6404 6405 alu.dst.sel = tmp0; 6406 alu.dst.chan = 0; 6407 alu.dst.write = 1; 6408 6409 alu.src[0].sel = V_SQ_ALU_SRC_0; 6410 alu.src[1].sel = tmp0; 6411 alu.src[1].chan = 2; 6412 6413 alu.last = 1; 6414 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6415 return r; 6416 6417 /* sign of the remainder is the same as the sign of src0 */ 6418 /* tmp0.x = src0>=0 ? 
tmp0.z : tmp0.x */ 6419 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6420 alu.op = ALU_OP3_CNDGE_INT; 6421 alu.is_op3 = 1; 6422 6423 if (tmp4 > 0) { 6424 alu.dst.sel = tmp4; 6425 alu.dst.chan = i; 6426 alu.dst.write = 1; 6427 } else { 6428 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6429 } 6430 6431 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6432 alu.src[1].sel = tmp0; 6433 alu.src[1].chan = 2; 6434 alu.src[2].sel = tmp0; 6435 alu.src[2].chan = 0; 6436 6437 alu.last = 1; 6438 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6439 return r; 6440 6441 } else { 6442 6443 /* tmp0.x = -tmp0.z */ 6444 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6445 alu.op = ALU_OP2_SUB_INT; 6446 6447 alu.dst.sel = tmp0; 6448 alu.dst.chan = 0; 6449 alu.dst.write = 1; 6450 6451 alu.src[0].sel = V_SQ_ALU_SRC_0; 6452 alu.src[1].sel = tmp0; 6453 alu.src[1].chan = 2; 6454 6455 alu.last = 1; 6456 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6457 return r; 6458 6459 /* fix the quotient sign (same as the sign of src0*src1) */ 6460 /* tmp0.x = tmp2.z>=0 ? 
/* Thin wrappers over tgsi_divmod(ctx, mod, signed_op):
 * mod selects remainder (1) vs quotient (0); signed_op selects the
 * signed fix-up path vs the plain unsigned algorithm. */

/* UDIV: unsigned quotient */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}

/* UMOD: unsigned remainder */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}

/* IDIV: signed quotient */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);
}

/* IMOD: signed remainder */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);
}
/* Translate TGSI IABS (integer absolute value).
 *
 * The hardware has no integer-abs op, so this is done in two passes per
 * written channel:
 *   tmp = 0 - src                   (SUB_INT)
 *   dst = src >= 0 ? src : tmp      (CNDGE_INT)
 *
 * NOTE(review): for src == INT_MIN the negation wraps and the result is
 * INT_MIN, the usual two's-complement abs behavior.
 *
 * Returns 0 on success, or the error from r600_bytecode_add_alu().
 */
static int tgsi_iabs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* tmp = -src */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		/* 0 - src: SUB_INT computes src0 - src1 */
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (src >= 0 ? src : tmp) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		alu.dst.write = 1;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
/* Translate TGSI SSG (float sign: -1, 0, or +1).
 *
 * Two conditional-move passes per written channel:
 *   tmp = src > 0  ?  1 : src       (clamps positives to +1)
 *   dst = -tmp > 0 ? -1 : tmp       (clamps negatives to -1, keeps 0/+1)
 *
 * Returns 0 on success, or the error from r600_bytecode_add_alu().
 */
static int tgsi_ssg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);
	struct r600_bytecode_alu alu;
	int i, r;

	/* tmp = (src > 0 ? 1 : src) */
	for (i = 0; i <= last_inst; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (-tmp > 0 ? -1 : tmp) */
	for (i = 0; i <= last_inst; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		/* negate both the condition and the +1 literal via src
		 * modifiers rather than extra instructions */
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		alu.src[0].neg = 1;

		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[1].neg = 1;

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6803 6804 t2 = r600_get_temp(ctx); 6805 6806 for (i = 0; i < 4; i++) { 6807 if (!(write_mask & (1<<i))) 6808 continue; 6809 6810 /* shift insert left */ 6811 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6812 alu.op = ALU_OP2_LSHL_INT; 6813 alu.dst.sel = t2; 6814 alu.dst.chan = i; 6815 alu.dst.write = 1; 6816 alu.last = i == last_inst; 6817 6818 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 6819 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 6820 6821 r = r600_bytecode_add_alu(ctx->bc, &alu); 6822 if (r) 6823 return r; 6824 } 6825 6826 for (i = 0; i < 4; i++) { 6827 if (!(write_mask & (1<<i))) 6828 continue; 6829 6830 /* actual bitfield insert */ 6831 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6832 alu.op = ALU_OP3_BFI_INT; 6833 alu.is_op3 = 1; 6834 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6835 alu.dst.chan = i; 6836 alu.dst.write = 1; 6837 alu.last = i == last_inst; 6838 6839 alu.src[0].sel = t1; 6840 alu.src[0].chan = i; 6841 alu.src[1].sel = t2; 6842 alu.src[1].chan = i; 6843 r600_bytecode_src(&alu.src[2], &ctx->src[0], i); 6844 6845 r = r600_bytecode_add_alu(ctx->bc, &alu); 6846 if (r) 6847 return r; 6848 } 6849 6850 for (i = 0; i < 4; i++) { 6851 if (!(write_mask & (1<<i))) 6852 continue; 6853 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6854 alu.op = ALU_OP3_CNDE_INT; 6855 alu.is_op3 = 1; 6856 alu.src[0].sel = ctx->temp_reg; 6857 alu.src[0].chan = i; 6858 r600_bytecode_src(&alu.src[2], &ctx->src[1], i); 6859 6860 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6861 6862 alu.src[1].sel = alu.dst.sel; 6863 alu.src[1].chan = i; 6864 6865 alu.last = i == last_inst; 6866 r = r600_bytecode_add_alu(ctx->bc, &alu); 6867 if (r) 6868 return r; 6869 } 6870 return 0; 6871} 6872 6873static int tgsi_msb(struct r600_shader_ctx *ctx) 6874{ 6875 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 6876 struct r600_bytecode_alu alu; 6877 int i, r, t1, t2; 6878 6879 unsigned write_mask = inst->Dst[0].Register.WriteMask; 6880 int 
last_inst = tgsi_last_instruction(write_mask); 6881 6882 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT || 6883 ctx->inst_info->op == ALU_OP1_FFBH_UINT); 6884 6885 t1 = ctx->temp_reg; 6886 6887 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */ 6888 for (i = 0; i < 4; i++) { 6889 if (!(write_mask & (1<<i))) 6890 continue; 6891 6892 /* t1 = FFBH_INT / FFBH_UINT */ 6893 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6894 alu.op = ctx->inst_info->op; 6895 alu.dst.sel = t1; 6896 alu.dst.chan = i; 6897 alu.dst.write = 1; 6898 alu.last = i == last_inst; 6899 6900 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6901 6902 r = r600_bytecode_add_alu(ctx->bc, &alu); 6903 if (r) 6904 return r; 6905 } 6906 6907 t2 = r600_get_temp(ctx); 6908 6909 for (i = 0; i < 4; i++) { 6910 if (!(write_mask & (1<<i))) 6911 continue; 6912 6913 /* t2 = 31 - t1 */ 6914 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6915 alu.op = ALU_OP2_SUB_INT; 6916 alu.dst.sel = t2; 6917 alu.dst.chan = i; 6918 alu.dst.write = 1; 6919 alu.last = i == last_inst; 6920 6921 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 6922 alu.src[0].value = 31; 6923 alu.src[1].sel = t1; 6924 alu.src[1].chan = i; 6925 6926 r = r600_bytecode_add_alu(ctx->bc, &alu); 6927 if (r) 6928 return r; 6929 } 6930 6931 for (i = 0; i < 4; i++) { 6932 if (!(write_mask & (1<<i))) 6933 continue; 6934 6935 /* result = t1 >= 0 ? 
t2 : t1 */ 6936 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6937 alu.op = ALU_OP3_CNDGE_INT; 6938 alu.is_op3 = 1; 6939 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6940 alu.dst.chan = i; 6941 alu.dst.write = 1; 6942 alu.last = i == last_inst; 6943 6944 alu.src[0].sel = t1; 6945 alu.src[0].chan = i; 6946 alu.src[1].sel = t2; 6947 alu.src[1].chan = i; 6948 alu.src[2].sel = t1; 6949 alu.src[2].chan = i; 6950 6951 r = r600_bytecode_add_alu(ctx->bc, &alu); 6952 if (r) 6953 return r; 6954 } 6955 6956 return 0; 6957} 6958 6959static int tgsi_interp_egcm(struct r600_shader_ctx *ctx) 6960{ 6961 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 6962 struct r600_bytecode_alu alu; 6963 int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti; 6964 unsigned location; 6965 const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs; 6966 6967 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT); 6968 6969 /* Interpolators have been marked for use already by allocate_system_value_inputs */ 6970 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 6971 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 6972 location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */ 6973 } 6974 else { 6975 location = TGSI_INTERPOLATE_LOC_CENTROID; 6976 } 6977 6978 k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location); 6979 if (k < 0) 6980 k = 0; 6981 interp_gpr = ctx->eg_interpolators[k].ij_index / 2; 6982 interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2); 6983 6984 /* NOTE: currently offset is not perspective correct */ 6985 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 6986 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 6987 int sample_gpr = -1; 6988 int gradientsH, gradientsV; 6989 struct r600_bytecode_tex tex; 6990 6991 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 6992 sample_gpr = load_sample_position(ctx, &ctx->src[1], 
ctx->src[1].swizzle[0]); 6993 } 6994 6995 gradientsH = r600_get_temp(ctx); 6996 gradientsV = r600_get_temp(ctx); 6997 for (i = 0; i < 2; i++) { 6998 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 6999 tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V; 7000 tex.src_gpr = interp_gpr; 7001 tex.src_sel_x = interp_base_chan + 0; 7002 tex.src_sel_y = interp_base_chan + 1; 7003 tex.src_sel_z = 0; 7004 tex.src_sel_w = 0; 7005 tex.dst_gpr = i == 0 ? gradientsH : gradientsV; 7006 tex.dst_sel_x = 0; 7007 tex.dst_sel_y = 1; 7008 tex.dst_sel_z = 7; 7009 tex.dst_sel_w = 7; 7010 tex.inst_mod = 1; // Use per pixel gradient calculation 7011 tex.sampler_id = 0; 7012 tex.resource_id = tex.sampler_id; 7013 r = r600_bytecode_add_tex(ctx->bc, &tex); 7014 if (r) 7015 return r; 7016 } 7017 7018 for (i = 0; i < 2; i++) { 7019 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7020 alu.op = ALU_OP3_MULADD; 7021 alu.is_op3 = 1; 7022 alu.src[0].sel = gradientsH; 7023 alu.src[0].chan = i; 7024 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 7025 alu.src[1].sel = sample_gpr; 7026 alu.src[1].chan = 2; 7027 } 7028 else { 7029 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); 7030 } 7031 alu.src[2].sel = interp_gpr; 7032 alu.src[2].chan = interp_base_chan + i; 7033 alu.dst.sel = ctx->temp_reg; 7034 alu.dst.chan = i; 7035 alu.last = i == 1; 7036 7037 r = r600_bytecode_add_alu(ctx->bc, &alu); 7038 if (r) 7039 return r; 7040 } 7041 7042 for (i = 0; i < 2; i++) { 7043 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7044 alu.op = ALU_OP3_MULADD; 7045 alu.is_op3 = 1; 7046 alu.src[0].sel = gradientsV; 7047 alu.src[0].chan = i; 7048 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 7049 alu.src[1].sel = sample_gpr; 7050 alu.src[1].chan = 3; 7051 } 7052 else { 7053 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1); 7054 } 7055 alu.src[2].sel = ctx->temp_reg; 7056 alu.src[2].chan = i; 7057 alu.dst.sel = ctx->temp_reg; 7058 alu.dst.chan = i; 7059 alu.last = i 
== 1; 7060 7061 r = r600_bytecode_add_alu(ctx->bc, &alu); 7062 if (r) 7063 return r; 7064 } 7065 } 7066 7067 tmp = r600_get_temp(ctx); 7068 for (i = 0; i < 8; i++) { 7069 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7070 alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY; 7071 7072 alu.dst.sel = tmp; 7073 if ((i > 1 && i < 6)) { 7074 alu.dst.write = 1; 7075 } 7076 else { 7077 alu.dst.write = 0; 7078 } 7079 alu.dst.chan = i % 4; 7080 7081 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 7082 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 7083 alu.src[0].sel = ctx->temp_reg; 7084 alu.src[0].chan = 1 - (i % 2); 7085 } else { 7086 alu.src[0].sel = interp_gpr; 7087 alu.src[0].chan = interp_base_chan + 1 - (i % 2); 7088 } 7089 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos; 7090 alu.src[1].chan = 0; 7091 7092 alu.last = i % 4 == 3; 7093 alu.bank_swizzle_force = SQ_ALU_VEC_210; 7094 7095 r = r600_bytecode_add_alu(ctx->bc, &alu); 7096 if (r) 7097 return r; 7098 } 7099 7100 // INTERP can't swizzle dst 7101 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 7102 for (i = 0; i <= lasti; i++) { 7103 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 7104 continue; 7105 7106 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7107 alu.op = ALU_OP1_MOV; 7108 alu.src[0].sel = tmp; 7109 alu.src[0].chan = ctx->src[0].swizzle[i]; 7110 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 7111 alu.dst.write = 1; 7112 alu.last = i == lasti; 7113 r = r600_bytecode_add_alu(ctx->bc, &alu); 7114 if (r) 7115 return r; 7116 } 7117 7118 return 0; 7119} 7120 7121 7122static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst) 7123{ 7124 struct r600_bytecode_alu alu; 7125 int i, r; 7126 7127 for (i = 0; i < 4; i++) { 7128 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7129 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) { 7130 alu.op = ALU_OP0_NOP; 7131 alu.dst.chan = i; 7132 } else { 7133 
alu.op = ALU_OP1_MOV; 7134 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 7135 alu.src[0].sel = ctx->temp_reg; 7136 alu.src[0].chan = i; 7137 } 7138 if (i == 3) { 7139 alu.last = 1; 7140 } 7141 r = r600_bytecode_add_alu(ctx->bc, &alu); 7142 if (r) 7143 return r; 7144 } 7145 return 0; 7146} 7147 7148static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx, 7149 unsigned writemask, 7150 struct r600_bytecode_alu_src *bc_src, 7151 const struct r600_shader_src *shader_src) 7152{ 7153 struct r600_bytecode_alu alu; 7154 int i, r; 7155 int lasti = tgsi_last_instruction(writemask); 7156 int temp_reg = 0; 7157 7158 r600_bytecode_src(&bc_src[0], shader_src, 0); 7159 r600_bytecode_src(&bc_src[1], shader_src, 1); 7160 r600_bytecode_src(&bc_src[2], shader_src, 2); 7161 r600_bytecode_src(&bc_src[3], shader_src, 3); 7162 7163 if (bc_src->abs) { 7164 temp_reg = r600_get_temp(ctx); 7165 7166 for (i = 0; i < lasti + 1; i++) { 7167 if (!(writemask & (1 << i))) 7168 continue; 7169 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7170 alu.op = ALU_OP1_MOV; 7171 alu.dst.sel = temp_reg; 7172 alu.dst.chan = i; 7173 alu.dst.write = 1; 7174 alu.src[0] = bc_src[i]; 7175 if (i == lasti) { 7176 alu.last = 1; 7177 } 7178 r = r600_bytecode_add_alu(ctx->bc, &alu); 7179 if (r) 7180 return r; 7181 memset(&bc_src[i], 0, sizeof(*bc_src)); 7182 bc_src[i].sel = temp_reg; 7183 bc_src[i].chan = i; 7184 } 7185 } 7186 return 0; 7187} 7188 7189static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst) 7190{ 7191 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7192 struct r600_bytecode_alu alu; 7193 struct r600_bytecode_alu_src srcs[4][4]; 7194 int i, j, r; 7195 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 7196 unsigned op = ctx->inst_info->op; 7197 7198 if (op == ALU_OP3_MULADD_IEEE && 7199 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS]) 7200 op = ALU_OP3_MULADD; 7201 7202 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 7203 r = 
tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask, 7204 srcs[j], &ctx->src[j]); 7205 if (r) 7206 return r; 7207 } 7208 7209 for (i = 0; i < lasti + 1; i++) { 7210 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 7211 continue; 7212 7213 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7214 alu.op = op; 7215 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 7216 alu.src[j] = srcs[j][i]; 7217 } 7218 7219 if (dst == -1) { 7220 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 7221 } else { 7222 alu.dst.sel = dst; 7223 } 7224 alu.dst.chan = i; 7225 alu.dst.write = 1; 7226 alu.is_op3 = 1; 7227 if (i == lasti) { 7228 alu.last = 1; 7229 } 7230 r = r600_bytecode_add_alu(ctx->bc, &alu); 7231 if (r) 7232 return r; 7233 } 7234 return 0; 7235} 7236 7237static int tgsi_op3(struct r600_shader_ctx *ctx) 7238{ 7239 return tgsi_op3_dst(ctx, -1); 7240} 7241 7242static int tgsi_dp(struct r600_shader_ctx *ctx) 7243{ 7244 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7245 struct r600_bytecode_alu alu; 7246 int i, j, r; 7247 unsigned op = ctx->inst_info->op; 7248 if (op == ALU_OP2_DOT4_IEEE && 7249 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS]) 7250 op = ALU_OP2_DOT4; 7251 7252 for (i = 0; i < 4; i++) { 7253 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7254 alu.op = op; 7255 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 7256 r600_bytecode_src(&alu.src[j], &ctx->src[j], i); 7257 } 7258 7259 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 7260 alu.dst.chan = i; 7261 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; 7262 /* handle some special cases */ 7263 switch (inst->Instruction.Opcode) { 7264 case TGSI_OPCODE_DP2: 7265 if (i > 1) { 7266 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0; 7267 alu.src[0].chan = alu.src[1].chan = 0; 7268 } 7269 break; 7270 case TGSI_OPCODE_DP3: 7271 if (i > 2) { 7272 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0; 7273 alu.src[0].chan = alu.src[1].chan = 0; 7274 } 7275 break; 7276 default: 
7277 break; 7278 } 7279 if (i == 3) { 7280 alu.last = 1; 7281 } 7282 r = r600_bytecode_add_alu(ctx->bc, &alu); 7283 if (r) 7284 return r; 7285 } 7286 return 0; 7287} 7288 7289static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx, 7290 unsigned index) 7291{ 7292 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7293 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY && 7294 inst->Src[index].Register.File != TGSI_FILE_INPUT && 7295 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) || 7296 ctx->src[index].neg || ctx->src[index].abs || 7297 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY); 7298} 7299 7300static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx, 7301 unsigned index) 7302{ 7303 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7304 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index; 7305} 7306 7307static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading) 7308{ 7309 struct r600_bytecode_vtx vtx; 7310 struct r600_bytecode_alu alu; 7311 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7312 int src_gpr, r, i; 7313 int id = tgsi_tex_get_src_gpr(ctx, 1); 7314 int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 7315 7316 src_gpr = tgsi_tex_get_src_gpr(ctx, 0); 7317 if (src_requires_loading) { 7318 for (i = 0; i < 4; i++) { 7319 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7320 alu.op = ALU_OP1_MOV; 7321 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 7322 alu.dst.sel = ctx->temp_reg; 7323 alu.dst.chan = i; 7324 if (i == 3) 7325 alu.last = 1; 7326 alu.dst.write = 1; 7327 r = r600_bytecode_add_alu(ctx->bc, &alu); 7328 if (r) 7329 return r; 7330 } 7331 src_gpr = ctx->temp_reg; 7332 } 7333 7334 memset(&vtx, 0, sizeof(vtx)); 7335 vtx.op = FETCH_OP_VFETCH; 7336 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS; 7337 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 7338 vtx.src_gpr = src_gpr; 7339 vtx.mega_fetch_count = 16; 7340 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 7341 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */ 7342 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */ 7343 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */ 7344 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
3 : 7; /* SEL_W */ 7345 vtx.use_const_fields = 1; 7346 vtx.buffer_index_mode = sampler_index_mode; 7347 7348 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx))) 7349 return r; 7350 7351 if (ctx->bc->chip_class >= EVERGREEN) 7352 return 0; 7353 7354 for (i = 0; i < 4; i++) { 7355 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 7356 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 7357 continue; 7358 7359 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7360 alu.op = ALU_OP2_AND_INT; 7361 7362 alu.dst.chan = i; 7363 alu.dst.sel = vtx.dst_gpr; 7364 alu.dst.write = 1; 7365 7366 alu.src[0].sel = vtx.dst_gpr; 7367 alu.src[0].chan = i; 7368 7369 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL; 7370 alu.src[1].sel += (id * 2); 7371 alu.src[1].chan = i % 4; 7372 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 7373 7374 if (i == lasti) 7375 alu.last = 1; 7376 r = r600_bytecode_add_alu(ctx->bc, &alu); 7377 if (r) 7378 return r; 7379 } 7380 7381 if (inst->Dst[0].Register.WriteMask & 3) { 7382 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7383 alu.op = ALU_OP2_OR_INT; 7384 7385 alu.dst.chan = 3; 7386 alu.dst.sel = vtx.dst_gpr; 7387 alu.dst.write = 1; 7388 7389 alu.src[0].sel = vtx.dst_gpr; 7390 alu.src[0].chan = 3; 7391 7392 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1; 7393 alu.src[1].chan = 0; 7394 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 7395 7396 alu.last = 1; 7397 r = r600_bytecode_add_alu(ctx->bc, &alu); 7398 if (r) 7399 return r; 7400 } 7401 return 0; 7402} 7403 7404static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base) 7405{ 7406 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7407 int r; 7408 int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset; 7409 int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 7410 7411 if (ctx->bc->chip_class < EVERGREEN) { 7412 struct r600_bytecode_alu alu; 7413 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7414 alu.op = ALU_OP1_MOV; 7415 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; 7416 /* r600 we have them at channel 2 of the second dword */ 7417 alu.src[0].sel += (id * 2) + 1; 7418 alu.src[0].chan = 1; 7419 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 7420 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 7421 alu.last = 1; 7422 r = r600_bytecode_add_alu(ctx->bc, &alu); 7423 if (r) 7424 return r; 7425 return 0; 7426 } else { 7427 struct r600_bytecode_vtx vtx; 7428 memset(&vtx, 0, sizeof(vtx)); 7429 vtx.op = FETCH_OP_GET_BUFFER_RESINFO; 7430 vtx.buffer_id = id + eg_buffer_base; 7431 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 7432 vtx.src_gpr = 0; 7433 vtx.mega_fetch_count = 16; /* no idea here really... */ 7434 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 7435 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */ 7436 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7; /* SEL_Y */ 7437 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7; /* SEL_Z */ 7438 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
4 : 7; /* SEL_W */ 7439 vtx.data_format = FMT_32_32_32_32; 7440 vtx.buffer_index_mode = sampler_index_mode; 7441 7442 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx))) 7443 return r; 7444 return 0; 7445 } 7446} 7447 7448 7449static int tgsi_tex(struct r600_shader_ctx *ctx) 7450{ 7451 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7452 struct r600_bytecode_tex tex; 7453 struct r600_bytecode_tex grad_offs[3]; 7454 struct r600_bytecode_alu alu; 7455 unsigned src_gpr; 7456 int r, i, j, n_grad_offs = 0; 7457 int opcode; 7458 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing && 7459 inst->Instruction.Opcode == TGSI_OPCODE_TXF && 7460 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA || 7461 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA); 7462 7463 bool txf_add_offsets = inst->Texture.NumOffsets && 7464 inst->Instruction.Opcode == TGSI_OPCODE_TXF && 7465 inst->Texture.Texture != TGSI_TEXTURE_BUFFER; 7466 7467 /* Texture fetch instructions can only use gprs as source. 
7468 * Also they cannot negate the source or take the absolute value */ 7469 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS && 7470 tgsi_tex_src_requires_loading(ctx, 0)) || 7471 read_compressed_msaa || txf_add_offsets; 7472 7473 boolean src_loaded = FALSE; 7474 unsigned sampler_src_reg = 1; 7475 int8_t offset_x = 0, offset_y = 0, offset_z = 0; 7476 boolean has_txq_cube_array_z = false; 7477 unsigned sampler_index_mode; 7478 int array_index_offset_channel = -1; 7479 7480 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ && 7481 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 7482 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY))) 7483 if (inst->Dst[0].Register.WriteMask & 4) { 7484 ctx->shader->has_txq_cube_array_z_comp = true; 7485 has_txq_cube_array_z = true; 7486 } 7487 7488 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 || 7489 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7490 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 || 7491 inst->Instruction.Opcode == TGSI_OPCODE_TG4) 7492 sampler_src_reg = 2; 7493 7494 /* TGSI moves the sampler to src reg 3 for TXD */ 7495 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) 7496 sampler_src_reg = 3; 7497 7498 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 7499 7500 src_gpr = tgsi_tex_get_src_gpr(ctx, 0); 7501 7502 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) { 7503 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) { 7504 if (ctx->bc->chip_class < EVERGREEN) 7505 ctx->shader->uses_tex_buffers = true; 7506 return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS); 7507 } 7508 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) { 7509 if (ctx->bc->chip_class < EVERGREEN) 7510 ctx->shader->uses_tex_buffers = true; 7511 return do_vtx_fetch_inst(ctx, src_requires_loading); 7512 } 7513 } 7514 7515 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) { 7516 int out_chan; 7517 /* Add perspective divide */ 7518 if (ctx->bc->chip_class == CAYMAN) { 7519 out_chan = 2; 7520 for (i = 0; i < 3; i++) { 7521 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7522 alu.op = ALU_OP1_RECIP_IEEE; 7523 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7524 7525 alu.dst.sel = ctx->temp_reg; 7526 alu.dst.chan = i; 7527 if (i == 2) 7528 alu.last = 1; 7529 if (out_chan == i) 7530 alu.dst.write = 1; 7531 r = r600_bytecode_add_alu(ctx->bc, &alu); 7532 if (r) 7533 return r; 7534 } 7535 7536 } else { 7537 out_chan = 3; 7538 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7539 alu.op = ALU_OP1_RECIP_IEEE; 7540 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7541 7542 alu.dst.sel = ctx->temp_reg; 7543 alu.dst.chan = out_chan; 7544 alu.last = 1; 7545 alu.dst.write = 1; 7546 r = r600_bytecode_add_alu(ctx->bc, &alu); 7547 if (r) 7548 return r; 7549 } 7550 7551 for (i = 0; i < 3; i++) { 7552 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7553 alu.op = ALU_OP2_MUL; 7554 alu.src[0].sel = ctx->temp_reg; 7555 alu.src[0].chan = out_chan; 7556 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 7557 alu.dst.sel = ctx->temp_reg; 7558 alu.dst.chan = i; 7559 alu.dst.write = 1; 7560 r = r600_bytecode_add_alu(ctx->bc, &alu); 7561 if (r) 7562 return r; 7563 } 7564 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7565 
alu.op = ALU_OP1_MOV; 7566 alu.src[0].sel = V_SQ_ALU_SRC_1; 7567 alu.src[0].chan = 0; 7568 alu.dst.sel = ctx->temp_reg; 7569 alu.dst.chan = 3; 7570 alu.last = 1; 7571 alu.dst.write = 1; 7572 r = r600_bytecode_add_alu(ctx->bc, &alu); 7573 if (r) 7574 return r; 7575 src_loaded = TRUE; 7576 src_gpr = ctx->temp_reg; 7577 } 7578 7579 7580 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || 7581 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 7582 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 7583 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && 7584 inst->Instruction.Opcode != TGSI_OPCODE_TXQ) { 7585 7586 static const unsigned src0_swizzle[] = {2, 2, 0, 1}; 7587 static const unsigned src1_swizzle[] = {1, 0, 2, 2}; 7588 7589 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */ 7590 for (i = 0; i < 4; i++) { 7591 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7592 alu.op = ALU_OP2_CUBE; 7593 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]); 7594 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]); 7595 alu.dst.sel = ctx->temp_reg; 7596 alu.dst.chan = i; 7597 if (i == 3) 7598 alu.last = 1; 7599 alu.dst.write = 1; 7600 r = r600_bytecode_add_alu(ctx->bc, &alu); 7601 if (r) 7602 return r; 7603 } 7604 7605 /* tmp1.z = RCP_e(|tmp1.z|) */ 7606 if (ctx->bc->chip_class == CAYMAN) { 7607 for (i = 0; i < 3; i++) { 7608 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7609 alu.op = ALU_OP1_RECIP_IEEE; 7610 alu.src[0].sel = ctx->temp_reg; 7611 alu.src[0].chan = 2; 7612 alu.src[0].abs = 1; 7613 alu.dst.sel = ctx->temp_reg; 7614 alu.dst.chan = i; 7615 if (i == 2) 7616 alu.dst.write = 1; 7617 if (i == 2) 7618 alu.last = 1; 7619 r = r600_bytecode_add_alu(ctx->bc, &alu); 7620 if (r) 7621 return r; 7622 } 7623 } else { 7624 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7625 alu.op = ALU_OP1_RECIP_IEEE; 7626 alu.src[0].sel = ctx->temp_reg; 7627 alu.src[0].chan = 2; 7628 alu.src[0].abs = 1; 7629 alu.dst.sel = ctx->temp_reg; 7630 alu.dst.chan = 2; 7631 
alu.dst.write = 1; 7632 alu.last = 1; 7633 r = r600_bytecode_add_alu(ctx->bc, &alu); 7634 if (r) 7635 return r; 7636 } 7637 7638 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x 7639 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x 7640 * muladd has no writemask, have to use another temp 7641 */ 7642 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7643 alu.op = ALU_OP3_MULADD; 7644 alu.is_op3 = 1; 7645 7646 alu.src[0].sel = ctx->temp_reg; 7647 alu.src[0].chan = 0; 7648 alu.src[1].sel = ctx->temp_reg; 7649 alu.src[1].chan = 2; 7650 7651 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 7652 alu.src[2].chan = 0; 7653 alu.src[2].value = u_bitcast_f2u(1.5f); 7654 7655 alu.dst.sel = ctx->temp_reg; 7656 alu.dst.chan = 0; 7657 alu.dst.write = 1; 7658 7659 r = r600_bytecode_add_alu(ctx->bc, &alu); 7660 if (r) 7661 return r; 7662 7663 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7664 alu.op = ALU_OP3_MULADD; 7665 alu.is_op3 = 1; 7666 7667 alu.src[0].sel = ctx->temp_reg; 7668 alu.src[0].chan = 1; 7669 alu.src[1].sel = ctx->temp_reg; 7670 alu.src[1].chan = 2; 7671 7672 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 7673 alu.src[2].chan = 0; 7674 alu.src[2].value = u_bitcast_f2u(1.5f); 7675 7676 alu.dst.sel = ctx->temp_reg; 7677 alu.dst.chan = 1; 7678 alu.dst.write = 1; 7679 7680 alu.last = 1; 7681 r = r600_bytecode_add_alu(ctx->bc, &alu); 7682 if (r) 7683 return r; 7684 /* write initial compare value into Z component 7685 - W src 0 for shadow cube 7686 - X src 1 for shadow cube array */ 7687 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 7688 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 7689 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7690 alu.op = ALU_OP1_MOV; 7691 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) 7692 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 7693 else 7694 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7695 alu.dst.sel = ctx->temp_reg; 7696 alu.dst.chan = 2; 7697 alu.dst.write = 1; 7698 alu.last = 1; 7699 r = 
r600_bytecode_add_alu(ctx->bc, &alu); 7700 if (r) 7701 return r; 7702 } 7703 7704 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 7705 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 7706 if (ctx->bc->chip_class >= EVERGREEN) { 7707 int mytmp = r600_get_temp(ctx); 7708 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7709 alu.op = ALU_OP1_MOV; 7710 alu.src[0].sel = ctx->temp_reg; 7711 alu.src[0].chan = 3; 7712 alu.dst.sel = mytmp; 7713 alu.dst.chan = 0; 7714 alu.dst.write = 1; 7715 alu.last = 1; 7716 r = r600_bytecode_add_alu(ctx->bc, &alu); 7717 if (r) 7718 return r; 7719 7720 /* Evaluate the array index according to floor(idx + 0.5). This 7721 * needs to be done before merging the face select value, because 7722 * otherwise the fractional part of the array index will interfere 7723 * with the face select value */ 7724 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7725 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7726 alu.op = ALU_OP1_RNDNE; 7727 alu.dst.sel = ctx->temp_reg; 7728 alu.dst.chan = 3; 7729 alu.dst.write = 1; 7730 alu.last = 1; 7731 r = r600_bytecode_add_alu(ctx->bc, &alu); 7732 if (r) 7733 return r; 7734 7735 /* Because the array slice index and the cube face index are merged 7736 * into one value we have to make sure the array slice index is >= 0, 7737 * otherwise the face selection will fail */ 7738 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7739 alu.op = ALU_OP2_MAX; 7740 alu.src[0].sel = ctx->temp_reg; 7741 alu.src[0].chan = 3; 7742 alu.src[1].sel = V_SQ_ALU_SRC_0; 7743 alu.dst.sel = ctx->temp_reg; 7744 alu.dst.chan = 3; 7745 alu.dst.write = 1; 7746 alu.last = 1; 7747 r = r600_bytecode_add_alu(ctx->bc, &alu); 7748 if (r) 7749 return r; 7750 7751 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */ 7752 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7753 alu.op = ALU_OP3_MULADD; 7754 alu.is_op3 = 1; 7755 alu.src[0].sel = ctx->temp_reg; 7756 alu.src[0].chan = 3; 7757 alu.src[1].sel = 
V_SQ_ALU_SRC_LITERAL; 7758 alu.src[1].chan = 0; 7759 alu.src[1].value = u_bitcast_f2u(8.0f); 7760 alu.src[2].sel = mytmp; 7761 alu.src[2].chan = 0; 7762 alu.dst.sel = ctx->temp_reg; 7763 alu.dst.chan = 3; 7764 alu.dst.write = 1; 7765 alu.last = 1; 7766 r = r600_bytecode_add_alu(ctx->bc, &alu); 7767 if (r) 7768 return r; 7769 } else if (ctx->bc->chip_class < EVERGREEN) { 7770 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 7771 tex.op = FETCH_OP_SET_CUBEMAP_INDEX; 7772 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7773 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 7774 tex.src_gpr = r600_get_temp(ctx); 7775 tex.src_sel_x = 0; 7776 tex.src_sel_y = 0; 7777 tex.src_sel_z = 0; 7778 tex.src_sel_w = 0; 7779 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7; 7780 tex.coord_type_x = 1; 7781 tex.coord_type_y = 1; 7782 tex.coord_type_z = 1; 7783 tex.coord_type_w = 1; 7784 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7785 alu.op = ALU_OP1_MOV; 7786 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7787 alu.dst.sel = tex.src_gpr; 7788 alu.dst.chan = 0; 7789 alu.last = 1; 7790 alu.dst.write = 1; 7791 r = r600_bytecode_add_alu(ctx->bc, &alu); 7792 if (r) 7793 return r; 7794 7795 r = r600_bytecode_add_tex(ctx->bc, &tex); 7796 if (r) 7797 return r; 7798 } 7799 7800 } 7801 7802 /* for cube forms of lod and bias we need to route things */ 7803 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB || 7804 inst->Instruction.Opcode == TGSI_OPCODE_TXL || 7805 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7806 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) { 7807 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7808 alu.op = ALU_OP1_MOV; 7809 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7810 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) 7811 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 7812 else 7813 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7814 alu.dst.sel = ctx->temp_reg; 7815 alu.dst.chan = 2; 7816 alu.last = 1; 7817 
alu.dst.write = 1; 7818 r = r600_bytecode_add_alu(ctx->bc, &alu); 7819 if (r) 7820 return r; 7821 } 7822 7823 src_loaded = TRUE; 7824 src_gpr = ctx->temp_reg; 7825 } 7826 7827 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) { 7828 int temp_h = 0, temp_v = 0; 7829 int start_val = 0; 7830 7831 /* if we've already loaded the src (i.e. CUBE don't reload it). */ 7832 if (src_loaded == TRUE) 7833 start_val = 1; 7834 else 7835 src_loaded = TRUE; 7836 for (i = start_val; i < 3; i++) { 7837 int treg = r600_get_temp(ctx); 7838 7839 if (i == 0) 7840 src_gpr = treg; 7841 else if (i == 1) 7842 temp_h = treg; 7843 else 7844 temp_v = treg; 7845 7846 for (j = 0; j < 4; j++) { 7847 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7848 alu.op = ALU_OP1_MOV; 7849 r600_bytecode_src(&alu.src[0], &ctx->src[i], j); 7850 alu.dst.sel = treg; 7851 alu.dst.chan = j; 7852 if (j == 3) 7853 alu.last = 1; 7854 alu.dst.write = 1; 7855 r = r600_bytecode_add_alu(ctx->bc, &alu); 7856 if (r) 7857 return r; 7858 } 7859 } 7860 for (i = 1; i < 3; i++) { 7861 /* set gradients h/v */ 7862 struct r600_bytecode_tex *t = &grad_offs[n_grad_offs++]; 7863 memset(t, 0, sizeof(struct r600_bytecode_tex)); 7864 t->op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H : 7865 FETCH_OP_SET_GRADIENTS_V; 7866 t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7867 t->sampler_index_mode = sampler_index_mode; 7868 t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS; 7869 t->resource_index_mode = sampler_index_mode; 7870 7871 t->src_gpr = (i == 1) ? 
temp_h : temp_v; 7872 t->src_sel_x = 0; 7873 t->src_sel_y = 1; 7874 t->src_sel_z = 2; 7875 t->src_sel_w = 3; 7876 7877 t->dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */ 7878 t->dst_sel_x = t->dst_sel_y = t->dst_sel_z = t->dst_sel_w = 7; 7879 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) { 7880 t->coord_type_x = 1; 7881 t->coord_type_y = 1; 7882 t->coord_type_z = 1; 7883 t->coord_type_w = 1; 7884 } 7885 } 7886 } 7887 7888 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) { 7889 /* Gather4 should follow the same rules as bilinear filtering, but the hardware 7890 * incorrectly forces nearest filtering if the texture format is integer. 7891 * The only effect it has on Gather4, which always returns 4 texels for 7892 * bilinear filtering, is that the final coordinates are off by 0.5 of 7893 * the texel size. 7894 * 7895 * The workaround is to subtract 0.5 from the unnormalized coordinates, 7896 * or (0.5 / size) from the normalized coordinates. 7897 */ 7898 if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT || 7899 inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) { 7900 int treg = r600_get_temp(ctx); 7901 7902 /* mov array and comparison oordinate to temp_reg if needed */ 7903 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 7904 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 7905 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) { 7906 int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 
3 : 2; 7907 for (i = 2; i <= end; i++) { 7908 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7909 alu.op = ALU_OP1_MOV; 7910 alu.dst.sel = ctx->temp_reg; 7911 alu.dst.chan = i; 7912 alu.dst.write = 1; 7913 alu.last = (i == end); 7914 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 7915 r = r600_bytecode_add_alu(ctx->bc, &alu); 7916 if (r) 7917 return r; 7918 } 7919 } 7920 7921 if (inst->Texture.Texture == TGSI_TEXTURE_RECT || 7922 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) { 7923 for (i = 0; i < 2; i++) { 7924 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7925 alu.op = ALU_OP2_ADD; 7926 alu.dst.sel = ctx->temp_reg; 7927 alu.dst.chan = i; 7928 alu.dst.write = 1; 7929 alu.last = i == 1; 7930 if (src_loaded) { 7931 alu.src[0].sel = ctx->temp_reg; 7932 alu.src[0].chan = i; 7933 } else 7934 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 7935 alu.src[1].sel = V_SQ_ALU_SRC_0_5; 7936 alu.src[1].neg = 1; 7937 r = r600_bytecode_add_alu(ctx->bc, &alu); 7938 if (r) 7939 return r; 7940 } 7941 } else { 7942 /* execute a TXQ */ 7943 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 7944 tex.op = FETCH_OP_GET_TEXTURE_RESINFO; 7945 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7946 tex.sampler_index_mode = sampler_index_mode; 7947 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 7948 tex.resource_index_mode = sampler_index_mode; 7949 tex.dst_gpr = treg; 7950 tex.src_sel_x = 4; 7951 tex.src_sel_y = 4; 7952 tex.src_sel_z = 4; 7953 tex.src_sel_w = 4; 7954 tex.dst_sel_x = 0; 7955 tex.dst_sel_y = 1; 7956 tex.dst_sel_z = 7; 7957 tex.dst_sel_w = 7; 7958 r = r600_bytecode_add_tex(ctx->bc, &tex); 7959 if (r) 7960 return r; 7961 7962 /* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */ 7963 if (ctx->bc->chip_class == CAYMAN) { 7964 /* */ 7965 for (i = 0; i < 2; i++) { 7966 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7967 alu.op = ALU_OP1_INT_TO_FLT; 7968 alu.dst.sel = treg; 7969 alu.dst.chan = i; 7970 alu.dst.write = 1; 7971 
alu.src[0].sel = treg; 7972 alu.src[0].chan = i; 7973 alu.last = (i == 1) ? 1 : 0; 7974 r = r600_bytecode_add_alu(ctx->bc, &alu); 7975 if (r) 7976 return r; 7977 } 7978 for (j = 0; j < 2; j++) { 7979 for (i = 0; i < 3; i++) { 7980 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7981 alu.op = ALU_OP1_RECIP_IEEE; 7982 alu.src[0].sel = treg; 7983 alu.src[0].chan = j; 7984 alu.dst.sel = treg; 7985 alu.dst.chan = i; 7986 if (i == 2) 7987 alu.last = 1; 7988 if (i == j) 7989 alu.dst.write = 1; 7990 r = r600_bytecode_add_alu(ctx->bc, &alu); 7991 if (r) 7992 return r; 7993 } 7994 } 7995 } else { 7996 for (i = 0; i < 2; i++) { 7997 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7998 alu.op = ALU_OP1_INT_TO_FLT; 7999 alu.dst.sel = treg; 8000 alu.dst.chan = i; 8001 alu.dst.write = 1; 8002 alu.src[0].sel = treg; 8003 alu.src[0].chan = i; 8004 alu.last = 1; 8005 r = r600_bytecode_add_alu(ctx->bc, &alu); 8006 if (r) 8007 return r; 8008 } 8009 for (i = 0; i < 2; i++) { 8010 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8011 alu.op = ALU_OP1_RECIP_IEEE; 8012 alu.src[0].sel = treg; 8013 alu.src[0].chan = i; 8014 alu.dst.sel = treg; 8015 alu.dst.chan = i; 8016 alu.last = 1; 8017 alu.dst.write = 1; 8018 r = r600_bytecode_add_alu(ctx->bc, &alu); 8019 if (r) 8020 return r; 8021 } 8022 } 8023 for (i = 0; i < 2; i++) { 8024 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8025 alu.op = ALU_OP3_MULADD; 8026 alu.is_op3 = 1; 8027 alu.dst.sel = ctx->temp_reg; 8028 alu.dst.chan = i; 8029 alu.dst.write = 1; 8030 alu.last = i == 1; 8031 alu.src[0].sel = treg; 8032 alu.src[0].chan = i; 8033 alu.src[1].sel = V_SQ_ALU_SRC_0_5; 8034 alu.src[1].neg = 1; 8035 if (src_loaded) { 8036 alu.src[2].sel = ctx->temp_reg; 8037 alu.src[2].chan = i; 8038 } else 8039 r600_bytecode_src(&alu.src[2], &ctx->src[0], i); 8040 r = r600_bytecode_add_alu(ctx->bc, &alu); 8041 if (r) 8042 return r; 8043 } 8044 } 8045 src_loaded = TRUE; 8046 src_gpr = ctx->temp_reg; 8047 } 8048 } 8049 8050 if 
(src_requires_loading && !src_loaded) { 8051 for (i = 0; i < 4; i++) { 8052 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8053 alu.op = ALU_OP1_MOV; 8054 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 8055 alu.dst.sel = ctx->temp_reg; 8056 alu.dst.chan = i; 8057 if (i == 3) 8058 alu.last = 1; 8059 alu.dst.write = 1; 8060 r = r600_bytecode_add_alu(ctx->bc, &alu); 8061 if (r) 8062 return r; 8063 } 8064 src_loaded = TRUE; 8065 src_gpr = ctx->temp_reg; 8066 } 8067 8068 /* get offset values */ 8069 if (inst->Texture.NumOffsets) { 8070 assert(inst->Texture.NumOffsets == 1); 8071 8072 /* The texture offset feature doesn't work with the TXF instruction 8073 * and must be emulated by adding the offset to the texture coordinates. */ 8074 if (txf_add_offsets) { 8075 const struct tgsi_texture_offset *off = inst->TexOffsets; 8076 8077 switch (inst->Texture.Texture) { 8078 case TGSI_TEXTURE_3D: 8079 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8080 alu.op = ALU_OP2_ADD_INT; 8081 alu.src[0].sel = src_gpr; 8082 alu.src[0].chan = 2; 8083 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8084 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ]; 8085 alu.dst.sel = src_gpr; 8086 alu.dst.chan = 2; 8087 alu.dst.write = 1; 8088 alu.last = 1; 8089 r = r600_bytecode_add_alu(ctx->bc, &alu); 8090 if (r) 8091 return r; 8092 /* fall through */ 8093 8094 case TGSI_TEXTURE_2D: 8095 case TGSI_TEXTURE_SHADOW2D: 8096 case TGSI_TEXTURE_RECT: 8097 case TGSI_TEXTURE_SHADOWRECT: 8098 case TGSI_TEXTURE_2D_ARRAY: 8099 case TGSI_TEXTURE_SHADOW2D_ARRAY: 8100 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8101 alu.op = ALU_OP2_ADD_INT; 8102 alu.src[0].sel = src_gpr; 8103 alu.src[0].chan = 1; 8104 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8105 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY]; 8106 alu.dst.sel = src_gpr; 8107 alu.dst.chan = 1; 8108 alu.dst.write = 1; 8109 alu.last = 1; 8110 r = r600_bytecode_add_alu(ctx->bc, &alu); 8111 if (r) 8112 return r; 8113 /* 
fall through */ 8114 8115 case TGSI_TEXTURE_1D: 8116 case TGSI_TEXTURE_SHADOW1D: 8117 case TGSI_TEXTURE_1D_ARRAY: 8118 case TGSI_TEXTURE_SHADOW1D_ARRAY: 8119 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8120 alu.op = ALU_OP2_ADD_INT; 8121 alu.src[0].sel = src_gpr; 8122 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8123 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX]; 8124 alu.dst.sel = src_gpr; 8125 alu.dst.write = 1; 8126 alu.last = 1; 8127 r = r600_bytecode_add_alu(ctx->bc, &alu); 8128 if (r) 8129 return r; 8130 break; 8131 /* texture offsets do not apply to other texture targets */ 8132 } 8133 } else { 8134 switch (inst->Texture.Texture) { 8135 case TGSI_TEXTURE_3D: 8136 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1; 8137 /* fallthrough */ 8138 case TGSI_TEXTURE_2D: 8139 case TGSI_TEXTURE_SHADOW2D: 8140 case TGSI_TEXTURE_RECT: 8141 case TGSI_TEXTURE_SHADOWRECT: 8142 case TGSI_TEXTURE_2D_ARRAY: 8143 case TGSI_TEXTURE_SHADOW2D_ARRAY: 8144 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1; 8145 /* fallthrough */ 8146 case TGSI_TEXTURE_1D: 8147 case TGSI_TEXTURE_SHADOW1D: 8148 case TGSI_TEXTURE_1D_ARRAY: 8149 case TGSI_TEXTURE_SHADOW1D_ARRAY: 8150 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1; 8151 } 8152 } 8153 } 8154 8155 /* Obtain the sample index for reading a compressed MSAA color texture. 8156 * To read the FMASK, we use the ldfptr instruction, which tells us 8157 * where the samples are stored. 8158 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210, 8159 * which is the identity mapping. Each nibble says which physical sample 8160 * should be fetched to get that sample. 8161 * 8162 * Assume src.z contains the sample index. It should be modified like this: 8163 * src.z = (ldfptr() >> (src.z * 4)) & 0xF; 8164 * Then fetch the texel with src. 
8165 */ 8166 if (read_compressed_msaa) { 8167 unsigned sample_chan = 3; 8168 unsigned temp = r600_get_temp(ctx); 8169 assert(src_loaded); 8170 8171 /* temp.w = ldfptr() */ 8172 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 8173 tex.op = FETCH_OP_LD; 8174 tex.inst_mod = 1; /* to indicate this is ldfptr */ 8175 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 8176 tex.sampler_index_mode = sampler_index_mode; 8177 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 8178 tex.resource_index_mode = sampler_index_mode; 8179 tex.src_gpr = src_gpr; 8180 tex.dst_gpr = temp; 8181 tex.dst_sel_x = 7; /* mask out these components */ 8182 tex.dst_sel_y = 7; 8183 tex.dst_sel_z = 7; 8184 tex.dst_sel_w = 0; /* store X */ 8185 tex.src_sel_x = 0; 8186 tex.src_sel_y = 1; 8187 tex.src_sel_z = 2; 8188 tex.src_sel_w = 3; 8189 tex.offset_x = offset_x; 8190 tex.offset_y = offset_y; 8191 tex.offset_z = offset_z; 8192 r = r600_bytecode_add_tex(ctx->bc, &tex); 8193 if (r) 8194 return r; 8195 8196 /* temp.x = sample_index*4 */ 8197 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8198 alu.op = ALU_OP2_MULLO_INT; 8199 alu.src[0].sel = src_gpr; 8200 alu.src[0].chan = sample_chan; 8201 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8202 alu.src[1].value = 4; 8203 alu.dst.sel = temp; 8204 alu.dst.chan = 0; 8205 alu.dst.write = 1; 8206 r = emit_mul_int_op(ctx->bc, &alu); 8207 if (r) 8208 return r; 8209 8210 /* sample_index = temp.w >> temp.x */ 8211 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8212 alu.op = ALU_OP2_LSHR_INT; 8213 alu.src[0].sel = temp; 8214 alu.src[0].chan = 3; 8215 alu.src[1].sel = temp; 8216 alu.src[1].chan = 0; 8217 alu.dst.sel = src_gpr; 8218 alu.dst.chan = sample_chan; 8219 alu.dst.write = 1; 8220 alu.last = 1; 8221 r = r600_bytecode_add_alu(ctx->bc, &alu); 8222 if (r) 8223 return r; 8224 8225 /* sample_index & 0xF */ 8226 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8227 alu.op = ALU_OP2_AND_INT; 8228 alu.src[0].sel = src_gpr; 8229 alu.src[0].chan 
= sample_chan; 8230 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8231 alu.src[1].value = 0xF; 8232 alu.dst.sel = src_gpr; 8233 alu.dst.chan = sample_chan; 8234 alu.dst.write = 1; 8235 alu.last = 1; 8236 r = r600_bytecode_add_alu(ctx->bc, &alu); 8237 if (r) 8238 return r; 8239#if 0 8240 /* visualize the FMASK */ 8241 for (i = 0; i < 4; i++) { 8242 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8243 alu.op = ALU_OP1_INT_TO_FLT; 8244 alu.src[0].sel = src_gpr; 8245 alu.src[0].chan = sample_chan; 8246 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 8247 alu.dst.chan = i; 8248 alu.dst.write = 1; 8249 alu.last = 1; 8250 r = r600_bytecode_add_alu(ctx->bc, &alu); 8251 if (r) 8252 return r; 8253 } 8254 return 0; 8255#endif 8256 } 8257 8258 /* does this shader want a num layers from TXQ for a cube array? */ 8259 if (has_txq_cube_array_z) { 8260 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 8261 8262 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8263 alu.op = ALU_OP1_MOV; 8264 8265 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; 8266 if (ctx->bc->chip_class >= EVERGREEN) { 8267 /* with eg each dword is number of cubes */ 8268 alu.src[0].sel += id / 4; 8269 alu.src[0].chan = id % 4; 8270 } else { 8271 /* r600 we have them at channel 2 of the second dword */ 8272 alu.src[0].sel += (id * 2) + 1; 8273 alu.src[0].chan = 2; 8274 } 8275 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 8276 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst); 8277 alu.last = 1; 8278 r = r600_bytecode_add_alu(ctx->bc, &alu); 8279 if (r) 8280 return r; 8281 /* disable writemask from texture instruction */ 8282 inst->Dst[0].Register.WriteMask &= ~4; 8283 } 8284 8285 opcode = ctx->inst_info->op; 8286 if (opcode == FETCH_OP_GATHER4 && 8287 inst->TexOffsets[0].File != TGSI_FILE_NULL && 8288 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) { 8289 struct r600_bytecode_tex *t; 8290 opcode = FETCH_OP_GATHER4_O; 8291 8292 /* GATHER4_O/GATHER4_C_O use offset values 
loaded by 8293 SET_TEXTURE_OFFSETS instruction. The immediate offset values 8294 encoded in the instruction are ignored. */ 8295 t = &grad_offs[n_grad_offs++]; 8296 memset(t, 0, sizeof(struct r600_bytecode_tex)); 8297 t->op = FETCH_OP_SET_TEXTURE_OFFSETS; 8298 t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 8299 t->sampler_index_mode = sampler_index_mode; 8300 t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS; 8301 t->resource_index_mode = sampler_index_mode; 8302 8303 t->src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index; 8304 t->src_sel_x = inst->TexOffsets[0].SwizzleX; 8305 t->src_sel_y = inst->TexOffsets[0].SwizzleY; 8306 if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 8307 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) 8308 /* make sure array index selector is 0, this is just a safety 8309 * precausion because TGSI seems to emit something strange here */ 8310 t->src_sel_z = 4; 8311 else 8312 t->src_sel_z = inst->TexOffsets[0].SwizzleZ; 8313 8314 t->src_sel_w = 4; 8315 8316 t->dst_sel_x = 7; 8317 t->dst_sel_y = 7; 8318 t->dst_sel_z = 7; 8319 t->dst_sel_w = 7; 8320 } 8321 8322 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || 8323 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 8324 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT || 8325 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 8326 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY || 8327 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY || 8328 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 8329 switch (opcode) { 8330 case FETCH_OP_SAMPLE: 8331 opcode = FETCH_OP_SAMPLE_C; 8332 break; 8333 case FETCH_OP_SAMPLE_L: 8334 opcode = FETCH_OP_SAMPLE_C_L; 8335 break; 8336 case FETCH_OP_SAMPLE_LB: 8337 opcode = FETCH_OP_SAMPLE_C_LB; 8338 break; 8339 case FETCH_OP_SAMPLE_G: 8340 opcode = FETCH_OP_SAMPLE_C_G; 8341 break; 8342 /* Texture gather variants */ 8343 case FETCH_OP_GATHER4: 8344 opcode = FETCH_OP_GATHER4_C; 8345 break; 
8346 case FETCH_OP_GATHER4_O: 8347 opcode = FETCH_OP_GATHER4_C_O; 8348 break; 8349 } 8350 } 8351 8352 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 8353 tex.op = opcode; 8354 8355 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 8356 tex.sampler_index_mode = sampler_index_mode; 8357 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 8358 tex.resource_index_mode = sampler_index_mode; 8359 tex.src_gpr = src_gpr; 8360 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 8361 8362 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE || 8363 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) { 8364 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */ 8365 } 8366 8367 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) { 8368 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX]; 8369 tex.inst_mod = texture_component_select; 8370 8371 if (ctx->bc->chip_class == CAYMAN) { 8372 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; 8373 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 8374 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; 8375 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; 8376 } else { 8377 /* GATHER4 result order is different from TGSI TG4 */ 8378 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7; 8379 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7; 8380 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7; 8381 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; 8382 } 8383 } 8384 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) { 8385 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 8386 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 
0 : 7; 8387 tex.dst_sel_z = 7; 8388 tex.dst_sel_w = 7; 8389 } 8390 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) { 8391 tex.dst_sel_x = 3; 8392 tex.dst_sel_y = 7; 8393 tex.dst_sel_z = 7; 8394 tex.dst_sel_w = 7; 8395 } 8396 else { 8397 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; 8398 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 8399 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; 8400 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; 8401 } 8402 8403 8404 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) { 8405 tex.src_sel_x = 4; 8406 tex.src_sel_y = 4; 8407 tex.src_sel_z = 4; 8408 tex.src_sel_w = 4; 8409 } else if (src_loaded) { 8410 tex.src_sel_x = 0; 8411 tex.src_sel_y = 1; 8412 tex.src_sel_z = 2; 8413 tex.src_sel_w = 3; 8414 } else { 8415 tex.src_sel_x = ctx->src[0].swizzle[0]; 8416 tex.src_sel_y = ctx->src[0].swizzle[1]; 8417 tex.src_sel_z = ctx->src[0].swizzle[2]; 8418 tex.src_sel_w = ctx->src[0].swizzle[3]; 8419 tex.src_rel = ctx->src[0].rel; 8420 } 8421 8422 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE || 8423 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 8424 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 8425 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 8426 tex.src_sel_x = 1; 8427 tex.src_sel_y = 0; 8428 tex.src_sel_z = 3; 8429 tex.src_sel_w = 2; /* route Z compare or Lod value into W */ 8430 } 8431 8432 if (inst->Texture.Texture != TGSI_TEXTURE_RECT && 8433 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) { 8434 tex.coord_type_x = 1; 8435 tex.coord_type_y = 1; 8436 } 8437 tex.coord_type_z = 1; 8438 tex.coord_type_w = 1; 8439 8440 tex.offset_x = offset_x; 8441 tex.offset_y = offset_y; 8442 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 && 8443 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 8444 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) { 8445 tex.offset_z = 0; 8446 } 8447 else { 8448 tex.offset_z = offset_z; 8449 } 8450 8451 /* Put the 
depth for comparison in W. 8452 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W. 8453 * Some instructions expect the depth in Z. */ 8454 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || 8455 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 8456 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT || 8457 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) && 8458 opcode != FETCH_OP_SAMPLE_C_L && 8459 opcode != FETCH_OP_SAMPLE_C_LB) { 8460 tex.src_sel_w = tex.src_sel_z; 8461 } 8462 8463 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY || 8464 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) { 8465 if (opcode == FETCH_OP_SAMPLE_C_L || 8466 opcode == FETCH_OP_SAMPLE_C_LB) { 8467 /* the array index is read from Y */ 8468 tex.coord_type_y = 0; 8469 array_index_offset_channel = tex.src_sel_y; 8470 } else { 8471 /* the array index is read from Z */ 8472 tex.coord_type_z = 0; 8473 tex.src_sel_z = tex.src_sel_y; 8474 array_index_offset_channel = tex.src_sel_z; 8475 } 8476 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 8477 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) { 8478 tex.coord_type_z = 0; 8479 array_index_offset_channel = tex.src_sel_z; 8480 } else if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 8481 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && 8482 (ctx->bc->chip_class >= EVERGREEN)) 8483 /* the array index is read from Z, coordinate will be corrected elsewhere */ 8484 tex.coord_type_z = 0; 8485 8486 /* We have array access to 1D or 2D ARRAY, the coordinates are not int -> 8487 * evaluate the array index */ 8488 if (array_index_offset_channel >= 0 && 8489 opcode != FETCH_OP_LD && 8490 opcode != FETCH_OP_GET_TEXTURE_RESINFO) { 8491 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8492 alu.src[0].sel = tex.src_gpr; 8493 alu.src[0].chan = array_index_offset_channel; 8494 alu.src[0].rel = tex.src_rel; 8495 alu.op = ALU_OP1_RNDNE; 8496 alu.dst.sel = tex.src_gpr; 8497 alu.dst.chan = 
array_index_offset_channel;
		alu.dst.rel = tex.src_rel;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* mask unused source components */
	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
		switch (inst->Texture.Texture) {
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_RECT:
			tex.src_sel_z = 7;
			tex.src_sel_w = 7;
			break;
		case TGSI_TEXTURE_1D_ARRAY:
			tex.src_sel_y = 7;
			tex.src_sel_w = 7;
			break;
		case TGSI_TEXTURE_1D:
			tex.src_sel_y = 7;
			tex.src_sel_z = 7;
			tex.src_sel_w = 7;
			break;
		}
	}

	/* Emit set gradient and offset instructions. */
	for (i = 0; i < n_grad_offs; ++i) {
		r = r600_bytecode_add_tex(ctx->bc, &grad_offs[i]);
		if (r)
			return r;
	}

	r = r600_bytecode_add_tex(ctx->bc, &tex);
	if (r)
		return r;

	/* add shadow ambient support - gallium doesn't do it yet */
	return 0;
}

/* Translate a TGSI HW_ATOMIC source register into the hardware atomic
 * counter index.  An indirect access is matched by its ArrayID; a direct
 * access is matched by locating the declared range that covers the index
 * (same buffer id, start <= index <= end) and adding the offset within
 * that range.  Asserts (and returns -1) if no matching range exists,
 * which would indicate a malformed shader declaration. */
static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
				  struct tgsi_full_src_register *src)
{
	unsigned i;

	if (src->Register.Indirect) {
		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
			if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
				return ctx->shader->atomics[i].hw_idx;
		}
	} else {
		uint32_t index = src->Register.Index;
		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
			if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
				continue;
			if (index > ctx->shader->atomics[i].end)
				continue;
			if (index < ctx->shader->atomics[i].start)
				continue;
			uint32_t offset = (index - ctx->shader->atomics[i].start);
			return ctx->shader->atomics[i].hw_idx + offset;
		}
	}
	assert(0);
	return -1;
}

/* Set up GDS addressing for an atomic-counter access and return the
 * uav id and uav index mode through the out-parameters.
 * On Cayman the byte offset (uav_id * 4, plus the indirect address
 * register shifted left by 2 for indirect accesses) is materialized in
 * temp_reg.x; on earlier Evergreen-class chips an indirect access is
 * instead expressed through uav index mode 2 and no ALU code is needed
 * for the direct case. */
static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
			     int *uav_id_p, int *uav_index_mode_p)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int uav_id, uav_index_mode = 0;
	int r;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);

	if (inst->Src[0].Register.Indirect) {
		if (is_cm) {
			/* temp_reg.x = addr_reg << 2 (convert counter index
			 * to a byte offset) */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_LSHL_INT;
			alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
			alu.src[0].chan = 0;
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 2;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			/* temp_reg.x += uav_id * 4 (base byte offset of the
			 * counter range) */
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   ctx->temp_reg, 0,
					   ctx->temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, uav_id * 4);
			if (r)
				return r;
		} else
			uav_index_mode = 2;
	} else if (is_cm) {
		/* direct access on Cayman: just load the byte offset */
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   ctx->temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, uav_id * 4,
				   0, 0);
		if (r)
			return r;
	}
	*uav_id_p = uav_id;
	*uav_index_mode_p = uav_index_mode;
	return 0;
}

/* Read an atomic counter (TGSI_FILE_HW_ATOMIC source) via a GDS read
 * and place the 32-bit result in the destination register's x channel. */
static int tgsi_load_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_gds gds;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = FETCH_OP_GDS_READ_RET;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	gds.uav_id = is_cm ? 0 : uav_id;
	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	/* on Cayman the GDS address comes from temp_reg.x (sel 0);
	 * otherwise the channel is unused (sel 4) and the alloc/consume
	 * path with uav_id addressing is taken instead */
	gds.src_sel_x = (is_cm) ? 0 : 4;
	gds.src_sel_y = 4;
	gds.src_sel_z = 4;
	gds.dst_sel_x = 0;
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.src_gpr2 = 0;
	gds.alloc_consume = !is_cm;
	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;

	ctx->bc->cf_last->vpm = 1;
	return 0;
}

/* Build a 4-component index vector for an image/RAT access in a fresh
 * temp register: channels that are unused for the given target are
 * written as 0, and for 1D arrays the array index (source .y) is routed
 * into the z channel.  Returns the temp GPR through *idx_gpr.
 * this fixes up 1D arrays properly */
static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r, i;
	struct r600_bytecode_alu alu;
	int temp_reg = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		bool def_val = true, write_zero = false;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp_reg;
		alu.dst.chan = i;

		switch (inst->Memory.Texture) {
		case TGSI_TEXTURE_BUFFER:
		case TGSI_TEXTURE_1D:
			if (i == 1 || i == 2 || i == 3) {
				write_zero = true;
			}
			break;
		case TGSI_TEXTURE_1D_ARRAY:
			if (i == 1 || i == 3)
				write_zero = true;
			else if (i == 2) {
				/* array index lives in .y of the source but
				 * must land in .z of the index vector */
				r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
				def_val = false;
			}
			break;
		case TGSI_TEXTURE_2D:
			if (i == 2 || i == 3)
				write_zero = true;
			break;
		default:
			if (i == 3)
				write_zero = true;
			break;
		}

		if (write_zero) {
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0;
		} else if (def_val) {
			r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
		}

		if (i == 3)
			alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	*idx_gpr = temp_reg;
	return 0;
}

/* Load the buffer byte offset from source src_idx and convert it to a
 * dword element index (>> 2) in temp_reg.x.  An immediate offset is
 * shifted at compile time; otherwise an LSHR instruction is emitted. */
static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
			     int temp_reg)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
		int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, value >> 2,
				   0, 0);
		if (r)
			return r;
	} else {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHR_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 2;
		alu.dst.sel = temp_reg;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Implement TGSI LOAD from a BUFFER resource as a vertex fetch: the
 * dword index is computed by load_buffer_coord() and a VFETCH with a
 * 1/2/3/4-dword format (chosen from the destination write mask) reads
 * the data directly into the destination GPR. */
static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int temp_reg = r600_get_temp(ctx);
	unsigned rat_index_mode;
	unsigned base;

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	/* buffer resources are bound after the image resources */
	base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];

	r = load_buffer_coord(ctx, 1, temp_reg);
	if (r)
		return r;
	ctx->bc->cf_last->barrier = 1;
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = inst->Src[0].Register.Index + base;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = temp_reg;
	vtx.src_sel_x = 0;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
	vtx.num_format_all = 1;
	vtx.format_comp_all = 1;
	vtx.srf_mode_all = 0;

	/* pick the narrowest dword format that still covers the highest
	 * written component */
	if (inst->Dst[0].Register.WriteMask & 8) {
		vtx.data_format = FMT_32_32_32_32;
		vtx.use_const_fields = 0;
	} else if (inst->Dst[0].Register.WriteMask & 4) {
		vtx.data_format = FMT_32_32_32;
		vtx.use_const_fields = 0;
	} else if (inst->Dst[0].Register.WriteMask & 2) {
		vtx.data_format = FMT_32_32;
		vtx.use_const_fields = 0;
	} else {
		vtx.data_format = FMT_32;
		vtx.use_const_fields = 0;
	}

	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	return 0;
}

/* Implement TGSI LOAD from an IMAGE resource: a MEM_RAT NOP_RTN export
 * reads the texel into the RAT return buffer, then (after WAIT_ACK) a
 * VFETCH against the immediate resource pulls the data into the
 * destination GPR with the swizzle/format of the image's pipe format. */
static int tgsi_load_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	r = load_index_src(ctx, 1, &idx_gpr);
	if (r)
		return r;

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = V_RAT_INST_NOP_RTN;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;

	/* the fetch below must not start before the RAT read completed */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;

	desc = util_format_description(inst->Memory.Format);
	r600_vertex_data_type(inst->Memory.Format,
			      &format, &num_format, &format_comp, &endian);
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = desc->swizzle[0];
	vtx.dst_sel_y = desc->swizzle[1];
	vtx.dst_sel_z = desc->swizzle[2];
	vtx.dst_sel_w = desc->swizzle[3];
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 3;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	return 0;
}

/* Translate TGSI LOAD from TGSI_FILE_MEMORY (shared/LDS memory): copy the
 * byte address into a temp and let do_lds_fetch_values() emit the LDS reads
 * for the write-masked destination channels. */
static int tgsi_load_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int temp_reg = r600_get_temp(ctx);

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.dst.sel = temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	r = do_lds_fetch_values(ctx, temp_reg,
				ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
	if (r)
		return r;
	return 0;
}

/* TGSI LOAD dispatcher: route to the RAT (image), GDS (HW atomic counter),
 * buffer (SSBO) or LDS (shared memory) implementation based on the source
 * register file. */
static int tgsi_load(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
		return tgsi_load_rat(ctx);
	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
		return tgsi_load_gds(ctx);
	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
		return tgsi_load_buffer(ctx);
	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
		return tgsi_load_lds(ctx);
	return 0;
}

/* Translate TGSI STORE to a TGSI_FILE_BUFFER (SSBO): stores are emitted one
 * dword at a time — for each write-masked component, compute element index
 * (base coord + i), move the value into temp_reg.x and emit a one-component
 * MEM_RAT STORE_TYPED. */
static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	int r, i;
	unsigned rat_index_mode;
	int lasti;
	int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);

	r = load_buffer_coord(ctx, 0, treg2);
	if (r)
		return r;

	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* zero-init all four channels of the index temp */
	for (i = 0; i <= 3; i++) {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp_reg;
		alu.dst.chan = i;
		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.last = (i == 3);
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		struct r600_bytecode_alu alu;
		if (!((1 << i) & inst->Dst[0].Register.WriteMask))
			continue;

		/* element index for this component = base coord + i */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   treg2, 0,
				   V_SQ_ALU_SRC_LITERAL, i);
		if (r)
			return r;

		/* stage the value to store in ctx->temp_reg.x */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
		cf = ctx->bc->cf_last;

		/* buffer RATs live after the image RATs */
		cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
		cf->rat.inst = V_RAT_INST_STORE_TYPED;
		cf->rat.index_mode = rat_index_mode;
		cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		cf->output.gpr = ctx->temp_reg;
		cf->output.index_gpr = temp_reg;
		cf->output.comp_mask = 1; /* single dword per store */
		cf->output.burst_count = 1;
		cf->vpm = 1;
		cf->barrier = 1;
		cf->output.elem_size = 0;
	}
	return 0;
}

/* Translate TGSI STORE to a TGSI_FILE_IMAGE resource: gather the value into
 * a GPR (copying to temp if the source is not already a temporary), then
 * emit one 4-component MEM_RAT STORE_TYPED. */
static int tgsi_store_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	bool src_requires_loading = false;
	int val_gpr, idx_gpr;
	int r, i;
	unsigned rat_index_mode;

	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	r = load_index_src(ctx, 0, &idx_gpr);
	if (r)
		return r;

	/* the RAT export reads a whole GPR; non-temporary sources must be
	 * copied into one first */
	if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
		src_requires_loading = true;

	if (src_requires_loading) {
		struct r600_bytecode_alu alu;
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		val_gpr = ctx->temp_reg;
	} else
		val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
	cf->rat.inst = V_RAT_INST_STORE_TYPED;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
	cf->output.gpr = val_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->output.elem_size = 0;
	return 0;
}

/* Translate TGSI STORE to TGSI_FILE_MEMORY (LDS): compute per-component
 * byte addresses (base + 4*i), then emit LDS_WRITE ops — using the
 * two-value LDS_WRITE_REL form when adjacent channel pairs (xy or zw) are
 * both written. */
static int tgsi_store_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i, lasti;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	/* LDS write */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp_reg.i = base address + 4*i for each written component */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 1; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* write a channel pair in one op when xy or zw are both set */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;

			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
			alu.last = 1;
			alu.is_lds_idx_op = true;
			alu.lds_idx = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1; /* consumed two components */
			continue;
		}
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;

		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

		alu.last = 1;
		alu.is_lds_idx_op = true;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* TGSI STORE dispatcher: buffer, LDS, or (default) image RAT store. */
static int tgsi_store(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
		return tgsi_store_buffer_rat(ctx);
	else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
		return tgsi_store_lds(ctx);
	else
		return tgsi_store_rat(ctx);
}

/* Translate a TGSI atomic op on an image or buffer resource into a MEM_RAT
 * atomic: stage the operand(s) in thread_id_gpr (CMPXCHG needs both the
 * compare and swap values; the compare value's channel differs between
 * Cayman and other Evergreen parts), issue the RAT atomic + WAIT_ACK, then
 * VFETCH the returned pre-op value into the destination. */
static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;
	unsigned rat_base;

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	rat_base = ctx->shader->rat_base;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* buffer resources/RATs are offset past the images */
		immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];

		r = load_buffer_coord(ctx, 1, ctx->temp_reg);
		if (r)
			return r;
		idx_gpr = ctx->temp_reg;
	} else {
		r = load_index_src(ctx, 1, &idx_gpr);
		if (r)
			return r;
	}

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
		/* swap value in .x ... */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* ... compare value in .z (Cayman) or .w (other Evergreen) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		if (ctx->bc->chip_class == CAYMAN)
			alu.dst.chan = 2;
		else
			alu.dst.chan = 3;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		/* single-operand atomic: value goes in .x */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = ctx->inst_info->op; /* the specific RAT atomic */
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	cf->cf_addr = 1;

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		desc = util_format_description(inst->Memory.Format);
		r600_vertex_data_type(inst->Memory.Format,
				      &format, &num_format, &format_comp, &endian);
		vtx.dst_sel_x = desc->swizzle[0];
	} else {
		/* buffer atomics always return a single raw dword */
		format = FMT_32;
		num_format = 1;
		format_comp = 0;
		endian = 0;
		vtx.dst_sel_x = 0;
	}
	/* fetch the atomic's returned (pre-op) value into the destination .x */
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_y = 7;
	vtx.dst_sel_z = 7;
	vtx.dst_sel_w = 7;
	vtx.use_const_fields = 0;
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 0xf;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->vpm = 1;
	cf->barrier = 1;
	return 0;
}

/* Map a TGSI atomic opcode to the corresponding GDS fetch op (returning
 * variants); -1 if the opcode has no GDS equivalent. */
static int get_gds_op(int opcode)
{
	switch (opcode) {
	case TGSI_OPCODE_ATOMUADD:
		return FETCH_OP_GDS_ADD_RET;
	case TGSI_OPCODE_ATOMAND:
		return FETCH_OP_GDS_AND_RET;
	case TGSI_OPCODE_ATOMOR:
		return FETCH_OP_GDS_OR_RET;
	case TGSI_OPCODE_ATOMXOR:
		return FETCH_OP_GDS_XOR_RET;
	case TGSI_OPCODE_ATOMUMIN:
		return FETCH_OP_GDS_MIN_UINT_RET;
	case TGSI_OPCODE_ATOMUMAX:
		return FETCH_OP_GDS_MAX_UINT_RET;
	case TGSI_OPCODE_ATOMXCHG:
		return FETCH_OP_GDS_XCHG_RET;
	case TGSI_OPCODE_ATOMCAS:
		return FETCH_OP_GDS_CMP_XCHG_RET;
	default:
		return -1;
	}
}

/* Translate a TGSI atomic on a HW atomic counter into a GDS operation.
 * Operand channel layout differs by chip: Cayman uses temp.y (value) and
 * temp.z (CAS compare); other Evergreen parts use temp.x/temp.y with
 * src_sel_x = 4 and the uav id/index-mode fields.  An immediate negative
 * ADD operand is rewritten as GDS_SUB of its absolute value. */
static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_gds gds;
	struct r600_bytecode_alu alu;
	int gds_op = get_gds_op(inst->Instruction.Opcode);
	int r;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	if (gds_op == -1) {
		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
		return -1;
	}

	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) {
		/* stage the CAS swap value (src[3]) in the chip-specific channel */
		if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) {
			int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = value;
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
		int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
		int abs_value = abs(value);
		/* GDS has no signed add with negative immediates: turn
		 * "add -n" into "sub n" */
		if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
			gds_op = FETCH_OP_GDS_SUB_RET;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = abs_value;
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}


	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = gds_op;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	gds.uav_id = is_cm ? 0 : uav_id;
	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	gds.src_gpr2 = 0;
	gds.src_sel_x = is_cm ? 0 : 4; /* non-Cayman: x selects the uav id path */
	gds.src_sel_y = is_cm ? 1 : 0;
	if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET)
		gds.src_sel_z = is_cm ? 2 : 1;
	else
		gds.src_sel_z = 7;
	gds.dst_sel_x = 0; /* only .x of the return value is kept */
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.alloc_consume = !is_cm;

	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;
	ctx->bc->cf_last->vpm = 1;
	return 0;
}

/* Map a TGSI atomic opcode to the corresponding LDS ALU op (returning
 * variants, incl. signed min/max which GDS lacks); -1 if unsupported. */
static int get_lds_op(int opcode)
{
	switch (opcode) {
	case TGSI_OPCODE_ATOMUADD:
		return LDS_OP2_LDS_ADD_RET;
	case TGSI_OPCODE_ATOMAND:
		return LDS_OP2_LDS_AND_RET;
	case TGSI_OPCODE_ATOMOR:
		return LDS_OP2_LDS_OR_RET;
	case TGSI_OPCODE_ATOMXOR:
		return LDS_OP2_LDS_XOR_RET;
	case TGSI_OPCODE_ATOMUMIN:
		return LDS_OP2_LDS_MIN_UINT_RET;
	case TGSI_OPCODE_ATOMUMAX:
		return LDS_OP2_LDS_MAX_UINT_RET;
	case TGSI_OPCODE_ATOMIMIN:
		return LDS_OP2_LDS_MIN_INT_RET;
	case TGSI_OPCODE_ATOMIMAX:
		return LDS_OP2_LDS_MAX_INT_RET;
	case TGSI_OPCODE_ATOMXCHG:
		return LDS_OP2_LDS_XCHG_RET;
	case TGSI_OPCODE_ATOMCAS:
		return LDS_OP3_LDS_CMP_XCHG_RET;
	default:
		return -1;
	}
}

/* Translate a TGSI atomic on shared (LDS) memory: issue the LDS atomic
 * (address, value[, compare]), then pop the returned pre-op value from the
 * LDS output queue (LDS_OQ_A_POP) into the destination. */
static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int lds_op = get_lds_op(inst->Instruction.Opcode);
	int r;

	struct r600_bytecode_alu alu;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = lds_op;
	alu.is_lds_idx_op = true;
	alu.last = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); /* address */
	r600_bytecode_src(&alu.src[1], &ctx->src[2], 0); /* value */
	if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
		r600_bytecode_src(&alu.src[2], &ctx->src[3], 0); /* swap value */
	else
		alu.src[2].sel = V_SQ_ALU_SRC_0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* then read from LDS_OQ_A_POP */
	memset(&alu, 0, sizeof(alu));

	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}

/* TGSI atomic dispatcher: images and buffers go through the RAT path, HW
 * atomic counters through GDS, shared memory through LDS. */
static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
		return tgsi_atomic_op_rat(ctx);
	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
		return tgsi_atomic_op_gds(ctx);
	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
		return tgsi_atomic_op_rat(ctx);
	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
		return tgsi_atomic_op_lds(ctx);
	return 0;
}

/* Translate TGSI RESQ (resource size query): buffers get a dedicated TXQ
 * path; cube arrays needing .z (layer count) read it from the driver's
 * buffer-info constants; everything else becomes a texture query op with
 * channels routed per the destination write mask. */
static int tgsi_resq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned sampler_index_mode;
	struct r600_bytecode_tex tex;
	int r;
	boolean has_txq_cube_array_z = false;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
	    (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
		if (ctx->bc->chip_class < EVERGREEN)
			ctx->shader->uses_tex_buffers = true;
		unsigned eg_buffer_base = 0;
		eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET;
		if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
			eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base);
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
	    inst->Dst[0].Register.WriteMask & 4) {
		ctx->shader->has_txq_cube_array_z_comp = true;
		has_txq_cube_array_z = true;
	}

	sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (sampler_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);


	/* does this shader want a num layers from TXQ for a cube array? */
	if (has_txq_cube_array_z) {
		int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		/* with eg each dword is either number of cubes */
		alu.src[0].sel += id / 4;
		alu.src[0].chan = id % 4;
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		/* disable writemask from texture instruction */
		inst->Dst[0].Register.WriteMask &= ~4;
	}
	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
	tex.op = ctx->inst_info->op;
	tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
	tex.sampler_index_mode = sampler_index_mode;
	tex.resource_id = tex.sampler_id;
	tex.resource_index_mode = sampler_index_mode;
	tex.src_sel_x = 4;
	tex.src_sel_y = 4;
	tex.src_sel_z = 4;
	tex.src_sel_w = 4;
	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	r = r600_bytecode_add_tex(ctx->bc, &tex);
	if (r)
		return r;

	return 0;
}

/* Translate TGSI LRP (dst = src0*src1 + (1-src0)*src2).  When src0 is the
 * constant 0.5 this collapses to (src1+src2)*0.5 via output modifier; the
 * general path computes (1-src0), multiplies by src2, then MULADDs src0*src1
 * on top. */
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	struct r600_bytecode_alu_src srcs[2][4];
	unsigned i;
	int r;

	/* optimize if it's just an equal balance */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			alu.omod = 3; /* output modifier: result * 0.5 */
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* 1 - src0 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		r600_bytecode_src_toggle_neg(&alu.src[1]);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* src0 * src1 + (1 - src0) * src2 */

	for (i = 0; i < 2; i++) {
		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
					  srcs[i], &ctx->src[i]);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;
		alu.src[0] = srcs[0][i];
		alu.src[1] = srcs[1][i];
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Translate TGSI CMP (dst = src0 < 0 ? src1 : src2) with CNDGE after
 * swapping the true/false operands; a -|src0| source is normalized to plain
 * src0 with CNDE instead (same selection, modifiers stripped). */
static int tgsi_cmp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	struct r600_bytecode_alu_src srcs[3][4];

	unsigned op;

	if (ctx->src[0].abs && ctx->src[0].neg) {
		/* -|x| >= 0 only when x == 0, so CNDGE(-|x|,a,b) == CNDE(x,a,b) */
		op = ALU_OP3_CNDE;
		ctx->src[0].abs = 0;
		ctx->src[0].neg = 0;
	} else {
		op = ALU_OP3_CNDGE;
	}

	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
					  srcs[j], &ctx->src[j]);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		/* CNDGE selects src1 when src0 >= 0, so TGSI's operands swap */
		alu.src[0] = srcs[0][i];
		alu.src[1] = srcs[2][i];
		alu.src[2] = srcs[1][i];

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Translate TGSI UCMP (dst = src0 != 0 ? src1 : src2) using integer CNDE
 * with swapped operands (CNDE selects its first operand when src0 == 0). */
static int tgsi_ucmp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Translate TGSI EXP: .x = 2^floor(src), .y = frac(src),
 * .z = 2^src (rough), .w = 1.0.  Results are built in temp_reg then copied
 * out via tgsi_helper_copy().  On Cayman the t-slot-only EXP_IEEE is issued
 * across three vector slots (see the CAYMAN notes at the top of the file). */
static int tgsi_exp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	unsigned i;

	/* result.x = 2^floor(src); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = i == 0;
				alu.last = i == 2;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.y = tmp - floor(tmp); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FRACT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
#if 0
		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (r)
			return r;
#endif
		alu.dst.write = 1;
		alu.dst.chan = 1;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = RoughApprox2ToX(tmp);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				}

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;

			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0;*/
	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return tgsi_helper_copy(ctx, inst);
}

/* Translate TGSI LOG: .x = floor(log2|src|), .y = |src| / 2^floor(log2|src|)
 * (mantissa), and further components below.  Cayman again needs the t-slot
 * LOG/EXP ops replicated across three vector slots.
 * (Function continues beyond this excerpt.) */
static int tgsi_log(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	unsigned i;

	/* result.x = floor(log2(|src|)); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 0)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
9982 alu.dst.chan = 1; 9983 alu.dst.write = 1; 9984 alu.last = 1; 9985 9986 r = r600_bytecode_add_alu(ctx->bc, &alu); 9987 if (r) 9988 return r; 9989 } 9990 9991 if (ctx->bc->chip_class == CAYMAN) { 9992 for (i = 0; i < 3; i++) { 9993 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9994 alu.op = ALU_OP1_RECIP_IEEE; 9995 alu.src[0].sel = ctx->temp_reg; 9996 alu.src[0].chan = 1; 9997 9998 alu.dst.sel = ctx->temp_reg; 9999 alu.dst.chan = i; 10000 if (i == 1) 10001 alu.dst.write = 1; 10002 if (i == 2) 10003 alu.last = 1; 10004 10005 r = r600_bytecode_add_alu(ctx->bc, &alu); 10006 if (r) 10007 return r; 10008 } 10009 } else { 10010 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10011 alu.op = ALU_OP1_RECIP_IEEE; 10012 alu.src[0].sel = ctx->temp_reg; 10013 alu.src[0].chan = 1; 10014 10015 alu.dst.sel = ctx->temp_reg; 10016 alu.dst.chan = 1; 10017 alu.dst.write = 1; 10018 alu.last = 1; 10019 10020 r = r600_bytecode_add_alu(ctx->bc, &alu); 10021 if (r) 10022 return r; 10023 } 10024 10025 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10026 10027 alu.op = ALU_OP2_MUL; 10028 10029 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10030 r600_bytecode_src_set_abs(&alu.src[0]); 10031 10032 alu.src[1].sel = ctx->temp_reg; 10033 alu.src[1].chan = 1; 10034 10035 alu.dst.sel = ctx->temp_reg; 10036 alu.dst.chan = 1; 10037 alu.dst.write = 1; 10038 alu.last = 1; 10039 10040 r = r600_bytecode_add_alu(ctx->bc, &alu); 10041 if (r) 10042 return r; 10043 } 10044 10045 /* result.z = log2(|src|);*/ 10046 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) { 10047 if (ctx->bc->chip_class == CAYMAN) { 10048 for (i = 0; i < 3; i++) { 10049 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10050 10051 alu.op = ALU_OP1_LOG_IEEE; 10052 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10053 r600_bytecode_src_set_abs(&alu.src[0]); 10054 10055 alu.dst.sel = ctx->temp_reg; 10056 if (i == 2) 10057 alu.dst.write = 1; 10058 alu.dst.chan = i; 10059 if (i == 2) 10060 alu.last = 1; 10061 10062 r = 
r600_bytecode_add_alu(ctx->bc, &alu); 10063 if (r) 10064 return r; 10065 } 10066 } else { 10067 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10068 10069 alu.op = ALU_OP1_LOG_IEEE; 10070 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10071 r600_bytecode_src_set_abs(&alu.src[0]); 10072 10073 alu.dst.sel = ctx->temp_reg; 10074 alu.dst.write = 1; 10075 alu.dst.chan = 2; 10076 alu.last = 1; 10077 10078 r = r600_bytecode_add_alu(ctx->bc, &alu); 10079 if (r) 10080 return r; 10081 } 10082 } 10083 10084 /* result.w = 1.0; */ 10085 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) { 10086 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10087 10088 alu.op = ALU_OP1_MOV; 10089 alu.src[0].sel = V_SQ_ALU_SRC_1; 10090 alu.src[0].chan = 0; 10091 10092 alu.dst.sel = ctx->temp_reg; 10093 alu.dst.chan = 3; 10094 alu.dst.write = 1; 10095 alu.last = 1; 10096 10097 r = r600_bytecode_add_alu(ctx->bc, &alu); 10098 if (r) 10099 return r; 10100 } 10101 10102 return tgsi_helper_copy(ctx, inst); 10103} 10104 10105static int tgsi_eg_arl(struct r600_shader_ctx *ctx) 10106{ 10107 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10108 struct r600_bytecode_alu alu; 10109 int r; 10110 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 10111 unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index); 10112 10113 assert(inst->Dst[0].Register.Index < 3); 10114 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10115 10116 switch (inst->Instruction.Opcode) { 10117 case TGSI_OPCODE_ARL: 10118 alu.op = ALU_OP1_FLT_TO_INT_FLOOR; 10119 break; 10120 case TGSI_OPCODE_ARR: 10121 alu.op = ALU_OP1_FLT_TO_INT; 10122 break; 10123 case TGSI_OPCODE_UARL: 10124 alu.op = ALU_OP1_MOV; 10125 break; 10126 default: 10127 assert(0); 10128 return -1; 10129 } 10130 10131 for (i = 0; i <= lasti; ++i) { 10132 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 10133 continue; 10134 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10135 alu.last = i == lasti; 
10136 alu.dst.sel = reg; 10137 alu.dst.chan = i; 10138 alu.dst.write = 1; 10139 r = r600_bytecode_add_alu(ctx->bc, &alu); 10140 if (r) 10141 return r; 10142 } 10143 10144 if (inst->Dst[0].Register.Index > 0) 10145 ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0; 10146 else 10147 ctx->bc->ar_loaded = 0; 10148 10149 return 0; 10150} 10151static int tgsi_r600_arl(struct r600_shader_ctx *ctx) 10152{ 10153 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10154 struct r600_bytecode_alu alu; 10155 int r; 10156 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 10157 10158 switch (inst->Instruction.Opcode) { 10159 case TGSI_OPCODE_ARL: 10160 memset(&alu, 0, sizeof(alu)); 10161 alu.op = ALU_OP1_FLOOR; 10162 alu.dst.sel = ctx->bc->ar_reg; 10163 alu.dst.write = 1; 10164 for (i = 0; i <= lasti; ++i) { 10165 if (inst->Dst[0].Register.WriteMask & (1 << i)) { 10166 alu.dst.chan = i; 10167 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10168 alu.last = i == lasti; 10169 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 10170 return r; 10171 } 10172 } 10173 10174 memset(&alu, 0, sizeof(alu)); 10175 alu.op = ALU_OP1_FLT_TO_INT; 10176 alu.src[0].sel = ctx->bc->ar_reg; 10177 alu.dst.sel = ctx->bc->ar_reg; 10178 alu.dst.write = 1; 10179 /* FLT_TO_INT is trans-only on r600/r700 */ 10180 alu.last = TRUE; 10181 for (i = 0; i <= lasti; ++i) { 10182 alu.dst.chan = i; 10183 alu.src[0].chan = i; 10184 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 10185 return r; 10186 } 10187 break; 10188 case TGSI_OPCODE_ARR: 10189 memset(&alu, 0, sizeof(alu)); 10190 alu.op = ALU_OP1_FLT_TO_INT; 10191 alu.dst.sel = ctx->bc->ar_reg; 10192 alu.dst.write = 1; 10193 /* FLT_TO_INT is trans-only on r600/r700 */ 10194 alu.last = TRUE; 10195 for (i = 0; i <= lasti; ++i) { 10196 if (inst->Dst[0].Register.WriteMask & (1 << i)) { 10197 alu.dst.chan = i; 10198 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10199 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 
10200 return r; 10201 } 10202 } 10203 break; 10204 case TGSI_OPCODE_UARL: 10205 memset(&alu, 0, sizeof(alu)); 10206 alu.op = ALU_OP1_MOV; 10207 alu.dst.sel = ctx->bc->ar_reg; 10208 alu.dst.write = 1; 10209 for (i = 0; i <= lasti; ++i) { 10210 if (inst->Dst[0].Register.WriteMask & (1 << i)) { 10211 alu.dst.chan = i; 10212 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10213 alu.last = i == lasti; 10214 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 10215 return r; 10216 } 10217 } 10218 break; 10219 default: 10220 assert(0); 10221 return -1; 10222 } 10223 10224 ctx->bc->ar_loaded = 0; 10225 return 0; 10226} 10227 10228static int tgsi_opdst(struct r600_shader_ctx *ctx) 10229{ 10230 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10231 struct r600_bytecode_alu alu; 10232 int i, r = 0; 10233 10234 for (i = 0; i < 4; i++) { 10235 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10236 10237 alu.op = ALU_OP2_MUL; 10238 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 10239 10240 if (i == 0 || i == 3) { 10241 alu.src[0].sel = V_SQ_ALU_SRC_1; 10242 } else { 10243 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10244 } 10245 10246 if (i == 0 || i == 2) { 10247 alu.src[1].sel = V_SQ_ALU_SRC_1; 10248 } else { 10249 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 10250 } 10251 if (i == 3) 10252 alu.last = 1; 10253 r = r600_bytecode_add_alu(ctx->bc, &alu); 10254 if (r) 10255 return r; 10256 } 10257 return 0; 10258} 10259 10260static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type, 10261 struct r600_bytecode_alu_src *src) 10262{ 10263 struct r600_bytecode_alu alu; 10264 int r; 10265 10266 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10267 alu.op = opcode; 10268 alu.execute_mask = 1; 10269 alu.update_pred = 1; 10270 10271 alu.dst.sel = ctx->temp_reg; 10272 alu.dst.write = 1; 10273 alu.dst.chan = 0; 10274 10275 alu.src[0] = *src; 10276 alu.src[1].sel = V_SQ_ALU_SRC_0; 10277 alu.src[1].chan = 0; 10278 10279 alu.last = 1; 
10280 10281 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type); 10282 if (r) 10283 return r; 10284 return 0; 10285} 10286 10287static int pops(struct r600_shader_ctx *ctx, int pops) 10288{ 10289 unsigned force_pop = ctx->bc->force_add_cf; 10290 10291 if (!force_pop) { 10292 int alu_pop = 3; 10293 if (ctx->bc->cf_last) { 10294 if (ctx->bc->cf_last->op == CF_OP_ALU) 10295 alu_pop = 0; 10296 else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER) 10297 alu_pop = 1; 10298 } 10299 alu_pop += pops; 10300 if (alu_pop == 1) { 10301 ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER; 10302 ctx->bc->force_add_cf = 1; 10303 } else if (alu_pop == 2) { 10304 ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER; 10305 ctx->bc->force_add_cf = 1; 10306 } else { 10307 force_pop = 1; 10308 } 10309 } 10310 10311 if (force_pop) { 10312 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP); 10313 ctx->bc->cf_last->pop_count = pops; 10314 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2; 10315 } 10316 10317 return 0; 10318} 10319 10320static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx, 10321 unsigned reason) 10322{ 10323 struct r600_stack_info *stack = &ctx->bc->stack; 10324 unsigned elements; 10325 int entries; 10326 10327 unsigned entry_size = stack->entry_size; 10328 10329 elements = (stack->loop + stack->push_wqm ) * entry_size; 10330 elements += stack->push; 10331 10332 switch (ctx->bc->chip_class) { 10333 case R600: 10334 case R700: 10335 /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on 10336 * the stack must be reserved to hold the current active/continue 10337 * masks */ 10338 if (reason == FC_PUSH_VPM || stack->push > 0) { 10339 elements += 2; 10340 } 10341 break; 10342 10343 case CAYMAN: 10344 /* r9xx: any stack operation on empty stack consumes 2 additional 10345 * elements */ 10346 elements += 2; 10347 10348 /* fallthrough */ 10349 /* FIXME: do the two elements added above cover the cases for the 10350 * r8xx+ below? 
*/ 10351 10352 case EVERGREEN: 10353 /* r8xx+: 2 extra elements are not always required, but one extra 10354 * element must be added for each of the following cases: 10355 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest 10356 * stack usage. 10357 * (Currently we don't use ALU_ELSE_AFTER.) 10358 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM 10359 * PUSH instruction executed. 10360 * 10361 * NOTE: it seems we also need to reserve additional element in some 10362 * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader, 10363 * then STACK_SIZE should be 2 instead of 1 */ 10364 if (reason == FC_PUSH_VPM || stack->push > 0) { 10365 elements += 1; 10366 } 10367 break; 10368 10369 default: 10370 assert(0); 10371 break; 10372 } 10373 10374 /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4 10375 * for all chips, so we use 4 in the final formula, not the real entry_size 10376 * for the chip */ 10377 entry_size = 4; 10378 10379 entries = (elements + (entry_size - 1)) / entry_size; 10380 10381 if (entries > stack->max_entries) 10382 stack->max_entries = entries; 10383 return elements; 10384} 10385 10386static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason) 10387{ 10388 switch(reason) { 10389 case FC_PUSH_VPM: 10390 --ctx->bc->stack.push; 10391 assert(ctx->bc->stack.push >= 0); 10392 break; 10393 case FC_PUSH_WQM: 10394 --ctx->bc->stack.push_wqm; 10395 assert(ctx->bc->stack.push_wqm >= 0); 10396 break; 10397 case FC_LOOP: 10398 --ctx->bc->stack.loop; 10399 assert(ctx->bc->stack.loop >= 0); 10400 break; 10401 default: 10402 assert(0); 10403 break; 10404 } 10405} 10406 10407static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason) 10408{ 10409 switch (reason) { 10410 case FC_PUSH_VPM: 10411 ++ctx->bc->stack.push; 10412 break; 10413 case FC_PUSH_WQM: 10414 ++ctx->bc->stack.push_wqm; 10415 break; 10416 case FC_LOOP: 10417 ++ctx->bc->stack.loop; 10418 break; 
10419 default: 10420 assert(0); 10421 } 10422 10423 return callstack_update_max_depth(ctx, reason); 10424} 10425 10426static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp) 10427{ 10428 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp]; 10429 10430 sp->mid = realloc((void *)sp->mid, 10431 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1)); 10432 sp->mid[sp->num_mid] = ctx->bc->cf_last; 10433 sp->num_mid++; 10434} 10435 10436static void fc_pushlevel(struct r600_shader_ctx *ctx, int type) 10437{ 10438 assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack)); 10439 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type; 10440 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last; 10441 ctx->bc->fc_sp++; 10442} 10443 10444static void fc_poplevel(struct r600_shader_ctx *ctx) 10445{ 10446 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1]; 10447 free(sp->mid); 10448 sp->mid = NULL; 10449 sp->num_mid = 0; 10450 sp->start = NULL; 10451 sp->type = 0; 10452 ctx->bc->fc_sp--; 10453} 10454 10455#if 0 10456static int emit_return(struct r600_shader_ctx *ctx) 10457{ 10458 r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN)); 10459 return 0; 10460} 10461 10462static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset) 10463{ 10464 10465 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP)); 10466 ctx->bc->cf_last->pop_count = pops; 10467 /* XXX work out offset */ 10468 return 0; 10469} 10470 10471static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value) 10472{ 10473 return 0; 10474} 10475 10476static void emit_testflag(struct r600_shader_ctx *ctx) 10477{ 10478 10479} 10480 10481static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx) 10482{ 10483 emit_testflag(ctx); 10484 emit_jump_to_offset(ctx, 1, 4); 10485 emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0); 10486 pops(ctx, ifidx + 1); 10487 emit_return(ctx); 10488} 10489 10490static void break_loop_on_flag(struct r600_shader_ctx *ctx, 
unsigned fc_sp) 10491{ 10492 emit_testflag(ctx); 10493 10494 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op); 10495 ctx->bc->cf_last->pop_count = 1; 10496 10497 fc_set_mid(ctx, fc_sp); 10498 10499 pops(ctx, 1); 10500} 10501#endif 10502 10503static int emit_if(struct r600_shader_ctx *ctx, int opcode, 10504 struct r600_bytecode_alu_src *src) 10505{ 10506 int alu_type = CF_OP_ALU_PUSH_BEFORE; 10507 bool needs_workaround = false; 10508 int elems = callstack_push(ctx, FC_PUSH_VPM); 10509 10510 if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) 10511 needs_workaround = true; 10512 10513 if (ctx->bc->chip_class == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) { 10514 unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size; 10515 unsigned dmod2 = (elems) % ctx->bc->stack.entry_size; 10516 10517 if (elems && (!dmod1 || !dmod2)) 10518 needs_workaround = true; 10519 } 10520 10521 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by 10522 * LOOP_STARTxxx for nested loops may put the branch stack into a state 10523 * such that ALU_PUSH_BEFORE doesn't work as expected. 
Workaround this 10524 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */ 10525 if (needs_workaround) { 10526 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH); 10527 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2; 10528 alu_type = CF_OP_ALU; 10529 } 10530 10531 emit_logic_pred(ctx, opcode, alu_type, src); 10532 10533 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP); 10534 10535 fc_pushlevel(ctx, FC_IF); 10536 10537 return 0; 10538} 10539 10540static int tgsi_if(struct r600_shader_ctx *ctx) 10541{ 10542 struct r600_bytecode_alu_src alu_src; 10543 r600_bytecode_src(&alu_src, &ctx->src[0], 0); 10544 10545 return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src); 10546} 10547 10548static int tgsi_uif(struct r600_shader_ctx *ctx) 10549{ 10550 struct r600_bytecode_alu_src alu_src; 10551 r600_bytecode_src(&alu_src, &ctx->src[0], 0); 10552 return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src); 10553} 10554 10555static int tgsi_else(struct r600_shader_ctx *ctx) 10556{ 10557 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE); 10558 ctx->bc->cf_last->pop_count = 1; 10559 10560 fc_set_mid(ctx, ctx->bc->fc_sp - 1); 10561 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id; 10562 return 0; 10563} 10564 10565static int tgsi_endif(struct r600_shader_ctx *ctx) 10566{ 10567 int offset = 2; 10568 pops(ctx, 1); 10569 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) { 10570 R600_ERR("if/endif unbalanced in shader\n"); 10571 return -1; 10572 } 10573 10574 /* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */ 10575 if (ctx->bc->cf_last->eg_alu_extended) 10576 offset += 2; 10577 10578 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) { 10579 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset; 10580 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1; 10581 } else { 10582 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset; 10583 } 10584 
	fc_poplevel(ctx);

	callstack_pop(ctx, FC_PUSH_VPM);
	return 0;
}

/* Open a TGSI BGNLOOP: emit LOOP_START_DX10 and push a flow-control
 * frame so ENDLOOP/BRK/CONT can patch CF jump addresses later. */
static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
{
	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
	 * limited to 4096 iterations, like the other LOOP_* instructions. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}

/* Close a TGSI ENDLOOP: emit LOOP_END and patch the CF addresses of the
 * matching LOOP_START and of every BRK/CONT recorded in the frame. */
static int tgsi_endloop(struct r600_shader_ctx *ctx)
{
	int i;

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);

	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
		R600_ERR("loop/endloop in shader code are not paired.\n");
		return -EINVAL;
	}

	/* fixup loop pointers - from r600isa
	   LOOP END points to CF after LOOP START,
	   LOOP START point to CF after LOOP END
	   BRK/CONT point to LOOP END CF
	*/
	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;

	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;

	/* all recorded BRK/CONT instructions jump to the LOOP_END CF itself */
	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
	}
	/* XXX add LOOPRET support */
	fc_poplevel(ctx);
	callstack_pop(ctx, FC_LOOP);
	return 0;
}

/* Emit a BRK or CONT (opcode taken from ctx->inst_info->op): locate the
 * innermost FC_LOOP frame and record this CF in it so tgsi_endloop()
 * can patch its jump target.  Errors out if no enclosing loop exists. */
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
{
	unsigned int fscp;

	/* scan the flow-control stack top-down for the nearest loop frame */
	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
			break;
	}

	if (fscp == 0) {
		R600_ERR("Break not inside loop/endloop pair\n");
		return -EINVAL;
	}

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);

	fc_set_mid(ctx, fscp - 1);

	return 0;
}

static int
tgsi_gs_emit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* the stream index is an immediate operand; fetch it from the literal pool */
	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
	int r;

	/* flush pending per-vertex ring writes before the EMIT_VERTEX CF */
	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);

	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	if (!r) {
		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
			return emit_inc_ring_offset(ctx, stream, TRUE);
	}
	return r;
}

/* UMAD: dst = src0 * src1 + src2 (unsigned), per enabled write-mask
 * channel: MULLO_UINT into temp_reg, then ADD_INT into the destination. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.dst.chan = i;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.write = 1;

		alu.op = ALU_OP2_MULLO_UINT;
		for (j = 0; j < 2; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
		}

		alu.last = 1;
		r = emit_mul_int_op(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* temp + src2 -> dst */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r =
r600_bytecode_add_alu(ctx->bc, &alu); 10719 if (r) 10720 return r; 10721 } 10722 return 0; 10723} 10724 10725static int tgsi_pk2h(struct r600_shader_ctx *ctx) 10726{ 10727 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10728 struct r600_bytecode_alu alu; 10729 int r, i; 10730 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 10731 10732 /* temp.xy = f32_to_f16(src) */ 10733 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10734 alu.op = ALU_OP1_FLT32_TO_FLT16; 10735 alu.dst.chan = 0; 10736 alu.dst.sel = ctx->temp_reg; 10737 alu.dst.write = 1; 10738 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10739 r = r600_bytecode_add_alu(ctx->bc, &alu); 10740 if (r) 10741 return r; 10742 alu.dst.chan = 1; 10743 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); 10744 alu.last = 1; 10745 r = r600_bytecode_add_alu(ctx->bc, &alu); 10746 if (r) 10747 return r; 10748 10749 /* dst.x = temp.y * 0x10000 + temp.x */ 10750 for (i = 0; i < lasti + 1; i++) { 10751 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 10752 continue; 10753 10754 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10755 alu.op = ALU_OP3_MULADD_UINT24; 10756 alu.is_op3 = 1; 10757 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 10758 alu.last = i == lasti; 10759 alu.src[0].sel = ctx->temp_reg; 10760 alu.src[0].chan = 1; 10761 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 10762 alu.src[1].value = 0x10000; 10763 alu.src[2].sel = ctx->temp_reg; 10764 alu.src[2].chan = 0; 10765 r = r600_bytecode_add_alu(ctx->bc, &alu); 10766 if (r) 10767 return r; 10768 } 10769 10770 return 0; 10771} 10772 10773static int tgsi_up2h(struct r600_shader_ctx *ctx) 10774{ 10775 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10776 struct r600_bytecode_alu alu; 10777 int r, i; 10778 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 10779 10780 /* temp.x = src.x */ 10781 /* note: no need to mask out the high bits */ 10782 memset(&alu, 0, sizeof(struct 
r600_bytecode_alu)); 10783 alu.op = ALU_OP1_MOV; 10784 alu.dst.chan = 0; 10785 alu.dst.sel = ctx->temp_reg; 10786 alu.dst.write = 1; 10787 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10788 r = r600_bytecode_add_alu(ctx->bc, &alu); 10789 if (r) 10790 return r; 10791 10792 /* temp.y = src.x >> 16 */ 10793 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10794 alu.op = ALU_OP2_LSHR_INT; 10795 alu.dst.chan = 1; 10796 alu.dst.sel = ctx->temp_reg; 10797 alu.dst.write = 1; 10798 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10799 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 10800 alu.src[1].value = 16; 10801 alu.last = 1; 10802 r = r600_bytecode_add_alu(ctx->bc, &alu); 10803 if (r) 10804 return r; 10805 10806 /* dst.wz = dst.xy = f16_to_f32(temp.xy) */ 10807 for (i = 0; i < lasti + 1; i++) { 10808 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 10809 continue; 10810 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10811 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 10812 alu.op = ALU_OP1_FLT16_TO_FLT32; 10813 alu.src[0].sel = ctx->temp_reg; 10814 alu.src[0].chan = i % 2; 10815 alu.last = i == lasti; 10816 r = r600_bytecode_add_alu(ctx->bc, &alu); 10817 if (r) 10818 return r; 10819 } 10820 10821 return 0; 10822} 10823 10824static int tgsi_bfe(struct r600_shader_ctx *ctx) 10825{ 10826 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10827 struct r600_bytecode_alu alu; 10828 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 10829 int r, i; 10830 int dst = -1; 10831 10832 if ((inst->Src[0].Register.File == inst->Dst[0].Register.File && 10833 inst->Src[0].Register.Index == inst->Dst[0].Register.Index) || 10834 (inst->Src[2].Register.File == inst->Dst[0].Register.File && 10835 inst->Src[2].Register.Index == inst->Dst[0].Register.Index)) 10836 dst = r600_get_temp(ctx); 10837 10838 r = tgsi_op3_dst(ctx, dst); 10839 if (r) 10840 return r; 10841 10842 for (i = 0; i < lasti + 1; i++) { 10843 memset(&alu, 0, sizeof(struct 
r600_bytecode_alu)); 10844 alu.op = ALU_OP2_SETGE_INT; 10845 r600_bytecode_src(&alu.src[0], &ctx->src[2], i); 10846 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 10847 alu.src[1].value = 32; 10848 alu.dst.sel = ctx->temp_reg; 10849 alu.dst.chan = i; 10850 alu.dst.write = 1; 10851 if (i == lasti) 10852 alu.last = 1; 10853 r = r600_bytecode_add_alu(ctx->bc, &alu); 10854 if (r) 10855 return r; 10856 } 10857 10858 for (i = 0; i < lasti + 1; i++) { 10859 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10860 alu.op = ALU_OP3_CNDE_INT; 10861 alu.is_op3 = 1; 10862 alu.src[0].sel = ctx->temp_reg; 10863 alu.src[0].chan = i; 10864 10865 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 10866 if (dst != -1) 10867 alu.src[1].sel = dst; 10868 else 10869 alu.src[1].sel = alu.dst.sel; 10870 alu.src[1].chan = i; 10871 r600_bytecode_src(&alu.src[2], &ctx->src[0], i); 10872 alu.dst.write = 1; 10873 if (i == lasti) 10874 alu.last = 1; 10875 r = r600_bytecode_add_alu(ctx->bc, &alu); 10876 if (r) 10877 return r; 10878 } 10879 10880 return 0; 10881} 10882 10883static int tgsi_clock(struct r600_shader_ctx *ctx) 10884{ 10885 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10886 struct r600_bytecode_alu alu; 10887 int r; 10888 10889 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10890 alu.op = ALU_OP1_MOV; 10891 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 10892 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO; 10893 r = r600_bytecode_add_alu(ctx->bc, &alu); 10894 if (r) 10895 return r; 10896 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10897 alu.op = ALU_OP1_MOV; 10898 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst); 10899 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI; 10900 alu.last = 1; 10901 r = r600_bytecode_add_alu(ctx->bc, &alu); 10902 if (r) 10903 return r; 10904 return 0; 10905} 10906 10907static int emit_u64add(struct r600_shader_ctx *ctx, int op, 10908 int treg, 10909 int src0_sel, int src0_chan, 10910 int src1_sel, int src1_chan) 10911{ 10912 struct r600_bytecode_alu alu; 
10913 int r; 10914 int opc; 10915 10916 if (op == ALU_OP2_ADD_INT) 10917 opc = ALU_OP2_ADDC_UINT; 10918 else 10919 opc = ALU_OP2_SUBB_UINT; 10920 10921 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10922 alu.op = op; ; 10923 alu.dst.sel = treg; 10924 alu.dst.chan = 0; 10925 alu.dst.write = 1; 10926 alu.src[0].sel = src0_sel; 10927 alu.src[0].chan = src0_chan + 0; 10928 alu.src[1].sel = src1_sel; 10929 alu.src[1].chan = src1_chan + 0; 10930 alu.src[1].neg = 0; 10931 r = r600_bytecode_add_alu(ctx->bc, &alu); 10932 if (r) 10933 return r; 10934 10935 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10936 alu.op = op; 10937 alu.dst.sel = treg; 10938 alu.dst.chan = 1; 10939 alu.dst.write = 1; 10940 alu.src[0].sel = src0_sel; 10941 alu.src[0].chan = src0_chan + 1; 10942 alu.src[1].sel = src1_sel; 10943 alu.src[1].chan = src1_chan + 1; 10944 alu.src[1].neg = 0; 10945 r = r600_bytecode_add_alu(ctx->bc, &alu); 10946 if (r) 10947 return r; 10948 10949 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10950 alu.op = opc; 10951 alu.dst.sel = treg; 10952 alu.dst.chan = 2; 10953 alu.dst.write = 1; 10954 alu.last = 1; 10955 alu.src[0].sel = src0_sel; 10956 alu.src[0].chan = src0_chan + 0; 10957 alu.src[1].sel = src1_sel; 10958 alu.src[1].chan = src1_chan + 0; 10959 alu.src[1].neg = 0; 10960 r = r600_bytecode_add_alu(ctx->bc, &alu); 10961 if (r) 10962 return r; 10963 10964 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10965 alu.op = op; 10966 alu.dst.sel = treg; 10967 alu.dst.chan = 1; 10968 alu.dst.write = 1; 10969 alu.src[0].sel = treg; 10970 alu.src[0].chan = 1; 10971 alu.src[1].sel = treg; 10972 alu.src[1].chan = 2; 10973 alu.last = 1; 10974 r = r600_bytecode_add_alu(ctx->bc, &alu); 10975 if (r) 10976 return r; 10977 return 0; 10978} 10979 10980static int egcm_u64add(struct r600_shader_ctx *ctx) 10981{ 10982 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10983 struct r600_bytecode_alu alu; 10984 int r; 10985 int treg = ctx->temp_reg; 
10986 int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT; 10987 10988 if (ctx->src[1].neg) { 10989 op = ALU_OP2_SUB_INT; 10990 opc = ALU_OP2_SUBB_UINT; 10991 } 10992 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10993 alu.op = op; ; 10994 alu.dst.sel = treg; 10995 alu.dst.chan = 0; 10996 alu.dst.write = 1; 10997 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10998 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); 10999 alu.src[1].neg = 0; 11000 r = r600_bytecode_add_alu(ctx->bc, &alu); 11001 if (r) 11002 return r; 11003 11004 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11005 alu.op = op; 11006 alu.dst.sel = treg; 11007 alu.dst.chan = 1; 11008 alu.dst.write = 1; 11009 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); 11010 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1); 11011 alu.src[1].neg = 0; 11012 r = r600_bytecode_add_alu(ctx->bc, &alu); 11013 if (r) 11014 return r; 11015 11016 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11017 alu.op = opc ; 11018 alu.dst.sel = treg; 11019 alu.dst.chan = 2; 11020 alu.dst.write = 1; 11021 alu.last = 1; 11022 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 11023 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); 11024 alu.src[1].neg = 0; 11025 r = r600_bytecode_add_alu(ctx->bc, &alu); 11026 if (r) 11027 return r; 11028 11029 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11030 alu.op = op; 11031 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst); 11032 alu.src[0].sel = treg; 11033 alu.src[0].chan = 1; 11034 alu.src[1].sel = treg; 11035 alu.src[1].chan = 2; 11036 alu.last = 1; 11037 r = r600_bytecode_add_alu(ctx->bc, &alu); 11038 if (r) 11039 return r; 11040 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11041 alu.op = ALU_OP1_MOV; 11042 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 11043 alu.src[0].sel = treg; 11044 alu.src[0].chan = 0; 11045 alu.last = 1; 11046 r = r600_bytecode_add_alu(ctx->bc, &alu); 11047 if (r) 11048 return r; 11049 return 0; 11050} 11051 11052/* result.y = mul_high a, b 11053 result.x = mul 
a,b
 result.y += a.x * b.y + a.y * b.x;
*/
static int egcm_u64mul(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

	/* temp.x = mul_lo a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = mul_hi a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULHI_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = mul a.x, b.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.w = mul a.y, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 3;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = temp.z + temp.w: the two cross terms combined */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 2;
	alu.src[1].sel = treg;
	alu.src[1].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = temp.y + temp.z: high word plus cross terms */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = temp.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}

/* Emit a 64-bit unsigned >= compare of {src0_sel.(base_chan+1,base_chan)}
 * against {src1_sel.(base_chan+1,base_chan)}; boolean result lands in
 * treg.x, treg.y/treg.z are scratch.
 */
static int emit_u64sge(struct r600_shader_ctx *ctx,
		       int treg,
		       int src0_sel, int src0_base_chan,
		       int src1_sel, int src1_base_chan)
{
	int r;
	/* for 64-bit sge */
	/* result = (src0.y > src1.y) || ((src0.y == src1.y) && src0.x >= src1.x)) */
	r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
			   treg, 1,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 0,
			   src0_sel, src0_base_chan,
			   src1_sel, src1_base_chan);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
			   treg, 2,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	/* treg.x = (low words >=) && (high words ==) */
	r = single_alu_op2(ctx, ALU_OP2_AND_INT,
			   treg, 0,
			   treg, 0,
			   treg, 2);
	if (r)
		return r;

	/* treg.x |= (high words >) */
	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   treg, 0,
			   treg, 0,
			   treg, 1);
	if (r)
		return r;
	return 0;
}

/* this isn't a complete div it's just enough for qbo shader to work */
static int egcm_u64div(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
	int r, i;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	/* make sure we are dividing my a const with 0 in the high bits */
	if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
		return -1;
	if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
		return -1;
	/* make sure we are doing one division */
	if (inst->Dst[0].Register.WriteMask != 0x3)
		return -1;

	/* emit_if uses ctx->temp_reg so we can't */
	int treg = r600_get_temp(ctx);
	int tmp_num = r600_get_temp(ctx);
	int sub_tmp = r600_get_temp(ctx);

	/* tmp quot are tmp_num.zw */
	r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
	r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
	r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
	r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);

	/* MOV tmp_num.xy, numerator */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 0,
			   alu_num_lo.sel, alu_num_lo.chan,
			   0, 0);
	if (r)
		return r;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   0, 0);
	if (r)
		return r;

	/* zero the running quotient in tmp_num.zw */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	/* treg 0 is log2_denom */
	/* normally this gets the MSB for the denom high value
	   - however we know this will always be 0 here. */
	r = single_alu_op2(ctx,
			   ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, 32,
			   0, 0);
	if (r)
		return r;

	/* normally check demon hi for 0, but we know it is already */
	/* t0.z = num_hi >= denom_lo */
	r = single_alu_op2(ctx,
			   ALU_OP2_SETGE_UINT,
			   treg, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	/* only take the long-division path when the numerator high word
	 * can overflow a 32-bit divide */
	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	/* for loops in here */
	/* get msb t0.x = msb(src[1].x) first */
	int msb_lo = util_last_bit(alu_denom_lo.value);
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, msb_lo,
			   0, 0);
	if (r)
		return r;

	/* unroll the asm here */
	/* first shift-subtract pass: reduce the numerator high word */
	for (i = 0; i < 31; i++) {
		/* treg.z = (i >= log2_denom) — gate for this unrolled step */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, i,
				   treg, 0);
		if (r)
			return r;

		/* we can do this on the CPU */
		uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
		/* t0.z = tmp_num.y >= t0.z */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   treg, 2);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;

		/* subtract the shifted denominator and set the quotient bit */
		r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
				   tmp_num, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 3,
				   tmp_num, 3,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 31, so manually peel the last loop
	 * iteration.
	 */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
			   tmp_num, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 3,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* onto the second loop to unroll */
	/* second pass: full 64-bit shift-subtract against the numerator */
	for (i = 0; i < 31; i++) {
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),
				   treg, 0);
		if (r)
			return r;

		/* 64-bit shifted denominator computed at compile time */
		uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 3,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
				   0, 0);
		if (r)
			return r;

		/* sub_tmp.x = (tmp_num.xy >= treg.zw) as 64-bit compare */
		r = emit_u64sge(ctx, sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   sub_tmp, 0);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;


		/* tmp_num.xy -= treg.zw (64-bit subtract via emit_u64add) */
		r = emit_u64add(ctx, ALU_OP2_SUB_INT,
				sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 0,
				   sub_tmp, 0,
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 1,
				   sub_tmp, 1,
				   0, 0);
		if (r)
			return r;

		/* set this step's bit in the low quotient word */
		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 2,
				   tmp_num, 2,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 63, so manually peel the last loop
	 * iteration.
	 */
	uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 2,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 3,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
			   0, 0);
	if (r)
		return r;

	r = emit_u64sge(ctx, sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = sub_tmp;
	alu_src.chan = 0;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = emit_u64add(ctx, ALU_OP2_SUB_INT,
			sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 2,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* write the quotient (tmp_num.zw) to dst.xy */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 2;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* 64-bit set-not-equal: dst.x = (src0.x != src1.x) || (src0.y != src1.y) */
static int egcm_u64sne(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

	/* treg.x = low words differ */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_SETNE_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write =
1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* treg.y = high words differ */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_SETNE_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = treg.x | treg.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_OR_INT;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	alu.src[1].sel = treg;
	alu.src[1].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* TGSI opcode dispatch table for the original R600 family: maps each TGSI
 * opcode to its hardware ALU/fetch opcode and translation callback.
 * Numeric indices are retired/unused TGSI opcode slots.
 */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]	= { ALU_OP0_NOP, tgsi_unsupported},
	[23]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
	[32]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK]	= { ALU_OP0_NOP, tgsi_unsupported},
	[34]	= { ALU_OP0_NOP, tgsi_unsupported},
	[35]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]	= { ALU_OP0_NOP, tgsi_unsupported},
	[60]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[81]	= { ALU_OP0_NOP, tgsi_unsupported},
	[82]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[106]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]	= { ALU_OP0_NOP, tgsi_unsupported},
	[115]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[TGSI_OPCODE_DFMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};

/* TGSI opcode dispatch table for the Evergreen/Cayman families. */
static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]	= { ALU_OP0_NOP, tgsi_unsupported},
	[23]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]	=
{ ALU_OP0_NOP, tgsi_unsupported}, 11812 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2}, 11813 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2}, 11814 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate}, 11815 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate}, 11816 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow}, 11817 [31] = { ALU_OP0_NOP, tgsi_unsupported}, 11818 [32] = { ALU_OP0_NOP, tgsi_unsupported}, 11819 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock}, 11820 [34] = { ALU_OP0_NOP, tgsi_unsupported}, 11821 [35] = { ALU_OP0_NOP, tgsi_unsupported}, 11822 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig}, 11823 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 11824 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 11825 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */ 11826 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h}, 11827 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported}, 11828 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported}, 11829 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 11830 [44] = { ALU_OP0_NOP, tgsi_unsupported}, 11831 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2}, 11832 [46] = { ALU_OP0_NOP, tgsi_unsupported}, 11833 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2}, 11834 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig}, 11835 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap}, 11836 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2}, 11837 [51] = { ALU_OP0_NOP, tgsi_unsupported}, 11838 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex}, 11839 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex}, 11840 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex}, 11841 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h}, 11842 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported}, 11843 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported}, 11844 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 11845 [59] = { ALU_OP0_NOP, tgsi_unsupported}, 11846 [60] = { ALU_OP0_NOP, 
tgsi_unsupported}, 11847 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl}, 11848 [62] = { ALU_OP0_NOP, tgsi_unsupported}, 11849 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported}, 11850 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported}, 11851 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg}, 11852 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp}, 11853 [67] = { ALU_OP0_NOP, tgsi_unsupported}, 11854 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 11855 [69] = { ALU_OP0_NOP, tgsi_unsupported}, 11856 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported}, 11857 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 11858 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex}, 11859 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont}, 11860 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if}, 11861 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif}, 11862 [76] = { ALU_OP0_NOP, tgsi_unsupported}, 11863 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else}, 11864 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif}, 11865 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 11866 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 11867 [82] = { ALU_OP0_NOP, tgsi_unsupported}, 11868 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2}, 11869 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans}, 11870 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2}, 11871 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2}, 11872 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2}, 11873 [88] = { ALU_OP0_NOP, tgsi_unsupported}, 11874 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2}, 11875 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2}, 11876 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod}, 11877 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2}, 11878 [93] = { ALU_OP0_NOP, tgsi_unsupported}, 11879 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex}, 11880 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 11881 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont}, 11882 
[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit}, 11883 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit}, 11884 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop}, 11885 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 11886 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop}, 11887 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 11888 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 11889 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex}, 11890 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq}, 11891 [106] = { ALU_OP0_NOP, tgsi_unsupported}, 11892 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported}, 11893 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2}, 11894 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2}, 11895 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap}, 11896 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap}, 11897 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 11898 [113] = { ALU_OP0_NOP, tgsi_unsupported}, 11899 [114] = { ALU_OP0_NOP, tgsi_unsupported}, 11900 [115] = { ALU_OP0_NOP, tgsi_unsupported}, 11901 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */ 11902 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */ 11903 /* Refer below for TGSI_OPCODE_DFMA */ 11904 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i}, 11905 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv}, 11906 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2}, 11907 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2}, 11908 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg}, 11909 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2}, 11910 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2}, 11911 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap}, 11912 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i}, 11913 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans}, 11914 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2}, 11915 
[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv}, 11916 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad}, 11917 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2}, 11918 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2}, 11919 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod}, 11920 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans}, 11921 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2}, 11922 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2}, 11923 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2}, 11924 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap}, 11925 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2}, 11926 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported}, 11927 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported}, 11928 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported}, 11929 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported}, 11930 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported}, 11931 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported}, 11932 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported}, 11933 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported}, 11934 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported}, 11935 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported}, 11936 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported}, 11937 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported}, 11938 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported}, 11939 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported}, 11940 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported}, 11941 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported}, 11942 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl}, 11943 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp}, 11944 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs}, 11945 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg}, 11946 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load}, 11947 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store}, 11948 [163] = { ALU_OP0_NOP, tgsi_unsupported}, 11949 [164] = { ALU_OP0_NOP, tgsi_unsupported}, 11950 [165] = { ALU_OP0_NOP, 
tgsi_unsupported}, 11951 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 11952 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op}, 11953 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op}, 11954 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op}, 11955 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op}, 11956 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op}, 11957 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op}, 11958 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op}, 11959 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op}, 11960 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op}, 11961 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op}, 11962 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex}, 11963 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 11964 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex}, 11965 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans}, 11966 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans}, 11967 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex}, 11968 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex}, 11969 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe}, 11970 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe}, 11971 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi}, 11972 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2}, 11973 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2}, 11974 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2}, 11975 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb}, 11976 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb}, 11977 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm}, 11978 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm}, 11979 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm}, 11980 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64}, 11981 [TGSI_OPCODE_D2F] = 
{ ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest}, 11982 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64}, 11983 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg}, 11984 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64}, 11985 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr}, 11986 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr }, 11987 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64}, 11988 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64}, 11989 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s}, 11990 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest}, 11991 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest}, 11992 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest}, 11993 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr}, 11994 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr}, 11995 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64}, 11996 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64}, 11997 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64}, 11998 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64}, 11999 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp}, 12000 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int}, 12001 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double}, 12002 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int}, 12003 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double}, 12004 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr}, 12005 [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne }, 12006 [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add }, 12007 [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul }, 12008 [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div }, 12009 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported}, 12010}; 12011 12012/* Cayman (CM) opcode translation table: indexed by TGSI opcode, each entry
 * pairs the r600 hardware op (ALU_OP*/FETCH_OP*/CF_OP*/V_RAT_INST_*, or 0
 * when the emit callback chooses the op itself) with the callback that emits
 * it.  Entries marked tgsi_unsupported (and bare-number indices for retired
 * TGSI opcodes) are never expected to be reached. */ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = { 12013 [TGSI_OPCODE_ARL] = { 
ALU_OP0_NOP, tgsi_eg_arl}, 12014 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2}, 12015 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit}, 12016 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr}, 12017 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr}, 12018 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp}, 12019 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log}, 12020 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2}, 12021 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2}, 12022 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 12023 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 12024 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, 12025 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, 12026 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, 12027 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, 12028 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, 12029 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, 12030 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp}, 12031 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3}, 12032 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr}, 12033 [21] = { ALU_OP0_NOP, tgsi_unsupported}, 12034 [22] = { ALU_OP0_NOP, tgsi_unsupported}, 12035 [23] = { ALU_OP0_NOP, tgsi_unsupported}, 12036 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2}, 12037 [25] = { ALU_OP0_NOP, tgsi_unsupported}, 12038 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2}, 12039 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2}, 12040 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr}, 12041 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr}, 12042 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow}, 12043 [31] = { ALU_OP0_NOP, tgsi_unsupported}, 12044 [32] = { ALU_OP0_NOP, tgsi_unsupported}, 12045 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock}, 12046 [34] = { ALU_OP0_NOP, tgsi_unsupported}, 12047 [35] = { ALU_OP0_NOP, tgsi_unsupported}, 12048 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig}, 12049 
/* Cayman dropped the scalar t-slot, so transcendentals (RCP/RSQ/SQRT/EX2/
 * LG2, SIN/COS) and the integer/double multiply families go through the
 * cayman_* emitters, which issue the op across the vector slots instead
 * (see the CAYMAN notes at the top of this file). */
[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 12050 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 12051 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */ 12052 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h}, 12053 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported}, 12054 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported}, 12055 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 12056 [44] = { ALU_OP0_NOP, tgsi_unsupported}, 12057 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2}, 12058 [46] = { ALU_OP0_NOP, tgsi_unsupported}, 12059 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2}, 12060 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig}, 12061 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap}, 12062 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2}, 12063 [51] = { ALU_OP0_NOP, tgsi_unsupported}, 12064 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex}, 12065 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex}, 12066 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex}, 12067 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h}, 12068 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported}, 12069 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported}, 12070 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 12071 [59] = { ALU_OP0_NOP, tgsi_unsupported}, 12072 [60] = { ALU_OP0_NOP, tgsi_unsupported}, 12073 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl}, 12074 [62] = { ALU_OP0_NOP, tgsi_unsupported}, 12075 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported}, 12076 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported}, 12077 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg}, 12078 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp}, 12079 [67] = { ALU_OP0_NOP, tgsi_unsupported}, 12080 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 12081 [69] = { ALU_OP0_NOP, tgsi_unsupported}, 12082 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported}, 12083 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 12084 [TGSI_OPCODE_TXL] = { 
FETCH_OP_SAMPLE_L, tgsi_tex}, 12085 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont}, 12086 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if}, 12087 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif}, 12088 [76] = { ALU_OP0_NOP, tgsi_unsupported}, 12089 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else}, 12090 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif}, 12091 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 12092 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 12093 [82] = { ALU_OP0_NOP, tgsi_unsupported}, 12094 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2}, 12095 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2}, 12096 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2}, 12097 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2}, 12098 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2}, 12099 [88] = { ALU_OP0_NOP, tgsi_unsupported}, 12100 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2}, 12101 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2}, 12102 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod}, 12103 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2}, 12104 [93] = { ALU_OP0_NOP, tgsi_unsupported}, 12105 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex}, 12106 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 12107 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont}, 12108 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit}, 12109 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit}, 12110 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop}, 12111 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 12112 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop}, 12113 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 12114 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 12115 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex}, 12116 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq}, 12117 [106] = { ALU_OP0_NOP, tgsi_unsupported}, 12118 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, 
tgsi_unsupported}, 12119 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2}, 12120 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2}, 12121 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap}, 12122 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap}, 12123 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 12124 [113] = { ALU_OP0_NOP, tgsi_unsupported}, 12125 [114] = { ALU_OP0_NOP, tgsi_unsupported}, 12126 [115] = { ALU_OP0_NOP, tgsi_unsupported}, 12127 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */ 12128 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */ 12129 /* Refer below for TGSI_OPCODE_DFMA */ 12130 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2}, 12131 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv}, 12132 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2}, 12133 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2}, 12134 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg}, 12135 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2}, 12136 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2}, 12137 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap}, 12138 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2}, 12139 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2}, 12140 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2}, 12141 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv}, 12142 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad}, 12143 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2}, 12144 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2}, 12145 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod}, 12146 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr}, 12147 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2}, 12148 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2}, 12149 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2}, 12150 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap}, 12151 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2}, 12152 [TGSI_OPCODE_SWITCH] = { 
ALU_OP0_NOP, tgsi_unsupported}, 12153 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported}, 12154 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported}, 12155 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported}, 12156 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported}, 12157 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported}, 12158 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported}, 12159 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported}, 12160 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported}, 12161 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported}, 12162 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported}, 12163 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported}, 12164 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported}, 12165 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported}, 12166 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported}, 12167 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported}, 12168 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl}, 12169 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp}, 12170 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs}, 12171 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg}, 12172 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load}, 12173 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store}, 12174 [163] = { ALU_OP0_NOP, tgsi_unsupported}, 12175 [164] = { ALU_OP0_NOP, tgsi_unsupported}, 12176 [165] = { ALU_OP0_NOP, tgsi_unsupported}, 12177 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 12178 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op}, 12179 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op}, 12180 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op}, 12181 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op}, 12182 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op}, 12183 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op}, 12184 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op}, 12185 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op}, 12186 
[TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op}, 12187 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op}, 12188 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex}, 12189 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 12190 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex}, 12191 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr}, 12192 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr}, 12193 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex}, 12194 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex}, 12195 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe}, 12196 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe}, 12197 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi}, 12198 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2}, 12199 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2}, 12200 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2}, 12201 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb}, 12202 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb}, 12203 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm}, 12204 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm}, 12205 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm}, 12206 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64}, 12207 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest}, 12208 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64}, 12209 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg}, 12210 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64}, 12211 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr}, 12212 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr }, 12213 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64}, 12214 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64}, 12215 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s}, 12216 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest}, 12217 [TGSI_OPCODE_DSEQ] = { 
ALU_OP2_SETE_64, tgsi_op2_64_single_dest}, 12218 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest}, 12219 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr}, 12220 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr}, 12221 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64}, 12222 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64}, 12223 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64}, 12224 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64}, 12225 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp}, 12226 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int}, 12227 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double}, 12228 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int}, 12229 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double}, 12230 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr}, 12231 [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne }, 12232 [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add }, 12233 [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul }, 12234 [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div }, 12235 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported}, 12236}; 12237