r600_shader.c revision 01e04c3f
1/* 2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 22 */ 23#include "r600_sq.h" 24#include "r600_formats.h" 25#include "r600_opcodes.h" 26#include "r600_shader.h" 27#include "r600d.h" 28 29#include "sb/sb_public.h" 30 31#include "pipe/p_shader_tokens.h" 32#include "tgsi/tgsi_info.h" 33#include "tgsi/tgsi_parse.h" 34#include "tgsi/tgsi_scan.h" 35#include "tgsi/tgsi_dump.h" 36#include "util/u_bitcast.h" 37#include "util/u_memory.h" 38#include "util/u_math.h" 39#include <stdio.h> 40#include <errno.h> 41 42/* CAYMAN notes 43Why CAYMAN got loops for lots of instructions is explained here. 44 45-These 8xx t-slot only ops are implemented in all vector slots. 46MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT 47These 8xx t-slot only opcodes become vector ops, with all four 48slots expecting the arguments on sources a and b. 
Result is 49broadcast to all channels. 50MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64 51These 8xx t-slot only opcodes become vector ops in the z, y, and 52x slots. 53EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64 54RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64 55SQRT_IEEE/_64 56SIN/COS 57The w slot may have an independent co-issued operation, or if the 58result is required to be in the w slot, the opcode above may be 59issued in the w slot as well. 60The compiler must issue the source argument to slots z, y, and x 61*/ 62 63/* Contents of r0 on entry to various shaders 64 65 VS - .x = VertexID 66 .y = RelVertexID (??) 67 .w = InstanceID 68 69 GS - r0.xyw, r1.xyz = per-vertex offsets 70 r0.z = PrimitiveID 71 72 TCS - .x = PatchID 73 .y = RelPatchID (??) 74 .z = InvocationID 75 .w = tess factor base. 76 77 TES - .x = TessCoord.x 78 - .y = TessCoord.y 79 - .z = RelPatchID (??) 80 - .w = PrimitiveID 81 82 PS - face_gpr.z = SampleMask 83 face_gpr.w = SampleID 84*/ 85#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16) 86static int r600_shader_from_tgsi(struct r600_context *rctx, 87 struct r600_pipe_shader *pipeshader, 88 union r600_shader_key key); 89 90static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr, 91 int size, unsigned comp_mask) { 92 93 if (!size) 94 return; 95 96 if (ps->num_arrays == ps->max_arrays) { 97 ps->max_arrays += 64; 98 ps->arrays = realloc(ps->arrays, ps->max_arrays * 99 sizeof(struct r600_shader_array)); 100 } 101 102 int n = ps->num_arrays; 103 ++ps->num_arrays; 104 105 ps->arrays[n].comp_mask = comp_mask; 106 ps->arrays[n].gpr_start = start_gpr; 107 ps->arrays[n].gpr_count = size; 108} 109 110static void r600_dump_streamout(struct pipe_stream_output_info *so) 111{ 112 unsigned i; 113 114 fprintf(stderr, "STREAMOUT\n"); 115 for (i = 0; i < so->num_outputs; i++) { 116 unsigned mask = ((1 << so->output[i].num_components) - 1) << 117 so->output[i].start_component; 118 fprintf(stderr, 
" %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n", 119 i, 120 so->output[i].stream, 121 so->output[i].output_buffer, 122 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, 123 so->output[i].register_index, 124 mask & 1 ? "x" : "", 125 mask & 2 ? "y" : "", 126 mask & 4 ? "z" : "", 127 mask & 8 ? "w" : "", 128 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : ""); 129 } 130} 131 132static int store_shader(struct pipe_context *ctx, 133 struct r600_pipe_shader *shader) 134{ 135 struct r600_context *rctx = (struct r600_context *)ctx; 136 uint32_t *ptr, i; 137 138 if (shader->bo == NULL) { 139 shader->bo = (struct r600_resource*) 140 pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4); 141 if (shader->bo == NULL) { 142 return -ENOMEM; 143 } 144 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE); 145 if (R600_BIG_ENDIAN) { 146 for (i = 0; i < shader->shader.bc.ndw; ++i) { 147 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]); 148 } 149 } else { 150 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr)); 151 } 152 rctx->b.ws->buffer_unmap(shader->bo->buf); 153 } 154 155 return 0; 156} 157 158int r600_pipe_shader_create(struct pipe_context *ctx, 159 struct r600_pipe_shader *shader, 160 union r600_shader_key key) 161{ 162 struct r600_context *rctx = (struct r600_context *)ctx; 163 struct r600_pipe_shader_selector *sel = shader->selector; 164 int r; 165 bool dump = r600_can_dump_shader(&rctx->screen->b, 166 tgsi_get_processor_type(sel->tokens)); 167 unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB); 168 unsigned sb_disasm; 169 unsigned export_shader; 170 171 shader->shader.bc.isa = rctx->isa; 172 173 if (dump) { 174 fprintf(stderr, "--------------------------------------------------------------\n"); 175 tgsi_dump(sel->tokens, 0); 176 177 if (sel->so.num_outputs) { 178 r600_dump_streamout(&sel->so); 179 } 
180 } 181 r = r600_shader_from_tgsi(rctx, shader, key); 182 if (r) { 183 R600_ERR("translation from TGSI failed !\n"); 184 goto error; 185 } 186 if (shader->shader.processor_type == PIPE_SHADER_VERTEX) { 187 /* only disable for vertex shaders in tess paths */ 188 if (key.vs.as_ls) 189 use_sb = 0; 190 } 191 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL); 192 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL); 193 use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE); 194 195 /* disable SB for shaders using doubles */ 196 use_sb &= !shader->shader.uses_doubles; 197 198 use_sb &= !shader->shader.uses_atomics; 199 use_sb &= !shader->shader.uses_images; 200 use_sb &= !shader->shader.uses_helper_invocation; 201 202 /* Check if the bytecode has already been built. */ 203 if (!shader->shader.bc.bytecode) { 204 r = r600_bytecode_build(&shader->shader.bc); 205 if (r) { 206 R600_ERR("building bytecode failed !\n"); 207 goto error; 208 } 209 } 210 211 sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM); 212 if (dump && !sb_disasm) { 213 fprintf(stderr, "--------------------------------------------------------------\n"); 214 r600_bytecode_disasm(&shader->shader.bc); 215 fprintf(stderr, "______________________________________________________________\n"); 216 } else if ((dump && sb_disasm) || use_sb) { 217 r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader, 218 dump, use_sb); 219 if (r) { 220 R600_ERR("r600_sb_bytecode_process failed !\n"); 221 goto error; 222 } 223 } 224 225 if (shader->gs_copy_shader) { 226 if (dump) { 227 // dump copy shader 228 r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc, 229 &shader->gs_copy_shader->shader, dump, 0); 230 if (r) 231 goto error; 232 } 233 234 if ((r = store_shader(ctx, shader->gs_copy_shader))) 235 goto error; 236 } 237 238 /* Store the shader in a buffer. 
*/ 239 if ((r = store_shader(ctx, shader))) 240 goto error; 241 242 /* Build state. */ 243 switch (shader->shader.processor_type) { 244 case PIPE_SHADER_TESS_CTRL: 245 evergreen_update_hs_state(ctx, shader); 246 break; 247 case PIPE_SHADER_TESS_EVAL: 248 if (key.tes.as_es) 249 evergreen_update_es_state(ctx, shader); 250 else 251 evergreen_update_vs_state(ctx, shader); 252 break; 253 case PIPE_SHADER_GEOMETRY: 254 if (rctx->b.chip_class >= EVERGREEN) { 255 evergreen_update_gs_state(ctx, shader); 256 evergreen_update_vs_state(ctx, shader->gs_copy_shader); 257 } else { 258 r600_update_gs_state(ctx, shader); 259 r600_update_vs_state(ctx, shader->gs_copy_shader); 260 } 261 break; 262 case PIPE_SHADER_VERTEX: 263 export_shader = key.vs.as_es; 264 if (rctx->b.chip_class >= EVERGREEN) { 265 if (key.vs.as_ls) 266 evergreen_update_ls_state(ctx, shader); 267 else if (key.vs.as_es) 268 evergreen_update_es_state(ctx, shader); 269 else 270 evergreen_update_vs_state(ctx, shader); 271 } else { 272 if (export_shader) 273 r600_update_es_state(ctx, shader); 274 else 275 r600_update_vs_state(ctx, shader); 276 } 277 break; 278 case PIPE_SHADER_FRAGMENT: 279 if (rctx->b.chip_class >= EVERGREEN) { 280 evergreen_update_ps_state(ctx, shader); 281 } else { 282 r600_update_ps_state(ctx, shader); 283 } 284 break; 285 case PIPE_SHADER_COMPUTE: 286 evergreen_update_ls_state(ctx, shader); 287 break; 288 default: 289 r = -EINVAL; 290 goto error; 291 } 292 return 0; 293 294error: 295 r600_pipe_shader_destroy(ctx, shader); 296 return r; 297} 298 299void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader) 300{ 301 r600_resource_reference(&shader->bo, NULL); 302 r600_bytecode_clear(&shader->shader.bc); 303 r600_release_command_buffer(&shader->command_buffer); 304} 305 306/* 307 * tgsi -> r600 shader 308 */ 309struct r600_shader_tgsi_instruction; 310 311struct r600_shader_src { 312 unsigned sel; 313 unsigned swizzle[4]; 314 unsigned neg; 315 unsigned abs; 316 
unsigned rel; 317 unsigned kc_bank; 318 boolean kc_rel; /* true if cache bank is indexed */ 319 uint32_t value[4]; 320}; 321 322struct eg_interp { 323 boolean enabled; 324 unsigned ij_index; 325}; 326 327struct r600_shader_ctx { 328 struct tgsi_shader_info info; 329 struct tgsi_array_info *array_infos; 330 /* flag for each tgsi temp array if its been spilled or not */ 331 bool *spilled_arrays; 332 struct tgsi_parse_context parse; 333 const struct tgsi_token *tokens; 334 unsigned type; 335 unsigned file_offset[TGSI_FILE_COUNT]; 336 unsigned temp_reg; 337 const struct r600_shader_tgsi_instruction *inst_info; 338 struct r600_bytecode *bc; 339 struct r600_shader *shader; 340 struct r600_shader_src src[4]; 341 uint32_t *literals; 342 uint32_t nliterals; 343 uint32_t max_driver_temp_used; 344 /* needed for evergreen interpolation */ 345 struct eg_interp eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid 346 /* evergreen/cayman also store sample mask in face register */ 347 int face_gpr; 348 /* sample id is .w component stored in fixed point position register */ 349 int fixed_pt_position_gpr; 350 int colors_used; 351 boolean clip_vertex_write; 352 unsigned cv_output; 353 unsigned edgeflag_output; 354 int helper_invoc_reg; 355 int cs_block_size_reg; 356 int cs_grid_size_reg; 357 bool cs_block_size_loaded, cs_grid_size_loaded; 358 int fragcoord_input; 359 int next_ring_offset; 360 int gs_out_ring_offset; 361 int gs_next_vertex; 362 struct r600_shader *gs_for_vs; 363 int gs_export_gpr_tregs[4]; 364 int gs_rotated_input[2]; 365 const struct pipe_stream_output_info *gs_stream_output_info; 366 unsigned enabled_stream_buffers_mask; 367 unsigned tess_input_info; /* temp with tess input offsets */ 368 unsigned tess_output_info; /* temp with tess input offsets */ 369 unsigned thread_id_gpr; /* temp with thread id calculated for images */ 370}; 371 372struct r600_shader_tgsi_instruction { 373 unsigned op; 374 int (*process)(struct r600_shader_ctx *ctx); 
375}; 376 377static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind); 378static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[]; 379static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx); 380static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason); 381static void fc_pushlevel(struct r600_shader_ctx *ctx, int type); 382static int tgsi_else(struct r600_shader_ctx *ctx); 383static int tgsi_endif(struct r600_shader_ctx *ctx); 384static int tgsi_bgnloop(struct r600_shader_ctx *ctx); 385static int tgsi_endloop(struct r600_shader_ctx *ctx); 386static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx); 387static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, 388 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan, 389 unsigned int dst_reg); 390static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src, 391 const struct r600_shader_src *shader_src, 392 unsigned chan); 393static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg, 394 unsigned dst_reg, unsigned mask); 395 396static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx) 397{ 398 if (ctx->bc->family == CHIP_HEMLOCK || 399 ctx->bc->family == CHIP_CYPRESS || 400 ctx->bc->family == CHIP_JUNIPER) 401 return false; 402 return true; 403} 404 405static int tgsi_last_instruction(unsigned writemask) 406{ 407 int i, lasti = 0; 408 409 for (i = 0; i < 4; i++) { 410 if (writemask & (1 << i)) { 411 lasti = i; 412 } 413 } 414 return lasti; 415} 416 417static int tgsi_is_supported(struct r600_shader_ctx *ctx) 418{ 419 struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction; 420 unsigned j; 421 422 if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) { 423 R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs); 424 return 
-EINVAL; 425 } 426#if 0 427 if (i->Instruction.Label) { 428 R600_ERR("label unsupported\n"); 429 return -EINVAL; 430 } 431#endif 432 for (j = 0; j < i->Instruction.NumSrcRegs; j++) { 433 if (i->Src[j].Register.Dimension) { 434 switch (i->Src[j].Register.File) { 435 case TGSI_FILE_CONSTANT: 436 case TGSI_FILE_HW_ATOMIC: 437 break; 438 case TGSI_FILE_INPUT: 439 if (ctx->type == PIPE_SHADER_GEOMETRY || 440 ctx->type == PIPE_SHADER_TESS_CTRL || 441 ctx->type == PIPE_SHADER_TESS_EVAL) 442 break; 443 case TGSI_FILE_OUTPUT: 444 if (ctx->type == PIPE_SHADER_TESS_CTRL) 445 break; 446 default: 447 R600_ERR("unsupported src %d (file %d, dimension %d)\n", j, 448 i->Src[j].Register.File, 449 i->Src[j].Register.Dimension); 450 return -EINVAL; 451 } 452 } 453 } 454 for (j = 0; j < i->Instruction.NumDstRegs; j++) { 455 if (i->Dst[j].Register.Dimension) { 456 if (ctx->type == PIPE_SHADER_TESS_CTRL) 457 continue; 458 R600_ERR("unsupported dst (dimension)\n"); 459 return -EINVAL; 460 } 461 } 462 return 0; 463} 464 465int eg_get_interpolator_index(unsigned interpolate, unsigned location) 466{ 467 if (interpolate == TGSI_INTERPOLATE_COLOR || 468 interpolate == TGSI_INTERPOLATE_LINEAR || 469 interpolate == TGSI_INTERPOLATE_PERSPECTIVE) 470 { 471 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR; 472 int loc; 473 474 switch(location) { 475 case TGSI_INTERPOLATE_LOC_CENTER: 476 loc = 1; 477 break; 478 case TGSI_INTERPOLATE_LOC_CENTROID: 479 loc = 2; 480 break; 481 case TGSI_INTERPOLATE_LOC_SAMPLE: 482 default: 483 loc = 0; break; 484 } 485 486 return is_linear * 3 + loc; 487 } 488 489 return -1; 490} 491 492static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx, 493 int input) 494{ 495 int i = eg_get_interpolator_index( 496 ctx->shader->input[input].interpolate, 497 ctx->shader->input[input].interpolate_location); 498 assert(i >= 0); 499 ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index; 500} 501 502static int evergreen_interp_alu(struct 
r600_shader_ctx *ctx, int input) 503{ 504 int i, r; 505 struct r600_bytecode_alu alu; 506 int gpr = 0, base_chan = 0; 507 int ij_index = ctx->shader->input[input].ij_index; 508 509 /* work out gpr and base_chan from index */ 510 gpr = ij_index / 2; 511 base_chan = (2 * (ij_index % 2)) + 1; 512 513 for (i = 0; i < 8; i++) { 514 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 515 516 if (i < 4) 517 alu.op = ALU_OP2_INTERP_ZW; 518 else 519 alu.op = ALU_OP2_INTERP_XY; 520 521 if ((i > 1) && (i < 6)) { 522 alu.dst.sel = ctx->shader->input[input].gpr; 523 alu.dst.write = 1; 524 } 525 526 alu.dst.chan = i % 4; 527 528 alu.src[0].sel = gpr; 529 alu.src[0].chan = (base_chan - (i % 2)); 530 531 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos; 532 533 alu.bank_swizzle_force = SQ_ALU_VEC_210; 534 if ((i % 4) == 3) 535 alu.last = 1; 536 r = r600_bytecode_add_alu(ctx->bc, &alu); 537 if (r) 538 return r; 539 } 540 return 0; 541} 542 543static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input) 544{ 545 int i, r; 546 struct r600_bytecode_alu alu; 547 548 for (i = 0; i < 4; i++) { 549 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 550 551 alu.op = ALU_OP1_INTERP_LOAD_P0; 552 553 alu.dst.sel = ctx->shader->input[input].gpr; 554 alu.dst.write = 1; 555 556 alu.dst.chan = i; 557 558 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos; 559 alu.src[0].chan = i; 560 561 if (i == 3) 562 alu.last = 1; 563 r = r600_bytecode_add_alu(ctx->bc, &alu); 564 if (r) 565 return r; 566 } 567 return 0; 568} 569 570/* 571 * Special export handling in shaders 572 * 573 * shader export ARRAY_BASE for EXPORT_POS: 574 * 60 is position 575 * 61 is misc vector 576 * 62, 63 are clip distance vectors 577 * 578 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL: 579 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61 580 * USE_VTX_POINT_SIZE - point size in the X channel of export 61 581 * 
USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61 582 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61 583 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61 584 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually 585 * exclusive from render target index) 586 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors 587 * 588 * 589 * shader export ARRAY_BASE for EXPORT_PIXEL: 590 * 0-7 CB targets 591 * 61 computed Z vector 592 * 593 * The use of the values exported in the computed Z vector are controlled 594 * by DB_SHADER_CONTROL: 595 * Z_EXPORT_ENABLE - Z as a float in RED 596 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN 597 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA 598 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE 599 * DB_SOURCE_FORMAT - export control restrictions 600 * 601 */ 602 603 604/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */ 605static int r600_spi_sid(struct r600_shader_io * io) 606{ 607 int index, name = io->name; 608 609 /* These params are handled differently, they don't need 610 * semantic indices, so we'll use 0 for them. 611 */ 612 if (name == TGSI_SEMANTIC_POSITION || 613 name == TGSI_SEMANTIC_PSIZE || 614 name == TGSI_SEMANTIC_EDGEFLAG || 615 name == TGSI_SEMANTIC_FACE || 616 name == TGSI_SEMANTIC_SAMPLEMASK) 617 index = 0; 618 else { 619 if (name == TGSI_SEMANTIC_GENERIC) { 620 /* For generic params simply use sid from tgsi */ 621 index = io->sid; 622 } else { 623 /* For non-generic params - pack name and sid into 8 bits */ 624 index = 0x80 | (name<<3) | (io->sid); 625 } 626 627 /* Make sure that all really used indices have nonzero value, so 628 * we can just compare it to 0 later instead of comparing the name 629 * with different values to detect special cases. 
*/ 630 index++; 631 } 632 633 return index; 634}; 635 636/* we need this to get a common lds index for vs/tcs/tes input/outputs */ 637int r600_get_lds_unique_index(unsigned semantic_name, unsigned index) 638{ 639 switch (semantic_name) { 640 case TGSI_SEMANTIC_POSITION: 641 return 0; 642 case TGSI_SEMANTIC_PSIZE: 643 return 1; 644 case TGSI_SEMANTIC_CLIPDIST: 645 assert(index <= 1); 646 return 2 + index; 647 case TGSI_SEMANTIC_GENERIC: 648 if (index <= 63-4) 649 return 4 + index - 9; 650 else 651 /* same explanation as in the default statement, 652 * the only user hitting this is st/nine. 653 */ 654 return 0; 655 656 /* patch indices are completely separate and thus start from 0 */ 657 case TGSI_SEMANTIC_TESSOUTER: 658 return 0; 659 case TGSI_SEMANTIC_TESSINNER: 660 return 1; 661 case TGSI_SEMANTIC_PATCH: 662 return 2 + index; 663 664 default: 665 /* Don't fail here. The result of this function is only used 666 * for LS, TCS, TES, and GS, where legacy GL semantics can't 667 * occur, but this function is called for all vertex shaders 668 * before it's known whether LS will be compiled or not. 
669 */ 670 return 0; 671 } 672} 673 674/* turn input into interpolate on EG */ 675static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index) 676{ 677 int r = 0; 678 679 if (ctx->shader->input[index].spi_sid) { 680 ctx->shader->input[index].lds_pos = ctx->shader->nlds++; 681 if (ctx->shader->input[index].interpolate > 0) { 682 evergreen_interp_assign_ij_index(ctx, index); 683 r = evergreen_interp_alu(ctx, index); 684 } else { 685 r = evergreen_interp_flat(ctx, index); 686 } 687 } 688 return r; 689} 690 691static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back) 692{ 693 struct r600_bytecode_alu alu; 694 int i, r; 695 int gpr_front = ctx->shader->input[front].gpr; 696 int gpr_back = ctx->shader->input[back].gpr; 697 698 for (i = 0; i < 4; i++) { 699 memset(&alu, 0, sizeof(alu)); 700 alu.op = ALU_OP3_CNDGT; 701 alu.is_op3 = 1; 702 alu.dst.write = 1; 703 alu.dst.sel = gpr_front; 704 alu.src[0].sel = ctx->face_gpr; 705 alu.src[1].sel = gpr_front; 706 alu.src[2].sel = gpr_back; 707 708 alu.dst.chan = i; 709 alu.src[1].chan = i; 710 alu.src[2].chan = i; 711 alu.last = (i==3); 712 713 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 714 return r; 715 } 716 717 return 0; 718} 719 720/* execute a single slot ALU calculation */ 721static int single_alu_op2(struct r600_shader_ctx *ctx, int op, 722 int dst_sel, int dst_chan, 723 int src0_sel, unsigned src0_chan_val, 724 int src1_sel, unsigned src1_chan_val) 725{ 726 struct r600_bytecode_alu alu; 727 int r, i; 728 729 if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) { 730 for (i = 0; i < 4; i++) { 731 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 732 alu.op = op; 733 alu.src[0].sel = src0_sel; 734 if (src0_sel == V_SQ_ALU_SRC_LITERAL) 735 alu.src[0].value = src0_chan_val; 736 else 737 alu.src[0].chan = src0_chan_val; 738 alu.src[1].sel = src1_sel; 739 if (src1_sel == V_SQ_ALU_SRC_LITERAL) 740 alu.src[1].value = src1_chan_val; 741 else 742 alu.src[1].chan = src1_chan_val; 
743 alu.dst.sel = dst_sel; 744 alu.dst.chan = i; 745 alu.dst.write = i == dst_chan; 746 alu.last = (i == 3); 747 r = r600_bytecode_add_alu(ctx->bc, &alu); 748 if (r) 749 return r; 750 } 751 return 0; 752 } 753 754 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 755 alu.op = op; 756 alu.src[0].sel = src0_sel; 757 if (src0_sel == V_SQ_ALU_SRC_LITERAL) 758 alu.src[0].value = src0_chan_val; 759 else 760 alu.src[0].chan = src0_chan_val; 761 alu.src[1].sel = src1_sel; 762 if (src1_sel == V_SQ_ALU_SRC_LITERAL) 763 alu.src[1].value = src1_chan_val; 764 else 765 alu.src[1].chan = src1_chan_val; 766 alu.dst.sel = dst_sel; 767 alu.dst.chan = dst_chan; 768 alu.dst.write = 1; 769 alu.last = 1; 770 r = r600_bytecode_add_alu(ctx->bc, &alu); 771 if (r) 772 return r; 773 return 0; 774} 775 776/* execute a single slot ALU calculation */ 777static int single_alu_op3(struct r600_shader_ctx *ctx, int op, 778 int dst_sel, int dst_chan, 779 int src0_sel, unsigned src0_chan_val, 780 int src1_sel, unsigned src1_chan_val, 781 int src2_sel, unsigned src2_chan_val) 782{ 783 struct r600_bytecode_alu alu; 784 int r; 785 786 /* validate this for other ops */ 787 assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT); 788 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 789 alu.op = op; 790 alu.src[0].sel = src0_sel; 791 if (src0_sel == V_SQ_ALU_SRC_LITERAL) 792 alu.src[0].value = src0_chan_val; 793 else 794 alu.src[0].chan = src0_chan_val; 795 alu.src[1].sel = src1_sel; 796 if (src1_sel == V_SQ_ALU_SRC_LITERAL) 797 alu.src[1].value = src1_chan_val; 798 else 799 alu.src[1].chan = src1_chan_val; 800 alu.src[2].sel = src2_sel; 801 if (src2_sel == V_SQ_ALU_SRC_LITERAL) 802 alu.src[2].value = src2_chan_val; 803 else 804 alu.src[2].chan = src2_chan_val; 805 alu.dst.sel = dst_sel; 806 alu.dst.chan = dst_chan; 807 alu.is_op3 = 1; 808 alu.last = 1; 809 r = r600_bytecode_add_alu(ctx->bc, &alu); 810 if (r) 811 return r; 812 return 0; 813} 814 815/* put it in temp_reg.x 
*/ 816static int get_lds_offset0(struct r600_shader_ctx *ctx, 817 int rel_patch_chan, 818 int temp_reg, bool is_patch_var) 819{ 820 int r; 821 822 /* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */ 823 /* ADD 824 Dimension - patch0_offset (input_vals.z), 825 Non-dim - patch0_data_offset (input_vals.w) 826 */ 827 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, 828 temp_reg, 0, 829 ctx->tess_output_info, 0, 830 0, rel_patch_chan, 831 ctx->tess_output_info, is_patch_var ? 3 : 2); 832 if (r) 833 return r; 834 return 0; 835} 836 837static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index) 838{ 839 return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg; 840} 841 842static int r600_get_temp(struct r600_shader_ctx *ctx) 843{ 844 return ctx->temp_reg + ctx->max_driver_temp_used++; 845} 846 847static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid) 848{ 849 int i; 850 i = ctx->shader->noutput++; 851 ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID; 852 ctx->shader->output[i].sid = 0; 853 ctx->shader->output[i].gpr = 0; 854 ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT; 855 ctx->shader->output[i].write_mask = 0x4; 856 ctx->shader->output[i].spi_sid = prim_id_sid; 857 858 return 0; 859} 860 861static int tgsi_barrier(struct r600_shader_ctx *ctx) 862{ 863 struct r600_bytecode_alu alu; 864 int r; 865 866 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 867 alu.op = ctx->inst_info->op; 868 alu.last = 1; 869 870 r = r600_bytecode_add_alu(ctx->bc, &alu); 871 if (r) 872 return r; 873 return 0; 874} 875 876static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed) 877{ 878 // pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays 879 unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY]; 880 unsigned narrays_left = n; 881 bool *spilled = ctx->spilled_arrays; // assumed calloc:ed 882 883 
*scratch_space_needed = 0; 884 while (*regno > 124 && narrays_left) { 885 unsigned i; 886 unsigned largest = 0; 887 unsigned largest_index = 0; 888 889 for (i = 0; i < n; i++) { 890 unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1; 891 if (!spilled[i] && size > largest) { 892 largest = size; 893 largest_index = i; 894 } 895 } 896 897 spilled[largest_index] = true; 898 *regno -= largest; 899 *scratch_space_needed += largest; 900 901 narrays_left --; 902 } 903 904 if (narrays_left == 0) { 905 ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY); 906 } 907} 908 909/* Take spilled temp arrays into account when translating tgsi register 910 * indexes into r600 gprs if spilled is false, or scratch array offset if 911 * spilled is true */ 912static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled) 913{ 914 unsigned i; 915 unsigned spilled_size = 0; 916 917 for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) { 918 if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) { 919 if (ctx->spilled_arrays[i]) { 920 /* vec4 index into spilled scratch memory */ 921 *spilled = true; 922 return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size; 923 } 924 else { 925 /* regular GPR array */ 926 *spilled = false; 927 return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY]; 928 } 929 } 930 931 if (tgsi_reg_index < ctx->array_infos[i].range.First) 932 break; 933 if (ctx->spilled_arrays[i]) { 934 spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1; 935 } 936 } 937 938 /* regular GPR index, minus the holes from spilled arrays */ 939 *spilled = false; 940 941 return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY]; 942} 943 944/* look up spill area base offset and array size for a spilled temp array */ 945static void 
get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, 946 unsigned *array_base, unsigned *array_size) 947{ 948 unsigned i; 949 unsigned offset = 0; 950 951 for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) { 952 if (ctx->spilled_arrays[i]) { 953 unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1; 954 955 if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) { 956 *array_base = offset; 957 *array_size = size - 1; /* hw counts from 1 */ 958 959 return; 960 } 961 962 offset += size; 963 } 964 } 965} 966 967static int tgsi_declaration(struct r600_shader_ctx *ctx) 968{ 969 struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration; 970 int r, i, j, count = d->Range.Last - d->Range.First + 1; 971 972 switch (d->Declaration.File) { 973 case TGSI_FILE_INPUT: 974 for (j = 0; j < count; j++) { 975 i = ctx->shader->ninput + j; 976 assert(i < ARRAY_SIZE(ctx->shader->input)); 977 ctx->shader->input[i].name = d->Semantic.Name; 978 ctx->shader->input[i].sid = d->Semantic.Index + j; 979 ctx->shader->input[i].interpolate = d->Interp.Interpolate; 980 ctx->shader->input[i].interpolate_location = d->Interp.Location; 981 ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j; 982 if (ctx->type == PIPE_SHADER_FRAGMENT) { 983 ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]); 984 switch (ctx->shader->input[i].name) { 985 case TGSI_SEMANTIC_FACE: 986 if (ctx->face_gpr != -1) 987 ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */ 988 else 989 ctx->face_gpr = ctx->shader->input[i].gpr; 990 break; 991 case TGSI_SEMANTIC_COLOR: 992 ctx->colors_used++; 993 break; 994 case TGSI_SEMANTIC_POSITION: 995 ctx->fragcoord_input = i; 996 break; 997 case TGSI_SEMANTIC_PRIMID: 998 /* set this for now */ 999 ctx->shader->gs_prim_id_input = true; 1000 
ctx->shader->ps_prim_id_input = i; 1001 break; 1002 } 1003 if (ctx->bc->chip_class >= EVERGREEN) { 1004 if ((r = evergreen_interp_input(ctx, i))) 1005 return r; 1006 } 1007 } else if (ctx->type == PIPE_SHADER_GEOMETRY) { 1008 /* FIXME probably skip inputs if they aren't passed in the ring */ 1009 ctx->shader->input[i].ring_offset = ctx->next_ring_offset; 1010 ctx->next_ring_offset += 16; 1011 if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID) 1012 ctx->shader->gs_prim_id_input = true; 1013 } 1014 } 1015 ctx->shader->ninput += count; 1016 break; 1017 case TGSI_FILE_OUTPUT: 1018 for (j = 0; j < count; j++) { 1019 i = ctx->shader->noutput + j; 1020 assert(i < ARRAY_SIZE(ctx->shader->output)); 1021 ctx->shader->output[i].name = d->Semantic.Name; 1022 ctx->shader->output[i].sid = d->Semantic.Index + j; 1023 ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j; 1024 ctx->shader->output[i].interpolate = d->Interp.Interpolate; 1025 ctx->shader->output[i].write_mask = d->Declaration.UsageMask; 1026 if (ctx->type == PIPE_SHADER_VERTEX || 1027 ctx->type == PIPE_SHADER_GEOMETRY || 1028 ctx->type == PIPE_SHADER_TESS_EVAL) { 1029 ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]); 1030 switch (d->Semantic.Name) { 1031 case TGSI_SEMANTIC_CLIPDIST: 1032 break; 1033 case TGSI_SEMANTIC_PSIZE: 1034 ctx->shader->vs_out_misc_write = 1; 1035 ctx->shader->vs_out_point_size = 1; 1036 break; 1037 case TGSI_SEMANTIC_EDGEFLAG: 1038 ctx->shader->vs_out_misc_write = 1; 1039 ctx->shader->vs_out_edgeflag = 1; 1040 ctx->edgeflag_output = i; 1041 break; 1042 case TGSI_SEMANTIC_VIEWPORT_INDEX: 1043 ctx->shader->vs_out_misc_write = 1; 1044 ctx->shader->vs_out_viewport = 1; 1045 break; 1046 case TGSI_SEMANTIC_LAYER: 1047 ctx->shader->vs_out_misc_write = 1; 1048 ctx->shader->vs_out_layer = 1; 1049 break; 1050 case TGSI_SEMANTIC_CLIPVERTEX: 1051 ctx->clip_vertex_write = TRUE; 1052 ctx->cv_output = i; 1053 break; 1054 } 1055 if (ctx->type == 
PIPE_SHADER_GEOMETRY) { 1056 ctx->gs_out_ring_offset += 16; 1057 } 1058 } else if (ctx->type == PIPE_SHADER_FRAGMENT) { 1059 switch (d->Semantic.Name) { 1060 case TGSI_SEMANTIC_COLOR: 1061 ctx->shader->nr_ps_max_color_exports++; 1062 break; 1063 } 1064 } 1065 } 1066 ctx->shader->noutput += count; 1067 break; 1068 case TGSI_FILE_TEMPORARY: 1069 if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) { 1070 if (d->Array.ArrayID) { 1071 bool spilled; 1072 unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx, 1073 d->Range.First, 1074 &spilled); 1075 1076 if (!spilled) { 1077 r600_add_gpr_array(ctx->shader, idx, 1078 d->Range.Last - d->Range.First + 1, 0x0F); 1079 } 1080 } 1081 } 1082 break; 1083 1084 case TGSI_FILE_CONSTANT: 1085 case TGSI_FILE_SAMPLER: 1086 case TGSI_FILE_SAMPLER_VIEW: 1087 case TGSI_FILE_ADDRESS: 1088 case TGSI_FILE_BUFFER: 1089 case TGSI_FILE_IMAGE: 1090 case TGSI_FILE_MEMORY: 1091 break; 1092 1093 case TGSI_FILE_HW_ATOMIC: 1094 i = ctx->shader->nhwatomic_ranges; 1095 ctx->shader->atomics[i].start = d->Range.First; 1096 ctx->shader->atomics[i].end = d->Range.Last; 1097 ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic; 1098 ctx->shader->atomics[i].array_id = d->Array.ArrayID; 1099 ctx->shader->atomics[i].buffer_id = d->Dim.Index2D; 1100 ctx->shader->nhwatomic_ranges++; 1101 ctx->shader->nhwatomic += count; 1102 break; 1103 1104 case TGSI_FILE_SYSTEM_VALUE: 1105 if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK || 1106 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID || 1107 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) { 1108 break; /* Already handled from allocate_system_value_inputs */ 1109 } else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) { 1110 break; 1111 } else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID) 1112 break; 1113 else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID) 1114 break; 1115 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER || 1116 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) { 1117 
int param = r600_get_lds_unique_index(d->Semantic.Name, 0); 1118 int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2; 1119 unsigned temp_reg = r600_get_temp(ctx); 1120 1121 r = get_lds_offset0(ctx, 2, temp_reg, true); 1122 if (r) 1123 return r; 1124 1125 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 1126 temp_reg, 0, 1127 temp_reg, 0, 1128 V_SQ_ALU_SRC_LITERAL, param * 16); 1129 if (r) 1130 return r; 1131 1132 do_lds_fetch_values(ctx, temp_reg, dreg, 0xf); 1133 } 1134 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) { 1135 /* MOV r1.x, r0.x; 1136 MOV r1.y, r0.y; 1137 */ 1138 for (i = 0; i < 2; i++) { 1139 struct r600_bytecode_alu alu; 1140 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1141 alu.op = ALU_OP1_MOV; 1142 alu.src[0].sel = 0; 1143 alu.src[0].chan = 0 + i; 1144 alu.dst.sel = 1; 1145 alu.dst.chan = 0 + i; 1146 alu.dst.write = 1; 1147 alu.last = (i == 1) ? 1 : 0; 1148 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 1149 return r; 1150 } 1151 /* ADD r1.z, 1.0f, -r0.x */ 1152 struct r600_bytecode_alu alu; 1153 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1154 alu.op = ALU_OP2_ADD; 1155 alu.src[0].sel = V_SQ_ALU_SRC_1; 1156 alu.src[1].sel = 1; 1157 alu.src[1].chan = 0; 1158 alu.src[1].neg = 1; 1159 alu.dst.sel = 1; 1160 alu.dst.chan = 2; 1161 alu.dst.write = 1; 1162 alu.last = 1; 1163 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 1164 return r; 1165 1166 /* ADD r1.z, r1.z, -r1.y */ 1167 alu.op = ALU_OP2_ADD; 1168 alu.src[0].sel = 1; 1169 alu.src[0].chan = 2; 1170 alu.src[1].sel = 1; 1171 alu.src[1].chan = 1; 1172 alu.src[1].neg = 1; 1173 alu.dst.sel = 1; 1174 alu.dst.chan = 2; 1175 alu.dst.write = 1; 1176 alu.last = 1; 1177 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 1178 return r; 1179 break; 1180 } 1181 break; 1182 default: 1183 R600_ERR("unsupported file %d declaration\n", d->Declaration.File); 1184 return -EINVAL; 1185 } 1186 return 0; 1187} 1188 1189static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int 
gpr_offset)
{
	struct tgsi_parse_context parse;
	/* Candidate system-value inputs that live in (channels of) the
	 * hardware-provided input GPRs.  alternate_name lets one slot match
	 * two TGSI semantics (SAMPLEID and SAMPLEPOS share the second slot,
	 * since SAMPLEPOS is looked up by sample id). */
	struct {
		boolean enabled;
		int *reg;
		unsigned name, alternate_name;
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int num_regs = 0;
	unsigned k, i;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				/* mark the interpolator this instruction needs as live */
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* reading the sample mask together with per-sample interpolation also
	 * requires the fixed point position / sample id input */
	if (ctx->info.reads_samplemask &&
	    (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
		inputs[1].enabled = true;
	}

	if (ctx->bc->chip_class >= EVERGREEN) {
		int num_baryc = 0;
		/* assign gpr to each interpolator according to priority */
		for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
			if (ctx->eg_interpolators[i].enabled) {
				ctx->eg_interpolators[i].ij_index = num_baryc;
				num_baryc++;
			}
		}
		/* two (i,j) barycentric pairs fit per GPR, rounded up */
		num_baryc = (num_baryc + 1) >> 1;
		gpr_offset += num_baryc;
	}

	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;
			ctx->shader->nsys_inputs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			/* record the allocated gpr both in the shader input table
			 * and in the ctx field the inputs[] slot points at */
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}

/*
 * for evergreen we need to scan the shader to find the number of GPRs we need to
 * reserve for interpolation and system values
 *
 * we need to know if we are going to emit any sample or centroid inputs
 * if perspective and linear are required
*/
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	unsigned i;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/*
	 * Could get this information from the shader info.
But right now
	 * we interpolate all declared inputs, whereas the shader info will
	 * only contain the bits if the inputs are actually used, so it might
	 * not be safe...
	 */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	return allocate_system_value_inputs(ctx, 0);
}

/* sample_id_sel == NULL means fetch for current sample */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	t1 = r600_get_temp(ctx);

	/* fetch the sample position from the buffer-info const buffer,
	 * indexed by sample id; the result lands in temp t1.xyzw and t1 is
	 * returned to the caller */
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		assert(ctx->fixed_pt_position_gpr != -1);

		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		/* copy the caller-supplied sample id into t1.x to use as the
		 * fetch index */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 0;
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}

/* Evergreen: materialize gl_HelperInvocation in helper_invoc_reg.x.
 * The register is first set to ~0 for all lanes; a subsequent fetch issued
 * in valid-pixel mode then overwrites it, so helper lanes presumably keep
 * the ~0 sentinel — confirm against the ISA docs. */
static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* do a vtx fetch with wqm set on the vtx fetch */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* do a vtx fetch in VPM mode */
	struct r600_bytecode_vtx vtx;
	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = 0;
	vtx.mega_fetch_count = 16; /* no idea here really...
*/ 1410 vtx.dst_gpr = ctx->helper_invoc_reg; 1411 vtx.dst_sel_x = 4; 1412 vtx.dst_sel_y = 7; /* SEL_Y */ 1413 vtx.dst_sel_z = 7; /* SEL_Z */ 1414 vtx.dst_sel_w = 7; /* SEL_W */ 1415 vtx.data_format = FMT_32; 1416 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx))) 1417 return r; 1418 ctx->bc->cf_last->vpm = 1; 1419 return 0; 1420} 1421 1422static int cm_load_helper_invocation(struct r600_shader_ctx *ctx) 1423{ 1424 int r; 1425 struct r600_bytecode_alu alu; 1426 1427 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1428 alu.op = ALU_OP1_MOV; 1429 alu.dst.sel = ctx->helper_invoc_reg; 1430 alu.dst.chan = 0; 1431 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 1432 alu.src[0].value = 0xffffffff; 1433 alu.dst.write = 1; 1434 alu.last = 1; 1435 r = r600_bytecode_add_alu(ctx->bc, &alu); 1436 if (r) 1437 return r; 1438 1439 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1440 alu.op = ALU_OP1_MOV; 1441 alu.dst.sel = ctx->helper_invoc_reg; 1442 alu.dst.chan = 0; 1443 alu.src[0].sel = V_SQ_ALU_SRC_0; 1444 alu.dst.write = 1; 1445 alu.last = 1; 1446 r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE); 1447 if (r) 1448 return r; 1449 1450 return ctx->helper_invoc_reg; 1451} 1452 1453static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block) 1454{ 1455 struct r600_bytecode_vtx vtx; 1456 int r, t1; 1457 1458 if (ctx->cs_block_size_loaded) 1459 return ctx->cs_block_size_reg; 1460 if (ctx->cs_grid_size_loaded) 1461 return ctx->cs_grid_size_reg; 1462 1463 t1 = load_block ? 
ctx->cs_block_size_reg : ctx->cs_grid_size_reg; 1464 struct r600_bytecode_alu alu; 1465 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1466 alu.op = ALU_OP1_MOV; 1467 alu.src[0].sel = V_SQ_ALU_SRC_0; 1468 alu.dst.sel = t1; 1469 alu.dst.write = 1; 1470 alu.last = 1; 1471 r = r600_bytecode_add_alu(ctx->bc, &alu); 1472 if (r) 1473 return r; 1474 1475 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 1476 vtx.op = FETCH_OP_VFETCH; 1477 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER; 1478 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 1479 vtx.src_gpr = t1; 1480 vtx.src_sel_x = 0; 1481 1482 vtx.mega_fetch_count = 16; 1483 vtx.dst_gpr = t1; 1484 vtx.dst_sel_x = 0; 1485 vtx.dst_sel_y = 1; 1486 vtx.dst_sel_z = 2; 1487 vtx.dst_sel_w = 7; 1488 vtx.data_format = FMT_32_32_32_32; 1489 vtx.num_format_all = 1; 1490 vtx.format_comp_all = 0; 1491 vtx.use_const_fields = 0; 1492 vtx.offset = load_block ? 0 : 16; // first element is size of buffer 1493 vtx.endian = r600_endian_swap(32); 1494 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */ 1495 1496 r = r600_bytecode_add_vtx(ctx->bc, &vtx); 1497 if (r) 1498 return r; 1499 1500 if (load_block) 1501 ctx->cs_block_size_loaded = true; 1502 else 1503 ctx->cs_grid_size_loaded = true; 1504 return t1; 1505} 1506 1507static void tgsi_src(struct r600_shader_ctx *ctx, 1508 const struct tgsi_full_src_register *tgsi_src, 1509 struct r600_shader_src *r600_src) 1510{ 1511 memset(r600_src, 0, sizeof(*r600_src)); 1512 r600_src->swizzle[0] = tgsi_src->Register.SwizzleX; 1513 r600_src->swizzle[1] = tgsi_src->Register.SwizzleY; 1514 r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ; 1515 r600_src->swizzle[3] = tgsi_src->Register.SwizzleW; 1516 r600_src->neg = tgsi_src->Register.Negate; 1517 r600_src->abs = tgsi_src->Register.Absolute; 1518 1519 if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) { 1520 bool spilled; 1521 unsigned idx; 1522 1523 idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled); 1524 1525 if (spilled) { 1526 
int reg = r600_get_temp(ctx);
			int r;

			/* spilled temporary: read it from scratch memory into a
			 * fresh temp and point the source at that temp */
			r600_src->sel = reg;

			if (ctx->bc->chip_class < R700) {
				/* r600: scratch reads go through a CF export-read */
				struct r600_bytecode_output cf;

				memset(&cf, 0, sizeof(struct r600_bytecode_output));
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.comp_mask = 0xF;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&cf.array_base, &cf.array_size);

				if (tgsi_src->Register.Indirect) {
					/* indirect index comes from the AR register */
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
					cf.index_gpr = ctx->bc->ar_reg;
				}
				else {
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ;
					cf.array_base += idx;
					cf.array_size = 0;
				}

				r = r600_bytecode_add_output(ctx->bc, &cf);
			}
			else {
				/* R700+: scratch reads use a READ_SCRATCH vertex fetch */
				struct r600_bytecode_vtx vtx;

				/* make sure any pending spill write has landed before
				 * reading it back */
				if (r600_bytecode_get_need_wait_ack(ctx->bc)) {
					r600_bytecode_need_wait_ack(ctx->bc, false);
					r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
				}

				memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
				vtx.op = FETCH_OP_READ_SCRATCH;
				vtx.dst_gpr = reg;
				vtx.uncached = 1; // Must bypass cache since prior spill written in same invocation
				vtx.elem_size = 3;
				vtx.data_format = FMT_32_32_32_32;
				vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT;
				/* apply the source swizzle directly on the fetch dest */
				vtx.dst_sel_x = tgsi_src->Register.SwizzleX;
				vtx.dst_sel_y = tgsi_src->Register.SwizzleY;
				vtx.dst_sel_z = tgsi_src->Register.SwizzleZ;
				vtx.dst_sel_w = tgsi_src->Register.SwizzleW;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&vtx.array_base, &vtx.array_size);

				if (tgsi_src->Register.Indirect) {
					vtx.indexed = 1;
					vtx.src_gpr = ctx->bc->ar_reg;
				}
				else {
					vtx.array_base += idx;
					vtx.array_size = 0;
				}

				r = r600_bytecode_add_vtx(ctx->bc, &vtx);
			}

			/* NOTE(review): emission errors are swallowed here —
			 * tgsi_src() returns void, so the caller cannot see them */
			if (r)
				return;
		}
		else {
			if (tgsi_src->Register.Indirect)
				r600_src->rel = V_SQ_REL_RELATIVE;

			r600_src->sel = idx;
		}

		return;
	}

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* a fully replicated immediate may map onto one of the inline
		 * hardware constants instead of a literal */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		/* system values live in pre-loaded GPR channels; pick the
		 * register and replicate the channel via the swizzle */
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			/* fetch the position for the current sample on demand */
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
			r600_src->sel = 1;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* non-TCS invocation id: R1.w */
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* only reached for TESS_CTRL: invocation id in R0.z */
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			r600_src->sel = ctx->tess_input_info;
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, false);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, true);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
			r600_src->sel = ctx->helper_invoc_reg;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		}
	} else {
		/* plain register file: apply the per-file GPR offset */
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}

/* Fetch a relatively-addressed constant (const buffer cb_idx, dword offset
 * taken from AR channel ar_chan plus 'offset') into dst_reg.xyzw via a
 * vertex fetch. */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
				unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
				unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		/* index = AR + offset, computed into dst_reg.ar_chan */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel =
 V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel;	// cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}

/* Fetch one per-vertex GS input from the ESGS ring into dst_reg.xyzw.
 * The per-vertex ring offset is taken from the channel of gs_rotated_input
 * that corresponds to src->Dimension.Index (the vertex id); indirect vertex
 * or register indices are resolved with extra ALU ops first. */
static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
	int offset_chan = vtx_id % 3;
	int t2 = 0;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
		offset_chan = 3;

	if (src->Dimension.Indirect || src->Register.Indirect)
		t2 = r600_get_temp(ctx);

	if (src->Dimension.Indirect) {
		int treg[3];
		struct r600_bytecode_alu alu;
		int r, i;	/* NOTE(review): shadows the outer 'r' */
		unsigned addr_reg;
		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
		if (src->DimIndirect.Index > 0) {
			/* relative GPR addressing always uses AR; copy the
			 * secondary address register into it */
			r = single_alu_op2(ctx, ALU_OP1_MOV,
					   ctx->bc->ar_reg, 0,
					   addr_reg, 0,
					   0, 0);
			if (r)
				return r;
		}
		/*
		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
		   at least this is what fglrx seems to do. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = ctx->gs_rotated_input[0];
			alu.src[0].chan = i == 2 ? 3 : i;	/* x, y, w */
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* t2.x = treg[AR].x — the offset of the selected vertex */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
	}

	if (src->Register.Indirect) {
		int addr_reg;
		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];

		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);

		/* pull the value from index_reg */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   t2, 1,
				   addr_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, first);
		if (r)
			return r;
		/* t2.x = (array index) * 4 + vertex ring offset */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   t2, 0,
				   t2, 1,
				   V_SQ_ALU_SRC_LITERAL, 4,
				   offset_reg, offset_chan);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
		index = src->Register.Index - first;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	if (ctx->bc->chip_class >= EVERGREEN) {
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}

/* Rewrite GS input operands of the current instruction: PRIMID inputs are
 * redirected to R0.z, and dimensioned (per-vertex) inputs are fetched from
 * the ESGS ring into fresh temps. */
static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned i;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		struct tgsi_full_src_register *src = &inst->Src[i];

		if (src->Register.File == TGSI_FILE_INPUT) {
			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
				/* primitive id is in R0.z */
				ctx->src[i].sel = 0;
				ctx->src[i].swizzle[0] = 2;
			}
		}
		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
			int treg = r600_get_temp(ctx);

			/* NOTE(review): fetch_gs_input's return value is ignored */
			fetch_gs_input(ctx, src, treg);
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
		}
	}
	return 0;
}


/* Tessellation shaders pass outputs to the next shader using LDS.
 *
 * LS outputs = TCS(HS) inputs
 * TCS(HS) outputs = TES(DS) inputs
 *
 * The LDS layout is:
 * - TCS inputs for patch 0
 * - TCS inputs for patch 1
 * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
 * - ...
 * - TCS outputs for patch 0			= get_tcs_out_patch0_offset
 * - Per-patch TCS outputs for patch 0	= get_tcs_out_patch0_patch_data_offset
 * - TCS outputs for patch 1
 * - Per-patch TCS outputs for patch 1
 * - TCS outputs for patch 2			= get_tcs_out_current_patch_offset (if RelPatchID==2)
 * - Per-patch TCS outputs for patch 2	= get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 * - ...
 *
 * All three shaders VS(LS), TCS, TES share the same LDS space.
 */
/* this will return with the dw address in temp_reg.x */
/* NOTE(review): the offsets added below are 16 bytes per vec4 param and
 * stride_bytes per vertex, so the result looks like a byte address; the
 * 'dw address' wording above may be stale — confirm. */
static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
				 const struct tgsi_full_dst_register *dst,
				 const struct tgsi_full_src_register *src,
				 int stride_bytes_reg, int stride_bytes_chan)
{
	struct tgsi_full_dst_register reg;
	ubyte *name, *index, *array_first;
	int r;
	int param;
	struct tgsi_shader_info *info = &ctx->info;
	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		int sel, chan;
		if (reg.Dimension.Indirect) {
			unsigned addr_reg;
			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);

			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
			/* pull the value from index_reg */
			sel = addr_reg;
			chan = 0;
		} else {
			sel = V_SQ_ALU_SRC_LITERAL;
			chan = reg.Dimension.Index;
		}

		/* temp.x += vertex_index * stride_bytes */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   stride_bytes_reg, stride_bytes_chan,
				   sel, chan,
				   temp_reg, 0);
		if (r)
			return r;
	}

	/* pick the semantic tables matching the register file */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return -1;
	}
	if (reg.Register.Indirect) {
		int addr_reg;
		int first;
		/* Add the relative address of the element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);

		/* pull the value from index_reg */
		/* temp.x += relative_index * 16 */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 16,
				   addr_reg, 0,
				   temp_reg, 0);
		if (r)
			return r;

		param = r600_get_lds_unique_index(name[first],
						  index[first]);

	} else {
		param = r600_get_lds_unique_index(name[reg.Register.Index],
						  index[reg.Register.Index]);
	}

	/* add to base_addr - passed in temp_reg.x */
	if (param) {
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;

	}
	return 0;
}

/* Read the channels selected by 'mask' from LDS at the address held in
 * temp_reg.x into dst_reg: per-channel addresses are built in temp.yzw,
 * then one LDS_READ_RET is issued per channel followed by reads of the
 * LDS output queue. */
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask)
{
	struct r600_bytecode_alu alu;
	int r, i, lasti;

	/* start a new CF if the current ALU clause is close to full */
	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
		ctx->bc->force_add_cf = 1;

	lasti = tgsi_last_instruction(mask);
	/* channel 0 reuses the base address already in temp.x */
	for (i = 1; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* emit an LDS_READ_RET */
		memset(&alu, 0, sizeof(alu));
		alu.op = LDS_OP1_LDS_READ_RET;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		alu.src[1].sel = V_SQ_ALU_SRC_0;
		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.is_lds_idx_op = true;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* then read from
 LDS_OQ_A_POP */
		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
		alu.src[0].chan = 0;
		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Return a bitmask of the channels referenced by the register's swizzle
 * (bit i set when channel i is read by any of the four components). */
static int fetch_mask(struct tgsi_src_register *reg)
{
	int mask = 0;
	mask |= 1 << reg->SwizzleX;
	mask |= 1 << reg->SwizzleY;
	mask |= 1 << reg->SwizzleZ;
	mask |= 1 << reg->SwizzleW;
	return mask;
}

/* TES: fetch an input (a TCS output) from LDS into dst_reg. */
static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	int r;
	unsigned temp_reg = r600_get_temp(ctx);

	r = get_lds_offset0(ctx, 2, temp_reg,
			    src->Register.Dimension ? false : true);
	if (r)
		return r;

	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  NULL, src, ctx->tess_output_info, 1);
	if (r)
		return r;

	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
	if (r)
		return r;
	return 0;
}

/* TCS: fetch an input (an LS output) from LDS into dst_reg. */
static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	int r;
	unsigned temp_reg = r600_get_temp(ctx);

	/* t.x = ips * r0.y */
	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
			   temp_reg, 0,
			   ctx->tess_input_info, 0,
			   0, 1);

	if (r)
		return r;

	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  NULL, src, ctx->tess_input_info, 1);
	if (r)
		return r;

	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
	if (r)
		return r;
	return 0;
}

/* TCS: read back one of this shader's own outputs from LDS into dst_reg. */
static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
2163 int r; 2164 unsigned temp_reg = r600_get_temp(ctx); 2165 2166 r = get_lds_offset0(ctx, 1, temp_reg, 2167 src->Register.Dimension ? false : true); 2168 if (r) 2169 return r; 2170 /* the base address is now in temp.x */ 2171 r = r600_get_byte_address(ctx, temp_reg, 2172 NULL, src, 2173 ctx->tess_output_info, 1); 2174 if (r) 2175 return r; 2176 2177 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register)); 2178 if (r) 2179 return r; 2180 return 0; 2181} 2182 2183static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx) 2184{ 2185 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 2186 unsigned i; 2187 2188 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 2189 struct tgsi_full_src_register *src = &inst->Src[i]; 2190 2191 if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) { 2192 int treg = r600_get_temp(ctx); 2193 fetch_tes_input(ctx, src, treg); 2194 ctx->src[i].sel = treg; 2195 ctx->src[i].rel = 0; 2196 } 2197 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) { 2198 int treg = r600_get_temp(ctx); 2199 fetch_tcs_input(ctx, src, treg); 2200 ctx->src[i].sel = treg; 2201 ctx->src[i].rel = 0; 2202 } 2203 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) { 2204 int treg = r600_get_temp(ctx); 2205 fetch_tcs_output(ctx, src, treg); 2206 ctx->src[i].sel = treg; 2207 ctx->src[i].rel = 0; 2208 } 2209 } 2210 return 0; 2211} 2212 2213static int tgsi_split_constant(struct r600_shader_ctx *ctx) 2214{ 2215 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 2216 struct r600_bytecode_alu alu; 2217 int i, j, k, nconst, r; 2218 2219 for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) { 2220 if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) { 2221 nconst++; 2222 } 2223 tgsi_src(ctx, &inst->Src[i], &ctx->src[i]); 2224 } 2225 for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) { 2226 if 
(inst->Src[i].Register.File != TGSI_FILE_CONSTANT) { 2227 continue; 2228 } 2229 2230 if (ctx->src[i].rel) { 2231 int chan = inst->Src[i].Indirect.Swizzle; 2232 int treg = r600_get_temp(ctx); 2233 if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg))) 2234 return r; 2235 2236 ctx->src[i].kc_bank = 0; 2237 ctx->src[i].kc_rel = 0; 2238 ctx->src[i].sel = treg; 2239 ctx->src[i].rel = 0; 2240 j--; 2241 } else if (j > 0) { 2242 int treg = r600_get_temp(ctx); 2243 for (k = 0; k < 4; k++) { 2244 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2245 alu.op = ALU_OP1_MOV; 2246 alu.src[0].sel = ctx->src[i].sel; 2247 alu.src[0].chan = k; 2248 alu.src[0].rel = ctx->src[i].rel; 2249 alu.src[0].kc_bank = ctx->src[i].kc_bank; 2250 alu.src[0].kc_rel = ctx->src[i].kc_rel; 2251 alu.dst.sel = treg; 2252 alu.dst.chan = k; 2253 alu.dst.write = 1; 2254 if (k == 3) 2255 alu.last = 1; 2256 r = r600_bytecode_add_alu(ctx->bc, &alu); 2257 if (r) 2258 return r; 2259 } 2260 ctx->src[i].sel = treg; 2261 ctx->src[i].rel =0; 2262 j--; 2263 } 2264 } 2265 return 0; 2266} 2267 2268/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */ 2269static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx) 2270{ 2271 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 2272 struct r600_bytecode_alu alu; 2273 int i, j, k, nliteral, r; 2274 2275 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) { 2276 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 2277 nliteral++; 2278 } 2279 } 2280 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) { 2281 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 2282 int treg = r600_get_temp(ctx); 2283 for (k = 0; k < 4; k++) { 2284 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2285 alu.op = ALU_OP1_MOV; 2286 alu.src[0].sel = ctx->src[i].sel; 2287 alu.src[0].chan = k; 2288 alu.src[0].value = 
ctx->src[i].value[k]; 2289 alu.dst.sel = treg; 2290 alu.dst.chan = k; 2291 alu.dst.write = 1; 2292 if (k == 3) 2293 alu.last = 1; 2294 r = r600_bytecode_add_alu(ctx->bc, &alu); 2295 if (r) 2296 return r; 2297 } 2298 ctx->src[i].sel = treg; 2299 j--; 2300 } 2301 } 2302 return 0; 2303} 2304 2305static int process_twoside_color_inputs(struct r600_shader_ctx *ctx) 2306{ 2307 int i, r, count = ctx->shader->ninput; 2308 2309 for (i = 0; i < count; i++) { 2310 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) { 2311 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input); 2312 if (r) 2313 return r; 2314 } 2315 } 2316 return 0; 2317} 2318 2319static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so, 2320 int stream, unsigned *stream_item_size UNUSED) 2321{ 2322 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS]; 2323 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS]; 2324 int j, r; 2325 unsigned i; 2326 2327 /* Sanity checking. */ 2328 if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) { 2329 R600_ERR("Too many stream outputs: %d\n", so->num_outputs); 2330 r = -EINVAL; 2331 goto out_err; 2332 } 2333 for (i = 0; i < so->num_outputs; i++) { 2334 if (so->output[i].output_buffer >= 4) { 2335 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n", 2336 so->output[i].output_buffer); 2337 r = -EINVAL; 2338 goto out_err; 2339 } 2340 } 2341 2342 /* Initialize locations where the outputs are stored. */ 2343 for (i = 0; i < so->num_outputs; i++) { 2344 2345 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr; 2346 start_comp[i] = so->output[i].start_component; 2347 /* Lower outputs with dst_offset < start_component. 2348 * 2349 * We can only output 4D vectors with a write mask, e.g. we can 2350 * only output the W component at offset 3, etc. If we want 2351 * to store Y, Z, or W at buffer offset 0, we need to use MOV 2352 * to move it to X and output X. 
*/ 2353 if (so->output[i].dst_offset < so->output[i].start_component) { 2354 unsigned tmp = r600_get_temp(ctx); 2355 2356 for (j = 0; j < so->output[i].num_components; j++) { 2357 struct r600_bytecode_alu alu; 2358 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2359 alu.op = ALU_OP1_MOV; 2360 alu.src[0].sel = so_gpr[i]; 2361 alu.src[0].chan = so->output[i].start_component + j; 2362 2363 alu.dst.sel = tmp; 2364 alu.dst.chan = j; 2365 alu.dst.write = 1; 2366 if (j == so->output[i].num_components - 1) 2367 alu.last = 1; 2368 r = r600_bytecode_add_alu(ctx->bc, &alu); 2369 if (r) 2370 return r; 2371 } 2372 start_comp[i] = 0; 2373 so_gpr[i] = tmp; 2374 } 2375 } 2376 2377 /* Write outputs to buffers. */ 2378 for (i = 0; i < so->num_outputs; i++) { 2379 struct r600_bytecode_output output; 2380 2381 if (stream != -1 && stream != so->output[i].stream) 2382 continue; 2383 2384 memset(&output, 0, sizeof(struct r600_bytecode_output)); 2385 output.gpr = so_gpr[i]; 2386 output.elem_size = so->output[i].num_components - 1; 2387 if (output.elem_size == 2) 2388 output.elem_size = 3; // 3 not supported, write 4 with junk at end 2389 output.array_base = so->output[i].dst_offset - start_comp[i]; 2390 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; 2391 output.burst_count = 1; 2392 /* array_size is an upper limit for the burst_count 2393 * with MEM_STREAM instructions */ 2394 output.array_size = 0xFFF; 2395 output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i]; 2396 2397 if (ctx->bc->chip_class >= EVERGREEN) { 2398 switch (so->output[i].output_buffer) { 2399 case 0: 2400 output.op = CF_OP_MEM_STREAM0_BUF0; 2401 break; 2402 case 1: 2403 output.op = CF_OP_MEM_STREAM0_BUF1; 2404 break; 2405 case 2: 2406 output.op = CF_OP_MEM_STREAM0_BUF2; 2407 break; 2408 case 3: 2409 output.op = CF_OP_MEM_STREAM0_BUF3; 2410 break; 2411 } 2412 output.op += so->output[i].stream * 4; 2413 assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3); 
2414 /* tail of emit_streamout (Evergreen path): record which stream buffer/stream pair is used */ ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4; 2415 } else { 2416 /* pre-Evergreen: one MEM_STREAM op per buffer, streams not encoded */ switch (so->output[i].output_buffer) { 2417 case 0: 2418 output.op = CF_OP_MEM_STREAM0; 2419 break; 2420 case 1: 2421 output.op = CF_OP_MEM_STREAM1; 2422 break; 2423 case 2: 2424 output.op = CF_OP_MEM_STREAM2; 2425 break; 2426 case 3: 2427 output.op = CF_OP_MEM_STREAM3; 2428 break; 2429 } 2430 ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer; 2431 } 2432 r = r600_bytecode_add_output(ctx->bc, &output); 2433 if (r) 2434 goto out_err; 2435 } 2436 return 0; 2437out_err: 2438 return r; 2439} 2440 /* convert_edgeflag_to_int: if the VS writes an edge flag, clamp it to [0,1] in place (MOV with dst.clamp) and then convert float->int with FLT_TO_INT, reusing the same GPR. No-op when vs_out_edgeflag is unset. */ 2441static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx) 2442{ 2443 struct r600_bytecode_alu alu; 2444 unsigned reg; 2445 2446 if (!ctx->shader->vs_out_edgeflag) 2447 return; 2448 2449 reg = ctx->shader->output[ctx->edgeflag_output].gpr; 2450 2451 /* clamp(x, 0, 1) */ 2452 memset(&alu, 0, sizeof(alu)); 2453 alu.op = ALU_OP1_MOV; 2454 alu.src[0].sel = reg; 2455 alu.dst.sel = reg; 2456 alu.dst.write = 1; 2457 alu.dst.clamp = 1; 2458 alu.last = 1; 2459 r600_bytecode_add_alu(ctx->bc, &alu); 2460 2461 memset(&alu, 0, sizeof(alu)); 2462 alu.op = ALU_OP1_FLT_TO_INT; 2463 alu.src[0].sel = reg; 2464 alu.dst.sel = reg; 2465 alu.dst.write = 1; 2466 alu.last = 1; 2467 r600_bytecode_add_alu(ctx->bc, &alu); 2468} 2469 /* generate_gs_copy_shader: build a VERTEX-type "copy" shader that fetches each vertex back from the GSVS ring buffer and re-exports it (positions/params/clip dists), with per-stream predicated sections and streamout. Body continues over the following chunk lines. */ 2470static int generate_gs_copy_shader(struct r600_context *rctx, 2471 struct r600_pipe_shader *gs, 2472 struct pipe_stream_output_info *so) 2473{ 2474 struct r600_shader_ctx ctx = {}; 2475 struct r600_shader *gs_shader = &gs->shader; 2476 struct r600_pipe_shader *cshader; 2477 unsigned ocnt = gs_shader->noutput; 2478 struct r600_bytecode_alu alu; 2479 struct r600_bytecode_vtx vtx; 2480 struct r600_bytecode_output output; 2481 struct r600_bytecode_cf *cf_jump, *cf_pop, 2482 *last_exp_pos = NULL, *last_exp_param = NULL; 2483 int next_clip_pos = 61, next_param = 0; 2484 unsigned i, j; 2485 int ring; 2486 bool only_ring_0 = 
true; 2487 cshader = calloc(1, sizeof(struct r600_pipe_shader)); 2488 if (!cshader) 2489 return 0; 2490 2491 memcpy(cshader->shader.output, gs_shader->output, ocnt * 2492 sizeof(struct r600_shader_io)); 2493 2494 cshader->shader.noutput = ocnt; 2495 2496 ctx.shader = &cshader->shader; 2497 ctx.bc = &ctx.shader->bc; 2498 ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX; 2499 2500 r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family, 2501 rctx->screen->has_compressed_msaa_texturing); 2502 2503 ctx.bc->isa = rctx->isa; 2504 2505 cf_jump = NULL; 2506 memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes)); 2507 2508 /* R0.x = R0.x & 0x3fffffff */ 2509 memset(&alu, 0, sizeof(alu)); 2510 alu.op = ALU_OP2_AND_INT; 2511 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2512 alu.src[1].value = 0x3fffffff; 2513 alu.dst.write = 1; 2514 r600_bytecode_add_alu(ctx.bc, &alu); 2515 2516 /* R0.y = R0.x >> 30 */ 2517 memset(&alu, 0, sizeof(alu)); 2518 alu.op = ALU_OP2_LSHR_INT; 2519 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2520 alu.src[1].value = 0x1e; 2521 alu.dst.chan = 1; 2522 alu.dst.write = 1; 2523 alu.last = 1; 2524 r600_bytecode_add_alu(ctx.bc, &alu); 2525 2526 /* fetch vertex data from GSVS ring */ 2527 for (i = 0; i < ocnt; ++i) { 2528 struct r600_shader_io *out = &ctx.shader->output[i]; 2529 2530 out->gpr = i + 1; 2531 out->ring_offset = i * 16; 2532 2533 memset(&vtx, 0, sizeof(vtx)); 2534 vtx.op = FETCH_OP_VFETCH; 2535 vtx.buffer_id = R600_GS_RING_CONST_BUFFER; 2536 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 2537 vtx.mega_fetch_count = 16; 2538 vtx.offset = out->ring_offset; 2539 vtx.dst_gpr = out->gpr; 2540 vtx.src_gpr = 0; 2541 vtx.dst_sel_x = 0; 2542 vtx.dst_sel_y = 1; 2543 vtx.dst_sel_z = 2; 2544 vtx.dst_sel_w = 3; 2545 if (rctx->b.chip_class >= EVERGREEN) { 2546 vtx.use_const_fields = 1; 2547 } else { 2548 vtx.data_format = FMT_32_32_32_32_FLOAT; 2549 } 2550 2551 r600_bytecode_add_vtx(ctx.bc, &vtx); 2552 } 2553 ctx.temp_reg = i + 1; 2554 for 
(ring = 3; ring >= 0; --ring) { 2555 bool enabled = false; 2556 for (i = 0; i < so->num_outputs; i++) { 2557 if (so->output[i].stream == ring) { 2558 enabled = true; 2559 if (ring > 0) 2560 only_ring_0 = false; 2561 break; 2562 } 2563 } 2564 if (ring != 0 && !enabled) { 2565 cshader->shader.ring_item_sizes[ring] = 0; 2566 continue; 2567 } 2568 2569 if (cf_jump) { 2570 // Patch up jump label 2571 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP); 2572 cf_pop = ctx.bc->cf_last; 2573 2574 cf_jump->cf_addr = cf_pop->id + 2; 2575 cf_jump->pop_count = 1; 2576 cf_pop->cf_addr = cf_pop->id + 2; 2577 cf_pop->pop_count = 1; 2578 } 2579 2580 /* PRED_SETE_INT __, R0.y, ring */ 2581 memset(&alu, 0, sizeof(alu)); 2582 alu.op = ALU_OP2_PRED_SETE_INT; 2583 alu.src[0].chan = 1; 2584 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2585 alu.src[1].value = ring; 2586 alu.execute_mask = 1; 2587 alu.update_pred = 1; 2588 alu.last = 1; 2589 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE); 2590 2591 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP); 2592 cf_jump = ctx.bc->cf_last; 2593 2594 if (enabled) 2595 emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]); 2596 cshader->shader.ring_item_sizes[ring] = ocnt * 16; 2597 } 2598 2599 /* bc adds nops - copy it */ 2600 if (ctx.bc->chip_class == R600) { 2601 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2602 alu.op = ALU_OP0_NOP; 2603 alu.last = 1; 2604 r600_bytecode_add_alu(ctx.bc, &alu); 2605 2606 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); 2607 } 2608 2609 /* export vertex data */ 2610 /* XXX factor out common code with r600_shader_from_tgsi ? 
*/ 2611 for (i = 0; i < ocnt; ++i) { 2612 struct r600_shader_io *out = &ctx.shader->output[i]; 2613 bool instream0 = true; 2614 if (out->name == TGSI_SEMANTIC_CLIPVERTEX) 2615 continue; 2616 2617 for (j = 0; j < so->num_outputs; j++) { 2618 if (so->output[j].register_index == i) { 2619 if (so->output[j].stream == 0) 2620 break; 2621 if (so->output[j].stream > 0) 2622 instream0 = false; 2623 } 2624 } 2625 if (!instream0) 2626 continue; 2627 memset(&output, 0, sizeof(output)); 2628 output.gpr = out->gpr; 2629 output.elem_size = 3; 2630 output.swizzle_x = 0; 2631 output.swizzle_y = 1; 2632 output.swizzle_z = 2; 2633 output.swizzle_w = 3; 2634 output.burst_count = 1; 2635 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 2636 output.op = CF_OP_EXPORT; 2637 switch (out->name) { 2638 case TGSI_SEMANTIC_POSITION: 2639 output.array_base = 60; 2640 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 2641 break; 2642 2643 case TGSI_SEMANTIC_PSIZE: 2644 output.array_base = 61; 2645 if (next_clip_pos == 61) 2646 next_clip_pos = 62; 2647 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 2648 output.swizzle_y = 7; 2649 output.swizzle_z = 7; 2650 output.swizzle_w = 7; 2651 ctx.shader->vs_out_misc_write = 1; 2652 ctx.shader->vs_out_point_size = 1; 2653 break; 2654 case TGSI_SEMANTIC_LAYER: 2655 if (out->spi_sid) { 2656 /* duplicate it as PARAM to pass to the pixel shader */ 2657 output.array_base = next_param++; 2658 r600_bytecode_add_output(ctx.bc, &output); 2659 last_exp_param = ctx.bc->cf_last; 2660 } 2661 output.array_base = 61; 2662 if (next_clip_pos == 61) 2663 next_clip_pos = 62; 2664 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 2665 output.swizzle_x = 7; 2666 output.swizzle_y = 7; 2667 output.swizzle_z = 0; 2668 output.swizzle_w = 7; 2669 ctx.shader->vs_out_misc_write = 1; 2670 ctx.shader->vs_out_layer = 1; 2671 break; 2672 case TGSI_SEMANTIC_VIEWPORT_INDEX: 2673 if (out->spi_sid) { 2674 /* duplicate it as PARAM to pass to the pixel shader */ 
2675 output.array_base = next_param++; 2676 r600_bytecode_add_output(ctx.bc, &output); 2677 last_exp_param = ctx.bc->cf_last; 2678 } 2679 output.array_base = 61; 2680 if (next_clip_pos == 61) 2681 next_clip_pos = 62; 2682 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 2683 ctx.shader->vs_out_misc_write = 1; 2684 ctx.shader->vs_out_viewport = 1; 2685 output.swizzle_x = 7; 2686 output.swizzle_y = 7; 2687 output.swizzle_z = 7; 2688 output.swizzle_w = 0; 2689 break; 2690 case TGSI_SEMANTIC_CLIPDIST: 2691 /* spi_sid is 0 for clipdistance outputs that were generated 2692 * for clipvertex - we don't need to pass them to PS */ 2693 ctx.shader->clip_dist_write = gs->shader.clip_dist_write; 2694 ctx.shader->cull_dist_write = gs->shader.cull_dist_write; 2695 ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask; 2696 if (out->spi_sid) { 2697 /* duplicate it as PARAM to pass to the pixel shader */ 2698 output.array_base = next_param++; 2699 r600_bytecode_add_output(ctx.bc, &output); 2700 last_exp_param = ctx.bc->cf_last; 2701 } 2702 output.array_base = next_clip_pos++; 2703 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 2704 break; 2705 case TGSI_SEMANTIC_FOG: 2706 output.swizzle_y = 4; /* 0 */ 2707 output.swizzle_z = 4; /* 0 */ 2708 output.swizzle_w = 5; /* 1 */ 2709 break; 2710 default: 2711 output.array_base = next_param++; 2712 break; 2713 } 2714 r600_bytecode_add_output(ctx.bc, &output); 2715 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM) 2716 last_exp_param = ctx.bc->cf_last; 2717 else 2718 last_exp_pos = ctx.bc->cf_last; 2719 } 2720 2721 if (!last_exp_pos) { 2722 memset(&output, 0, sizeof(output)); 2723 output.gpr = 0; 2724 output.elem_size = 3; 2725 output.swizzle_x = 7; 2726 output.swizzle_y = 7; 2727 output.swizzle_z = 7; 2728 output.swizzle_w = 7; 2729 output.burst_count = 1; 2730 output.type = 2; 2731 output.op = CF_OP_EXPORT; 2732 output.array_base = 60; 2733 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 2734 
r600_bytecode_add_output(ctx.bc, &output); 2735 last_exp_pos = ctx.bc->cf_last; 2736 } 2737 /* guarantee at least one PARAM export exists: emit a dummy export (all swizzles masked, 7) so the EXPORT_DONE patching below always has a target */ 2738 if (!last_exp_param) { 2739 memset(&output, 0, sizeof(output)); 2740 output.gpr = 0; 2741 output.elem_size = 3; 2742 output.swizzle_x = 7; 2743 output.swizzle_y = 7; 2744 output.swizzle_z = 7; 2745 output.swizzle_w = 7; 2746 output.burst_count = 1; 2747 output.type = 2; 2748 output.op = CF_OP_EXPORT; 2749 output.array_base = next_param++; 2750 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 2751 r600_bytecode_add_output(ctx.bc, &output); 2752 last_exp_param = ctx.bc->cf_last; 2753 } 2754 /* mark the final POS and PARAM exports as EXPORT_DONE, then close the last predicated ring section with a POP and patch the pending JUMP */ 2755 last_exp_pos->op = CF_OP_EXPORT_DONE; 2756 last_exp_param->op = CF_OP_EXPORT_DONE; 2757 2758 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP); 2759 cf_pop = ctx.bc->cf_last; 2760 2761 cf_jump->cf_addr = cf_pop->id + 2; 2762 cf_jump->pop_count = 1; 2763 cf_pop->cf_addr = cf_pop->id + 2; 2764 cf_pop->pop_count = 1; 2765 2766 if (ctx.bc->chip_class == CAYMAN) 2767 cm_bytecode_add_cf_end(ctx.bc); 2768 else { 2769 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); 2770 ctx.bc->cf_last->end_of_program = 1; 2771 } 2772 2773 gs->gs_copy_shader = cshader; 2774 cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask; 2775 2776 ctx.bc->nstack = 1; 2777 2778 return r600_bytecode_build(ctx.bc); 2779} 2780 /* emit_inc_ring_offset: for indirect GS ring writes only (ind == true), advance the per-stream export offset register gs_export_gpr_tregs[idx] by one vertex, i.e. add gs_out_ring_offset >> 4 (ring offset in 16-byte units). Returns 0 or a bytecode error; no-op when ind is false. */ 2781static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind) 2782{ 2783 if (ind) { 2784 struct r600_bytecode_alu alu; 2785 int r; 2786 2787 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2788 alu.op = ALU_OP2_ADD_INT; 2789 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx]; 2790 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2791 alu.src[1].value = ctx->gs_out_ring_offset >> 4; 2792 alu.dst.sel = ctx->gs_export_gpr_tregs[idx]; 2793 alu.dst.write = 1; 2794 alu.last = 1; 2795 r = r600_bytecode_add_alu(ctx->bc, &alu); 2796 if (r) 2797 return r; 2798 } 2799 return 0; 2800} 2801 /* emit_gs_ring_writes: write the current vertex's outputs to the GSVS ring (signature continues on the next chunk line). */ 2802static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct
pipe_stream_output_info *so UNUSED, int stream, bool ind) 2803{ 2804 struct r600_bytecode_output output; 2805 int ring_offset; 2806 unsigned i, k; 2807 int effective_stream = stream == -1 ? 0 : stream; 2808 int idx = 0; 2809 2810 for (i = 0; i < ctx->shader->noutput; i++) { 2811 if (ctx->gs_for_vs) { 2812 /* for ES we need to lookup corresponding ring offset expected by GS 2813 * (map this output to GS input by name and sid) */ 2814 /* FIXME precompute offsets */ 2815 ring_offset = -1; 2816 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) { 2817 struct r600_shader_io *in = &ctx->gs_for_vs->input[k]; 2818 struct r600_shader_io *out = &ctx->shader->output[i]; 2819 if (in->name == out->name && in->sid == out->sid) 2820 ring_offset = in->ring_offset; 2821 } 2822 2823 if (ring_offset == -1) 2824 continue; 2825 } else { 2826 ring_offset = idx * 16; 2827 idx++; 2828 } 2829 2830 if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION) 2831 continue; 2832 /* next_ring_offset after parsing input decls contains total size of 2833 * single vertex data, gs_next_vertex - current vertex index */ 2834 if (!ind) 2835 ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex; 2836 2837 memset(&output, 0, sizeof(struct r600_bytecode_output)); 2838 output.gpr = ctx->shader->output[i].gpr; 2839 output.elem_size = 3; 2840 output.comp_mask = 0xF; 2841 output.burst_count = 1; 2842 2843 if (ind) 2844 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; 2845 else 2846 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; 2847 2848 switch (stream) { 2849 default: 2850 case 0: 2851 output.op = CF_OP_MEM_RING; break; 2852 case 1: 2853 output.op = CF_OP_MEM_RING1; break; 2854 case 2: 2855 output.op = CF_OP_MEM_RING2; break; 2856 case 3: 2857 output.op = CF_OP_MEM_RING3; break; 2858 } 2859 2860 if (ind) { 2861 output.array_base = ring_offset >> 2; /* in dwords */ 2862 output.array_size = 0xfff; 2863 output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream]; 2864 } 
else 2865 output.array_base = ring_offset >> 2; /* in dwords */ 2866 r600_bytecode_add_output(ctx->bc, &output); 2867 } 2868 2869 ++ctx->gs_next_vertex; 2870 return 0; 2871} 2872 2873 2874static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx) 2875{ 2876 int r; 2877 struct r600_bytecode_vtx vtx; 2878 int temp_val = ctx->temp_reg; 2879 /* need to store the TCS output somewhere */ 2880 r = single_alu_op2(ctx, ALU_OP1_MOV, 2881 temp_val, 0, 2882 V_SQ_ALU_SRC_LITERAL, 0, 2883 0, 0); 2884 if (r) 2885 return r; 2886 2887 /* used by VS/TCS */ 2888 if (ctx->tess_input_info) { 2889 /* fetch tcs input values into resv space */ 2890 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 2891 vtx.op = FETCH_OP_VFETCH; 2892 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER; 2893 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 2894 vtx.mega_fetch_count = 16; 2895 vtx.data_format = FMT_32_32_32_32; 2896 vtx.num_format_all = 2; 2897 vtx.format_comp_all = 1; 2898 vtx.use_const_fields = 0; 2899 vtx.endian = r600_endian_swap(32); 2900 vtx.srf_mode_all = 1; 2901 vtx.offset = 0; 2902 vtx.dst_gpr = ctx->tess_input_info; 2903 vtx.dst_sel_x = 0; 2904 vtx.dst_sel_y = 1; 2905 vtx.dst_sel_z = 2; 2906 vtx.dst_sel_w = 3; 2907 vtx.src_gpr = temp_val; 2908 vtx.src_sel_x = 0; 2909 2910 r = r600_bytecode_add_vtx(ctx->bc, &vtx); 2911 if (r) 2912 return r; 2913 } 2914 2915 /* used by TCS/TES */ 2916 if (ctx->tess_output_info) { 2917 /* fetch tcs output values into resv space */ 2918 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 2919 vtx.op = FETCH_OP_VFETCH; 2920 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER; 2921 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 2922 vtx.mega_fetch_count = 16; 2923 vtx.data_format = FMT_32_32_32_32; 2924 vtx.num_format_all = 2; 2925 vtx.format_comp_all = 1; 2926 vtx.use_const_fields = 0; 2927 vtx.endian = r600_endian_swap(32); 2928 vtx.srf_mode_all = 1; 2929 vtx.offset = 16; 2930 vtx.dst_gpr = ctx->tess_output_info; 2931 vtx.dst_sel_x = 0; 2932 vtx.dst_sel_y = 1; 2933 
vtx.dst_sel_z = 2; 2934 vtx.dst_sel_w = 3; 2935 vtx.src_gpr = temp_val; 2936 vtx.src_sel_x = 0; 2937 /* tail of r600_fetch_tess_io_info: VFETCH of the TCS-output layout vec4 (offset 16 in R600_LDS_INFO_CONST_BUFFER) into tess_output_info */ 2938 r = r600_bytecode_add_vtx(ctx->bc, &vtx); 2939 if (r) 2940 return r; 2941 } 2942 return 0; 2943} 2944 /* emit_lds_vs_writes: LS epilogue for VS-as-LS. Fetches the tess IO layout constants (tess_input_info; tess_output_info deliberately zeroed), computes this vertex's LDS base as vertex_dw_stride (tess_input_info.y) * vertex id (r0.y, presumably rel id — TODO confirm), then stores every VS output vec4 to LDS as two 2-dword LDS_WRITE_REL ops (channels xy at offset, zw at offset+8). Returns 0 or a bytecode error. */ 2945static int emit_lds_vs_writes(struct r600_shader_ctx *ctx) 2946{ 2947 int j, r; 2948 int temp_reg; 2949 unsigned i; 2950 2951 /* fetch tcs input values into input_vals */ 2952 ctx->tess_input_info = r600_get_temp(ctx); 2953 ctx->tess_output_info = 0; 2954 r = r600_fetch_tess_io_info(ctx); 2955 if (r) 2956 return r; 2957 2958 temp_reg = r600_get_temp(ctx); 2959 /* dst reg contains LDS address stride * idx */ 2960 /* MUL vertexID, vertex_dw_stride */ 2961 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24, 2962 temp_reg, 0, 2963 ctx->tess_input_info, 1, 2964 0, 1); /* rel id in r0.y? */ 2965 if (r) 2966 return r; 2967 2968 for (i = 0; i < ctx->shader->noutput; i++) { 2969 struct r600_bytecode_alu alu; 2970 int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid); 2971 /* temp.y = base + param*16 (byte offset of this output); temp.z = that + 8 for the zw half */ 2972 if (param) { 2973 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 2974 temp_reg, 1, 2975 temp_reg, 0, 2976 V_SQ_ALU_SRC_LITERAL, param * 16); 2977 if (r) 2978 return r; 2979 } 2980 2981 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 2982 temp_reg, 2, 2983 temp_reg, param ? 1 : 0, 2984 V_SQ_ALU_SRC_LITERAL, 8); 2985 if (r) 2986 return r; 2987 2988 2989 for (j = 0; j < 2; j++) { 2990 int chan = (j == 1) ? 2 : (param ?
1 : 0); 2991 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2992 alu.op = LDS_OP3_LDS_WRITE_REL; 2993 alu.src[0].sel = temp_reg; 2994 alu.src[0].chan = chan; 2995 alu.src[1].sel = ctx->shader->output[i].gpr; 2996 alu.src[1].chan = j * 2; 2997 alu.src[2].sel = ctx->shader->output[i].gpr; 2998 alu.src[2].chan = (j * 2) + 1; 2999 alu.last = 1; 3000 alu.dst.chan = 0; 3001 alu.lds_idx = 1; 3002 alu.is_lds_idx_op = true; 3003 r = r600_bytecode_add_alu(ctx->bc, &alu); 3004 if (r) 3005 return r; 3006 } 3007 } 3008 return 0; 3009} 3010 3011static int r600_store_tcs_output(struct r600_shader_ctx *ctx) 3012{ 3013 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 3014 const struct tgsi_full_dst_register *dst = &inst->Dst[0]; 3015 int i, r, lasti; 3016 int temp_reg = r600_get_temp(ctx); 3017 struct r600_bytecode_alu alu; 3018 unsigned write_mask = dst->Register.WriteMask; 3019 3020 if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT) 3021 return 0; 3022 3023 r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? 
false : true); 3024 if (r) 3025 return r; 3026 3027 /* the base address is now in temp.x */ 3028 r = r600_get_byte_address(ctx, temp_reg, 3029 &inst->Dst[0], NULL, ctx->tess_output_info, 1); 3030 if (r) 3031 return r; 3032 3033 /* LDS write */ 3034 lasti = tgsi_last_instruction(write_mask); 3035 for (i = 1; i <= lasti; i++) { 3036 3037 if (!(write_mask & (1 << i))) 3038 continue; 3039 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 3040 temp_reg, i, 3041 temp_reg, 0, 3042 V_SQ_ALU_SRC_LITERAL, 4 * i); 3043 if (r) 3044 return r; 3045 } 3046 3047 for (i = 0; i <= lasti; i++) { 3048 if (!(write_mask & (1 << i))) 3049 continue; 3050 3051 if ((i == 0 && ((write_mask & 3) == 3)) || 3052 (i == 2 && ((write_mask & 0xc) == 0xc))) { 3053 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3054 alu.op = LDS_OP3_LDS_WRITE_REL; 3055 alu.src[0].sel = temp_reg; 3056 alu.src[0].chan = i; 3057 3058 alu.src[1].sel = dst->Register.Index; 3059 alu.src[1].sel += ctx->file_offset[dst->Register.File]; 3060 alu.src[1].chan = i; 3061 3062 alu.src[2].sel = dst->Register.Index; 3063 alu.src[2].sel += ctx->file_offset[dst->Register.File]; 3064 alu.src[2].chan = i + 1; 3065 alu.lds_idx = 1; 3066 alu.dst.chan = 0; 3067 alu.last = 1; 3068 alu.is_lds_idx_op = true; 3069 r = r600_bytecode_add_alu(ctx->bc, &alu); 3070 if (r) 3071 return r; 3072 i += 1; 3073 continue; 3074 } 3075 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3076 alu.op = LDS_OP2_LDS_WRITE; 3077 alu.src[0].sel = temp_reg; 3078 alu.src[0].chan = i; 3079 3080 alu.src[1].sel = dst->Register.Index; 3081 alu.src[1].sel += ctx->file_offset[dst->Register.File]; 3082 alu.src[1].chan = i; 3083 3084 alu.src[2].sel = V_SQ_ALU_SRC_0; 3085 alu.dst.chan = 0; 3086 alu.last = 1; 3087 alu.is_lds_idx_op = true; 3088 r = r600_bytecode_add_alu(ctx->bc, &alu); 3089 if (r) 3090 return r; 3091 } 3092 return 0; 3093} 3094 3095static int r600_tess_factor_read(struct r600_shader_ctx *ctx, 3096 int output_idx, int nc) 3097{ 3098 int param; 3099 unsigned temp_reg 
= r600_get_temp(ctx); 3100 unsigned name = ctx->shader->output[output_idx].name; 3101 int dreg = ctx->shader->output[output_idx].gpr; 3102 int r; 3103 /* tail of r600_tess_factor_read: LDS base (heap 1, patch-constant) + param*16, then fetch nc channels into the output's GPR */ 3104 param = r600_get_lds_unique_index(name, 0); 3105 r = get_lds_offset0(ctx, 1, temp_reg, true); 3106 if (r) 3107 return r; 3108 3109 if (param) { 3110 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 3111 temp_reg, 0, 3112 temp_reg, 0, 3113 V_SQ_ALU_SRC_LITERAL, param * 16); 3114 if (r) 3115 return r; 3116 } 3117 3118 do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1)); 3119 return 0; 3120} 3121 /* r600_emit_tess_factor: TCS epilogue. Under a PRED_SETE on r0.z so only one invocation runs, read TESSOUTER/TESSINNER back from LDS and store them to the tess-factor buffer with GDS TF_WRITE ops (address = tf_base + rel_patch_id * stride). NOTE(review): the early "return -1" error paths below exit after the ALU_PUSH_BEFORE/JUMP were emitted but before cf_jump is patched — looks like it leaves the CF stream unbalanced on the (should-be-impossible) error paths; confirm against callers. */ 3122static int r600_emit_tess_factor(struct r600_shader_ctx *ctx) 3123{ 3124 int stride, outer_comps, inner_comps; 3125 int tessinner_idx = -1, tessouter_idx = -1; 3126 int i, r; 3127 unsigned j; 3128 int temp_reg = r600_get_temp(ctx); 3129 int treg[3] = {-1, -1, -1}; 3130 struct r600_bytecode_alu alu; 3131 struct r600_bytecode_cf *cf_jump, *cf_pop; 3132 3133 /* only execute factor emission for invocation 0 */ 3134 /* PRED_SETE_INT __, R0.x, 0 */ 3135 memset(&alu, 0, sizeof(alu)); 3136 alu.op = ALU_OP2_PRED_SETE_INT; 3137 alu.src[0].chan = 2; 3138 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 3139 alu.execute_mask = 1; 3140 alu.update_pred = 1; 3141 alu.last = 1; 3142 r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE); 3143 3144 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP); 3145 cf_jump = ctx->bc->cf_last; 3146 3147 treg[0] = r600_get_temp(ctx); 3148 switch (ctx->shader->tcs_prim_mode) { 3149 case PIPE_PRIM_LINES: 3150 stride = 8; /* 2 dwords, 1 vec2 store */ 3151 outer_comps = 2; 3152 inner_comps = 0; 3153 break; 3154 case PIPE_PRIM_TRIANGLES: 3155 stride = 16; /* 4 dwords, 1 vec4 store */ 3156 outer_comps = 3; 3157 inner_comps = 1; 3158 treg[1] = r600_get_temp(ctx); 3159 break; 3160 case PIPE_PRIM_QUADS: 3161 stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */ 3162 outer_comps = 4; 3163 inner_comps = 2; 3164 treg[1] = r600_get_temp(ctx); 3165 treg[2] = r600_get_temp(ctx); 3166 break; 3167 default: 3168 assert(0);
3169 return -1; 3170 } 3171 3172 /* R0 is InvocationID, RelPatchID, PatchID, tf_base */ 3173 /* TF_WRITE takes index in R.x, value in R.y */ 3174 for (j = 0; j < ctx->shader->noutput; j++) { 3175 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER) 3176 tessinner_idx = j; 3177 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER) 3178 tessouter_idx = j; 3179 } 3180 3181 if (tessouter_idx == -1) 3182 return -1; 3183 3184 if (tessinner_idx == -1 && inner_comps) 3185 return -1; 3186 3187 if (tessouter_idx != -1) { 3188 r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps); 3189 if (r) 3190 return r; 3191 } 3192 3193 if (tessinner_idx != -1) { 3194 r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps); 3195 if (r) 3196 return r; 3197 } 3198 3199 /* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */ 3200 /* r.x = relpatchid(r0.y) * tf_stride */ 3201 3202 /* multiply incoming r0.y * stride - t.x = r0.y * stride */ 3203 /* add incoming r0.w to it: t.x = t.x + r0.w */ 3204 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, 3205 temp_reg, 0, 3206 0, 1, 3207 V_SQ_ALU_SRC_LITERAL, stride, 3208 0, 3); 3209 if (r) 3210 return r; 3211 3212 for (i = 0; i < outer_comps + inner_comps; i++) { 3213 int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx; 3214 int out_comp = i >= outer_comps ? 
i - outer_comps : i; 3215 3216 if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) { 3217 if (out_comp == 1) 3218 out_comp = 0; 3219 else if (out_comp == 0) 3220 out_comp = 1; 3221 } 3222 3223 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 3224 treg[i / 2], (2 * (i % 2)), 3225 temp_reg, 0, 3226 V_SQ_ALU_SRC_LITERAL, 4 * i); 3227 if (r) 3228 return r; 3229 r = single_alu_op2(ctx, ALU_OP1_MOV, 3230 treg[i / 2], 1 + (2 * (i%2)), 3231 ctx->shader->output[out_idx].gpr, out_comp, 3232 0, 0); 3233 if (r) 3234 return r; 3235 } 3236 for (i = 0; i < outer_comps + inner_comps; i++) { 3237 struct r600_bytecode_gds gds; 3238 3239 memset(&gds, 0, sizeof(struct r600_bytecode_gds)); 3240 gds.src_gpr = treg[i / 2]; 3241 gds.src_sel_x = 2 * (i % 2); 3242 gds.src_sel_y = 1 + (2 * (i % 2)); 3243 gds.src_sel_z = 4; 3244 gds.dst_sel_x = 7; 3245 gds.dst_sel_y = 7; 3246 gds.dst_sel_z = 7; 3247 gds.dst_sel_w = 7; 3248 gds.op = FETCH_OP_TF_WRITE; 3249 r = r600_bytecode_add_gds(ctx->bc, &gds); 3250 if (r) 3251 return r; 3252 } 3253 3254 // Patch up jump label 3255 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP); 3256 cf_pop = ctx->bc->cf_last; 3257 3258 cf_jump->cf_addr = cf_pop->id + 2; 3259 cf_jump->pop_count = 1; 3260 cf_pop->cf_addr = cf_pop->id + 2; 3261 cf_pop->pop_count = 1; 3262 3263 return 0; 3264} 3265 3266/* 3267 * We have to work out the thread ID for load and atomic 3268 * operations, which store the returned value to an index 3269 * in an intermediate buffer. 3270 * The index is calculated by taking the thread id, 3271 * calculated from the MBCNT instructions. 3272 * Then the shader engine ID is multiplied by 256, 3273 * and the wave id is added. 3274 * Then the result is multipled by 64 and thread id is 3275 * added. 
3276 */ 3277static int load_thread_id_gpr(struct r600_shader_ctx *ctx) 3278{ 3279 struct r600_bytecode_alu alu; 3280 int r; 3281 3282 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3283 alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT; 3284 alu.dst.sel = ctx->temp_reg; 3285 alu.dst.chan = 0; 3286 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 3287 alu.src[0].value = 0xffffffff; 3288 alu.dst.write = 1; 3289 r = r600_bytecode_add_alu(ctx->bc, &alu); 3290 if (r) 3291 return r; 3292 3293 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3294 alu.op = ALU_OP1_MBCNT_32HI_INT; 3295 alu.dst.sel = ctx->temp_reg; 3296 alu.dst.chan = 1; 3297 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 3298 alu.src[0].value = 0xffffffff; 3299 alu.dst.write = 1; 3300 r = r600_bytecode_add_alu(ctx->bc, &alu); 3301 if (r) 3302 return r; 3303 3304 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3305 alu.op = ALU_OP3_MULADD_UINT24; 3306 alu.dst.sel = ctx->temp_reg; 3307 alu.dst.chan = 2; 3308 alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID; 3309 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 3310 alu.src[1].value = 256; 3311 alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID; 3312 alu.dst.write = 1; 3313 alu.is_op3 = 1; 3314 alu.last = 1; 3315 r = r600_bytecode_add_alu(ctx->bc, &alu); 3316 if (r) 3317 return r; 3318 3319 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, 3320 ctx->thread_id_gpr, 1, 3321 ctx->temp_reg, 2, 3322 V_SQ_ALU_SRC_LITERAL, 0x40, 3323 ctx->temp_reg, 0); 3324 if (r) 3325 return r; 3326 return 0; 3327} 3328 3329static int r600_shader_from_tgsi(struct r600_context *rctx, 3330 struct r600_pipe_shader *pipeshader, 3331 union r600_shader_key key) 3332{ 3333 struct r600_screen *rscreen = rctx->screen; 3334 struct r600_shader *shader = &pipeshader->shader; 3335 struct tgsi_token *tokens = pipeshader->selector->tokens; 3336 struct pipe_stream_output_info so = pipeshader->selector->so; 3337 struct tgsi_full_immediate *immediate; 3338 struct r600_shader_ctx ctx; 3339 struct r600_bytecode_output 
output[ARRAY_SIZE(shader->output)]; 3340 unsigned output_done, noutput; 3341 unsigned opcode; 3342 int j, k, r = 0; 3343 unsigned i; 3344 int next_param_base = 0, next_clip_base; 3345 int max_color_exports = MAX2(key.ps.nr_cbufs, 1); 3346 bool indirect_gprs; 3347 bool ring_outputs = false; 3348 bool lds_outputs = false; 3349 bool lds_inputs = false; 3350 bool pos_emitted = false; 3351 3352 ctx.bc = &shader->bc; 3353 ctx.shader = shader; 3354 3355 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family, 3356 rscreen->has_compressed_msaa_texturing); 3357 ctx.tokens = tokens; 3358 tgsi_scan_shader(tokens, &ctx.info); 3359 shader->indirect_files = ctx.info.indirect_files; 3360 3361 int narrays = ctx.info.array_max[TGSI_FILE_TEMPORARY]; 3362 ctx.array_infos = calloc(narrays, sizeof(*ctx.array_infos)); 3363 ctx.spilled_arrays = calloc(narrays, sizeof(bool)); 3364 tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, narrays, ctx.array_infos); 3365 3366 shader->uses_helper_invocation = false; 3367 shader->uses_doubles = ctx.info.uses_doubles; 3368 shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC]; 3369 shader->nsys_inputs = 0; 3370 3371 shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 || 3372 ctx.info.file_count[TGSI_FILE_BUFFER] > 0; 3373 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER)); 3374 tgsi_parse_init(&ctx.parse, tokens); 3375 ctx.type = ctx.info.processor; 3376 shader->processor_type = ctx.type; 3377 ctx.bc->type = shader->processor_type; 3378 3379 switch (ctx.type) { 3380 case PIPE_SHADER_VERTEX: 3381 shader->vs_as_gs_a = key.vs.as_gs_a; 3382 shader->vs_as_es = key.vs.as_es; 3383 shader->vs_as_ls = key.vs.as_ls; 3384 shader->atomic_base = key.vs.first_atomic_counter; 3385 if (shader->vs_as_es) 3386 ring_outputs = true; 3387 if (shader->vs_as_ls) 3388 lds_outputs = true; 3389 break; 3390 case PIPE_SHADER_GEOMETRY: 3391 ring_outputs = true; 3392 shader->atomic_base = 
key.gs.first_atomic_counter; 3393 shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix; 3394 break; 3395 case PIPE_SHADER_TESS_CTRL: 3396 shader->tcs_prim_mode = key.tcs.prim_mode; 3397 shader->atomic_base = key.tcs.first_atomic_counter; 3398 lds_outputs = true; 3399 lds_inputs = true; 3400 break; 3401 case PIPE_SHADER_TESS_EVAL: 3402 shader->tes_as_es = key.tes.as_es; 3403 shader->atomic_base = key.tes.first_atomic_counter; 3404 lds_inputs = true; 3405 if (shader->tes_as_es) 3406 ring_outputs = true; 3407 break; 3408 case PIPE_SHADER_FRAGMENT: 3409 shader->two_side = key.ps.color_two_side; 3410 shader->atomic_base = key.ps.first_atomic_counter; 3411 shader->rat_base = key.ps.nr_cbufs; 3412 shader->image_size_const_offset = key.ps.image_size_const_offset; 3413 break; 3414 case PIPE_SHADER_COMPUTE: 3415 shader->rat_base = 0; 3416 shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER]; 3417 break; 3418 default: 3419 break; 3420 } 3421 3422 if (shader->vs_as_es || shader->tes_as_es) { 3423 ctx.gs_for_vs = &rctx->gs_shader->current->shader; 3424 } else { 3425 ctx.gs_for_vs = NULL; 3426 } 3427 3428 ctx.next_ring_offset = 0; 3429 ctx.gs_out_ring_offset = 0; 3430 ctx.gs_next_vertex = 0; 3431 ctx.gs_stream_output_info = &so; 3432 3433 ctx.thread_id_gpr = -1; 3434 ctx.face_gpr = -1; 3435 ctx.fixed_pt_position_gpr = -1; 3436 ctx.fragcoord_input = -1; 3437 ctx.colors_used = 0; 3438 ctx.clip_vertex_write = 0; 3439 3440 ctx.helper_invoc_reg = -1; 3441 ctx.cs_block_size_reg = -1; 3442 ctx.cs_grid_size_reg = -1; 3443 ctx.cs_block_size_loaded = false; 3444 ctx.cs_grid_size_loaded = false; 3445 3446 shader->nr_ps_color_exports = 0; 3447 shader->nr_ps_max_color_exports = 0; 3448 3449 3450 /* register allocations */ 3451 /* Values [0,127] correspond to GPR[0..127]. 3452 * Values [128,159] correspond to constant buffer bank 0 3453 * Values [160,191] correspond to constant buffer bank 1 3454 * Values [256,511] correspond to cfile constants c[0..255]. 
(Gone on EG) 3455 * Values [256,287] correspond to constant buffer bank 2 (EG) 3456 * Values [288,319] correspond to constant buffer bank 3 (EG) 3457 * Other special values are shown in the list below. 3458 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+) 3459 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+) 3460 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+) 3461 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+) 3462 * 248 SQ_ALU_SRC_0: special constant 0.0. 3463 * 249 SQ_ALU_SRC_1: special constant 1.0 float. 3464 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer. 3465 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer. 3466 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float. 3467 * 253 SQ_ALU_SRC_LITERAL: literal constant. 3468 * 254 SQ_ALU_SRC_PV: previous vector result. 3469 * 255 SQ_ALU_SRC_PS: previous scalar result. 3470 */ 3471 for (i = 0; i < TGSI_FILE_COUNT; i++) { 3472 ctx.file_offset[i] = 0; 3473 } 3474 3475 if (ctx.type == PIPE_SHADER_VERTEX) { 3476 3477 ctx.file_offset[TGSI_FILE_INPUT] = 1; 3478 if (ctx.info.num_inputs) 3479 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS); 3480 } 3481 if (ctx.type == PIPE_SHADER_FRAGMENT) { 3482 if (ctx.bc->chip_class >= EVERGREEN) 3483 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx); 3484 else 3485 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]); 3486 3487 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { 3488 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) { 3489 ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++; 3490 shader->uses_helper_invocation = true; 3491 } 3492 } 3493 } 3494 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3495 /* FIXME 1 would be enough in some cases (3 or less input vertices) */ 3496 ctx.file_offset[TGSI_FILE_INPUT] = 2; 3497 } 3498 if (ctx.type == PIPE_SHADER_TESS_CTRL) 3499 
ctx.file_offset[TGSI_FILE_INPUT] = 1; 3500 if (ctx.type == PIPE_SHADER_TESS_EVAL) { 3501 bool add_tesscoord = false, add_tess_inout = false; 3502 ctx.file_offset[TGSI_FILE_INPUT] = 1; 3503 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { 3504 /* if we have tesscoord save one reg */ 3505 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD) 3506 add_tesscoord = true; 3507 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER || 3508 ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER) 3509 add_tess_inout = true; 3510 } 3511 if (add_tesscoord || add_tess_inout) 3512 ctx.file_offset[TGSI_FILE_INPUT]++; 3513 if (add_tess_inout) 3514 ctx.file_offset[TGSI_FILE_INPUT]+=2; 3515 } 3516 if (ctx.type == PIPE_SHADER_COMPUTE) { 3517 ctx.file_offset[TGSI_FILE_INPUT] = 2; 3518 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { 3519 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE) 3520 ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++; 3521 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE) 3522 ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++; 3523 } 3524 } 3525 3526 ctx.file_offset[TGSI_FILE_OUTPUT] = 3527 ctx.file_offset[TGSI_FILE_INPUT] + 3528 ctx.info.file_max[TGSI_FILE_INPUT] + 1; 3529 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] + 3530 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1; 3531 3532 /* Outside the GPR range. This will be translated to one of the 3533 * kcache banks later. 
*/ 3534 ctx.file_offset[TGSI_FILE_CONSTANT] = 512; 3535 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL; 3536 3537 pipeshader->scratch_space_needed = 0; 3538 int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] + 3539 ctx.info.file_max[TGSI_FILE_TEMPORARY]; 3540 if (regno > 124) { 3541 choose_spill_arrays(&ctx, ®no, &pipeshader->scratch_space_needed); 3542 shader->indirect_files = ctx.info.indirect_files; 3543 } 3544 shader->needs_scratch_space = pipeshader->scratch_space_needed != 0; 3545 3546 ctx.bc->ar_reg = ++regno; 3547 ctx.bc->index_reg[0] = ++regno; 3548 ctx.bc->index_reg[1] = ++regno; 3549 3550 if (ctx.type == PIPE_SHADER_TESS_CTRL) { 3551 ctx.tess_input_info = ++regno; 3552 ctx.tess_output_info = ++regno; 3553 } else if (ctx.type == PIPE_SHADER_TESS_EVAL) { 3554 ctx.tess_input_info = ++regno; 3555 ctx.tess_output_info = ++regno; 3556 } else if (ctx.type == PIPE_SHADER_GEOMETRY) { 3557 ctx.gs_export_gpr_tregs[0] = ++regno; 3558 ctx.gs_export_gpr_tregs[1] = ++regno; 3559 ctx.gs_export_gpr_tregs[2] = ++regno; 3560 ctx.gs_export_gpr_tregs[3] = ++regno; 3561 if (ctx.shader->gs_tri_strip_adj_fix) { 3562 ctx.gs_rotated_input[0] = ++regno; 3563 ctx.gs_rotated_input[1] = ++regno; 3564 } else { 3565 ctx.gs_rotated_input[0] = 0; 3566 ctx.gs_rotated_input[1] = 1; 3567 } 3568 } 3569 3570 if (shader->uses_images) { 3571 ctx.thread_id_gpr = ++regno; 3572 } 3573 ctx.temp_reg = ++regno; 3574 3575 shader->max_arrays = 0; 3576 shader->num_arrays = 0; 3577 if (indirect_gprs) { 3578 3579 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) { 3580 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT], 3581 ctx.file_offset[TGSI_FILE_OUTPUT] - 3582 ctx.file_offset[TGSI_FILE_INPUT], 3583 0x0F); 3584 } 3585 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) { 3586 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT], 3587 ctx.file_offset[TGSI_FILE_TEMPORARY] - 3588 ctx.file_offset[TGSI_FILE_OUTPUT], 3589 0x0F); 3590 } 3591 } 3592 3593 ctx.nliterals = 0; 
3594 ctx.literals = NULL; 3595 ctx.max_driver_temp_used = 0; 3596 3597 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && 3598 ctx.info.colors_written == 1; 3599 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; 3600 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]; 3601 3602 if (ctx.type == PIPE_SHADER_VERTEX || 3603 ctx.type == PIPE_SHADER_GEOMETRY || 3604 ctx.type == PIPE_SHADER_TESS_EVAL) { 3605 shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] + 3606 ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1; 3607 shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1; 3608 shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]; 3609 } 3610 3611 if (shader->vs_as_gs_a) 3612 vs_add_primid_output(&ctx, key.vs.prim_id_out); 3613 3614 if (ctx.thread_id_gpr != -1) { 3615 r = load_thread_id_gpr(&ctx); 3616 if (r) 3617 return r; 3618 } 3619 3620 if (ctx.type == PIPE_SHADER_TESS_EVAL) 3621 r600_fetch_tess_io_info(&ctx); 3622 3623 while (!tgsi_parse_end_of_tokens(&ctx.parse)) { 3624 tgsi_parse_token(&ctx.parse); 3625 switch (ctx.parse.FullToken.Token.Type) { 3626 case TGSI_TOKEN_TYPE_IMMEDIATE: 3627 immediate = &ctx.parse.FullToken.FullImmediate; 3628 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16); 3629 if(ctx.literals == NULL) { 3630 r = -ENOMEM; 3631 goto out_err; 3632 } 3633 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint; 3634 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint; 3635 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint; 3636 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint; 3637 ctx.nliterals++; 3638 break; 3639 case TGSI_TOKEN_TYPE_DECLARATION: 3640 r = tgsi_declaration(&ctx); 3641 if (r) 3642 goto 
out_err; 3643 break; 3644 case TGSI_TOKEN_TYPE_INSTRUCTION: 3645 case TGSI_TOKEN_TYPE_PROPERTY: 3646 break; 3647 default: 3648 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type); 3649 r = -EINVAL; 3650 goto out_err; 3651 } 3652 } 3653 3654 shader->ring_item_sizes[0] = ctx.next_ring_offset; 3655 shader->ring_item_sizes[1] = 0; 3656 shader->ring_item_sizes[2] = 0; 3657 shader->ring_item_sizes[3] = 0; 3658 3659 /* Process two side if needed */ 3660 if (shader->two_side && ctx.colors_used) { 3661 int i, count = ctx.shader->ninput; 3662 unsigned next_lds_loc = ctx.shader->nlds; 3663 3664 /* additional inputs will be allocated right after the existing inputs, 3665 * we won't need them after the color selection, so we don't need to 3666 * reserve these gprs for the rest of the shader code and to adjust 3667 * output offsets etc. */ 3668 int gpr = ctx.file_offset[TGSI_FILE_INPUT] + 3669 ctx.info.file_max[TGSI_FILE_INPUT] + 1; 3670 3671 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */ 3672 if (ctx.face_gpr == -1) { 3673 i = ctx.shader->ninput++; 3674 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE; 3675 ctx.shader->input[i].spi_sid = 0; 3676 ctx.shader->input[i].gpr = gpr++; 3677 ctx.face_gpr = ctx.shader->input[i].gpr; 3678 } 3679 3680 for (i = 0; i < count; i++) { 3681 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) { 3682 int ni = ctx.shader->ninput++; 3683 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io)); 3684 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR; 3685 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]); 3686 ctx.shader->input[ni].gpr = gpr++; 3687 // TGSI to LLVM needs to know the lds position of inputs. 
3688 // Non LLVM path computes it later (in process_twoside_color) 3689 ctx.shader->input[ni].lds_pos = next_lds_loc++; 3690 ctx.shader->input[i].back_color_input = ni; 3691 if (ctx.bc->chip_class >= EVERGREEN) { 3692 if ((r = evergreen_interp_input(&ctx, ni))) 3693 return r; 3694 } 3695 } 3696 } 3697 } 3698 3699 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN) 3700 shader->nr_ps_max_color_exports = 8; 3701 3702 if (ctx.shader->uses_helper_invocation) { 3703 if (ctx.bc->chip_class == CAYMAN) 3704 r = cm_load_helper_invocation(&ctx); 3705 else 3706 r = eg_load_helper_invocation(&ctx); 3707 if (r) 3708 return r; 3709 } 3710 3711 /* 3712 * XXX this relies on fixed_pt_position_gpr only being present when 3713 * this shader should be executed per sample. Should be the case for now... 3714 */ 3715 if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) { 3716 /* 3717 * Fix up sample mask. The hw always gives us coverage mask for 3718 * the pixel. However, for per-sample shading, we need the 3719 * coverage for the shader invocation only. 3720 * Also, with disabled msaa, only the first bit should be set 3721 * (luckily the same fixup works for both problems). 3722 * For now, we can only do it if we know this shader is always 3723 * executed per sample (due to usage of bits in the shader 3724 * forcing per-sample execution). 3725 * If the fb is not multisampled, we'd do unnecessary work but 3726 * it should still be correct. 3727 * It will however do nothing for sample shading according 3728 * to MinSampleShading. 
3729 */ 3730 struct r600_bytecode_alu alu; 3731 int tmp = r600_get_temp(&ctx); 3732 assert(ctx.face_gpr != -1); 3733 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3734 3735 alu.op = ALU_OP2_LSHL_INT; 3736 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 3737 alu.src[0].value = 0x1; 3738 alu.src[1].sel = ctx.fixed_pt_position_gpr; 3739 alu.src[1].chan = 3; 3740 alu.dst.sel = tmp; 3741 alu.dst.chan = 0; 3742 alu.dst.write = 1; 3743 alu.last = 1; 3744 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3745 return r; 3746 3747 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3748 alu.op = ALU_OP2_AND_INT; 3749 alu.src[0].sel = tmp; 3750 alu.src[1].sel = ctx.face_gpr; 3751 alu.src[1].chan = 2; 3752 alu.dst.sel = ctx.face_gpr; 3753 alu.dst.chan = 2; 3754 alu.dst.write = 1; 3755 alu.last = 1; 3756 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3757 return r; 3758 } 3759 3760 if (ctx.fragcoord_input >= 0) { 3761 if (ctx.bc->chip_class == CAYMAN) { 3762 for (j = 0 ; j < 4; j++) { 3763 struct r600_bytecode_alu alu; 3764 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3765 alu.op = ALU_OP1_RECIP_IEEE; 3766 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; 3767 alu.src[0].chan = 3; 3768 3769 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; 3770 alu.dst.chan = j; 3771 alu.dst.write = (j == 3); 3772 alu.last = (j == 3); 3773 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3774 return r; 3775 } 3776 } else { 3777 struct r600_bytecode_alu alu; 3778 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3779 alu.op = ALU_OP1_RECIP_IEEE; 3780 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; 3781 alu.src[0].chan = 3; 3782 3783 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; 3784 alu.dst.chan = 3; 3785 alu.dst.write = 1; 3786 alu.last = 1; 3787 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3788 return r; 3789 } 3790 } 3791 3792 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3793 struct r600_bytecode_alu alu; 3794 int r; 3795 3796 /* GS thread with no output workaround - 
emit a cut at start of GS */ 3797 if (ctx.bc->chip_class == R600) 3798 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX); 3799 3800 for (j = 0; j < 4; j++) { 3801 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3802 alu.op = ALU_OP1_MOV; 3803 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 3804 alu.src[0].value = 0; 3805 alu.dst.sel = ctx.gs_export_gpr_tregs[j]; 3806 alu.dst.write = 1; 3807 alu.last = 1; 3808 r = r600_bytecode_add_alu(ctx.bc, &alu); 3809 if (r) 3810 return r; 3811 } 3812 3813 if (ctx.shader->gs_tri_strip_adj_fix) { 3814 r = single_alu_op2(&ctx, ALU_OP2_AND_INT, 3815 ctx.gs_rotated_input[0], 2, 3816 0, 2, 3817 V_SQ_ALU_SRC_LITERAL, 1); 3818 if (r) 3819 return r; 3820 3821 for (i = 0; i < 6; i++) { 3822 int rotated = (i + 4) % 6; 3823 int offset_reg = i / 3; 3824 int offset_chan = i % 3; 3825 int rotated_offset_reg = rotated / 3; 3826 int rotated_offset_chan = rotated % 3; 3827 3828 if (offset_reg == 0 && offset_chan == 2) 3829 offset_chan = 3; 3830 if (rotated_offset_reg == 0 && rotated_offset_chan == 2) 3831 rotated_offset_chan = 3; 3832 3833 r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT, 3834 ctx.gs_rotated_input[offset_reg], offset_chan, 3835 ctx.gs_rotated_input[0], 2, 3836 offset_reg, offset_chan, 3837 rotated_offset_reg, rotated_offset_chan); 3838 if (r) 3839 return r; 3840 } 3841 } 3842 } 3843 3844 if (ctx.type == PIPE_SHADER_TESS_CTRL) 3845 r600_fetch_tess_io_info(&ctx); 3846 3847 if (shader->two_side && ctx.colors_used) { 3848 if ((r = process_twoside_color_inputs(&ctx))) 3849 return r; 3850 } 3851 3852 tgsi_parse_init(&ctx.parse, tokens); 3853 while (!tgsi_parse_end_of_tokens(&ctx.parse)) { 3854 tgsi_parse_token(&ctx.parse); 3855 switch (ctx.parse.FullToken.Token.Type) { 3856 case TGSI_TOKEN_TYPE_INSTRUCTION: 3857 r = tgsi_is_supported(&ctx); 3858 if (r) 3859 goto out_err; 3860 ctx.max_driver_temp_used = 0; 3861 /* reserve first tmp for everyone */ 3862 r600_get_temp(&ctx); 3863 3864 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode; 
3865 if ((r = tgsi_split_constant(&ctx))) 3866 goto out_err; 3867 if ((r = tgsi_split_literal_constant(&ctx))) 3868 goto out_err; 3869 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3870 if ((r = tgsi_split_gs_inputs(&ctx))) 3871 goto out_err; 3872 } else if (lds_inputs) { 3873 if ((r = tgsi_split_lds_inputs(&ctx))) 3874 goto out_err; 3875 } 3876 if (ctx.bc->chip_class == CAYMAN) 3877 ctx.inst_info = &cm_shader_tgsi_instruction[opcode]; 3878 else if (ctx.bc->chip_class >= EVERGREEN) 3879 ctx.inst_info = &eg_shader_tgsi_instruction[opcode]; 3880 else 3881 ctx.inst_info = &r600_shader_tgsi_instruction[opcode]; 3882 3883 ctx.bc->precise |= ctx.parse.FullToken.FullInstruction.Instruction.Precise; 3884 3885 r = ctx.inst_info->process(&ctx); 3886 if (r) 3887 goto out_err; 3888 3889 if (ctx.type == PIPE_SHADER_TESS_CTRL) { 3890 r = r600_store_tcs_output(&ctx); 3891 if (r) 3892 goto out_err; 3893 } 3894 break; 3895 default: 3896 break; 3897 } 3898 } 3899 3900 /* Reset the temporary register counter. */ 3901 ctx.max_driver_temp_used = 0; 3902 3903 noutput = shader->noutput; 3904 3905 if (!ring_outputs && ctx.clip_vertex_write) { 3906 unsigned clipdist_temp[2]; 3907 3908 clipdist_temp[0] = r600_get_temp(&ctx); 3909 clipdist_temp[1] = r600_get_temp(&ctx); 3910 3911 /* need to convert a clipvertex write into clipdistance writes and not export 3912 the clip vertex anymore */ 3913 3914 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io)); 3915 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST; 3916 shader->output[noutput].gpr = clipdist_temp[0]; 3917 noutput++; 3918 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST; 3919 shader->output[noutput].gpr = clipdist_temp[1]; 3920 noutput++; 3921 3922 /* reset spi_sid for clipvertex output to avoid confusing spi */ 3923 shader->output[ctx.cv_output].spi_sid = 0; 3924 3925 shader->clip_dist_write = 0xFF; 3926 shader->cc_dist_mask = 0xFF; 3927 3928 for (i = 0; i < 8; i++) { 3929 int oreg = i >> 2; 3930 int ochan = i & 3; 
3931 3932 for (j = 0; j < 4; j++) { 3933 struct r600_bytecode_alu alu; 3934 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3935 alu.op = ALU_OP2_DOT4; 3936 alu.src[0].sel = shader->output[ctx.cv_output].gpr; 3937 alu.src[0].chan = j; 3938 3939 alu.src[1].sel = 512 + i; 3940 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 3941 alu.src[1].chan = j; 3942 3943 alu.dst.sel = clipdist_temp[oreg]; 3944 alu.dst.chan = j; 3945 alu.dst.write = (j == ochan); 3946 if (j == 3) 3947 alu.last = 1; 3948 r = r600_bytecode_add_alu(ctx.bc, &alu); 3949 if (r) 3950 return r; 3951 } 3952 } 3953 } 3954 3955 /* Add stream outputs. */ 3956 if (so.num_outputs) { 3957 bool emit = false; 3958 if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX) 3959 emit = true; 3960 if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL) 3961 emit = true; 3962 if (emit) 3963 emit_streamout(&ctx, &so, -1, NULL); 3964 } 3965 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask; 3966 convert_edgeflag_to_int(&ctx); 3967 3968 if (ctx.type == PIPE_SHADER_TESS_CTRL) 3969 r600_emit_tess_factor(&ctx); 3970 3971 if (lds_outputs) { 3972 if (ctx.type == PIPE_SHADER_VERTEX) { 3973 if (ctx.shader->noutput) 3974 emit_lds_vs_writes(&ctx); 3975 } 3976 } else if (ring_outputs) { 3977 if (shader->vs_as_es || shader->tes_as_es) { 3978 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx); 3979 ctx.gs_export_gpr_tregs[1] = -1; 3980 ctx.gs_export_gpr_tregs[2] = -1; 3981 ctx.gs_export_gpr_tregs[3] = -1; 3982 3983 emit_gs_ring_writes(&ctx, &so, -1, FALSE); 3984 } 3985 } else { 3986 /* Export output */ 3987 next_clip_base = shader->vs_out_misc_write ? 
62 : 61; 3988 3989 for (i = 0, j = 0; i < noutput; i++, j++) { 3990 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 3991 output[j].gpr = shader->output[i].gpr; 3992 output[j].elem_size = 3; 3993 output[j].swizzle_x = 0; 3994 output[j].swizzle_y = 1; 3995 output[j].swizzle_z = 2; 3996 output[j].swizzle_w = 3; 3997 output[j].burst_count = 1; 3998 output[j].type = 0xffffffff; 3999 output[j].op = CF_OP_EXPORT; 4000 switch (ctx.type) { 4001 case PIPE_SHADER_VERTEX: 4002 case PIPE_SHADER_TESS_EVAL: 4003 switch (shader->output[i].name) { 4004 case TGSI_SEMANTIC_POSITION: 4005 output[j].array_base = 60; 4006 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4007 pos_emitted = true; 4008 break; 4009 4010 case TGSI_SEMANTIC_PSIZE: 4011 output[j].array_base = 61; 4012 output[j].swizzle_y = 7; 4013 output[j].swizzle_z = 7; 4014 output[j].swizzle_w = 7; 4015 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4016 pos_emitted = true; 4017 break; 4018 case TGSI_SEMANTIC_EDGEFLAG: 4019 output[j].array_base = 61; 4020 output[j].swizzle_x = 7; 4021 output[j].swizzle_y = 0; 4022 output[j].swizzle_z = 7; 4023 output[j].swizzle_w = 7; 4024 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4025 pos_emitted = true; 4026 break; 4027 case TGSI_SEMANTIC_LAYER: 4028 /* spi_sid is 0 for outputs that are 4029 * not consumed by PS */ 4030 if (shader->output[i].spi_sid) { 4031 output[j].array_base = next_param_base++; 4032 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4033 j++; 4034 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 4035 } 4036 output[j].array_base = 61; 4037 output[j].swizzle_x = 7; 4038 output[j].swizzle_y = 7; 4039 output[j].swizzle_z = 0; 4040 output[j].swizzle_w = 7; 4041 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4042 pos_emitted = true; 4043 break; 4044 case TGSI_SEMANTIC_VIEWPORT_INDEX: 4045 /* spi_sid is 0 for outputs that are 4046 * not consumed by PS */ 4047 if 
(shader->output[i].spi_sid) { 4048 output[j].array_base = next_param_base++; 4049 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4050 j++; 4051 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 4052 } 4053 output[j].array_base = 61; 4054 output[j].swizzle_x = 7; 4055 output[j].swizzle_y = 7; 4056 output[j].swizzle_z = 7; 4057 output[j].swizzle_w = 0; 4058 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4059 pos_emitted = true; 4060 break; 4061 case TGSI_SEMANTIC_CLIPVERTEX: 4062 j--; 4063 break; 4064 case TGSI_SEMANTIC_CLIPDIST: 4065 output[j].array_base = next_clip_base++; 4066 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4067 pos_emitted = true; 4068 /* spi_sid is 0 for clipdistance outputs that were generated 4069 * for clipvertex - we don't need to pass them to PS */ 4070 if (shader->output[i].spi_sid) { 4071 j++; 4072 /* duplicate it as PARAM to pass to the pixel shader */ 4073 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 4074 output[j].array_base = next_param_base++; 4075 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4076 } 4077 break; 4078 case TGSI_SEMANTIC_FOG: 4079 output[j].swizzle_y = 4; /* 0 */ 4080 output[j].swizzle_z = 4; /* 0 */ 4081 output[j].swizzle_w = 5; /* 1 */ 4082 break; 4083 case TGSI_SEMANTIC_PRIMID: 4084 output[j].swizzle_x = 2; 4085 output[j].swizzle_y = 4; /* 0 */ 4086 output[j].swizzle_z = 4; /* 0 */ 4087 output[j].swizzle_w = 4; /* 0 */ 4088 break; 4089 } 4090 4091 break; 4092 case PIPE_SHADER_FRAGMENT: 4093 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) { 4094 /* never export more colors than the number of CBs */ 4095 if (shader->output[i].sid >= max_color_exports) { 4096 /* skip export */ 4097 j--; 4098 continue; 4099 } 4100 output[j].swizzle_w = key.ps.alpha_to_one ? 
5 : 3; 4101 output[j].array_base = shader->output[i].sid; 4102 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4103 shader->nr_ps_color_exports++; 4104 shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4)); 4105 4106 /* If the i-th target format is set, all previous target formats must 4107 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well. 4108 */ 4109 if (shader->output[i].sid > 0) 4110 for (unsigned x = 0; x < shader->output[i].sid; x++) 4111 shader->ps_color_export_mask |= (1 << (x*4)); 4112 4113 if (shader->output[i].sid > shader->ps_export_highest) 4114 shader->ps_export_highest = shader->output[i].sid; 4115 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) { 4116 for (k = 1; k < max_color_exports; k++) { 4117 j++; 4118 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4119 output[j].gpr = shader->output[i].gpr; 4120 output[j].elem_size = 3; 4121 output[j].swizzle_x = 0; 4122 output[j].swizzle_y = 1; 4123 output[j].swizzle_z = 2; 4124 output[j].swizzle_w = key.ps.alpha_to_one ? 
5 : 3; 4125 output[j].burst_count = 1; 4126 output[j].array_base = k; 4127 output[j].op = CF_OP_EXPORT; 4128 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4129 shader->nr_ps_color_exports++; 4130 if (k > shader->ps_export_highest) 4131 shader->ps_export_highest = k; 4132 shader->ps_color_export_mask |= (0xf << (j * 4)); 4133 } 4134 } 4135 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) { 4136 output[j].array_base = 61; 4137 output[j].swizzle_x = 2; 4138 output[j].swizzle_y = 7; 4139 output[j].swizzle_z = output[j].swizzle_w = 7; 4140 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4141 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) { 4142 output[j].array_base = 61; 4143 output[j].swizzle_x = 7; 4144 output[j].swizzle_y = 1; 4145 output[j].swizzle_z = output[j].swizzle_w = 7; 4146 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4147 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) { 4148 output[j].array_base = 61; 4149 output[j].swizzle_x = 7; 4150 output[j].swizzle_y = 7; 4151 output[j].swizzle_z = 0; 4152 output[j].swizzle_w = 7; 4153 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4154 } else { 4155 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name); 4156 r = -EINVAL; 4157 goto out_err; 4158 } 4159 break; 4160 case PIPE_SHADER_TESS_CTRL: 4161 break; 4162 default: 4163 R600_ERR("unsupported processor type %d\n", ctx.type); 4164 r = -EINVAL; 4165 goto out_err; 4166 } 4167 4168 if (output[j].type == 0xffffffff) { 4169 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4170 output[j].array_base = next_param_base++; 4171 } 4172 } 4173 4174 /* add fake position export */ 4175 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) { 4176 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4177 output[j].gpr = 0; 4178 output[j].elem_size = 3; 4179 output[j].swizzle_x = 7; 4180 
output[j].swizzle_y = 7; 4181 output[j].swizzle_z = 7; 4182 output[j].swizzle_w = 7; 4183 output[j].burst_count = 1; 4184 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4185 output[j].array_base = 60; 4186 output[j].op = CF_OP_EXPORT; 4187 j++; 4188 } 4189 4190 /* add fake param output for vertex shader if no param is exported */ 4191 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) { 4192 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4193 output[j].gpr = 0; 4194 output[j].elem_size = 3; 4195 output[j].swizzle_x = 7; 4196 output[j].swizzle_y = 7; 4197 output[j].swizzle_z = 7; 4198 output[j].swizzle_w = 7; 4199 output[j].burst_count = 1; 4200 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4201 output[j].array_base = 0; 4202 output[j].op = CF_OP_EXPORT; 4203 j++; 4204 } 4205 4206 /* add fake pixel export */ 4207 if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) { 4208 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4209 output[j].gpr = 0; 4210 output[j].elem_size = 3; 4211 output[j].swizzle_x = 7; 4212 output[j].swizzle_y = 7; 4213 output[j].swizzle_z = 7; 4214 output[j].swizzle_w = 7; 4215 output[j].burst_count = 1; 4216 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4217 output[j].array_base = 0; 4218 output[j].op = CF_OP_EXPORT; 4219 j++; 4220 shader->nr_ps_color_exports++; 4221 shader->ps_color_export_mask = 0xf; 4222 } 4223 4224 noutput = j; 4225 4226 /* set export done on last export of each type */ 4227 for (k = noutput - 1, output_done = 0; k >= 0; k--) { 4228 if (!(output_done & (1 << output[k].type))) { 4229 output_done |= (1 << output[k].type); 4230 output[k].op = CF_OP_EXPORT_DONE; 4231 } 4232 } 4233 /* add output to bytecode */ 4234 for (i = 0; i < noutput; i++) { 4235 r = r600_bytecode_add_output(ctx.bc, &output[i]); 4236 if (r) 4237 goto out_err; 4238 } 4239 } 4240 4241 /* add program end */ 4242 if 
(ctx.bc->chip_class == CAYMAN) 4243 cm_bytecode_add_cf_end(ctx.bc); 4244 else { 4245 const struct cf_op_info *last = NULL; 4246 4247 if (ctx.bc->cf_last) 4248 last = r600_isa_cf(ctx.bc->cf_last->op); 4249 4250 /* alu clause instructions don't have EOP bit, so add NOP */ 4251 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP) 4252 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); 4253 4254 ctx.bc->cf_last->end_of_program = 1; 4255 } 4256 4257 /* check GPR limit - we have 124 = 128 - 4 4258 * (4 are reserved as alu clause temporary registers) */ 4259 if (ctx.bc->ngpr > 124) { 4260 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr); 4261 r = -ENOMEM; 4262 goto out_err; 4263 } 4264 4265 if (ctx.type == PIPE_SHADER_GEOMETRY) { 4266 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so))) 4267 return r; 4268 } 4269 4270 free(ctx.spilled_arrays); 4271 free(ctx.array_infos); 4272 free(ctx.literals); 4273 tgsi_parse_free(&ctx.parse); 4274 return 0; 4275out_err: 4276 free(ctx.spilled_arrays); 4277 free(ctx.array_infos); 4278 free(ctx.literals); 4279 tgsi_parse_free(&ctx.parse); 4280 return r; 4281} 4282 4283static int tgsi_unsupported(struct r600_shader_ctx *ctx) 4284{ 4285 const unsigned tgsi_opcode = 4286 ctx->parse.FullToken.FullInstruction.Instruction.Opcode; 4287 R600_ERR("%s tgsi opcode unsupported\n", 4288 tgsi_get_opcode_name(tgsi_opcode)); 4289 return -EINVAL; 4290} 4291 4292static int tgsi_end(struct r600_shader_ctx *ctx UNUSED) 4293{ 4294 return 0; 4295} 4296 4297static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src, 4298 const struct r600_shader_src *shader_src, 4299 unsigned chan) 4300{ 4301 bc_src->sel = shader_src->sel; 4302 bc_src->chan = shader_src->swizzle[chan]; 4303 bc_src->neg = shader_src->neg; 4304 bc_src->abs = shader_src->abs; 4305 bc_src->rel = shader_src->rel; 4306 bc_src->value = shader_src->value[bc_src->chan]; 4307 bc_src->kc_bank = 
shader_src->kc_bank; 4308 bc_src->kc_rel = shader_src->kc_rel; 4309} 4310 4311static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src) 4312{ 4313 bc_src->abs = 1; 4314 bc_src->neg = 0; 4315} 4316 4317static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src) 4318{ 4319 bc_src->neg = !bc_src->neg; 4320} 4321 4322static void tgsi_dst(struct r600_shader_ctx *ctx, 4323 const struct tgsi_full_dst_register *tgsi_dst, 4324 unsigned swizzle, 4325 struct r600_bytecode_alu_dst *r600_dst) 4326{ 4327 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4328 4329 if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) { 4330 bool spilled; 4331 unsigned idx; 4332 4333 idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_dst->Register.Index, &spilled); 4334 4335 if (spilled) { 4336 struct r600_bytecode_output cf; 4337 int reg = 0; 4338 int r; 4339 bool add_pending_output = true; 4340 4341 memset(&cf, 0, sizeof(struct r600_bytecode_output)); 4342 get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index, 4343 &cf.array_base, &cf.array_size); 4344 4345 /* If no component has spilled, reserve a register and add the spill code 4346 * ctx->bc->n_pending_outputs is cleared after each instruction group */ 4347 if (ctx->bc->n_pending_outputs == 0) { 4348 reg = r600_get_temp(ctx); 4349 } else { 4350 /* If we are already spilling and the output address is the same like 4351 * before then just reuse the same slot */ 4352 struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1]; 4353 if ((cf.array_base + idx == tmpl->array_base) || 4354 (cf.array_base == tmpl->array_base && 4355 tmpl->index_gpr == ctx->bc->ar_reg && 4356 tgsi_dst->Register.Indirect)) { 4357 reg = ctx->bc->pending_outputs[0].gpr; 4358 add_pending_output = false; 4359 } else { 4360 reg = r600_get_temp(ctx); 4361 } 4362 } 4363 4364 r600_dst->sel = reg; 4365 r600_dst->chan = swizzle; 4366 r600_dst->write = 1; 4367 if 
(inst->Instruction.Saturate) { 4368 r600_dst->clamp = 1; 4369 } 4370 4371 /* Add new outputs as pending */ 4372 if (add_pending_output) { 4373 cf.op = CF_OP_MEM_SCRATCH; 4374 cf.elem_size = 3; 4375 cf.gpr = reg; 4376 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; 4377 cf.mark = 1; 4378 cf.comp_mask = inst->Dst[0].Register.WriteMask; 4379 cf.swizzle_x = 0; 4380 cf.swizzle_y = 1; 4381 cf.swizzle_z = 2; 4382 cf.swizzle_w = 3; 4383 cf.burst_count = 1; 4384 4385 if (tgsi_dst->Register.Indirect) { 4386 if (ctx->bc->chip_class < R700) 4387 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; 4388 else 4389 cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK; 4390 cf.index_gpr = ctx->bc->ar_reg; 4391 } 4392 else { 4393 cf.array_base += idx; 4394 cf.array_size = 0; 4395 } 4396 4397 r = r600_bytecode_add_pending_output(ctx->bc, &cf); 4398 if (r) 4399 return; 4400 4401 if (ctx->bc->chip_class >= R700) 4402 r600_bytecode_need_wait_ack(ctx->bc, true); 4403 } 4404 return; 4405 } 4406 else { 4407 r600_dst->sel = idx; 4408 } 4409 } 4410 else { 4411 r600_dst->sel = tgsi_dst->Register.Index; 4412 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File]; 4413 } 4414 r600_dst->chan = swizzle; 4415 r600_dst->write = 1; 4416 if (inst->Instruction.Saturate) { 4417 r600_dst->clamp = 1; 4418 } 4419 if (ctx->type == PIPE_SHADER_TESS_CTRL) { 4420 if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) { 4421 return; 4422 } 4423 } 4424 if (tgsi_dst->Register.Indirect) 4425 r600_dst->rel = V_SQ_REL_RELATIVE; 4426 4427} 4428 4429static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override) 4430{ 4431 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4432 unsigned write_mask = inst->Dst[0].Register.WriteMask; 4433 struct r600_bytecode_alu alu; 4434 int i, j, r, lasti = tgsi_last_instruction(write_mask); 4435 int use_tmp = 0; 4436 int swizzle_x = inst->Src[0].Register.SwizzleX; 4437 4438 if 
(singledest) { 4439 switch (write_mask) { 4440 case 0x1: 4441 if (swizzle_x == 2) { 4442 write_mask = 0xc; 4443 use_tmp = 3; 4444 } else 4445 write_mask = 0x3; 4446 break; 4447 case 0x2: 4448 if (swizzle_x == 2) { 4449 write_mask = 0xc; 4450 use_tmp = 3; 4451 } else { 4452 write_mask = 0x3; 4453 use_tmp = 1; 4454 } 4455 break; 4456 case 0x4: 4457 if (swizzle_x == 0) { 4458 write_mask = 0x3; 4459 use_tmp = 1; 4460 } else 4461 write_mask = 0xc; 4462 break; 4463 case 0x8: 4464 if (swizzle_x == 0) { 4465 write_mask = 0x3; 4466 use_tmp = 1; 4467 } else { 4468 write_mask = 0xc; 4469 use_tmp = 3; 4470 } 4471 break; 4472 } 4473 } 4474 4475 lasti = tgsi_last_instruction(write_mask); 4476 for (i = 0; i <= lasti; i++) { 4477 4478 if (!(write_mask & (1 << i))) 4479 continue; 4480 4481 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4482 4483 if (singledest) { 4484 if (use_tmp || dest_temp) { 4485 alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp; 4486 alu.dst.chan = i; 4487 alu.dst.write = 1; 4488 } else { 4489 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4490 } 4491 if (i == 1 || i == 3) 4492 alu.dst.write = 0; 4493 } else 4494 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4495 4496 alu.op = op_override ? 
op_override : ctx->inst_info->op; 4497 if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) { 4498 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4499 } else if (!swap) { 4500 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 4501 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i)); 4502 } 4503 } else { 4504 r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i)); 4505 r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i)); 4506 } 4507 4508 /* handle some special cases */ 4509 if (i == 1 || i == 3) { 4510 switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) { 4511 case TGSI_OPCODE_DABS: 4512 r600_bytecode_src_set_abs(&alu.src[0]); 4513 break; 4514 default: 4515 break; 4516 } 4517 } 4518 if (i == lasti) { 4519 alu.last = 1; 4520 } 4521 r = r600_bytecode_add_alu(ctx->bc, &alu); 4522 if (r) 4523 return r; 4524 } 4525 4526 if (use_tmp) { 4527 write_mask = inst->Dst[0].Register.WriteMask; 4528 4529 lasti = tgsi_last_instruction(write_mask); 4530 /* move result from temp to dst */ 4531 for (i = 0; i <= lasti; i++) { 4532 if (!(write_mask & (1 << i))) 4533 continue; 4534 4535 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4536 alu.op = ALU_OP1_MOV; 4537 4538 if (dest_temp) { 4539 alu.dst.sel = dest_temp; 4540 alu.dst.chan = i; 4541 alu.dst.write = 1; 4542 } else 4543 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4544 alu.src[0].sel = ctx->temp_reg; 4545 alu.src[0].chan = use_tmp - 1; 4546 alu.last = (i == lasti); 4547 4548 r = r600_bytecode_add_alu(ctx->bc, &alu); 4549 if (r) 4550 return r; 4551 } 4552 } 4553 return 0; 4554} 4555 4556static int tgsi_op2_64(struct r600_shader_ctx *ctx) 4557{ 4558 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4559 unsigned write_mask = inst->Dst[0].Register.WriteMask; 4560 /* confirm writemasking */ 4561 if ((write_mask & 0x3) != 0x3 && 4562 (write_mask & 0xc) != 0xc) { 4563 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask); 
		return -1;
	}
	return tgsi_op2_64_params(ctx, false, false, 0, 0);
}

/* 64-bit two-operand op that produces a single double result
 * (sources in natural order). */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false, 0, 0);
}

/* Same as tgsi_op2_64_single_dest but with src0/src1 swapped. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true, 0, 0);
}

/* Emit a three-operand 64-bit ALU op: all four slots are emitted;
 * components not covered by the writemask are routed to a scratch temp. */
static int tgsi_op3_64(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = 3;
	int tmp = r600_get_temp(ctx);

	for (i = 0; i < lasti + 1; i++) {

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		/* slot 3 reads source channel 0, the other slots channel 1 */
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
		}

		if (inst->Dst[0].Register.WriteMask & (1 << i))
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		else
			alu.dst.sel = tmp;

		alu.dst.chan = i;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Generic two-operand ALU emission. 'swap' exchanges the two sources;
 * 'trans_only' marks t-slot-only ops, which must each end an ALU group. */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
	unsigned op = ctx->inst_info->op;

	if (op == ALU_OP2_MUL_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP2_MUL;

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct
r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* trans-only ops must be the last instruction of their group */
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}

/* Plain two-operand op, sources in order. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}

/* Two-operand op with src0 and src1 exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}

/* Two-operand op restricted to the trans slot. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}

/* Integer negate: apply the op (from inst_info) with constant 0 as src0
 * and the TGSI source as src1. */
static int tgsi_ineg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {

		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		r600_bytecode_src(&alu.src[1],
&ctx->src[0], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;

}

/* DNEG: copy the source while toggling the negate bit on channels 1 and 3
 * (the upper halves of the two 64-bit pairs). */
static int tgsi_dneg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {

		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		if (i == 1 || i == 3)
			r600_bytecode_src_toggle_neg(&alu.src[0]);
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;

}

/* DFRACEXP: split a double into significand (Dst[0]) and exponent (Dst[1]). */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;

	/* run the op across all four slots into temp_reg; sources use the
	 * swapped 64-bit channel order (fp64_switch) */
	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Replicate significand result across channels.
*/ 4780 for (i = 0; i <= 3; i++) { 4781 if (!(write_mask & (1 << i))) 4782 continue; 4783 4784 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4785 alu.op = ALU_OP1_MOV; 4786 alu.src[0].chan = (i & 1) + 2; 4787 alu.src[0].sel = ctx->temp_reg; 4788 4789 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4790 alu.dst.write = 1; 4791 alu.last = 1; 4792 r = r600_bytecode_add_alu(ctx->bc, &alu); 4793 if (r) 4794 return r; 4795 } 4796 4797 for (i = 0; i <= 3; i++) { 4798 if (inst->Dst[1].Register.WriteMask & (1 << i)) { 4799 /* MOV third channels to writemask dst1 */ 4800 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4801 alu.op = ALU_OP1_MOV; 4802 alu.src[0].chan = 1; 4803 alu.src[0].sel = ctx->temp_reg; 4804 4805 tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst); 4806 alu.last = 1; 4807 r = r600_bytecode_add_alu(ctx->bc, &alu); 4808 if (r) 4809 return r; 4810 break; 4811 } 4812 } 4813 return 0; 4814} 4815 4816 4817static int egcm_int_to_double(struct r600_shader_ctx *ctx) 4818{ 4819 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4820 struct r600_bytecode_alu alu; 4821 int i, c, r; 4822 int write_mask = inst->Dst[0].Register.WriteMask; 4823 int temp_reg = r600_get_temp(ctx); 4824 4825 assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D || 4826 inst->Instruction.Opcode == TGSI_OPCODE_U2D); 4827 4828 for (c = 0; c < 2; c++) { 4829 int dchan = c * 2; 4830 if (write_mask & (0x3 << dchan)) { 4831 /* split into 24-bit int and 8-bit int */ 4832 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4833 alu.op = ALU_OP2_AND_INT; 4834 alu.dst.sel = temp_reg; 4835 alu.dst.chan = dchan; 4836 r600_bytecode_src(&alu.src[0], &ctx->src[0], c); 4837 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4838 alu.src[1].value = 0xffffff00; 4839 alu.dst.write = 1; 4840 r = r600_bytecode_add_alu(ctx->bc, &alu); 4841 if (r) 4842 return r; 4843 4844 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4845 alu.op = ALU_OP2_AND_INT; 4846 alu.dst.sel = temp_reg; 4847 alu.dst.chan = dchan + 1; 
4848 r600_bytecode_src(&alu.src[0], &ctx->src[0], c); 4849 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4850 alu.src[1].value = 0xff; 4851 alu.dst.write = 1; 4852 alu.last = 1; 4853 r = r600_bytecode_add_alu(ctx->bc, &alu); 4854 if (r) 4855 return r; 4856 } 4857 } 4858 4859 for (c = 0; c < 2; c++) { 4860 int dchan = c * 2; 4861 if (write_mask & (0x3 << dchan)) { 4862 for (i = dchan; i <= dchan + 1; i++) { 4863 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4864 alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT; 4865 4866 alu.src[0].sel = temp_reg; 4867 alu.src[0].chan = i; 4868 alu.dst.sel = temp_reg; 4869 alu.dst.chan = i; 4870 alu.dst.write = 1; 4871 if (ctx->bc->chip_class == CAYMAN) 4872 alu.last = i == dchan + 1; 4873 else 4874 alu.last = 1; /* trans only ops on evergreen */ 4875 4876 r = r600_bytecode_add_alu(ctx->bc, &alu); 4877 if (r) 4878 return r; 4879 } 4880 } 4881 } 4882 4883 for (c = 0; c < 2; c++) { 4884 int dchan = c * 2; 4885 if (write_mask & (0x3 << dchan)) { 4886 for (i = 0; i < 4; i++) { 4887 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4888 alu.op = ALU_OP1_FLT32_TO_FLT64; 4889 4890 alu.src[0].chan = dchan + (i / 2); 4891 if (i == 0 || i == 2) 4892 alu.src[0].sel = temp_reg; 4893 else { 4894 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 4895 alu.src[0].value = 0x0; 4896 } 4897 alu.dst.sel = ctx->temp_reg; 4898 alu.dst.chan = i; 4899 alu.last = i == 3; 4900 alu.dst.write = 1; 4901 4902 r = r600_bytecode_add_alu(ctx->bc, &alu); 4903 if (r) 4904 return r; 4905 } 4906 4907 for (i = 0; i <= 1; i++) { 4908 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4909 alu.op = ALU_OP2_ADD_64; 4910 4911 alu.src[0].chan = fp64_switch(i); 4912 alu.src[0].sel = ctx->temp_reg; 4913 4914 alu.src[1].chan = fp64_switch(i + 2); 4915 alu.src[1].sel = ctx->temp_reg; 4916 tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst); 4917 alu.last = i == 1; 4918 4919 r = r600_bytecode_add_alu(ctx->bc, &alu); 4920 if (r) 4921 return r; 4922 } 4923 } 4924 } 4925 4926 return 
0;
}

/* D2I/D2U: first narrow the double to a 32-bit float in a temp register
 * (via tgsi_op2_64_params with FLT64_TO_FLT32), then convert float to int
 * per written channel with the op from inst_info. */
static int egcm_double_to_int(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int treg = r600_get_temp(ctx);
	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
	       inst->Instruction.Opcode == TGSI_OPCODE_D2U);

	/* do a 64->32 into a temp register */
	r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
	if (r)
		return r;

	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].chan = i;
		alu.src[0].sel = treg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = (i == lasti);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* Emit a unary double op (e.g. RECIP_64): sources are fed as (chan1, chan0)
 * of 'src'; optional abs is applied to src1. The result lands in the X/Y
 * channels of dst_reg. */
static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
					unsigned op,
					int dst_reg,
					struct r600_shader_src *src,
					bool abs)
{
	struct r600_bytecode_alu alu;
	const int last_slot = 3;
	int r;

	/* these have to write the result to X/Y by the looks of it */
	for (int i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;

		r600_bytecode_src(&alu.src[0], src, 1);
		r600_bytecode_src(&alu.src[1], src, 0);

		if (abs)
			r600_bytecode_src_set_abs(&alu.src[1]);

		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = (i == 0 || i == 1);

		/* on non-Cayman this runs as a trans op, so every slot ends a group */
		if (bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* should only be one src regs */
	assert(inst->Instruction.NumSrcRegs == 1);

	/* only support one double at a time */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	/* DRSQ/DSQRT take the absolute value of their source */
	r = cayman_emit_unary_double_raw(
		ctx->bc, ctx->inst_info->op, t1,
		&ctx->src[0],
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
	if (r)
		return r;

	/* fan the X/Y result pair out to the written channels */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Cayman: replicate a scalar float op across the vector slots
 * (see the CAYMAN notes at the top of this file). */
static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ?
4 : 3;

	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);

			/* RSQ should take the absolute value of src */
			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
				r600_bytecode_src_set_abs(&alu.src[j]);
			}
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Cayman: integer multiply occupies all four slots; emit one full group
 * per written component k (keeping only slot k's result in the temp),
 * then copy the collected results to the destination. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* move the per-channel results from the temp to the real destination */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return
r;
	}

	return 0;
}


/* DMUL on Cayman: emit the 64-bit multiply across all four slots; only one
 * double (xy or zw pair) is handled per instruction, then the results are
 * moved from the temp into place. */
static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* t1 would get overwritten below if we actually tried to
	 * multiply two pairs of doubles at a time. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		/* slot 3 reads channel k*2, the other slots k*2 + 1 */
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* move the result pair from the temp to the destination */
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/*
 * Emit RECIP_64 + MUL_64 to implement division.
 */
static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_alu alu;
	int t1 = ctx->temp_reg;
	int k;

	/* Only support one double at a time.
This is the same constraint as 5180 * in DMUL lowering. */ 5181 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY || 5182 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW); 5183 5184 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1; 5185 5186 r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false); 5187 if (r) 5188 return r; 5189 5190 for (int i = 0; i < 4; i++) { 5191 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5192 alu.op = ALU_OP2_MUL_64; 5193 5194 r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1)); 5195 5196 alu.src[1].sel = t1; 5197 alu.src[1].chan = (i == 3) ? 0 : 1; 5198 5199 alu.dst.sel = t1; 5200 alu.dst.chan = i; 5201 alu.dst.write = 1; 5202 if (i == 3) 5203 alu.last = 1; 5204 r = r600_bytecode_add_alu(ctx->bc, &alu); 5205 if (r) 5206 return r; 5207 } 5208 5209 for (int i = 0; i < 2; i++) { 5210 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5211 alu.op = ALU_OP1_MOV; 5212 alu.src[0].sel = t1; 5213 alu.src[0].chan = i; 5214 tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst); 5215 alu.dst.write = 1; 5216 if (i == 1) 5217 alu.last = 1; 5218 r = r600_bytecode_add_alu(ctx->bc, &alu); 5219 if (r) 5220 return r; 5221 } 5222 return 0; 5223} 5224 5225/* 5226 * r600 - trunc to -PI..PI range 5227 * r700 - normalize by dividing by 2PI 5228 * see fdo bug 27901 5229 */ 5230static int tgsi_setup_trig(struct r600_shader_ctx *ctx) 5231{ 5232 int r; 5233 struct r600_bytecode_alu alu; 5234 5235 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5236 alu.op = ALU_OP3_MULADD; 5237 alu.is_op3 = 1; 5238 5239 alu.dst.chan = 0; 5240 alu.dst.sel = ctx->temp_reg; 5241 alu.dst.write = 1; 5242 5243 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5244 5245 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 5246 alu.src[1].chan = 0; 5247 alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI); 5248 alu.src[2].sel = V_SQ_ALU_SRC_0_5; 5249 alu.src[2].chan = 0; 5250 alu.last = 1; 5251 r = 
r600_bytecode_add_alu(ctx->bc, &alu); 5252 if (r) 5253 return r; 5254 5255 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5256 alu.op = ALU_OP1_FRACT; 5257 5258 alu.dst.chan = 0; 5259 alu.dst.sel = ctx->temp_reg; 5260 alu.dst.write = 1; 5261 5262 alu.src[0].sel = ctx->temp_reg; 5263 alu.src[0].chan = 0; 5264 alu.last = 1; 5265 r = r600_bytecode_add_alu(ctx->bc, &alu); 5266 if (r) 5267 return r; 5268 5269 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5270 alu.op = ALU_OP3_MULADD; 5271 alu.is_op3 = 1; 5272 5273 alu.dst.chan = 0; 5274 alu.dst.sel = ctx->temp_reg; 5275 alu.dst.write = 1; 5276 5277 alu.src[0].sel = ctx->temp_reg; 5278 alu.src[0].chan = 0; 5279 5280 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 5281 alu.src[1].chan = 0; 5282 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 5283 alu.src[2].chan = 0; 5284 5285 if (ctx->bc->chip_class == R600) { 5286 alu.src[1].value = u_bitcast_f2u(2.0f * M_PI); 5287 alu.src[2].value = u_bitcast_f2u(-M_PI); 5288 } else { 5289 alu.src[1].sel = V_SQ_ALU_SRC_1; 5290 alu.src[2].sel = V_SQ_ALU_SRC_0_5; 5291 alu.src[2].neg = 1; 5292 } 5293 5294 alu.last = 1; 5295 r = r600_bytecode_add_alu(ctx->bc, &alu); 5296 if (r) 5297 return r; 5298 return 0; 5299} 5300 5301static int cayman_trig(struct r600_shader_ctx *ctx) 5302{ 5303 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5304 struct r600_bytecode_alu alu; 5305 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 
4 : 3;
	int i, r;

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;


	/* replicate the scalar trig op across the vector slots; the
	 * range-reduced argument is in temp_reg.x */
	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Non-Cayman trig: range-reduce (tgsi_setup_trig), run the scalar op once
 * into temp_reg.x, then replicate the result to every written channel. */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* KILL/KILL_IF: emit the kill op per channel. Plain KILL compares
 * 0 against -1 so it always discards; KILL_IF tests the source. */
static int tgsi_kill(struct r600_shader_ctx *ctx)
{
	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct
r600_bytecode_alu)); 5383 alu.op = ctx->inst_info->op; 5384 5385 alu.dst.chan = i; 5386 5387 alu.src[0].sel = V_SQ_ALU_SRC_0; 5388 5389 if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) { 5390 alu.src[1].sel = V_SQ_ALU_SRC_1; 5391 alu.src[1].neg = 1; 5392 } else { 5393 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 5394 } 5395 if (i == 3) { 5396 alu.last = 1; 5397 } 5398 r = r600_bytecode_add_alu(ctx->bc, &alu); 5399 if (r) 5400 return r; 5401 } 5402 5403 /* kill must be last in ALU */ 5404 ctx->bc->force_add_cf = 1; 5405 ctx->shader->uses_kill = TRUE; 5406 return 0; 5407} 5408 5409static int tgsi_lit(struct r600_shader_ctx *ctx) 5410{ 5411 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5412 struct r600_bytecode_alu alu; 5413 int r; 5414 5415 /* tmp.x = max(src.y, 0.0) */ 5416 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5417 alu.op = ALU_OP2_MAX; 5418 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); 5419 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/ 5420 alu.src[1].chan = 1; 5421 5422 alu.dst.sel = ctx->temp_reg; 5423 alu.dst.chan = 0; 5424 alu.dst.write = 1; 5425 5426 alu.last = 1; 5427 r = r600_bytecode_add_alu(ctx->bc, &alu); 5428 if (r) 5429 return r; 5430 5431 if (inst->Dst[0].Register.WriteMask & (1 << 2)) 5432 { 5433 int chan; 5434 int sel; 5435 unsigned i; 5436 5437 if (ctx->bc->chip_class == CAYMAN) { 5438 for (i = 0; i < 3; i++) { 5439 /* tmp.z = log(tmp.x) */ 5440 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5441 alu.op = ALU_OP1_LOG_CLAMPED; 5442 alu.src[0].sel = ctx->temp_reg; 5443 alu.src[0].chan = 0; 5444 alu.dst.sel = ctx->temp_reg; 5445 alu.dst.chan = i; 5446 if (i == 2) { 5447 alu.dst.write = 1; 5448 alu.last = 1; 5449 } else 5450 alu.dst.write = 0; 5451 5452 r = r600_bytecode_add_alu(ctx->bc, &alu); 5453 if (r) 5454 return r; 5455 } 5456 } else { 5457 /* tmp.z = log(tmp.x) */ 5458 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5459 alu.op = ALU_OP1_LOG_CLAMPED; 5460 alu.src[0].sel = ctx->temp_reg; 5461 
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where LOG_CLAMPED left its result */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}

/* RSQ: reciprocal square root of |src.x| into temp_reg.x, then
 * replicated to every written destination channel. */
static int tgsi_rsq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	alu.op = ALU_OP1_RECIPSQRT_IEEE;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
		r600_bytecode_src_set_abs(&alu.src[i]);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}

/* MOV temp_reg.x into each destination channel enabled in the
 * current instruction's write mask. */
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.src[0].sel = ctx->temp_reg;
		alu.op = ALU_OP1_MOV;
		alu.dst.chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 
(inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Run a scalar op (from inst_info) on the .x channel of the sources,
 * store to temp_reg.x, and replicate to all written channels. */
static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}

/* POW on Cayman: LOG_IEEE runs in slots x/y/z (last slot writes),
 * then b * LOG2(a), then EXP_IEEE per written destination slot. */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	/* include the 4th slot only if dst.w is written */
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* POW(a,b) = EXP2(b * LOG2(a)), computed in temp_reg.x and then
 * replicated to all written channels. */
static int tgsi_pow(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_LOG_IEEE;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* 
POW(a,b) = EXP2(b * LOG2(a))*/
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_EXP_IEEE;
	alu.src[0].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return tgsi_helper_tempx_replicate(ctx);
}

/* Add an integer-multiply ALU op: on Cayman replicate it across all
 * four slots (only the originally requested channel writes, the last
 * slot closes the group); elsewhere emit it as a single op. */
static int emit_mul_int_op(struct r600_bytecode *bc,
			   struct r600_bytecode_alu *alu_src)
{
	struct r600_bytecode_alu alu;
	int i, r;
	alu = *alu_src;
	if (bc->chip_class == CAYMAN) {
		for (i = 0; i < 4; i++) {
			alu.dst.chan = i;
			alu.dst.write = (i == alu_src->dst.chan);
			alu.last = (i == 3);

			r = r600_bytecode_add_alu(bc, &alu);
			if (r)
				return r;
		}
	} else {
		alu.last = 1;
		r = r600_bytecode_add_alu(bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Integer division/modulo via reciprocal estimate plus correction.
 * mod != 0 selects the remainder, signed_op != 0 selects signed
 * semantics (computed on absolute values, sign fixed at the end).
 * In the numbered step comments below, "src1" is the dividend
 * (ctx->src[0]) and "src2" is the divisor (ctx->src[1]). */
static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int lasti = tgsi_last_instruction(write_mask);
	int tmp0 = ctx->temp_reg;
	int tmp1 = r600_get_temp(ctx);
	int tmp2 = r600_get_temp(ctx);
	int tmp3 = r600_get_temp(ctx);
	int tmp4 = 0;

	/* Use additional temp if dst register and src register are the same */
	if (inst->Src[0].Register.Index == inst->Dst[0].Register.Index ||
	    inst->Src[1].Register.Index == inst->Dst[0].Register.Index) {
		tmp4 = r600_get_temp(ctx);
	}

	/* Unsigned path:
	 *
	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
	 *
	 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
	 * 2. tmp0.z = lo (tmp0.x * src2)
	 * 3. tmp0.w = -tmp0.z
	 * 4. tmp0.y = hi (tmp0.x * src2)
	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
	 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
	 * 7. tmp1.x = tmp0.x - tmp0.w
	 * 8. tmp1.y = tmp0.x + tmp0.w
	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
	 * 10. tmp0.z = hi(tmp0.x * src1) = q
	 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
	 *
	 * 12. tmp0.w = src1 - tmp0.y = r
	 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
	 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
	 *
	 * if DIV
	 *
	 * 15. tmp1.z = tmp0.z + 1 = q + 1
	 * 16. tmp1.w = tmp0.z - 1 = q - 1
	 *
	 * else MOD
	 *
	 * 15. tmp1.z = tmp0.w - src2 = r - src2
	 * 16. tmp1.w = tmp0.w + src2 = r + src2
	 *
	 * endif
	 *
	 * 17. tmp1.x = tmp1.x & tmp1.y
	 *
	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
	 *
	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
	 *
	 * Signed path:
	 *
	 * Same as unsigned, using abs values of the operands,
	 * and fixing the sign of the result in the end.
	 */

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		if (signed_op) {

			/* tmp2.x = -src0 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = -src1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.z sign bit is set if src0 and src2 signs are different */
			/* it will be a sign of the quotient */
			if (!mod) {

				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_XOR_INT;

				alu.dst.sel = tmp2;
				alu.dst.chan = 2;
				alu.dst.write = 1;

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* tmp2.x = |src0| (CNDGE: src0 >= 0 ? src0 : -src0) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = |src1| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 1;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			/* Cayman has no RECIP_UINT: go through float.
			 * tmp3.x = u2f(src2) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_UINT_TO_FLT;

			alu.dst.sel = tmp3;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp0.x = recip(tmp3.x) */
			for (j = 0 ; j < 3; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 0);

				alu.src[0].sel = tmp3;
				alu.src[0].chan = 0;

				if (j == 2)
					alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* scale by 2^32 (0x4f800000 is 2^32 as a float) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MUL;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0x4f800000;

			alu.dst.sel = tmp3;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_FLT_TO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = tmp3;
			alu.src[0].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 2. tmp0.z = lo (tmp0.x * src2) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULLO_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 3. tmp0.w = -tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 4. tmp0.y = hi (tmp0.x * src2) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULHI_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;

		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULHI_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 2;

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 0;

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 7. tmp1.x = tmp0.x - tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 8. tmp1.y = tmp0.x + tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 0;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 10. tmp0.z = hi(tmp0.x * src1) = q */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULHI_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;

		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULLO_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 2;

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 12.
tmp0.w = src1 - tmp0.y = r */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 3;
		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (mod) { /* UMOD */

			/* 15. tmp1.z = tmp0.w - src2 = r - src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.w + src2 = r + src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else { /* UDIV */

			/* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.z - 1 = q - 1 (add of the -1 inline constant) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 17. tmp1.x = tmp1.x & tmp1.y */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = mod ? 3 : 2;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 19. tmp0.z = tmp1.y==0 ?
tmp1.w : tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		/* unsigned path can write the final result directly; the signed
		 * path keeps it in tmp0.z for the sign fixup below */
		if (signed_op) {
			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;
		} else {
			if (tmp4 > 0) {
				alu.dst.sel = tmp4;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else {
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			}
		}

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (signed_op) {

			/* fix the sign of the result */

			if (mod) {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* sign of the remainder is the same as the sign of src0 */
				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				if (tmp4 > 0) {
					alu.dst.sel = tmp4;
					alu.dst.chan = i;
					alu.dst.write = 1;
				} else {
					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				}

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

			} else {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* fix the quotient sign (same as the sign of src0*src1) */
				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				if (tmp4 > 0) {
					alu.dst.sel = tmp4;
					alu.dst.chan = i;
					alu.dst.write = 1;
				} else {
					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				}

				alu.src[0].sel = tmp2;
				alu.src[0].chan = 2;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
	}

	/* results were staged in tmp4 (dst aliased a source): copy out */
	if (tmp4 > 0) {
		for (i = 0; i <= lasti; ++i) {
			if (!(write_mask & (1<<i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = tmp4;
			alu.src[0].chan = i;

			if (i == lasti)
				alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
	}

	return 0;
}

/* Thin wrappers selecting the (mod, signed_op) variant of tgsi_divmod(). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}

static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}

static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);
}

static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);
}


/* Float->integer conversion: TRUNC each written channel into temp_reg,
 * then apply the conversion op from inst_info per channel. */
static int tgsi_f2i(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_TRUNC;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		/* FLT_TO_UINT always closes the instruction group */
		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* IABS: tmp = -src, then dst = (src >= 0 ? src : tmp) per channel. */
static int tgsi_iabs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* tmp = -src */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (src >= 0 ? src : tmp) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		alu.dst.write = 1;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* ISSG: integer sign, dst = -1/0/1 for src </==/> 0, built from two
 * conditional moves per channel. */
static int tgsi_issg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* tmp = (src >= 0 ? src : -1) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (tmp > 0 ? 1 : tmp) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT_INT;
		alu.is_op3 = 1;
		alu.dst.write = 1;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}




/* SSG: float sign, dst = -1/0/1 for src </==/> 0, built from two
 * CNDGT conditional moves per written channel. */
static int tgsi_ssg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);
	struct r600_bytecode_alu alu;
	int i, r;

	/* tmp = (src > 0 ? 1 : src) */
	for (i = 0; i <= last_inst; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (-tmp > 0 ?
-1 : tmp) */ 6726 for (i = 0; i <= last_inst; i++) { 6727 if (!(write_mask & (1 << i))) 6728 continue; 6729 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6730 alu.op = ALU_OP3_CNDGT; 6731 alu.is_op3 = 1; 6732 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6733 6734 alu.src[0].sel = ctx->temp_reg; 6735 alu.src[0].chan = i; 6736 alu.src[0].neg = 1; 6737 6738 alu.src[1].sel = V_SQ_ALU_SRC_1; 6739 alu.src[1].neg = 1; 6740 6741 alu.src[2].sel = ctx->temp_reg; 6742 alu.src[2].chan = i; 6743 6744 if (i == last_inst) 6745 alu.last = 1; 6746 r = r600_bytecode_add_alu(ctx->bc, &alu); 6747 if (r) 6748 return r; 6749 } 6750 return 0; 6751} 6752 6753static int tgsi_bfi(struct r600_shader_ctx *ctx) 6754{ 6755 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 6756 struct r600_bytecode_alu alu; 6757 int i, r, t1, t2; 6758 6759 unsigned write_mask = inst->Dst[0].Register.WriteMask; 6760 int last_inst = tgsi_last_instruction(write_mask); 6761 6762 t1 = r600_get_temp(ctx); 6763 6764 for (i = 0; i < 4; i++) { 6765 if (!(write_mask & (1<<i))) 6766 continue; 6767 6768 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6769 alu.op = ALU_OP2_SETGE_INT; 6770 r600_bytecode_src(&alu.src[0], &ctx->src[3], i); 6771 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 6772 alu.src[1].value = 32; 6773 alu.dst.sel = ctx->temp_reg; 6774 alu.dst.chan = i; 6775 alu.dst.write = 1; 6776 alu.last = i == last_inst; 6777 r = r600_bytecode_add_alu(ctx->bc, &alu); 6778 if (r) 6779 return r; 6780 } 6781 6782 for (i = 0; i < 4; i++) { 6783 if (!(write_mask & (1<<i))) 6784 continue; 6785 6786 /* create mask tmp */ 6787 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6788 alu.op = ALU_OP2_BFM_INT; 6789 alu.dst.sel = t1; 6790 alu.dst.chan = i; 6791 alu.dst.write = 1; 6792 alu.last = i == last_inst; 6793 6794 r600_bytecode_src(&alu.src[0], &ctx->src[3], i); 6795 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 6796 6797 r = r600_bytecode_add_alu(ctx->bc, &alu); 6798 if (r) 6799 return r; 6800 } 
6801 6802 t2 = r600_get_temp(ctx); 6803 6804 for (i = 0; i < 4; i++) { 6805 if (!(write_mask & (1<<i))) 6806 continue; 6807 6808 /* shift insert left */ 6809 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6810 alu.op = ALU_OP2_LSHL_INT; 6811 alu.dst.sel = t2; 6812 alu.dst.chan = i; 6813 alu.dst.write = 1; 6814 alu.last = i == last_inst; 6815 6816 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 6817 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 6818 6819 r = r600_bytecode_add_alu(ctx->bc, &alu); 6820 if (r) 6821 return r; 6822 } 6823 6824 for (i = 0; i < 4; i++) { 6825 if (!(write_mask & (1<<i))) 6826 continue; 6827 6828 /* actual bitfield insert */ 6829 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6830 alu.op = ALU_OP3_BFI_INT; 6831 alu.is_op3 = 1; 6832 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6833 alu.dst.chan = i; 6834 alu.dst.write = 1; 6835 alu.last = i == last_inst; 6836 6837 alu.src[0].sel = t1; 6838 alu.src[0].chan = i; 6839 alu.src[1].sel = t2; 6840 alu.src[1].chan = i; 6841 r600_bytecode_src(&alu.src[2], &ctx->src[0], i); 6842 6843 r = r600_bytecode_add_alu(ctx->bc, &alu); 6844 if (r) 6845 return r; 6846 } 6847 6848 for (i = 0; i < 4; i++) { 6849 if (!(write_mask & (1<<i))) 6850 continue; 6851 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6852 alu.op = ALU_OP3_CNDE_INT; 6853 alu.is_op3 = 1; 6854 alu.src[0].sel = ctx->temp_reg; 6855 alu.src[0].chan = i; 6856 r600_bytecode_src(&alu.src[2], &ctx->src[1], i); 6857 6858 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6859 6860 alu.src[1].sel = alu.dst.sel; 6861 alu.src[1].chan = i; 6862 6863 alu.last = i == last_inst; 6864 r = r600_bytecode_add_alu(ctx->bc, &alu); 6865 if (r) 6866 return r; 6867 } 6868 return 0; 6869} 6870 6871static int tgsi_msb(struct r600_shader_ctx *ctx) 6872{ 6873 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 6874 struct r600_bytecode_alu alu; 6875 int i, r, t1, t2; 6876 6877 unsigned write_mask = inst->Dst[0].Register.WriteMask; 6878 int 
last_inst = tgsi_last_instruction(write_mask); 6879 6880 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT || 6881 ctx->inst_info->op == ALU_OP1_FFBH_UINT); 6882 6883 t1 = ctx->temp_reg; 6884 6885 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */ 6886 for (i = 0; i < 4; i++) { 6887 if (!(write_mask & (1<<i))) 6888 continue; 6889 6890 /* t1 = FFBH_INT / FFBH_UINT */ 6891 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6892 alu.op = ctx->inst_info->op; 6893 alu.dst.sel = t1; 6894 alu.dst.chan = i; 6895 alu.dst.write = 1; 6896 alu.last = i == last_inst; 6897 6898 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6899 6900 r = r600_bytecode_add_alu(ctx->bc, &alu); 6901 if (r) 6902 return r; 6903 } 6904 6905 t2 = r600_get_temp(ctx); 6906 6907 for (i = 0; i < 4; i++) { 6908 if (!(write_mask & (1<<i))) 6909 continue; 6910 6911 /* t2 = 31 - t1 */ 6912 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6913 alu.op = ALU_OP2_SUB_INT; 6914 alu.dst.sel = t2; 6915 alu.dst.chan = i; 6916 alu.dst.write = 1; 6917 alu.last = i == last_inst; 6918 6919 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 6920 alu.src[0].value = 31; 6921 alu.src[1].sel = t1; 6922 alu.src[1].chan = i; 6923 6924 r = r600_bytecode_add_alu(ctx->bc, &alu); 6925 if (r) 6926 return r; 6927 } 6928 6929 for (i = 0; i < 4; i++) { 6930 if (!(write_mask & (1<<i))) 6931 continue; 6932 6933 /* result = t1 >= 0 ? 
t2 : t1 */ 6934 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6935 alu.op = ALU_OP3_CNDGE_INT; 6936 alu.is_op3 = 1; 6937 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6938 alu.dst.chan = i; 6939 alu.dst.write = 1; 6940 alu.last = i == last_inst; 6941 6942 alu.src[0].sel = t1; 6943 alu.src[0].chan = i; 6944 alu.src[1].sel = t2; 6945 alu.src[1].chan = i; 6946 alu.src[2].sel = t1; 6947 alu.src[2].chan = i; 6948 6949 r = r600_bytecode_add_alu(ctx->bc, &alu); 6950 if (r) 6951 return r; 6952 } 6953 6954 return 0; 6955} 6956 6957static int tgsi_interp_egcm(struct r600_shader_ctx *ctx) 6958{ 6959 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 6960 struct r600_bytecode_alu alu; 6961 int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti; 6962 unsigned location; 6963 const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs; 6964 6965 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT); 6966 6967 /* Interpolators have been marked for use already by allocate_system_value_inputs */ 6968 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 6969 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 6970 location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */ 6971 } 6972 else { 6973 location = TGSI_INTERPOLATE_LOC_CENTROID; 6974 } 6975 6976 k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location); 6977 if (k < 0) 6978 k = 0; 6979 interp_gpr = ctx->eg_interpolators[k].ij_index / 2; 6980 interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2); 6981 6982 /* NOTE: currently offset is not perspective correct */ 6983 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 6984 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 6985 int sample_gpr = -1; 6986 int gradientsH, gradientsV; 6987 struct r600_bytecode_tex tex; 6988 6989 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 6990 sample_gpr = load_sample_position(ctx, &ctx->src[1], 
ctx->src[1].swizzle[0]); 6991 } 6992 6993 gradientsH = r600_get_temp(ctx); 6994 gradientsV = r600_get_temp(ctx); 6995 for (i = 0; i < 2; i++) { 6996 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 6997 tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V; 6998 tex.src_gpr = interp_gpr; 6999 tex.src_sel_x = interp_base_chan + 0; 7000 tex.src_sel_y = interp_base_chan + 1; 7001 tex.src_sel_z = 0; 7002 tex.src_sel_w = 0; 7003 tex.dst_gpr = i == 0 ? gradientsH : gradientsV; 7004 tex.dst_sel_x = 0; 7005 tex.dst_sel_y = 1; 7006 tex.dst_sel_z = 7; 7007 tex.dst_sel_w = 7; 7008 tex.inst_mod = 1; // Use per pixel gradient calculation 7009 tex.sampler_id = 0; 7010 tex.resource_id = tex.sampler_id; 7011 r = r600_bytecode_add_tex(ctx->bc, &tex); 7012 if (r) 7013 return r; 7014 } 7015 7016 for (i = 0; i < 2; i++) { 7017 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7018 alu.op = ALU_OP3_MULADD; 7019 alu.is_op3 = 1; 7020 alu.src[0].sel = gradientsH; 7021 alu.src[0].chan = i; 7022 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 7023 alu.src[1].sel = sample_gpr; 7024 alu.src[1].chan = 2; 7025 } 7026 else { 7027 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); 7028 } 7029 alu.src[2].sel = interp_gpr; 7030 alu.src[2].chan = interp_base_chan + i; 7031 alu.dst.sel = ctx->temp_reg; 7032 alu.dst.chan = i; 7033 alu.last = i == 1; 7034 7035 r = r600_bytecode_add_alu(ctx->bc, &alu); 7036 if (r) 7037 return r; 7038 } 7039 7040 for (i = 0; i < 2; i++) { 7041 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7042 alu.op = ALU_OP3_MULADD; 7043 alu.is_op3 = 1; 7044 alu.src[0].sel = gradientsV; 7045 alu.src[0].chan = i; 7046 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 7047 alu.src[1].sel = sample_gpr; 7048 alu.src[1].chan = 3; 7049 } 7050 else { 7051 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1); 7052 } 7053 alu.src[2].sel = ctx->temp_reg; 7054 alu.src[2].chan = i; 7055 alu.dst.sel = ctx->temp_reg; 7056 alu.dst.chan = i; 7057 alu.last = i 
== 1; 7058 7059 r = r600_bytecode_add_alu(ctx->bc, &alu); 7060 if (r) 7061 return r; 7062 } 7063 } 7064 7065 tmp = r600_get_temp(ctx); 7066 for (i = 0; i < 8; i++) { 7067 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7068 alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY; 7069 7070 alu.dst.sel = tmp; 7071 if ((i > 1 && i < 6)) { 7072 alu.dst.write = 1; 7073 } 7074 else { 7075 alu.dst.write = 0; 7076 } 7077 alu.dst.chan = i % 4; 7078 7079 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 7080 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 7081 alu.src[0].sel = ctx->temp_reg; 7082 alu.src[0].chan = 1 - (i % 2); 7083 } else { 7084 alu.src[0].sel = interp_gpr; 7085 alu.src[0].chan = interp_base_chan + 1 - (i % 2); 7086 } 7087 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos; 7088 alu.src[1].chan = 0; 7089 7090 alu.last = i % 4 == 3; 7091 alu.bank_swizzle_force = SQ_ALU_VEC_210; 7092 7093 r = r600_bytecode_add_alu(ctx->bc, &alu); 7094 if (r) 7095 return r; 7096 } 7097 7098 // INTERP can't swizzle dst 7099 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 7100 for (i = 0; i <= lasti; i++) { 7101 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 7102 continue; 7103 7104 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7105 alu.op = ALU_OP1_MOV; 7106 alu.src[0].sel = tmp; 7107 alu.src[0].chan = ctx->src[0].swizzle[i]; 7108 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 7109 alu.dst.write = 1; 7110 alu.last = i == lasti; 7111 r = r600_bytecode_add_alu(ctx->bc, &alu); 7112 if (r) 7113 return r; 7114 } 7115 7116 return 0; 7117} 7118 7119 7120static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst) 7121{ 7122 struct r600_bytecode_alu alu; 7123 int i, r; 7124 7125 for (i = 0; i < 4; i++) { 7126 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7127 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) { 7128 alu.op = ALU_OP0_NOP; 7129 alu.dst.chan = i; 7130 } else { 7131 
alu.op = ALU_OP1_MOV; 7132 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 7133 alu.src[0].sel = ctx->temp_reg; 7134 alu.src[0].chan = i; 7135 } 7136 if (i == 3) { 7137 alu.last = 1; 7138 } 7139 r = r600_bytecode_add_alu(ctx->bc, &alu); 7140 if (r) 7141 return r; 7142 } 7143 return 0; 7144} 7145 7146static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx, 7147 unsigned writemask, 7148 struct r600_bytecode_alu_src *bc_src, 7149 const struct r600_shader_src *shader_src) 7150{ 7151 struct r600_bytecode_alu alu; 7152 int i, r; 7153 int lasti = tgsi_last_instruction(writemask); 7154 int temp_reg = 0; 7155 7156 r600_bytecode_src(&bc_src[0], shader_src, 0); 7157 r600_bytecode_src(&bc_src[1], shader_src, 1); 7158 r600_bytecode_src(&bc_src[2], shader_src, 2); 7159 r600_bytecode_src(&bc_src[3], shader_src, 3); 7160 7161 if (bc_src->abs) { 7162 temp_reg = r600_get_temp(ctx); 7163 7164 for (i = 0; i < lasti + 1; i++) { 7165 if (!(writemask & (1 << i))) 7166 continue; 7167 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7168 alu.op = ALU_OP1_MOV; 7169 alu.dst.sel = temp_reg; 7170 alu.dst.chan = i; 7171 alu.dst.write = 1; 7172 alu.src[0] = bc_src[i]; 7173 if (i == lasti) { 7174 alu.last = 1; 7175 } 7176 r = r600_bytecode_add_alu(ctx->bc, &alu); 7177 if (r) 7178 return r; 7179 memset(&bc_src[i], 0, sizeof(*bc_src)); 7180 bc_src[i].sel = temp_reg; 7181 bc_src[i].chan = i; 7182 } 7183 } 7184 return 0; 7185} 7186 7187static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst) 7188{ 7189 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7190 struct r600_bytecode_alu alu; 7191 struct r600_bytecode_alu_src srcs[4][4]; 7192 int i, j, r; 7193 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 7194 unsigned op = ctx->inst_info->op; 7195 7196 if (op == ALU_OP3_MULADD_IEEE && 7197 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS]) 7198 op = ALU_OP3_MULADD; 7199 7200 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 7201 r = 
tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask, 7202 srcs[j], &ctx->src[j]); 7203 if (r) 7204 return r; 7205 } 7206 7207 for (i = 0; i < lasti + 1; i++) { 7208 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 7209 continue; 7210 7211 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7212 alu.op = op; 7213 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 7214 alu.src[j] = srcs[j][i]; 7215 } 7216 7217 if (dst == -1) { 7218 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 7219 } else { 7220 alu.dst.sel = dst; 7221 } 7222 alu.dst.chan = i; 7223 alu.dst.write = 1; 7224 alu.is_op3 = 1; 7225 if (i == lasti) { 7226 alu.last = 1; 7227 } 7228 r = r600_bytecode_add_alu(ctx->bc, &alu); 7229 if (r) 7230 return r; 7231 } 7232 return 0; 7233} 7234 7235static int tgsi_op3(struct r600_shader_ctx *ctx) 7236{ 7237 return tgsi_op3_dst(ctx, -1); 7238} 7239 7240static int tgsi_dp(struct r600_shader_ctx *ctx) 7241{ 7242 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7243 struct r600_bytecode_alu alu; 7244 int i, j, r; 7245 unsigned op = ctx->inst_info->op; 7246 if (op == ALU_OP2_DOT4_IEEE && 7247 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS]) 7248 op = ALU_OP2_DOT4; 7249 7250 for (i = 0; i < 4; i++) { 7251 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7252 alu.op = op; 7253 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 7254 r600_bytecode_src(&alu.src[j], &ctx->src[j], i); 7255 } 7256 7257 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 7258 alu.dst.chan = i; 7259 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; 7260 /* handle some special cases */ 7261 switch (inst->Instruction.Opcode) { 7262 case TGSI_OPCODE_DP2: 7263 if (i > 1) { 7264 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0; 7265 alu.src[0].chan = alu.src[1].chan = 0; 7266 } 7267 break; 7268 case TGSI_OPCODE_DP3: 7269 if (i > 2) { 7270 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0; 7271 alu.src[0].chan = alu.src[1].chan = 0; 7272 } 7273 break; 7274 default: 
7275 break; 7276 } 7277 if (i == 3) { 7278 alu.last = 1; 7279 } 7280 r = r600_bytecode_add_alu(ctx->bc, &alu); 7281 if (r) 7282 return r; 7283 } 7284 return 0; 7285} 7286 7287static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx, 7288 unsigned index) 7289{ 7290 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7291 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY && 7292 inst->Src[index].Register.File != TGSI_FILE_INPUT && 7293 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) || 7294 ctx->src[index].neg || ctx->src[index].abs || 7295 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY); 7296} 7297 7298static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx, 7299 unsigned index) 7300{ 7301 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7302 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index; 7303} 7304 7305static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading) 7306{ 7307 struct r600_bytecode_vtx vtx; 7308 struct r600_bytecode_alu alu; 7309 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7310 int src_gpr, r, i; 7311 int id = tgsi_tex_get_src_gpr(ctx, 1); 7312 int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 7313 7314 src_gpr = tgsi_tex_get_src_gpr(ctx, 0); 7315 if (src_requires_loading) { 7316 for (i = 0; i < 4; i++) { 7317 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7318 alu.op = ALU_OP1_MOV; 7319 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 7320 alu.dst.sel = ctx->temp_reg; 7321 alu.dst.chan = i; 7322 if (i == 3) 7323 alu.last = 1; 7324 alu.dst.write = 1; 7325 r = r600_bytecode_add_alu(ctx->bc, &alu); 7326 if (r) 7327 return r; 7328 } 7329 src_gpr = ctx->temp_reg; 7330 } 7331 7332 memset(&vtx, 0, sizeof(vtx)); 7333 vtx.op = FETCH_OP_VFETCH; 7334 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS; 7335 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 7336 vtx.src_gpr = src_gpr; 7337 vtx.mega_fetch_count = 16; 7338 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 7339 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */ 7340 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */ 7341 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */ 7342 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
3 : 7; /* SEL_W */ 7343 vtx.use_const_fields = 1; 7344 vtx.buffer_index_mode = sampler_index_mode; 7345 7346 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx))) 7347 return r; 7348 7349 if (ctx->bc->chip_class >= EVERGREEN) 7350 return 0; 7351 7352 for (i = 0; i < 4; i++) { 7353 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 7354 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 7355 continue; 7356 7357 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7358 alu.op = ALU_OP2_AND_INT; 7359 7360 alu.dst.chan = i; 7361 alu.dst.sel = vtx.dst_gpr; 7362 alu.dst.write = 1; 7363 7364 alu.src[0].sel = vtx.dst_gpr; 7365 alu.src[0].chan = i; 7366 7367 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL; 7368 alu.src[1].sel += (id * 2); 7369 alu.src[1].chan = i % 4; 7370 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 7371 7372 if (i == lasti) 7373 alu.last = 1; 7374 r = r600_bytecode_add_alu(ctx->bc, &alu); 7375 if (r) 7376 return r; 7377 } 7378 7379 if (inst->Dst[0].Register.WriteMask & 3) { 7380 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7381 alu.op = ALU_OP2_OR_INT; 7382 7383 alu.dst.chan = 3; 7384 alu.dst.sel = vtx.dst_gpr; 7385 alu.dst.write = 1; 7386 7387 alu.src[0].sel = vtx.dst_gpr; 7388 alu.src[0].chan = 3; 7389 7390 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1; 7391 alu.src[1].chan = 0; 7392 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 7393 7394 alu.last = 1; 7395 r = r600_bytecode_add_alu(ctx->bc, &alu); 7396 if (r) 7397 return r; 7398 } 7399 return 0; 7400} 7401 7402static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base) 7403{ 7404 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7405 int r; 7406 int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset; 7407 int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 7408 7409 if (ctx->bc->chip_class < EVERGREEN) { 7410 struct r600_bytecode_alu alu; 7411 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7412 alu.op = ALU_OP1_MOV; 7413 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; 7414 /* r600 we have them at channel 2 of the second dword */ 7415 alu.src[0].sel += (id * 2) + 1; 7416 alu.src[0].chan = 1; 7417 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 7418 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 7419 alu.last = 1; 7420 r = r600_bytecode_add_alu(ctx->bc, &alu); 7421 if (r) 7422 return r; 7423 return 0; 7424 } else { 7425 struct r600_bytecode_vtx vtx; 7426 memset(&vtx, 0, sizeof(vtx)); 7427 vtx.op = FETCH_OP_GET_BUFFER_RESINFO; 7428 vtx.buffer_id = id + eg_buffer_base; 7429 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 7430 vtx.src_gpr = 0; 7431 vtx.mega_fetch_count = 16; /* no idea here really... */ 7432 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 7433 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */ 7434 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7; /* SEL_Y */ 7435 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7; /* SEL_Z */ 7436 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
4 : 7; /* SEL_W */ 7437 vtx.data_format = FMT_32_32_32_32; 7438 vtx.buffer_index_mode = sampler_index_mode; 7439 7440 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx))) 7441 return r; 7442 return 0; 7443 } 7444} 7445 7446 7447static int tgsi_tex(struct r600_shader_ctx *ctx) 7448{ 7449 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7450 struct r600_bytecode_tex tex; 7451 struct r600_bytecode_tex grad_offs[3]; 7452 struct r600_bytecode_alu alu; 7453 unsigned src_gpr; 7454 int r, i, j, n_grad_offs = 0; 7455 int opcode; 7456 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing && 7457 inst->Instruction.Opcode == TGSI_OPCODE_TXF && 7458 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA || 7459 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA); 7460 7461 bool txf_add_offsets = inst->Texture.NumOffsets && 7462 inst->Instruction.Opcode == TGSI_OPCODE_TXF && 7463 inst->Texture.Texture != TGSI_TEXTURE_BUFFER; 7464 7465 /* Texture fetch instructions can only use gprs as source. 
7466 * Also they cannot negate the source or take the absolute value */ 7467 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS && 7468 tgsi_tex_src_requires_loading(ctx, 0)) || 7469 read_compressed_msaa || txf_add_offsets; 7470 7471 boolean src_loaded = FALSE; 7472 unsigned sampler_src_reg = 1; 7473 int8_t offset_x = 0, offset_y = 0, offset_z = 0; 7474 boolean has_txq_cube_array_z = false; 7475 unsigned sampler_index_mode; 7476 int array_index_offset_channel = -1; 7477 7478 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ && 7479 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 7480 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY))) 7481 if (inst->Dst[0].Register.WriteMask & 4) { 7482 ctx->shader->has_txq_cube_array_z_comp = true; 7483 has_txq_cube_array_z = true; 7484 } 7485 7486 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 || 7487 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7488 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 || 7489 inst->Instruction.Opcode == TGSI_OPCODE_TG4) 7490 sampler_src_reg = 2; 7491 7492 /* TGSI moves the sampler to src reg 3 for TXD */ 7493 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) 7494 sampler_src_reg = 3; 7495 7496 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 7497 7498 src_gpr = tgsi_tex_get_src_gpr(ctx, 0); 7499 7500 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) { 7501 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) { 7502 if (ctx->bc->chip_class < EVERGREEN) 7503 ctx->shader->uses_tex_buffers = true; 7504 return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS); 7505 } 7506 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) { 7507 if (ctx->bc->chip_class < EVERGREEN) 7508 ctx->shader->uses_tex_buffers = true; 7509 return do_vtx_fetch_inst(ctx, src_requires_loading); 7510 } 7511 } 7512 7513 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) { 7514 int out_chan; 7515 /* Add perspective divide */ 7516 if (ctx->bc->chip_class == CAYMAN) { 7517 out_chan = 2; 7518 for (i = 0; i < 3; i++) { 7519 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7520 alu.op = ALU_OP1_RECIP_IEEE; 7521 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7522 7523 alu.dst.sel = ctx->temp_reg; 7524 alu.dst.chan = i; 7525 if (i == 2) 7526 alu.last = 1; 7527 if (out_chan == i) 7528 alu.dst.write = 1; 7529 r = r600_bytecode_add_alu(ctx->bc, &alu); 7530 if (r) 7531 return r; 7532 } 7533 7534 } else { 7535 out_chan = 3; 7536 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7537 alu.op = ALU_OP1_RECIP_IEEE; 7538 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7539 7540 alu.dst.sel = ctx->temp_reg; 7541 alu.dst.chan = out_chan; 7542 alu.last = 1; 7543 alu.dst.write = 1; 7544 r = r600_bytecode_add_alu(ctx->bc, &alu); 7545 if (r) 7546 return r; 7547 } 7548 7549 for (i = 0; i < 3; i++) { 7550 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7551 alu.op = ALU_OP2_MUL; 7552 alu.src[0].sel = ctx->temp_reg; 7553 alu.src[0].chan = out_chan; 7554 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 7555 alu.dst.sel = ctx->temp_reg; 7556 alu.dst.chan = i; 7557 alu.dst.write = 1; 7558 r = r600_bytecode_add_alu(ctx->bc, &alu); 7559 if (r) 7560 return r; 7561 } 7562 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7563 
alu.op = ALU_OP1_MOV; 7564 alu.src[0].sel = V_SQ_ALU_SRC_1; 7565 alu.src[0].chan = 0; 7566 alu.dst.sel = ctx->temp_reg; 7567 alu.dst.chan = 3; 7568 alu.last = 1; 7569 alu.dst.write = 1; 7570 r = r600_bytecode_add_alu(ctx->bc, &alu); 7571 if (r) 7572 return r; 7573 src_loaded = TRUE; 7574 src_gpr = ctx->temp_reg; 7575 } 7576 7577 7578 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || 7579 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 7580 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 7581 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && 7582 inst->Instruction.Opcode != TGSI_OPCODE_TXQ) { 7583 7584 static const unsigned src0_swizzle[] = {2, 2, 0, 1}; 7585 static const unsigned src1_swizzle[] = {1, 0, 2, 2}; 7586 7587 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */ 7588 for (i = 0; i < 4; i++) { 7589 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7590 alu.op = ALU_OP2_CUBE; 7591 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]); 7592 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]); 7593 alu.dst.sel = ctx->temp_reg; 7594 alu.dst.chan = i; 7595 if (i == 3) 7596 alu.last = 1; 7597 alu.dst.write = 1; 7598 r = r600_bytecode_add_alu(ctx->bc, &alu); 7599 if (r) 7600 return r; 7601 } 7602 7603 /* tmp1.z = RCP_e(|tmp1.z|) */ 7604 if (ctx->bc->chip_class == CAYMAN) { 7605 for (i = 0; i < 3; i++) { 7606 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7607 alu.op = ALU_OP1_RECIP_IEEE; 7608 alu.src[0].sel = ctx->temp_reg; 7609 alu.src[0].chan = 2; 7610 alu.src[0].abs = 1; 7611 alu.dst.sel = ctx->temp_reg; 7612 alu.dst.chan = i; 7613 if (i == 2) 7614 alu.dst.write = 1; 7615 if (i == 2) 7616 alu.last = 1; 7617 r = r600_bytecode_add_alu(ctx->bc, &alu); 7618 if (r) 7619 return r; 7620 } 7621 } else { 7622 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7623 alu.op = ALU_OP1_RECIP_IEEE; 7624 alu.src[0].sel = ctx->temp_reg; 7625 alu.src[0].chan = 2; 7626 alu.src[0].abs = 1; 7627 alu.dst.sel = ctx->temp_reg; 7628 alu.dst.chan = 2; 7629 
alu.dst.write = 1; 7630 alu.last = 1; 7631 r = r600_bytecode_add_alu(ctx->bc, &alu); 7632 if (r) 7633 return r; 7634 } 7635 7636 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x 7637 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x 7638 * muladd has no writemask, have to use another temp 7639 */ 7640 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7641 alu.op = ALU_OP3_MULADD; 7642 alu.is_op3 = 1; 7643 7644 alu.src[0].sel = ctx->temp_reg; 7645 alu.src[0].chan = 0; 7646 alu.src[1].sel = ctx->temp_reg; 7647 alu.src[1].chan = 2; 7648 7649 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 7650 alu.src[2].chan = 0; 7651 alu.src[2].value = u_bitcast_f2u(1.5f); 7652 7653 alu.dst.sel = ctx->temp_reg; 7654 alu.dst.chan = 0; 7655 alu.dst.write = 1; 7656 7657 r = r600_bytecode_add_alu(ctx->bc, &alu); 7658 if (r) 7659 return r; 7660 7661 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7662 alu.op = ALU_OP3_MULADD; 7663 alu.is_op3 = 1; 7664 7665 alu.src[0].sel = ctx->temp_reg; 7666 alu.src[0].chan = 1; 7667 alu.src[1].sel = ctx->temp_reg; 7668 alu.src[1].chan = 2; 7669 7670 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 7671 alu.src[2].chan = 0; 7672 alu.src[2].value = u_bitcast_f2u(1.5f); 7673 7674 alu.dst.sel = ctx->temp_reg; 7675 alu.dst.chan = 1; 7676 alu.dst.write = 1; 7677 7678 alu.last = 1; 7679 r = r600_bytecode_add_alu(ctx->bc, &alu); 7680 if (r) 7681 return r; 7682 /* write initial compare value into Z component 7683 - W src 0 for shadow cube 7684 - X src 1 for shadow cube array */ 7685 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 7686 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 7687 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7688 alu.op = ALU_OP1_MOV; 7689 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) 7690 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 7691 else 7692 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7693 alu.dst.sel = ctx->temp_reg; 7694 alu.dst.chan = 2; 7695 alu.dst.write = 1; 7696 alu.last = 1; 7697 r = 
r600_bytecode_add_alu(ctx->bc, &alu); 7698 if (r) 7699 return r; 7700 } 7701 7702 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 7703 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 7704 if (ctx->bc->chip_class >= EVERGREEN) { 7705 int mytmp = r600_get_temp(ctx); 7706 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7707 alu.op = ALU_OP1_MOV; 7708 alu.src[0].sel = ctx->temp_reg; 7709 alu.src[0].chan = 3; 7710 alu.dst.sel = mytmp; 7711 alu.dst.chan = 0; 7712 alu.dst.write = 1; 7713 alu.last = 1; 7714 r = r600_bytecode_add_alu(ctx->bc, &alu); 7715 if (r) 7716 return r; 7717 7718 /* Evaluate the array index according to floor(idx + 0.5). This 7719 * needs to be done before merging the face select value, because 7720 * otherwise the fractional part of the array index will interfere 7721 * with the face select value */ 7722 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7723 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7724 alu.op = ALU_OP1_RNDNE; 7725 alu.dst.sel = ctx->temp_reg; 7726 alu.dst.chan = 3; 7727 alu.dst.write = 1; 7728 alu.last = 1; 7729 r = r600_bytecode_add_alu(ctx->bc, &alu); 7730 if (r) 7731 return r; 7732 7733 /* Because the array slice index and the cube face index are merged 7734 * into one value we have to make sure the array slice index is >= 0, 7735 * otherwise the face selection will fail */ 7736 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7737 alu.op = ALU_OP2_MAX; 7738 alu.src[0].sel = ctx->temp_reg; 7739 alu.src[0].chan = 3; 7740 alu.src[1].sel = V_SQ_ALU_SRC_0; 7741 alu.dst.sel = ctx->temp_reg; 7742 alu.dst.chan = 3; 7743 alu.dst.write = 1; 7744 alu.last = 1; 7745 r = r600_bytecode_add_alu(ctx->bc, &alu); 7746 if (r) 7747 return r; 7748 7749 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */ 7750 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7751 alu.op = ALU_OP3_MULADD; 7752 alu.is_op3 = 1; 7753 alu.src[0].sel = ctx->temp_reg; 7754 alu.src[0].chan = 3; 7755 alu.src[1].sel = 
V_SQ_ALU_SRC_LITERAL; 7756 alu.src[1].chan = 0; 7757 alu.src[1].value = u_bitcast_f2u(8.0f); 7758 alu.src[2].sel = mytmp; 7759 alu.src[2].chan = 0; 7760 alu.dst.sel = ctx->temp_reg; 7761 alu.dst.chan = 3; 7762 alu.dst.write = 1; 7763 alu.last = 1; 7764 r = r600_bytecode_add_alu(ctx->bc, &alu); 7765 if (r) 7766 return r; 7767 } else if (ctx->bc->chip_class < EVERGREEN) { 7768 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 7769 tex.op = FETCH_OP_SET_CUBEMAP_INDEX; 7770 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7771 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 7772 tex.src_gpr = r600_get_temp(ctx); 7773 tex.src_sel_x = 0; 7774 tex.src_sel_y = 0; 7775 tex.src_sel_z = 0; 7776 tex.src_sel_w = 0; 7777 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7; 7778 tex.coord_type_x = 1; 7779 tex.coord_type_y = 1; 7780 tex.coord_type_z = 1; 7781 tex.coord_type_w = 1; 7782 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7783 alu.op = ALU_OP1_MOV; 7784 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7785 alu.dst.sel = tex.src_gpr; 7786 alu.dst.chan = 0; 7787 alu.last = 1; 7788 alu.dst.write = 1; 7789 r = r600_bytecode_add_alu(ctx->bc, &alu); 7790 if (r) 7791 return r; 7792 7793 r = r600_bytecode_add_tex(ctx->bc, &tex); 7794 if (r) 7795 return r; 7796 } 7797 7798 } 7799 7800 /* for cube forms of lod and bias we need to route things */ 7801 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB || 7802 inst->Instruction.Opcode == TGSI_OPCODE_TXL || 7803 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7804 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) { 7805 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7806 alu.op = ALU_OP1_MOV; 7807 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7808 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) 7809 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 7810 else 7811 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7812 alu.dst.sel = ctx->temp_reg; 7813 alu.dst.chan = 2; 7814 alu.last = 1; 7815 
alu.dst.write = 1; 7816 r = r600_bytecode_add_alu(ctx->bc, &alu); 7817 if (r) 7818 return r; 7819 } 7820 7821 src_loaded = TRUE; 7822 src_gpr = ctx->temp_reg; 7823 } 7824 7825 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) { 7826 int temp_h = 0, temp_v = 0; 7827 int start_val = 0; 7828 7829 /* if we've already loaded the src (i.e. CUBE don't reload it). */ 7830 if (src_loaded == TRUE) 7831 start_val = 1; 7832 else 7833 src_loaded = TRUE; 7834 for (i = start_val; i < 3; i++) { 7835 int treg = r600_get_temp(ctx); 7836 7837 if (i == 0) 7838 src_gpr = treg; 7839 else if (i == 1) 7840 temp_h = treg; 7841 else 7842 temp_v = treg; 7843 7844 for (j = 0; j < 4; j++) { 7845 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7846 alu.op = ALU_OP1_MOV; 7847 r600_bytecode_src(&alu.src[0], &ctx->src[i], j); 7848 alu.dst.sel = treg; 7849 alu.dst.chan = j; 7850 if (j == 3) 7851 alu.last = 1; 7852 alu.dst.write = 1; 7853 r = r600_bytecode_add_alu(ctx->bc, &alu); 7854 if (r) 7855 return r; 7856 } 7857 } 7858 for (i = 1; i < 3; i++) { 7859 /* set gradients h/v */ 7860 struct r600_bytecode_tex *t = &grad_offs[n_grad_offs++]; 7861 memset(t, 0, sizeof(struct r600_bytecode_tex)); 7862 t->op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H : 7863 FETCH_OP_SET_GRADIENTS_V; 7864 t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7865 t->sampler_index_mode = sampler_index_mode; 7866 t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS; 7867 t->resource_index_mode = sampler_index_mode; 7868 7869 t->src_gpr = (i == 1) ? 
temp_h : temp_v; 7870 t->src_sel_x = 0; 7871 t->src_sel_y = 1; 7872 t->src_sel_z = 2; 7873 t->src_sel_w = 3; 7874 7875 t->dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */ 7876 t->dst_sel_x = t->dst_sel_y = t->dst_sel_z = t->dst_sel_w = 7; 7877 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) { 7878 t->coord_type_x = 1; 7879 t->coord_type_y = 1; 7880 t->coord_type_z = 1; 7881 t->coord_type_w = 1; 7882 } 7883 } 7884 } 7885 7886 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) { 7887 /* Gather4 should follow the same rules as bilinear filtering, but the hardware 7888 * incorrectly forces nearest filtering if the texture format is integer. 7889 * The only effect it has on Gather4, which always returns 4 texels for 7890 * bilinear filtering, is that the final coordinates are off by 0.5 of 7891 * the texel size. 7892 * 7893 * The workaround is to subtract 0.5 from the unnormalized coordinates, 7894 * or (0.5 / size) from the normalized coordinates. 7895 */ 7896 if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT || 7897 inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) { 7898 int treg = r600_get_temp(ctx); 7899 7900 /* mov array and comparison oordinate to temp_reg if needed */ 7901 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 7902 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 7903 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) { 7904 int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 
3 : 2; 7905 for (i = 2; i <= end; i++) { 7906 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7907 alu.op = ALU_OP1_MOV; 7908 alu.dst.sel = ctx->temp_reg; 7909 alu.dst.chan = i; 7910 alu.dst.write = 1; 7911 alu.last = (i == end); 7912 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 7913 r = r600_bytecode_add_alu(ctx->bc, &alu); 7914 if (r) 7915 return r; 7916 } 7917 } 7918 7919 if (inst->Texture.Texture == TGSI_TEXTURE_RECT || 7920 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) { 7921 for (i = 0; i < 2; i++) { 7922 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7923 alu.op = ALU_OP2_ADD; 7924 alu.dst.sel = ctx->temp_reg; 7925 alu.dst.chan = i; 7926 alu.dst.write = 1; 7927 alu.last = i == 1; 7928 if (src_loaded) { 7929 alu.src[0].sel = ctx->temp_reg; 7930 alu.src[0].chan = i; 7931 } else 7932 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 7933 alu.src[1].sel = V_SQ_ALU_SRC_0_5; 7934 alu.src[1].neg = 1; 7935 r = r600_bytecode_add_alu(ctx->bc, &alu); 7936 if (r) 7937 return r; 7938 } 7939 } else { 7940 /* execute a TXQ */ 7941 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 7942 tex.op = FETCH_OP_GET_TEXTURE_RESINFO; 7943 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7944 tex.sampler_index_mode = sampler_index_mode; 7945 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 7946 tex.resource_index_mode = sampler_index_mode; 7947 tex.dst_gpr = treg; 7948 tex.src_sel_x = 4; 7949 tex.src_sel_y = 4; 7950 tex.src_sel_z = 4; 7951 tex.src_sel_w = 4; 7952 tex.dst_sel_x = 0; 7953 tex.dst_sel_y = 1; 7954 tex.dst_sel_z = 7; 7955 tex.dst_sel_w = 7; 7956 r = r600_bytecode_add_tex(ctx->bc, &tex); 7957 if (r) 7958 return r; 7959 7960 /* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */ 7961 if (ctx->bc->chip_class == CAYMAN) { 7962 /* */ 7963 for (i = 0; i < 2; i++) { 7964 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7965 alu.op = ALU_OP1_INT_TO_FLT; 7966 alu.dst.sel = treg; 7967 alu.dst.chan = i; 7968 alu.dst.write = 1; 7969 
alu.src[0].sel = treg; 7970 alu.src[0].chan = i; 7971 alu.last = (i == 1) ? 1 : 0; 7972 r = r600_bytecode_add_alu(ctx->bc, &alu); 7973 if (r) 7974 return r; 7975 } 7976 for (j = 0; j < 2; j++) { 7977 for (i = 0; i < 3; i++) { 7978 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7979 alu.op = ALU_OP1_RECIP_IEEE; 7980 alu.src[0].sel = treg; 7981 alu.src[0].chan = j; 7982 alu.dst.sel = treg; 7983 alu.dst.chan = i; 7984 if (i == 2) 7985 alu.last = 1; 7986 if (i == j) 7987 alu.dst.write = 1; 7988 r = r600_bytecode_add_alu(ctx->bc, &alu); 7989 if (r) 7990 return r; 7991 } 7992 } 7993 } else { 7994 for (i = 0; i < 2; i++) { 7995 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7996 alu.op = ALU_OP1_INT_TO_FLT; 7997 alu.dst.sel = treg; 7998 alu.dst.chan = i; 7999 alu.dst.write = 1; 8000 alu.src[0].sel = treg; 8001 alu.src[0].chan = i; 8002 alu.last = 1; 8003 r = r600_bytecode_add_alu(ctx->bc, &alu); 8004 if (r) 8005 return r; 8006 } 8007 for (i = 0; i < 2; i++) { 8008 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8009 alu.op = ALU_OP1_RECIP_IEEE; 8010 alu.src[0].sel = treg; 8011 alu.src[0].chan = i; 8012 alu.dst.sel = treg; 8013 alu.dst.chan = i; 8014 alu.last = 1; 8015 alu.dst.write = 1; 8016 r = r600_bytecode_add_alu(ctx->bc, &alu); 8017 if (r) 8018 return r; 8019 } 8020 } 8021 for (i = 0; i < 2; i++) { 8022 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8023 alu.op = ALU_OP3_MULADD; 8024 alu.is_op3 = 1; 8025 alu.dst.sel = ctx->temp_reg; 8026 alu.dst.chan = i; 8027 alu.dst.write = 1; 8028 alu.last = i == 1; 8029 alu.src[0].sel = treg; 8030 alu.src[0].chan = i; 8031 alu.src[1].sel = V_SQ_ALU_SRC_0_5; 8032 alu.src[1].neg = 1; 8033 if (src_loaded) { 8034 alu.src[2].sel = ctx->temp_reg; 8035 alu.src[2].chan = i; 8036 } else 8037 r600_bytecode_src(&alu.src[2], &ctx->src[0], i); 8038 r = r600_bytecode_add_alu(ctx->bc, &alu); 8039 if (r) 8040 return r; 8041 } 8042 } 8043 src_loaded = TRUE; 8044 src_gpr = ctx->temp_reg; 8045 } 8046 } 8047 8048 if 
(src_requires_loading && !src_loaded) { 8049 for (i = 0; i < 4; i++) { 8050 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8051 alu.op = ALU_OP1_MOV; 8052 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 8053 alu.dst.sel = ctx->temp_reg; 8054 alu.dst.chan = i; 8055 if (i == 3) 8056 alu.last = 1; 8057 alu.dst.write = 1; 8058 r = r600_bytecode_add_alu(ctx->bc, &alu); 8059 if (r) 8060 return r; 8061 } 8062 src_loaded = TRUE; 8063 src_gpr = ctx->temp_reg; 8064 } 8065 8066 /* get offset values */ 8067 if (inst->Texture.NumOffsets) { 8068 assert(inst->Texture.NumOffsets == 1); 8069 8070 /* The texture offset feature doesn't work with the TXF instruction 8071 * and must be emulated by adding the offset to the texture coordinates. */ 8072 if (txf_add_offsets) { 8073 const struct tgsi_texture_offset *off = inst->TexOffsets; 8074 8075 switch (inst->Texture.Texture) { 8076 case TGSI_TEXTURE_3D: 8077 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8078 alu.op = ALU_OP2_ADD_INT; 8079 alu.src[0].sel = src_gpr; 8080 alu.src[0].chan = 2; 8081 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8082 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ]; 8083 alu.dst.sel = src_gpr; 8084 alu.dst.chan = 2; 8085 alu.dst.write = 1; 8086 alu.last = 1; 8087 r = r600_bytecode_add_alu(ctx->bc, &alu); 8088 if (r) 8089 return r; 8090 /* fall through */ 8091 8092 case TGSI_TEXTURE_2D: 8093 case TGSI_TEXTURE_SHADOW2D: 8094 case TGSI_TEXTURE_RECT: 8095 case TGSI_TEXTURE_SHADOWRECT: 8096 case TGSI_TEXTURE_2D_ARRAY: 8097 case TGSI_TEXTURE_SHADOW2D_ARRAY: 8098 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8099 alu.op = ALU_OP2_ADD_INT; 8100 alu.src[0].sel = src_gpr; 8101 alu.src[0].chan = 1; 8102 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8103 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY]; 8104 alu.dst.sel = src_gpr; 8105 alu.dst.chan = 1; 8106 alu.dst.write = 1; 8107 alu.last = 1; 8108 r = r600_bytecode_add_alu(ctx->bc, &alu); 8109 if (r) 8110 return r; 8111 /* 
fall through */ 8112 8113 case TGSI_TEXTURE_1D: 8114 case TGSI_TEXTURE_SHADOW1D: 8115 case TGSI_TEXTURE_1D_ARRAY: 8116 case TGSI_TEXTURE_SHADOW1D_ARRAY: 8117 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8118 alu.op = ALU_OP2_ADD_INT; 8119 alu.src[0].sel = src_gpr; 8120 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8121 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX]; 8122 alu.dst.sel = src_gpr; 8123 alu.dst.write = 1; 8124 alu.last = 1; 8125 r = r600_bytecode_add_alu(ctx->bc, &alu); 8126 if (r) 8127 return r; 8128 break; 8129 /* texture offsets do not apply to other texture targets */ 8130 } 8131 } else { 8132 switch (inst->Texture.Texture) { 8133 case TGSI_TEXTURE_3D: 8134 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1; 8135 /* fallthrough */ 8136 case TGSI_TEXTURE_2D: 8137 case TGSI_TEXTURE_SHADOW2D: 8138 case TGSI_TEXTURE_RECT: 8139 case TGSI_TEXTURE_SHADOWRECT: 8140 case TGSI_TEXTURE_2D_ARRAY: 8141 case TGSI_TEXTURE_SHADOW2D_ARRAY: 8142 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1; 8143 /* fallthrough */ 8144 case TGSI_TEXTURE_1D: 8145 case TGSI_TEXTURE_SHADOW1D: 8146 case TGSI_TEXTURE_1D_ARRAY: 8147 case TGSI_TEXTURE_SHADOW1D_ARRAY: 8148 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1; 8149 } 8150 } 8151 } 8152 8153 /* Obtain the sample index for reading a compressed MSAA color texture. 8154 * To read the FMASK, we use the ldfptr instruction, which tells us 8155 * where the samples are stored. 8156 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210, 8157 * which is the identity mapping. Each nibble says which physical sample 8158 * should be fetched to get that sample. 8159 * 8160 * Assume src.z contains the sample index. It should be modified like this: 8161 * src.z = (ldfptr() >> (src.z * 4)) & 0xF; 8162 * Then fetch the texel with src. 
8163 */ 8164 if (read_compressed_msaa) { 8165 unsigned sample_chan = 3; 8166 unsigned temp = r600_get_temp(ctx); 8167 assert(src_loaded); 8168 8169 /* temp.w = ldfptr() */ 8170 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 8171 tex.op = FETCH_OP_LD; 8172 tex.inst_mod = 1; /* to indicate this is ldfptr */ 8173 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 8174 tex.sampler_index_mode = sampler_index_mode; 8175 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 8176 tex.resource_index_mode = sampler_index_mode; 8177 tex.src_gpr = src_gpr; 8178 tex.dst_gpr = temp; 8179 tex.dst_sel_x = 7; /* mask out these components */ 8180 tex.dst_sel_y = 7; 8181 tex.dst_sel_z = 7; 8182 tex.dst_sel_w = 0; /* store X */ 8183 tex.src_sel_x = 0; 8184 tex.src_sel_y = 1; 8185 tex.src_sel_z = 2; 8186 tex.src_sel_w = 3; 8187 tex.offset_x = offset_x; 8188 tex.offset_y = offset_y; 8189 tex.offset_z = offset_z; 8190 r = r600_bytecode_add_tex(ctx->bc, &tex); 8191 if (r) 8192 return r; 8193 8194 /* temp.x = sample_index*4 */ 8195 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8196 alu.op = ALU_OP2_MULLO_INT; 8197 alu.src[0].sel = src_gpr; 8198 alu.src[0].chan = sample_chan; 8199 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8200 alu.src[1].value = 4; 8201 alu.dst.sel = temp; 8202 alu.dst.chan = 0; 8203 alu.dst.write = 1; 8204 r = emit_mul_int_op(ctx->bc, &alu); 8205 if (r) 8206 return r; 8207 8208 /* sample_index = temp.w >> temp.x */ 8209 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8210 alu.op = ALU_OP2_LSHR_INT; 8211 alu.src[0].sel = temp; 8212 alu.src[0].chan = 3; 8213 alu.src[1].sel = temp; 8214 alu.src[1].chan = 0; 8215 alu.dst.sel = src_gpr; 8216 alu.dst.chan = sample_chan; 8217 alu.dst.write = 1; 8218 alu.last = 1; 8219 r = r600_bytecode_add_alu(ctx->bc, &alu); 8220 if (r) 8221 return r; 8222 8223 /* sample_index & 0xF */ 8224 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8225 alu.op = ALU_OP2_AND_INT; 8226 alu.src[0].sel = src_gpr; 8227 alu.src[0].chan 
= sample_chan; 8228 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 8229 alu.src[1].value = 0xF; 8230 alu.dst.sel = src_gpr; 8231 alu.dst.chan = sample_chan; 8232 alu.dst.write = 1; 8233 alu.last = 1; 8234 r = r600_bytecode_add_alu(ctx->bc, &alu); 8235 if (r) 8236 return r; 8237#if 0 8238 /* visualize the FMASK */ 8239 for (i = 0; i < 4; i++) { 8240 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8241 alu.op = ALU_OP1_INT_TO_FLT; 8242 alu.src[0].sel = src_gpr; 8243 alu.src[0].chan = sample_chan; 8244 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 8245 alu.dst.chan = i; 8246 alu.dst.write = 1; 8247 alu.last = 1; 8248 r = r600_bytecode_add_alu(ctx->bc, &alu); 8249 if (r) 8250 return r; 8251 } 8252 return 0; 8253#endif 8254 } 8255 8256 /* does this shader want a num layers from TXQ for a cube array? */ 8257 if (has_txq_cube_array_z) { 8258 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 8259 8260 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8261 alu.op = ALU_OP1_MOV; 8262 8263 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; 8264 if (ctx->bc->chip_class >= EVERGREEN) { 8265 /* with eg each dword is number of cubes */ 8266 alu.src[0].sel += id / 4; 8267 alu.src[0].chan = id % 4; 8268 } else { 8269 /* r600 we have them at channel 2 of the second dword */ 8270 alu.src[0].sel += (id * 2) + 1; 8271 alu.src[0].chan = 2; 8272 } 8273 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 8274 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst); 8275 alu.last = 1; 8276 r = r600_bytecode_add_alu(ctx->bc, &alu); 8277 if (r) 8278 return r; 8279 /* disable writemask from texture instruction */ 8280 inst->Dst[0].Register.WriteMask &= ~4; 8281 } 8282 8283 opcode = ctx->inst_info->op; 8284 if (opcode == FETCH_OP_GATHER4 && 8285 inst->TexOffsets[0].File != TGSI_FILE_NULL && 8286 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) { 8287 struct r600_bytecode_tex *t; 8288 opcode = FETCH_OP_GATHER4_O; 8289 8290 /* GATHER4_O/GATHER4_C_O use offset values 
loaded by 8291 SET_TEXTURE_OFFSETS instruction. The immediate offset values 8292 encoded in the instruction are ignored. */ 8293 t = &grad_offs[n_grad_offs++]; 8294 memset(t, 0, sizeof(struct r600_bytecode_tex)); 8295 t->op = FETCH_OP_SET_TEXTURE_OFFSETS; 8296 t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 8297 t->sampler_index_mode = sampler_index_mode; 8298 t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS; 8299 t->resource_index_mode = sampler_index_mode; 8300 8301 t->src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index; 8302 t->src_sel_x = inst->TexOffsets[0].SwizzleX; 8303 t->src_sel_y = inst->TexOffsets[0].SwizzleY; 8304 if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 8305 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) 8306 /* make sure array index selector is 0, this is just a safety 8307 * precausion because TGSI seems to emit something strange here */ 8308 t->src_sel_z = 4; 8309 else 8310 t->src_sel_z = inst->TexOffsets[0].SwizzleZ; 8311 8312 t->src_sel_w = 4; 8313 8314 t->dst_sel_x = 7; 8315 t->dst_sel_y = 7; 8316 t->dst_sel_z = 7; 8317 t->dst_sel_w = 7; 8318 } 8319 8320 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || 8321 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 8322 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT || 8323 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 8324 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY || 8325 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY || 8326 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 8327 switch (opcode) { 8328 case FETCH_OP_SAMPLE: 8329 opcode = FETCH_OP_SAMPLE_C; 8330 break; 8331 case FETCH_OP_SAMPLE_L: 8332 opcode = FETCH_OP_SAMPLE_C_L; 8333 break; 8334 case FETCH_OP_SAMPLE_LB: 8335 opcode = FETCH_OP_SAMPLE_C_LB; 8336 break; 8337 case FETCH_OP_SAMPLE_G: 8338 opcode = FETCH_OP_SAMPLE_C_G; 8339 break; 8340 /* Texture gather variants */ 8341 case FETCH_OP_GATHER4: 8342 opcode = FETCH_OP_GATHER4_C; 8343 break; 
8344 case FETCH_OP_GATHER4_O: 8345 opcode = FETCH_OP_GATHER4_C_O; 8346 break; 8347 } 8348 } 8349 8350 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 8351 tex.op = opcode; 8352 8353 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 8354 tex.sampler_index_mode = sampler_index_mode; 8355 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 8356 tex.resource_index_mode = sampler_index_mode; 8357 tex.src_gpr = src_gpr; 8358 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 8359 8360 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE || 8361 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) { 8362 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */ 8363 } 8364 8365 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) { 8366 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX]; 8367 tex.inst_mod = texture_component_select; 8368 8369 if (ctx->bc->chip_class == CAYMAN) { 8370 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; 8371 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 8372 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; 8373 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; 8374 } else { 8375 /* GATHER4 result order is different from TGSI TG4 */ 8376 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7; 8377 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7; 8378 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7; 8379 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; 8380 } 8381 } 8382 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) { 8383 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 8384 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 
0 : 7; 8385 tex.dst_sel_z = 7; 8386 tex.dst_sel_w = 7; 8387 } 8388 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) { 8389 tex.dst_sel_x = 3; 8390 tex.dst_sel_y = 7; 8391 tex.dst_sel_z = 7; 8392 tex.dst_sel_w = 7; 8393 } 8394 else { 8395 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; 8396 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 8397 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; 8398 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; 8399 } 8400 8401 8402 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) { 8403 tex.src_sel_x = 4; 8404 tex.src_sel_y = 4; 8405 tex.src_sel_z = 4; 8406 tex.src_sel_w = 4; 8407 } else if (src_loaded) { 8408 tex.src_sel_x = 0; 8409 tex.src_sel_y = 1; 8410 tex.src_sel_z = 2; 8411 tex.src_sel_w = 3; 8412 } else { 8413 tex.src_sel_x = ctx->src[0].swizzle[0]; 8414 tex.src_sel_y = ctx->src[0].swizzle[1]; 8415 tex.src_sel_z = ctx->src[0].swizzle[2]; 8416 tex.src_sel_w = ctx->src[0].swizzle[3]; 8417 tex.src_rel = ctx->src[0].rel; 8418 } 8419 8420 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE || 8421 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 8422 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 8423 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 8424 tex.src_sel_x = 1; 8425 tex.src_sel_y = 0; 8426 tex.src_sel_z = 3; 8427 tex.src_sel_w = 2; /* route Z compare or Lod value into W */ 8428 } 8429 8430 if (inst->Texture.Texture != TGSI_TEXTURE_RECT && 8431 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) { 8432 tex.coord_type_x = 1; 8433 tex.coord_type_y = 1; 8434 } 8435 tex.coord_type_z = 1; 8436 tex.coord_type_w = 1; 8437 8438 tex.offset_x = offset_x; 8439 tex.offset_y = offset_y; 8440 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 && 8441 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 8442 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) { 8443 tex.offset_z = 0; 8444 } 8445 else { 8446 tex.offset_z = offset_z; 8447 } 8448 8449 /* Put the 
depth for comparison in W. 8450 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W. 8451 * Some instructions expect the depth in Z. */ 8452 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || 8453 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 8454 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT || 8455 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) && 8456 opcode != FETCH_OP_SAMPLE_C_L && 8457 opcode != FETCH_OP_SAMPLE_C_LB) { 8458 tex.src_sel_w = tex.src_sel_z; 8459 } 8460 8461 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY || 8462 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) { 8463 if (opcode == FETCH_OP_SAMPLE_C_L || 8464 opcode == FETCH_OP_SAMPLE_C_LB) { 8465 /* the array index is read from Y */ 8466 tex.coord_type_y = 0; 8467 array_index_offset_channel = tex.src_sel_y; 8468 } else { 8469 /* the array index is read from Z */ 8470 tex.coord_type_z = 0; 8471 tex.src_sel_z = tex.src_sel_y; 8472 array_index_offset_channel = tex.src_sel_z; 8473 } 8474 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 8475 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) { 8476 tex.coord_type_z = 0; 8477 array_index_offset_channel = tex.src_sel_z; 8478 } else if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 8479 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && 8480 (ctx->bc->chip_class >= EVERGREEN)) 8481 /* the array index is read from Z, coordinate will be corrected elsewhere */ 8482 tex.coord_type_z = 0; 8483 8484 /* We have array access to 1D or 2D ARRAY, the coordinates are not int -> 8485 * evaluate the array index */ 8486 if (array_index_offset_channel >= 0 && 8487 opcode != FETCH_OP_LD && 8488 opcode != FETCH_OP_GET_TEXTURE_RESINFO) { 8489 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8490 alu.src[0].sel = tex.src_gpr; 8491 alu.src[0].chan = array_index_offset_channel; 8492 alu.src[0].rel = tex.src_rel; 8493 alu.op = ALU_OP1_RNDNE; 8494 alu.dst.sel = tex.src_gpr; 8495 alu.dst.chan = 
array_index_offset_channel;
		alu.dst.rel = tex.src_rel;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* mask unused source components */
	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
		switch (inst->Texture.Texture) {
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_RECT:
			tex.src_sel_z = 7;
			tex.src_sel_w = 7;
			break;
		case TGSI_TEXTURE_1D_ARRAY:
			tex.src_sel_y = 7;
			tex.src_sel_w = 7;
			break;
		case TGSI_TEXTURE_1D:
			tex.src_sel_y = 7;
			tex.src_sel_z = 7;
			tex.src_sel_w = 7;
			break;
		}
	}

	/* Emit set gradient and offset instructions. */
	for (i = 0; i < n_grad_offs; ++i) {
		r = r600_bytecode_add_tex(ctx->bc, &grad_offs[i]);
		if (r)
			return r;
	}

	r = r600_bytecode_add_tex(ctx->bc, &tex);
	if (r)
		return r;

	/* add shadow ambient support - gallium doesn't do it yet */
	return 0;
}

/* Map a TGSI hardware-atomic source register to its hardware counter index.
 *
 * For indirect accesses the atomic range is matched by ArrayID; for direct
 * accesses the range is matched by buffer id and index interval, and the
 * offset within the range is added to the range's base hw index.
 * Asserts (and returns -1) if the register maps to no known atomic range.
 */
static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
				  struct tgsi_full_src_register *src)
{
	unsigned i;

	if (src->Register.Indirect) {
		/* indirect: the ArrayID identifies the whole range */
		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
			if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
				return ctx->shader->atomics[i].hw_idx;
		}
	} else {
		uint32_t index = src->Register.Index;
		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
			/* the buffer (Dimension.Index) must match and the
			 * index must fall inside [start, end] of the range */
			if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
				continue;
			if (index > ctx->shader->atomics[i].end)
				continue;
			if (index < ctx->shader->atomics[i].start)
				continue;
			uint32_t offset = (index - ctx->shader->atomics[i].start);
			return ctx->shader->atomics[i].hw_idx + offset;
		}
	}
	assert(0);
	return -1;
}

/* Work out the GDS uav id / index mode for an atomic-counter access.
 *
 * On Cayman the GDS address is taken from a register, so temp_reg.x is
 * preloaded with the byte offset (uav_id * 4, plus the indirect index
 * shifted left by 2 when the access is indirect).  On other chips the
 * uav id is encoded in the instruction and indirect accesses just select
 * index mode 2.  Results are returned through uav_id_p / uav_index_mode_p.
 */
static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
			     int *uav_id_p, int *uav_index_mode_p)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int uav_id, uav_index_mode = 0;
	int r;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);

	if (inst->Src[0].Register.Indirect) {
		if (is_cm) {
			/* temp_reg.x = indirect_index << 2 (counters are dwords) */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_LSHL_INT;
			alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
			alu.src[0].chan = 0;
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 2;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			/* temp_reg.x += uav_id * 4 (base byte offset) */
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   ctx->temp_reg, 0,
					   ctx->temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, uav_id * 4);
			if (r)
				return r;
		} else
			uav_index_mode = 2;
	} else if (is_cm) {
		/* direct access on Cayman: temp_reg.x = uav_id * 4 */
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   ctx->temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, uav_id * 4,
				   0, 0);
		if (r)
			return r;
	}
	*uav_id_p = uav_id;
	*uav_index_mode_p = uav_index_mode;
	return 0;
}

/* Read an atomic counter (TGSI_FILE_HW_ATOMIC) via a GDS_READ_RET fetch. */
static int tgsi_load_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_gds gds;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	/* resolve the uav id/index mode, and on Cayman set up temp_reg.x */
	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = FETCH_OP_GDS_READ_RET;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	/* on Cayman the address comes from src_gpr instead of uav fields */
	gds.uav_id = is_cm ? 0 : uav_id;
	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	gds.src_sel_x = (is_cm) ?
0 : 4;
	gds.src_sel_y = 4;
	gds.src_sel_z = 4;
	/* only X of the result is live; 7 masks the other components */
	gds.dst_sel_x = 0;
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.src_gpr2 = 0;
	gds.alloc_consume = !is_cm;
	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;

	ctx->bc->cf_last->vpm = 1;
	return 0;
}

/* this fixes up 1D arrays properly */
/* Copy an image/memory coordinate into a fresh temp, normalizing the layout:
 * unused components are written as literal 0, and for 1D arrays the layer
 * (component 1 of the TGSI source) is routed into component 2.  The temp
 * register holding the fixed-up index is returned through idx_gpr.
 */
static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r, i;
	struct r600_bytecode_alu alu;
	int temp_reg = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		bool def_val = true, write_zero = false;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp_reg;
		alu.dst.chan = i;

		switch (inst->Memory.Texture) {
		case TGSI_TEXTURE_BUFFER:
		case TGSI_TEXTURE_1D:
			/* only X is meaningful */
			if (i == 1 || i == 2 || i == 3) {
				write_zero = true;
			}
			break;
		case TGSI_TEXTURE_1D_ARRAY:
			if (i == 1 || i == 3)
				write_zero = true;
			else if (i == 2) {
				/* move the array layer from Y into Z */
				r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
				def_val = false;
			}
			break;
		case TGSI_TEXTURE_2D:
			if (i == 2 || i == 3)
				write_zero = true;
			break;
		default:
			if (i == 3)
				write_zero = true;
			break;
		}

		if (write_zero) {
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0;
		} else if (def_val) {
			/* default: straight copy of the same component */
			r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
		}

		if (i == 3)
			alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	*idx_gpr = temp_reg;
	return 0;
}

/* Put the buffer element index (byte offset >> 2, i.e. dword index) of
 * source src_idx into temp_reg.x.  Immediate offsets are shifted at
 * compile time; otherwise an LSHR by 2 is emitted.
 */
static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
			     int temp_reg)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
		int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, value >> 2,
				   0, 0);
		if (r)
			return r;
	} else {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHR_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 2;
		alu.dst.sel = temp_reg;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Load from a TGSI_FILE_BUFFER resource with a plain VFETCH; the data
 * format is picked from the destination write mask (1-4 dwords).
 */
static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int temp_reg = r600_get_temp(ctx);
	unsigned rat_index_mode;
	unsigned base;

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];

	/* temp_reg.x = element index from source 1 */
	r = load_buffer_coord(ctx, 1, temp_reg);
	if (r)
		return r;
	ctx->bc->cf_last->barrier = 1;
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = inst->Src[0].Register.Index + base;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = temp_reg;
	vtx.src_sel_x = 0;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ?
1 : 7;		/* SEL_Y */
	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
	vtx.num_format_all = 1;
	vtx.format_comp_all = 1;
	vtx.srf_mode_all = 0;

	/* fetch exactly as many dwords as the highest written component needs */
	if (inst->Dst[0].Register.WriteMask & 8) {
		vtx.data_format = FMT_32_32_32_32;
		vtx.use_const_fields = 0;
	} else if (inst->Dst[0].Register.WriteMask & 4) {
		vtx.data_format = FMT_32_32_32;
		vtx.use_const_fields = 0;
	} else if (inst->Dst[0].Register.WriteMask & 2) {
		vtx.data_format = FMT_32_32;
		vtx.use_const_fields = 0;
	} else {
		vtx.data_format = FMT_32;
		vtx.use_const_fields = 0;
	}

	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	return 0;
}

/* Load from a RAT (image): issue a MEM_RAT NOP_RTN read, wait for the
 * ACK, then VFETCH the returned value out of the immediate return buffer
 * with the swizzle/format derived from the image's declared format.
 */
static int tgsi_load_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	/* normalize the image coordinate into idx_gpr */
	r = load_index_src(ctx, 1, &idx_gpr);
	if (r)
		return r;

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* the RAT read itself */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = V_RAT_INST_NOP_RTN;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;

	/* the read must complete before we fetch the result */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;

	desc = util_format_description(inst->Memory.Format);
	r600_vertex_data_type(inst->Memory.Format,
			      &format, &num_format, &format_comp, &endian);
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = desc->swizzle[0];
	vtx.dst_sel_y = desc->swizzle[1];
	vtx.dst_sel_z = desc->swizzle[2];
	vtx.dst_sel_w = desc->swizzle[3];
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 3;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	return 0;
}

8867static int tgsi_load_lds(struct r600_shader_ctx *ctx) 8868{ 8869 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8870 struct r600_bytecode_alu alu; 8871 int r; 8872 int temp_reg = r600_get_temp(ctx); 8873 8874 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8875 alu.op = ALU_OP1_MOV; 8876 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 8877 alu.dst.sel = temp_reg; 8878 alu.dst.write = 1; 8879 alu.last = 1; 8880 r = r600_bytecode_add_alu(ctx->bc, &alu); 8881 if (r) 8882 return r; 8883 8884 r = do_lds_fetch_values(ctx, temp_reg, 8885 ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask); 8886 if (r) 8887 return r; 8888 return 0; 8889} 8890 8891static int tgsi_load(struct r600_shader_ctx *ctx) 8892{ 8893 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8894 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) 8895 return tgsi_load_rat(ctx); 8896 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC) 8897 return tgsi_load_gds(ctx); 8898 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) 8899 return tgsi_load_buffer(ctx); 8900 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) 8901 return tgsi_load_lds(ctx); 8902 return 0; 8903} 8904 8905static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx) 8906{ 8907 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8908 struct r600_bytecode_cf *cf; 8909 int r, i; 8910 unsigned rat_index_mode; 8911 int lasti; 8912 int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx); 8913 8914 r = load_buffer_coord(ctx, 0, treg2); 8915 if (r) 8916 return r; 8917 8918 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 8919 if (rat_index_mode) 8920 egcm_load_index_reg(ctx->bc, 1, false); 8921 8922 for (i = 0; i <= 3; i++) { 8923 struct r600_bytecode_alu alu; 8924 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8925 alu.op = ALU_OP1_MOV; 8926 alu.dst.sel = temp_reg; 8927 alu.dst.chan = i; 8928 alu.src[0].sel = V_SQ_ALU_SRC_0; 8929 alu.last = (i == 3); 8930 alu.dst.write = 1; 8931 r = r600_bytecode_add_alu(ctx->bc, &alu); 8932 if (r) 8933 return r; 8934 } 8935 8936 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 8937 for (i = 0; i <= lasti; i++) { 8938 struct r600_bytecode_alu alu; 8939 if (!((1 << i) & inst->Dst[0].Register.WriteMask)) 8940 continue; 8941 8942 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 8943 temp_reg, 0, 8944 treg2, 0, 8945 V_SQ_ALU_SRC_LITERAL, i); 8946 if (r) 8947 return r; 8948 8949 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8950 alu.op = ALU_OP1_MOV; 8951 alu.dst.sel = ctx->temp_reg; 8952 alu.dst.chan = 0; 8953 8954 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 8955 alu.last = 1; 8956 alu.dst.write = 1; 8957 r = r600_bytecode_add_alu(ctx->bc, &alu); 8958 if (r) 8959 return r; 8960 8961 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT); 8962 cf = ctx->bc->cf_last; 8963 8964 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE]; 8965 cf->rat.inst = V_RAT_INST_STORE_TYPED; 8966 cf->rat.index_mode = rat_index_mode; 8967 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; 8968 cf->output.gpr = ctx->temp_reg; 8969 cf->output.index_gpr = temp_reg; 8970 cf->output.comp_mask = 1; 8971 cf->output.burst_count = 1; 8972 cf->vpm = 1; 8973 cf->barrier = 1; 8974 cf->output.elem_size = 0; 8975 } 8976 return 0; 8977} 8978 8979static int tgsi_store_rat(struct r600_shader_ctx *ctx) 8980{ 8981 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8982 struct r600_bytecode_cf *cf; 8983 bool src_requires_loading = false; 8984 int 
val_gpr, idx_gpr; 8985 int r, i; 8986 unsigned rat_index_mode; 8987 8988 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 8989 8990 r = load_index_src(ctx, 0, &idx_gpr); 8991 if (r) 8992 return r; 8993 8994 if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY) 8995 src_requires_loading = true; 8996 8997 if (src_requires_loading) { 8998 struct r600_bytecode_alu alu; 8999 for (i = 0; i < 4; i++) { 9000 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9001 alu.op = ALU_OP1_MOV; 9002 alu.dst.sel = ctx->temp_reg; 9003 alu.dst.chan = i; 9004 9005 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 9006 if (i == 3) 9007 alu.last = 1; 9008 alu.dst.write = 1; 9009 r = r600_bytecode_add_alu(ctx->bc, &alu); 9010 if (r) 9011 return r; 9012 } 9013 val_gpr = ctx->temp_reg; 9014 } else 9015 val_gpr = tgsi_tex_get_src_gpr(ctx, 1); 9016 if (rat_index_mode) 9017 egcm_load_index_reg(ctx->bc, 1, false); 9018 9019 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT); 9020 cf = ctx->bc->cf_last; 9021 9022 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index; 9023 cf->rat.inst = V_RAT_INST_STORE_TYPED; 9024 cf->rat.index_mode = rat_index_mode; 9025 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; 9026 cf->output.gpr = val_gpr; 9027 cf->output.index_gpr = idx_gpr; 9028 cf->output.comp_mask = 0xf; 9029 cf->output.burst_count = 1; 9030 cf->vpm = 1; 9031 cf->barrier = 1; 9032 cf->output.elem_size = 0; 9033 return 0; 9034} 9035 9036static int tgsi_store_lds(struct r600_shader_ctx *ctx) 9037{ 9038 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9039 struct r600_bytecode_alu alu; 9040 int r, i, lasti; 9041 int write_mask = inst->Dst[0].Register.WriteMask; 9042 int temp_reg = r600_get_temp(ctx); 9043 9044 /* LDS write */ 9045 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9046 alu.op = ALU_OP1_MOV; 9047 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9048 alu.dst.sel = temp_reg; 9049 alu.dst.write = 
1; 9050 alu.last = 1; 9051 r = r600_bytecode_add_alu(ctx->bc, &alu); 9052 if (r) 9053 return r; 9054 9055 lasti = tgsi_last_instruction(write_mask); 9056 for (i = 1; i <= lasti; i++) { 9057 if (!(write_mask & (1 << i))) 9058 continue; 9059 r = single_alu_op2(ctx, ALU_OP2_ADD_INT, 9060 temp_reg, i, 9061 temp_reg, 0, 9062 V_SQ_ALU_SRC_LITERAL, 4 * i); 9063 if (r) 9064 return r; 9065 } 9066 for (i = 0; i <= lasti; i++) { 9067 if (!(write_mask & (1 << i))) 9068 continue; 9069 9070 if ((i == 0 && ((write_mask & 3) == 3)) || 9071 (i == 2 && ((write_mask & 0xc) == 0xc))) { 9072 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9073 alu.op = LDS_OP3_LDS_WRITE_REL; 9074 9075 alu.src[0].sel = temp_reg; 9076 alu.src[0].chan = i; 9077 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 9078 r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1); 9079 alu.last = 1; 9080 alu.is_lds_idx_op = true; 9081 alu.lds_idx = 1; 9082 r = r600_bytecode_add_alu(ctx->bc, &alu); 9083 if (r) 9084 return r; 9085 i += 1; 9086 continue; 9087 } 9088 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9089 alu.op = LDS_OP2_LDS_WRITE; 9090 9091 alu.src[0].sel = temp_reg; 9092 alu.src[0].chan = i; 9093 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 9094 9095 alu.last = 1; 9096 alu.is_lds_idx_op = true; 9097 9098 r = r600_bytecode_add_alu(ctx->bc, &alu); 9099 if (r) 9100 return r; 9101 } 9102 return 0; 9103} 9104 9105static int tgsi_store(struct r600_shader_ctx *ctx) 9106{ 9107 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9108 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) 9109 return tgsi_store_buffer_rat(ctx); 9110 else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) 9111 return tgsi_store_lds(ctx); 9112 else 9113 return tgsi_store_rat(ctx); 9114} 9115 9116static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx) 9117{ 9118 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9119 /* have to work out the offset into the RAT immediate return 
buffer */ 9120 struct r600_bytecode_alu alu; 9121 struct r600_bytecode_vtx vtx; 9122 struct r600_bytecode_cf *cf; 9123 int r; 9124 int idx_gpr; 9125 unsigned format, num_format, format_comp, endian; 9126 const struct util_format_description *desc; 9127 unsigned rat_index_mode; 9128 unsigned immed_base; 9129 unsigned rat_base; 9130 9131 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET; 9132 rat_base = ctx->shader->rat_base; 9133 9134 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { 9135 immed_base += ctx->info.file_count[TGSI_FILE_IMAGE]; 9136 rat_base += ctx->info.file_count[TGSI_FILE_IMAGE]; 9137 9138 r = load_buffer_coord(ctx, 1, ctx->temp_reg); 9139 if (r) 9140 return r; 9141 idx_gpr = ctx->temp_reg; 9142 } else { 9143 r = load_index_src(ctx, 1, &idx_gpr); 9144 if (r) 9145 return r; 9146 } 9147 9148 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 9149 9150 if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) { 9151 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9152 alu.op = ALU_OP1_MOV; 9153 alu.dst.sel = ctx->thread_id_gpr; 9154 alu.dst.chan = 0; 9155 alu.dst.write = 1; 9156 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0); 9157 alu.last = 1; 9158 r = r600_bytecode_add_alu(ctx->bc, &alu); 9159 if (r) 9160 return r; 9161 9162 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9163 alu.op = ALU_OP1_MOV; 9164 alu.dst.sel = ctx->thread_id_gpr; 9165 if (ctx->bc->chip_class == CAYMAN) 9166 alu.dst.chan = 2; 9167 else 9168 alu.dst.chan = 3; 9169 alu.dst.write = 1; 9170 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0); 9171 alu.last = 1; 9172 r = r600_bytecode_add_alu(ctx->bc, &alu); 9173 if (r) 9174 return r; 9175 } else { 9176 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9177 alu.op = ALU_OP1_MOV; 9178 alu.dst.sel = ctx->thread_id_gpr; 9179 alu.dst.chan = 0; 9180 alu.dst.write = 1; 9181 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0); 9182 alu.last = 1; 9183 r = r600_bytecode_add_alu(ctx->bc, &alu); 9184 if (r) 
9185 return r; 9186 } 9187 9188 if (rat_index_mode) 9189 egcm_load_index_reg(ctx->bc, 1, false); 9190 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT); 9191 cf = ctx->bc->cf_last; 9192 9193 cf->rat.id = rat_base + inst->Src[0].Register.Index; 9194 cf->rat.inst = ctx->inst_info->op; 9195 cf->rat.index_mode = rat_index_mode; 9196 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND; 9197 cf->output.gpr = ctx->thread_id_gpr; 9198 cf->output.index_gpr = idx_gpr; 9199 cf->output.comp_mask = 0xf; 9200 cf->output.burst_count = 1; 9201 cf->vpm = 1; 9202 cf->barrier = 1; 9203 cf->mark = 1; 9204 cf->output.elem_size = 0; 9205 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK); 9206 cf = ctx->bc->cf_last; 9207 cf->barrier = 1; 9208 cf->cf_addr = 1; 9209 9210 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 9211 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) { 9212 desc = util_format_description(inst->Memory.Format); 9213 r600_vertex_data_type(inst->Memory.Format, 9214 &format, &num_format, &format_comp, &endian); 9215 vtx.dst_sel_x = desc->swizzle[0]; 9216 } else { 9217 format = FMT_32; 9218 num_format = 1; 9219 format_comp = 0; 9220 endian = 0; 9221 vtx.dst_sel_x = 0; 9222 } 9223 vtx.op = FETCH_OP_VFETCH; 9224 vtx.buffer_id = immed_base + inst->Src[0].Register.Index; 9225 vtx.buffer_index_mode = rat_index_mode; 9226 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 9227 vtx.src_gpr = ctx->thread_id_gpr; 9228 vtx.src_sel_x = 1; 9229 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 9230 vtx.dst_sel_y = 7; 9231 vtx.dst_sel_z = 7; 9232 vtx.dst_sel_w = 7; 9233 vtx.use_const_fields = 0; 9234 vtx.srf_mode_all = 1; 9235 vtx.data_format = format; 9236 vtx.num_format_all = num_format; 9237 vtx.format_comp_all = format_comp; 9238 vtx.endian = endian; 9239 vtx.offset = 0; 9240 vtx.mega_fetch_count = 0xf; 9241 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx); 9242 if (r) 9243 return r; 9244 cf = ctx->bc->cf_last; 9245 cf->vpm = 1; 9246 
cf->barrier = 1; 9247 return 0; 9248} 9249 9250static int get_gds_op(int opcode) 9251{ 9252 switch (opcode) { 9253 case TGSI_OPCODE_ATOMUADD: 9254 return FETCH_OP_GDS_ADD_RET; 9255 case TGSI_OPCODE_ATOMAND: 9256 return FETCH_OP_GDS_AND_RET; 9257 case TGSI_OPCODE_ATOMOR: 9258 return FETCH_OP_GDS_OR_RET; 9259 case TGSI_OPCODE_ATOMXOR: 9260 return FETCH_OP_GDS_XOR_RET; 9261 case TGSI_OPCODE_ATOMUMIN: 9262 return FETCH_OP_GDS_MIN_UINT_RET; 9263 case TGSI_OPCODE_ATOMUMAX: 9264 return FETCH_OP_GDS_MAX_UINT_RET; 9265 case TGSI_OPCODE_ATOMXCHG: 9266 return FETCH_OP_GDS_XCHG_RET; 9267 case TGSI_OPCODE_ATOMCAS: 9268 return FETCH_OP_GDS_CMP_XCHG_RET; 9269 default: 9270 return -1; 9271 } 9272} 9273 9274static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx) 9275{ 9276 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9277 struct r600_bytecode_gds gds; 9278 struct r600_bytecode_alu alu; 9279 int gds_op = get_gds_op(inst->Instruction.Opcode); 9280 int r; 9281 int uav_id = 0; 9282 int uav_index_mode = 0; 9283 bool is_cm = (ctx->bc->chip_class == CAYMAN); 9284 9285 if (gds_op == -1) { 9286 fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode); 9287 return -1; 9288 } 9289 9290 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode); 9291 if (r) 9292 return r; 9293 9294 if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) { 9295 if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) { 9296 int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]); 9297 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9298 alu.op = ALU_OP1_MOV; 9299 alu.dst.sel = ctx->temp_reg; 9300 alu.dst.chan = is_cm ? 
2 : 1; 9301 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 9302 alu.src[0].value = value; 9303 alu.last = 1; 9304 alu.dst.write = 1; 9305 r = r600_bytecode_add_alu(ctx->bc, &alu); 9306 if (r) 9307 return r; 9308 } else { 9309 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9310 alu.op = ALU_OP1_MOV; 9311 alu.dst.sel = ctx->temp_reg; 9312 alu.dst.chan = is_cm ? 2 : 1; 9313 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0); 9314 alu.last = 1; 9315 alu.dst.write = 1; 9316 r = r600_bytecode_add_alu(ctx->bc, &alu); 9317 if (r) 9318 return r; 9319 } 9320 } 9321 if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) { 9322 int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]); 9323 int abs_value = abs(value); 9324 if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET) 9325 gds_op = FETCH_OP_GDS_SUB_RET; 9326 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9327 alu.op = ALU_OP1_MOV; 9328 alu.dst.sel = ctx->temp_reg; 9329 alu.dst.chan = is_cm ? 1 : 0; 9330 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 9331 alu.src[0].value = abs_value; 9332 alu.last = 1; 9333 alu.dst.write = 1; 9334 r = r600_bytecode_add_alu(ctx->bc, &alu); 9335 if (r) 9336 return r; 9337 } else { 9338 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9339 alu.op = ALU_OP1_MOV; 9340 alu.dst.sel = ctx->temp_reg; 9341 alu.dst.chan = is_cm ? 1 : 0; 9342 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0); 9343 alu.last = 1; 9344 alu.dst.write = 1; 9345 r = r600_bytecode_add_alu(ctx->bc, &alu); 9346 if (r) 9347 return r; 9348 } 9349 9350 9351 memset(&gds, 0, sizeof(struct r600_bytecode_gds)); 9352 gds.op = gds_op; 9353 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 9354 gds.uav_id = is_cm ? 0 : uav_id; 9355 gds.uav_index_mode = is_cm ? 0 : uav_index_mode; 9356 gds.src_gpr = ctx->temp_reg; 9357 gds.src_gpr2 = 0; 9358 gds.src_sel_x = is_cm ? 0 : 4; 9359 gds.src_sel_y = is_cm ? 
1 : 0; 9360 if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) 9361 gds.src_sel_z = is_cm ? 2 : 1; 9362 else 9363 gds.src_sel_z = 7; 9364 gds.dst_sel_x = 0; 9365 gds.dst_sel_y = 7; 9366 gds.dst_sel_z = 7; 9367 gds.dst_sel_w = 7; 9368 gds.alloc_consume = !is_cm; 9369 9370 r = r600_bytecode_add_gds(ctx->bc, &gds); 9371 if (r) 9372 return r; 9373 ctx->bc->cf_last->vpm = 1; 9374 return 0; 9375} 9376 9377static int get_lds_op(int opcode) 9378{ 9379 switch (opcode) { 9380 case TGSI_OPCODE_ATOMUADD: 9381 return LDS_OP2_LDS_ADD_RET; 9382 case TGSI_OPCODE_ATOMAND: 9383 return LDS_OP2_LDS_AND_RET; 9384 case TGSI_OPCODE_ATOMOR: 9385 return LDS_OP2_LDS_OR_RET; 9386 case TGSI_OPCODE_ATOMXOR: 9387 return LDS_OP2_LDS_XOR_RET; 9388 case TGSI_OPCODE_ATOMUMIN: 9389 return LDS_OP2_LDS_MIN_UINT_RET; 9390 case TGSI_OPCODE_ATOMUMAX: 9391 return LDS_OP2_LDS_MAX_UINT_RET; 9392 case TGSI_OPCODE_ATOMIMIN: 9393 return LDS_OP2_LDS_MIN_INT_RET; 9394 case TGSI_OPCODE_ATOMIMAX: 9395 return LDS_OP2_LDS_MAX_INT_RET; 9396 case TGSI_OPCODE_ATOMXCHG: 9397 return LDS_OP2_LDS_XCHG_RET; 9398 case TGSI_OPCODE_ATOMCAS: 9399 return LDS_OP3_LDS_CMP_XCHG_RET; 9400 default: 9401 return -1; 9402 } 9403} 9404 9405static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx) 9406{ 9407 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9408 int lds_op = get_lds_op(inst->Instruction.Opcode); 9409 int r; 9410 9411 struct r600_bytecode_alu alu; 9412 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9413 alu.op = lds_op; 9414 alu.is_lds_idx_op = true; 9415 alu.last = 1; 9416 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 9417 r600_bytecode_src(&alu.src[1], &ctx->src[2], 0); 9418 if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET) 9419 r600_bytecode_src(&alu.src[2], &ctx->src[3], 0); 9420 else 9421 alu.src[2].sel = V_SQ_ALU_SRC_0; 9422 r = r600_bytecode_add_alu(ctx->bc, &alu); 9423 if (r) 9424 return r; 9425 9426 /* then read from LDS_OQ_A_POP */ 9427 memset(&alu, 0, sizeof(alu)); 9428 9429 alu.op = ALU_OP1_MOV; 
9430 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP; 9431 alu.src[0].chan = 0; 9432 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 9433 alu.dst.write = 1; 9434 alu.last = 1; 9435 r = r600_bytecode_add_alu(ctx->bc, &alu); 9436 if (r) 9437 return r; 9438 9439 return 0; 9440} 9441 9442static int tgsi_atomic_op(struct r600_shader_ctx *ctx) 9443{ 9444 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9445 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) 9446 return tgsi_atomic_op_rat(ctx); 9447 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC) 9448 return tgsi_atomic_op_gds(ctx); 9449 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) 9450 return tgsi_atomic_op_rat(ctx); 9451 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) 9452 return tgsi_atomic_op_lds(ctx); 9453 return 0; 9454} 9455 9456static int tgsi_resq(struct r600_shader_ctx *ctx) 9457{ 9458 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9459 unsigned sampler_index_mode; 9460 struct r600_bytecode_tex tex; 9461 int r; 9462 boolean has_txq_cube_array_z = false; 9463 9464 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER || 9465 (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) { 9466 if (ctx->bc->chip_class < EVERGREEN) 9467 ctx->shader->uses_tex_buffers = true; 9468 unsigned eg_buffer_base = 0; 9469 eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET; 9470 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) 9471 eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE]; 9472 return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base); 9473 } 9474 9475 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY && 9476 inst->Dst[0].Register.WriteMask & 4) { 9477 ctx->shader->has_txq_cube_array_z_comp = true; 9478 has_txq_cube_array_z = true; 9479 } 9480 9481 sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 9482 if (sampler_index_mode) 9483 egcm_load_index_reg(ctx->bc, 1, false); 9484 9485 9486 /* does this shader want a num layers from TXQ for a cube array? */ 9487 if (has_txq_cube_array_z) { 9488 int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset; 9489 struct r600_bytecode_alu alu; 9490 9491 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9492 alu.op = ALU_OP1_MOV; 9493 9494 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; 9495 /* with eg each dword is either number of cubes */ 9496 alu.src[0].sel += id / 4; 9497 alu.src[0].chan = id % 4; 9498 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 9499 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst); 9500 alu.last = 1; 9501 r = r600_bytecode_add_alu(ctx->bc, &alu); 9502 if (r) 9503 return r; 9504 /* disable writemask from texture instruction */ 9505 inst->Dst[0].Register.WriteMask &= ~4; 9506 } 9507 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 9508 tex.op = ctx->inst_info->op; 9509 tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index; 9510 tex.sampler_index_mode = sampler_index_mode; 9511 tex.resource_id = tex.sampler_id; 9512 tex.resource_index_mode = sampler_index_mode; 9513 tex.src_sel_x = 4; 9514 tex.src_sel_y = 4; 9515 tex.src_sel_z = 4; 9516 tex.src_sel_w = 4; 9517 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; 9518 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 9519 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; 9520 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
3 : 7; 9521 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 9522 r = r600_bytecode_add_tex(ctx->bc, &tex); 9523 if (r) 9524 return r; 9525 9526 return 0; 9527} 9528 9529static int tgsi_lrp(struct r600_shader_ctx *ctx) 9530{ 9531 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9532 struct r600_bytecode_alu alu; 9533 unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 9534 struct r600_bytecode_alu_src srcs[2][4]; 9535 unsigned i; 9536 int r; 9537 9538 /* optimize if it's just an equal balance */ 9539 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) { 9540 for (i = 0; i < lasti + 1; i++) { 9541 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9542 continue; 9543 9544 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9545 alu.op = ALU_OP2_ADD; 9546 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 9547 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 9548 alu.omod = 3; 9549 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 9550 alu.dst.chan = i; 9551 if (i == lasti) { 9552 alu.last = 1; 9553 } 9554 r = r600_bytecode_add_alu(ctx->bc, &alu); 9555 if (r) 9556 return r; 9557 } 9558 return 0; 9559 } 9560 9561 /* 1 - src0 */ 9562 for (i = 0; i < lasti + 1; i++) { 9563 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9564 continue; 9565 9566 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9567 alu.op = ALU_OP2_ADD; 9568 alu.src[0].sel = V_SQ_ALU_SRC_1; 9569 alu.src[0].chan = 0; 9570 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 9571 r600_bytecode_src_toggle_neg(&alu.src[1]); 9572 alu.dst.sel = ctx->temp_reg; 9573 alu.dst.chan = i; 9574 if (i == lasti) { 9575 alu.last = 1; 9576 } 9577 alu.dst.write = 1; 9578 r = r600_bytecode_add_alu(ctx->bc, &alu); 9579 if (r) 9580 return r; 9581 } 9582 9583 /* (1 - src0) * src2 */ 9584 for (i = 0; i < lasti + 1; i++) { 9585 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9586 continue; 9587 9588 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9589 
alu.op = ALU_OP2_MUL; 9590 alu.src[0].sel = ctx->temp_reg; 9591 alu.src[0].chan = i; 9592 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 9593 alu.dst.sel = ctx->temp_reg; 9594 alu.dst.chan = i; 9595 if (i == lasti) { 9596 alu.last = 1; 9597 } 9598 alu.dst.write = 1; 9599 r = r600_bytecode_add_alu(ctx->bc, &alu); 9600 if (r) 9601 return r; 9602 } 9603 9604 /* src0 * src1 + (1 - src0) * src2 */ 9605 9606 for (i = 0; i < 2; i++) { 9607 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask, 9608 srcs[i], &ctx->src[i]); 9609 if (r) 9610 return r; 9611 } 9612 9613 for (i = 0; i < lasti + 1; i++) { 9614 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9615 continue; 9616 9617 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9618 alu.op = ALU_OP3_MULADD; 9619 alu.is_op3 = 1; 9620 alu.src[0] = srcs[0][i]; 9621 alu.src[1] = srcs[1][i]; 9622 alu.src[2].sel = ctx->temp_reg; 9623 alu.src[2].chan = i; 9624 9625 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 9626 alu.dst.chan = i; 9627 if (i == lasti) { 9628 alu.last = 1; 9629 } 9630 r = r600_bytecode_add_alu(ctx->bc, &alu); 9631 if (r) 9632 return r; 9633 } 9634 return 0; 9635} 9636 9637static int tgsi_cmp(struct r600_shader_ctx *ctx) 9638{ 9639 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9640 struct r600_bytecode_alu alu; 9641 int i, r, j; 9642 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 9643 struct r600_bytecode_alu_src srcs[3][4]; 9644 9645 unsigned op; 9646 9647 if (ctx->src[0].abs && ctx->src[0].neg) { 9648 op = ALU_OP3_CNDE; 9649 ctx->src[0].abs = 0; 9650 ctx->src[0].neg = 0; 9651 } else { 9652 op = ALU_OP3_CNDGE; 9653 } 9654 9655 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 9656 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask, 9657 srcs[j], &ctx->src[j]); 9658 if (r) 9659 return r; 9660 } 9661 9662 for (i = 0; i < lasti + 1; i++) { 9663 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9664 continue; 9665 9666 memset(&alu, 0, 
sizeof(struct r600_bytecode_alu)); 9667 alu.op = op; 9668 alu.src[0] = srcs[0][i]; 9669 alu.src[1] = srcs[2][i]; 9670 alu.src[2] = srcs[1][i]; 9671 9672 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 9673 alu.dst.chan = i; 9674 alu.dst.write = 1; 9675 alu.is_op3 = 1; 9676 if (i == lasti) 9677 alu.last = 1; 9678 r = r600_bytecode_add_alu(ctx->bc, &alu); 9679 if (r) 9680 return r; 9681 } 9682 return 0; 9683} 9684 9685static int tgsi_ucmp(struct r600_shader_ctx *ctx) 9686{ 9687 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9688 struct r600_bytecode_alu alu; 9689 int i, r; 9690 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 9691 9692 for (i = 0; i < lasti + 1; i++) { 9693 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9694 continue; 9695 9696 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9697 alu.op = ALU_OP3_CNDE_INT; 9698 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 9699 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 9700 r600_bytecode_src(&alu.src[2], &ctx->src[1], i); 9701 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 9702 alu.dst.chan = i; 9703 alu.dst.write = 1; 9704 alu.is_op3 = 1; 9705 if (i == lasti) 9706 alu.last = 1; 9707 r = r600_bytecode_add_alu(ctx->bc, &alu); 9708 if (r) 9709 return r; 9710 } 9711 return 0; 9712} 9713 9714static int tgsi_exp(struct r600_shader_ctx *ctx) 9715{ 9716 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9717 struct r600_bytecode_alu alu; 9718 int r; 9719 unsigned i; 9720 9721 /* result.x = 2^floor(src); */ 9722 if (inst->Dst[0].Register.WriteMask & 1) { 9723 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9724 9725 alu.op = ALU_OP1_FLOOR; 9726 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9727 9728 alu.dst.sel = ctx->temp_reg; 9729 alu.dst.chan = 0; 9730 alu.dst.write = 1; 9731 alu.last = 1; 9732 r = r600_bytecode_add_alu(ctx->bc, &alu); 9733 if (r) 9734 return r; 9735 9736 if (ctx->bc->chip_class == CAYMAN) { 9737 for (i = 0; i < 3; i++) 
{ 9738 alu.op = ALU_OP1_EXP_IEEE; 9739 alu.src[0].sel = ctx->temp_reg; 9740 alu.src[0].chan = 0; 9741 9742 alu.dst.sel = ctx->temp_reg; 9743 alu.dst.chan = i; 9744 alu.dst.write = i == 0; 9745 alu.last = i == 2; 9746 r = r600_bytecode_add_alu(ctx->bc, &alu); 9747 if (r) 9748 return r; 9749 } 9750 } else { 9751 alu.op = ALU_OP1_EXP_IEEE; 9752 alu.src[0].sel = ctx->temp_reg; 9753 alu.src[0].chan = 0; 9754 9755 alu.dst.sel = ctx->temp_reg; 9756 alu.dst.chan = 0; 9757 alu.dst.write = 1; 9758 alu.last = 1; 9759 r = r600_bytecode_add_alu(ctx->bc, &alu); 9760 if (r) 9761 return r; 9762 } 9763 } 9764 9765 /* result.y = tmp - floor(tmp); */ 9766 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) { 9767 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9768 9769 alu.op = ALU_OP1_FRACT; 9770 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9771 9772 alu.dst.sel = ctx->temp_reg; 9773#if 0 9774 r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 9775 if (r) 9776 return r; 9777#endif 9778 alu.dst.write = 1; 9779 alu.dst.chan = 1; 9780 9781 alu.last = 1; 9782 9783 r = r600_bytecode_add_alu(ctx->bc, &alu); 9784 if (r) 9785 return r; 9786 } 9787 9788 /* result.z = RoughApprox2ToX(tmp);*/ 9789 if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) { 9790 if (ctx->bc->chip_class == CAYMAN) { 9791 for (i = 0; i < 3; i++) { 9792 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9793 alu.op = ALU_OP1_EXP_IEEE; 9794 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9795 9796 alu.dst.sel = ctx->temp_reg; 9797 alu.dst.chan = i; 9798 if (i == 2) { 9799 alu.dst.write = 1; 9800 alu.last = 1; 9801 } 9802 9803 r = r600_bytecode_add_alu(ctx->bc, &alu); 9804 if (r) 9805 return r; 9806 } 9807 } else { 9808 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9809 alu.op = ALU_OP1_EXP_IEEE; 9810 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9811 9812 alu.dst.sel = ctx->temp_reg; 9813 alu.dst.write = 1; 9814 alu.dst.chan = 2; 9815 9816 alu.last = 1; 9817 9818 r = r600_bytecode_add_alu(ctx->bc, &alu); 9819 if 
(r)
				return r;
		}
	}

	/* result.w = 1.0;*/
	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	/* all four channels were staged in ctx->temp_reg; copy them to the
	 * real destination honoring the write mask */
	return tgsi_helper_copy(ctx, inst);
}

/* Expand TGSI_OPCODE_LOG into per-channel ALU ops:
 *   result.x = floor(log2(|src|))
 *   result.y = |src| / 2^floor(log2(|src|))   (the mantissa-like part)
 *   result.z = log2(|src|)
 *   result.w = 1.0
 * Results are staged in ctx->temp_reg and moved to the destination by
 * tgsi_helper_copy().  On CAYMAN the t-slot-only LOG/EXP/RECIP ops are
 * replicated across three vector lanes (see the CAYMAN notes at the top
 * of this file), with only the lane matching the wanted channel writing. */
static int tgsi_log(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	unsigned i;

	/* result.x = floor(log2(|src|)); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				/* only lane 0 actually writes temp.x */
				if (i == 0)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* NOTE(review): unlike the result.y path below there is no
		 * memset here, so alu still carries src[0] state from the
		 * LOG above (including the abs flag set by
		 * r600_bytecode_src_set_abs) into this FLOOR — for sources
		 * < 1 the log is negative and floor(|log2|) != floor(log2);
		 * verify this is intentional. */
		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {

		/* temp.y = log2(|src|) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.y = floor(temp.y) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* temp.y = 2 ^ temp.y */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.y = 1 / temp.y */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.y = |src.x| * temp.y */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP2_MUL;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		r600_bytecode_src_set_abs(&alu.src[0]);

		alu.src[1].sel = ctx->temp_reg;
		alu.src[1].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = log2(|src|);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				if (i == 2)
					alu.dst.write = 1;
				alu.dst.chan = i;
				if (i == 2)
					alu.last = 1;

				r =
r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0; */
	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* move the staged temp channels to the real destination */
	return tgsi_helper_copy(ctx, inst);
}

/* ARL/ARR/UARL on Evergreen+: convert the float (or uint) source to an
 * integer and store it in the address register selected by
 * get_address_file_reg().  ARL floors, ARR rounds, UARL moves as-is.
 * Marks the corresponding address register as needing a reload. */
static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);

	assert(inst->Dst[0].Register.Index < 3);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
		break;
	case TGSI_OPCODE_ARR:
		alu.op = ALU_OP1_FLT_TO_INT;
		break;
	case TGSI_OPCODE_UARL:
		alu.op = ALU_OP1_MOV;
		break;
	default:
		assert(0);
		return -1;
	}

	for (i = 0; i <= lasti; ++i) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.last = i == lasti;
		alu.dst.sel = reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* invalidate the cached address register so it gets reloaded
	 * before the next indexed access */
	if (inst->Dst[0].Register.Index > 0)
		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
	else
		ctx->bc->ar_loaded = 0;

	return 0;
}
/* ARL/ARR/UARL on r600/r700: same contract as tgsi_eg_arl() but these
 * chips lack FLT_TO_INT_FLOOR, so ARL is emitted as FLOOR + FLT_TO_INT
 * through ctx->bc->ar_reg. */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* AR register contents changed; force a reload before use */
	ctx->bc->ar_loaded = 0;
	return 0;
}

/* TGSI_OPCODE_DST: dst.x = 1, dst.y = src0.y * src1.y,
 * dst.z = src0.z, dst.w = src1.w — implemented as four MULs where the
 * "pass-through" operand is the inline constant 1.0. */
static int tgsi_opdst(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r = 0;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP2_MUL;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		/* x and w take 1.0 for src0 (so x = 1*1, w = 1*src1.w) */
		if (i == 0 || i == 3) {
			alu.src[0].sel = V_SQ_ALU_SRC_1;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		/* x and z take 1.0 for src1 (so z = src0.z*1) */
		if (i == 0 || i == 2) {
			alu.src[1].sel = V_SQ_ALU_SRC_1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Emit a predicate-setting compare (src op 0) that updates the
 * execute mask; used by the IF lowering below.  alu_type selects the
 * CF clause kind (ALU_PUSH_BEFORE or plain ALU). */
static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type,
			   struct r600_bytecode_alu_src *src)
{
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = opcode;
	alu.execute_mask = 1;
	alu.update_pred = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.dst.chan = 0;

	alu.src[0] = *src;
	alu.src[1].sel = V_SQ_ALU_SRC_0;
	alu.src[1].chan = 0;

	alu.last = 1;

	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
	if (r)
		return r;
	return 0;
}

/* Emit 'pops' stack pops.  When possible this is folded into the
 * preceding ALU clause by rewriting it to ALU_POP_AFTER /
 * ALU_POP2_AFTER; otherwise an explicit CF POP instruction is added. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop starts at 3 so that a non-ALU last CF (or no CF)
		 * always falls through to the explicit-POP path */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}

/* Recompute the worst-case branch-stack depth (in hw entries) after a
 * push/pop event and track the maximum in stack->max_entries, applying
 * the per-chip-family extra-element rules.  Returns the current element
 * count (used by emit_if() for the 8xx workaround). */
static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx,
					     unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements;
	int entries;

	unsigned entry_size = stack->entry_size;

	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM || stack->push > 0) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM || stack->push > 0) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
	return elements;
}

/* Undo the bookkeeping of callstack_push() for one frame of the given
 * kind; does not emit any bytecode. */
static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
{
	switch(reason) {
	case FC_PUSH_VPM:
		--ctx->bc->stack.push;
		assert(ctx->bc->stack.push >= 0);
		break;
	case FC_PUSH_WQM:
		--ctx->bc->stack.push_wqm;
		assert(ctx->bc->stack.push_wqm >= 0);
		break;
	case FC_LOOP:
		--ctx->bc->stack.loop;
		assert(ctx->bc->stack.loop >= 0);
		break;
	default:
		assert(0);
		break;
	}
}

/* Account for one new stack frame of the given kind and return the
 * updated element count from callstack_update_max_depth(). */
static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
{
	switch (reason) {
	case FC_PUSH_VPM:
		++ctx->bc->stack.push;
		break;
	case FC_PUSH_WQM:
		++ctx->bc->stack.push_wqm;
		break;
	case FC_LOOP:
		++ctx->bc->stack.loop;
		break;
	default:
		assert(0);
	}

	return callstack_update_max_depth(ctx, reason);
}

/* Record the current CF instruction (e.g. ELSE, BREAK, CONTINUE) as a
 * "mid" point of the flow-control frame at fc_sp, growing the array. */
static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
{
	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];

	sp->mid = realloc((void *)sp->mid,
			  sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
	sp->mid[sp->num_mid] = ctx->bc->cf_last;
	sp->num_mid++;
}

/* Open a new flow-control frame of the given type, anchored at the
 * current CF instruction. */
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
{
	assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
	ctx->bc->fc_sp++;
}

/* Close the innermost flow-control frame, freeing its mid array. */
static void fc_poplevel(struct r600_shader_ctx *ctx)
{
	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
	free(sp->mid);
	sp->mid = NULL;
	sp->num_mid = 0;
	sp->start = NULL;
	sp->type = 0;
	ctx->bc->fc_sp--;
}

/* NOTE(review): the block below is dead (#if 0) and would not compile
 * if enabled — e.g. the stray ')' after CF_OP_RETURN — kept verbatim. */
#if 0
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx,
			       unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif

/* Lower an IF: emit the predicate compare and a JUMP whose target is
 * patched later by tgsi_else()/tgsi_endif(), then open an FC_IF frame.
 * Applies chip-specific branch-stack workarounds where needed. */
static int emit_if(struct r600_shader_ctx *ctx, int opcode,
		   struct r600_bytecode_alu_src *src)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;
	bool needs_workaround = false;
	int elems = callstack_push(ctx, FC_PUSH_VPM);

	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1)
		needs_workaround = true;

	if (ctx->bc->chip_class == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) {
		unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size;
		unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;

		/* trigger when the push lands on (or next to) a hw stack
		 * entry boundary */
		if (elems && (!dmod1 || !dmod2))
			needs_workaround = true;
	}

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (needs_workaround) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type, src);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	return 0;
}

/* IF on a float condition: branch taken when src != 0.0f */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu_src alu_src;
	r600_bytecode_src(&alu_src, &ctx->src[0], 0);

	return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src);
}

/* UIF: integer flavor of tgsi_if() */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu_src alu_src;
	r600_bytecode_src(&alu_src, &ctx->src[0], 0);
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
}

/* ELSE: emit the CF ELSE, register it as a mid point of the open IF
 * frame, and patch the IF's JUMP to land here. */
static int tgsi_else(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, ctx->bc->fc_sp - 1);
	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
	return 0;
}

/* ENDIF: pop one stack level and patch the pending JUMP (or the ELSE's
 * jump target) to the instruction after the IF construct. */
static int tgsi_endif(struct r600_shader_ctx *ctx)
{
	int offset = 2;
	pops(ctx, 1);
	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
		R600_ERR("if/endif unbalanced in shader\n");
		return -1;
	}

	/* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */
	if (ctx->bc->cf_last->eg_alu_extended)
		offset += 2;

	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
		/* no ELSE: the IF's JUMP skips the whole construct */
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset;
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
	} else {
		/* with ELSE: the ELSE's jump skips the else branch */
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset;
	}

	fc_poplevel(ctx);

	callstack_pop(ctx, FC_PUSH_VPM);
	return 0;
}

/* BGNLOOP: open a loop frame; addresses are fixed up in tgsi_endloop(). */
static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
{
	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
	 * limited to 4096 iterations, like the other LOOP_* instructions. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}

/* ENDLOOP: emit LOOP_END and fix up all loop-related CF addresses. */
static int tgsi_endloop(struct r600_shader_ctx *ctx)
{
	int i;

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);

	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
		R600_ERR("loop/endloop in shader code are not paired.\n");
		return -EINVAL;
	}

	/* fixup loop pointers - from r600isa
	   LOOP END points to CF after LOOP START,
	   LOOP START point to CF after LOOP END
	   BRK/CONT point to LOOP END CF
	*/
	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;

	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;

	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
	}
	/* XXX add LOOPRET support */
	fc_poplevel(ctx);
	callstack_pop(ctx, FC_LOOP);
	return 0;
}

/* BRK/CONT: find the innermost enclosing loop frame and register the
 * break/continue CF instruction with it for later address fixup. */
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
{
	unsigned int fscp;

	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
			break;
	}

	if (fscp == 0) {
		R600_ERR("Break not inside loop/endloop pair\n");
		return -EINVAL;
	}

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);

	fc_set_mid(ctx, fscp - 1);

	return 0;
}

static int
tgsi_gs_emit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* stream index comes from an immediate operand */
	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
	int r;

	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);

	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	if (!r) {
		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
			return emit_inc_ring_offset(ctx, stream, TRUE);
	}
	return r;
}

/* UMAD: dst = src0 * src1 + src2 (unsigned), emitted as a MULLO pass
 * into temp followed by an ADD_INT pass into the destination. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.dst.chan = i;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.write = 1;

		alu.op = ALU_OP2_MULLO_UINT;
		for (j = 0; j < 2; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
		}

		alu.last = 1;
		r = emit_mul_int_op(ctx->bc, &alu);
		if (r)
			return r;
	}


	/* + src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* PK2H: pack src.xy as two half floats into every written channel. */
static int tgsi_pk2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.xy = f32_to_f16(src) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FLT32_TO_FLT16;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	alu.dst.chan = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.y * 0x10000 + temp.x */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD_UINT24;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 0x10000;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = 0;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* UP2H: unpack two half floats from src.x into the written channels
 * (even channels get the low half, odd channels the high half). */
static int tgsi_up2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.x = src.x */
	/* note: no need to mask out the high bits */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = src.x >> 16 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.dst.chan = 1;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 16;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.wz = dst.xy = f16_to_f32(temp.xy) */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.op = ALU_OP1_FLT16_TO_FLT32;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i % 2;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* BFE (bit-field extract): the op3 helper does the extract; the extra
 * passes below handle the hw quirk where a width >= 32 must return the
 * unmodified source.  A spare temp is used when a source aliases the
 * destination. */
static int tgsi_bfe(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int r, i;
	int dst = -1;

	if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
	    (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
		dst = r600_get_temp(ctx);

	r = tgsi_op3_dst(ctx, dst);
	if (r)
		return r;

	/* temp = (width >= 32) per channel */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct
r600_bytecode_alu)); 10842 alu.op = ALU_OP2_SETGE_INT; 10843 r600_bytecode_src(&alu.src[0], &ctx->src[2], i); 10844 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 10845 alu.src[1].value = 32; 10846 alu.dst.sel = ctx->temp_reg; 10847 alu.dst.chan = i; 10848 alu.dst.write = 1; 10849 if (i == lasti) 10850 alu.last = 1; 10851 r = r600_bytecode_add_alu(ctx->bc, &alu); 10852 if (r) 10853 return r; 10854 } 10855 10856 for (i = 0; i < lasti + 1; i++) { 10857 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10858 alu.op = ALU_OP3_CNDE_INT; 10859 alu.is_op3 = 1; 10860 alu.src[0].sel = ctx->temp_reg; 10861 alu.src[0].chan = i; 10862 10863 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 10864 if (dst != -1) 10865 alu.src[1].sel = dst; 10866 else 10867 alu.src[1].sel = alu.dst.sel; 10868 alu.src[1].chan = i; 10869 r600_bytecode_src(&alu.src[2], &ctx->src[0], i); 10870 alu.dst.write = 1; 10871 if (i == lasti) 10872 alu.last = 1; 10873 r = r600_bytecode_add_alu(ctx->bc, &alu); 10874 if (r) 10875 return r; 10876 } 10877 10878 return 0; 10879} 10880 10881static int tgsi_clock(struct r600_shader_ctx *ctx) 10882{ 10883 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10884 struct r600_bytecode_alu alu; 10885 int r; 10886 10887 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10888 alu.op = ALU_OP1_MOV; 10889 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 10890 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO; 10891 r = r600_bytecode_add_alu(ctx->bc, &alu); 10892 if (r) 10893 return r; 10894 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10895 alu.op = ALU_OP1_MOV; 10896 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst); 10897 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI; 10898 alu.last = 1; 10899 r = r600_bytecode_add_alu(ctx->bc, &alu); 10900 if (r) 10901 return r; 10902 return 0; 10903} 10904 10905static int emit_u64add(struct r600_shader_ctx *ctx, int op, 10906 int treg, 10907 int src0_sel, int src0_chan, 10908 int src1_sel, int src1_chan) 10909{ 10910 struct r600_bytecode_alu alu; 
10911 int r; 10912 int opc; 10913 10914 if (op == ALU_OP2_ADD_INT) 10915 opc = ALU_OP2_ADDC_UINT; 10916 else 10917 opc = ALU_OP2_SUBB_UINT; 10918 10919 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10920 alu.op = op; ; 10921 alu.dst.sel = treg; 10922 alu.dst.chan = 0; 10923 alu.dst.write = 1; 10924 alu.src[0].sel = src0_sel; 10925 alu.src[0].chan = src0_chan + 0; 10926 alu.src[1].sel = src1_sel; 10927 alu.src[1].chan = src1_chan + 0; 10928 alu.src[1].neg = 0; 10929 r = r600_bytecode_add_alu(ctx->bc, &alu); 10930 if (r) 10931 return r; 10932 10933 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10934 alu.op = op; 10935 alu.dst.sel = treg; 10936 alu.dst.chan = 1; 10937 alu.dst.write = 1; 10938 alu.src[0].sel = src0_sel; 10939 alu.src[0].chan = src0_chan + 1; 10940 alu.src[1].sel = src1_sel; 10941 alu.src[1].chan = src1_chan + 1; 10942 alu.src[1].neg = 0; 10943 r = r600_bytecode_add_alu(ctx->bc, &alu); 10944 if (r) 10945 return r; 10946 10947 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10948 alu.op = opc; 10949 alu.dst.sel = treg; 10950 alu.dst.chan = 2; 10951 alu.dst.write = 1; 10952 alu.last = 1; 10953 alu.src[0].sel = src0_sel; 10954 alu.src[0].chan = src0_chan + 0; 10955 alu.src[1].sel = src1_sel; 10956 alu.src[1].chan = src1_chan + 0; 10957 alu.src[1].neg = 0; 10958 r = r600_bytecode_add_alu(ctx->bc, &alu); 10959 if (r) 10960 return r; 10961 10962 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10963 alu.op = op; 10964 alu.dst.sel = treg; 10965 alu.dst.chan = 1; 10966 alu.dst.write = 1; 10967 alu.src[0].sel = treg; 10968 alu.src[0].chan = 1; 10969 alu.src[1].sel = treg; 10970 alu.src[1].chan = 2; 10971 alu.last = 1; 10972 r = r600_bytecode_add_alu(ctx->bc, &alu); 10973 if (r) 10974 return r; 10975 return 0; 10976} 10977 10978static int egcm_u64add(struct r600_shader_ctx *ctx) 10979{ 10980 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10981 struct r600_bytecode_alu alu; 10982 int r; 10983 int treg = ctx->temp_reg; 
10984 int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT; 10985 10986 if (ctx->src[1].neg) { 10987 op = ALU_OP2_SUB_INT; 10988 opc = ALU_OP2_SUBB_UINT; 10989 } 10990 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10991 alu.op = op; ; 10992 alu.dst.sel = treg; 10993 alu.dst.chan = 0; 10994 alu.dst.write = 1; 10995 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10996 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); 10997 alu.src[1].neg = 0; 10998 r = r600_bytecode_add_alu(ctx->bc, &alu); 10999 if (r) 11000 return r; 11001 11002 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11003 alu.op = op; 11004 alu.dst.sel = treg; 11005 alu.dst.chan = 1; 11006 alu.dst.write = 1; 11007 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); 11008 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1); 11009 alu.src[1].neg = 0; 11010 r = r600_bytecode_add_alu(ctx->bc, &alu); 11011 if (r) 11012 return r; 11013 11014 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11015 alu.op = opc ; 11016 alu.dst.sel = treg; 11017 alu.dst.chan = 2; 11018 alu.dst.write = 1; 11019 alu.last = 1; 11020 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 11021 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); 11022 alu.src[1].neg = 0; 11023 r = r600_bytecode_add_alu(ctx->bc, &alu); 11024 if (r) 11025 return r; 11026 11027 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11028 alu.op = op; 11029 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst); 11030 alu.src[0].sel = treg; 11031 alu.src[0].chan = 1; 11032 alu.src[1].sel = treg; 11033 alu.src[1].chan = 2; 11034 alu.last = 1; 11035 r = r600_bytecode_add_alu(ctx->bc, &alu); 11036 if (r) 11037 return r; 11038 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11039 alu.op = ALU_OP1_MOV; 11040 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 11041 alu.src[0].sel = treg; 11042 alu.src[0].chan = 0; 11043 alu.last = 1; 11044 r = r600_bytecode_add_alu(ctx->bc, &alu); 11045 if (r) 11046 return r; 11047 return 0; 11048} 11049 11050/* result.y = mul_high a, b 11051 result.x = mul 
 a,b
   result.y += a.x * b.y + a.y * b.x;
*/
/* Emit a 64-bit multiply (TGSI U64MUL) returning the low 64 bits:
 * standard 32x32 partial-product schoolbook scheme.  temp channels:
 *   x = lo(a.lo * b.lo), y = hi(a.lo * b.lo),
 *   z = lo(a.lo * b.hi), w = lo(a.hi * b.lo);
 * the high word is y + z + w.  The a.hi*b.hi product only affects
 * bits above 64 and is dropped. */
static int egcm_u64mul(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

	/* temp.x = mul_lo a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = mul_hi a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULHI_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = mul a.x, b.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.w = mul a.y, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 3;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = temp.z + temp.w (sum of cross products) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 2;
	alu.src[1].sel = treg;
	alu.src[1].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = temp.y + temp.z (final high word) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = temp.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}

/* Emit an unsigned 64-bit >= compare into treg.x.
 * Sources are 64-bit values spread over channels base_chan (lo) and
 * base_chan+1 (hi).  Scratch: treg.y holds hi>hi, treg.z holds hi==hi. */
static int emit_u64sge(struct r600_shader_ctx *ctx,
		       int treg,
		       int src0_sel, int src0_base_chan,
		       int src1_sel, int src1_base_chan)
{
	int r;
	/* for 64-bit sge */
	/* result = (src0.y > src1.y) || ((src0.y == src1.y) && src0.x >= src1.x)) */
	r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
			   treg, 1,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 0,
			   src0_sel, src0_base_chan,
			   src1_sel, src1_base_chan);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
			   treg, 2,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	/* treg.x = (lo >= lo) && (hi == hi) */
	r = single_alu_op2(ctx, ALU_OP2_AND_INT,
			   treg, 0,
			   treg, 0,
			   treg, 2);
	if (r)
		return r;

	/* treg.x |= (hi > hi) */
	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   treg, 0,
			   treg, 0,
			   treg, 1);
	if (r)
		return r;
	return 0;
}

/* this isn't a complete div it's just enough for qbo shader to work */
/* Emit a restricted unsigned 64-bit divide (TGSI U64DIV): the divisor
 * must be a literal whose high 32 bits are zero, and exactly one
 * division (WriteMask 0x3) is handled.  Implements classic restoring
 * (shift-and-subtract) long division, fully unrolled on the CPU with
 * hardware predication (emit_if/tgsi_endif) guarding each step.
 * Returns 0 on success, -1 if the instruction doesn't fit the
 * restrictions, or an emission error code. */
static int egcm_u64div(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	/* NOTE(review): alu_denom_hi is initialized below but never used —
	 * the denominator high word is known to be zero here. */
	struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
	int r, i;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	/* make sure we are dividing by a const with 0 in the high bits */
	if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
		return -1;
	if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
		return -1;
	/* make sure we are doing one division */
	if (inst->Dst[0].Register.WriteMask != 0x3)
		return -1;

	/* emit_if uses ctx->temp_reg so we can't */
	int treg = r600_get_temp(ctx);
	int tmp_num = r600_get_temp(ctx);
	int sub_tmp = r600_get_temp(ctx);

	/* tmp quot are tmp_num.zw */
	r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
	r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
	r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
	r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);

	/* MOV tmp_num.xy, numerator */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 0,
			   alu_num_lo.sel, alu_num_lo.chan,
			   0, 0);
	if (r)
		return r;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   0, 0);
	if (r)
		return r;

	/* zero the quotient accumulator tmp_num.zw */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	/* treg 0 is log2_denom */
	/* normally this gets the MSB for the denom high value
	   - however we know this will always be 0 here. */
	r = single_alu_op2(ctx,
			   ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, 32,
			   0, 0);
	if (r)
		return r;

	/* normally check denom hi for 0, but we know it is already */
	/* t0.z = num_hi >= denom_lo */
	r = single_alu_op2(ctx,
			   ALU_OP2_SETGE_UINT,
			   treg, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	/* the "big numerator" path: only taken when num_hi >= denom_lo */
	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	/* for loops in here */
	/* get msb t0.x = msb(src[1].x) first */
	int msb_lo = util_last_bit(alu_denom_lo.value);
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, msb_lo,
			   0, 0);
	if (r)
		return r;

	/* unroll the asm here */
	/* first 32 restoring-division steps against the numerator high word */
	for (i = 0; i < 31; i++) {
		/* treg.z = (i >= log2_denom) — gate for this step */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, i,
				   treg, 0);
		if (r)
			return r;

		/* we can do this on the CPU */
		uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
		/* t0.z = tmp_num.y >= t0.z */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   treg, 2);
		if (r)
			return r;

		/* if the shifted denominator fits, subtract it and set the
		 * corresponding quotient bit in tmp_num.w */
		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
				   tmp_num, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 3,
				   tmp_num, 3,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 31, so manually peel the last loop
	 * iteration.
	 */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
			   tmp_num, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 3,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* close the "big numerator" branch */
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* onto the second loop to unroll */
	/* remaining 32 steps on the full 64-bit remainder, using emit_u64sge
	 * and emit_u64add for the wide compare/subtract */
	for (i = 0; i < 31; i++) {
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),
				   treg, 0);
		if (r)
			return r;

		/* computed on the CPU: 64-bit shifted denominator into treg.zw */
		uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 3,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
				   0, 0);
		if (r)
			return r;

		/* sub_tmp.x = remainder >= shifted denominator (64-bit) */
		r = emit_u64sge(ctx, sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   sub_tmp, 0);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;


		/* remainder -= shifted denominator (64-bit subtract) */
		r = emit_u64add(ctx, ALU_OP2_SUB_INT,
				sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 0,
				   sub_tmp, 0,
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 1,
				   sub_tmp, 1,
				   0, 0);
		if (r)
			return r;

		/* set quotient bit in the low quotient word tmp_num.z */
		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 2,
				   tmp_num, 2,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 63, so manually peel the last loop
	 * iteration.
	 */
	uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 2,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 3,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
			   0, 0);
	if (r)
		return r;

	r = emit_u64sge(ctx, sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = sub_tmp;
	alu_src.chan = 0;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = emit_u64add(ctx, ALU_OP2_SUB_INT,
			sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 2,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* write quotient tmp_num.zw to dst.xy */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 2;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* Emit a 64-bit "set on not equal" (TGSI U64SNE): per-word SETNE_INT on
 * the lo and hi halves, then OR the two results into dst.x. */
static int egcm_u64sne(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

	/* treg.x = (src0.lo != src1.lo) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_SETNE_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write =
 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* treg.y = (src0.hi != src1.hi) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_SETNE_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = treg.x | treg.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_OR_INT;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	alu.src[1].sel = treg;
	alu.src[1].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* TGSI opcode dispatch table for R600-class (pre-Evergreen) GPUs:
 * indexed by TGSI opcode; each entry pairs the hardware opcode with the
 * translation callback.  Unimplemented opcodes map to tgsi_unsupported. */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]	= { ALU_OP0_NOP, tgsi_unsupported},
	[23]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
	[32]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK]	= { ALU_OP0_NOP, tgsi_unsupported},
	[34]	= { ALU_OP0_NOP, tgsi_unsupported},
	[35]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]	= { ALU_OP0_NOP, tgsi_unsupported},
	[60]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[81]	= { ALU_OP0_NOP, tgsi_unsupported},
	[82]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[106]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]	= { ALU_OP0_NOP, tgsi_unsupported},
	[115]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[TGSI_OPCODE_DFMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};

/* TGSI opcode dispatch table for Evergreen GPUs: same layout as the
 * R600 table above, with Evergreen-capable handlers (e.g. tgsi_load/
 * tgsi_store, tgsi_clock, FMA, fine derivatives).  Continues past this
 * chunk. */
static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]	= { ALU_OP0_NOP, tgsi_unsupported},
	[23]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
	[32]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK]	= { ALU_OP0_NOP, tgsi_clock},
	[34]	= { ALU_OP0_NOP, tgsi_unsupported},
	[35]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]	= { ALU_OP0_NOP, tgsi_unsupported},
	[60]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[62]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[82]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
	[88]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
	[106]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]	= { ALU_OP0_NOP, tgsi_unsupported},
	[115]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	/* Refer below for TGSI_OPCODE_DFMA */
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_load},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_store},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
	[165]	= { ALU_OP0_NOP,
tgsi_unsupported}, 11949 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 11950 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op}, 11951 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op}, 11952 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op}, 11953 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op}, 11954 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op}, 11955 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op}, 11956 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op}, 11957 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op}, 11958 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op}, 11959 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op}, 11960 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex}, 11961 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 11962 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex}, 11963 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans}, 11964 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans}, 11965 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex}, 11966 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex}, 11967 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe}, 11968 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe}, 11969 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi}, 11970 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2}, 11971 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2}, 11972 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2}, 11973 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb}, 11974 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb}, 11975 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm}, 11976 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm}, 11977 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm}, 11978 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64}, 11979 [TGSI_OPCODE_D2F] = 
{ ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest}, 11980 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64}, 11981 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg}, 11982 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64}, 11983 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr}, 11984 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr }, 11985 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64}, 11986 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64}, 11987 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s}, 11988 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest}, 11989 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest}, 11990 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest}, 11991 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr}, 11992 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr}, 11993 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64}, 11994 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64}, 11995 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64}, 11996 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64}, 11997 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp}, 11998 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int}, 11999 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double}, 12000 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int}, 12001 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double}, 12002 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr}, 12003 [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne }, 12004 [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add }, 12005 [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul }, 12006 [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div }, 12007 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported}, 12008}; 12009 12010static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = { 12011 [TGSI_OPCODE_ARL] = { 
ALU_OP0_NOP, tgsi_eg_arl}, 12012 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2}, 12013 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit}, 12014 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr}, 12015 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr}, 12016 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp}, 12017 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log}, 12018 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2}, 12019 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2}, 12020 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 12021 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 12022 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, 12023 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, 12024 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, 12025 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, 12026 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, 12027 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, 12028 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp}, 12029 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3}, 12030 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr}, 12031 [21] = { ALU_OP0_NOP, tgsi_unsupported}, 12032 [22] = { ALU_OP0_NOP, tgsi_unsupported}, 12033 [23] = { ALU_OP0_NOP, tgsi_unsupported}, 12034 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2}, 12035 [25] = { ALU_OP0_NOP, tgsi_unsupported}, 12036 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2}, 12037 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2}, 12038 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr}, 12039 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr}, 12040 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow}, 12041 [31] = { ALU_OP0_NOP, tgsi_unsupported}, 12042 [32] = { ALU_OP0_NOP, tgsi_unsupported}, 12043 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock}, 12044 [34] = { ALU_OP0_NOP, tgsi_unsupported}, 12045 [35] = { ALU_OP0_NOP, tgsi_unsupported}, 12046 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig}, 12047 
[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 12048 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 12049 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */ 12050 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h}, 12051 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported}, 12052 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported}, 12053 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 12054 [44] = { ALU_OP0_NOP, tgsi_unsupported}, 12055 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2}, 12056 [46] = { ALU_OP0_NOP, tgsi_unsupported}, 12057 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2}, 12058 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig}, 12059 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap}, 12060 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2}, 12061 [51] = { ALU_OP0_NOP, tgsi_unsupported}, 12062 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex}, 12063 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex}, 12064 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex}, 12065 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h}, 12066 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported}, 12067 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported}, 12068 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 12069 [59] = { ALU_OP0_NOP, tgsi_unsupported}, 12070 [60] = { ALU_OP0_NOP, tgsi_unsupported}, 12071 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl}, 12072 [62] = { ALU_OP0_NOP, tgsi_unsupported}, 12073 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported}, 12074 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported}, 12075 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg}, 12076 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp}, 12077 [67] = { ALU_OP0_NOP, tgsi_unsupported}, 12078 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 12079 [69] = { ALU_OP0_NOP, tgsi_unsupported}, 12080 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported}, 12081 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 12082 [TGSI_OPCODE_TXL] = { 
FETCH_OP_SAMPLE_L, tgsi_tex}, 12083 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont}, 12084 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if}, 12085 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif}, 12086 [76] = { ALU_OP0_NOP, tgsi_unsupported}, 12087 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else}, 12088 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif}, 12089 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 12090 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 12091 [82] = { ALU_OP0_NOP, tgsi_unsupported}, 12092 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2}, 12093 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2}, 12094 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2}, 12095 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2}, 12096 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2}, 12097 [88] = { ALU_OP0_NOP, tgsi_unsupported}, 12098 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2}, 12099 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2}, 12100 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod}, 12101 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2}, 12102 [93] = { ALU_OP0_NOP, tgsi_unsupported}, 12103 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex}, 12104 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 12105 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont}, 12106 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit}, 12107 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit}, 12108 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop}, 12109 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 12110 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop}, 12111 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 12112 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 12113 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex}, 12114 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq}, 12115 [106] = { ALU_OP0_NOP, tgsi_unsupported}, 12116 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, 
tgsi_unsupported}, 12117 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2}, 12118 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2}, 12119 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap}, 12120 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap}, 12121 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 12122 [113] = { ALU_OP0_NOP, tgsi_unsupported}, 12123 [114] = { ALU_OP0_NOP, tgsi_unsupported}, 12124 [115] = { ALU_OP0_NOP, tgsi_unsupported}, 12125 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */ 12126 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */ 12127 /* Refer below for TGSI_OPCODE_DFMA */ 12128 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2}, 12129 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv}, 12130 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2}, 12131 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2}, 12132 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg}, 12133 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2}, 12134 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2}, 12135 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap}, 12136 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2}, 12137 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2}, 12138 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2}, 12139 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv}, 12140 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad}, 12141 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2}, 12142 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2}, 12143 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod}, 12144 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr}, 12145 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2}, 12146 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2}, 12147 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2}, 12148 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap}, 12149 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2}, 12150 [TGSI_OPCODE_SWITCH] = { 
ALU_OP0_NOP, tgsi_unsupported}, 12151 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported}, 12152 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported}, 12153 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported}, 12154 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported}, 12155 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported}, 12156 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported}, 12157 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported}, 12158 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported}, 12159 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported}, 12160 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported}, 12161 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported}, 12162 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported}, 12163 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported}, 12164 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported}, 12165 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported}, 12166 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl}, 12167 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp}, 12168 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs}, 12169 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg}, 12170 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load}, 12171 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store}, 12172 [163] = { ALU_OP0_NOP, tgsi_unsupported}, 12173 [164] = { ALU_OP0_NOP, tgsi_unsupported}, 12174 [165] = { ALU_OP0_NOP, tgsi_unsupported}, 12175 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 12176 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op}, 12177 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op}, 12178 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op}, 12179 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op}, 12180 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op}, 12181 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op}, 12182 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op}, 12183 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op}, 12184 
[TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op}, 12185 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op}, 12186 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex}, 12187 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 12188 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex}, 12189 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr}, 12190 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr}, 12191 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex}, 12192 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex}, 12193 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe}, 12194 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe}, 12195 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi}, 12196 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2}, 12197 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2}, 12198 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2}, 12199 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb}, 12200 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb}, 12201 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm}, 12202 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm}, 12203 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm}, 12204 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64}, 12205 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest}, 12206 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64}, 12207 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg}, 12208 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64}, 12209 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr}, 12210 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr }, 12211 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64}, 12212 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64}, 12213 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s}, 12214 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest}, 12215 [TGSI_OPCODE_DSEQ] = { 
ALU_OP2_SETE_64, tgsi_op2_64_single_dest}, 12216 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest}, 12217 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr}, 12218 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr}, 12219 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64}, 12220 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64}, 12221 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64}, 12222 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64}, 12223 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp}, 12224 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int}, 12225 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double}, 12226 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int}, 12227 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double}, 12228 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr}, 12229 [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne }, 12230 [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add }, 12231 [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul }, 12232 [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div }, 12233 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported}, 12234}; 12235