1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/** @file brw_fs_generator.cpp 25 * 26 * This file supports generating code from the FS LIR to the actual 27 * native instructions. 28 */ 29 30#include "brw_eu.h" 31#include "brw_fs.h" 32#include "brw_cfg.h" 33#include "util/mesa-sha1.h" 34 35static enum brw_reg_file 36brw_file_from_reg(fs_reg *reg) 37{ 38 switch (reg->file) { 39 case ARF: 40 return BRW_ARCHITECTURE_REGISTER_FILE; 41 case FIXED_GRF: 42 case VGRF: 43 return BRW_GENERAL_REGISTER_FILE; 44 case MRF: 45 return BRW_MESSAGE_REGISTER_FILE; 46 case IMM: 47 return BRW_IMMEDIATE_VALUE; 48 case BAD_FILE: 49 case ATTR: 50 case UNIFORM: 51 unreachable("not reached"); 52 } 53 return BRW_ARCHITECTURE_REGISTER_FILE; 54} 55 56static struct brw_reg 57brw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst, 58 fs_reg *reg, bool compressed) 59{ 60 struct brw_reg brw_reg; 61 62 switch (reg->file) { 63 case MRF: 64 assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver)); 65 FALLTHROUGH; 66 case VGRF: 67 if (reg->stride == 0) { 68 brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0); 69 } else { 70 /* From the Haswell PRM: 71 * 72 * "VertStride must be used to cross GRF register boundaries. This 73 * rule implies that elements within a 'Width' cannot cross GRF 74 * boundaries." 75 * 76 * The maximum width value that could satisfy this restriction is: 77 */ 78 const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type)); 79 80 /* Because the hardware can only split source regions at a whole 81 * multiple of width during decompression (i.e. vertically), clamp 82 * the value obtained above to the physical execution size of a 83 * single decompressed chunk of the instruction: 84 */ 85 const unsigned phys_width = compressed ? inst->exec_size / 2 : 86 inst->exec_size; 87 88 const unsigned max_hw_width = 16; 89 90 /* XXX - The equation above is strictly speaking not correct on 91 * hardware that supports unbalanced GRF writes -- On Gfx9+ 92 * each decompressed chunk of the instruction may have a 93 * different execution size when the number of components 94 * written to each destination GRF is not the same. 
95 */ 96 if (reg->stride > 4) { 97 assert(reg != &inst->dst); 98 assert(reg->stride * type_sz(reg->type) <= REG_SIZE); 99 brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0); 100 brw_reg = stride(brw_reg, reg->stride, 1, 0); 101 } else { 102 const unsigned width = MIN3(reg_width, phys_width, max_hw_width); 103 brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); 104 brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); 105 } 106 107 if (devinfo->verx10 == 70) { 108 /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13): 109 * "Each DF (Double Float) operand uses an element size of 4 rather 110 * than 8 and all regioning parameters are twice what the values 111 * would be based on the true element size: ExecSize, Width, 112 * HorzStride, and VertStride. Each DF operand uses a pair of 113 * channels and all masking and swizzing should be adjusted 114 * appropriately." 115 * 116 * From the IvyBridge PRM (Special Requirements for Handling Double 117 * Precision Data Types, page 71): 118 * "In Align1 mode, all regioning parameters like stride, execution 119 * size, and width must use the syntax of a pair of packed 120 * floats. The offsets for these data types must be 64-bit 121 * aligned. The execution size and regioning parameters are in terms 122 * of floats." 123 * 124 * Summarized: when handling DF-typed arguments, ExecSize, 125 * VertStride, and Width must be doubled. 126 * 127 * It applies to BayTrail too. 128 */ 129 if (type_sz(reg->type) == 8) { 130 brw_reg.width++; 131 if (brw_reg.vstride > 0) 132 brw_reg.vstride++; 133 assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1); 134 } 135 136 /* When converting from DF->F, we set the destination stride to 2 137 * because each d2f conversion implicitly writes 2 floats, being 138 * the first one the converted value. IVB/BYT actually writes two 139 * F components per SIMD channel, and every other component is 140 * filled with garbage. 141 */ 142 if (reg == &inst->dst && get_exec_type_size(inst) == 8 && 143 type_sz(inst->dst.type) < 8) { 144 assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1); 145 brw_reg.hstride--; 146 } 147 } 148 } 149 150 brw_reg = retype(brw_reg, reg->type); 151 brw_reg = byte_offset(brw_reg, reg->offset); 152 brw_reg.abs = reg->abs; 153 brw_reg.negate = reg->negate; 154 break; 155 case ARF: 156 case FIXED_GRF: 157 case IMM: 158 assert(reg->offset == 0); 159 brw_reg = reg->as_brw_reg(); 160 break; 161 case BAD_FILE: 162 /* Probably unused. */ 163 brw_reg = brw_null_reg(); 164 break; 165 case ATTR: 166 case UNIFORM: 167 unreachable("not reached"); 168 } 169 170 /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0> 171 * region, but on IVB and BYT DF regions must be programmed in terms of 172 * floats. A <0,2,1> region accomplishes this. 
173 */ 174 if (devinfo->verx10 == 70 && 175 type_sz(reg->type) == 8 && 176 brw_reg.vstride == BRW_VERTICAL_STRIDE_0 && 177 brw_reg.width == BRW_WIDTH_1 && 178 brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) { 179 brw_reg.width = BRW_WIDTH_2; 180 brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1; 181 } 182 183 return brw_reg; 184} 185 186fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data, 187 void *mem_ctx, 188 struct brw_stage_prog_data *prog_data, 189 bool runtime_check_aads_emit, 190 gl_shader_stage stage) 191 192 : compiler(compiler), log_data(log_data), 193 devinfo(compiler->devinfo), 194 prog_data(prog_data), dispatch_width(0), 195 runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false), 196 shader_name(NULL), stage(stage), mem_ctx(mem_ctx) 197{ 198 p = rzalloc(mem_ctx, struct brw_codegen); 199 brw_init_codegen(devinfo, p, mem_ctx); 200 201 /* In the FS code generator, we are very careful to ensure that we always 202 * set the right execution size so we don't need the EU code to "help" us 203 * by trying to infer it. Sometimes, it infers the wrong thing. 204 */ 205 p->automatic_exec_sizes = false; 206} 207 208fs_generator::~fs_generator() 209{ 210} 211 212class ip_record : public exec_node { 213public: 214 DECLARE_RALLOC_CXX_OPERATORS(ip_record) 215 216 ip_record(int ip) 217 { 218 this->ip = ip; 219 } 220 221 int ip; 222}; 223 224bool 225fs_generator::patch_halt_jumps() 226{ 227 if (this->discard_halt_patches.is_empty()) 228 return false; 229 230 int scale = brw_jump_scale(p->devinfo); 231 232 if (devinfo->ver >= 6) { 233 /* There is a somewhat strange undocumented requirement of using 234 * HALT, according to the simulator. If some channel has HALTed to 235 * a particular UIP, then by the end of the program, every channel 236 * must have HALTed to that UIP. Furthermore, the tracking is a 237 * stack, so you can't do the final halt of a UIP after starting 238 * halting to a new UIP. 239 * 240 * Symptoms of not emitting this instruction on actual hardware 241 * included GPU hangs and sparkly rendering on the piglit discard 242 * tests. 243 */ 244 brw_inst *last_halt = brw_HALT(p); 245 brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); 246 brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); 247 } 248 249 int ip = p->nr_insn; 250 251 foreach_in_list(ip_record, patch_ip, &discard_halt_patches) { 252 brw_inst *patch = &p->store[patch_ip->ip]; 253 254 assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT); 255 if (devinfo->ver >= 6) { 256 /* HALT takes a half-instruction distance from the pre-incremented IP. */ 257 brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); 258 } else { 259 brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale)); 260 } 261 } 262 263 this->discard_halt_patches.make_empty(); 264 265 if (devinfo->ver < 6) { 266 /* From the g965 PRM: 267 * 268 * "As DMask is not automatically reloaded into AMask upon completion 269 * of this instruction, software has to manually restore AMask upon 270 * completion." 271 * 272 * DMask lives in the bottom 16 bits of sr0.1. 
273 */ 274 brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK), 275 retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW)); 276 brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1); 277 brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE); 278 brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE); 279 brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH); 280 } 281 282 if (devinfo->ver == 4 && !devinfo->is_g4x) { 283 /* From the g965 PRM: 284 * 285 * "[DevBW, DevCL] Erratum: The subfields in mask stack register are 286 * reset to zero during graphics reset, however, they are not 287 * initialized at thread dispatch. These subfields will retain the 288 * values from the previous thread. Software should make sure the 289 * mask stack is empty (reset to zero) before terminating the thread. 290 * In case that this is not practical, software may have to reset the 291 * mask stack at the beginning of each kernel, which will impact the 292 * performance." 293 * 294 * Luckily we can rely on: 295 * 296 * "[DevBW, DevCL] This register access restriction is not 297 * applicable, hardware does ensure execution pipeline coherency, 298 * when a mask stack register is used as an explicit source and/or 299 * destination." 300 */ 301 brw_push_insn_state(p); 302 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 303 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 304 305 brw_set_default_exec_size(p, BRW_EXECUTE_2); 306 brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0)); 307 308 brw_set_default_exec_size(p, BRW_EXECUTE_16); 309 /* Reset the if stack. */ 310 brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW), 311 brw_imm_uw(0)); 312 313 brw_pop_insn_state(p); 314 } 315 316 return true; 317} 318 319void 320fs_generator::generate_send(fs_inst *inst, 321 struct brw_reg dst, 322 struct brw_reg desc, 323 struct brw_reg ex_desc, 324 struct brw_reg payload, 325 struct brw_reg payload2) 326{ 327 const bool dst_is_null = dst.file == BRW_ARCHITECTURE_REGISTER_FILE && 328 dst.nr == BRW_ARF_NULL; 329 const unsigned rlen = dst_is_null ? 0 : inst->size_written / REG_SIZE; 330 331 uint32_t desc_imm = inst->desc | 332 brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size); 333 334 uint32_t ex_desc_imm = inst->ex_desc | 335 brw_message_ex_desc(devinfo, inst->ex_mlen); 336 337 if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm) { 338 /* If we have any sort of extended descriptor, then we need SENDS. This 339 * also covers the dual-payload case because ex_mlen goes in ex_desc. 340 */ 341 brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2, 342 desc, desc_imm, ex_desc, ex_desc_imm, 343 inst->eot); 344 if (inst->check_tdr) 345 brw_inst_set_opcode(p->devinfo, brw_last_inst, 346 devinfo->ver >= 12 ? 
BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC); 347 } else { 348 brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm, 349 inst->eot); 350 if (inst->check_tdr) 351 brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC); 352 } 353} 354 355void 356fs_generator::fire_fb_write(fs_inst *inst, 357 struct brw_reg payload, 358 struct brw_reg implied_header, 359 GLuint nr) 360{ 361 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 362 363 if (devinfo->ver < 6) { 364 brw_push_insn_state(p); 365 brw_set_default_exec_size(p, BRW_EXECUTE_8); 366 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 367 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 368 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 369 brw_MOV(p, offset(retype(payload, BRW_REGISTER_TYPE_UD), 1), 370 offset(retype(implied_header, BRW_REGISTER_TYPE_UD), 1)); 371 brw_pop_insn_state(p); 372 } 373 374 uint32_t msg_control = brw_fb_write_msg_control(inst, prog_data); 375 376 /* We assume render targets start at 0, because headerless FB write 377 * messages set "Render Target Index" to 0. Using a different binding 378 * table index would make it impossible to use headerless messages. 379 */ 380 const uint32_t surf_index = inst->target; 381 382 brw_inst *insn = brw_fb_WRITE(p, 383 payload, 384 retype(implied_header, BRW_REGISTER_TYPE_UW), 385 msg_control, 386 surf_index, 387 nr, 388 0, 389 inst->eot, 390 inst->last_rt, 391 inst->header_size != 0); 392 393 if (devinfo->ver >= 6) 394 brw_inst_set_rt_slot_group(devinfo, insn, inst->group / 16); 395} 396 397void 398fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload) 399{ 400 if (devinfo->verx10 <= 70) { 401 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 402 brw_set_default_flag_reg(p, 0, 0); 403 } 404 405 const struct brw_reg implied_header = 406 devinfo->ver < 6 ? payload : brw_null_reg(); 407 408 if (inst->base_mrf >= 0) 409 payload = brw_message_reg(inst->base_mrf); 410 411 if (!runtime_check_aads_emit) { 412 fire_fb_write(inst, payload, implied_header, inst->mlen); 413 } else { 414 /* This can only happen in gen < 6 */ 415 assert(devinfo->ver < 6); 416 417 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); 418 419 /* Check runtime bit to detect if we have to send AA data or not */ 420 brw_push_insn_state(p); 421 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 422 brw_set_default_exec_size(p, BRW_EXECUTE_1); 423 brw_AND(p, 424 v1_null_ud, 425 retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD), 426 brw_imm_ud(1<<26)); 427 brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); 428 429 int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store; 430 brw_pop_insn_state(p); 431 { 432 /* Don't send AA data */ 433 fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1); 434 } 435 brw_land_fwd_jump(p, jmp); 436 fire_fb_write(inst, payload, implied_header, inst->mlen); 437 } 438} 439 440void 441fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst, 442 struct brw_reg payload) 443{ 444 assert(inst->size_written % REG_SIZE == 0); 445 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 446 /* We assume that render targets start at binding table index 0. 
*/ 447 const unsigned surf_index = inst->target; 448 449 gfx9_fb_READ(p, dst, payload, surf_index, 450 inst->header_size, inst->size_written / REG_SIZE, 451 prog_data->persample_dispatch); 452} 453 454void 455fs_generator::generate_mov_indirect(fs_inst *inst, 456 struct brw_reg dst, 457 struct brw_reg reg, 458 struct brw_reg indirect_byte_offset) 459{ 460 assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD); 461 assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE); 462 assert(!reg.abs && !reg.negate); 463 assert(reg.type == dst.type); 464 465 unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr; 466 467 if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) { 468 imm_byte_offset += indirect_byte_offset.ud; 469 470 reg.nr = imm_byte_offset / REG_SIZE; 471 reg.subnr = imm_byte_offset % REG_SIZE; 472 if (type_sz(reg.type) > 4 && !devinfo->has_64bit_float) { 473 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), 474 subscript(reg, BRW_REGISTER_TYPE_D, 0)); 475 brw_set_default_swsb(p, tgl_swsb_null()); 476 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), 477 subscript(reg, BRW_REGISTER_TYPE_D, 1)); 478 } else { 479 brw_MOV(p, dst, reg); 480 } 481 } else { 482 /* Prior to Broadwell, there are only 8 address registers. */ 483 assert(inst->exec_size <= 8 || devinfo->ver >= 8); 484 485 /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ 486 struct brw_reg addr = vec8(brw_address_reg(0)); 487 488 /* Whether we can use destination dependency control without running the 489 * risk of a hang if an instruction gets shot down. 490 */ 491 const bool use_dep_ctrl = !inst->predicate && 492 inst->exec_size == dispatch_width; 493 brw_inst *insn; 494 495 /* The destination stride of an instruction (in bytes) must be greater 496 * than or equal to the size of the rest of the instruction. Since the 497 * address register is of type UW, we can't use a D-type instruction. 498 * In order to get around this, re retype to UW and use a stride. 499 */ 500 indirect_byte_offset = 501 retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW); 502 503 /* There are a number of reasons why we don't use the base offset here. 504 * One reason is that the field is only 9 bits which means we can only 505 * use it to access the first 16 GRFs. Also, from the Haswell PRM 506 * section "Register Region Restrictions": 507 * 508 * "The lower bits of the AddressImmediate must not overflow to 509 * change the register address. The lower 5 bits of Address 510 * Immediate when added to lower 5 bits of address register gives 511 * the sub-register offset. The upper bits of Address Immediate 512 * when added to upper bits of address register gives the register 513 * address. Any overflow from sub-register offset is dropped." 514 * 515 * Since the indirect may cause us to cross a register boundary, this 516 * makes the base offset almost useless. We could try and do something 517 * clever where we use a actual base offset if base_offset % 32 == 0 but 518 * that would mean we were generating different code depending on the 519 * base offset. Instead, for the sake of consistency, we'll just do the 520 * add ourselves. This restriction is only listed in the Haswell PRM 521 * but empirical testing indicates that it applies on all older 522 * generations and is lifted on Broadwell. 523 * 524 * In the end, while base_offset is nice to look at in the generated 525 * code, using it saves us 0 instructions and would require quite a bit 526 * of case-by-case work. It's just not worth it. 
527 * 528 * Due to a hardware bug some platforms (particularly Gfx11+) seem to 529 * require the address components of all channels to be valid whether or 530 * not they're active, which causes issues if we use VxH addressing 531 * under non-uniform control-flow. We can easily work around that by 532 * initializing the whole address register with a pipelined NoMask MOV 533 * instruction. 534 */ 535 if (devinfo->ver >= 7) { 536 insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset)); 537 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); 538 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); 539 if (devinfo->ver >= 12) 540 brw_set_default_swsb(p, tgl_swsb_null()); 541 else 542 brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl); 543 } 544 545 insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); 546 if (devinfo->ver >= 12) 547 brw_set_default_swsb(p, tgl_swsb_regdist(1)); 548 else if (devinfo->ver >= 7) 549 brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl); 550 551 if (type_sz(reg.type) > 4 && 552 ((devinfo->verx10 == 70) || 553 devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) || 554 !devinfo->has_64bit_float || devinfo->verx10 >= 125)) { 555 /* IVB has an issue (which we found empirically) where it reads two 556 * address register components per channel for indirectly addressed 557 * 64-bit sources. 558 * 559 * From the Cherryview PRM Vol 7. "Register Region Restrictions": 560 * 561 * "When source or destination datatype is 64b or operation is 562 * integer DWord multiply, indirect addressing must not be used." 563 * 564 * To work around both of these, we do two integer MOVs insead of one 565 * 64-bit MOV. Because no double value should ever cross a register 566 * boundary, it's safe to use the immediate offset in the indirect 567 * here to handle adding 4 bytes to the offset and avoid the extra 568 * ADD to the register file. 569 */ 570 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), 571 retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D)); 572 brw_set_default_swsb(p, tgl_swsb_null()); 573 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), 574 retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D)); 575 } else { 576 struct brw_reg ind_src = brw_VxH_indirect(0, 0); 577 578 brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type)); 579 580 if (devinfo->ver == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE && 581 !inst->get_next()->is_tail_sentinel() && 582 ((fs_inst *)inst->get_next())->mlen > 0) { 583 /* From the Sandybridge PRM: 584 * 585 * "[Errata: DevSNB(SNB)] If MRF register is updated by any 586 * instruction that “indexed/indirect” source AND is followed 587 * by a send, the instruction requires a “Switch”. This is to 588 * avoid race condition where send may dispatch before MRF is 589 * updated." 590 */ 591 brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH); 592 } 593 } 594 } 595} 596 597void 598fs_generator::generate_shuffle(fs_inst *inst, 599 struct brw_reg dst, 600 struct brw_reg src, 601 struct brw_reg idx) 602{ 603 assert(src.file == BRW_GENERAL_REGISTER_FILE); 604 assert(!src.abs && !src.negate); 605 606 /* Ivy bridge has some strange behavior that makes this a real pain to 607 * implement for 64-bit values so we just don't bother. 608 */ 609 assert(devinfo->verx10 >= 75 || type_sz(src.type) <= 4); 610 611 /* Because we're using the address register, we're limited to 8-wide 612 * execution on gfx7. On gfx8, we're limited to 16-wide by the address 613 * register file and 8-wide for 64-bit types. 
We could try and make this 614 * instruction splittable higher up in the compiler but that gets weird 615 * because it reads all of the channels regardless of execution size. It's 616 * easier just to split it here. 617 */ 618 const unsigned lower_width = 619 devinfo->ver <= 7 || element_sz(src) > 4 || element_sz(dst) > 4 ? 8 : 620 MIN2(16, inst->exec_size); 621 622 brw_set_default_exec_size(p, cvt(lower_width) - 1); 623 for (unsigned group = 0; group < inst->exec_size; group += lower_width) { 624 brw_set_default_group(p, group); 625 626 if ((src.vstride == 0 && src.hstride == 0) || 627 idx.file == BRW_IMMEDIATE_VALUE) { 628 /* Trivial, the source is already uniform or the index is a constant. 629 * We will typically not get here if the optimizer is doing its job, 630 * but asserting would be mean. 631 */ 632 const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0; 633 struct brw_reg group_src = stride(suboffset(src, i), 0, 1, 0); 634 struct brw_reg group_dst = suboffset(dst, group); 635 if (type_sz(src.type) > 4 && !devinfo->has_64bit_float) { 636 brw_MOV(p, subscript(group_dst, BRW_REGISTER_TYPE_UD, 0), 637 subscript(group_src, BRW_REGISTER_TYPE_UD, 0)); 638 brw_set_default_swsb(p, tgl_swsb_null()); 639 brw_MOV(p, subscript(group_dst, BRW_REGISTER_TYPE_UD, 1), 640 subscript(group_src, BRW_REGISTER_TYPE_UD, 1)); 641 } else { 642 brw_MOV(p, group_dst, group_src); 643 } 644 } else { 645 /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ 646 struct brw_reg addr = vec8(brw_address_reg(0)); 647 648 struct brw_reg group_idx = suboffset(idx, group); 649 650 if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) { 651 /* Things get grumpy if the register is too wide. */ 652 group_idx.width--; 653 group_idx.vstride--; 654 } 655 656 assert(type_sz(group_idx.type) <= 4); 657 if (type_sz(group_idx.type) == 4) { 658 /* The destination stride of an instruction (in bytes) must be 659 * greater than or equal to the size of the rest of the 660 * instruction. Since the address register is of type UW, we 661 * can't use a D-type instruction. In order to get around this, 662 * re retype to UW and use a stride. 663 */ 664 group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W); 665 } 666 667 uint32_t src_start_offset = src.nr * REG_SIZE + src.subnr; 668 669 /* From the Haswell PRM: 670 * 671 * "When a sequence of NoDDChk and NoDDClr are used, the last 672 * instruction that completes the scoreboard clear must have a 673 * non-zero execution mask. This means, if any kind of predication 674 * can change the execution mask or channel enable of the last 675 * instruction, the optimization must be avoided. This is to 676 * avoid instructions being shot down the pipeline when no writes 677 * are required." 678 * 679 * Whenever predication is enabled or the instructions being emitted 680 * aren't the full width, it's possible that it will be run with zero 681 * channels enabled so we can't use dependency control without 682 * running the risk of a hang if an instruction gets shot down. 683 */ 684 const bool use_dep_ctrl = !inst->predicate && 685 lower_width == dispatch_width; 686 brw_inst *insn; 687 688 /* Due to a hardware bug some platforms (particularly Gfx11+) seem 689 * to require the address components of all channels to be valid 690 * whether or not they're active, which causes issues if we use VxH 691 * addressing under non-uniform control-flow. We can easily work 692 * around that by initializing the whole address register with a 693 * pipelined NoMask MOV instruction. 
694 */ 695 insn = brw_MOV(p, addr, brw_imm_uw(src_start_offset)); 696 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); 697 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); 698 if (devinfo->ver >= 12) 699 brw_set_default_swsb(p, tgl_swsb_null()); 700 else 701 brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl); 702 703 /* Take into account the component size and horizontal stride. */ 704 assert(src.vstride == src.hstride + src.width); 705 insn = brw_SHL(p, addr, group_idx, 706 brw_imm_uw(util_logbase2(type_sz(src.type)) + 707 src.hstride - 1)); 708 if (devinfo->ver >= 12) 709 brw_set_default_swsb(p, tgl_swsb_regdist(1)); 710 else 711 brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl); 712 713 /* Add on the register start offset */ 714 brw_ADD(p, addr, addr, brw_imm_uw(src_start_offset)); 715 716 if (type_sz(src.type) > 4 && 717 ((devinfo->verx10 == 70) || 718 devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) || 719 !devinfo->has_64bit_float)) { 720 /* IVB has an issue (which we found empirically) where it reads 721 * two address register components per channel for indirectly 722 * addressed 64-bit sources. 723 * 724 * From the Cherryview PRM Vol 7. "Register Region Restrictions": 725 * 726 * "When source or destination datatype is 64b or operation is 727 * integer DWord multiply, indirect addressing must not be 728 * used." 729 * 730 * To work around both of these, we do two integer MOVs insead of 731 * one 64-bit MOV. Because no double value should ever cross a 732 * register boundary, it's safe to use the immediate offset in the 733 * indirect here to handle adding 4 bytes to the offset and avoid 734 * the extra ADD to the register file. 735 */ 736 struct brw_reg gdst = suboffset(dst, group); 737 struct brw_reg dst_d = retype(spread(gdst, 2), 738 BRW_REGISTER_TYPE_D); 739 assert(dst.hstride == 1); 740 brw_MOV(p, dst_d, 741 retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D)); 742 brw_set_default_swsb(p, tgl_swsb_null()); 743 brw_MOV(p, byte_offset(dst_d, 4), 744 retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D)); 745 } else { 746 brw_MOV(p, suboffset(dst, group * dst.hstride), 747 retype(brw_VxH_indirect(0, 0), src.type)); 748 } 749 } 750 751 brw_set_default_swsb(p, tgl_swsb_null()); 752 } 753} 754 755void 756fs_generator::generate_quad_swizzle(const fs_inst *inst, 757 struct brw_reg dst, struct brw_reg src, 758 unsigned swiz) 759{ 760 /* Requires a quad. 
*/ 761 assert(inst->exec_size >= 4); 762 763 if (src.file == BRW_IMMEDIATE_VALUE || 764 has_scalar_region(src)) { 765 /* The value is uniform across all channels */ 766 brw_MOV(p, dst, src); 767 768 } else if (devinfo->ver < 11 && type_sz(src.type) == 4) { 769 /* This only works on 8-wide 32-bit values */ 770 assert(inst->exec_size == 8); 771 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); 772 assert(src.vstride == src.width + 1); 773 brw_set_default_access_mode(p, BRW_ALIGN_16); 774 struct brw_reg swiz_src = stride(src, 4, 4, 1); 775 swiz_src.swizzle = swiz; 776 brw_MOV(p, dst, swiz_src); 777 778 } else { 779 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); 780 assert(src.vstride == src.width + 1); 781 const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0)); 782 783 switch (swiz) { 784 case BRW_SWIZZLE_XXXX: 785 case BRW_SWIZZLE_YYYY: 786 case BRW_SWIZZLE_ZZZZ: 787 case BRW_SWIZZLE_WWWW: 788 brw_MOV(p, dst, stride(src_0, 4, 4, 0)); 789 break; 790 791 case BRW_SWIZZLE_XXZZ: 792 case BRW_SWIZZLE_YYWW: 793 brw_MOV(p, dst, stride(src_0, 2, 2, 0)); 794 break; 795 796 case BRW_SWIZZLE_XYXY: 797 case BRW_SWIZZLE_ZWZW: 798 assert(inst->exec_size == 4); 799 brw_MOV(p, dst, stride(src_0, 0, 2, 1)); 800 break; 801 802 default: 803 assert(inst->force_writemask_all); 804 brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1); 805 806 for (unsigned c = 0; c < 4; c++) { 807 brw_inst *insn = brw_MOV( 808 p, stride(suboffset(dst, c), 809 4 * inst->dst.stride, 1, 4 * inst->dst.stride), 810 stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0)); 811 812 if (devinfo->ver < 12) { 813 brw_inst_set_no_dd_clear(devinfo, insn, c < 3); 814 brw_inst_set_no_dd_check(devinfo, insn, c > 0); 815 } 816 817 brw_set_default_swsb(p, tgl_swsb_null()); 818 } 819 820 break; 821 } 822 } 823} 824 825void 826fs_generator::generate_urb_read(fs_inst *inst, 827 struct brw_reg dst, 828 struct brw_reg header) 829{ 830 assert(inst->size_written % REG_SIZE == 0); 831 assert(header.file == BRW_GENERAL_REGISTER_FILE); 832 assert(header.type == BRW_REGISTER_TYPE_UD); 833 834 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 835 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD)); 836 brw_set_src0(p, send, header); 837 if (devinfo->ver < 12) 838 brw_set_src1(p, send, brw_imm_ud(0u)); 839 840 brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB); 841 brw_inst_set_urb_opcode(p->devinfo, send, GFX8_URB_OPCODE_SIMD8_READ); 842 843 if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT) 844 brw_inst_set_urb_per_slot_offset(p->devinfo, send, true); 845 846 brw_inst_set_mlen(p->devinfo, send, inst->mlen); 847 brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE); 848 brw_inst_set_header_present(p->devinfo, send, true); 849 brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset); 850} 851 852void 853fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload) 854{ 855 brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND); 856 857 brw_set_dest(p, insn, brw_null_reg()); 858 brw_set_src0(p, insn, payload); 859 if (devinfo->ver < 12) 860 brw_set_src1(p, insn, brw_imm_ud(0u)); 861 862 brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB); 863 brw_inst_set_urb_opcode(p->devinfo, insn, GFX8_URB_OPCODE_SIMD8_WRITE); 864 865 if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT || 866 inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) 867 brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true); 868 869 if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED || 870 inst->opcode == 
SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) 871 brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true); 872 873 brw_inst_set_mlen(p->devinfo, insn, inst->mlen); 874 brw_inst_set_rlen(p->devinfo, insn, 0); 875 brw_inst_set_eot(p->devinfo, insn, inst->eot); 876 brw_inst_set_header_present(p->devinfo, insn, true); 877 brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset); 878} 879 880void 881fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload) 882{ 883 struct brw_inst *insn; 884 885 insn = brw_next_insn(p, BRW_OPCODE_SEND); 886 887 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); 888 brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW)); 889 if (devinfo->ver < 12) 890 brw_set_src1(p, insn, brw_imm_ud(0u)); 891 892 /* For XeHP and newer send a message to the message gateway to terminate a 893 * compute shader. For older devices, a message is sent to the thread 894 * spawner. 895 */ 896 if (devinfo->verx10 >= 125) 897 brw_inst_set_sfid(devinfo, insn, BRW_SFID_MESSAGE_GATEWAY); 898 else 899 brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER); 900 brw_inst_set_mlen(devinfo, insn, 1); 901 brw_inst_set_rlen(devinfo, insn, 0); 902 brw_inst_set_eot(devinfo, insn, inst->eot); 903 brw_inst_set_header_present(devinfo, insn, false); 904 905 brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */ 906 907 if (devinfo->ver < 11) { 908 brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */ 909 910 /* Note that even though the thread has a URB resource associated with it, 911 * we set the "do not dereference URB" bit, because the URB resource is 912 * managed by the fixed-function unit, so it will free it automatically. 913 */ 914 brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */ 915 } 916 917 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); 918} 919 920void 921fs_generator::generate_barrier(fs_inst *, struct brw_reg src) 922{ 923 brw_barrier(p, src); 924 if (devinfo->ver >= 12) { 925 brw_set_default_swsb(p, tgl_swsb_null()); 926 brw_SYNC(p, TGL_SYNC_BAR); 927 } else { 928 brw_WAIT(p); 929 } 930} 931 932bool 933fs_generator::generate_linterp(fs_inst *inst, 934 struct brw_reg dst, struct brw_reg *src) 935{ 936 /* PLN reads: 937 * / in SIMD16 \ 938 * ----------------------------------- 939 * | src1+0 | src1+1 | src1+2 | src1+3 | 940 * |-----------------------------------| 941 * |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)| 942 * ----------------------------------- 943 * 944 * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys: 945 * 946 * ----------------------------------- 947 * | src1+0 | src1+1 | src1+2 | src1+3 | 948 * |-----------------------------------| 949 * |(x0, x1)|(y0, y1)| | | in SIMD8 950 * |-----------------------------------| 951 * |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16 952 * ----------------------------------- 953 * 954 * See also: emit_interpolation_setup_gfx4(). 955 */ 956 struct brw_reg delta_x = src[0]; 957 struct brw_reg delta_y = offset(src[0], inst->exec_size / 8); 958 struct brw_reg interp = src[1]; 959 brw_inst *i[2]; 960 961 /* nir_lower_interpolation() will do the lowering to MAD instructions for 962 * us on gfx11+ 963 */ 964 assert(devinfo->ver < 11); 965 966 if (devinfo->has_pln) { 967 if (devinfo->ver <= 6 && (delta_x.nr & 1) != 0) { 968 /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane": 969 * 970 * "[DevSNB]:<src1> must be even register aligned. 971 * 972 * This restriction is lifted on Ivy Bridge. 
973 * 974 * This means that we need to split PLN into LINE+MAC on-the-fly. 975 * Unfortunately, the inputs are laid out for PLN and not LINE+MAC so 976 * we have to split into SIMD8 pieces. For gfx4 (!has_pln), the 977 * coordinate registers are laid out differently so we leave it as a 978 * SIMD16 instruction. 979 */ 980 assert(inst->exec_size == 8 || inst->exec_size == 16); 981 assert(inst->group % 16 == 0); 982 983 brw_push_insn_state(p); 984 brw_set_default_exec_size(p, BRW_EXECUTE_8); 985 986 /* Thanks to two accumulators, we can emit all the LINEs and then all 987 * the MACs. This improves parallelism a bit. 988 */ 989 for (unsigned g = 0; g < inst->exec_size / 8; g++) { 990 brw_inst *line = brw_LINE(p, brw_null_reg(), interp, 991 offset(delta_x, g * 2)); 992 brw_inst_set_group(devinfo, line, inst->group + g * 8); 993 994 /* LINE writes the accumulator automatically on gfx4-5. On Sandy 995 * Bridge and later, we have to explicitly enable it. 996 */ 997 if (devinfo->ver >= 6) 998 brw_inst_set_acc_wr_control(p->devinfo, line, true); 999 1000 /* brw_set_default_saturate() is called before emitting 1001 * instructions, so the saturate bit is set in each instruction, 1002 * so we need to unset it on the LINE instructions. 1003 */ 1004 brw_inst_set_saturate(p->devinfo, line, false); 1005 } 1006 1007 for (unsigned g = 0; g < inst->exec_size / 8; g++) { 1008 brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1), 1009 offset(delta_x, g * 2 + 1)); 1010 brw_inst_set_group(devinfo, mac, inst->group + g * 8); 1011 brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod); 1012 } 1013 1014 brw_pop_insn_state(p); 1015 1016 return true; 1017 } else { 1018 brw_PLN(p, dst, interp, delta_x); 1019 1020 return false; 1021 } 1022 } else { 1023 i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x); 1024 i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y); 1025 1026 brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod); 1027 1028 /* brw_set_default_saturate() is called before emitting instructions, so 1029 * the saturate bit is set in each instruction, so we need to unset it on 1030 * the first instruction. 
1031 */ 1032 brw_inst_set_saturate(p->devinfo, i[0], false); 1033 1034 return true; 1035 } 1036} 1037 1038void 1039fs_generator::generate_get_buffer_size(fs_inst *inst, 1040 struct brw_reg dst, 1041 struct brw_reg src, 1042 struct brw_reg surf_index) 1043{ 1044 assert(devinfo->ver >= 7); 1045 assert(surf_index.file == BRW_IMMEDIATE_VALUE); 1046 1047 uint32_t simd_mode; 1048 int rlen = 4; 1049 1050 switch (inst->exec_size) { 1051 case 8: 1052 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 1053 break; 1054 case 16: 1055 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1056 break; 1057 default: 1058 unreachable("Invalid width for texture instruction"); 1059 } 1060 1061 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 1062 rlen = 8; 1063 dst = vec16(dst); 1064 } 1065 1066 brw_SAMPLE(p, 1067 retype(dst, BRW_REGISTER_TYPE_UW), 1068 inst->base_mrf, 1069 src, 1070 surf_index.ud, 1071 0, 1072 GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO, 1073 rlen, /* response length */ 1074 inst->mlen, 1075 inst->header_size > 0, 1076 simd_mode, 1077 BRW_SAMPLER_RETURN_FORMAT_SINT32); 1078} 1079 1080void 1081fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, 1082 struct brw_reg surface_index, 1083 struct brw_reg sampler_index) 1084{ 1085 assert(devinfo->ver < 7); 1086 assert(inst->size_written % REG_SIZE == 0); 1087 int msg_type = -1; 1088 uint32_t simd_mode; 1089 uint32_t return_format; 1090 1091 /* Sampler EOT message of less than the dispatch width would kill the 1092 * thread prematurely. 1093 */ 1094 assert(!inst->eot || inst->exec_size == dispatch_width); 1095 1096 switch (dst.type) { 1097 case BRW_REGISTER_TYPE_D: 1098 return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32; 1099 break; 1100 case BRW_REGISTER_TYPE_UD: 1101 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; 1102 break; 1103 default: 1104 return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; 1105 break; 1106 } 1107 1108 /* Stomp the resinfo output type to UINT32. On gens 4-5, the output type 1109 * is set as part of the message descriptor. On gfx4, the PRM seems to 1110 * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on 1111 * later gens UINT32 is required. Once you hit Sandy Bridge, the bit is 1112 * gone from the message descriptor entirely and you just get UINT32 all 1113 * the time regasrdless. Since we can really only do non-UINT32 on gfx4, 1114 * just stomp it to UINT32 all the time. 
1115 */ 1116 if (inst->opcode == SHADER_OPCODE_TXS) 1117 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; 1118 1119 switch (inst->exec_size) { 1120 case 8: 1121 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 1122 break; 1123 case 16: 1124 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1125 break; 1126 default: 1127 unreachable("Invalid width for texture instruction"); 1128 } 1129 1130 if (devinfo->ver >= 5) { 1131 switch (inst->opcode) { 1132 case SHADER_OPCODE_TEX: 1133 if (inst->shadow_compare) { 1134 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE; 1135 } else { 1136 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE; 1137 } 1138 break; 1139 case FS_OPCODE_TXB: 1140 if (inst->shadow_compare) { 1141 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE; 1142 } else { 1143 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS; 1144 } 1145 break; 1146 case SHADER_OPCODE_TXL: 1147 if (inst->shadow_compare) { 1148 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; 1149 } else { 1150 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD; 1151 } 1152 break; 1153 case SHADER_OPCODE_TXS: 1154 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO; 1155 break; 1156 case SHADER_OPCODE_TXD: 1157 assert(!inst->shadow_compare); 1158 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS; 1159 break; 1160 case SHADER_OPCODE_TXF: 1161 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD; 1162 break; 1163 case SHADER_OPCODE_TXF_CMS: 1164 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD; 1165 break; 1166 case SHADER_OPCODE_LOD: 1167 msg_type = GFX5_SAMPLER_MESSAGE_LOD; 1168 break; 1169 case SHADER_OPCODE_TG4: 1170 assert(devinfo->ver == 6); 1171 assert(!inst->shadow_compare); 1172 msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4; 1173 break; 1174 case SHADER_OPCODE_SAMPLEINFO: 1175 msg_type = GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; 1176 break; 1177 default: 1178 unreachable("not reached"); 1179 } 1180 } else { 1181 switch (inst->opcode) { 1182 case SHADER_OPCODE_TEX: 1183 /* Note that G45 and older determines shadow compare and dispatch width 1184 * from message length for most messages. 
1185 */ 1186 if (inst->exec_size == 8) { 1187 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; 1188 if (inst->shadow_compare) { 1189 assert(inst->mlen == 6); 1190 } else { 1191 assert(inst->mlen <= 4); 1192 } 1193 } else { 1194 if (inst->shadow_compare) { 1195 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE; 1196 assert(inst->mlen == 9); 1197 } else { 1198 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE; 1199 assert(inst->mlen <= 7 && inst->mlen % 2 == 1); 1200 } 1201 } 1202 break; 1203 case FS_OPCODE_TXB: 1204 if (inst->shadow_compare) { 1205 assert(inst->exec_size == 8); 1206 assert(inst->mlen == 6); 1207 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; 1208 } else { 1209 assert(inst->mlen == 9); 1210 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; 1211 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1212 } 1213 break; 1214 case SHADER_OPCODE_TXL: 1215 if (inst->shadow_compare) { 1216 assert(inst->exec_size == 8); 1217 assert(inst->mlen == 6); 1218 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; 1219 } else { 1220 assert(inst->mlen == 9); 1221 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; 1222 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1223 } 1224 break; 1225 case SHADER_OPCODE_TXD: 1226 /* There is no sample_d_c message; comparisons are done manually */ 1227 assert(inst->exec_size == 8); 1228 assert(inst->mlen == 7 || inst->mlen == 10); 1229 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS; 1230 break; 1231 case SHADER_OPCODE_TXF: 1232 assert(inst->mlen <= 9 && inst->mlen % 2 == 1); 1233 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD; 1234 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1235 break; 1236 case SHADER_OPCODE_TXS: 1237 assert(inst->mlen == 3); 1238 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO; 1239 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1240 break; 1241 default: 1242 unreachable("not reached"); 1243 } 1244 } 1245 assert(msg_type != -1); 1246 1247 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 1248 dst = vec16(dst); 1249 } 1250 1251 assert(sampler_index.type == BRW_REGISTER_TYPE_UD); 1252 1253 /* Load the message header if present. If there's a texture offset, 1254 * we need to set it up explicitly and load the offset bitfield. 1255 * Otherwise, we can use an implied move from g0 to the first message reg. 1256 */ 1257 struct brw_reg src = brw_null_reg(); 1258 if (inst->header_size != 0) { 1259 if (devinfo->ver < 6 && !inst->offset) { 1260 /* Set up an implied move from g0 to the MRF. */ 1261 src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 1262 } else { 1263 const tgl_swsb swsb = brw_get_default_swsb(p); 1264 assert(inst->base_mrf != -1); 1265 struct brw_reg header_reg = brw_message_reg(inst->base_mrf); 1266 1267 brw_push_insn_state(p); 1268 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 1269 brw_set_default_exec_size(p, BRW_EXECUTE_8); 1270 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1271 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 1272 /* Explicitly set up the message header by copying g0 to the MRF. */ 1273 brw_MOV(p, header_reg, brw_vec8_grf(0, 0)); 1274 brw_set_default_swsb(p, tgl_swsb_regdist(1)); 1275 1276 brw_set_default_exec_size(p, BRW_EXECUTE_1); 1277 if (inst->offset) { 1278 /* Set the offset bits in DWord 2. 
*/ 1279 brw_MOV(p, get_element_ud(header_reg, 2), 1280 brw_imm_ud(inst->offset)); 1281 } 1282 1283 brw_pop_insn_state(p); 1284 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 1285 } 1286 } 1287 1288 uint32_t base_binding_table_index; 1289 switch (inst->opcode) { 1290 case SHADER_OPCODE_TG4: 1291 base_binding_table_index = prog_data->binding_table.gather_texture_start; 1292 break; 1293 default: 1294 base_binding_table_index = prog_data->binding_table.texture_start; 1295 break; 1296 } 1297 1298 assert(surface_index.file == BRW_IMMEDIATE_VALUE); 1299 assert(sampler_index.file == BRW_IMMEDIATE_VALUE); 1300 1301 brw_SAMPLE(p, 1302 retype(dst, BRW_REGISTER_TYPE_UW), 1303 inst->base_mrf, 1304 src, 1305 surface_index.ud + base_binding_table_index, 1306 sampler_index.ud % 16, 1307 msg_type, 1308 inst->size_written / REG_SIZE, 1309 inst->mlen, 1310 inst->header_size != 0, 1311 simd_mode, 1312 return_format); 1313} 1314 1315 1316/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input 1317 * looking like: 1318 * 1319 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br 1320 * 1321 * Ideally, we want to produce: 1322 * 1323 * DDX DDY 1324 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) 1325 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) 1326 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) 1327 * (ss0.br - ss0.bl) (ss0.tr - ss0.br) 1328 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) 1329 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) 1330 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) 1331 * (ss1.br - ss1.bl) (ss1.tr - ss1.br) 1332 * 1333 * and add another set of two more subspans if in 16-pixel dispatch mode. 1334 * 1335 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result 1336 * for each pair, and vertstride = 2 jumps us 2 elements after processing a 1337 * pair. But the ideal approximation may impose a huge performance cost on 1338 * sample_d. On at least Haswell, sample_d instruction does some 1339 * optimizations if the same LOD is used for all pixels in the subspan. 1340 * 1341 * For DDY, we need to use ALIGN16 mode since it's capable of doing the 1342 * appropriate swizzling. 1343 */ 1344void 1345fs_generator::generate_ddx(const fs_inst *inst, 1346 struct brw_reg dst, struct brw_reg src) 1347{ 1348 unsigned vstride, width; 1349 1350 if (devinfo->ver >= 8) { 1351 if (inst->opcode == FS_OPCODE_DDX_FINE) { 1352 /* produce accurate derivatives */ 1353 vstride = BRW_VERTICAL_STRIDE_2; 1354 width = BRW_WIDTH_2; 1355 } else { 1356 /* replicate the derivative at the top-left pixel to other pixels */ 1357 vstride = BRW_VERTICAL_STRIDE_4; 1358 width = BRW_WIDTH_4; 1359 } 1360 1361 struct brw_reg src0 = byte_offset(src, type_sz(src.type));; 1362 struct brw_reg src1 = src; 1363 1364 src0.vstride = vstride; 1365 src0.width = width; 1366 src0.hstride = BRW_HORIZONTAL_STRIDE_0; 1367 src1.vstride = vstride; 1368 src1.width = width; 1369 src1.hstride = BRW_HORIZONTAL_STRIDE_0; 1370 1371 brw_ADD(p, dst, src0, negate(src1)); 1372 } else { 1373 /* On Haswell and earlier, the region used above appears to not work 1374 * correctly for compressed instructions. At least on Haswell and 1375 * Iron Lake, compressed ALIGN16 instructions do work. Since we 1376 * would have to split to SIMD8 no matter which method we choose, we 1377 * may as well use ALIGN16 on all platforms gfx7 and earlier. 
1378 */ 1379 struct brw_reg src0 = stride(src, 4, 4, 1); 1380 struct brw_reg src1 = stride(src, 4, 4, 1); 1381 if (inst->opcode == FS_OPCODE_DDX_FINE) { 1382 src0.swizzle = BRW_SWIZZLE_XXZZ; 1383 src1.swizzle = BRW_SWIZZLE_YYWW; 1384 } else { 1385 src0.swizzle = BRW_SWIZZLE_XXXX; 1386 src1.swizzle = BRW_SWIZZLE_YYYY; 1387 } 1388 1389 brw_push_insn_state(p); 1390 brw_set_default_access_mode(p, BRW_ALIGN_16); 1391 brw_ADD(p, dst, negate(src0), src1); 1392 brw_pop_insn_state(p); 1393 } 1394} 1395 1396/* The negate_value boolean is used to negate the derivative computation for 1397 * FBOs, since they place the origin at the upper left instead of the lower 1398 * left. 1399 */ 1400void 1401fs_generator::generate_ddy(const fs_inst *inst, 1402 struct brw_reg dst, struct brw_reg src) 1403{ 1404 const uint32_t type_size = type_sz(src.type); 1405 1406 if (inst->opcode == FS_OPCODE_DDY_FINE) { 1407 /* produce accurate derivatives. 1408 * 1409 * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU) 1410 * "Register Region Restrictions", Section "1. Special Restrictions": 1411 * 1412 * "In Align16 mode, the channel selects and channel enables apply to 1413 * a pair of half-floats, because these parameters are defined for 1414 * DWord elements ONLY. This is applicable when both source and 1415 * destination are half-floats." 1416 * 1417 * So for half-float operations we use the Gfx11+ Align1 path. CHV 1418 * inherits its FP16 hardware from SKL, so it is not affected. 1419 */ 1420 if (devinfo->ver >= 11 || 1421 (devinfo->is_broadwell && src.type == BRW_REGISTER_TYPE_HF)) { 1422 src = stride(src, 0, 2, 1); 1423 1424 brw_push_insn_state(p); 1425 brw_set_default_exec_size(p, BRW_EXECUTE_4); 1426 for (uint32_t g = 0; g < inst->exec_size; g += 4) { 1427 brw_set_default_group(p, inst->group + g); 1428 brw_ADD(p, byte_offset(dst, g * type_size), 1429 negate(byte_offset(src, g * type_size)), 1430 byte_offset(src, (g + 2) * type_size)); 1431 brw_set_default_swsb(p, tgl_swsb_null()); 1432 } 1433 brw_pop_insn_state(p); 1434 } else { 1435 struct brw_reg src0 = stride(src, 4, 4, 1); 1436 struct brw_reg src1 = stride(src, 4, 4, 1); 1437 src0.swizzle = BRW_SWIZZLE_XYXY; 1438 src1.swizzle = BRW_SWIZZLE_ZWZW; 1439 1440 brw_push_insn_state(p); 1441 brw_set_default_access_mode(p, BRW_ALIGN_16); 1442 brw_ADD(p, dst, negate(src0), src1); 1443 brw_pop_insn_state(p); 1444 } 1445 } else { 1446 /* replicate the derivative at the top-left pixel to other pixels */ 1447 if (devinfo->ver >= 8) { 1448 struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size); 1449 struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size); 1450 1451 brw_ADD(p, dst, negate(src0), src1); 1452 } else { 1453 /* On Haswell and earlier, the region used above appears to not work 1454 * correctly for compressed instructions. At least on Haswell and 1455 * Iron Lake, compressed ALIGN16 instructions do work. Since we 1456 * would have to split to SIMD8 no matter which method we choose, we 1457 * may as well use ALIGN16 on all platforms gfx7 and earlier. 
1458 */ 1459 struct brw_reg src0 = stride(src, 4, 4, 1); 1460 struct brw_reg src1 = stride(src, 4, 4, 1); 1461 src0.swizzle = BRW_SWIZZLE_XXXX; 1462 src1.swizzle = BRW_SWIZZLE_ZZZZ; 1463 1464 brw_push_insn_state(p); 1465 brw_set_default_access_mode(p, BRW_ALIGN_16); 1466 brw_ADD(p, dst, negate(src0), src1); 1467 brw_pop_insn_state(p); 1468 } 1469 } 1470} 1471 1472void 1473fs_generator::generate_halt(fs_inst *) 1474{ 1475 /* This HALT will be patched up at FB write time to point UIP at the end of 1476 * the program, and at brw_uip_jip() JIP will be set to the end of the 1477 * current block (or the program). 1478 */ 1479 this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn)); 1480 brw_HALT(p); 1481} 1482 1483void 1484fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src) 1485{ 1486 /* The 32-wide messages only respect the first 16-wide half of the channel 1487 * enable signals which are replicated identically for the second group of 1488 * 16 channels, so we cannot use them unless the write is marked 1489 * force_writemask_all. 1490 */ 1491 const unsigned lower_size = inst->force_writemask_all ? inst->exec_size : 1492 MIN2(16, inst->exec_size); 1493 const unsigned block_size = 4 * lower_size / REG_SIZE; 1494 const tgl_swsb swsb = brw_get_default_swsb(p); 1495 assert(inst->mlen != 0); 1496 1497 brw_push_insn_state(p); 1498 brw_set_default_exec_size(p, cvt(lower_size) - 1); 1499 brw_set_default_compression(p, lower_size > 8); 1500 1501 for (unsigned i = 0; i < inst->exec_size / lower_size; i++) { 1502 brw_set_default_group(p, inst->group + lower_size * i); 1503 1504 if (i > 0) { 1505 assert(swsb.mode & TGL_SBID_SET); 1506 brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_SRC, swsb.sbid)); 1507 } else { 1508 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 1509 } 1510 1511 brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0), 1512 retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD)); 1513 1514 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 1515 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1516 block_size, 1517 inst->offset + block_size * REG_SIZE * i); 1518 } 1519 1520 brw_pop_insn_state(p); 1521} 1522 1523void 1524fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst) 1525{ 1526 assert(inst->exec_size <= 16 || inst->force_writemask_all); 1527 assert(inst->mlen != 0); 1528 1529 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1530 inst->exec_size / 8, inst->offset); 1531} 1532 1533void 1534fs_generator::generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst) 1535{ 1536 assert(inst->exec_size <= 16 || inst->force_writemask_all); 1537 1538 gfx7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset); 1539} 1540 1541/* The A32 messages take a buffer base address in header.5:[31:0] (See 1542 * MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered 1543 * and OWord block messages in the SKL PRM Vol. 2d for more details.) 1544 * Unfortunately, there are a number of subtle differences: 1545 * 1546 * For the block read/write messages: 1547 * 1548 * - We always stomp header.2 to fill in the actual scratch address (in 1549 * units of OWORDs) so we don't care what's in there. 1550 * 1551 * - They rely on per-thread scratch space value in header.3[3:0] to do 1552 * bounds checking so that needs to be valid. The upper bits of 1553 * header.3 are ignored, though, so we can copy all of g0.3. 1554 * 1555 * - They ignore header.5[9:0] and assumes the address is 1KB aligned. 
1556 * 1557 * 1558 * For the byte/dword scattered read/write messages: 1559 * 1560 * - We want header.2 to be zero because that gets added to the per-channel 1561 * offset in the non-header portion of the message. 1562 * 1563 * - Contrary to what the docs claim, they don't do any bounds checking so 1564 * the value of header.3[3:0] doesn't matter. 1565 * 1566 * - They consider all of header.5 for the base address and header.5[9:0] 1567 * are not ignored. This means that we can't copy g0.5 verbatim because 1568 * g0.5[9:0] contains the FFTID on most platforms. Instead, we have to 1569 * use an AND to mask off the bottom 10 bits. 1570 * 1571 * 1572 * For block messages, just copying g0 gives a valid header because all the 1573 * garbage gets ignored except for header.2 which we stomp as part of message 1574 * setup. For byte/dword scattered messages, we can just zero out the header 1575 * and copy over the bits we need from g0.5. This opcode, however, tries to 1576 * satisfy the requirements of both by starting with 0 and filling out the 1577 * information required by either set of opcodes. 1578 */ 1579void 1580fs_generator::generate_scratch_header(fs_inst *inst, struct brw_reg dst) 1581{ 1582 assert(inst->exec_size == 8 && inst->force_writemask_all); 1583 assert(dst.file == BRW_GENERAL_REGISTER_FILE); 1584 1585 dst.type = BRW_REGISTER_TYPE_UD; 1586 1587 brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0)); 1588 if (devinfo->ver >= 12) 1589 brw_set_default_swsb(p, tgl_swsb_null()); 1590 else 1591 brw_inst_set_no_dd_clear(p->devinfo, insn, true); 1592 1593 /* Copy the per-thread scratch space size from g0.3[3:0] */ 1594 brw_set_default_exec_size(p, BRW_EXECUTE_1); 1595 insn = brw_AND(p, suboffset(dst, 3), 1596 retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD), 1597 brw_imm_ud(INTEL_MASK(3, 0))); 1598 if (devinfo->ver < 12) { 1599 brw_inst_set_no_dd_clear(p->devinfo, insn, true); 1600 brw_inst_set_no_dd_check(p->devinfo, insn, true); 1601 } 1602 1603 /* Copy the scratch base address from g0.5[31:10] */ 1604 insn = brw_AND(p, suboffset(dst, 5), 1605 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), 1606 brw_imm_ud(INTEL_MASK(31, 10))); 1607 if (devinfo->ver < 12) 1608 brw_inst_set_no_dd_check(p->devinfo, insn, true); 1609} 1610 1611void 1612fs_generator::generate_uniform_pull_constant_load(fs_inst *inst, 1613 struct brw_reg dst, 1614 struct brw_reg index, 1615 struct brw_reg offset) 1616{ 1617 assert(type_sz(dst.type) == 4); 1618 assert(inst->mlen != 0); 1619 1620 assert(index.file == BRW_IMMEDIATE_VALUE && 1621 index.type == BRW_REGISTER_TYPE_UD); 1622 uint32_t surf_index = index.ud; 1623 1624 assert(offset.file == BRW_IMMEDIATE_VALUE && 1625 offset.type == BRW_REGISTER_TYPE_UD); 1626 uint32_t read_offset = offset.ud; 1627 1628 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 1629 read_offset, surf_index); 1630} 1631 1632void 1633fs_generator::generate_uniform_pull_constant_load_gfx7(fs_inst *inst, 1634 struct brw_reg dst, 1635 struct brw_reg index, 1636 struct brw_reg payload) 1637{ 1638 assert(index.type == BRW_REGISTER_TYPE_UD); 1639 assert(payload.file == BRW_GENERAL_REGISTER_FILE); 1640 assert(type_sz(dst.type) == 4); 1641 1642 if (index.file == BRW_IMMEDIATE_VALUE) { 1643 const uint32_t surf_index = index.ud; 1644 1645 brw_push_insn_state(p); 1646 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1647 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 1648 brw_pop_insn_state(p); 1649 1650 brw_inst_set_sfid(devinfo, send, GFX6_SFID_DATAPORT_CONSTANT_CACHE); 1651 brw_set_dest(p, 
send, retype(dst, BRW_REGISTER_TYPE_UD)); 1652 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); 1653 brw_set_desc(p, send, 1654 brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written, 1655 REG_SIZE), true) | 1656 brw_dp_desc(devinfo, surf_index, 1657 GFX7_DATAPORT_DC_OWORD_BLOCK_READ, 1658 BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size))); 1659 1660 } else { 1661 const tgl_swsb swsb = brw_get_default_swsb(p); 1662 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); 1663 1664 brw_push_insn_state(p); 1665 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1666 1667 /* a0.0 = surf_index & 0xff */ 1668 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 1669 brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); 1670 brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1); 1671 brw_set_dest(p, insn_and, addr); 1672 brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD))); 1673 brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); 1674 1675 /* dst = send(payload, a0.0 | <descriptor>) */ 1676 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 1677 brw_send_indirect_message( 1678 p, GFX6_SFID_DATAPORT_CONSTANT_CACHE, 1679 retype(dst, BRW_REGISTER_TYPE_UD), 1680 retype(payload, BRW_REGISTER_TYPE_UD), addr, 1681 brw_message_desc(devinfo, 1, 1682 DIV_ROUND_UP(inst->size_written, REG_SIZE), true) | 1683 brw_dp_desc(devinfo, 0 /* surface */, 1684 GFX7_DATAPORT_DC_OWORD_BLOCK_READ, 1685 BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)), 1686 false /* EOT */); 1687 1688 brw_pop_insn_state(p); 1689 } 1690} 1691 1692void 1693fs_generator::generate_varying_pull_constant_load_gfx4(fs_inst *inst, 1694 struct brw_reg dst, 1695 struct brw_reg index) 1696{ 1697 assert(devinfo->ver < 7); /* Should use the gfx7 variant. */ 1698 assert(inst->header_size != 0); 1699 assert(inst->mlen); 1700 1701 assert(index.file == BRW_IMMEDIATE_VALUE && 1702 index.type == BRW_REGISTER_TYPE_UD); 1703 uint32_t surf_index = index.ud; 1704 1705 uint32_t simd_mode, rlen, msg_type; 1706 if (inst->exec_size == 16) { 1707 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1708 rlen = 8; 1709 } else { 1710 assert(inst->exec_size == 8); 1711 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 1712 rlen = 4; 1713 } 1714 1715 if (devinfo->ver >= 5) 1716 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD; 1717 else { 1718 /* We always use the SIMD16 message so that we only have to load U, and 1719 * not V or R. 1720 */ 1721 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD; 1722 assert(inst->mlen == 3); 1723 assert(inst->size_written == 8 * REG_SIZE); 1724 rlen = 8; 1725 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1726 } 1727 1728 struct brw_reg header = brw_vec8_grf(0, 0); 1729 gfx6_resolve_implied_move(p, &header, inst->base_mrf); 1730 1731 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 1732 brw_inst_set_compression(devinfo, send, false); 1733 brw_inst_set_sfid(devinfo, send, BRW_SFID_SAMPLER); 1734 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW)); 1735 brw_set_src0(p, send, header); 1736 if (devinfo->ver < 6) 1737 brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf); 1738 1739 /* Our surface is set up as floats, regardless of what actual data is 1740 * stored in it. 
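 * We therefore always ask the sampler for a FLOAT32 return format below.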
1741 */ 1742 uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; 1743 brw_set_desc(p, send, 1744 brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size) | 1745 brw_sampler_desc(devinfo, surf_index, 1746 0, /* sampler (unused) */ 1747 msg_type, simd_mode, return_format)); 1748} 1749 1750void 1751fs_generator::generate_pixel_interpolator_query(fs_inst *inst, 1752 struct brw_reg dst, 1753 struct brw_reg src, 1754 struct brw_reg msg_data, 1755 unsigned msg_type) 1756{ 1757 const bool has_payload = inst->src[0].file != BAD_FILE; 1758 assert(msg_data.type == BRW_REGISTER_TYPE_UD); 1759 assert(inst->size_written % REG_SIZE == 0); 1760 1761 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 1762 1763 brw_pixel_interpolator_query(p, 1764 retype(dst, BRW_REGISTER_TYPE_UW), 1765 /* If we don't have a payload, what we send doesn't matter */ 1766 has_payload ? src : brw_vec8_grf(0, 0), 1767 inst->pi_noperspective, 1768 prog_data->per_coarse_pixel_dispatch, 1769 msg_type, 1770 msg_data, 1771 has_payload ? 2 * inst->exec_size / 8 : 1, 1772 inst->size_written / REG_SIZE); 1773} 1774 1775/* Sets vstride=1, width=4, hstride=0 of register src1 during 1776 * the ADD instruction. 1777 */ 1778void 1779fs_generator::generate_set_sample_id(fs_inst *inst, 1780 struct brw_reg dst, 1781 struct brw_reg src0, 1782 struct brw_reg src1) 1783{ 1784 assert(dst.type == BRW_REGISTER_TYPE_D || 1785 dst.type == BRW_REGISTER_TYPE_UD); 1786 assert(src0.type == BRW_REGISTER_TYPE_D || 1787 src0.type == BRW_REGISTER_TYPE_UD); 1788 1789 const struct brw_reg reg = stride(src1, 1, 4, 0); 1790 const unsigned lower_size = MIN2(inst->exec_size, 1791 devinfo->ver >= 8 ? 16 : 8); 1792 1793 for (unsigned i = 0; i < inst->exec_size / lower_size; i++) { 1794 brw_inst *insn = brw_ADD(p, offset(dst, i * lower_size / 8), 1795 offset(src0, (src0.vstride == 0 ? 0 : (1 << (src0.vstride - 1)) * 1796 (i * lower_size / (1 << src0.width))) * 1797 type_sz(src0.type) / REG_SIZE), 1798 suboffset(reg, i * lower_size / 4)); 1799 brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1); 1800 brw_inst_set_group(devinfo, insn, inst->group + lower_size * i); 1801 brw_inst_set_compression(devinfo, insn, lower_size > 8); 1802 brw_set_default_swsb(p, tgl_swsb_null()); 1803 } 1804} 1805 1806void 1807fs_generator::generate_pack_half_2x16_split(fs_inst *, 1808 struct brw_reg dst, 1809 struct brw_reg x, 1810 struct brw_reg y) 1811{ 1812 assert(devinfo->ver >= 7); 1813 assert(dst.type == BRW_REGISTER_TYPE_UD); 1814 assert(x.type == BRW_REGISTER_TYPE_F); 1815 assert(y.type == BRW_REGISTER_TYPE_F); 1816 1817 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16: 1818 * 1819 * Because this instruction does not have a 16-bit floating-point type, 1820 * the destination data type must be Word (W). 1821 * 1822 * The destination must be DWord-aligned and specify a horizontal stride 1823 * (HorzStride) of 2. The 16-bit result is stored in the lower word of 1824 * each destination channel and the upper word is not modified. 1825 */ 1826 struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2); 1827 1828 /* Give each 32-bit channel of dst the form below, where "." means 1829 * unchanged. 
1830 * 0x....hhhh 1831 */ 1832 brw_F32TO16(p, dst_w, y); 1833 1834 /* Now the form: 1835 * 0xhhhh0000 1836 */ 1837 brw_set_default_swsb(p, tgl_swsb_regdist(1)); 1838 brw_SHL(p, dst, dst, brw_imm_ud(16u)); 1839 1840 /* And, finally the form of packHalf2x16's output: 1841 * 0xhhhhllll 1842 */ 1843 brw_F32TO16(p, dst_w, x); 1844} 1845 1846void 1847fs_generator::generate_shader_time_add(fs_inst *, 1848 struct brw_reg payload, 1849 struct brw_reg offset, 1850 struct brw_reg value) 1851{ 1852 const tgl_swsb swsb = brw_get_default_swsb(p); 1853 1854 assert(devinfo->ver >= 7); 1855 brw_push_insn_state(p); 1856 brw_set_default_mask_control(p, true); 1857 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 1858 1859 assert(payload.file == BRW_GENERAL_REGISTER_FILE); 1860 struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0), 1861 offset.type); 1862 struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0), 1863 value.type); 1864 1865 assert(offset.file == BRW_IMMEDIATE_VALUE); 1866 if (value.file == BRW_GENERAL_REGISTER_FILE) { 1867 value.width = BRW_WIDTH_1; 1868 value.hstride = BRW_HORIZONTAL_STRIDE_0; 1869 value.vstride = BRW_VERTICAL_STRIDE_0; 1870 } else { 1871 assert(value.file == BRW_IMMEDIATE_VALUE); 1872 } 1873 1874 /* Trying to deal with setup of the params from the IR is crazy in the FS8 1875 * case, and we don't really care about squeezing every bit of performance 1876 * out of this path, so we just emit the MOVs from here. 1877 */ 1878 brw_MOV(p, payload_offset, offset); 1879 brw_set_default_swsb(p, tgl_swsb_null()); 1880 brw_MOV(p, payload_value, value); 1881 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 1882 brw_shader_time_add(p, payload, 1883 prog_data->binding_table.shader_time_start); 1884 brw_pop_insn_state(p); 1885} 1886 1887void 1888fs_generator::enable_debug(const char *shader_name) 1889{ 1890 debug_flag = true; 1891 this->shader_name = shader_name; 1892} 1893 1894int 1895fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, 1896 struct shader_stats shader_stats, 1897 const brw::performance &perf, 1898 struct brw_compile_stats *stats) 1899{ 1900 /* align to 64 byte boundary. */ 1901 brw_realign(p, 64); 1902 1903 this->dispatch_width = dispatch_width; 1904 1905 int start_offset = p->next_insn_offset; 1906 1907 /* `send_count` explicitly does not include spills or fills, as we'd 1908 * like to use it as a metric for intentional memory access or other 1909 * shared function use. Otherwise, subtle changes to scheduling or 1910 * register allocation could cause it to fluctuate wildly - and that 1911 * effect is already counted in spill/fill counts. 1912 */ 1913 int spill_count = 0, fill_count = 0; 1914 int loop_count = 0, send_count = 0, nop_count = 0; 1915 bool is_accum_used = false; 1916 1917 struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg); 1918 1919 foreach_block_and_inst (block, fs_inst, inst, cfg) { 1920 if (inst->opcode == SHADER_OPCODE_UNDEF) 1921 continue; 1922 1923 struct brw_reg src[4], dst; 1924 unsigned int last_insn_offset = p->next_insn_offset; 1925 bool multiple_instructions_emitted = false; 1926 tgl_swsb swsb = inst->sched; 1927 1928 /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the 1929 * "Register Region Restrictions" section: for BDW, SKL: 1930 * 1931 * "A POW/FDIV operation must not be followed by an instruction 1932 * that requires two destination registers." 1933 * 1934 * The documentation is often lacking annotations for Atom parts, 1935 * and empirically this affects CHV as well. 
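 *
 * When the previously emitted instruction was a POW and the current
 * instruction's destination spans more than one register, the code below
 * breaks the pairing by emitting a NOP, counted separately so that
 * instruction-count statistics stay comparable across schedules.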
1936 */ 1937 if (devinfo->ver >= 8 && 1938 devinfo->ver <= 9 && 1939 p->nr_insn > 1 && 1940 brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH && 1941 brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW && 1942 inst->dst.component_size(inst->exec_size) > REG_SIZE) { 1943 brw_NOP(p); 1944 last_insn_offset = p->next_insn_offset; 1945 1946 /* In order to avoid spurious instruction count differences when the 1947 * instruction schedule changes, keep track of the number of inserted 1948 * NOPs. 1949 */ 1950 nop_count++; 1951 } 1952 1953 /* Wa_14010017096: 1954 * 1955 * Clear accumulator register before end of thread. 1956 */ 1957 if (inst->eot && is_accum_used && devinfo->ver >= 12) { 1958 brw_set_default_exec_size(p, BRW_EXECUTE_16); 1959 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1960 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 1961 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 1962 brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f)); 1963 last_insn_offset = p->next_insn_offset; 1964 swsb = tgl_swsb_dst_dep(swsb, 1); 1965 } 1966 1967 if (!is_accum_used && !inst->eot) { 1968 is_accum_used = inst->writes_accumulator_implicitly(devinfo) || 1969 inst->dst.is_accumulator(); 1970 } 1971 1972 /* Wa_14013745556: 1973 * 1974 * Always use @1 SWSB for EOT. 1975 */ 1976 if (inst->eot && devinfo->ver >= 12) { 1977 if (tgl_swsb_src_dep(swsb).mode) { 1978 brw_set_default_exec_size(p, BRW_EXECUTE_1); 1979 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1980 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 1981 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 1982 brw_SYNC(p, TGL_SYNC_NOP); 1983 last_insn_offset = p->next_insn_offset; 1984 } 1985 1986 swsb = tgl_swsb_dst_dep(swsb, 1); 1987 } 1988 1989 if (unlikely(debug_flag)) 1990 disasm_annotate(disasm_info, inst, p->next_insn_offset); 1991 1992 /* If the instruction writes to more than one register, it needs to be 1993 * explicitly marked as compressed on Gen <= 5. On Gen >= 6 the 1994 * hardware figures out by itself what the right compression mode is, 1995 * but we still need to know whether the instruction is compressed to 1996 * set up the source register regions appropriately. 1997 * 1998 * XXX - This is wrong for instructions that write a single register but 1999 * read more than one which should strictly speaking be treated as 2000 * compressed. For instructions that don't write any registers it 2001 * relies on the destination being a null register of the correct 2002 * type and regioning so the instruction is considered compressed 2003 * or not accordingly. 2004 */ 2005 const bool compressed = 2006 inst->dst.component_size(inst->exec_size) > REG_SIZE; 2007 brw_set_default_compression(p, compressed); 2008 brw_set_default_group(p, inst->group); 2009 2010 for (unsigned int i = 0; i < inst->sources; i++) { 2011 src[i] = brw_reg_from_fs_reg(devinfo, inst, 2012 &inst->src[i], compressed); 2013 /* The accumulator result appears to get used for the 2014 * conditional modifier generation. When negating a UD 2015 * value, there is a 33rd bit generated for the sign in the 2016 * accumulator value, so now you can't check, for example, 2017 * equality with a 32-bit value. See piglit fs-op-neg-uvec4. 
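 * The assert below therefore rejects the combination of a conditional
 * modifier with a negated UD source.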
2018 */ 2019 assert(!inst->conditional_mod || 2020 inst->src[i].type != BRW_REGISTER_TYPE_UD || 2021 !inst->src[i].negate); 2022 } 2023 dst = brw_reg_from_fs_reg(devinfo, inst, 2024 &inst->dst, compressed); 2025 2026 brw_set_default_access_mode(p, BRW_ALIGN_1); 2027 brw_set_default_predicate_control(p, inst->predicate); 2028 brw_set_default_predicate_inverse(p, inst->predicate_inverse); 2029 /* On gfx7 and above, hardware automatically adds the group onto the 2030 * flag subregister number. On Sandy Bridge and older, we have to do it 2031 * ourselves. 2032 */ 2033 const unsigned flag_subreg = inst->flag_subreg + 2034 (devinfo->ver >= 7 ? 0 : inst->group / 16); 2035 brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2); 2036 brw_set_default_saturate(p, inst->saturate); 2037 brw_set_default_mask_control(p, inst->force_writemask_all); 2038 brw_set_default_acc_write_control(p, inst->writes_accumulator); 2039 brw_set_default_swsb(p, swsb); 2040 2041 unsigned exec_size = inst->exec_size; 2042 if (devinfo->verx10 == 70 && 2043 (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) { 2044 exec_size *= 2; 2045 } 2046 2047 brw_set_default_exec_size(p, cvt(exec_size) - 1); 2048 2049 assert(inst->force_writemask_all || inst->exec_size >= 4); 2050 assert(inst->force_writemask_all || inst->group % inst->exec_size == 0); 2051 assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->ver)); 2052 assert(inst->mlen <= BRW_MAX_MSG_LENGTH); 2053 2054 switch (inst->opcode) { 2055 case BRW_OPCODE_SYNC: 2056 assert(src[0].file == BRW_IMMEDIATE_VALUE); 2057 brw_SYNC(p, tgl_sync_function(src[0].ud)); 2058 break; 2059 case BRW_OPCODE_MOV: 2060 brw_MOV(p, dst, src[0]); 2061 break; 2062 case BRW_OPCODE_ADD: 2063 brw_ADD(p, dst, src[0], src[1]); 2064 break; 2065 case BRW_OPCODE_MUL: 2066 brw_MUL(p, dst, src[0], src[1]); 2067 break; 2068 case BRW_OPCODE_AVG: 2069 brw_AVG(p, dst, src[0], src[1]); 2070 break; 2071 case BRW_OPCODE_MACH: 2072 brw_MACH(p, dst, src[0], src[1]); 2073 break; 2074 2075 case BRW_OPCODE_DP4A: 2076 assert(devinfo->ver >= 12); 2077 brw_DP4A(p, dst, src[0], src[1], src[2]); 2078 break; 2079 2080 case BRW_OPCODE_LINE: 2081 brw_LINE(p, dst, src[0], src[1]); 2082 break; 2083 2084 case BRW_OPCODE_MAD: 2085 assert(devinfo->ver >= 6); 2086 if (devinfo->ver < 10) 2087 brw_set_default_access_mode(p, BRW_ALIGN_16); 2088 brw_MAD(p, dst, src[0], src[1], src[2]); 2089 break; 2090 2091 case BRW_OPCODE_LRP: 2092 assert(devinfo->ver >= 6 && devinfo->ver <= 10); 2093 if (devinfo->ver < 10) 2094 brw_set_default_access_mode(p, BRW_ALIGN_16); 2095 brw_LRP(p, dst, src[0], src[1], src[2]); 2096 break; 2097 2098 case BRW_OPCODE_ADD3: 2099 assert(devinfo->verx10 >= 125); 2100 brw_ADD3(p, dst, src[0], src[1], src[2]); 2101 break; 2102 2103 case BRW_OPCODE_FRC: 2104 brw_FRC(p, dst, src[0]); 2105 break; 2106 case BRW_OPCODE_RNDD: 2107 brw_RNDD(p, dst, src[0]); 2108 break; 2109 case BRW_OPCODE_RNDE: 2110 brw_RNDE(p, dst, src[0]); 2111 break; 2112 case BRW_OPCODE_RNDZ: 2113 brw_RNDZ(p, dst, src[0]); 2114 break; 2115 2116 case BRW_OPCODE_AND: 2117 brw_AND(p, dst, src[0], src[1]); 2118 break; 2119 case BRW_OPCODE_OR: 2120 brw_OR(p, dst, src[0], src[1]); 2121 break; 2122 case BRW_OPCODE_XOR: 2123 brw_XOR(p, dst, src[0], src[1]); 2124 break; 2125 case BRW_OPCODE_NOT: 2126 brw_NOT(p, dst, src[0]); 2127 break; 2128 case BRW_OPCODE_ASR: 2129 brw_ASR(p, dst, src[0], src[1]); 2130 break; 2131 case BRW_OPCODE_SHR: 2132 brw_SHR(p, dst, src[0], src[1]); 2133 break; 2134 case BRW_OPCODE_SHL: 2135 brw_SHL(p, dst, src[0], 
src[1]); 2136 break; 2137 case BRW_OPCODE_ROL: 2138 assert(devinfo->ver >= 11); 2139 assert(src[0].type == dst.type); 2140 brw_ROL(p, dst, src[0], src[1]); 2141 break; 2142 case BRW_OPCODE_ROR: 2143 assert(devinfo->ver >= 11); 2144 assert(src[0].type == dst.type); 2145 brw_ROR(p, dst, src[0], src[1]); 2146 break; 2147 case BRW_OPCODE_F32TO16: 2148 assert(devinfo->ver >= 7); 2149 brw_F32TO16(p, dst, src[0]); 2150 break; 2151 case BRW_OPCODE_F16TO32: 2152 assert(devinfo->ver >= 7); 2153 brw_F16TO32(p, dst, src[0]); 2154 break; 2155 case BRW_OPCODE_CMP: 2156 if (inst->exec_size >= 16 && devinfo->verx10 == 70 && 2157 dst.file == BRW_ARCHITECTURE_REGISTER_FILE) { 2158 /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround 2159 * implemented in the compiler is not sufficient. Overriding the 2160 * type when the destination is the null register is necessary but 2161 * not sufficient by itself. 2162 */ 2163 dst.type = BRW_REGISTER_TYPE_D; 2164 } 2165 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 2166 break; 2167 case BRW_OPCODE_CMPN: 2168 if (inst->exec_size >= 16 && devinfo->verx10 == 70 && 2169 dst.file == BRW_ARCHITECTURE_REGISTER_FILE) { 2170 /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround 2171 * implemented in the compiler is not sufficient. Overriding the 2172 * type when the destination is the null register is necessary but 2173 * not sufficient by itself. 2174 */ 2175 dst.type = BRW_REGISTER_TYPE_D; 2176 } 2177 brw_CMPN(p, dst, inst->conditional_mod, src[0], src[1]); 2178 break; 2179 case BRW_OPCODE_SEL: 2180 brw_SEL(p, dst, src[0], src[1]); 2181 break; 2182 case BRW_OPCODE_CSEL: 2183 assert(devinfo->ver >= 8); 2184 if (devinfo->ver < 10) 2185 brw_set_default_access_mode(p, BRW_ALIGN_16); 2186 brw_CSEL(p, dst, src[0], src[1], src[2]); 2187 break; 2188 case BRW_OPCODE_BFREV: 2189 assert(devinfo->ver >= 7); 2190 brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD), 2191 retype(src[0], BRW_REGISTER_TYPE_UD)); 2192 break; 2193 case BRW_OPCODE_FBH: 2194 assert(devinfo->ver >= 7); 2195 brw_FBH(p, retype(dst, src[0].type), src[0]); 2196 break; 2197 case BRW_OPCODE_FBL: 2198 assert(devinfo->ver >= 7); 2199 brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), 2200 retype(src[0], BRW_REGISTER_TYPE_UD)); 2201 break; 2202 case BRW_OPCODE_LZD: 2203 brw_LZD(p, dst, src[0]); 2204 break; 2205 case BRW_OPCODE_CBIT: 2206 assert(devinfo->ver >= 7); 2207 brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), 2208 retype(src[0], BRW_REGISTER_TYPE_UD)); 2209 break; 2210 case BRW_OPCODE_ADDC: 2211 assert(devinfo->ver >= 7); 2212 brw_ADDC(p, dst, src[0], src[1]); 2213 break; 2214 case BRW_OPCODE_SUBB: 2215 assert(devinfo->ver >= 7); 2216 brw_SUBB(p, dst, src[0], src[1]); 2217 break; 2218 case BRW_OPCODE_MAC: 2219 brw_MAC(p, dst, src[0], src[1]); 2220 break; 2221 2222 case BRW_OPCODE_BFE: 2223 assert(devinfo->ver >= 7); 2224 if (devinfo->ver < 10) 2225 brw_set_default_access_mode(p, BRW_ALIGN_16); 2226 brw_BFE(p, dst, src[0], src[1], src[2]); 2227 break; 2228 2229 case BRW_OPCODE_BFI1: 2230 assert(devinfo->ver >= 7); 2231 brw_BFI1(p, dst, src[0], src[1]); 2232 break; 2233 case BRW_OPCODE_BFI2: 2234 assert(devinfo->ver >= 7); 2235 if (devinfo->ver < 10) 2236 brw_set_default_access_mode(p, BRW_ALIGN_16); 2237 brw_BFI2(p, dst, src[0], src[1], src[2]); 2238 break; 2239 2240 case BRW_OPCODE_IF: 2241 if (inst->src[0].file != BAD_FILE) { 2242 /* The instruction has an embedded compare (only allowed on gfx6) */ 2243 assert(devinfo->ver == 6); 2244 gfx6_IF(p, inst->conditional_mod, src[0], src[1]); 2245 
} else { 2246 brw_IF(p, brw_get_default_exec_size(p)); 2247 } 2248 break; 2249 2250 case BRW_OPCODE_ELSE: 2251 brw_ELSE(p); 2252 break; 2253 case BRW_OPCODE_ENDIF: 2254 brw_ENDIF(p); 2255 break; 2256 2257 case BRW_OPCODE_DO: 2258 brw_DO(p, brw_get_default_exec_size(p)); 2259 break; 2260 2261 case BRW_OPCODE_BREAK: 2262 brw_BREAK(p); 2263 break; 2264 case BRW_OPCODE_CONTINUE: 2265 brw_CONT(p); 2266 break; 2267 2268 case BRW_OPCODE_WHILE: 2269 brw_WHILE(p); 2270 loop_count++; 2271 break; 2272 2273 case SHADER_OPCODE_RCP: 2274 case SHADER_OPCODE_RSQ: 2275 case SHADER_OPCODE_SQRT: 2276 case SHADER_OPCODE_EXP2: 2277 case SHADER_OPCODE_LOG2: 2278 case SHADER_OPCODE_SIN: 2279 case SHADER_OPCODE_COS: 2280 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); 2281 if (devinfo->ver >= 6) { 2282 assert(inst->mlen == 0); 2283 assert(devinfo->ver >= 7 || inst->exec_size == 8); 2284 gfx6_math(p, dst, brw_math_function(inst->opcode), 2285 src[0], brw_null_reg()); 2286 } else { 2287 assert(inst->mlen >= 1); 2288 assert(devinfo->ver == 5 || devinfo->is_g4x || inst->exec_size == 8); 2289 gfx4_math(p, dst, 2290 brw_math_function(inst->opcode), 2291 inst->base_mrf, src[0], 2292 BRW_MATH_PRECISION_FULL); 2293 send_count++; 2294 } 2295 break; 2296 case SHADER_OPCODE_INT_QUOTIENT: 2297 case SHADER_OPCODE_INT_REMAINDER: 2298 case SHADER_OPCODE_POW: 2299 assert(devinfo->verx10 < 125); 2300 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); 2301 if (devinfo->ver >= 6) { 2302 assert(inst->mlen == 0); 2303 assert((devinfo->ver >= 7 && inst->opcode == SHADER_OPCODE_POW) || 2304 inst->exec_size == 8); 2305 gfx6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]); 2306 } else { 2307 assert(inst->mlen >= 1); 2308 assert(inst->exec_size == 8); 2309 gfx4_math(p, dst, brw_math_function(inst->opcode), 2310 inst->base_mrf, src[0], 2311 BRW_MATH_PRECISION_FULL); 2312 send_count++; 2313 } 2314 break; 2315 case FS_OPCODE_LINTERP: 2316 multiple_instructions_emitted = generate_linterp(inst, dst, src); 2317 break; 2318 case FS_OPCODE_PIXEL_X: 2319 assert(src[0].type == BRW_REGISTER_TYPE_UW); 2320 assert(src[1].type == BRW_REGISTER_TYPE_UW); 2321 src[0].subnr = 0 * type_sz(src[0].type); 2322 if (src[1].file == BRW_IMMEDIATE_VALUE) { 2323 assert(src[1].ud == 0); 2324 brw_MOV(p, dst, stride(src[0], 8, 4, 1)); 2325 } else { 2326 /* Coarse pixel case */ 2327 brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]); 2328 } 2329 break; 2330 case FS_OPCODE_PIXEL_Y: 2331 assert(src[0].type == BRW_REGISTER_TYPE_UW); 2332 assert(src[1].type == BRW_REGISTER_TYPE_UW); 2333 src[0].subnr = 4 * type_sz(src[0].type); 2334 if (src[1].file == BRW_IMMEDIATE_VALUE) { 2335 assert(src[1].ud == 0); 2336 brw_MOV(p, dst, stride(src[0], 8, 4, 1)); 2337 } else { 2338 /* Coarse pixel case */ 2339 brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]); 2340 } 2341 break; 2342 2343 case SHADER_OPCODE_SEND: 2344 generate_send(inst, dst, src[0], src[1], src[2], 2345 inst->ex_mlen > 0 ? 
src[3] : brw_null_reg()); 2346 if ((inst->desc & 0xff) == BRW_BTI_STATELESS || 2347 (inst->desc & 0xff) == GFX8_BTI_STATELESS_NON_COHERENT) { 2348 if (inst->size_written) 2349 fill_count++; 2350 else 2351 spill_count++; 2352 } else { 2353 send_count++; 2354 } 2355 break; 2356 2357 case SHADER_OPCODE_GET_BUFFER_SIZE: 2358 generate_get_buffer_size(inst, dst, src[0], src[1]); 2359 send_count++; 2360 break; 2361 case SHADER_OPCODE_TEX: 2362 case FS_OPCODE_TXB: 2363 case SHADER_OPCODE_TXD: 2364 case SHADER_OPCODE_TXF: 2365 case SHADER_OPCODE_TXF_CMS: 2366 case SHADER_OPCODE_TXL: 2367 case SHADER_OPCODE_TXS: 2368 case SHADER_OPCODE_LOD: 2369 case SHADER_OPCODE_TG4: 2370 case SHADER_OPCODE_SAMPLEINFO: 2371 assert(inst->src[0].file == BAD_FILE); 2372 generate_tex(inst, dst, src[1], src[2]); 2373 send_count++; 2374 break; 2375 2376 case FS_OPCODE_DDX_COARSE: 2377 case FS_OPCODE_DDX_FINE: 2378 generate_ddx(inst, dst, src[0]); 2379 break; 2380 case FS_OPCODE_DDY_COARSE: 2381 case FS_OPCODE_DDY_FINE: 2382 generate_ddy(inst, dst, src[0]); 2383 break; 2384 2385 case SHADER_OPCODE_GFX4_SCRATCH_WRITE: 2386 generate_scratch_write(inst, src[0]); 2387 spill_count++; 2388 break; 2389 2390 case SHADER_OPCODE_GFX4_SCRATCH_READ: 2391 generate_scratch_read(inst, dst); 2392 fill_count++; 2393 break; 2394 2395 case SHADER_OPCODE_GFX7_SCRATCH_READ: 2396 generate_scratch_read_gfx7(inst, dst); 2397 fill_count++; 2398 break; 2399 2400 case SHADER_OPCODE_SCRATCH_HEADER: 2401 generate_scratch_header(inst, dst); 2402 break; 2403 2404 case SHADER_OPCODE_MOV_INDIRECT: 2405 generate_mov_indirect(inst, dst, src[0], src[1]); 2406 break; 2407 2408 case SHADER_OPCODE_MOV_RELOC_IMM: 2409 assert(src[0].file == BRW_IMMEDIATE_VALUE); 2410 brw_MOV_reloc_imm(p, dst, dst.type, src[0].ud); 2411 break; 2412 2413 case SHADER_OPCODE_URB_READ_SIMD8: 2414 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: 2415 generate_urb_read(inst, dst, src[0]); 2416 send_count++; 2417 break; 2418 2419 case SHADER_OPCODE_URB_WRITE_SIMD8: 2420 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: 2421 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: 2422 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: 2423 generate_urb_write(inst, src[0]); 2424 send_count++; 2425 break; 2426 2427 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 2428 assert(inst->force_writemask_all); 2429 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]); 2430 send_count++; 2431 break; 2432 2433 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: 2434 assert(inst->force_writemask_all); 2435 generate_uniform_pull_constant_load_gfx7(inst, dst, src[0], src[1]); 2436 send_count++; 2437 break; 2438 2439 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: 2440 generate_varying_pull_constant_load_gfx4(inst, dst, src[0]); 2441 send_count++; 2442 break; 2443 2444 case FS_OPCODE_REP_FB_WRITE: 2445 case FS_OPCODE_FB_WRITE: 2446 generate_fb_write(inst, src[0]); 2447 send_count++; 2448 break; 2449 2450 case FS_OPCODE_FB_READ: 2451 generate_fb_read(inst, dst, src[0]); 2452 send_count++; 2453 break; 2454 2455 case BRW_OPCODE_HALT: 2456 generate_halt(inst); 2457 break; 2458 2459 case SHADER_OPCODE_SHADER_TIME_ADD: 2460 generate_shader_time_add(inst, src[0], src[1], src[2]); 2461 break; 2462 2463 case SHADER_OPCODE_INTERLOCK: 2464 case SHADER_OPCODE_MEMORY_FENCE: { 2465 assert(src[1].file == BRW_IMMEDIATE_VALUE); 2466 assert(src[2].file == BRW_IMMEDIATE_VALUE); 2467 2468 const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ? 
2469 BRW_OPCODE_SENDC : BRW_OPCODE_SEND; 2470 2471 brw_memory_fence(p, dst, src[0], send_op, 2472 brw_message_target(inst->sfid), 2473 /* commit_enable */ src[1].ud, 2474 /* bti */ src[2].ud); 2475 send_count++; 2476 break; 2477 } 2478 2479 case FS_OPCODE_SCHEDULING_FENCE: 2480 if (inst->sources == 0 && swsb.regdist == 0 && 2481 swsb.mode == TGL_SBID_NULL) { 2482 if (unlikely(debug_flag)) 2483 disasm_info->use_tail = true; 2484 break; 2485 } 2486 2487 if (devinfo->ver >= 12) { 2488 /* Use the available SWSB information to stall. A single SYNC is 2489 * sufficient since if there were multiple dependencies, the 2490 * scoreboard algorithm already injected other SYNCs before this 2491 * instruction. 2492 */ 2493 brw_SYNC(p, TGL_SYNC_NOP); 2494 } else { 2495 for (unsigned i = 0; i < inst->sources; i++) { 2496 /* Emit a MOV to force a stall until the instruction producing the 2497 * registers finishes. 2498 */ 2499 brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), 2500 retype(src[i], BRW_REGISTER_TYPE_UW)); 2501 } 2502 2503 if (inst->sources > 1) 2504 multiple_instructions_emitted = true; 2505 } 2506 2507 break; 2508 2509 case SHADER_OPCODE_FIND_LIVE_CHANNEL: { 2510 const struct brw_reg mask = 2511 brw_stage_has_packed_dispatch(devinfo, stage, 2512 prog_data) ? brw_imm_ud(~0u) : 2513 stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() : 2514 brw_dmask_reg(); 2515 brw_find_live_channel(p, dst, mask); 2516 break; 2517 } 2518 case FS_OPCODE_LOAD_LIVE_CHANNELS: { 2519 assert(devinfo->ver >= 8); 2520 assert(inst->force_writemask_all && inst->group == 0); 2521 assert(inst->dst.file == BAD_FILE); 2522 brw_set_default_exec_size(p, BRW_EXECUTE_1); 2523 brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg), 2524 BRW_REGISTER_TYPE_UD), 2525 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); 2526 break; 2527 } 2528 case SHADER_OPCODE_BROADCAST: 2529 assert(inst->force_writemask_all); 2530 brw_broadcast(p, dst, src[0], src[1]); 2531 break; 2532 2533 case SHADER_OPCODE_SHUFFLE: 2534 generate_shuffle(inst, dst, src[0], src[1]); 2535 break; 2536 2537 case SHADER_OPCODE_SEL_EXEC: 2538 assert(inst->force_writemask_all); 2539 if (type_sz(dst.type) > 4 && !devinfo->has_64bit_float) { 2540 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 2541 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_UD, 0), 2542 subscript(src[1], BRW_REGISTER_TYPE_UD, 0)); 2543 brw_set_default_swsb(p, tgl_swsb_null()); 2544 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_UD, 1), 2545 subscript(src[1], BRW_REGISTER_TYPE_UD, 1)); 2546 brw_set_default_mask_control(p, BRW_MASK_ENABLE); 2547 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_UD, 0), 2548 subscript(src[0], BRW_REGISTER_TYPE_UD, 0)); 2549 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_UD, 1), 2550 subscript(src[0], BRW_REGISTER_TYPE_UD, 1)); 2551 } else { 2552 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 2553 brw_MOV(p, dst, src[1]); 2554 brw_set_default_mask_control(p, BRW_MASK_ENABLE); 2555 brw_set_default_swsb(p, tgl_swsb_null()); 2556 brw_MOV(p, dst, src[0]); 2557 } 2558 break; 2559 2560 case SHADER_OPCODE_QUAD_SWIZZLE: 2561 assert(src[1].file == BRW_IMMEDIATE_VALUE); 2562 assert(src[1].type == BRW_REGISTER_TYPE_UD); 2563 generate_quad_swizzle(inst, dst, src[0], src[1].ud); 2564 break; 2565 2566 case SHADER_OPCODE_CLUSTER_BROADCAST: { 2567 assert(!src[0].negate && !src[0].abs); 2568 assert(src[1].file == BRW_IMMEDIATE_VALUE); 2569 assert(src[1].type == BRW_REGISTER_TYPE_UD); 2570 assert(src[2].file == BRW_IMMEDIATE_VALUE); 2571 assert(src[2].type == BRW_REGISTER_TYPE_UD); 2572 const 
unsigned component = src[1].ud; 2573 const unsigned cluster_size = src[2].ud; 2574 unsigned vstride = cluster_size; 2575 unsigned width = cluster_size; 2576 2577 /* The maximum exec_size is 32, but the maximum width is only 16. */ 2578 if (inst->exec_size == width) { 2579 vstride = 0; 2580 width = 1; 2581 } 2582 2583 struct brw_reg strided = stride(suboffset(src[0], component), 2584 vstride, width, 0); 2585 if (type_sz(src[0].type) > 4 && 2586 (devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) || 2587 !devinfo->has_64bit_float)) { 2588 /* IVB has an issue (which we found empirically) where it reads 2589 * two address register components per channel for indirectly 2590 * addressed 64-bit sources. 2591 * 2592 * From the Cherryview PRM Vol 7. "Register Region Restrictions": 2593 * 2594 * "When source or destination datatype is 64b or operation is 2595 * integer DWord multiply, indirect addressing must not be 2596 * used." 2597 * 2598 * To work around both of these, we do two integer MOVs instead of 2599 * one 64-bit MOV. Because no double value should ever cross a 2600 * register boundary, it's safe to use the immediate offset in the 2601 * indirect here to handle adding 4 bytes to the offset and avoid 2602 * the extra ADD to the register file. 2603 */ 2604 assert(src[0].type == dst.type); 2605 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), 2606 subscript(strided, BRW_REGISTER_TYPE_D, 0)); 2607 brw_set_default_swsb(p, tgl_swsb_null()); 2608 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), 2609 subscript(strided, BRW_REGISTER_TYPE_D, 1)); 2610 } else { 2611 brw_MOV(p, dst, strided); 2612 } 2613 break; 2614 } 2615 2616 case FS_OPCODE_SET_SAMPLE_ID: 2617 generate_set_sample_id(inst, dst, src[0], src[1]); 2618 break; 2619 2620 case FS_OPCODE_PACK_HALF_2x16_SPLIT: 2621 generate_pack_half_2x16_split(inst, dst, src[0], src[1]); 2622 break; 2623 2624 case SHADER_OPCODE_HALT_TARGET: 2625 /* This is the place where the final HALT needs to be inserted if 2626 * we've emitted any discards. If not, this will emit no code. 2627 */ 2628 if (!patch_halt_jumps()) { 2629 if (unlikely(debug_flag)) { 2630 disasm_info->use_tail = true; 2631 } 2632 } 2633 break; 2634 2635 case FS_OPCODE_INTERPOLATE_AT_SAMPLE: 2636 generate_pixel_interpolator_query(inst, dst, src[0], src[1], 2637 GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE); 2638 send_count++; 2639 break; 2640 2641 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: 2642 generate_pixel_interpolator_query(inst, dst, src[0], src[1], 2643 GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET); 2644 send_count++; 2645 break; 2646 2647 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: 2648 generate_pixel_interpolator_query(inst, dst, src[0], src[1], 2649 GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET); 2650 send_count++; 2651 break; 2652 2653 case CS_OPCODE_CS_TERMINATE: 2654 generate_cs_terminate(inst, src[0]); 2655 send_count++; 2656 break; 2657 2658 case SHADER_OPCODE_BARRIER: 2659 generate_barrier(inst, src[0]); 2660 send_count++; 2661 break; 2662 2663 case BRW_OPCODE_DIM: 2664 assert(devinfo->is_haswell); 2665 assert(src[0].type == BRW_REGISTER_TYPE_DF); 2666 assert(dst.type == BRW_REGISTER_TYPE_DF); 2667 brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F)); 2668 break; 2669 2670 case SHADER_OPCODE_RND_MODE: { 2671 assert(src[0].file == BRW_IMMEDIATE_VALUE); 2672 /* 2673 * Changes the floating point rounding mode by updating the control 2674 * register field defined at cr0.0[5-6] bits.
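 * The immediate in src[0] holds the requested mode; it is shifted into the
 * cr0.0 field position and applied below via brw_float_controls_mode()
 * together with BRW_CR0_RND_MODE_MASK.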
2675 */ 2676 enum brw_rnd_mode mode = 2677 (enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT); 2678 brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK); 2679 } 2680 break; 2681 2682 case SHADER_OPCODE_FLOAT_CONTROL_MODE: 2683 assert(src[0].file == BRW_IMMEDIATE_VALUE); 2684 assert(src[1].file == BRW_IMMEDIATE_VALUE); 2685 brw_float_controls_mode(p, src[0].d, src[1].d); 2686 break; 2687 2688 case SHADER_OPCODE_GET_DSS_ID: 2689 /* The Slice, Dual-SubSlice, SubSlice, EU, and Thread IDs are all 2690 * stored in sr0.0. Normally, for reading from HW regs, we'd just do 2691 * this in the IR and let the back-end generate some code but these 2692 * live in the state register which tends to have special rules. 2693 * 2694 * For convenience, we combine Slice ID and Dual-SubSlice ID into a 2695 * single ID. 2696 */ 2697 if (devinfo->ver == 12) { 2698 /* There is a SWSB restriction that requires that any time sr0 is 2699 * accessed both the instruction doing the access and the next one 2700 * have SWSB set to RegDist(1). 2701 */ 2702 if (brw_get_default_swsb(p).mode != TGL_SBID_NULL) 2703 brw_SYNC(p, TGL_SYNC_NOP); 2704 brw_set_default_swsb(p, tgl_swsb_regdist(1)); 2705 brw_SHR(p, dst, brw_sr0_reg(0), brw_imm_ud(9)); 2706 brw_set_default_swsb(p, tgl_swsb_regdist(1)); 2707 brw_AND(p, dst, dst, brw_imm_ud(0x1f)); 2708 } else { 2709 /* These move around basically every hardware generation, so don't 2710 * do any >= checks and fail if the platform hasn't explicitly 2711 * been enabled here. 2712 */ 2713 unreachable("Unsupported platform"); 2714 } 2715 break; 2716 2717 default: 2718 unreachable("Unsupported opcode"); 2719 2720 case SHADER_OPCODE_LOAD_PAYLOAD: 2721 unreachable("Should be lowered by lower_load_payload()"); 2722 } 2723 2724 if (multiple_instructions_emitted) 2725 continue; 2726 2727 if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) { 2728 assert(p->next_insn_offset == last_insn_offset + 16 || 2729 !"conditional_mod, no_dd_check, or no_dd_clear set for IR " 2730 "emitting more than 1 instruction"); 2731 2732 brw_inst *last = &p->store[last_insn_offset / 16]; 2733 2734 if (inst->conditional_mod) 2735 brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod); 2736 if (devinfo->ver < 12) { 2737 brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); 2738 brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); 2739 } 2740 } 2741 } 2742 2743 brw_set_uip_jip(p, start_offset); 2744 2745 /* end of program sentinel */ 2746 disasm_new_inst_group(disasm_info, p->next_insn_offset); 2747 2748#ifndef NDEBUG 2749 bool validated = 2750#else 2751 if (unlikely(debug_flag)) 2752#endif 2753 brw_validate_instructions(devinfo, p->store, 2754 start_offset, 2755 p->next_insn_offset, 2756 disasm_info); 2757 2758 int before_size = p->next_insn_offset - start_offset; 2759 brw_compact_instructions(p, start_offset, disasm_info); 2760 int after_size = p->next_insn_offset - start_offset; 2761 2762 if (unlikely(debug_flag)) { 2763 unsigned char sha1[21]; 2764 char sha1buf[41]; 2765 2766 _mesa_sha1_compute(p->store + start_offset / sizeof(brw_inst), 2767 after_size, sha1); 2768 _mesa_sha1_format(sha1buf, sha1); 2769 2770 fprintf(stderr, "Native code for %s (sha1 %s)\n" 2771 "SIMD%d shader: %d instructions. %d loops. %u cycles. " 2772 "%d:%d spills:fills, %u sends, " 2773 "scheduled with mode %s. " 2774 "Promoted %u constants. 
" 2775 "Compacted %d to %d bytes (%.0f%%)\n", 2776 shader_name, sha1buf, 2777 dispatch_width, before_size / 16, 2778 loop_count, perf.latency, 2779 spill_count, fill_count, send_count, 2780 shader_stats.scheduler_mode, 2781 shader_stats.promoted_constants, 2782 before_size, after_size, 2783 100.0f * (before_size - after_size) / before_size); 2784 2785 /* overriding the shader makes disasm_info invalid */ 2786 if (!brw_try_override_assembly(p, start_offset, sha1buf)) { 2787 dump_assembly(p->store, start_offset, p->next_insn_offset, 2788 disasm_info, perf.block_latency); 2789 } else { 2790 fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf); 2791 } 2792 } 2793 ralloc_free(disasm_info); 2794#ifndef NDEBUG 2795 if (!validated && !debug_flag) { 2796 fprintf(stderr, 2797 "Validation failed. Rerun with INTEL_DEBUG=shaders to get more information.\n"); 2798 } 2799#endif 2800 assert(validated); 2801 2802 brw_shader_debug_log(compiler, log_data, 2803 "%s SIMD%d shader: %d inst, %d loops, %u cycles, " 2804 "%d:%d spills:fills, %u sends, " 2805 "scheduled with mode %s, " 2806 "Promoted %u constants, " 2807 "compacted %d to %d bytes.\n", 2808 _mesa_shader_stage_to_abbrev(stage), 2809 dispatch_width, before_size / 16 - nop_count, 2810 loop_count, perf.latency, 2811 spill_count, fill_count, send_count, 2812 shader_stats.scheduler_mode, 2813 shader_stats.promoted_constants, 2814 before_size, after_size); 2815 if (stats) { 2816 stats->dispatch_width = dispatch_width; 2817 stats->instructions = before_size / 16 - nop_count; 2818 stats->sends = send_count; 2819 stats->loops = loop_count; 2820 stats->cycles = perf.latency; 2821 stats->spills = spill_count; 2822 stats->fills = fill_count; 2823 } 2824 2825 return start_offset; 2826} 2827 2828void 2829fs_generator::add_const_data(void *data, unsigned size) 2830{ 2831 assert(prog_data->const_data_size == 0); 2832 if (size > 0) { 2833 prog_data->const_data_size = size; 2834 prog_data->const_data_offset = brw_append_data(p, data, size, 32); 2835 } 2836} 2837 2838void 2839fs_generator::add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt) 2840{ 2841 assert(brw_shader_stage_is_bindless(stage)); 2842 struct brw_bs_prog_data *bs_prog_data = brw_bs_prog_data(prog_data); 2843 if (num_resume_shaders > 0) { 2844 bs_prog_data->resume_sbt_offset = 2845 brw_append_data(p, sbt, num_resume_shaders * sizeof(uint64_t), 32); 2846 for (unsigned i = 0; i < num_resume_shaders; i++) { 2847 size_t offset = bs_prog_data->resume_sbt_offset + i * sizeof(*sbt); 2848 assert(offset <= UINT32_MAX); 2849 brw_add_reloc(p, BRW_SHADER_RELOC_SHADER_START_OFFSET, 2850 BRW_SHADER_RELOC_TYPE_U32, 2851 (uint32_t)offset, (uint32_t)sbt[i]); 2852 } 2853 } 2854} 2855 2856const unsigned * 2857fs_generator::get_assembly() 2858{ 2859 prog_data->relocs = brw_get_shader_relocs(p, &prog_data->num_relocs); 2860 2861 return brw_get_program(p, &prog_data->program_size); 2862} 2863