1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/** @file brw_fs_generator.cpp 25 * 26 * This file supports generating code from the FS LIR to the actual 27 * native instructions. 28 */ 29 30#include "brw_eu.h" 31#include "brw_fs.h" 32#include "brw_cfg.h" 33 34static enum brw_reg_file 35brw_file_from_reg(fs_reg *reg) 36{ 37 switch (reg->file) { 38 case ARF: 39 return BRW_ARCHITECTURE_REGISTER_FILE; 40 case FIXED_GRF: 41 case VGRF: 42 return BRW_GENERAL_REGISTER_FILE; 43 case MRF: 44 return BRW_MESSAGE_REGISTER_FILE; 45 case IMM: 46 return BRW_IMMEDIATE_VALUE; 47 case BAD_FILE: 48 case ATTR: 49 case UNIFORM: 50 unreachable("not reached"); 51 } 52 return BRW_ARCHITECTURE_REGISTER_FILE; 53} 54 55static struct brw_reg 56brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst, 57 fs_reg *reg, bool compressed) 58{ 59 struct brw_reg brw_reg; 60 61 switch (reg->file) { 62 case MRF: 63 assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen)); 64 /* Fallthrough */ 65 case VGRF: 66 if (reg->stride == 0) { 67 brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0); 68 } else { 69 /* From the Haswell PRM: 70 * 71 * "VertStride must be used to cross GRF register boundaries. This 72 * rule implies that elements within a 'Width' cannot cross GRF 73 * boundaries." 74 * 75 * The maximum width value that could satisfy this restriction is: 76 */ 77 const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type)); 78 79 /* Because the hardware can only split source regions at a whole 80 * multiple of width during decompression (i.e. vertically), clamp 81 * the value obtained above to the physical execution size of a 82 * single decompressed chunk of the instruction: 83 */ 84 const unsigned phys_width = compressed ? inst->exec_size / 2 : 85 inst->exec_size; 86 87 /* XXX - The equation above is strictly speaking not correct on 88 * hardware that supports unbalanced GRF writes -- On Gen9+ 89 * each decompressed chunk of the instruction may have a 90 * different execution size when the number of components 91 * written to each destination GRF is not the same. 92 */ 93 if (reg->stride > 4) { 94 assert(reg != &inst->dst); 95 assert(reg->stride * type_sz(reg->type) <= REG_SIZE); 96 brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0); 97 brw_reg = stride(brw_reg, reg->stride, 1, 0); 98 } else { 99 const unsigned width = MIN2(reg_width, phys_width); 100 brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); 101 brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); 102 } 103 104 if (devinfo->gen == 7 && !devinfo->is_haswell) { 105 /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13): 106 * "Each DF (Double Float) operand uses an element size of 4 rather 107 * than 8 and all regioning parameters are twice what the values 108 * would be based on the true element size: ExecSize, Width, 109 * HorzStride, and VertStride. Each DF operand uses a pair of 110 * channels and all masking and swizzing should be adjusted 111 * appropriately." 112 * 113 * From the IvyBridge PRM (Special Requirements for Handling Double 114 * Precision Data Types, page 71): 115 * "In Align1 mode, all regioning parameters like stride, execution 116 * size, and width must use the syntax of a pair of packed 117 * floats. The offsets for these data types must be 64-bit 118 * aligned. The execution size and regioning parameters are in terms 119 * of floats." 120 * 121 * Summarized: when handling DF-typed arguments, ExecSize, 122 * VertStride, and Width must be doubled. 123 * 124 * It applies to BayTrail too. 125 */ 126 if (type_sz(reg->type) == 8) { 127 brw_reg.width++; 128 if (brw_reg.vstride > 0) 129 brw_reg.vstride++; 130 assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1); 131 } 132 133 /* When converting from DF->F, we set the destination stride to 2 134 * because each d2f conversion implicitly writes 2 floats, being 135 * the first one the converted value. IVB/BYT actually writes two 136 * F components per SIMD channel, and every other component is 137 * filled with garbage. 138 */ 139 if (reg == &inst->dst && get_exec_type_size(inst) == 8 && 140 type_sz(inst->dst.type) < 8) { 141 assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1); 142 brw_reg.hstride--; 143 } 144 } 145 } 146 147 brw_reg = retype(brw_reg, reg->type); 148 brw_reg = byte_offset(brw_reg, reg->offset); 149 brw_reg.abs = reg->abs; 150 brw_reg.negate = reg->negate; 151 break; 152 case ARF: 153 case FIXED_GRF: 154 case IMM: 155 assert(reg->offset == 0); 156 brw_reg = reg->as_brw_reg(); 157 break; 158 case BAD_FILE: 159 /* Probably unused. */ 160 brw_reg = brw_null_reg(); 161 break; 162 case ATTR: 163 case UNIFORM: 164 unreachable("not reached"); 165 } 166 167 /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0> 168 * region, but on IVB and BYT DF regions must be programmed in terms of 169 * floats. A <0,2,1> region accomplishes this. 170 */ 171 if (devinfo->gen == 7 && !devinfo->is_haswell && 172 type_sz(reg->type) == 8 && 173 brw_reg.vstride == BRW_VERTICAL_STRIDE_0 && 174 brw_reg.width == BRW_WIDTH_1 && 175 brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) { 176 brw_reg.width = BRW_WIDTH_2; 177 brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1; 178 } 179 180 return brw_reg; 181} 182 183fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data, 184 void *mem_ctx, 185 struct brw_stage_prog_data *prog_data, 186 unsigned promoted_constants, 187 bool runtime_check_aads_emit, 188 gl_shader_stage stage) 189 190 : compiler(compiler), log_data(log_data), 191 devinfo(compiler->devinfo), 192 prog_data(prog_data), 193 promoted_constants(promoted_constants), 194 runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false), 195 stage(stage), mem_ctx(mem_ctx) 196{ 197 p = rzalloc(mem_ctx, struct brw_codegen); 198 brw_init_codegen(devinfo, p, mem_ctx); 199 200 /* In the FS code generator, we are very careful to ensure that we always 201 * set the right execution size so we don't need the EU code to "help" us 202 * by trying to infer it. Sometimes, it infers the wrong thing. 203 */ 204 p->automatic_exec_sizes = false; 205} 206 207fs_generator::~fs_generator() 208{ 209} 210 211class ip_record : public exec_node { 212public: 213 DECLARE_RALLOC_CXX_OPERATORS(ip_record) 214 215 ip_record(int ip) 216 { 217 this->ip = ip; 218 } 219 220 int ip; 221}; 222 223bool 224fs_generator::patch_discard_jumps_to_fb_writes() 225{ 226 if (devinfo->gen < 6 || this->discard_halt_patches.is_empty()) 227 return false; 228 229 int scale = brw_jump_scale(p->devinfo); 230 231 /* There is a somewhat strange undocumented requirement of using 232 * HALT, according to the simulator. If some channel has HALTed to 233 * a particular UIP, then by the end of the program, every channel 234 * must have HALTed to that UIP. Furthermore, the tracking is a 235 * stack, so you can't do the final halt of a UIP after starting 236 * halting to a new UIP. 237 * 238 * Symptoms of not emitting this instruction on actual hardware 239 * included GPU hangs and sparkly rendering on the piglit discard 240 * tests. 241 */ 242 brw_inst *last_halt = gen6_HALT(p); 243 brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); 244 brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); 245 246 int ip = p->nr_insn; 247 248 foreach_in_list(ip_record, patch_ip, &discard_halt_patches) { 249 brw_inst *patch = &p->store[patch_ip->ip]; 250 251 assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT); 252 /* HALT takes a half-instruction distance from the pre-incremented IP. */ 253 brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); 254 } 255 256 this->discard_halt_patches.make_empty(); 257 return true; 258} 259 260void 261fs_generator::generate_send(fs_inst *inst, 262 struct brw_reg dst, 263 struct brw_reg desc, 264 struct brw_reg ex_desc, 265 struct brw_reg payload, 266 struct brw_reg payload2) 267{ 268 const bool dst_is_null = dst.file == BRW_ARCHITECTURE_REGISTER_FILE && 269 dst.nr == BRW_ARF_NULL; 270 const unsigned rlen = dst_is_null ? 0 : inst->size_written / REG_SIZE; 271 272 uint32_t desc_imm = inst->desc | 273 brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size); 274 275 uint32_t ex_desc_imm = brw_message_ex_desc(devinfo, inst->ex_mlen); 276 277 if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm) { 278 /* If we have any sort of extended descriptor, then we need SENDS. This 279 * also covers the dual-payload case because ex_mlen goes in ex_desc. 280 */ 281 brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2, 282 desc, desc_imm, ex_desc, ex_desc_imm, 283 inst->eot); 284 if (inst->check_tdr) 285 brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDSC); 286 } else { 287 brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm, 288 inst->eot); 289 if (inst->check_tdr) 290 brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC); 291 } 292} 293 294void 295fs_generator::fire_fb_write(fs_inst *inst, 296 struct brw_reg payload, 297 struct brw_reg implied_header, 298 GLuint nr) 299{ 300 uint32_t msg_control; 301 302 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 303 304 if (devinfo->gen < 6) { 305 brw_push_insn_state(p); 306 brw_set_default_exec_size(p, BRW_EXECUTE_8); 307 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 308 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 309 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 310 brw_MOV(p, offset(retype(payload, BRW_REGISTER_TYPE_UD), 1), 311 offset(retype(implied_header, BRW_REGISTER_TYPE_UD), 1)); 312 brw_pop_insn_state(p); 313 } 314 315 if (inst->opcode == FS_OPCODE_REP_FB_WRITE) { 316 assert(inst->group == 0 && inst->exec_size == 16); 317 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED; 318 319 } else if (prog_data->dual_src_blend) { 320 assert(inst->exec_size == 8); 321 322 if (inst->group % 16 == 0) 323 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01; 324 else if (inst->group % 16 == 8) 325 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23; 326 else 327 unreachable("Invalid dual-source FB write instruction group"); 328 329 } else { 330 assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16)); 331 332 if (inst->exec_size == 16) 333 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; 334 else if (inst->exec_size == 8) 335 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01; 336 else 337 unreachable("Invalid FB write execution size"); 338 } 339 340 /* We assume render targets start at 0, because headerless FB write 341 * messages set "Render Target Index" to 0. Using a different binding 342 * table index would make it impossible to use headerless messages. 343 */ 344 const uint32_t surf_index = inst->target; 345 346 brw_inst *insn = brw_fb_WRITE(p, 347 payload, 348 retype(implied_header, BRW_REGISTER_TYPE_UW), 349 msg_control, 350 surf_index, 351 nr, 352 0, 353 inst->eot, 354 inst->last_rt, 355 inst->header_size != 0); 356 357 if (devinfo->gen >= 6) 358 brw_inst_set_rt_slot_group(devinfo, insn, inst->group / 16); 359} 360 361void 362fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload) 363{ 364 if (devinfo->gen < 8 && !devinfo->is_haswell) { 365 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 366 } 367 368 const struct brw_reg implied_header = 369 devinfo->gen < 6 ? payload : brw_null_reg(); 370 371 if (inst->base_mrf >= 0) 372 payload = brw_message_reg(inst->base_mrf); 373 374 if (!runtime_check_aads_emit) { 375 fire_fb_write(inst, payload, implied_header, inst->mlen); 376 } else { 377 /* This can only happen in gen < 6 */ 378 assert(devinfo->gen < 6); 379 380 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); 381 382 /* Check runtime bit to detect if we have to send AA data or not */ 383 brw_push_insn_state(p); 384 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 385 brw_set_default_exec_size(p, BRW_EXECUTE_1); 386 brw_AND(p, 387 v1_null_ud, 388 retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD), 389 brw_imm_ud(1<<26)); 390 brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); 391 392 int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store; 393 brw_pop_insn_state(p); 394 { 395 /* Don't send AA data */ 396 fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1); 397 } 398 brw_land_fwd_jump(p, jmp); 399 fire_fb_write(inst, payload, implied_header, inst->mlen); 400 } 401} 402 403void 404fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst, 405 struct brw_reg payload) 406{ 407 assert(inst->size_written % REG_SIZE == 0); 408 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 409 /* We assume that render targets start at binding table index 0. */ 410 const unsigned surf_index = inst->target; 411 412 gen9_fb_READ(p, dst, payload, surf_index, 413 inst->header_size, inst->size_written / REG_SIZE, 414 prog_data->persample_dispatch); 415} 416 417void 418fs_generator::generate_mov_indirect(fs_inst *inst, 419 struct brw_reg dst, 420 struct brw_reg reg, 421 struct brw_reg indirect_byte_offset) 422{ 423 assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD); 424 assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE); 425 assert(!reg.abs && !reg.negate); 426 assert(reg.type == dst.type); 427 428 unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr; 429 430 if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) { 431 imm_byte_offset += indirect_byte_offset.ud; 432 433 reg.nr = imm_byte_offset / REG_SIZE; 434 reg.subnr = imm_byte_offset % REG_SIZE; 435 brw_MOV(p, dst, reg); 436 } else { 437 /* Prior to Broadwell, there are only 8 address registers. */ 438 assert(inst->exec_size <= 8 || devinfo->gen >= 8); 439 440 /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ 441 struct brw_reg addr = vec8(brw_address_reg(0)); 442 443 /* The destination stride of an instruction (in bytes) must be greater 444 * than or equal to the size of the rest of the instruction. Since the 445 * address register is of type UW, we can't use a D-type instruction. 446 * In order to get around this, re retype to UW and use a stride. 447 */ 448 indirect_byte_offset = 449 retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW); 450 451 /* There are a number of reasons why we don't use the base offset here. 452 * One reason is that the field is only 9 bits which means we can only 453 * use it to access the first 16 GRFs. Also, from the Haswell PRM 454 * section "Register Region Restrictions": 455 * 456 * "The lower bits of the AddressImmediate must not overflow to 457 * change the register address. The lower 5 bits of Address 458 * Immediate when added to lower 5 bits of address register gives 459 * the sub-register offset. The upper bits of Address Immediate 460 * when added to upper bits of address register gives the register 461 * address. Any overflow from sub-register offset is dropped." 462 * 463 * Since the indirect may cause us to cross a register boundary, this 464 * makes the base offset almost useless. We could try and do something 465 * clever where we use a actual base offset if base_offset % 32 == 0 but 466 * that would mean we were generating different code depending on the 467 * base offset. Instead, for the sake of consistency, we'll just do the 468 * add ourselves. This restriction is only listed in the Haswell PRM 469 * but empirical testing indicates that it applies on all older 470 * generations and is lifted on Broadwell. 471 * 472 * In the end, while base_offset is nice to look at in the generated 473 * code, using it saves us 0 instructions and would require quite a bit 474 * of case-by-case work. It's just not worth it. 475 */ 476 brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); 477 478 if (type_sz(reg.type) > 4 && 479 ((devinfo->gen == 7 && !devinfo->is_haswell) || 480 devinfo->is_cherryview || gen_device_info_is_9lp(devinfo) || 481 !devinfo->has_64bit_types)) { 482 /* IVB has an issue (which we found empirically) where it reads two 483 * address register components per channel for indirectly addressed 484 * 64-bit sources. 485 * 486 * From the Cherryview PRM Vol 7. "Register Region Restrictions": 487 * 488 * "When source or destination datatype is 64b or operation is 489 * integer DWord multiply, indirect addressing must not be used." 490 * 491 * To work around both of these, we do two integer MOVs insead of one 492 * 64-bit MOV. Because no double value should ever cross a register 493 * boundary, it's safe to use the immediate offset in the indirect 494 * here to handle adding 4 bytes to the offset and avoid the extra 495 * ADD to the register file. 496 */ 497 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), 498 retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D)); 499 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), 500 retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D)); 501 } else { 502 struct brw_reg ind_src = brw_VxH_indirect(0, 0); 503 504 brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type)); 505 506 if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE && 507 !inst->get_next()->is_tail_sentinel() && 508 ((fs_inst *)inst->get_next())->mlen > 0) { 509 /* From the Sandybridge PRM: 510 * 511 * "[Errata: DevSNB(SNB)] If MRF register is updated by any 512 * instruction that “indexed/indirect” source AND is followed 513 * by a send, the instruction requires a “Switch”. This is to 514 * avoid race condition where send may dispatch before MRF is 515 * updated." 516 */ 517 brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH); 518 } 519 } 520 } 521} 522 523void 524fs_generator::generate_shuffle(fs_inst *inst, 525 struct brw_reg dst, 526 struct brw_reg src, 527 struct brw_reg idx) 528{ 529 /* Ivy bridge has some strange behavior that makes this a real pain to 530 * implement for 64-bit values so we just don't bother. 531 */ 532 assert(devinfo->gen >= 8 || devinfo->is_haswell || type_sz(src.type) <= 4); 533 534 /* Because we're using the address register, we're limited to 8-wide 535 * execution on gen7. On gen8, we're limited to 16-wide by the address 536 * register file and 8-wide for 64-bit types. We could try and make this 537 * instruction splittable higher up in the compiler but that gets weird 538 * because it reads all of the channels regardless of execution size. It's 539 * easier just to split it here. 540 */ 541 const unsigned lower_width = 542 (devinfo->gen <= 7 || type_sz(src.type) > 4) ? 543 8 : MIN2(16, inst->exec_size); 544 545 brw_set_default_exec_size(p, cvt(lower_width) - 1); 546 for (unsigned group = 0; group < inst->exec_size; group += lower_width) { 547 brw_set_default_group(p, group); 548 549 if ((src.vstride == 0 && src.hstride == 0) || 550 idx.file == BRW_IMMEDIATE_VALUE) { 551 /* Trivial, the source is already uniform or the index is a constant. 552 * We will typically not get here if the optimizer is doing its job, 553 * but asserting would be mean. 554 */ 555 const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0; 556 brw_MOV(p, suboffset(dst, group), stride(suboffset(src, i), 0, 1, 0)); 557 } else { 558 /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ 559 struct brw_reg addr = vec8(brw_address_reg(0)); 560 561 struct brw_reg group_idx = suboffset(idx, group); 562 563 if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) { 564 /* Things get grumpy if the register is too wide. */ 565 group_idx.width--; 566 group_idx.vstride--; 567 } 568 569 assert(type_sz(group_idx.type) <= 4); 570 if (type_sz(group_idx.type) == 4) { 571 /* The destination stride of an instruction (in bytes) must be 572 * greater than or equal to the size of the rest of the 573 * instruction. Since the address register is of type UW, we 574 * can't use a D-type instruction. In order to get around this, 575 * re retype to UW and use a stride. 576 */ 577 group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W); 578 } 579 580 /* Take into account the component size and horizontal stride. */ 581 assert(src.vstride == src.hstride + src.width); 582 brw_SHL(p, addr, group_idx, 583 brw_imm_uw(_mesa_logbase2(type_sz(src.type)) + 584 src.hstride - 1)); 585 586 /* Add on the register start offset */ 587 brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr)); 588 589 if (type_sz(src.type) > 4 && 590 ((devinfo->gen == 7 && !devinfo->is_haswell) || 591 devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) { 592 /* IVB has an issue (which we found empirically) where it reads 593 * two address register components per channel for indirectly 594 * addressed 64-bit sources. 595 * 596 * From the Cherryview PRM Vol 7. "Register Region Restrictions": 597 * 598 * "When source or destination datatype is 64b or operation is 599 * integer DWord multiply, indirect addressing must not be 600 * used." 601 * 602 * To work around both of these, we do two integer MOVs insead of 603 * one 64-bit MOV. Because no double value should ever cross a 604 * register boundary, it's safe to use the immediate offset in the 605 * indirect here to handle adding 4 bytes to the offset and avoid 606 * the extra ADD to the register file. 607 */ 608 struct brw_reg gdst = suboffset(dst, group); 609 struct brw_reg dst_d = retype(spread(gdst, 2), 610 BRW_REGISTER_TYPE_D); 611 brw_MOV(p, dst_d, 612 retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D)); 613 brw_MOV(p, byte_offset(dst_d, 4), 614 retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D)); 615 } else { 616 brw_MOV(p, suboffset(dst, group), 617 retype(brw_VxH_indirect(0, 0), src.type)); 618 } 619 } 620 } 621} 622 623void 624fs_generator::generate_quad_swizzle(const fs_inst *inst, 625 struct brw_reg dst, struct brw_reg src, 626 unsigned swiz) 627{ 628 /* Requires a quad. */ 629 assert(inst->exec_size >= 4); 630 631 if (src.file == BRW_IMMEDIATE_VALUE || 632 has_scalar_region(src)) { 633 /* The value is uniform across all channels */ 634 brw_MOV(p, dst, src); 635 636 } else if (devinfo->gen < 11 && type_sz(src.type) == 4) { 637 /* This only works on 8-wide 32-bit values */ 638 assert(inst->exec_size == 8); 639 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); 640 assert(src.vstride == src.width + 1); 641 brw_set_default_access_mode(p, BRW_ALIGN_16); 642 struct brw_reg swiz_src = stride(src, 4, 4, 1); 643 swiz_src.swizzle = swiz; 644 brw_MOV(p, dst, swiz_src); 645 646 } else { 647 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); 648 assert(src.vstride == src.width + 1); 649 const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0)); 650 651 switch (swiz) { 652 case BRW_SWIZZLE_XXXX: 653 case BRW_SWIZZLE_YYYY: 654 case BRW_SWIZZLE_ZZZZ: 655 case BRW_SWIZZLE_WWWW: 656 brw_MOV(p, dst, stride(src_0, 4, 4, 0)); 657 break; 658 659 case BRW_SWIZZLE_XXZZ: 660 case BRW_SWIZZLE_YYWW: 661 brw_MOV(p, dst, stride(src_0, 2, 2, 0)); 662 break; 663 664 case BRW_SWIZZLE_XYXY: 665 case BRW_SWIZZLE_ZWZW: 666 assert(inst->exec_size == 4); 667 brw_MOV(p, dst, stride(src_0, 0, 2, 1)); 668 break; 669 670 default: 671 assert(inst->force_writemask_all); 672 brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1); 673 674 for (unsigned c = 0; c < 4; c++) { 675 brw_inst *insn = brw_MOV( 676 p, stride(suboffset(dst, c), 677 4 * inst->dst.stride, 1, 4 * inst->dst.stride), 678 stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0)); 679 680 brw_inst_set_no_dd_clear(devinfo, insn, c < 3); 681 brw_inst_set_no_dd_check(devinfo, insn, c > 0); 682 } 683 684 break; 685 } 686 } 687} 688 689void 690fs_generator::generate_urb_read(fs_inst *inst, 691 struct brw_reg dst, 692 struct brw_reg header) 693{ 694 assert(inst->size_written % REG_SIZE == 0); 695 assert(header.file == BRW_GENERAL_REGISTER_FILE); 696 assert(header.type == BRW_REGISTER_TYPE_UD); 697 698 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 699 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD)); 700 brw_set_src0(p, send, header); 701 brw_set_src1(p, send, brw_imm_ud(0u)); 702 703 brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB); 704 brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ); 705 706 if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT) 707 brw_inst_set_urb_per_slot_offset(p->devinfo, send, true); 708 709 brw_inst_set_mlen(p->devinfo, send, inst->mlen); 710 brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE); 711 brw_inst_set_header_present(p->devinfo, send, true); 712 brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset); 713} 714 715void 716fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload) 717{ 718 brw_inst *insn; 719 720 /* WaClearTDRRegBeforeEOTForNonPS. 721 * 722 * WA: Clear tdr register before send EOT in all non-PS shader kernels 723 * 724 * mov(8) tdr0:ud 0x0:ud {NoMask}" 725 */ 726 if (inst->eot && p->devinfo->gen == 10) { 727 brw_push_insn_state(p); 728 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 729 brw_MOV(p, brw_tdr_reg(), brw_imm_uw(0)); 730 brw_pop_insn_state(p); 731 } 732 733 insn = brw_next_insn(p, BRW_OPCODE_SEND); 734 735 brw_set_dest(p, insn, brw_null_reg()); 736 brw_set_src0(p, insn, payload); 737 brw_set_src1(p, insn, brw_imm_ud(0u)); 738 739 brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB); 740 brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE); 741 742 if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT || 743 inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) 744 brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true); 745 746 if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED || 747 inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) 748 brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true); 749 750 brw_inst_set_mlen(p->devinfo, insn, inst->mlen); 751 brw_inst_set_rlen(p->devinfo, insn, 0); 752 brw_inst_set_eot(p->devinfo, insn, inst->eot); 753 brw_inst_set_header_present(p->devinfo, insn, true); 754 brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset); 755} 756 757void 758fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload) 759{ 760 struct brw_inst *insn; 761 762 insn = brw_next_insn(p, BRW_OPCODE_SEND); 763 764 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); 765 brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW)); 766 brw_set_src1(p, insn, brw_imm_ud(0u)); 767 768 /* Terminate a compute shader by sending a message to the thread spawner. 769 */ 770 brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER); 771 brw_inst_set_mlen(devinfo, insn, 1); 772 brw_inst_set_rlen(devinfo, insn, 0); 773 brw_inst_set_eot(devinfo, insn, inst->eot); 774 brw_inst_set_header_present(devinfo, insn, false); 775 776 brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */ 777 brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */ 778 779 /* Note that even though the thread has a URB resource associated with it, 780 * we set the "do not dereference URB" bit, because the URB resource is 781 * managed by the fixed-function unit, so it will free it automatically. 782 */ 783 brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */ 784 785 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); 786} 787 788void 789fs_generator::generate_barrier(fs_inst *, struct brw_reg src) 790{ 791 brw_barrier(p, src); 792 brw_WAIT(p); 793} 794 795bool 796fs_generator::generate_linterp(fs_inst *inst, 797 struct brw_reg dst, struct brw_reg *src) 798{ 799 /* PLN reads: 800 * / in SIMD16 \ 801 * ----------------------------------- 802 * | src1+0 | src1+1 | src1+2 | src1+3 | 803 * |-----------------------------------| 804 * |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)| 805 * ----------------------------------- 806 * 807 * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys: 808 * 809 * ----------------------------------- 810 * | src1+0 | src1+1 | src1+2 | src1+3 | 811 * |-----------------------------------| 812 * |(x0, x1)|(y0, y1)| | | in SIMD8 813 * |-----------------------------------| 814 * |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16 815 * ----------------------------------- 816 * 817 * See also: emit_interpolation_setup_gen4(). 818 */ 819 struct brw_reg delta_x = src[0]; 820 struct brw_reg delta_y = offset(src[0], inst->exec_size / 8); 821 struct brw_reg interp = src[1]; 822 brw_inst *i[2]; 823 824 /* fs_visitor::lower_linterp() will do the lowering to MAD instructions for 825 * us on gen11+ 826 */ 827 assert(devinfo->gen < 11); 828 829 if (devinfo->has_pln) { 830 if (devinfo->gen <= 6 && (delta_x.nr & 1) != 0) { 831 /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane": 832 * 833 * "[DevSNB]:<src1> must be even register aligned. 834 * 835 * This restriction is lifted on Ivy Bridge. 836 * 837 * This means that we need to split PLN into LINE+MAC on-the-fly. 838 * Unfortunately, the inputs are laid out for PLN and not LINE+MAC so 839 * we have to split into SIMD8 pieces. For gen4 (!has_pln), the 840 * coordinate registers are laid out differently so we leave it as a 841 * SIMD16 instruction. 842 */ 843 assert(inst->exec_size == 8 || inst->exec_size == 16); 844 assert(inst->group % 16 == 0); 845 846 brw_push_insn_state(p); 847 brw_set_default_exec_size(p, BRW_EXECUTE_8); 848 849 /* Thanks to two accumulators, we can emit all the LINEs and then all 850 * the MACs. This improves parallelism a bit. 851 */ 852 for (unsigned g = 0; g < inst->exec_size / 8; g++) { 853 brw_inst *line = brw_LINE(p, brw_null_reg(), interp, 854 offset(delta_x, g * 2)); 855 brw_inst_set_group(devinfo, line, inst->group + g * 8); 856 857 /* LINE writes the accumulator automatically on gen4-5. On Sandy 858 * Bridge and later, we have to explicitly enable it. 859 */ 860 if (devinfo->gen >= 6) 861 brw_inst_set_acc_wr_control(p->devinfo, line, true); 862 863 /* brw_set_default_saturate() is called before emitting 864 * instructions, so the saturate bit is set in each instruction, 865 * so we need to unset it on the LINE instructions. 866 */ 867 brw_inst_set_saturate(p->devinfo, line, false); 868 } 869 870 for (unsigned g = 0; g < inst->exec_size / 8; g++) { 871 brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1), 872 offset(delta_x, g * 2 + 1)); 873 brw_inst_set_group(devinfo, mac, inst->group + g * 8); 874 brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod); 875 } 876 877 brw_pop_insn_state(p); 878 879 return true; 880 } else { 881 brw_PLN(p, dst, interp, delta_x); 882 883 return false; 884 } 885 } else { 886 i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x); 887 i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y); 888 889 brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod); 890 891 /* brw_set_default_saturate() is called before emitting instructions, so 892 * the saturate bit is set in each instruction, so we need to unset it on 893 * the first instruction. 894 */ 895 brw_inst_set_saturate(p->devinfo, i[0], false); 896 897 return true; 898 } 899} 900 901void 902fs_generator::generate_get_buffer_size(fs_inst *inst, 903 struct brw_reg dst, 904 struct brw_reg src, 905 struct brw_reg surf_index) 906{ 907 assert(devinfo->gen >= 7); 908 assert(surf_index.file == BRW_IMMEDIATE_VALUE); 909 910 uint32_t simd_mode; 911 int rlen = 4; 912 913 switch (inst->exec_size) { 914 case 8: 915 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 916 break; 917 case 16: 918 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 919 break; 920 default: 921 unreachable("Invalid width for texture instruction"); 922 } 923 924 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 925 rlen = 8; 926 dst = vec16(dst); 927 } 928 929 brw_SAMPLE(p, 930 retype(dst, BRW_REGISTER_TYPE_UW), 931 inst->base_mrf, 932 src, 933 surf_index.ud, 934 0, 935 GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO, 936 rlen, /* response length */ 937 inst->mlen, 938 inst->header_size > 0, 939 simd_mode, 940 BRW_SAMPLER_RETURN_FORMAT_SINT32); 941} 942 943void 944fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, 945 struct brw_reg surface_index, 946 struct brw_reg sampler_index) 947{ 948 assert(devinfo->gen < 7); 949 assert(inst->size_written % REG_SIZE == 0); 950 int msg_type = -1; 951 uint32_t simd_mode; 952 uint32_t return_format; 953 954 /* Sampler EOT message of less than the dispatch width would kill the 955 * thread prematurely. 956 */ 957 assert(!inst->eot || inst->exec_size == dispatch_width); 958 959 switch (dst.type) { 960 case BRW_REGISTER_TYPE_D: 961 return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32; 962 break; 963 case BRW_REGISTER_TYPE_UD: 964 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; 965 break; 966 default: 967 return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; 968 break; 969 } 970 971 /* Stomp the resinfo output type to UINT32. On gens 4-5, the output type 972 * is set as part of the message descriptor. On gen4, the PRM seems to 973 * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on 974 * later gens UINT32 is required. Once you hit Sandy Bridge, the bit is 975 * gone from the message descriptor entirely and you just get UINT32 all 976 * the time regasrdless. Since we can really only do non-UINT32 on gen4, 977 * just stomp it to UINT32 all the time. 978 */ 979 if (inst->opcode == SHADER_OPCODE_TXS) 980 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; 981 982 switch (inst->exec_size) { 983 case 8: 984 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 985 break; 986 case 16: 987 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 988 break; 989 default: 990 unreachable("Invalid width for texture instruction"); 991 } 992 993 if (devinfo->gen >= 5) { 994 switch (inst->opcode) { 995 case SHADER_OPCODE_TEX: 996 if (inst->shadow_compare) { 997 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE; 998 } else { 999 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE; 1000 } 1001 break; 1002 case FS_OPCODE_TXB: 1003 if (inst->shadow_compare) { 1004 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE; 1005 } else { 1006 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; 1007 } 1008 break; 1009 case SHADER_OPCODE_TXL: 1010 if (inst->shadow_compare) { 1011 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; 1012 } else { 1013 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; 1014 } 1015 break; 1016 case SHADER_OPCODE_TXS: 1017 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO; 1018 break; 1019 case SHADER_OPCODE_TXD: 1020 assert(!inst->shadow_compare); 1021 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS; 1022 break; 1023 case SHADER_OPCODE_TXF: 1024 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; 1025 break; 1026 case SHADER_OPCODE_TXF_CMS: 1027 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; 1028 break; 1029 case SHADER_OPCODE_LOD: 1030 msg_type = GEN5_SAMPLER_MESSAGE_LOD; 1031 break; 1032 case SHADER_OPCODE_TG4: 1033 assert(devinfo->gen == 6); 1034 assert(!inst->shadow_compare); 1035 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4; 1036 break; 1037 case SHADER_OPCODE_SAMPLEINFO: 1038 msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; 1039 break; 1040 default: 1041 unreachable("not reached"); 1042 } 1043 } else { 1044 switch (inst->opcode) { 1045 case SHADER_OPCODE_TEX: 1046 /* Note that G45 and older determines shadow compare and dispatch width 1047 * from message length for most messages. 1048 */ 1049 if (inst->exec_size == 8) { 1050 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; 1051 if (inst->shadow_compare) { 1052 assert(inst->mlen == 6); 1053 } else { 1054 assert(inst->mlen <= 4); 1055 } 1056 } else { 1057 if (inst->shadow_compare) { 1058 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE; 1059 assert(inst->mlen == 9); 1060 } else { 1061 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE; 1062 assert(inst->mlen <= 7 && inst->mlen % 2 == 1); 1063 } 1064 } 1065 break; 1066 case FS_OPCODE_TXB: 1067 if (inst->shadow_compare) { 1068 assert(inst->exec_size == 8); 1069 assert(inst->mlen == 6); 1070 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; 1071 } else { 1072 assert(inst->mlen == 9); 1073 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; 1074 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1075 } 1076 break; 1077 case SHADER_OPCODE_TXL: 1078 if (inst->shadow_compare) { 1079 assert(inst->exec_size == 8); 1080 assert(inst->mlen == 6); 1081 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; 1082 } else { 1083 assert(inst->mlen == 9); 1084 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; 1085 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1086 } 1087 break; 1088 case SHADER_OPCODE_TXD: 1089 /* There is no sample_d_c message; comparisons are done manually */ 1090 assert(inst->exec_size == 8); 1091 assert(inst->mlen == 7 || inst->mlen == 10); 1092 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS; 1093 break; 1094 case SHADER_OPCODE_TXF: 1095 assert(inst->mlen <= 9 && inst->mlen % 2 == 1); 1096 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD; 1097 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1098 break; 1099 case SHADER_OPCODE_TXS: 1100 assert(inst->mlen == 3); 1101 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO; 1102 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1103 break; 1104 default: 1105 unreachable("not reached"); 1106 } 1107 } 1108 assert(msg_type != -1); 1109 1110 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 1111 dst = vec16(dst); 1112 } 1113 1114 assert(sampler_index.type == BRW_REGISTER_TYPE_UD); 1115 1116 /* Load the message header if present. If there's a texture offset, 1117 * we need to set it up explicitly and load the offset bitfield. 1118 * Otherwise, we can use an implied move from g0 to the first message reg. 1119 */ 1120 struct brw_reg src = brw_null_reg(); 1121 if (inst->header_size != 0) { 1122 if (devinfo->gen < 6 && !inst->offset) { 1123 /* Set up an implied move from g0 to the MRF. */ 1124 src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 1125 } else { 1126 assert(inst->base_mrf != -1); 1127 struct brw_reg header_reg = brw_message_reg(inst->base_mrf); 1128 1129 brw_push_insn_state(p); 1130 brw_set_default_exec_size(p, BRW_EXECUTE_8); 1131 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1132 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 1133 /* Explicitly set up the message header by copying g0 to the MRF. */ 1134 brw_MOV(p, header_reg, brw_vec8_grf(0, 0)); 1135 1136 brw_set_default_exec_size(p, BRW_EXECUTE_1); 1137 if (inst->offset) { 1138 /* Set the offset bits in DWord 2. */ 1139 brw_MOV(p, get_element_ud(header_reg, 2), 1140 brw_imm_ud(inst->offset)); 1141 } 1142 1143 brw_pop_insn_state(p); 1144 } 1145 } 1146 1147 uint32_t base_binding_table_index; 1148 switch (inst->opcode) { 1149 case SHADER_OPCODE_TG4: 1150 base_binding_table_index = prog_data->binding_table.gather_texture_start; 1151 break; 1152 default: 1153 base_binding_table_index = prog_data->binding_table.texture_start; 1154 break; 1155 } 1156 1157 assert(surface_index.file == BRW_IMMEDIATE_VALUE); 1158 assert(sampler_index.file == BRW_IMMEDIATE_VALUE); 1159 1160 brw_SAMPLE(p, 1161 retype(dst, BRW_REGISTER_TYPE_UW), 1162 inst->base_mrf, 1163 src, 1164 surface_index.ud + base_binding_table_index, 1165 sampler_index.ud % 16, 1166 msg_type, 1167 inst->size_written / REG_SIZE, 1168 inst->mlen, 1169 inst->header_size != 0, 1170 simd_mode, 1171 return_format); 1172} 1173 1174 1175/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input 1176 * looking like: 1177 * 1178 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br 1179 * 1180 * Ideally, we want to produce: 1181 * 1182 * DDX DDY 1183 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) 1184 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) 1185 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) 1186 * (ss0.br - ss0.bl) (ss0.tr - ss0.br) 1187 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) 1188 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) 1189 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) 1190 * (ss1.br - ss1.bl) (ss1.tr - ss1.br) 1191 * 1192 * and add another set of two more subspans if in 16-pixel dispatch mode. 1193 * 1194 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result 1195 * for each pair, and vertstride = 2 jumps us 2 elements after processing a 1196 * pair. But the ideal approximation may impose a huge performance cost on 1197 * sample_d. On at least Haswell, sample_d instruction does some 1198 * optimizations if the same LOD is used for all pixels in the subspan. 1199 * 1200 * For DDY, we need to use ALIGN16 mode since it's capable of doing the 1201 * appropriate swizzling. 1202 */ 1203void 1204fs_generator::generate_ddx(const fs_inst *inst, 1205 struct brw_reg dst, struct brw_reg src) 1206{ 1207 unsigned vstride, width; 1208 1209 if (devinfo->gen >= 8) { 1210 if (inst->opcode == FS_OPCODE_DDX_FINE) { 1211 /* produce accurate derivatives */ 1212 vstride = BRW_VERTICAL_STRIDE_2; 1213 width = BRW_WIDTH_2; 1214 } else { 1215 /* replicate the derivative at the top-left pixel to other pixels */ 1216 vstride = BRW_VERTICAL_STRIDE_4; 1217 width = BRW_WIDTH_4; 1218 } 1219 1220 struct brw_reg src0 = byte_offset(src, type_sz(src.type));; 1221 struct brw_reg src1 = src; 1222 1223 src0.vstride = vstride; 1224 src0.width = width; 1225 src0.hstride = BRW_HORIZONTAL_STRIDE_0; 1226 src1.vstride = vstride; 1227 src1.width = width; 1228 src1.hstride = BRW_HORIZONTAL_STRIDE_0; 1229 1230 brw_ADD(p, dst, src0, negate(src1)); 1231 } else { 1232 /* On Haswell and earlier, the region used above appears to not work 1233 * correctly for compressed instructions. At least on Haswell and 1234 * Iron Lake, compressed ALIGN16 instructions do work. Since we 1235 * would have to split to SIMD8 no matter which method we choose, we 1236 * may as well use ALIGN16 on all platforms gen7 and earlier. 1237 */ 1238 struct brw_reg src0 = stride(src, 4, 4, 1); 1239 struct brw_reg src1 = stride(src, 4, 4, 1); 1240 if (inst->opcode == FS_OPCODE_DDX_FINE) { 1241 src0.swizzle = BRW_SWIZZLE_XXZZ; 1242 src1.swizzle = BRW_SWIZZLE_YYWW; 1243 } else { 1244 src0.swizzle = BRW_SWIZZLE_XXXX; 1245 src1.swizzle = BRW_SWIZZLE_YYYY; 1246 } 1247 1248 brw_push_insn_state(p); 1249 brw_set_default_access_mode(p, BRW_ALIGN_16); 1250 brw_ADD(p, dst, negate(src0), src1); 1251 brw_pop_insn_state(p); 1252 } 1253} 1254 1255/* The negate_value boolean is used to negate the derivative computation for 1256 * FBOs, since they place the origin at the upper left instead of the lower 1257 * left. 1258 */ 1259void 1260fs_generator::generate_ddy(const fs_inst *inst, 1261 struct brw_reg dst, struct brw_reg src) 1262{ 1263 const uint32_t type_size = type_sz(src.type); 1264 1265 if (inst->opcode == FS_OPCODE_DDY_FINE) { 1266 /* produce accurate derivatives. 1267 * 1268 * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU) 1269 * "Register Region Restrictions", Section "1. Special Restrictions": 1270 * 1271 * "In Align16 mode, the channel selects and channel enables apply to 1272 * a pair of half-floats, because these parameters are defined for 1273 * DWord elements ONLY. This is applicable when both source and 1274 * destination are half-floats." 1275 * 1276 * So for half-float operations we use the Gen11+ Align1 path. CHV 1277 * inherits its FP16 hardware from SKL, so it is not affected. 1278 */ 1279 if (devinfo->gen >= 11 || 1280 (devinfo->is_broadwell && src.type == BRW_REGISTER_TYPE_HF)) { 1281 src = stride(src, 0, 2, 1); 1282 1283 brw_push_insn_state(p); 1284 brw_set_default_exec_size(p, BRW_EXECUTE_4); 1285 for (uint32_t g = 0; g < inst->exec_size; g += 4) { 1286 brw_set_default_group(p, inst->group + g); 1287 brw_ADD(p, byte_offset(dst, g * type_size), 1288 negate(byte_offset(src, g * type_size)), 1289 byte_offset(src, (g + 2) * type_size)); 1290 } 1291 brw_pop_insn_state(p); 1292 } else { 1293 struct brw_reg src0 = stride(src, 4, 4, 1); 1294 struct brw_reg src1 = stride(src, 4, 4, 1); 1295 src0.swizzle = BRW_SWIZZLE_XYXY; 1296 src1.swizzle = BRW_SWIZZLE_ZWZW; 1297 1298 brw_push_insn_state(p); 1299 brw_set_default_access_mode(p, BRW_ALIGN_16); 1300 brw_ADD(p, dst, negate(src0), src1); 1301 brw_pop_insn_state(p); 1302 } 1303 } else { 1304 /* replicate the derivative at the top-left pixel to other pixels */ 1305 if (devinfo->gen >= 8) { 1306 struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size); 1307 struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size); 1308 1309 brw_ADD(p, dst, negate(src0), src1); 1310 } else { 1311 /* On Haswell and earlier, the region used above appears to not work 1312 * correctly for compressed instructions. At least on Haswell and 1313 * Iron Lake, compressed ALIGN16 instructions do work. Since we 1314 * would have to split to SIMD8 no matter which method we choose, we 1315 * may as well use ALIGN16 on all platforms gen7 and earlier. 1316 */ 1317 struct brw_reg src0 = stride(src, 4, 4, 1); 1318 struct brw_reg src1 = stride(src, 4, 4, 1); 1319 src0.swizzle = BRW_SWIZZLE_XXXX; 1320 src1.swizzle = BRW_SWIZZLE_ZZZZ; 1321 1322 brw_push_insn_state(p); 1323 brw_set_default_access_mode(p, BRW_ALIGN_16); 1324 brw_ADD(p, dst, negate(src0), src1); 1325 brw_pop_insn_state(p); 1326 } 1327 } 1328} 1329 1330void 1331fs_generator::generate_discard_jump(fs_inst *) 1332{ 1333 assert(devinfo->gen >= 6); 1334 1335 /* This HALT will be patched up at FB write time to point UIP at the end of 1336 * the program, and at brw_uip_jip() JIP will be set to the end of the 1337 * current block (or the program). 1338 */ 1339 this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn)); 1340 gen6_HALT(p); 1341} 1342 1343void 1344fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src) 1345{ 1346 /* The 32-wide messages only respect the first 16-wide half of the channel 1347 * enable signals which are replicated identically for the second group of 1348 * 16 channels, so we cannot use them unless the write is marked 1349 * force_writemask_all. 1350 */ 1351 const unsigned lower_size = inst->force_writemask_all ? inst->exec_size : 1352 MIN2(16, inst->exec_size); 1353 const unsigned block_size = 4 * lower_size / REG_SIZE; 1354 assert(inst->mlen != 0); 1355 1356 brw_push_insn_state(p); 1357 brw_set_default_exec_size(p, cvt(lower_size) - 1); 1358 brw_set_default_compression(p, lower_size > 8); 1359 1360 for (unsigned i = 0; i < inst->exec_size / lower_size; i++) { 1361 brw_set_default_group(p, inst->group + lower_size * i); 1362 1363 brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0), 1364 retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD)); 1365 1366 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1367 block_size, 1368 inst->offset + block_size * REG_SIZE * i); 1369 } 1370 1371 brw_pop_insn_state(p); 1372} 1373 1374void 1375fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst) 1376{ 1377 assert(inst->exec_size <= 16 || inst->force_writemask_all); 1378 assert(inst->mlen != 0); 1379 1380 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1381 inst->exec_size / 8, inst->offset); 1382} 1383 1384void 1385fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst) 1386{ 1387 assert(inst->exec_size <= 16 || inst->force_writemask_all); 1388 1389 gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset); 1390} 1391 1392void 1393fs_generator::generate_uniform_pull_constant_load(fs_inst *inst, 1394 struct brw_reg dst, 1395 struct brw_reg index, 1396 struct brw_reg offset) 1397{ 1398 assert(type_sz(dst.type) == 4); 1399 assert(inst->mlen != 0); 1400 1401 assert(index.file == BRW_IMMEDIATE_VALUE && 1402 index.type == BRW_REGISTER_TYPE_UD); 1403 uint32_t surf_index = index.ud; 1404 1405 assert(offset.file == BRW_IMMEDIATE_VALUE && 1406 offset.type == BRW_REGISTER_TYPE_UD); 1407 uint32_t read_offset = offset.ud; 1408 1409 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 1410 read_offset, surf_index); 1411} 1412 1413void 1414fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, 1415 struct brw_reg dst, 1416 struct brw_reg index, 1417 struct brw_reg payload) 1418{ 1419 assert(index.type == BRW_REGISTER_TYPE_UD); 1420 assert(payload.file == BRW_GENERAL_REGISTER_FILE); 1421 assert(type_sz(dst.type) == 4); 1422 1423 if (index.file == BRW_IMMEDIATE_VALUE) { 1424 const uint32_t surf_index = index.ud; 1425 1426 brw_push_insn_state(p); 1427 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1428 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 1429 brw_pop_insn_state(p); 1430 1431 brw_inst_set_sfid(devinfo, send, GEN6_SFID_DATAPORT_CONSTANT_CACHE); 1432 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD)); 1433 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); 1434 brw_set_desc(p, send, 1435 brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written, 1436 REG_SIZE), true) | 1437 brw_dp_read_desc(devinfo, surf_index, 1438 BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size), 1439 GEN7_DATAPORT_DC_OWORD_BLOCK_READ, 1440 BRW_DATAPORT_READ_TARGET_DATA_CACHE)); 1441 1442 } else { 1443 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); 1444 1445 brw_push_insn_state(p); 1446 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1447 1448 /* a0.0 = surf_index & 0xff */ 1449 brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); 1450 brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1); 1451 brw_set_dest(p, insn_and, addr); 1452 brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD))); 1453 brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); 1454 1455 /* dst = send(payload, a0.0 | <descriptor>) */ 1456 brw_send_indirect_message( 1457 p, GEN6_SFID_DATAPORT_CONSTANT_CACHE, 1458 retype(dst, BRW_REGISTER_TYPE_UD), 1459 retype(payload, BRW_REGISTER_TYPE_UD), addr, 1460 brw_message_desc(devinfo, 1, 1461 DIV_ROUND_UP(inst->size_written, REG_SIZE), true) | 1462 brw_dp_read_desc(devinfo, 0 /* surface */, 1463 BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size), 1464 GEN7_DATAPORT_DC_OWORD_BLOCK_READ, 1465 BRW_DATAPORT_READ_TARGET_DATA_CACHE), 1466 false /* EOT */); 1467 1468 brw_pop_insn_state(p); 1469 } 1470} 1471 1472void 1473fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst, 1474 struct brw_reg dst, 1475 struct brw_reg index) 1476{ 1477 assert(devinfo->gen < 7); /* Should use the gen7 variant. */ 1478 assert(inst->header_size != 0); 1479 assert(inst->mlen); 1480 1481 assert(index.file == BRW_IMMEDIATE_VALUE && 1482 index.type == BRW_REGISTER_TYPE_UD); 1483 uint32_t surf_index = index.ud; 1484 1485 uint32_t simd_mode, rlen, msg_type; 1486 if (inst->exec_size == 16) { 1487 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1488 rlen = 8; 1489 } else { 1490 assert(inst->exec_size == 8); 1491 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 1492 rlen = 4; 1493 } 1494 1495 if (devinfo->gen >= 5) 1496 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; 1497 else { 1498 /* We always use the SIMD16 message so that we only have to load U, and 1499 * not V or R. 1500 */ 1501 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD; 1502 assert(inst->mlen == 3); 1503 assert(inst->size_written == 8 * REG_SIZE); 1504 rlen = 8; 1505 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1506 } 1507 1508 struct brw_reg header = brw_vec8_grf(0, 0); 1509 gen6_resolve_implied_move(p, &header, inst->base_mrf); 1510 1511 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 1512 brw_inst_set_compression(devinfo, send, false); 1513 brw_inst_set_sfid(devinfo, send, BRW_SFID_SAMPLER); 1514 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW)); 1515 brw_set_src0(p, send, header); 1516 if (devinfo->gen < 6) 1517 brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf); 1518 1519 /* Our surface is set up as floats, regardless of what actual data is 1520 * stored in it. 1521 */ 1522 uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; 1523 brw_set_desc(p, send, 1524 brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size) | 1525 brw_sampler_desc(devinfo, surf_index, 1526 0, /* sampler (unused) */ 1527 msg_type, simd_mode, return_format)); 1528} 1529 1530void 1531fs_generator::generate_pixel_interpolator_query(fs_inst *inst, 1532 struct brw_reg dst, 1533 struct brw_reg src, 1534 struct brw_reg msg_data, 1535 unsigned msg_type) 1536{ 1537 const bool has_payload = inst->src[0].file != BAD_FILE; 1538 assert(msg_data.type == BRW_REGISTER_TYPE_UD); 1539 assert(inst->size_written % REG_SIZE == 0); 1540 1541 brw_pixel_interpolator_query(p, 1542 retype(dst, BRW_REGISTER_TYPE_UW), 1543 /* If we don't have a payload, what we send doesn't matter */ 1544 has_payload ? src : brw_vec8_grf(0, 0), 1545 inst->pi_noperspective, 1546 msg_type, 1547 msg_data, 1548 has_payload ? 2 * inst->exec_size / 8 : 1, 1549 inst->size_written / REG_SIZE); 1550} 1551 1552/* Sets vstride=1, width=4, hstride=0 of register src1 during 1553 * the ADD instruction. 1554 */ 1555void 1556fs_generator::generate_set_sample_id(fs_inst *inst, 1557 struct brw_reg dst, 1558 struct brw_reg src0, 1559 struct brw_reg src1) 1560{ 1561 assert(dst.type == BRW_REGISTER_TYPE_D || 1562 dst.type == BRW_REGISTER_TYPE_UD); 1563 assert(src0.type == BRW_REGISTER_TYPE_D || 1564 src0.type == BRW_REGISTER_TYPE_UD); 1565 1566 const struct brw_reg reg = stride(src1, 1, 4, 0); 1567 const unsigned lower_size = MIN2(inst->exec_size, 1568 devinfo->gen >= 8 ? 16 : 8); 1569 1570 for (unsigned i = 0; i < inst->exec_size / lower_size; i++) { 1571 brw_inst *insn = brw_ADD(p, offset(dst, i * lower_size / 8), 1572 offset(src0, (src0.vstride == 0 ? 0 : (1 << (src0.vstride - 1)) * 1573 (i * lower_size / (1 << src0.width))) * 1574 type_sz(src0.type) / REG_SIZE), 1575 suboffset(reg, i * lower_size / 4)); 1576 brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1); 1577 brw_inst_set_group(devinfo, insn, inst->group + lower_size * i); 1578 brw_inst_set_compression(devinfo, insn, lower_size > 8); 1579 } 1580} 1581 1582void 1583fs_generator::generate_pack_half_2x16_split(fs_inst *, 1584 struct brw_reg dst, 1585 struct brw_reg x, 1586 struct brw_reg y) 1587{ 1588 assert(devinfo->gen >= 7); 1589 assert(dst.type == BRW_REGISTER_TYPE_UD); 1590 assert(x.type == BRW_REGISTER_TYPE_F); 1591 assert(y.type == BRW_REGISTER_TYPE_F); 1592 1593 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16: 1594 * 1595 * Because this instruction does not have a 16-bit floating-point type, 1596 * the destination data type must be Word (W). 1597 * 1598 * The destination must be DWord-aligned and specify a horizontal stride 1599 * (HorzStride) of 2. The 16-bit result is stored in the lower word of 1600 * each destination channel and the upper word is not modified. 1601 */ 1602 struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2); 1603 1604 /* Give each 32-bit channel of dst the form below, where "." means 1605 * unchanged. 1606 * 0x....hhhh 1607 */ 1608 brw_F32TO16(p, dst_w, y); 1609 1610 /* Now the form: 1611 * 0xhhhh0000 1612 */ 1613 brw_SHL(p, dst, dst, brw_imm_ud(16u)); 1614 1615 /* And, finally the form of packHalf2x16's output: 1616 * 0xhhhhllll 1617 */ 1618 brw_F32TO16(p, dst_w, x); 1619} 1620 1621void 1622fs_generator::generate_shader_time_add(fs_inst *, 1623 struct brw_reg payload, 1624 struct brw_reg offset, 1625 struct brw_reg value) 1626{ 1627 assert(devinfo->gen >= 7); 1628 brw_push_insn_state(p); 1629 brw_set_default_mask_control(p, true); 1630 1631 assert(payload.file == BRW_GENERAL_REGISTER_FILE); 1632 struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0), 1633 offset.type); 1634 struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0), 1635 value.type); 1636 1637 assert(offset.file == BRW_IMMEDIATE_VALUE); 1638 if (value.file == BRW_GENERAL_REGISTER_FILE) { 1639 value.width = BRW_WIDTH_1; 1640 value.hstride = BRW_HORIZONTAL_STRIDE_0; 1641 value.vstride = BRW_VERTICAL_STRIDE_0; 1642 } else { 1643 assert(value.file == BRW_IMMEDIATE_VALUE); 1644 } 1645 1646 /* Trying to deal with setup of the params from the IR is crazy in the FS8 1647 * case, and we don't really care about squeezing every bit of performance 1648 * out of this path, so we just emit the MOVs from here. 1649 */ 1650 brw_MOV(p, payload_offset, offset); 1651 brw_MOV(p, payload_value, value); 1652 brw_shader_time_add(p, payload, 1653 prog_data->binding_table.shader_time_start); 1654 brw_pop_insn_state(p); 1655} 1656 1657void 1658fs_generator::enable_debug(const char *shader_name) 1659{ 1660 debug_flag = true; 1661 this->shader_name = shader_name; 1662} 1663 1664int 1665fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) 1666{ 1667 /* align to 64 byte boundary. */ 1668 while (p->next_insn_offset % 64) 1669 brw_NOP(p); 1670 1671 this->dispatch_width = dispatch_width; 1672 1673 int start_offset = p->next_insn_offset; 1674 int spill_count = 0, fill_count = 0; 1675 int loop_count = 0; 1676 1677 struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg); 1678 1679 foreach_block_and_inst (block, fs_inst, inst, cfg) { 1680 struct brw_reg src[4], dst; 1681 unsigned int last_insn_offset = p->next_insn_offset; 1682 bool multiple_instructions_emitted = false; 1683 1684 /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the 1685 * "Register Region Restrictions" section: for BDW, SKL: 1686 * 1687 * "A POW/FDIV operation must not be followed by an instruction 1688 * that requires two destination registers." 1689 * 1690 * The documentation is often lacking annotations for Atom parts, 1691 * and empirically this affects CHV as well. 1692 */ 1693 if (devinfo->gen >= 8 && 1694 devinfo->gen <= 9 && 1695 p->nr_insn > 1 && 1696 brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH && 1697 brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW && 1698 inst->dst.component_size(inst->exec_size) > REG_SIZE) { 1699 brw_NOP(p); 1700 last_insn_offset = p->next_insn_offset; 1701 } 1702 1703 if (unlikely(debug_flag)) 1704 disasm_annotate(disasm_info, inst, p->next_insn_offset); 1705 1706 /* If the instruction writes to more than one register, it needs to be 1707 * explicitly marked as compressed on Gen <= 5. On Gen >= 6 the 1708 * hardware figures out by itself what the right compression mode is, 1709 * but we still need to know whether the instruction is compressed to 1710 * set up the source register regions appropriately. 1711 * 1712 * XXX - This is wrong for instructions that write a single register but 1713 * read more than one which should strictly speaking be treated as 1714 * compressed. For instructions that don't write any registers it 1715 * relies on the destination being a null register of the correct 1716 * type and regioning so the instruction is considered compressed 1717 * or not accordingly. 1718 */ 1719 const bool compressed = 1720 inst->dst.component_size(inst->exec_size) > REG_SIZE; 1721 brw_set_default_compression(p, compressed); 1722 brw_set_default_group(p, inst->group); 1723 1724 for (unsigned int i = 0; i < inst->sources; i++) { 1725 src[i] = brw_reg_from_fs_reg(devinfo, inst, 1726 &inst->src[i], compressed); 1727 /* The accumulator result appears to get used for the 1728 * conditional modifier generation. When negating a UD 1729 * value, there is a 33rd bit generated for the sign in the 1730 * accumulator value, so now you can't check, for example, 1731 * equality with a 32-bit value. See piglit fs-op-neg-uvec4. 1732 */ 1733 assert(!inst->conditional_mod || 1734 inst->src[i].type != BRW_REGISTER_TYPE_UD || 1735 !inst->src[i].negate); 1736 } 1737 dst = brw_reg_from_fs_reg(devinfo, inst, 1738 &inst->dst, compressed); 1739 1740 brw_set_default_access_mode(p, BRW_ALIGN_1); 1741 brw_set_default_predicate_control(p, inst->predicate); 1742 brw_set_default_predicate_inverse(p, inst->predicate_inverse); 1743 /* On gen7 and above, hardware automatically adds the group onto the 1744 * flag subregister number. On Sandy Bridge and older, we have to do it 1745 * ourselves. 1746 */ 1747 const unsigned flag_subreg = inst->flag_subreg + 1748 (devinfo->gen >= 7 ? 0 : inst->group / 16); 1749 brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2); 1750 brw_set_default_saturate(p, inst->saturate); 1751 brw_set_default_mask_control(p, inst->force_writemask_all); 1752 brw_set_default_acc_write_control(p, inst->writes_accumulator); 1753 1754 unsigned exec_size = inst->exec_size; 1755 if (devinfo->gen == 7 && !devinfo->is_haswell && 1756 (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) { 1757 exec_size *= 2; 1758 } 1759 1760 brw_set_default_exec_size(p, cvt(exec_size) - 1); 1761 1762 assert(inst->force_writemask_all || inst->exec_size >= 4); 1763 assert(inst->force_writemask_all || inst->group % inst->exec_size == 0); 1764 assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen)); 1765 assert(inst->mlen <= BRW_MAX_MSG_LENGTH); 1766 1767 switch (inst->opcode) { 1768 case BRW_OPCODE_MOV: 1769 brw_MOV(p, dst, src[0]); 1770 break; 1771 case BRW_OPCODE_ADD: 1772 brw_ADD(p, dst, src[0], src[1]); 1773 break; 1774 case BRW_OPCODE_MUL: 1775 brw_MUL(p, dst, src[0], src[1]); 1776 break; 1777 case BRW_OPCODE_AVG: 1778 brw_AVG(p, dst, src[0], src[1]); 1779 break; 1780 case BRW_OPCODE_MACH: 1781 brw_MACH(p, dst, src[0], src[1]); 1782 break; 1783 1784 case BRW_OPCODE_LINE: 1785 brw_LINE(p, dst, src[0], src[1]); 1786 break; 1787 1788 case BRW_OPCODE_MAD: 1789 assert(devinfo->gen >= 6); 1790 if (devinfo->gen < 10) 1791 brw_set_default_access_mode(p, BRW_ALIGN_16); 1792 brw_MAD(p, dst, src[0], src[1], src[2]); 1793 break; 1794 1795 case BRW_OPCODE_LRP: 1796 assert(devinfo->gen >= 6 && devinfo->gen <= 10); 1797 if (devinfo->gen < 10) 1798 brw_set_default_access_mode(p, BRW_ALIGN_16); 1799 brw_LRP(p, dst, src[0], src[1], src[2]); 1800 break; 1801 1802 case BRW_OPCODE_FRC: 1803 brw_FRC(p, dst, src[0]); 1804 break; 1805 case BRW_OPCODE_RNDD: 1806 brw_RNDD(p, dst, src[0]); 1807 break; 1808 case BRW_OPCODE_RNDE: 1809 brw_RNDE(p, dst, src[0]); 1810 break; 1811 case BRW_OPCODE_RNDZ: 1812 brw_RNDZ(p, dst, src[0]); 1813 break; 1814 1815 case BRW_OPCODE_AND: 1816 brw_AND(p, dst, src[0], src[1]); 1817 break; 1818 case BRW_OPCODE_OR: 1819 brw_OR(p, dst, src[0], src[1]); 1820 break; 1821 case BRW_OPCODE_XOR: 1822 brw_XOR(p, dst, src[0], src[1]); 1823 break; 1824 case BRW_OPCODE_NOT: 1825 brw_NOT(p, dst, src[0]); 1826 break; 1827 case BRW_OPCODE_ASR: 1828 brw_ASR(p, dst, src[0], src[1]); 1829 break; 1830 case BRW_OPCODE_SHR: 1831 brw_SHR(p, dst, src[0], src[1]); 1832 break; 1833 case BRW_OPCODE_SHL: 1834 brw_SHL(p, dst, src[0], src[1]); 1835 break; 1836 case BRW_OPCODE_F32TO16: 1837 assert(devinfo->gen >= 7); 1838 brw_F32TO16(p, dst, src[0]); 1839 break; 1840 case BRW_OPCODE_F16TO32: 1841 assert(devinfo->gen >= 7); 1842 brw_F16TO32(p, dst, src[0]); 1843 break; 1844 case BRW_OPCODE_CMP: 1845 if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell && 1846 dst.file == BRW_ARCHITECTURE_REGISTER_FILE) { 1847 /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround 1848 * implemented in the compiler is not sufficient. Overriding the 1849 * type when the destination is the null register is necessary but 1850 * not sufficient by itself. 1851 */ 1852 assert(dst.nr == BRW_ARF_NULL); 1853 dst.type = BRW_REGISTER_TYPE_D; 1854 } 1855 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 1856 break; 1857 case BRW_OPCODE_SEL: 1858 brw_SEL(p, dst, src[0], src[1]); 1859 break; 1860 case BRW_OPCODE_CSEL: 1861 assert(devinfo->gen >= 8); 1862 if (devinfo->gen < 10) 1863 brw_set_default_access_mode(p, BRW_ALIGN_16); 1864 brw_CSEL(p, dst, src[0], src[1], src[2]); 1865 break; 1866 case BRW_OPCODE_BFREV: 1867 assert(devinfo->gen >= 7); 1868 brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD), 1869 retype(src[0], BRW_REGISTER_TYPE_UD)); 1870 break; 1871 case BRW_OPCODE_FBH: 1872 assert(devinfo->gen >= 7); 1873 brw_FBH(p, retype(dst, src[0].type), src[0]); 1874 break; 1875 case BRW_OPCODE_FBL: 1876 assert(devinfo->gen >= 7); 1877 brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), 1878 retype(src[0], BRW_REGISTER_TYPE_UD)); 1879 break; 1880 case BRW_OPCODE_LZD: 1881 brw_LZD(p, dst, src[0]); 1882 break; 1883 case BRW_OPCODE_CBIT: 1884 assert(devinfo->gen >= 7); 1885 brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), 1886 retype(src[0], BRW_REGISTER_TYPE_UD)); 1887 break; 1888 case BRW_OPCODE_ADDC: 1889 assert(devinfo->gen >= 7); 1890 brw_ADDC(p, dst, src[0], src[1]); 1891 break; 1892 case BRW_OPCODE_SUBB: 1893 assert(devinfo->gen >= 7); 1894 brw_SUBB(p, dst, src[0], src[1]); 1895 break; 1896 case BRW_OPCODE_MAC: 1897 brw_MAC(p, dst, src[0], src[1]); 1898 break; 1899 1900 case BRW_OPCODE_BFE: 1901 assert(devinfo->gen >= 7); 1902 if (devinfo->gen < 10) 1903 brw_set_default_access_mode(p, BRW_ALIGN_16); 1904 brw_BFE(p, dst, src[0], src[1], src[2]); 1905 break; 1906 1907 case BRW_OPCODE_BFI1: 1908 assert(devinfo->gen >= 7); 1909 brw_BFI1(p, dst, src[0], src[1]); 1910 break; 1911 case BRW_OPCODE_BFI2: 1912 assert(devinfo->gen >= 7); 1913 if (devinfo->gen < 10) 1914 brw_set_default_access_mode(p, BRW_ALIGN_16); 1915 brw_BFI2(p, dst, src[0], src[1], src[2]); 1916 break; 1917 1918 case BRW_OPCODE_IF: 1919 if (inst->src[0].file != BAD_FILE) { 1920 /* The instruction has an embedded compare (only allowed on gen6) */ 1921 assert(devinfo->gen == 6); 1922 gen6_IF(p, inst->conditional_mod, src[0], src[1]); 1923 } else { 1924 brw_IF(p, brw_get_default_exec_size(p)); 1925 } 1926 break; 1927 1928 case BRW_OPCODE_ELSE: 1929 brw_ELSE(p); 1930 break; 1931 case BRW_OPCODE_ENDIF: 1932 brw_ENDIF(p); 1933 break; 1934 1935 case BRW_OPCODE_DO: 1936 brw_DO(p, brw_get_default_exec_size(p)); 1937 break; 1938 1939 case BRW_OPCODE_BREAK: 1940 brw_BREAK(p); 1941 break; 1942 case BRW_OPCODE_CONTINUE: 1943 brw_CONT(p); 1944 break; 1945 1946 case BRW_OPCODE_WHILE: 1947 brw_WHILE(p); 1948 loop_count++; 1949 break; 1950 1951 case SHADER_OPCODE_RCP: 1952 case SHADER_OPCODE_RSQ: 1953 case SHADER_OPCODE_SQRT: 1954 case SHADER_OPCODE_EXP2: 1955 case SHADER_OPCODE_LOG2: 1956 case SHADER_OPCODE_SIN: 1957 case SHADER_OPCODE_COS: 1958 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); 1959 if (devinfo->gen >= 6) { 1960 assert(inst->mlen == 0); 1961 assert(devinfo->gen >= 7 || inst->exec_size == 8); 1962 gen6_math(p, dst, brw_math_function(inst->opcode), 1963 src[0], brw_null_reg()); 1964 } else { 1965 assert(inst->mlen >= 1); 1966 assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8); 1967 gen4_math(p, dst, 1968 brw_math_function(inst->opcode), 1969 inst->base_mrf, src[0], 1970 BRW_MATH_PRECISION_FULL); 1971 } 1972 break; 1973 case SHADER_OPCODE_INT_QUOTIENT: 1974 case SHADER_OPCODE_INT_REMAINDER: 1975 case SHADER_OPCODE_POW: 1976 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); 1977 if (devinfo->gen >= 6) { 1978 assert(inst->mlen == 0); 1979 assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) || 1980 inst->exec_size == 8); 1981 gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]); 1982 } else { 1983 assert(inst->mlen >= 1); 1984 assert(inst->exec_size == 8); 1985 gen4_math(p, dst, brw_math_function(inst->opcode), 1986 inst->base_mrf, src[0], 1987 BRW_MATH_PRECISION_FULL); 1988 } 1989 break; 1990 case FS_OPCODE_LINTERP: 1991 multiple_instructions_emitted = generate_linterp(inst, dst, src); 1992 break; 1993 case FS_OPCODE_PIXEL_X: 1994 assert(src[0].type == BRW_REGISTER_TYPE_UW); 1995 src[0].subnr = 0 * type_sz(src[0].type); 1996 brw_MOV(p, dst, stride(src[0], 8, 4, 1)); 1997 break; 1998 case FS_OPCODE_PIXEL_Y: 1999 assert(src[0].type == BRW_REGISTER_TYPE_UW); 2000 src[0].subnr = 4 * type_sz(src[0].type); 2001 brw_MOV(p, dst, stride(src[0], 8, 4, 1)); 2002 break; 2003 2004 case SHADER_OPCODE_SEND: 2005 generate_send(inst, dst, src[0], src[1], src[2], 2006 inst->ex_mlen > 0 ? src[3] : brw_null_reg()); 2007 break; 2008 2009 case SHADER_OPCODE_GET_BUFFER_SIZE: 2010 generate_get_buffer_size(inst, dst, src[0], src[1]); 2011 break; 2012 case SHADER_OPCODE_TEX: 2013 case FS_OPCODE_TXB: 2014 case SHADER_OPCODE_TXD: 2015 case SHADER_OPCODE_TXF: 2016 case SHADER_OPCODE_TXF_CMS: 2017 case SHADER_OPCODE_TXL: 2018 case SHADER_OPCODE_TXS: 2019 case SHADER_OPCODE_LOD: 2020 case SHADER_OPCODE_TG4: 2021 case SHADER_OPCODE_SAMPLEINFO: 2022 assert(inst->src[0].file == BAD_FILE); 2023 generate_tex(inst, dst, src[1], src[2]); 2024 break; 2025 2026 case FS_OPCODE_DDX_COARSE: 2027 case FS_OPCODE_DDX_FINE: 2028 generate_ddx(inst, dst, src[0]); 2029 break; 2030 case FS_OPCODE_DDY_COARSE: 2031 case FS_OPCODE_DDY_FINE: 2032 generate_ddy(inst, dst, src[0]); 2033 break; 2034 2035 case SHADER_OPCODE_GEN4_SCRATCH_WRITE: 2036 generate_scratch_write(inst, src[0]); 2037 spill_count++; 2038 break; 2039 2040 case SHADER_OPCODE_GEN4_SCRATCH_READ: 2041 generate_scratch_read(inst, dst); 2042 fill_count++; 2043 break; 2044 2045 case SHADER_OPCODE_GEN7_SCRATCH_READ: 2046 generate_scratch_read_gen7(inst, dst); 2047 fill_count++; 2048 break; 2049 2050 case SHADER_OPCODE_MOV_INDIRECT: 2051 generate_mov_indirect(inst, dst, src[0], src[1]); 2052 break; 2053 2054 case SHADER_OPCODE_URB_READ_SIMD8: 2055 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: 2056 generate_urb_read(inst, dst, src[0]); 2057 break; 2058 2059 case SHADER_OPCODE_URB_WRITE_SIMD8: 2060 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: 2061 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: 2062 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: 2063 generate_urb_write(inst, src[0]); 2064 break; 2065 2066 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 2067 assert(inst->force_writemask_all); 2068 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]); 2069 break; 2070 2071 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: 2072 assert(inst->force_writemask_all); 2073 generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]); 2074 break; 2075 2076 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: 2077 generate_varying_pull_constant_load_gen4(inst, dst, src[0]); 2078 break; 2079 2080 case FS_OPCODE_REP_FB_WRITE: 2081 case FS_OPCODE_FB_WRITE: 2082 generate_fb_write(inst, src[0]); 2083 break; 2084 2085 case FS_OPCODE_FB_READ: 2086 generate_fb_read(inst, dst, src[0]); 2087 break; 2088 2089 case FS_OPCODE_DISCARD_JUMP: 2090 generate_discard_jump(inst); 2091 break; 2092 2093 case SHADER_OPCODE_SHADER_TIME_ADD: 2094 generate_shader_time_add(inst, src[0], src[1], src[2]); 2095 break; 2096 2097 case SHADER_OPCODE_MEMORY_FENCE: 2098 assert(src[1].file == BRW_IMMEDIATE_VALUE); 2099 brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud); 2100 break; 2101 2102 case SHADER_OPCODE_INTERLOCK: 2103 assert(devinfo->gen >= 9); 2104 /* The interlock is basically a memory fence issued via sendc */ 2105 brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false); 2106 break; 2107 2108 case SHADER_OPCODE_FIND_LIVE_CHANNEL: { 2109 const struct brw_reg mask = 2110 brw_stage_has_packed_dispatch(devinfo, stage, 2111 prog_data) ? brw_imm_ud(~0u) : 2112 stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() : 2113 brw_dmask_reg(); 2114 brw_find_live_channel(p, dst, mask); 2115 break; 2116 } 2117 2118 case SHADER_OPCODE_BROADCAST: 2119 assert(inst->force_writemask_all); 2120 brw_broadcast(p, dst, src[0], src[1]); 2121 break; 2122 2123 case SHADER_OPCODE_SHUFFLE: 2124 generate_shuffle(inst, dst, src[0], src[1]); 2125 break; 2126 2127 case SHADER_OPCODE_SEL_EXEC: 2128 assert(inst->force_writemask_all); 2129 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 2130 brw_MOV(p, dst, src[1]); 2131 brw_set_default_mask_control(p, BRW_MASK_ENABLE); 2132 brw_MOV(p, dst, src[0]); 2133 break; 2134 2135 case SHADER_OPCODE_QUAD_SWIZZLE: 2136 assert(src[1].file == BRW_IMMEDIATE_VALUE); 2137 assert(src[1].type == BRW_REGISTER_TYPE_UD); 2138 generate_quad_swizzle(inst, dst, src[0], src[1].ud); 2139 break; 2140 2141 case SHADER_OPCODE_CLUSTER_BROADCAST: { 2142 assert(src[0].type == dst.type); 2143 assert(!src[0].negate && !src[0].abs); 2144 assert(src[1].file == BRW_IMMEDIATE_VALUE); 2145 assert(src[1].type == BRW_REGISTER_TYPE_UD); 2146 assert(src[2].file == BRW_IMMEDIATE_VALUE); 2147 assert(src[2].type == BRW_REGISTER_TYPE_UD); 2148 const unsigned component = src[1].ud; 2149 const unsigned cluster_size = src[2].ud; 2150 struct brw_reg strided = stride(suboffset(src[0], component), 2151 cluster_size, cluster_size, 0); 2152 if (type_sz(src[0].type) > 4 && 2153 (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) { 2154 /* IVB has an issue (which we found empirically) where it reads 2155 * two address register components per channel for indirectly 2156 * addressed 64-bit sources. 2157 * 2158 * From the Cherryview PRM Vol 7. "Register Region Restrictions": 2159 * 2160 * "When source or destination datatype is 64b or operation is 2161 * integer DWord multiply, indirect addressing must not be 2162 * used." 2163 * 2164 * To work around both of these, we do two integer MOVs insead of 2165 * one 64-bit MOV. Because no double value should ever cross a 2166 * register boundary, it's safe to use the immediate offset in the 2167 * indirect here to handle adding 4 bytes to the offset and avoid 2168 * the extra ADD to the register file. 2169 */ 2170 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), 2171 subscript(strided, BRW_REGISTER_TYPE_D, 0)); 2172 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), 2173 subscript(strided, BRW_REGISTER_TYPE_D, 1)); 2174 } else { 2175 brw_MOV(p, dst, strided); 2176 } 2177 break; 2178 } 2179 2180 case FS_OPCODE_SET_SAMPLE_ID: 2181 generate_set_sample_id(inst, dst, src[0], src[1]); 2182 break; 2183 2184 case FS_OPCODE_PACK_HALF_2x16_SPLIT: 2185 generate_pack_half_2x16_split(inst, dst, src[0], src[1]); 2186 break; 2187 2188 case FS_OPCODE_PLACEHOLDER_HALT: 2189 /* This is the place where the final HALT needs to be inserted if 2190 * we've emitted any discards. If not, this will emit no code. 2191 */ 2192 if (!patch_discard_jumps_to_fb_writes()) { 2193 if (unlikely(debug_flag)) { 2194 disasm_info->use_tail = true; 2195 } 2196 } 2197 break; 2198 2199 case FS_OPCODE_INTERPOLATE_AT_SAMPLE: 2200 generate_pixel_interpolator_query(inst, dst, src[0], src[1], 2201 GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE); 2202 break; 2203 2204 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: 2205 generate_pixel_interpolator_query(inst, dst, src[0], src[1], 2206 GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET); 2207 break; 2208 2209 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: 2210 generate_pixel_interpolator_query(inst, dst, src[0], src[1], 2211 GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET); 2212 break; 2213 2214 case CS_OPCODE_CS_TERMINATE: 2215 generate_cs_terminate(inst, src[0]); 2216 break; 2217 2218 case SHADER_OPCODE_BARRIER: 2219 generate_barrier(inst, src[0]); 2220 break; 2221 2222 case BRW_OPCODE_DIM: 2223 assert(devinfo->is_haswell); 2224 assert(src[0].type == BRW_REGISTER_TYPE_DF); 2225 assert(dst.type == BRW_REGISTER_TYPE_DF); 2226 brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F)); 2227 break; 2228 2229 case SHADER_OPCODE_RND_MODE: 2230 assert(src[0].file == BRW_IMMEDIATE_VALUE); 2231 brw_rounding_mode(p, (brw_rnd_mode) src[0].d); 2232 break; 2233 2234 default: 2235 unreachable("Unsupported opcode"); 2236 2237 case SHADER_OPCODE_LOAD_PAYLOAD: 2238 unreachable("Should be lowered by lower_load_payload()"); 2239 } 2240 2241 if (multiple_instructions_emitted) 2242 continue; 2243 2244 if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) { 2245 assert(p->next_insn_offset == last_insn_offset + 16 || 2246 !"conditional_mod, no_dd_check, or no_dd_clear set for IR " 2247 "emitting more than 1 instruction"); 2248 2249 brw_inst *last = &p->store[last_insn_offset / 16]; 2250 2251 if (inst->conditional_mod) 2252 brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod); 2253 brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); 2254 brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); 2255 } 2256 } 2257 2258 brw_set_uip_jip(p, start_offset); 2259 2260 /* end of program sentinel */ 2261 disasm_new_inst_group(disasm_info, p->next_insn_offset); 2262 2263#ifndef NDEBUG 2264 bool validated = 2265#else 2266 if (unlikely(debug_flag)) 2267#endif 2268 brw_validate_instructions(devinfo, p->store, 2269 start_offset, 2270 p->next_insn_offset, 2271 disasm_info); 2272 2273 int before_size = p->next_insn_offset - start_offset; 2274 brw_compact_instructions(p, start_offset, disasm_info); 2275 int after_size = p->next_insn_offset - start_offset; 2276 2277 if (unlikely(debug_flag)) { 2278 fprintf(stderr, "Native code for %s\n" 2279 "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d" 2280 " bytes (%.0f%%)\n", 2281 shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count, 2282 spill_count, fill_count, promoted_constants, before_size, after_size, 2283 100.0f * (before_size - after_size) / before_size); 2284 2285 dump_assembly(p->store, disasm_info); 2286 } 2287 ralloc_free(disasm_info); 2288 assert(validated); 2289 2290 compiler->shader_debug_log(log_data, 2291 "%s SIMD%d shader: %d inst, %d loops, %u cycles, " 2292 "%d:%d spills:fills, Promoted %u constants, " 2293 "compacted %d to %d bytes.", 2294 _mesa_shader_stage_to_abbrev(stage), 2295 dispatch_width, before_size / 16, 2296 loop_count, cfg->cycle_count, spill_count, 2297 fill_count, promoted_constants, before_size, 2298 after_size); 2299 2300 return start_offset; 2301} 2302 2303const unsigned * 2304fs_generator::get_assembly() 2305{ 2306 return brw_get_program(p, &prog_data->program_size); 2307} 2308