1/* 2 * Copyright © 2019 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25#include "aco_builder.h" 26#include "aco_ir.h" 27 28#include <algorithm> 29#include <bitset> 30#include <stack> 31#include <vector> 32 33namespace aco { 34namespace { 35 36struct State { 37 Program* program; 38 Block* block; 39 std::vector<aco_ptr<Instruction>> old_instructions; 40}; 41 42struct NOP_ctx_gfx6 { 43 void join(const NOP_ctx_gfx6& other) 44 { 45 set_vskip_mode_then_vector = 46 MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector); 47 valu_wr_vcc_then_vccz = MAX2(valu_wr_vcc_then_vccz, other.valu_wr_vcc_then_vccz); 48 valu_wr_exec_then_execz = MAX2(valu_wr_exec_then_execz, other.valu_wr_exec_then_execz); 49 valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas); 50 salu_wr_m0_then_gds_msg_ttrace = 51 MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace); 52 valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp); 53 salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds); 54 salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel); 55 setreg_then_getsetreg = MAX2(setreg_then_getsetreg, other.setreg_then_getsetreg); 56 vmem_store_then_wr_data |= other.vmem_store_then_wr_data; 57 smem_clause |= other.smem_clause; 58 smem_write |= other.smem_write; 59 for (unsigned i = 0; i < BITSET_WORDS(128); i++) { 60 smem_clause_read_write[i] |= other.smem_clause_read_write[i]; 61 smem_clause_write[i] |= other.smem_clause_write[i]; 62 } 63 } 64 65 bool operator==(const NOP_ctx_gfx6& other) 66 { 67 return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector && 68 valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz && 69 valu_wr_exec_then_execz == other.valu_wr_exec_then_execz && 70 valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas && 71 vmem_store_then_wr_data == other.vmem_store_then_wr_data && 72 salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace && 73 valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp && 74 salu_wr_m0_then_lds == other.salu_wr_m0_then_lds && 75 salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel && 76 setreg_then_getsetreg == other.setreg_then_getsetreg && 77 smem_clause == other.smem_clause && smem_write == other.smem_write && 78 BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) && 79 BITSET_EQUAL(smem_clause_write, other.smem_clause_write); 80 } 81 82 void add_wait_states(unsigned amount) 83 { 84 if ((set_vskip_mode_then_vector -= amount) < 0) 85 set_vskip_mode_then_vector = 0; 86 87 if ((valu_wr_vcc_then_vccz -= amount) < 0) 88 valu_wr_vcc_then_vccz = 0; 89 90 if ((valu_wr_exec_then_execz -= amount) < 0) 91 valu_wr_exec_then_execz = 0; 92 93 if ((valu_wr_vcc_then_div_fmas -= amount) < 0) 94 valu_wr_vcc_then_div_fmas = 0; 95 96 if ((salu_wr_m0_then_gds_msg_ttrace -= amount) < 0) 97 salu_wr_m0_then_gds_msg_ttrace = 0; 98 99 if ((valu_wr_exec_then_dpp -= amount) < 0) 100 valu_wr_exec_then_dpp = 0; 101 102 if ((salu_wr_m0_then_lds -= amount) < 0) 103 salu_wr_m0_then_lds = 0; 104 105 if ((salu_wr_m0_then_moverel -= amount) < 0) 106 salu_wr_m0_then_moverel = 0; 107 108 if ((setreg_then_getsetreg -= amount) < 0) 109 setreg_then_getsetreg = 0; 110 111 vmem_store_then_wr_data.reset(); 112 } 113 114 /* setting MODE.vskip and then any vector op requires 2 wait states */ 115 int8_t set_vskip_mode_then_vector = 0; 116 117 /* VALU writing VCC/EXEC and then a VALU reading VCCZ/EXECZ requires 5 wait states */ 118 int8_t valu_wr_vcc_then_vccz = 0; 119 int8_t valu_wr_exec_then_execz = 0; 120 121 /* VALU writing VCC followed by v_div_fmas require 4 wait states */ 122 int8_t valu_wr_vcc_then_div_fmas = 0; 123 124 /* SALU writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state */ 125 int8_t salu_wr_m0_then_gds_msg_ttrace = 0; 126 127 /* VALU writing EXEC followed by DPP requires 5 wait states */ 128 int8_t valu_wr_exec_then_dpp = 0; 129 130 /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX10 */ 131 int8_t salu_wr_m0_then_lds = 0; 132 133 /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */ 134 int8_t salu_wr_m0_then_moverel = 0; 135 136 /* s_setreg followed by a s_getreg/s_setreg of the same register needs 2 wait states 137 * currently we don't look at the actual register */ 138 int8_t setreg_then_getsetreg = 0; 139 140 /* some memory instructions writing >64bit followed by a instructions 141 * writing the VGPRs holding the writedata requires 1 wait state */ 142 std::bitset<256> vmem_store_then_wr_data; 143 144 /* we break up SMEM clauses that contain stores or overwrite an 145 * operand/definition of another instruction in the clause */ 146 bool smem_clause = false; 147 bool smem_write = false; 148 BITSET_DECLARE(smem_clause_read_write, 128) = {0}; 149 BITSET_DECLARE(smem_clause_write, 128) = {0}; 150}; 151 152struct NOP_ctx_gfx10 { 153 bool has_VOPC = false; 154 bool has_nonVALU_exec_read = false; 155 bool has_VMEM = false; 156 bool has_branch_after_VMEM = false; 157 bool has_DS = false; 158 bool has_branch_after_DS = false; 159 bool has_NSA_MIMG = false; 160 bool has_writelane = false; 161 std::bitset<128> sgprs_read_by_VMEM; 162 std::bitset<128> sgprs_read_by_SMEM; 163 164 void join(const NOP_ctx_gfx10& other) 165 { 166 has_VOPC |= other.has_VOPC; 167 has_nonVALU_exec_read |= other.has_nonVALU_exec_read; 168 has_VMEM |= other.has_VMEM; 169 has_branch_after_VMEM |= other.has_branch_after_VMEM; 170 has_DS |= other.has_DS; 171 has_branch_after_DS |= other.has_branch_after_DS; 172 has_NSA_MIMG |= other.has_NSA_MIMG; 173 has_writelane |= other.has_writelane; 174 sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM; 175 sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM; 176 } 177 178 bool operator==(const NOP_ctx_gfx10& other) 179 { 180 return has_VOPC == other.has_VOPC && has_nonVALU_exec_read == other.has_nonVALU_exec_read && 181 has_VMEM == other.has_VMEM && has_branch_after_VMEM == other.has_branch_after_VMEM && 182 has_DS == other.has_DS && has_branch_after_DS == other.has_branch_after_DS && 183 has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane && 184 sgprs_read_by_VMEM == other.sgprs_read_by_VMEM && 185 sgprs_read_by_SMEM == other.sgprs_read_by_SMEM; 186 } 187}; 188 189int 190get_wait_states(aco_ptr<Instruction>& instr) 191{ 192 if (instr->opcode == aco_opcode::s_nop) 193 return instr->sopp().imm + 1; 194 else if (instr->opcode == aco_opcode::p_constaddr) 195 return 3; /* lowered to 3 instructions in the assembler */ 196 else 197 return 1; 198} 199 200bool 201regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size) 202{ 203 return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size); 204} 205 206template <bool Valu, bool Vintrp, bool Salu> 207bool 208handle_raw_hazard_instr(aco_ptr<Instruction>& pred, PhysReg reg, int* nops_needed, uint32_t* mask) 209{ 210 unsigned mask_size = util_last_bit(*mask); 211 212 uint32_t writemask = 0; 213 for (Definition& def : pred->definitions) { 214 if (regs_intersect(reg, mask_size, def.physReg(), def.size())) { 215 unsigned start = def.physReg() > reg ? def.physReg() - reg : 0; 216 unsigned end = MIN2(mask_size, start + def.size()); 217 writemask |= u_bit_consecutive(start, end - start); 218 } 219 } 220 221 bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || (pred->isVINTRP() && Vintrp) || 222 (pred->isSALU() && Salu)); 223 if (is_hazard) 224 return true; 225 226 *mask &= ~writemask; 227 *nops_needed = MAX2(*nops_needed - get_wait_states(pred), 0); 228 229 if (*mask == 0) 230 *nops_needed = 0; 231 232 return *nops_needed == 0; 233} 234 235template <bool Valu, bool Vintrp, bool Salu> 236int 237handle_raw_hazard_internal(State& state, Block* block, int nops_needed, PhysReg reg, uint32_t mask, 238 bool start_at_end) 239{ 240 if (block == state.block && start_at_end) { 241 /* If it's the current block, block->instructions is incomplete. */ 242 for (int pred_idx = state.old_instructions.size() - 1; pred_idx >= 0; pred_idx--) { 243 aco_ptr<Instruction>& instr = state.old_instructions[pred_idx]; 244 if (!instr) 245 break; /* Instruction has been moved to block->instructions. */ 246 if (handle_raw_hazard_instr<Valu, Vintrp, Salu>(instr, reg, &nops_needed, &mask)) 247 return nops_needed; 248 } 249 } 250 for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) { 251 if (handle_raw_hazard_instr<Valu, Vintrp, Salu>(block->instructions[pred_idx], reg, 252 &nops_needed, &mask)) 253 return nops_needed; 254 } 255 256 int res = 0; 257 258 /* Loops require branch instructions, which count towards the wait 259 * states. So even with loops this should finish unless nops_needed is some 260 * huge value. */ 261 for (unsigned lin_pred : block->linear_preds) { 262 res = 263 std::max(res, handle_raw_hazard_internal<Valu, Vintrp, Salu>( 264 state, &state.program->blocks[lin_pred], nops_needed, reg, mask, true)); 265 } 266 return res; 267} 268 269template <bool Valu, bool Vintrp, bool Salu> 270void 271handle_raw_hazard(State& state, int* NOPs, int min_states, Operand op) 272{ 273 if (*NOPs >= min_states) 274 return; 275 int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>( 276 state, state.block, min_states, op.physReg(), u_bit_consecutive(0, op.size()), false); 277 *NOPs = MAX2(*NOPs, res); 278} 279 280static auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>; 281static auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>; 282static auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>; 283 284void 285set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size) 286{ 287 unsigned end = start + size - 1; 288 unsigned start_mod = start % BITSET_WORDBITS; 289 if (start_mod + size <= BITSET_WORDBITS) { 290 BITSET_SET_RANGE_INSIDE_WORD(words, start, end); 291 } else { 292 unsigned first_size = BITSET_WORDBITS - start_mod; 293 set_bitset_range(words, start, BITSET_WORDBITS - start_mod); 294 set_bitset_range(words, start + first_size, size - first_size); 295 } 296} 297 298bool 299test_bitset_range(BITSET_WORD* words, unsigned start, unsigned size) 300{ 301 unsigned end = start + size - 1; 302 unsigned start_mod = start % BITSET_WORDBITS; 303 if (start_mod + size <= BITSET_WORDBITS) { 304 return BITSET_TEST_RANGE(words, start, end); 305 } else { 306 unsigned first_size = BITSET_WORDBITS - start_mod; 307 return test_bitset_range(words, start, BITSET_WORDBITS - start_mod) || 308 test_bitset_range(words, start + first_size, size - first_size); 309 } 310} 311 312/* A SMEM clause is any group of consecutive SMEM instructions. The 313 * instructions in this group may return out of order and/or may be replayed. 314 * 315 * To fix this potential hazard correctly, we have to make sure that when a 316 * clause has more than one instruction, no instruction in the clause writes 317 * to a register that is read by another instruction in the clause (including 318 * itself). In this case, we have to break the SMEM clause by inserting non 319 * SMEM instructions. 320 * 321 * SMEM clauses are only present on GFX8+, and only matter when XNACK is set. 322 */ 323void 324handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr, 325 int* NOPs) 326{ 327 /* break off from previous SMEM clause if needed */ 328 if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) { 329 /* Don't allow clauses with store instructions since the clause's 330 * instructions may use the same address. */ 331 if (ctx.smem_write || instr->definitions.empty() || 332 instr_info.is_atomic[(unsigned)instr->opcode]) { 333 *NOPs = 1; 334 } else if (program->dev.xnack_enabled) { 335 for (Operand op : instr->operands) { 336 if (!op.isConstant() && 337 test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) { 338 *NOPs = 1; 339 break; 340 } 341 } 342 343 Definition def = instr->definitions[0]; 344 if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size())) 345 *NOPs = 1; 346 } 347 } 348} 349 350/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */ 351void 352handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr, 353 std::vector<aco_ptr<Instruction>>& new_instructions) 354{ 355 /* check hazards */ 356 int NOPs = 0; 357 358 if (instr->isSMEM()) { 359 if (state.program->chip_class == GFX6) { 360 /* A read of an SGPR by SMRD instruction requires 4 wait states 361 * when the SGPR was written by a VALU instruction. According to LLVM, 362 * there is also an undocumented hardware behavior when the buffer 363 * descriptor is written by a SALU instruction */ 364 for (unsigned i = 0; i < instr->operands.size(); i++) { 365 Operand op = instr->operands[i]; 366 if (op.isConstant()) 367 continue; 368 369 bool is_buffer_desc = i == 0 && op.size() > 2; 370 if (is_buffer_desc) 371 handle_valu_salu_then_read_hazard(state, &NOPs, 4, op); 372 else 373 handle_valu_then_read_hazard(state, &NOPs, 4, op); 374 } 375 } 376 377 handle_smem_clause_hazards(state.program, ctx, instr, &NOPs); 378 } else if (instr->isSALU()) { 379 if (instr->opcode == aco_opcode::s_setreg_b32 || 380 instr->opcode == aco_opcode::s_setreg_imm32_b32 || 381 instr->opcode == aco_opcode::s_getreg_b32) { 382 NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg); 383 } 384 385 if (state.program->chip_class == GFX9) { 386 if (instr->opcode == aco_opcode::s_movrels_b32 || 387 instr->opcode == aco_opcode::s_movrels_b64 || 388 instr->opcode == aco_opcode::s_movreld_b32 || 389 instr->opcode == aco_opcode::s_movreld_b64) { 390 NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel); 391 } 392 } 393 394 if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_ttracedata) 395 NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace); 396 } else if (instr->isDS() && instr->ds().gds) { 397 NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace); 398 } else if (instr->isVALU() || instr->isVINTRP()) { 399 for (Operand op : instr->operands) { 400 if (op.physReg() == vccz) 401 NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_vccz); 402 if (op.physReg() == execz) 403 NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_execz); 404 } 405 406 if (instr->isDPP()) { 407 NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp); 408 handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]); 409 } 410 411 for (Definition def : instr->definitions) { 412 if (def.regClass().type() != RegType::sgpr) { 413 for (unsigned i = 0; i < def.size(); i++) 414 NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data[(def.physReg() & 0xff) + i]); 415 } 416 } 417 418 if ((instr->opcode == aco_opcode::v_readlane_b32 || 419 instr->opcode == aco_opcode::v_readlane_b32_e64 || 420 instr->opcode == aco_opcode::v_writelane_b32 || 421 instr->opcode == aco_opcode::v_writelane_b32_e64) && 422 !instr->operands[1].isConstant()) { 423 handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]); 424 } 425 426 /* It's required to insert 1 wait state if the dst VGPR of any v_interp_* 427 * is followed by a read with v_readfirstlane or v_readlane to fix GPU 428 * hangs on GFX6. Note that v_writelane_* is apparently not affected. 429 * This hazard isn't documented anywhere but AMD confirmed that hazard. 430 */ 431 if (state.program->chip_class == GFX6 && 432 (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */ 433 instr->opcode == aco_opcode::v_readfirstlane_b32)) { 434 handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]); 435 } 436 437 if (instr->opcode == aco_opcode::v_div_fmas_f32 || 438 instr->opcode == aco_opcode::v_div_fmas_f64) 439 NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas); 440 } else if (instr->isVMEM() || instr->isFlatLike()) { 441 /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */ 442 for (Operand op : instr->operands) { 443 if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr) 444 handle_valu_then_read_hazard(state, &NOPs, 5, op); 445 } 446 } 447 448 if (!instr->isSALU() && instr->format != Format::SMEM) 449 NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector); 450 451 if (state.program->chip_class == GFX9) { 452 bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds; 453 if (instr->isVINTRP() || lds_scratch_global || 454 instr->opcode == aco_opcode::ds_read_addtid_b32 || 455 instr->opcode == aco_opcode::ds_write_addtid_b32 || 456 instr->opcode == aco_opcode::buffer_store_lds_dword) { 457 NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds); 458 } 459 } 460 461 ctx.add_wait_states(NOPs + get_wait_states(instr)); 462 463 // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles 464 if (NOPs) { 465 /* create NOP */ 466 aco_ptr<SOPP_instruction> nop{ 467 create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)}; 468 nop->imm = NOPs - 1; 469 nop->block = -1; 470 new_instructions.emplace_back(std::move(nop)); 471 } 472 473 /* update information to check for later hazards */ 474 if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) { 475 ctx.smem_clause = false; 476 ctx.smem_write = false; 477 478 if (state.program->dev.xnack_enabled) { 479 BITSET_ZERO(ctx.smem_clause_read_write); 480 BITSET_ZERO(ctx.smem_clause_write); 481 } 482 } 483 484 if (instr->isSMEM()) { 485 if (instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) { 486 ctx.smem_write = true; 487 } else { 488 ctx.smem_clause = true; 489 490 if (state.program->dev.xnack_enabled) { 491 for (Operand op : instr->operands) { 492 if (!op.isConstant()) { 493 set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size()); 494 } 495 } 496 497 Definition def = instr->definitions[0]; 498 set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()); 499 set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size()); 500 } 501 } 502 } else if (instr->isVALU()) { 503 for (Definition def : instr->definitions) { 504 if (def.regClass().type() == RegType::sgpr) { 505 if (def.physReg() == vcc || def.physReg() == vcc_hi) { 506 ctx.valu_wr_vcc_then_vccz = 5; 507 ctx.valu_wr_vcc_then_div_fmas = 4; 508 } 509 if (def.physReg() == exec || def.physReg() == exec_hi) { 510 ctx.valu_wr_exec_then_execz = 5; 511 ctx.valu_wr_exec_then_dpp = 5; 512 } 513 } 514 } 515 } else if (instr->isSALU() && !instr->definitions.empty()) { 516 if (!instr->definitions.empty()) { 517 /* all other definitions should be SCC */ 518 Definition def = instr->definitions[0]; 519 if (def.physReg() == m0) { 520 ctx.salu_wr_m0_then_gds_msg_ttrace = 1; 521 ctx.salu_wr_m0_then_lds = 1; 522 ctx.salu_wr_m0_then_moverel = 1; 523 } 524 } else if (instr->opcode == aco_opcode::s_setreg_b32 || 525 instr->opcode == aco_opcode::s_setreg_imm32_b32) { 526 SOPK_instruction& sopk = instr->sopk(); 527 unsigned offset = (sopk.imm >> 6) & 0x1f; 528 unsigned size = ((sopk.imm >> 11) & 0x1f) + 1; 529 unsigned reg = sopk.imm & 0x3f; 530 ctx.setreg_then_getsetreg = 2; 531 532 if (reg == 1 && offset >= 28 && size > (28 - offset)) 533 ctx.set_vskip_mode_then_vector = 2; 534 } 535 } else if (instr->isVMEM() || instr->isFlatLike()) { 536 /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */ 537 bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 && 538 instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128; 539 /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit 540 * store) */ 541 bool consider_mimg = instr->isMIMG() && 542 instr->operands[1].regClass().type() == RegType::vgpr && 543 instr->operands[1].size() > 2 && instr->operands[0].size() == 4; 544 /* FLAT/GLOBAL/SCRATCH store with >64-bit data */ 545 bool consider_flat = 546 instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2; 547 if (consider_buf || consider_mimg || consider_flat) { 548 PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg(); 549 unsigned size = instr->operands[consider_flat ? 2 : 3].size(); 550 for (unsigned i = 0; i < size; i++) 551 ctx.vmem_store_then_wr_data[(wrdata & 0xff) + i] = 1; 552 } 553 } 554} 555 556template <std::size_t N> 557bool 558check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs) 559{ 560 return std::any_of(instr->definitions.begin(), instr->definitions.end(), 561 [&check_regs](const Definition& def) -> bool 562 { 563 bool writes_any = false; 564 for (unsigned i = 0; i < def.size(); i++) { 565 unsigned def_reg = def.physReg() + i; 566 writes_any |= def_reg < check_regs.size() && check_regs[def_reg]; 567 } 568 return writes_any; 569 }); 570} 571 572template <std::size_t N> 573void 574mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads) 575{ 576 for (const Operand& op : instr->operands) { 577 for (unsigned i = 0; i < op.size(); i++) { 578 unsigned reg = op.physReg() + i; 579 if (reg < reg_reads.size()) 580 reg_reads.set(reg); 581 } 582 } 583} 584 585bool 586VALU_writes_sgpr(aco_ptr<Instruction>& instr) 587{ 588 if (instr->isVOPC()) 589 return true; 590 if (instr->isVOP3() && instr->definitions.size() == 2) 591 return true; 592 if (instr->opcode == aco_opcode::v_readfirstlane_b32 || 593 instr->opcode == aco_opcode::v_readlane_b32 || 594 instr->opcode == aco_opcode::v_readlane_b32_e64) 595 return true; 596 return false; 597} 598 599bool 600instr_writes_exec(const aco_ptr<Instruction>& instr) 601{ 602 return std::any_of(instr->definitions.begin(), instr->definitions.end(), 603 [](const Definition& def) -> bool 604 { return def.physReg() == exec_lo || def.physReg() == exec_hi; }); 605} 606 607bool 608instr_writes_sgpr(const aco_ptr<Instruction>& instr) 609{ 610 return std::any_of(instr->definitions.begin(), instr->definitions.end(), 611 [](const Definition& def) -> bool 612 { return def.getTemp().type() == RegType::sgpr; }); 613} 614 615inline bool 616instr_is_branch(const aco_ptr<Instruction>& instr) 617{ 618 return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 || 619 instr->opcode == aco_opcode::s_cbranch_scc1 || 620 instr->opcode == aco_opcode::s_cbranch_vccz || 621 instr->opcode == aco_opcode::s_cbranch_vccnz || 622 instr->opcode == aco_opcode::s_cbranch_execz || 623 instr->opcode == aco_opcode::s_cbranch_execnz || 624 instr->opcode == aco_opcode::s_cbranch_cdbgsys || 625 instr->opcode == aco_opcode::s_cbranch_cdbguser || 626 instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user || 627 instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user || 628 instr->opcode == aco_opcode::s_subvector_loop_begin || 629 instr->opcode == aco_opcode::s_subvector_loop_end || 630 instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 || 631 instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64; 632} 633 634void 635handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr, 636 std::vector<aco_ptr<Instruction>>& new_instructions) 637{ 638 // TODO: s_dcache_inv needs to be in it's own group on GFX10 639 640 /* VMEMtoScalarWriteHazard 641 * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" 642 * in-between. 643 */ 644 if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) { 645 /* Remember all SGPRs that are read by the VMEM instruction */ 646 mark_read_regs(instr, ctx.sgprs_read_by_VMEM); 647 ctx.sgprs_read_by_VMEM.set(exec); 648 if (state.program->wave_size == 64) 649 ctx.sgprs_read_by_VMEM.set(exec_hi); 650 } else if (instr->isSALU() || instr->isSMEM()) { 651 if (instr->opcode == aco_opcode::s_waitcnt) { 652 /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */ 653 uint16_t imm = instr->sopp().imm; 654 unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10); 655 if (vmcnt == 0) 656 ctx.sgprs_read_by_VMEM.reset(); 657 } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) { 658 /* Hazard is mitigated by a s_waitcnt_depctr with a magic imm */ 659 if (instr->sopp().imm == 0xffe3) 660 ctx.sgprs_read_by_VMEM.reset(); 661 } 662 663 /* Check if SALU writes an SGPR that was previously read by the VALU */ 664 if (check_written_regs(instr, ctx.sgprs_read_by_VMEM)) { 665 ctx.sgprs_read_by_VMEM.reset(); 666 667 /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */ 668 aco_ptr<SOPP_instruction> depctr{ 669 create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; 670 depctr->imm = 0xffe3; 671 depctr->block = -1; 672 new_instructions.emplace_back(std::move(depctr)); 673 } 674 } else if (instr->isVALU()) { 675 /* Hazard is mitigated by any VALU instruction */ 676 ctx.sgprs_read_by_VMEM.reset(); 677 } 678 679 /* VcmpxPermlaneHazard 680 * Handle any permlane following a VOPC instruction, insert v_mov between them. 681 */ 682 if (instr->isVOPC()) { 683 ctx.has_VOPC = true; 684 } else if (ctx.has_VOPC && (instr->opcode == aco_opcode::v_permlane16_b32 || 685 instr->opcode == aco_opcode::v_permlanex16_b32)) { 686 ctx.has_VOPC = false; 687 688 /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */ 689 aco_ptr<VOP1_instruction> v_mov{ 690 create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)}; 691 v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1); 692 v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1); 693 new_instructions.emplace_back(std::move(v_mov)); 694 } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) { 695 ctx.has_VOPC = false; 696 } 697 698 /* VcmpxExecWARHazard 699 * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction. 700 */ 701 if (!instr->isVALU() && instr->reads_exec()) { 702 ctx.has_nonVALU_exec_read = true; 703 } else if (instr->isVALU()) { 704 if (instr_writes_exec(instr)) { 705 ctx.has_nonVALU_exec_read = false; 706 707 /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */ 708 aco_ptr<SOPP_instruction> depctr{ 709 create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; 710 depctr->imm = 0xfffe; 711 depctr->block = -1; 712 new_instructions.emplace_back(std::move(depctr)); 713 } else if (instr_writes_sgpr(instr)) { 714 /* Any VALU instruction that writes an SGPR mitigates the problem */ 715 ctx.has_nonVALU_exec_read = false; 716 } 717 } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) { 718 /* s_waitcnt_depctr can mitigate the problem if it has a magic imm */ 719 if ((instr->sopp().imm & 0xfffe) == 0xfffe) 720 ctx.has_nonVALU_exec_read = false; 721 } 722 723 /* SMEMtoVectorWriteHazard 724 * Handle any VALU instruction writing an SGPR after an SMEM reads it. 725 */ 726 if (instr->isSMEM()) { 727 /* Remember all SGPRs that are read by the SMEM instruction */ 728 mark_read_regs(instr, ctx.sgprs_read_by_SMEM); 729 } else if (VALU_writes_sgpr(instr)) { 730 /* Check if VALU writes an SGPR that was previously read by SMEM */ 731 if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) { 732 ctx.sgprs_read_by_SMEM.reset(); 733 734 /* Insert s_mov to mitigate the problem */ 735 aco_ptr<SOP1_instruction> s_mov{ 736 create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)}; 737 s_mov->definitions[0] = Definition(sgpr_null, s1); 738 s_mov->operands[0] = Operand::zero(); 739 new_instructions.emplace_back(std::move(s_mov)); 740 } 741 } else if (instr->isSALU()) { 742 if (instr->format != Format::SOPP) { 743 /* SALU can mitigate the hazard */ 744 ctx.sgprs_read_by_SMEM.reset(); 745 } else { 746 /* Reducing lgkmcnt count to 0 always mitigates the hazard. */ 747 const SOPP_instruction& sopp = instr->sopp(); 748 if (sopp.opcode == aco_opcode::s_waitcnt_lgkmcnt) { 749 if (sopp.imm == 0 && sopp.definitions[0].physReg() == sgpr_null) 750 ctx.sgprs_read_by_SMEM.reset(); 751 } else if (sopp.opcode == aco_opcode::s_waitcnt) { 752 unsigned lgkm = (sopp.imm >> 8) & 0x3f; 753 if (lgkm == 0) 754 ctx.sgprs_read_by_SMEM.reset(); 755 } 756 } 757 } 758 759 /* LdsBranchVmemWARHazard 760 * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns. 761 */ 762 if (instr->isVMEM() || instr->isGlobal() || instr->isScratch()) { 763 ctx.has_VMEM = true; 764 ctx.has_branch_after_VMEM = false; 765 /* Mitigation for DS is needed only if there was already a branch after */ 766 ctx.has_DS = ctx.has_branch_after_DS; 767 } else if (instr->isDS()) { 768 ctx.has_DS = true; 769 ctx.has_branch_after_DS = false; 770 /* Mitigation for VMEM is needed only if there was already a branch after */ 771 ctx.has_VMEM = ctx.has_branch_after_VMEM; 772 } else if (instr_is_branch(instr)) { 773 ctx.has_branch_after_VMEM = ctx.has_VMEM; 774 ctx.has_branch_after_DS = ctx.has_DS; 775 } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) { 776 /* Only s_waitcnt_vscnt can mitigate the hazard */ 777 const SOPK_instruction& sopk = instr->sopk(); 778 if (sopk.definitions[0].physReg() == sgpr_null && sopk.imm == 0) 779 ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false; 780 } 781 if ((ctx.has_VMEM && ctx.has_branch_after_DS) || (ctx.has_DS && ctx.has_branch_after_VMEM)) { 782 ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false; 783 784 /* Insert s_waitcnt_vscnt to mitigate the problem */ 785 aco_ptr<SOPK_instruction> wait{ 786 create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)}; 787 wait->definitions[0] = Definition(sgpr_null, s1); 788 wait->imm = 0; 789 new_instructions.emplace_back(std::move(wait)); 790 } 791 792 /* NSAToVMEMBug 793 * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] != 794 * 0). 795 */ 796 if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) { 797 ctx.has_NSA_MIMG = true; 798 } else if (ctx.has_NSA_MIMG) { 799 ctx.has_NSA_MIMG = false; 800 801 if (instr->isMUBUF() || instr->isMTBUF()) { 802 uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset; 803 if (offset & 6) 804 Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); 805 } 806 } 807 808 /* waNsaCannotFollowWritelane 809 * Handles NSA MIMG immediately following a v_writelane_b32. 810 */ 811 if (instr->opcode == aco_opcode::v_writelane_b32_e64) { 812 ctx.has_writelane = true; 813 } else if (ctx.has_writelane) { 814 ctx.has_writelane = false; 815 if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0) 816 Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); 817 } 818} 819 820template <typename Ctx> 821using HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&, 822 std::vector<aco_ptr<Instruction>>&); 823 824template <typename Ctx, HandleInstr<Ctx> Handle> 825void 826handle_block(Program* program, Ctx& ctx, Block& block) 827{ 828 if (block.instructions.empty()) 829 return; 830 831 State state; 832 state.program = program; 833 state.block = █ 834 state.old_instructions = std::move(block.instructions); 835 836 block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning 837 block.instructions.reserve(state.old_instructions.size()); 838 839 for (aco_ptr<Instruction>& instr : state.old_instructions) { 840 Handle(state, ctx, instr, block.instructions); 841 block.instructions.emplace_back(std::move(instr)); 842 } 843} 844 845template <typename Ctx, HandleInstr<Ctx> Handle> 846void 847mitigate_hazards(Program* program) 848{ 849 std::vector<Ctx> all_ctx(program->blocks.size()); 850 std::stack<unsigned, std::vector<unsigned>> loop_header_indices; 851 852 for (unsigned i = 0; i < program->blocks.size(); i++) { 853 Block& block = program->blocks[i]; 854 Ctx& ctx = all_ctx[i]; 855 856 if (block.kind & block_kind_loop_header) { 857 loop_header_indices.push(i); 858 } else if (block.kind & block_kind_loop_exit) { 859 /* Go through the whole loop again */ 860 for (unsigned idx = loop_header_indices.top(); idx < i; idx++) { 861 Ctx loop_block_ctx; 862 for (unsigned b : program->blocks[idx].linear_preds) 863 loop_block_ctx.join(all_ctx[b]); 864 865 handle_block<Ctx, Handle>(program, loop_block_ctx, program->blocks[idx]); 866 867 /* We only need to continue if the loop header context changed */ 868 if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx]) 869 break; 870 871 all_ctx[idx] = loop_block_ctx; 872 } 873 874 loop_header_indices.pop(); 875 } 876 877 for (unsigned b : block.linear_preds) 878 ctx.join(all_ctx[b]); 879 880 handle_block<Ctx, Handle>(program, ctx, block); 881 } 882} 883 884} /* end namespace */ 885 886void 887insert_NOPs(Program* program) 888{ 889 if (program->chip_class >= GFX10_3) 890 ; /* no hazards/bugs to mitigate */ 891 else if (program->chip_class >= GFX10) 892 mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10>(program); 893 else 894 mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6>(program); 895} 896 897} // namespace aco 898