/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
227ec681f3Smrg * 237ec681f3Smrg */ 247ec681f3Smrg 257ec681f3Smrg#include "aco_builder.h" 267ec681f3Smrg#include "aco_ir.h" 277ec681f3Smrg 287ec681f3Smrg#include <algorithm> 297ec681f3Smrg#include <bitset> 307ec681f3Smrg#include <stack> 317ec681f3Smrg#include <vector> 327ec681f3Smrg 337ec681f3Smrgnamespace aco { 347ec681f3Smrgnamespace { 357ec681f3Smrg 367ec681f3Smrgstruct State { 377ec681f3Smrg Program* program; 387ec681f3Smrg Block* block; 397ec681f3Smrg std::vector<aco_ptr<Instruction>> old_instructions; 407ec681f3Smrg}; 417ec681f3Smrg 427ec681f3Smrgstruct NOP_ctx_gfx6 { 437ec681f3Smrg void join(const NOP_ctx_gfx6& other) 447ec681f3Smrg { 457ec681f3Smrg set_vskip_mode_then_vector = 467ec681f3Smrg MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector); 477ec681f3Smrg valu_wr_vcc_then_vccz = MAX2(valu_wr_vcc_then_vccz, other.valu_wr_vcc_then_vccz); 487ec681f3Smrg valu_wr_exec_then_execz = MAX2(valu_wr_exec_then_execz, other.valu_wr_exec_then_execz); 497ec681f3Smrg valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas); 507ec681f3Smrg salu_wr_m0_then_gds_msg_ttrace = 517ec681f3Smrg MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace); 527ec681f3Smrg valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp); 537ec681f3Smrg salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds); 547ec681f3Smrg salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel); 557ec681f3Smrg setreg_then_getsetreg = MAX2(setreg_then_getsetreg, other.setreg_then_getsetreg); 567ec681f3Smrg vmem_store_then_wr_data |= other.vmem_store_then_wr_data; 577ec681f3Smrg smem_clause |= other.smem_clause; 587ec681f3Smrg smem_write |= other.smem_write; 597ec681f3Smrg for (unsigned i = 0; i < BITSET_WORDS(128); i++) { 607ec681f3Smrg smem_clause_read_write[i] |= other.smem_clause_read_write[i]; 617ec681f3Smrg smem_clause_write[i] |= other.smem_clause_write[i]; 
627ec681f3Smrg } 637ec681f3Smrg } 647ec681f3Smrg 657ec681f3Smrg bool operator==(const NOP_ctx_gfx6& other) 667ec681f3Smrg { 677ec681f3Smrg return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector && 687ec681f3Smrg valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz && 697ec681f3Smrg valu_wr_exec_then_execz == other.valu_wr_exec_then_execz && 707ec681f3Smrg valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas && 717ec681f3Smrg vmem_store_then_wr_data == other.vmem_store_then_wr_data && 727ec681f3Smrg salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace && 737ec681f3Smrg valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp && 747ec681f3Smrg salu_wr_m0_then_lds == other.salu_wr_m0_then_lds && 757ec681f3Smrg salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel && 767ec681f3Smrg setreg_then_getsetreg == other.setreg_then_getsetreg && 777ec681f3Smrg smem_clause == other.smem_clause && smem_write == other.smem_write && 787ec681f3Smrg BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) && 797ec681f3Smrg BITSET_EQUAL(smem_clause_write, other.smem_clause_write); 807ec681f3Smrg } 817ec681f3Smrg 827ec681f3Smrg void add_wait_states(unsigned amount) 837ec681f3Smrg { 847ec681f3Smrg if ((set_vskip_mode_then_vector -= amount) < 0) 857ec681f3Smrg set_vskip_mode_then_vector = 0; 867ec681f3Smrg 877ec681f3Smrg if ((valu_wr_vcc_then_vccz -= amount) < 0) 887ec681f3Smrg valu_wr_vcc_then_vccz = 0; 897ec681f3Smrg 907ec681f3Smrg if ((valu_wr_exec_then_execz -= amount) < 0) 917ec681f3Smrg valu_wr_exec_then_execz = 0; 927ec681f3Smrg 937ec681f3Smrg if ((valu_wr_vcc_then_div_fmas -= amount) < 0) 947ec681f3Smrg valu_wr_vcc_then_div_fmas = 0; 957ec681f3Smrg 967ec681f3Smrg if ((salu_wr_m0_then_gds_msg_ttrace -= amount) < 0) 977ec681f3Smrg salu_wr_m0_then_gds_msg_ttrace = 0; 987ec681f3Smrg 997ec681f3Smrg if ((valu_wr_exec_then_dpp -= amount) < 0) 1007ec681f3Smrg valu_wr_exec_then_dpp = 0; 1017ec681f3Smrg 1027ec681f3Smrg if 
((salu_wr_m0_then_lds -= amount) < 0) 1037ec681f3Smrg salu_wr_m0_then_lds = 0; 1047ec681f3Smrg 1057ec681f3Smrg if ((salu_wr_m0_then_moverel -= amount) < 0) 1067ec681f3Smrg salu_wr_m0_then_moverel = 0; 1077ec681f3Smrg 1087ec681f3Smrg if ((setreg_then_getsetreg -= amount) < 0) 1097ec681f3Smrg setreg_then_getsetreg = 0; 1107ec681f3Smrg 1117ec681f3Smrg vmem_store_then_wr_data.reset(); 1127ec681f3Smrg } 1137ec681f3Smrg 1147ec681f3Smrg /* setting MODE.vskip and then any vector op requires 2 wait states */ 1157ec681f3Smrg int8_t set_vskip_mode_then_vector = 0; 1167ec681f3Smrg 1177ec681f3Smrg /* VALU writing VCC/EXEC and then a VALU reading VCCZ/EXECZ requires 5 wait states */ 1187ec681f3Smrg int8_t valu_wr_vcc_then_vccz = 0; 1197ec681f3Smrg int8_t valu_wr_exec_then_execz = 0; 1207ec681f3Smrg 1217ec681f3Smrg /* VALU writing VCC followed by v_div_fmas require 4 wait states */ 1227ec681f3Smrg int8_t valu_wr_vcc_then_div_fmas = 0; 1237ec681f3Smrg 1247ec681f3Smrg /* SALU writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state */ 1257ec681f3Smrg int8_t salu_wr_m0_then_gds_msg_ttrace = 0; 1267ec681f3Smrg 1277ec681f3Smrg /* VALU writing EXEC followed by DPP requires 5 wait states */ 1287ec681f3Smrg int8_t valu_wr_exec_then_dpp = 0; 1297ec681f3Smrg 1307ec681f3Smrg /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX10 */ 1317ec681f3Smrg int8_t salu_wr_m0_then_lds = 0; 1327ec681f3Smrg 1337ec681f3Smrg /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */ 1347ec681f3Smrg int8_t salu_wr_m0_then_moverel = 0; 1357ec681f3Smrg 1367ec681f3Smrg /* s_setreg followed by a s_getreg/s_setreg of the same register needs 2 wait states 1377ec681f3Smrg * currently we don't look at the actual register */ 1387ec681f3Smrg int8_t setreg_then_getsetreg = 0; 1397ec681f3Smrg 1407ec681f3Smrg /* some memory instructions writing >64bit followed by a instructions 1417ec681f3Smrg * writing the VGPRs holding the writedata requires 1 wait state 
*/ 1427ec681f3Smrg std::bitset<256> vmem_store_then_wr_data; 1437ec681f3Smrg 1447ec681f3Smrg /* we break up SMEM clauses that contain stores or overwrite an 1457ec681f3Smrg * operand/definition of another instruction in the clause */ 1467ec681f3Smrg bool smem_clause = false; 1477ec681f3Smrg bool smem_write = false; 1487ec681f3Smrg BITSET_DECLARE(smem_clause_read_write, 128) = {0}; 1497ec681f3Smrg BITSET_DECLARE(smem_clause_write, 128) = {0}; 1507ec681f3Smrg}; 1517ec681f3Smrg 1527ec681f3Smrgstruct NOP_ctx_gfx10 { 1537ec681f3Smrg bool has_VOPC = false; 1547ec681f3Smrg bool has_nonVALU_exec_read = false; 1557ec681f3Smrg bool has_VMEM = false; 1567ec681f3Smrg bool has_branch_after_VMEM = false; 1577ec681f3Smrg bool has_DS = false; 1587ec681f3Smrg bool has_branch_after_DS = false; 1597ec681f3Smrg bool has_NSA_MIMG = false; 1607ec681f3Smrg bool has_writelane = false; 1617ec681f3Smrg std::bitset<128> sgprs_read_by_VMEM; 1627ec681f3Smrg std::bitset<128> sgprs_read_by_SMEM; 1637ec681f3Smrg 1647ec681f3Smrg void join(const NOP_ctx_gfx10& other) 1657ec681f3Smrg { 1667ec681f3Smrg has_VOPC |= other.has_VOPC; 1677ec681f3Smrg has_nonVALU_exec_read |= other.has_nonVALU_exec_read; 1687ec681f3Smrg has_VMEM |= other.has_VMEM; 1697ec681f3Smrg has_branch_after_VMEM |= other.has_branch_after_VMEM; 1707ec681f3Smrg has_DS |= other.has_DS; 1717ec681f3Smrg has_branch_after_DS |= other.has_branch_after_DS; 1727ec681f3Smrg has_NSA_MIMG |= other.has_NSA_MIMG; 1737ec681f3Smrg has_writelane |= other.has_writelane; 1747ec681f3Smrg sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM; 1757ec681f3Smrg sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM; 1767ec681f3Smrg } 1777ec681f3Smrg 1787ec681f3Smrg bool operator==(const NOP_ctx_gfx10& other) 1797ec681f3Smrg { 1807ec681f3Smrg return has_VOPC == other.has_VOPC && has_nonVALU_exec_read == other.has_nonVALU_exec_read && 1817ec681f3Smrg has_VMEM == other.has_VMEM && has_branch_after_VMEM == other.has_branch_after_VMEM && 1827ec681f3Smrg has_DS == other.has_DS 
&& has_branch_after_DS == other.has_branch_after_DS && 1837ec681f3Smrg has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane && 1847ec681f3Smrg sgprs_read_by_VMEM == other.sgprs_read_by_VMEM && 1857ec681f3Smrg sgprs_read_by_SMEM == other.sgprs_read_by_SMEM; 1867ec681f3Smrg } 1877ec681f3Smrg}; 1887ec681f3Smrg 1897ec681f3Smrgint 1907ec681f3Smrgget_wait_states(aco_ptr<Instruction>& instr) 1917ec681f3Smrg{ 1927ec681f3Smrg if (instr->opcode == aco_opcode::s_nop) 1937ec681f3Smrg return instr->sopp().imm + 1; 1947ec681f3Smrg else if (instr->opcode == aco_opcode::p_constaddr) 1957ec681f3Smrg return 3; /* lowered to 3 instructions in the assembler */ 1967ec681f3Smrg else 1977ec681f3Smrg return 1; 1987ec681f3Smrg} 1997ec681f3Smrg 2007ec681f3Smrgbool 2017ec681f3Smrgregs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size) 2027ec681f3Smrg{ 2037ec681f3Smrg return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size); 2047ec681f3Smrg} 2057ec681f3Smrg 2067ec681f3Smrgtemplate <bool Valu, bool Vintrp, bool Salu> 2077ec681f3Smrgbool 2087ec681f3Smrghandle_raw_hazard_instr(aco_ptr<Instruction>& pred, PhysReg reg, int* nops_needed, uint32_t* mask) 2097ec681f3Smrg{ 2107ec681f3Smrg unsigned mask_size = util_last_bit(*mask); 2117ec681f3Smrg 2127ec681f3Smrg uint32_t writemask = 0; 2137ec681f3Smrg for (Definition& def : pred->definitions) { 2147ec681f3Smrg if (regs_intersect(reg, mask_size, def.physReg(), def.size())) { 2157ec681f3Smrg unsigned start = def.physReg() > reg ? 
def.physReg() - reg : 0; 2167ec681f3Smrg unsigned end = MIN2(mask_size, start + def.size()); 2177ec681f3Smrg writemask |= u_bit_consecutive(start, end - start); 2187ec681f3Smrg } 2197ec681f3Smrg } 2207ec681f3Smrg 2217ec681f3Smrg bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || (pred->isVINTRP() && Vintrp) || 2227ec681f3Smrg (pred->isSALU() && Salu)); 2237ec681f3Smrg if (is_hazard) 2247ec681f3Smrg return true; 2257ec681f3Smrg 2267ec681f3Smrg *mask &= ~writemask; 2277ec681f3Smrg *nops_needed = MAX2(*nops_needed - get_wait_states(pred), 0); 2287ec681f3Smrg 2297ec681f3Smrg if (*mask == 0) 2307ec681f3Smrg *nops_needed = 0; 2317ec681f3Smrg 2327ec681f3Smrg return *nops_needed == 0; 2337ec681f3Smrg} 2347ec681f3Smrg 2357ec681f3Smrgtemplate <bool Valu, bool Vintrp, bool Salu> 2367ec681f3Smrgint 2377ec681f3Smrghandle_raw_hazard_internal(State& state, Block* block, int nops_needed, PhysReg reg, uint32_t mask, 2387ec681f3Smrg bool start_at_end) 2397ec681f3Smrg{ 2407ec681f3Smrg if (block == state.block && start_at_end) { 2417ec681f3Smrg /* If it's the current block, block->instructions is incomplete. */ 2427ec681f3Smrg for (int pred_idx = state.old_instructions.size() - 1; pred_idx >= 0; pred_idx--) { 2437ec681f3Smrg aco_ptr<Instruction>& instr = state.old_instructions[pred_idx]; 2447ec681f3Smrg if (!instr) 2457ec681f3Smrg break; /* Instruction has been moved to block->instructions. 
*/ 2467ec681f3Smrg if (handle_raw_hazard_instr<Valu, Vintrp, Salu>(instr, reg, &nops_needed, &mask)) 2477ec681f3Smrg return nops_needed; 2487ec681f3Smrg } 2497ec681f3Smrg } 2507ec681f3Smrg for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) { 2517ec681f3Smrg if (handle_raw_hazard_instr<Valu, Vintrp, Salu>(block->instructions[pred_idx], reg, 2527ec681f3Smrg &nops_needed, &mask)) 2537ec681f3Smrg return nops_needed; 2547ec681f3Smrg } 2557ec681f3Smrg 2567ec681f3Smrg int res = 0; 2577ec681f3Smrg 2587ec681f3Smrg /* Loops require branch instructions, which count towards the wait 2597ec681f3Smrg * states. So even with loops this should finish unless nops_needed is some 2607ec681f3Smrg * huge value. */ 2617ec681f3Smrg for (unsigned lin_pred : block->linear_preds) { 2627ec681f3Smrg res = 2637ec681f3Smrg std::max(res, handle_raw_hazard_internal<Valu, Vintrp, Salu>( 2647ec681f3Smrg state, &state.program->blocks[lin_pred], nops_needed, reg, mask, true)); 2657ec681f3Smrg } 2667ec681f3Smrg return res; 2677ec681f3Smrg} 2687ec681f3Smrg 2697ec681f3Smrgtemplate <bool Valu, bool Vintrp, bool Salu> 2707ec681f3Smrgvoid 2717ec681f3Smrghandle_raw_hazard(State& state, int* NOPs, int min_states, Operand op) 2727ec681f3Smrg{ 2737ec681f3Smrg if (*NOPs >= min_states) 2747ec681f3Smrg return; 2757ec681f3Smrg int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>( 2767ec681f3Smrg state, state.block, min_states, op.physReg(), u_bit_consecutive(0, op.size()), false); 2777ec681f3Smrg *NOPs = MAX2(*NOPs, res); 2787ec681f3Smrg} 2797ec681f3Smrg 2807ec681f3Smrgstatic auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>; 2817ec681f3Smrgstatic auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>; 2827ec681f3Smrgstatic auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>; 2837ec681f3Smrg 2847ec681f3Smrgvoid 2857ec681f3Smrgset_bitset_range(BITSET_WORD* words, unsigned start, unsigned size) 2867ec681f3Smrg{ 
2877ec681f3Smrg unsigned end = start + size - 1; 2887ec681f3Smrg unsigned start_mod = start % BITSET_WORDBITS; 2897ec681f3Smrg if (start_mod + size <= BITSET_WORDBITS) { 2907ec681f3Smrg BITSET_SET_RANGE_INSIDE_WORD(words, start, end); 2917ec681f3Smrg } else { 2927ec681f3Smrg unsigned first_size = BITSET_WORDBITS - start_mod; 2937ec681f3Smrg set_bitset_range(words, start, BITSET_WORDBITS - start_mod); 2947ec681f3Smrg set_bitset_range(words, start + first_size, size - first_size); 2957ec681f3Smrg } 2967ec681f3Smrg} 2977ec681f3Smrg 2987ec681f3Smrgbool 2997ec681f3Smrgtest_bitset_range(BITSET_WORD* words, unsigned start, unsigned size) 3007ec681f3Smrg{ 3017ec681f3Smrg unsigned end = start + size - 1; 3027ec681f3Smrg unsigned start_mod = start % BITSET_WORDBITS; 3037ec681f3Smrg if (start_mod + size <= BITSET_WORDBITS) { 3047ec681f3Smrg return BITSET_TEST_RANGE(words, start, end); 3057ec681f3Smrg } else { 3067ec681f3Smrg unsigned first_size = BITSET_WORDBITS - start_mod; 3077ec681f3Smrg return test_bitset_range(words, start, BITSET_WORDBITS - start_mod) || 3087ec681f3Smrg test_bitset_range(words, start + first_size, size - first_size); 3097ec681f3Smrg } 3107ec681f3Smrg} 3117ec681f3Smrg 3127ec681f3Smrg/* A SMEM clause is any group of consecutive SMEM instructions. The 3137ec681f3Smrg * instructions in this group may return out of order and/or may be replayed. 3147ec681f3Smrg * 3157ec681f3Smrg * To fix this potential hazard correctly, we have to make sure that when a 3167ec681f3Smrg * clause has more than one instruction, no instruction in the clause writes 3177ec681f3Smrg * to a register that is read by another instruction in the clause (including 3187ec681f3Smrg * itself). In this case, we have to break the SMEM clause by inserting non 3197ec681f3Smrg * SMEM instructions. 3207ec681f3Smrg * 3217ec681f3Smrg * SMEM clauses are only present on GFX8+, and only matter when XNACK is set. 
3227ec681f3Smrg */ 3237ec681f3Smrgvoid 3247ec681f3Smrghandle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr, 3257ec681f3Smrg int* NOPs) 3267ec681f3Smrg{ 3277ec681f3Smrg /* break off from previous SMEM clause if needed */ 3287ec681f3Smrg if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) { 3297ec681f3Smrg /* Don't allow clauses with store instructions since the clause's 3307ec681f3Smrg * instructions may use the same address. */ 3317ec681f3Smrg if (ctx.smem_write || instr->definitions.empty() || 3327ec681f3Smrg instr_info.is_atomic[(unsigned)instr->opcode]) { 3337ec681f3Smrg *NOPs = 1; 3347ec681f3Smrg } else if (program->dev.xnack_enabled) { 3357ec681f3Smrg for (Operand op : instr->operands) { 3367ec681f3Smrg if (!op.isConstant() && 3377ec681f3Smrg test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) { 3387ec681f3Smrg *NOPs = 1; 3397ec681f3Smrg break; 3407ec681f3Smrg } 3417ec681f3Smrg } 3427ec681f3Smrg 3437ec681f3Smrg Definition def = instr->definitions[0]; 3447ec681f3Smrg if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size())) 3457ec681f3Smrg *NOPs = 1; 3467ec681f3Smrg } 3477ec681f3Smrg } 3487ec681f3Smrg} 3497ec681f3Smrg 3507ec681f3Smrg/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */ 3517ec681f3Smrgvoid 3527ec681f3Smrghandle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr, 3537ec681f3Smrg std::vector<aco_ptr<Instruction>>& new_instructions) 3547ec681f3Smrg{ 3557ec681f3Smrg /* check hazards */ 3567ec681f3Smrg int NOPs = 0; 3577ec681f3Smrg 3587ec681f3Smrg if (instr->isSMEM()) { 3597ec681f3Smrg if (state.program->chip_class == GFX6) { 3607ec681f3Smrg /* A read of an SGPR by SMRD instruction requires 4 wait states 3617ec681f3Smrg * when the SGPR was written by a VALU instruction. 
According to LLVM, 3627ec681f3Smrg * there is also an undocumented hardware behavior when the buffer 3637ec681f3Smrg * descriptor is written by a SALU instruction */ 3647ec681f3Smrg for (unsigned i = 0; i < instr->operands.size(); i++) { 3657ec681f3Smrg Operand op = instr->operands[i]; 3667ec681f3Smrg if (op.isConstant()) 3677ec681f3Smrg continue; 3687ec681f3Smrg 3697ec681f3Smrg bool is_buffer_desc = i == 0 && op.size() > 2; 3707ec681f3Smrg if (is_buffer_desc) 3717ec681f3Smrg handle_valu_salu_then_read_hazard(state, &NOPs, 4, op); 3727ec681f3Smrg else 3737ec681f3Smrg handle_valu_then_read_hazard(state, &NOPs, 4, op); 3747ec681f3Smrg } 3757ec681f3Smrg } 3767ec681f3Smrg 3777ec681f3Smrg handle_smem_clause_hazards(state.program, ctx, instr, &NOPs); 3787ec681f3Smrg } else if (instr->isSALU()) { 3797ec681f3Smrg if (instr->opcode == aco_opcode::s_setreg_b32 || 3807ec681f3Smrg instr->opcode == aco_opcode::s_setreg_imm32_b32 || 3817ec681f3Smrg instr->opcode == aco_opcode::s_getreg_b32) { 3827ec681f3Smrg NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg); 3837ec681f3Smrg } 3847ec681f3Smrg 3857ec681f3Smrg if (state.program->chip_class == GFX9) { 3867ec681f3Smrg if (instr->opcode == aco_opcode::s_movrels_b32 || 3877ec681f3Smrg instr->opcode == aco_opcode::s_movrels_b64 || 3887ec681f3Smrg instr->opcode == aco_opcode::s_movreld_b32 || 3897ec681f3Smrg instr->opcode == aco_opcode::s_movreld_b64) { 3907ec681f3Smrg NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel); 3917ec681f3Smrg } 3927ec681f3Smrg } 3937ec681f3Smrg 3947ec681f3Smrg if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_ttracedata) 3957ec681f3Smrg NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace); 3967ec681f3Smrg } else if (instr->isDS() && instr->ds().gds) { 3977ec681f3Smrg NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace); 3987ec681f3Smrg } else if (instr->isVALU() || instr->isVINTRP()) { 3997ec681f3Smrg for (Operand op : instr->operands) { 4007ec681f3Smrg if (op.physReg() == vccz) 
4017ec681f3Smrg NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_vccz); 4027ec681f3Smrg if (op.physReg() == execz) 4037ec681f3Smrg NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_execz); 4047ec681f3Smrg } 4057ec681f3Smrg 4067ec681f3Smrg if (instr->isDPP()) { 4077ec681f3Smrg NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp); 4087ec681f3Smrg handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]); 4097ec681f3Smrg } 4107ec681f3Smrg 4117ec681f3Smrg for (Definition def : instr->definitions) { 4127ec681f3Smrg if (def.regClass().type() != RegType::sgpr) { 4137ec681f3Smrg for (unsigned i = 0; i < def.size(); i++) 4147ec681f3Smrg NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data[(def.physReg() & 0xff) + i]); 4157ec681f3Smrg } 4167ec681f3Smrg } 4177ec681f3Smrg 4187ec681f3Smrg if ((instr->opcode == aco_opcode::v_readlane_b32 || 4197ec681f3Smrg instr->opcode == aco_opcode::v_readlane_b32_e64 || 4207ec681f3Smrg instr->opcode == aco_opcode::v_writelane_b32 || 4217ec681f3Smrg instr->opcode == aco_opcode::v_writelane_b32_e64) && 4227ec681f3Smrg !instr->operands[1].isConstant()) { 4237ec681f3Smrg handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]); 4247ec681f3Smrg } 4257ec681f3Smrg 4267ec681f3Smrg /* It's required to insert 1 wait state if the dst VGPR of any v_interp_* 4277ec681f3Smrg * is followed by a read with v_readfirstlane or v_readlane to fix GPU 4287ec681f3Smrg * hangs on GFX6. Note that v_writelane_* is apparently not affected. 4297ec681f3Smrg * This hazard isn't documented anywhere but AMD confirmed that hazard. 
4307ec681f3Smrg */ 4317ec681f3Smrg if (state.program->chip_class == GFX6 && 4327ec681f3Smrg (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */ 4337ec681f3Smrg instr->opcode == aco_opcode::v_readfirstlane_b32)) { 4347ec681f3Smrg handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]); 4357ec681f3Smrg } 4367ec681f3Smrg 4377ec681f3Smrg if (instr->opcode == aco_opcode::v_div_fmas_f32 || 4387ec681f3Smrg instr->opcode == aco_opcode::v_div_fmas_f64) 4397ec681f3Smrg NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas); 4407ec681f3Smrg } else if (instr->isVMEM() || instr->isFlatLike()) { 4417ec681f3Smrg /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */ 4427ec681f3Smrg for (Operand op : instr->operands) { 4437ec681f3Smrg if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr) 4447ec681f3Smrg handle_valu_then_read_hazard(state, &NOPs, 5, op); 4457ec681f3Smrg } 4467ec681f3Smrg } 4477ec681f3Smrg 4487ec681f3Smrg if (!instr->isSALU() && instr->format != Format::SMEM) 4497ec681f3Smrg NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector); 4507ec681f3Smrg 4517ec681f3Smrg if (state.program->chip_class == GFX9) { 4527ec681f3Smrg bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds; 4537ec681f3Smrg if (instr->isVINTRP() || lds_scratch_global || 4547ec681f3Smrg instr->opcode == aco_opcode::ds_read_addtid_b32 || 4557ec681f3Smrg instr->opcode == aco_opcode::ds_write_addtid_b32 || 4567ec681f3Smrg instr->opcode == aco_opcode::buffer_store_lds_dword) { 4577ec681f3Smrg NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds); 4587ec681f3Smrg } 4597ec681f3Smrg } 4607ec681f3Smrg 4617ec681f3Smrg ctx.add_wait_states(NOPs + get_wait_states(instr)); 4627ec681f3Smrg 4637ec681f3Smrg // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles 4647ec681f3Smrg if (NOPs) { 4657ec681f3Smrg /* create NOP */ 4667ec681f3Smrg 
aco_ptr<SOPP_instruction> nop{ 4677ec681f3Smrg create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)}; 4687ec681f3Smrg nop->imm = NOPs - 1; 4697ec681f3Smrg nop->block = -1; 4707ec681f3Smrg new_instructions.emplace_back(std::move(nop)); 4717ec681f3Smrg } 4727ec681f3Smrg 4737ec681f3Smrg /* update information to check for later hazards */ 4747ec681f3Smrg if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) { 4757ec681f3Smrg ctx.smem_clause = false; 4767ec681f3Smrg ctx.smem_write = false; 4777ec681f3Smrg 4787ec681f3Smrg if (state.program->dev.xnack_enabled) { 4797ec681f3Smrg BITSET_ZERO(ctx.smem_clause_read_write); 4807ec681f3Smrg BITSET_ZERO(ctx.smem_clause_write); 4817ec681f3Smrg } 4827ec681f3Smrg } 4837ec681f3Smrg 4847ec681f3Smrg if (instr->isSMEM()) { 4857ec681f3Smrg if (instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) { 4867ec681f3Smrg ctx.smem_write = true; 4877ec681f3Smrg } else { 4887ec681f3Smrg ctx.smem_clause = true; 4897ec681f3Smrg 4907ec681f3Smrg if (state.program->dev.xnack_enabled) { 4917ec681f3Smrg for (Operand op : instr->operands) { 4927ec681f3Smrg if (!op.isConstant()) { 4937ec681f3Smrg set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size()); 4947ec681f3Smrg } 4957ec681f3Smrg } 4967ec681f3Smrg 4977ec681f3Smrg Definition def = instr->definitions[0]; 4987ec681f3Smrg set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()); 4997ec681f3Smrg set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size()); 5007ec681f3Smrg } 5017ec681f3Smrg } 5027ec681f3Smrg } else if (instr->isVALU()) { 5037ec681f3Smrg for (Definition def : instr->definitions) { 5047ec681f3Smrg if (def.regClass().type() == RegType::sgpr) { 5057ec681f3Smrg if (def.physReg() == vcc || def.physReg() == vcc_hi) { 5067ec681f3Smrg ctx.valu_wr_vcc_then_vccz = 5; 5077ec681f3Smrg ctx.valu_wr_vcc_then_div_fmas = 4; 5087ec681f3Smrg } 5097ec681f3Smrg if (def.physReg() == exec || 
def.physReg() == exec_hi) { 5107ec681f3Smrg ctx.valu_wr_exec_then_execz = 5; 5117ec681f3Smrg ctx.valu_wr_exec_then_dpp = 5; 5127ec681f3Smrg } 5137ec681f3Smrg } 5147ec681f3Smrg } 5157ec681f3Smrg } else if (instr->isSALU() && !instr->definitions.empty()) { 5167ec681f3Smrg if (!instr->definitions.empty()) { 5177ec681f3Smrg /* all other definitions should be SCC */ 5187ec681f3Smrg Definition def = instr->definitions[0]; 5197ec681f3Smrg if (def.physReg() == m0) { 5207ec681f3Smrg ctx.salu_wr_m0_then_gds_msg_ttrace = 1; 5217ec681f3Smrg ctx.salu_wr_m0_then_lds = 1; 5227ec681f3Smrg ctx.salu_wr_m0_then_moverel = 1; 5237ec681f3Smrg } 5247ec681f3Smrg } else if (instr->opcode == aco_opcode::s_setreg_b32 || 5257ec681f3Smrg instr->opcode == aco_opcode::s_setreg_imm32_b32) { 5267ec681f3Smrg SOPK_instruction& sopk = instr->sopk(); 5277ec681f3Smrg unsigned offset = (sopk.imm >> 6) & 0x1f; 5287ec681f3Smrg unsigned size = ((sopk.imm >> 11) & 0x1f) + 1; 5297ec681f3Smrg unsigned reg = sopk.imm & 0x3f; 5307ec681f3Smrg ctx.setreg_then_getsetreg = 2; 5317ec681f3Smrg 5327ec681f3Smrg if (reg == 1 && offset >= 28 && size > (28 - offset)) 5337ec681f3Smrg ctx.set_vskip_mode_then_vector = 2; 5347ec681f3Smrg } 5357ec681f3Smrg } else if (instr->isVMEM() || instr->isFlatLike()) { 5367ec681f3Smrg /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */ 5377ec681f3Smrg bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 && 5387ec681f3Smrg instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128; 5397ec681f3Smrg /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit 5407ec681f3Smrg * store) */ 5417ec681f3Smrg bool consider_mimg = instr->isMIMG() && 5427ec681f3Smrg instr->operands[1].regClass().type() == RegType::vgpr && 5437ec681f3Smrg instr->operands[1].size() > 2 && instr->operands[0].size() == 4; 5447ec681f3Smrg /* FLAT/GLOBAL/SCRATCH store with >64-bit data */ 5457ec681f3Smrg bool consider_flat = 5467ec681f3Smrg 
instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2; 5477ec681f3Smrg if (consider_buf || consider_mimg || consider_flat) { 5487ec681f3Smrg PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg(); 5497ec681f3Smrg unsigned size = instr->operands[consider_flat ? 2 : 3].size(); 5507ec681f3Smrg for (unsigned i = 0; i < size; i++) 5517ec681f3Smrg ctx.vmem_store_then_wr_data[(wrdata & 0xff) + i] = 1; 5527ec681f3Smrg } 5537ec681f3Smrg } 5547ec681f3Smrg} 5557ec681f3Smrg 5567ec681f3Smrgtemplate <std::size_t N> 5577ec681f3Smrgbool 5587ec681f3Smrgcheck_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs) 5597ec681f3Smrg{ 5607ec681f3Smrg return std::any_of(instr->definitions.begin(), instr->definitions.end(), 5617ec681f3Smrg [&check_regs](const Definition& def) -> bool 5627ec681f3Smrg { 5637ec681f3Smrg bool writes_any = false; 5647ec681f3Smrg for (unsigned i = 0; i < def.size(); i++) { 5657ec681f3Smrg unsigned def_reg = def.physReg() + i; 5667ec681f3Smrg writes_any |= def_reg < check_regs.size() && check_regs[def_reg]; 5677ec681f3Smrg } 5687ec681f3Smrg return writes_any; 5697ec681f3Smrg }); 5707ec681f3Smrg} 5717ec681f3Smrg 5727ec681f3Smrgtemplate <std::size_t N> 5737ec681f3Smrgvoid 5747ec681f3Smrgmark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads) 5757ec681f3Smrg{ 5767ec681f3Smrg for (const Operand& op : instr->operands) { 5777ec681f3Smrg for (unsigned i = 0; i < op.size(); i++) { 5787ec681f3Smrg unsigned reg = op.physReg() + i; 5797ec681f3Smrg if (reg < reg_reads.size()) 5807ec681f3Smrg reg_reads.set(reg); 5817ec681f3Smrg } 5827ec681f3Smrg } 5837ec681f3Smrg} 5847ec681f3Smrg 5857ec681f3Smrgbool 5867ec681f3SmrgVALU_writes_sgpr(aco_ptr<Instruction>& instr) 5877ec681f3Smrg{ 5887ec681f3Smrg if (instr->isVOPC()) 5897ec681f3Smrg return true; 5907ec681f3Smrg if (instr->isVOP3() && instr->definitions.size() == 2) 5917ec681f3Smrg return true; 5927ec681f3Smrg if (instr->opcode == 
aco_opcode::v_readfirstlane_b32 || 5937ec681f3Smrg instr->opcode == aco_opcode::v_readlane_b32 || 5947ec681f3Smrg instr->opcode == aco_opcode::v_readlane_b32_e64) 5957ec681f3Smrg return true; 5967ec681f3Smrg return false; 5977ec681f3Smrg} 5987ec681f3Smrg 5997ec681f3Smrgbool 6007ec681f3Smrginstr_writes_exec(const aco_ptr<Instruction>& instr) 6017ec681f3Smrg{ 6027ec681f3Smrg return std::any_of(instr->definitions.begin(), instr->definitions.end(), 6037ec681f3Smrg [](const Definition& def) -> bool 6047ec681f3Smrg { return def.physReg() == exec_lo || def.physReg() == exec_hi; }); 6057ec681f3Smrg} 6067ec681f3Smrg 6077ec681f3Smrgbool 6087ec681f3Smrginstr_writes_sgpr(const aco_ptr<Instruction>& instr) 6097ec681f3Smrg{ 6107ec681f3Smrg return std::any_of(instr->definitions.begin(), instr->definitions.end(), 6117ec681f3Smrg [](const Definition& def) -> bool 6127ec681f3Smrg { return def.getTemp().type() == RegType::sgpr; }); 6137ec681f3Smrg} 6147ec681f3Smrg 6157ec681f3Smrginline bool 6167ec681f3Smrginstr_is_branch(const aco_ptr<Instruction>& instr) 6177ec681f3Smrg{ 6187ec681f3Smrg return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 || 6197ec681f3Smrg instr->opcode == aco_opcode::s_cbranch_scc1 || 6207ec681f3Smrg instr->opcode == aco_opcode::s_cbranch_vccz || 6217ec681f3Smrg instr->opcode == aco_opcode::s_cbranch_vccnz || 6227ec681f3Smrg instr->opcode == aco_opcode::s_cbranch_execz || 6237ec681f3Smrg instr->opcode == aco_opcode::s_cbranch_execnz || 6247ec681f3Smrg instr->opcode == aco_opcode::s_cbranch_cdbgsys || 6257ec681f3Smrg instr->opcode == aco_opcode::s_cbranch_cdbguser || 6267ec681f3Smrg instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user || 6277ec681f3Smrg instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user || 6287ec681f3Smrg instr->opcode == aco_opcode::s_subvector_loop_begin || 6297ec681f3Smrg instr->opcode == aco_opcode::s_subvector_loop_end || 6307ec681f3Smrg instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == 
aco_opcode::s_swappc_b64 || 6317ec681f3Smrg instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64; 6327ec681f3Smrg} 6337ec681f3Smrg 6347ec681f3Smrgvoid 6357ec681f3Smrghandle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr, 6367ec681f3Smrg std::vector<aco_ptr<Instruction>>& new_instructions) 6377ec681f3Smrg{ 6387ec681f3Smrg // TODO: s_dcache_inv needs to be in it's own group on GFX10 6397ec681f3Smrg 6407ec681f3Smrg /* VMEMtoScalarWriteHazard 6417ec681f3Smrg * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" 6427ec681f3Smrg * in-between. 6437ec681f3Smrg */ 6447ec681f3Smrg if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) { 6457ec681f3Smrg /* Remember all SGPRs that are read by the VMEM instruction */ 6467ec681f3Smrg mark_read_regs(instr, ctx.sgprs_read_by_VMEM); 6477ec681f3Smrg ctx.sgprs_read_by_VMEM.set(exec); 6487ec681f3Smrg if (state.program->wave_size == 64) 6497ec681f3Smrg ctx.sgprs_read_by_VMEM.set(exec_hi); 6507ec681f3Smrg } else if (instr->isSALU() || instr->isSMEM()) { 6517ec681f3Smrg if (instr->opcode == aco_opcode::s_waitcnt) { 6527ec681f3Smrg /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */ 6537ec681f3Smrg uint16_t imm = instr->sopp().imm; 6547ec681f3Smrg unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10); 6557ec681f3Smrg if (vmcnt == 0) 6567ec681f3Smrg ctx.sgprs_read_by_VMEM.reset(); 6577ec681f3Smrg } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) { 6587ec681f3Smrg /* Hazard is mitigated by a s_waitcnt_depctr with a magic imm */ 6597ec681f3Smrg if (instr->sopp().imm == 0xffe3) 6607ec681f3Smrg ctx.sgprs_read_by_VMEM.reset(); 6617ec681f3Smrg } 6627ec681f3Smrg 6637ec681f3Smrg /* Check if SALU writes an SGPR that was previously read by the VALU */ 6647ec681f3Smrg if (check_written_regs(instr, ctx.sgprs_read_by_VMEM)) { 6657ec681f3Smrg ctx.sgprs_read_by_VMEM.reset(); 6667ec681f3Smrg 6677ec681f3Smrg /* Insert s_waitcnt_depctr 
instruction with magic imm to mitigate the problem */ 6687ec681f3Smrg aco_ptr<SOPP_instruction> depctr{ 6697ec681f3Smrg create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; 6707ec681f3Smrg depctr->imm = 0xffe3; 6717ec681f3Smrg depctr->block = -1; 6727ec681f3Smrg new_instructions.emplace_back(std::move(depctr)); 6737ec681f3Smrg } 6747ec681f3Smrg } else if (instr->isVALU()) { 6757ec681f3Smrg /* Hazard is mitigated by any VALU instruction */ 6767ec681f3Smrg ctx.sgprs_read_by_VMEM.reset(); 6777ec681f3Smrg } 6787ec681f3Smrg 6797ec681f3Smrg /* VcmpxPermlaneHazard 6807ec681f3Smrg * Handle any permlane following a VOPC instruction, insert v_mov between them. 6817ec681f3Smrg */ 6827ec681f3Smrg if (instr->isVOPC()) { 6837ec681f3Smrg ctx.has_VOPC = true; 6847ec681f3Smrg } else if (ctx.has_VOPC && (instr->opcode == aco_opcode::v_permlane16_b32 || 6857ec681f3Smrg instr->opcode == aco_opcode::v_permlanex16_b32)) { 6867ec681f3Smrg ctx.has_VOPC = false; 6877ec681f3Smrg 6887ec681f3Smrg /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */ 6897ec681f3Smrg aco_ptr<VOP1_instruction> v_mov{ 6907ec681f3Smrg create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)}; 6917ec681f3Smrg v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1); 6927ec681f3Smrg v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1); 6937ec681f3Smrg new_instructions.emplace_back(std::move(v_mov)); 6947ec681f3Smrg } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) { 6957ec681f3Smrg ctx.has_VOPC = false; 6967ec681f3Smrg } 6977ec681f3Smrg 6987ec681f3Smrg /* VcmpxExecWARHazard 6997ec681f3Smrg * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction. 
7007ec681f3Smrg */ 7017ec681f3Smrg if (!instr->isVALU() && instr->reads_exec()) { 7027ec681f3Smrg ctx.has_nonVALU_exec_read = true; 7037ec681f3Smrg } else if (instr->isVALU()) { 7047ec681f3Smrg if (instr_writes_exec(instr)) { 7057ec681f3Smrg ctx.has_nonVALU_exec_read = false; 7067ec681f3Smrg 7077ec681f3Smrg /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */ 7087ec681f3Smrg aco_ptr<SOPP_instruction> depctr{ 7097ec681f3Smrg create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; 7107ec681f3Smrg depctr->imm = 0xfffe; 7117ec681f3Smrg depctr->block = -1; 7127ec681f3Smrg new_instructions.emplace_back(std::move(depctr)); 7137ec681f3Smrg } else if (instr_writes_sgpr(instr)) { 7147ec681f3Smrg /* Any VALU instruction that writes an SGPR mitigates the problem */ 7157ec681f3Smrg ctx.has_nonVALU_exec_read = false; 7167ec681f3Smrg } 7177ec681f3Smrg } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) { 7187ec681f3Smrg /* s_waitcnt_depctr can mitigate the problem if it has a magic imm */ 7197ec681f3Smrg if ((instr->sopp().imm & 0xfffe) == 0xfffe) 7207ec681f3Smrg ctx.has_nonVALU_exec_read = false; 7217ec681f3Smrg } 7227ec681f3Smrg 7237ec681f3Smrg /* SMEMtoVectorWriteHazard 7247ec681f3Smrg * Handle any VALU instruction writing an SGPR after an SMEM reads it. 
7257ec681f3Smrg */ 7267ec681f3Smrg if (instr->isSMEM()) { 7277ec681f3Smrg /* Remember all SGPRs that are read by the SMEM instruction */ 7287ec681f3Smrg mark_read_regs(instr, ctx.sgprs_read_by_SMEM); 7297ec681f3Smrg } else if (VALU_writes_sgpr(instr)) { 7307ec681f3Smrg /* Check if VALU writes an SGPR that was previously read by SMEM */ 7317ec681f3Smrg if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) { 7327ec681f3Smrg ctx.sgprs_read_by_SMEM.reset(); 7337ec681f3Smrg 7347ec681f3Smrg /* Insert s_mov to mitigate the problem */ 7357ec681f3Smrg aco_ptr<SOP1_instruction> s_mov{ 7367ec681f3Smrg create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)}; 7377ec681f3Smrg s_mov->definitions[0] = Definition(sgpr_null, s1); 7387ec681f3Smrg s_mov->operands[0] = Operand::zero(); 7397ec681f3Smrg new_instructions.emplace_back(std::move(s_mov)); 7407ec681f3Smrg } 7417ec681f3Smrg } else if (instr->isSALU()) { 7427ec681f3Smrg if (instr->format != Format::SOPP) { 7437ec681f3Smrg /* SALU can mitigate the hazard */ 7447ec681f3Smrg ctx.sgprs_read_by_SMEM.reset(); 7457ec681f3Smrg } else { 7467ec681f3Smrg /* Reducing lgkmcnt count to 0 always mitigates the hazard. */ 7477ec681f3Smrg const SOPP_instruction& sopp = instr->sopp(); 7487ec681f3Smrg if (sopp.opcode == aco_opcode::s_waitcnt_lgkmcnt) { 7497ec681f3Smrg if (sopp.imm == 0 && sopp.definitions[0].physReg() == sgpr_null) 7507ec681f3Smrg ctx.sgprs_read_by_SMEM.reset(); 7517ec681f3Smrg } else if (sopp.opcode == aco_opcode::s_waitcnt) { 7527ec681f3Smrg unsigned lgkm = (sopp.imm >> 8) & 0x3f; 7537ec681f3Smrg if (lgkm == 0) 7547ec681f3Smrg ctx.sgprs_read_by_SMEM.reset(); 7557ec681f3Smrg } 7567ec681f3Smrg } 7577ec681f3Smrg } 7587ec681f3Smrg 7597ec681f3Smrg /* LdsBranchVmemWARHazard 7607ec681f3Smrg * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns. 
7617ec681f3Smrg */ 7627ec681f3Smrg if (instr->isVMEM() || instr->isGlobal() || instr->isScratch()) { 7637ec681f3Smrg ctx.has_VMEM = true; 7647ec681f3Smrg ctx.has_branch_after_VMEM = false; 7657ec681f3Smrg /* Mitigation for DS is needed only if there was already a branch after */ 7667ec681f3Smrg ctx.has_DS = ctx.has_branch_after_DS; 7677ec681f3Smrg } else if (instr->isDS()) { 7687ec681f3Smrg ctx.has_DS = true; 7697ec681f3Smrg ctx.has_branch_after_DS = false; 7707ec681f3Smrg /* Mitigation for VMEM is needed only if there was already a branch after */ 7717ec681f3Smrg ctx.has_VMEM = ctx.has_branch_after_VMEM; 7727ec681f3Smrg } else if (instr_is_branch(instr)) { 7737ec681f3Smrg ctx.has_branch_after_VMEM = ctx.has_VMEM; 7747ec681f3Smrg ctx.has_branch_after_DS = ctx.has_DS; 7757ec681f3Smrg } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) { 7767ec681f3Smrg /* Only s_waitcnt_vscnt can mitigate the hazard */ 7777ec681f3Smrg const SOPK_instruction& sopk = instr->sopk(); 7787ec681f3Smrg if (sopk.definitions[0].physReg() == sgpr_null && sopk.imm == 0) 7797ec681f3Smrg ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false; 7807ec681f3Smrg } 7817ec681f3Smrg if ((ctx.has_VMEM && ctx.has_branch_after_DS) || (ctx.has_DS && ctx.has_branch_after_VMEM)) { 7827ec681f3Smrg ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false; 7837ec681f3Smrg 7847ec681f3Smrg /* Insert s_waitcnt_vscnt to mitigate the problem */ 7857ec681f3Smrg aco_ptr<SOPK_instruction> wait{ 7867ec681f3Smrg create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)}; 7877ec681f3Smrg wait->definitions[0] = Definition(sgpr_null, s1); 7887ec681f3Smrg wait->imm = 0; 7897ec681f3Smrg new_instructions.emplace_back(std::move(wait)); 7907ec681f3Smrg } 7917ec681f3Smrg 7927ec681f3Smrg /* NSAToVMEMBug 7937ec681f3Smrg * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] != 7947ec681f3Smrg * 0). 
7957ec681f3Smrg */ 7967ec681f3Smrg if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) { 7977ec681f3Smrg ctx.has_NSA_MIMG = true; 7987ec681f3Smrg } else if (ctx.has_NSA_MIMG) { 7997ec681f3Smrg ctx.has_NSA_MIMG = false; 8007ec681f3Smrg 8017ec681f3Smrg if (instr->isMUBUF() || instr->isMTBUF()) { 8027ec681f3Smrg uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset; 8037ec681f3Smrg if (offset & 6) 8047ec681f3Smrg Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); 8057ec681f3Smrg } 8067ec681f3Smrg } 8077ec681f3Smrg 8087ec681f3Smrg /* waNsaCannotFollowWritelane 8097ec681f3Smrg * Handles NSA MIMG immediately following a v_writelane_b32. 8107ec681f3Smrg */ 8117ec681f3Smrg if (instr->opcode == aco_opcode::v_writelane_b32_e64) { 8127ec681f3Smrg ctx.has_writelane = true; 8137ec681f3Smrg } else if (ctx.has_writelane) { 8147ec681f3Smrg ctx.has_writelane = false; 8157ec681f3Smrg if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0) 8167ec681f3Smrg Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); 8177ec681f3Smrg } 8187ec681f3Smrg} 8197ec681f3Smrg 8207ec681f3Smrgtemplate <typename Ctx> 8217ec681f3Smrgusing HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&, 8227ec681f3Smrg std::vector<aco_ptr<Instruction>>&); 8237ec681f3Smrg 8247ec681f3Smrgtemplate <typename Ctx, HandleInstr<Ctx> Handle> 8257ec681f3Smrgvoid 8267ec681f3Smrghandle_block(Program* program, Ctx& ctx, Block& block) 8277ec681f3Smrg{ 8287ec681f3Smrg if (block.instructions.empty()) 8297ec681f3Smrg return; 8307ec681f3Smrg 8317ec681f3Smrg State state; 8327ec681f3Smrg state.program = program; 8337ec681f3Smrg state.block = █ 8347ec681f3Smrg state.old_instructions = std::move(block.instructions); 8357ec681f3Smrg 8367ec681f3Smrg block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning 8377ec681f3Smrg block.instructions.reserve(state.old_instructions.size()); 8387ec681f3Smrg 8397ec681f3Smrg for 
(aco_ptr<Instruction>& instr : state.old_instructions) { 8407ec681f3Smrg Handle(state, ctx, instr, block.instructions); 8417ec681f3Smrg block.instructions.emplace_back(std::move(instr)); 8427ec681f3Smrg } 8437ec681f3Smrg} 8447ec681f3Smrg 8457ec681f3Smrgtemplate <typename Ctx, HandleInstr<Ctx> Handle> 8467ec681f3Smrgvoid 8477ec681f3Smrgmitigate_hazards(Program* program) 8487ec681f3Smrg{ 8497ec681f3Smrg std::vector<Ctx> all_ctx(program->blocks.size()); 8507ec681f3Smrg std::stack<unsigned, std::vector<unsigned>> loop_header_indices; 8517ec681f3Smrg 8527ec681f3Smrg for (unsigned i = 0; i < program->blocks.size(); i++) { 8537ec681f3Smrg Block& block = program->blocks[i]; 8547ec681f3Smrg Ctx& ctx = all_ctx[i]; 8557ec681f3Smrg 8567ec681f3Smrg if (block.kind & block_kind_loop_header) { 8577ec681f3Smrg loop_header_indices.push(i); 8587ec681f3Smrg } else if (block.kind & block_kind_loop_exit) { 8597ec681f3Smrg /* Go through the whole loop again */ 8607ec681f3Smrg for (unsigned idx = loop_header_indices.top(); idx < i; idx++) { 8617ec681f3Smrg Ctx loop_block_ctx; 8627ec681f3Smrg for (unsigned b : program->blocks[idx].linear_preds) 8637ec681f3Smrg loop_block_ctx.join(all_ctx[b]); 8647ec681f3Smrg 8657ec681f3Smrg handle_block<Ctx, Handle>(program, loop_block_ctx, program->blocks[idx]); 8667ec681f3Smrg 8677ec681f3Smrg /* We only need to continue if the loop header context changed */ 8687ec681f3Smrg if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx]) 8697ec681f3Smrg break; 8707ec681f3Smrg 8717ec681f3Smrg all_ctx[idx] = loop_block_ctx; 8727ec681f3Smrg } 8737ec681f3Smrg 8747ec681f3Smrg loop_header_indices.pop(); 8757ec681f3Smrg } 8767ec681f3Smrg 8777ec681f3Smrg for (unsigned b : block.linear_preds) 8787ec681f3Smrg ctx.join(all_ctx[b]); 8797ec681f3Smrg 8807ec681f3Smrg handle_block<Ctx, Handle>(program, ctx, block); 8817ec681f3Smrg } 8827ec681f3Smrg} 8837ec681f3Smrg 8847ec681f3Smrg} /* end namespace */ 8857ec681f3Smrg 8867ec681f3Smrgvoid 
8877ec681f3Smrginsert_NOPs(Program* program) 8887ec681f3Smrg{ 8897ec681f3Smrg if (program->chip_class >= GFX10_3) 8907ec681f3Smrg ; /* no hazards/bugs to mitigate */ 8917ec681f3Smrg else if (program->chip_class >= GFX10) 8927ec681f3Smrg mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10>(program); 8937ec681f3Smrg else 8947ec681f3Smrg mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6>(program); 8957ec681f3Smrg} 8967ec681f3Smrg 8977ec681f3Smrg} // namespace aco 898