17ec681f3Smrg/*
27ec681f3Smrg * Copyright © 2019 Valve Corporation
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
217ec681f3Smrg * IN THE SOFTWARE.
227ec681f3Smrg *
237ec681f3Smrg */
247ec681f3Smrg
257ec681f3Smrg#include "aco_builder.h"
267ec681f3Smrg#include "aco_ir.h"
277ec681f3Smrg
287ec681f3Smrg#include <algorithm>
297ec681f3Smrg#include <bitset>
307ec681f3Smrg#include <stack>
317ec681f3Smrg#include <vector>
327ec681f3Smrg
337ec681f3Smrgnamespace aco {
347ec681f3Smrgnamespace {
357ec681f3Smrg
367ec681f3Smrgstruct State {
377ec681f3Smrg   Program* program;
387ec681f3Smrg   Block* block;
397ec681f3Smrg   std::vector<aco_ptr<Instruction>> old_instructions;
407ec681f3Smrg};
417ec681f3Smrg
427ec681f3Smrgstruct NOP_ctx_gfx6 {
437ec681f3Smrg   void join(const NOP_ctx_gfx6& other)
447ec681f3Smrg   {
457ec681f3Smrg      set_vskip_mode_then_vector =
467ec681f3Smrg         MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
477ec681f3Smrg      valu_wr_vcc_then_vccz = MAX2(valu_wr_vcc_then_vccz, other.valu_wr_vcc_then_vccz);
487ec681f3Smrg      valu_wr_exec_then_execz = MAX2(valu_wr_exec_then_execz, other.valu_wr_exec_then_execz);
497ec681f3Smrg      valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas);
507ec681f3Smrg      salu_wr_m0_then_gds_msg_ttrace =
517ec681f3Smrg         MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
527ec681f3Smrg      valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp);
537ec681f3Smrg      salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds);
547ec681f3Smrg      salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel);
557ec681f3Smrg      setreg_then_getsetreg = MAX2(setreg_then_getsetreg, other.setreg_then_getsetreg);
567ec681f3Smrg      vmem_store_then_wr_data |= other.vmem_store_then_wr_data;
577ec681f3Smrg      smem_clause |= other.smem_clause;
587ec681f3Smrg      smem_write |= other.smem_write;
597ec681f3Smrg      for (unsigned i = 0; i < BITSET_WORDS(128); i++) {
607ec681f3Smrg         smem_clause_read_write[i] |= other.smem_clause_read_write[i];
617ec681f3Smrg         smem_clause_write[i] |= other.smem_clause_write[i];
627ec681f3Smrg      }
637ec681f3Smrg   }
647ec681f3Smrg
657ec681f3Smrg   bool operator==(const NOP_ctx_gfx6& other)
667ec681f3Smrg   {
677ec681f3Smrg      return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
687ec681f3Smrg             valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz &&
697ec681f3Smrg             valu_wr_exec_then_execz == other.valu_wr_exec_then_execz &&
707ec681f3Smrg             valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
717ec681f3Smrg             vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
727ec681f3Smrg             salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
737ec681f3Smrg             valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
747ec681f3Smrg             salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
757ec681f3Smrg             salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
767ec681f3Smrg             setreg_then_getsetreg == other.setreg_then_getsetreg &&
777ec681f3Smrg             smem_clause == other.smem_clause && smem_write == other.smem_write &&
787ec681f3Smrg             BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
797ec681f3Smrg             BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
807ec681f3Smrg   }
817ec681f3Smrg
827ec681f3Smrg   void add_wait_states(unsigned amount)
837ec681f3Smrg   {
847ec681f3Smrg      if ((set_vskip_mode_then_vector -= amount) < 0)
857ec681f3Smrg         set_vskip_mode_then_vector = 0;
867ec681f3Smrg
877ec681f3Smrg      if ((valu_wr_vcc_then_vccz -= amount) < 0)
887ec681f3Smrg         valu_wr_vcc_then_vccz = 0;
897ec681f3Smrg
907ec681f3Smrg      if ((valu_wr_exec_then_execz -= amount) < 0)
917ec681f3Smrg         valu_wr_exec_then_execz = 0;
927ec681f3Smrg
937ec681f3Smrg      if ((valu_wr_vcc_then_div_fmas -= amount) < 0)
947ec681f3Smrg         valu_wr_vcc_then_div_fmas = 0;
957ec681f3Smrg
967ec681f3Smrg      if ((salu_wr_m0_then_gds_msg_ttrace -= amount) < 0)
977ec681f3Smrg         salu_wr_m0_then_gds_msg_ttrace = 0;
987ec681f3Smrg
997ec681f3Smrg      if ((valu_wr_exec_then_dpp -= amount) < 0)
1007ec681f3Smrg         valu_wr_exec_then_dpp = 0;
1017ec681f3Smrg
1027ec681f3Smrg      if ((salu_wr_m0_then_lds -= amount) < 0)
1037ec681f3Smrg         salu_wr_m0_then_lds = 0;
1047ec681f3Smrg
1057ec681f3Smrg      if ((salu_wr_m0_then_moverel -= amount) < 0)
1067ec681f3Smrg         salu_wr_m0_then_moverel = 0;
1077ec681f3Smrg
1087ec681f3Smrg      if ((setreg_then_getsetreg -= amount) < 0)
1097ec681f3Smrg         setreg_then_getsetreg = 0;
1107ec681f3Smrg
1117ec681f3Smrg      vmem_store_then_wr_data.reset();
1127ec681f3Smrg   }
1137ec681f3Smrg
1147ec681f3Smrg   /* setting MODE.vskip and then any vector op requires 2 wait states */
1157ec681f3Smrg   int8_t set_vskip_mode_then_vector = 0;
1167ec681f3Smrg
1177ec681f3Smrg   /* VALU writing VCC/EXEC and then a VALU reading VCCZ/EXECZ requires 5 wait states */
1187ec681f3Smrg   int8_t valu_wr_vcc_then_vccz = 0;
1197ec681f3Smrg   int8_t valu_wr_exec_then_execz = 0;
1207ec681f3Smrg
1217ec681f3Smrg   /* VALU writing VCC followed by v_div_fmas require 4 wait states */
1227ec681f3Smrg   int8_t valu_wr_vcc_then_div_fmas = 0;
1237ec681f3Smrg
1247ec681f3Smrg   /* SALU writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state */
1257ec681f3Smrg   int8_t salu_wr_m0_then_gds_msg_ttrace = 0;
1267ec681f3Smrg
1277ec681f3Smrg   /* VALU writing EXEC followed by DPP requires 5 wait states */
1287ec681f3Smrg   int8_t valu_wr_exec_then_dpp = 0;
1297ec681f3Smrg
1307ec681f3Smrg   /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX10 */
1317ec681f3Smrg   int8_t salu_wr_m0_then_lds = 0;
1327ec681f3Smrg
1337ec681f3Smrg   /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */
1347ec681f3Smrg   int8_t salu_wr_m0_then_moverel = 0;
1357ec681f3Smrg
1367ec681f3Smrg   /* s_setreg followed by a s_getreg/s_setreg of the same register needs 2 wait states
1377ec681f3Smrg    * currently we don't look at the actual register */
1387ec681f3Smrg   int8_t setreg_then_getsetreg = 0;
1397ec681f3Smrg
1407ec681f3Smrg   /* some memory instructions writing >64bit followed by a instructions
1417ec681f3Smrg    * writing the VGPRs holding the writedata requires 1 wait state */
1427ec681f3Smrg   std::bitset<256> vmem_store_then_wr_data;
1437ec681f3Smrg
1447ec681f3Smrg   /* we break up SMEM clauses that contain stores or overwrite an
1457ec681f3Smrg    * operand/definition of another instruction in the clause */
1467ec681f3Smrg   bool smem_clause = false;
1477ec681f3Smrg   bool smem_write = false;
1487ec681f3Smrg   BITSET_DECLARE(smem_clause_read_write, 128) = {0};
1497ec681f3Smrg   BITSET_DECLARE(smem_clause_write, 128) = {0};
1507ec681f3Smrg};
1517ec681f3Smrg
/* Hazard tracking state for the GFX10 path: a set of "has seen X" flags plus the SGPRs
 * that were read by VMEM and SMEM instructions. */
struct NOP_ctx_gfx10 {
   bool has_VOPC = false;
   bool has_nonVALU_exec_read = false;
   bool has_VMEM = false;
   bool has_branch_after_VMEM = false;
   bool has_DS = false;
   bool has_branch_after_DS = false;
   bool has_NSA_MIMG = false;
   bool has_writelane = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_SMEM;

   /* Merges `other` into this context: every flag and bitset becomes the union (logical
    * OR) of both contexts. */
   void join(const NOP_ctx_gfx10& other)
   {
      has_VOPC |= other.has_VOPC;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      has_NSA_MIMG |= other.has_NSA_MIMG;
      has_writelane |= other.has_writelane;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   /* Field-wise equality. Const-qualified so that const contexts (and const references
    * to contexts) can be compared. */
   bool operator==(const NOP_ctx_gfx10& other) const
   {
      return has_VOPC == other.has_VOPC && has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
             has_VMEM == other.has_VMEM && has_branch_after_VMEM == other.has_branch_after_VMEM &&
             has_DS == other.has_DS && has_branch_after_DS == other.has_branch_after_DS &&
             has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane &&
             sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
             sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};
1887ec681f3Smrg
1897ec681f3Smrgint
1907ec681f3Smrgget_wait_states(aco_ptr<Instruction>& instr)
1917ec681f3Smrg{
1927ec681f3Smrg   if (instr->opcode == aco_opcode::s_nop)
1937ec681f3Smrg      return instr->sopp().imm + 1;
1947ec681f3Smrg   else if (instr->opcode == aco_opcode::p_constaddr)
1957ec681f3Smrg      return 3; /* lowered to 3 instructions in the assembler */
1967ec681f3Smrg   else
1977ec681f3Smrg      return 1;
1987ec681f3Smrg}
1997ec681f3Smrg
2007ec681f3Smrgbool
2017ec681f3Smrgregs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
2027ec681f3Smrg{
2037ec681f3Smrg   return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size);
2047ec681f3Smrg}
2057ec681f3Smrg
2067ec681f3Smrgtemplate <bool Valu, bool Vintrp, bool Salu>
2077ec681f3Smrgbool
2087ec681f3Smrghandle_raw_hazard_instr(aco_ptr<Instruction>& pred, PhysReg reg, int* nops_needed, uint32_t* mask)
2097ec681f3Smrg{
2107ec681f3Smrg   unsigned mask_size = util_last_bit(*mask);
2117ec681f3Smrg
2127ec681f3Smrg   uint32_t writemask = 0;
2137ec681f3Smrg   for (Definition& def : pred->definitions) {
2147ec681f3Smrg      if (regs_intersect(reg, mask_size, def.physReg(), def.size())) {
2157ec681f3Smrg         unsigned start = def.physReg() > reg ? def.physReg() - reg : 0;
2167ec681f3Smrg         unsigned end = MIN2(mask_size, start + def.size());
2177ec681f3Smrg         writemask |= u_bit_consecutive(start, end - start);
2187ec681f3Smrg      }
2197ec681f3Smrg   }
2207ec681f3Smrg
2217ec681f3Smrg   bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || (pred->isVINTRP() && Vintrp) ||
2227ec681f3Smrg                                       (pred->isSALU() && Salu));
2237ec681f3Smrg   if (is_hazard)
2247ec681f3Smrg      return true;
2257ec681f3Smrg
2267ec681f3Smrg   *mask &= ~writemask;
2277ec681f3Smrg   *nops_needed = MAX2(*nops_needed - get_wait_states(pred), 0);
2287ec681f3Smrg
2297ec681f3Smrg   if (*mask == 0)
2307ec681f3Smrg      *nops_needed = 0;
2317ec681f3Smrg
2327ec681f3Smrg   return *nops_needed == 0;
2337ec681f3Smrg}
2347ec681f3Smrg
2357ec681f3Smrgtemplate <bool Valu, bool Vintrp, bool Salu>
2367ec681f3Smrgint
2377ec681f3Smrghandle_raw_hazard_internal(State& state, Block* block, int nops_needed, PhysReg reg, uint32_t mask,
2387ec681f3Smrg                           bool start_at_end)
2397ec681f3Smrg{
2407ec681f3Smrg   if (block == state.block && start_at_end) {
2417ec681f3Smrg      /* If it's the current block, block->instructions is incomplete. */
2427ec681f3Smrg      for (int pred_idx = state.old_instructions.size() - 1; pred_idx >= 0; pred_idx--) {
2437ec681f3Smrg         aco_ptr<Instruction>& instr = state.old_instructions[pred_idx];
2447ec681f3Smrg         if (!instr)
2457ec681f3Smrg            break; /* Instruction has been moved to block->instructions. */
2467ec681f3Smrg         if (handle_raw_hazard_instr<Valu, Vintrp, Salu>(instr, reg, &nops_needed, &mask))
2477ec681f3Smrg            return nops_needed;
2487ec681f3Smrg      }
2497ec681f3Smrg   }
2507ec681f3Smrg   for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
2517ec681f3Smrg      if (handle_raw_hazard_instr<Valu, Vintrp, Salu>(block->instructions[pred_idx], reg,
2527ec681f3Smrg                                                      &nops_needed, &mask))
2537ec681f3Smrg         return nops_needed;
2547ec681f3Smrg   }
2557ec681f3Smrg
2567ec681f3Smrg   int res = 0;
2577ec681f3Smrg
2587ec681f3Smrg   /* Loops require branch instructions, which count towards the wait
2597ec681f3Smrg    * states. So even with loops this should finish unless nops_needed is some
2607ec681f3Smrg    * huge value. */
2617ec681f3Smrg   for (unsigned lin_pred : block->linear_preds) {
2627ec681f3Smrg      res =
2637ec681f3Smrg         std::max(res, handle_raw_hazard_internal<Valu, Vintrp, Salu>(
2647ec681f3Smrg                          state, &state.program->blocks[lin_pred], nops_needed, reg, mask, true));
2657ec681f3Smrg   }
2667ec681f3Smrg   return res;
2677ec681f3Smrg}
2687ec681f3Smrg
2697ec681f3Smrgtemplate <bool Valu, bool Vintrp, bool Salu>
2707ec681f3Smrgvoid
2717ec681f3Smrghandle_raw_hazard(State& state, int* NOPs, int min_states, Operand op)
2727ec681f3Smrg{
2737ec681f3Smrg   if (*NOPs >= min_states)
2747ec681f3Smrg      return;
2757ec681f3Smrg   int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>(
2767ec681f3Smrg      state, state.block, min_states, op.physReg(), u_bit_consecutive(0, op.size()), false);
2777ec681f3Smrg   *NOPs = MAX2(*NOPs, res);
2787ec681f3Smrg}
2797ec681f3Smrg
2807ec681f3Smrgstatic auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>;
2817ec681f3Smrgstatic auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>;
2827ec681f3Smrgstatic auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>;
2837ec681f3Smrg
2847ec681f3Smrgvoid
2857ec681f3Smrgset_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
2867ec681f3Smrg{
2877ec681f3Smrg   unsigned end = start + size - 1;
2887ec681f3Smrg   unsigned start_mod = start % BITSET_WORDBITS;
2897ec681f3Smrg   if (start_mod + size <= BITSET_WORDBITS) {
2907ec681f3Smrg      BITSET_SET_RANGE_INSIDE_WORD(words, start, end);
2917ec681f3Smrg   } else {
2927ec681f3Smrg      unsigned first_size = BITSET_WORDBITS - start_mod;
2937ec681f3Smrg      set_bitset_range(words, start, BITSET_WORDBITS - start_mod);
2947ec681f3Smrg      set_bitset_range(words, start + first_size, size - first_size);
2957ec681f3Smrg   }
2967ec681f3Smrg}
2977ec681f3Smrg
2987ec681f3Smrgbool
2997ec681f3Smrgtest_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
3007ec681f3Smrg{
3017ec681f3Smrg   unsigned end = start + size - 1;
3027ec681f3Smrg   unsigned start_mod = start % BITSET_WORDBITS;
3037ec681f3Smrg   if (start_mod + size <= BITSET_WORDBITS) {
3047ec681f3Smrg      return BITSET_TEST_RANGE(words, start, end);
3057ec681f3Smrg   } else {
3067ec681f3Smrg      unsigned first_size = BITSET_WORDBITS - start_mod;
3077ec681f3Smrg      return test_bitset_range(words, start, BITSET_WORDBITS - start_mod) ||
3087ec681f3Smrg             test_bitset_range(words, start + first_size, size - first_size);
3097ec681f3Smrg   }
3107ec681f3Smrg}
3117ec681f3Smrg
3127ec681f3Smrg/* A SMEM clause is any group of consecutive SMEM instructions. The
3137ec681f3Smrg * instructions in this group may return out of order and/or may be replayed.
3147ec681f3Smrg *
3157ec681f3Smrg * To fix this potential hazard correctly, we have to make sure that when a
3167ec681f3Smrg * clause has more than one instruction, no instruction in the clause writes
3177ec681f3Smrg * to a register that is read by another instruction in the clause (including
3187ec681f3Smrg * itself). In this case, we have to break the SMEM clause by inserting non
3197ec681f3Smrg * SMEM instructions.
3207ec681f3Smrg *
3217ec681f3Smrg * SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
3227ec681f3Smrg */
3237ec681f3Smrgvoid
3247ec681f3Smrghandle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
3257ec681f3Smrg                           int* NOPs)
3267ec681f3Smrg{
3277ec681f3Smrg   /* break off from previous SMEM clause if needed */
3287ec681f3Smrg   if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) {
3297ec681f3Smrg      /* Don't allow clauses with store instructions since the clause's
3307ec681f3Smrg       * instructions may use the same address. */
3317ec681f3Smrg      if (ctx.smem_write || instr->definitions.empty() ||
3327ec681f3Smrg          instr_info.is_atomic[(unsigned)instr->opcode]) {
3337ec681f3Smrg         *NOPs = 1;
3347ec681f3Smrg      } else if (program->dev.xnack_enabled) {
3357ec681f3Smrg         for (Operand op : instr->operands) {
3367ec681f3Smrg            if (!op.isConstant() &&
3377ec681f3Smrg                test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
3387ec681f3Smrg               *NOPs = 1;
3397ec681f3Smrg               break;
3407ec681f3Smrg            }
3417ec681f3Smrg         }
3427ec681f3Smrg
3437ec681f3Smrg         Definition def = instr->definitions[0];
3447ec681f3Smrg         if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
3457ec681f3Smrg            *NOPs = 1;
3467ec681f3Smrg      }
3477ec681f3Smrg   }
3487ec681f3Smrg}
3497ec681f3Smrg
3507ec681f3Smrg/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
3517ec681f3Smrgvoid
3527ec681f3Smrghandle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
3537ec681f3Smrg                        std::vector<aco_ptr<Instruction>>& new_instructions)
3547ec681f3Smrg{
3557ec681f3Smrg   /* check hazards */
3567ec681f3Smrg   int NOPs = 0;
3577ec681f3Smrg
3587ec681f3Smrg   if (instr->isSMEM()) {
3597ec681f3Smrg      if (state.program->chip_class == GFX6) {
3607ec681f3Smrg         /* A read of an SGPR by SMRD instruction requires 4 wait states
3617ec681f3Smrg          * when the SGPR was written by a VALU instruction. According to LLVM,
3627ec681f3Smrg          * there is also an undocumented hardware behavior when the buffer
3637ec681f3Smrg          * descriptor is written by a SALU instruction */
3647ec681f3Smrg         for (unsigned i = 0; i < instr->operands.size(); i++) {
3657ec681f3Smrg            Operand op = instr->operands[i];
3667ec681f3Smrg            if (op.isConstant())
3677ec681f3Smrg               continue;
3687ec681f3Smrg
3697ec681f3Smrg            bool is_buffer_desc = i == 0 && op.size() > 2;
3707ec681f3Smrg            if (is_buffer_desc)
3717ec681f3Smrg               handle_valu_salu_then_read_hazard(state, &NOPs, 4, op);
3727ec681f3Smrg            else
3737ec681f3Smrg               handle_valu_then_read_hazard(state, &NOPs, 4, op);
3747ec681f3Smrg         }
3757ec681f3Smrg      }
3767ec681f3Smrg
3777ec681f3Smrg      handle_smem_clause_hazards(state.program, ctx, instr, &NOPs);
3787ec681f3Smrg   } else if (instr->isSALU()) {
3797ec681f3Smrg      if (instr->opcode == aco_opcode::s_setreg_b32 ||
3807ec681f3Smrg          instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
3817ec681f3Smrg          instr->opcode == aco_opcode::s_getreg_b32) {
3827ec681f3Smrg         NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
3837ec681f3Smrg      }
3847ec681f3Smrg
3857ec681f3Smrg      if (state.program->chip_class == GFX9) {
3867ec681f3Smrg         if (instr->opcode == aco_opcode::s_movrels_b32 ||
3877ec681f3Smrg             instr->opcode == aco_opcode::s_movrels_b64 ||
3887ec681f3Smrg             instr->opcode == aco_opcode::s_movreld_b32 ||
3897ec681f3Smrg             instr->opcode == aco_opcode::s_movreld_b64) {
3907ec681f3Smrg            NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
3917ec681f3Smrg         }
3927ec681f3Smrg      }
3937ec681f3Smrg
3947ec681f3Smrg      if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_ttracedata)
3957ec681f3Smrg         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
3967ec681f3Smrg   } else if (instr->isDS() && instr->ds().gds) {
3977ec681f3Smrg      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
3987ec681f3Smrg   } else if (instr->isVALU() || instr->isVINTRP()) {
3997ec681f3Smrg      for (Operand op : instr->operands) {
4007ec681f3Smrg         if (op.physReg() == vccz)
4017ec681f3Smrg            NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_vccz);
4027ec681f3Smrg         if (op.physReg() == execz)
4037ec681f3Smrg            NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_execz);
4047ec681f3Smrg      }
4057ec681f3Smrg
4067ec681f3Smrg      if (instr->isDPP()) {
4077ec681f3Smrg         NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
4087ec681f3Smrg         handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]);
4097ec681f3Smrg      }
4107ec681f3Smrg
4117ec681f3Smrg      for (Definition def : instr->definitions) {
4127ec681f3Smrg         if (def.regClass().type() != RegType::sgpr) {
4137ec681f3Smrg            for (unsigned i = 0; i < def.size(); i++)
4147ec681f3Smrg               NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data[(def.physReg() & 0xff) + i]);
4157ec681f3Smrg         }
4167ec681f3Smrg      }
4177ec681f3Smrg
4187ec681f3Smrg      if ((instr->opcode == aco_opcode::v_readlane_b32 ||
4197ec681f3Smrg           instr->opcode == aco_opcode::v_readlane_b32_e64 ||
4207ec681f3Smrg           instr->opcode == aco_opcode::v_writelane_b32 ||
4217ec681f3Smrg           instr->opcode == aco_opcode::v_writelane_b32_e64) &&
4227ec681f3Smrg          !instr->operands[1].isConstant()) {
4237ec681f3Smrg         handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]);
4247ec681f3Smrg      }
4257ec681f3Smrg
4267ec681f3Smrg      /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
4277ec681f3Smrg       * is followed by a read with v_readfirstlane or v_readlane to fix GPU
4287ec681f3Smrg       * hangs on GFX6. Note that v_writelane_* is apparently not affected.
4297ec681f3Smrg       * This hazard isn't documented anywhere but AMD confirmed that hazard.
4307ec681f3Smrg       */
4317ec681f3Smrg      if (state.program->chip_class == GFX6 &&
4327ec681f3Smrg          (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */
4337ec681f3Smrg           instr->opcode == aco_opcode::v_readfirstlane_b32)) {
4347ec681f3Smrg         handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]);
4357ec681f3Smrg      }
4367ec681f3Smrg
4377ec681f3Smrg      if (instr->opcode == aco_opcode::v_div_fmas_f32 ||
4387ec681f3Smrg          instr->opcode == aco_opcode::v_div_fmas_f64)
4397ec681f3Smrg         NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
4407ec681f3Smrg   } else if (instr->isVMEM() || instr->isFlatLike()) {
4417ec681f3Smrg      /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
4427ec681f3Smrg      for (Operand op : instr->operands) {
4437ec681f3Smrg         if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr)
4447ec681f3Smrg            handle_valu_then_read_hazard(state, &NOPs, 5, op);
4457ec681f3Smrg      }
4467ec681f3Smrg   }
4477ec681f3Smrg
4487ec681f3Smrg   if (!instr->isSALU() && instr->format != Format::SMEM)
4497ec681f3Smrg      NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);
4507ec681f3Smrg
4517ec681f3Smrg   if (state.program->chip_class == GFX9) {
4527ec681f3Smrg      bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds;
4537ec681f3Smrg      if (instr->isVINTRP() || lds_scratch_global ||
4547ec681f3Smrg          instr->opcode == aco_opcode::ds_read_addtid_b32 ||
4557ec681f3Smrg          instr->opcode == aco_opcode::ds_write_addtid_b32 ||
4567ec681f3Smrg          instr->opcode == aco_opcode::buffer_store_lds_dword) {
4577ec681f3Smrg         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
4587ec681f3Smrg      }
4597ec681f3Smrg   }
4607ec681f3Smrg
4617ec681f3Smrg   ctx.add_wait_states(NOPs + get_wait_states(instr));
4627ec681f3Smrg
4637ec681f3Smrg   // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
4647ec681f3Smrg   if (NOPs) {
4657ec681f3Smrg      /* create NOP */
4667ec681f3Smrg      aco_ptr<SOPP_instruction> nop{
4677ec681f3Smrg         create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
4687ec681f3Smrg      nop->imm = NOPs - 1;
4697ec681f3Smrg      nop->block = -1;
4707ec681f3Smrg      new_instructions.emplace_back(std::move(nop));
4717ec681f3Smrg   }
4727ec681f3Smrg
4737ec681f3Smrg   /* update information to check for later hazards */
4747ec681f3Smrg   if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) {
4757ec681f3Smrg      ctx.smem_clause = false;
4767ec681f3Smrg      ctx.smem_write = false;
4777ec681f3Smrg
4787ec681f3Smrg      if (state.program->dev.xnack_enabled) {
4797ec681f3Smrg         BITSET_ZERO(ctx.smem_clause_read_write);
4807ec681f3Smrg         BITSET_ZERO(ctx.smem_clause_write);
4817ec681f3Smrg      }
4827ec681f3Smrg   }
4837ec681f3Smrg
4847ec681f3Smrg   if (instr->isSMEM()) {
4857ec681f3Smrg      if (instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
4867ec681f3Smrg         ctx.smem_write = true;
4877ec681f3Smrg      } else {
4887ec681f3Smrg         ctx.smem_clause = true;
4897ec681f3Smrg
4907ec681f3Smrg         if (state.program->dev.xnack_enabled) {
4917ec681f3Smrg            for (Operand op : instr->operands) {
4927ec681f3Smrg               if (!op.isConstant()) {
4937ec681f3Smrg                  set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
4947ec681f3Smrg               }
4957ec681f3Smrg            }
4967ec681f3Smrg
4977ec681f3Smrg            Definition def = instr->definitions[0];
4987ec681f3Smrg            set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
4997ec681f3Smrg            set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
5007ec681f3Smrg         }
5017ec681f3Smrg      }
5027ec681f3Smrg   } else if (instr->isVALU()) {
5037ec681f3Smrg      for (Definition def : instr->definitions) {
5047ec681f3Smrg         if (def.regClass().type() == RegType::sgpr) {
5057ec681f3Smrg            if (def.physReg() == vcc || def.physReg() == vcc_hi) {
5067ec681f3Smrg               ctx.valu_wr_vcc_then_vccz = 5;
5077ec681f3Smrg               ctx.valu_wr_vcc_then_div_fmas = 4;
5087ec681f3Smrg            }
5097ec681f3Smrg            if (def.physReg() == exec || def.physReg() == exec_hi) {
5107ec681f3Smrg               ctx.valu_wr_exec_then_execz = 5;
5117ec681f3Smrg               ctx.valu_wr_exec_then_dpp = 5;
5127ec681f3Smrg            }
5137ec681f3Smrg         }
5147ec681f3Smrg      }
5157ec681f3Smrg   } else if (instr->isSALU() && !instr->definitions.empty()) {
5167ec681f3Smrg      if (!instr->definitions.empty()) {
5177ec681f3Smrg         /* all other definitions should be SCC */
5187ec681f3Smrg         Definition def = instr->definitions[0];
5197ec681f3Smrg         if (def.physReg() == m0) {
5207ec681f3Smrg            ctx.salu_wr_m0_then_gds_msg_ttrace = 1;
5217ec681f3Smrg            ctx.salu_wr_m0_then_lds = 1;
5227ec681f3Smrg            ctx.salu_wr_m0_then_moverel = 1;
5237ec681f3Smrg         }
5247ec681f3Smrg      } else if (instr->opcode == aco_opcode::s_setreg_b32 ||
5257ec681f3Smrg                 instr->opcode == aco_opcode::s_setreg_imm32_b32) {
5267ec681f3Smrg         SOPK_instruction& sopk = instr->sopk();
5277ec681f3Smrg         unsigned offset = (sopk.imm >> 6) & 0x1f;
5287ec681f3Smrg         unsigned size = ((sopk.imm >> 11) & 0x1f) + 1;
5297ec681f3Smrg         unsigned reg = sopk.imm & 0x3f;
5307ec681f3Smrg         ctx.setreg_then_getsetreg = 2;
5317ec681f3Smrg
5327ec681f3Smrg         if (reg == 1 && offset >= 28 && size > (28 - offset))
5337ec681f3Smrg            ctx.set_vskip_mode_then_vector = 2;
5347ec681f3Smrg      }
5357ec681f3Smrg   } else if (instr->isVMEM() || instr->isFlatLike()) {
5367ec681f3Smrg      /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
5377ec681f3Smrg      bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 &&
5387ec681f3Smrg                          instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128;
5397ec681f3Smrg      /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit
5407ec681f3Smrg       * store) */
5417ec681f3Smrg      bool consider_mimg = instr->isMIMG() &&
5427ec681f3Smrg                           instr->operands[1].regClass().type() == RegType::vgpr &&
5437ec681f3Smrg                           instr->operands[1].size() > 2 && instr->operands[0].size() == 4;
5447ec681f3Smrg      /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
5457ec681f3Smrg      bool consider_flat =
5467ec681f3Smrg         instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2;
5477ec681f3Smrg      if (consider_buf || consider_mimg || consider_flat) {
5487ec681f3Smrg         PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg();
5497ec681f3Smrg         unsigned size = instr->operands[consider_flat ? 2 : 3].size();
5507ec681f3Smrg         for (unsigned i = 0; i < size; i++)
5517ec681f3Smrg            ctx.vmem_store_then_wr_data[(wrdata & 0xff) + i] = 1;
5527ec681f3Smrg      }
5537ec681f3Smrg   }
5547ec681f3Smrg}
5557ec681f3Smrg
5567ec681f3Smrgtemplate <std::size_t N>
5577ec681f3Smrgbool
5587ec681f3Smrgcheck_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
5597ec681f3Smrg{
5607ec681f3Smrg   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
5617ec681f3Smrg                      [&check_regs](const Definition& def) -> bool
5627ec681f3Smrg                      {
5637ec681f3Smrg                         bool writes_any = false;
5647ec681f3Smrg                         for (unsigned i = 0; i < def.size(); i++) {
5657ec681f3Smrg                            unsigned def_reg = def.physReg() + i;
5667ec681f3Smrg                            writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
5677ec681f3Smrg                         }
5687ec681f3Smrg                         return writes_any;
5697ec681f3Smrg                      });
5707ec681f3Smrg}
5717ec681f3Smrg
5727ec681f3Smrgtemplate <std::size_t N>
5737ec681f3Smrgvoid
5747ec681f3Smrgmark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
5757ec681f3Smrg{
5767ec681f3Smrg   for (const Operand& op : instr->operands) {
5777ec681f3Smrg      for (unsigned i = 0; i < op.size(); i++) {
5787ec681f3Smrg         unsigned reg = op.physReg() + i;
5797ec681f3Smrg         if (reg < reg_reads.size())
5807ec681f3Smrg            reg_reads.set(reg);
5817ec681f3Smrg      }
5827ec681f3Smrg   }
5837ec681f3Smrg}
5847ec681f3Smrg
5857ec681f3Smrgbool
5867ec681f3SmrgVALU_writes_sgpr(aco_ptr<Instruction>& instr)
5877ec681f3Smrg{
5887ec681f3Smrg   if (instr->isVOPC())
5897ec681f3Smrg      return true;
5907ec681f3Smrg   if (instr->isVOP3() && instr->definitions.size() == 2)
5917ec681f3Smrg      return true;
5927ec681f3Smrg   if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
5937ec681f3Smrg       instr->opcode == aco_opcode::v_readlane_b32 ||
5947ec681f3Smrg       instr->opcode == aco_opcode::v_readlane_b32_e64)
5957ec681f3Smrg      return true;
5967ec681f3Smrg   return false;
5977ec681f3Smrg}
5987ec681f3Smrg
5997ec681f3Smrgbool
6007ec681f3Smrginstr_writes_exec(const aco_ptr<Instruction>& instr)
6017ec681f3Smrg{
6027ec681f3Smrg   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
6037ec681f3Smrg                      [](const Definition& def) -> bool
6047ec681f3Smrg                      { return def.physReg() == exec_lo || def.physReg() == exec_hi; });
6057ec681f3Smrg}
6067ec681f3Smrg
6077ec681f3Smrgbool
6087ec681f3Smrginstr_writes_sgpr(const aco_ptr<Instruction>& instr)
6097ec681f3Smrg{
6107ec681f3Smrg   return std::any_of(instr->definitions.begin(), instr->definitions.end(),
6117ec681f3Smrg                      [](const Definition& def) -> bool
6127ec681f3Smrg                      { return def.getTemp().type() == RegType::sgpr; });
6137ec681f3Smrg}
6147ec681f3Smrg
6157ec681f3Smrginline bool
6167ec681f3Smrginstr_is_branch(const aco_ptr<Instruction>& instr)
6177ec681f3Smrg{
6187ec681f3Smrg   return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 ||
6197ec681f3Smrg          instr->opcode == aco_opcode::s_cbranch_scc1 ||
6207ec681f3Smrg          instr->opcode == aco_opcode::s_cbranch_vccz ||
6217ec681f3Smrg          instr->opcode == aco_opcode::s_cbranch_vccnz ||
6227ec681f3Smrg          instr->opcode == aco_opcode::s_cbranch_execz ||
6237ec681f3Smrg          instr->opcode == aco_opcode::s_cbranch_execnz ||
6247ec681f3Smrg          instr->opcode == aco_opcode::s_cbranch_cdbgsys ||
6257ec681f3Smrg          instr->opcode == aco_opcode::s_cbranch_cdbguser ||
6267ec681f3Smrg          instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user ||
6277ec681f3Smrg          instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user ||
6287ec681f3Smrg          instr->opcode == aco_opcode::s_subvector_loop_begin ||
6297ec681f3Smrg          instr->opcode == aco_opcode::s_subvector_loop_end ||
6307ec681f3Smrg          instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 ||
6317ec681f3Smrg          instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
6327ec681f3Smrg}
6337ec681f3Smrg
6347ec681f3Smrgvoid
6357ec681f3Smrghandle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr,
6367ec681f3Smrg                         std::vector<aco_ptr<Instruction>>& new_instructions)
6377ec681f3Smrg{
6387ec681f3Smrg   // TODO: s_dcache_inv needs to be in it's own group on GFX10
6397ec681f3Smrg
6407ec681f3Smrg   /* VMEMtoScalarWriteHazard
6417ec681f3Smrg    * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)"
6427ec681f3Smrg    * in-between.
6437ec681f3Smrg    */
6447ec681f3Smrg   if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) {
6457ec681f3Smrg      /* Remember all SGPRs that are read by the VMEM instruction */
6467ec681f3Smrg      mark_read_regs(instr, ctx.sgprs_read_by_VMEM);
6477ec681f3Smrg      ctx.sgprs_read_by_VMEM.set(exec);
6487ec681f3Smrg      if (state.program->wave_size == 64)
6497ec681f3Smrg         ctx.sgprs_read_by_VMEM.set(exec_hi);
6507ec681f3Smrg   } else if (instr->isSALU() || instr->isSMEM()) {
6517ec681f3Smrg      if (instr->opcode == aco_opcode::s_waitcnt) {
6527ec681f3Smrg         /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */
6537ec681f3Smrg         uint16_t imm = instr->sopp().imm;
6547ec681f3Smrg         unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10);
6557ec681f3Smrg         if (vmcnt == 0)
6567ec681f3Smrg            ctx.sgprs_read_by_VMEM.reset();
6577ec681f3Smrg      } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
6587ec681f3Smrg         /* Hazard is mitigated by a s_waitcnt_depctr with a magic imm */
6597ec681f3Smrg         if (instr->sopp().imm == 0xffe3)
6607ec681f3Smrg            ctx.sgprs_read_by_VMEM.reset();
6617ec681f3Smrg      }
6627ec681f3Smrg
6637ec681f3Smrg      /* Check if SALU writes an SGPR that was previously read by the VALU */
6647ec681f3Smrg      if (check_written_regs(instr, ctx.sgprs_read_by_VMEM)) {
6657ec681f3Smrg         ctx.sgprs_read_by_VMEM.reset();
6667ec681f3Smrg
6677ec681f3Smrg         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
6687ec681f3Smrg         aco_ptr<SOPP_instruction> depctr{
6697ec681f3Smrg            create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
6707ec681f3Smrg         depctr->imm = 0xffe3;
6717ec681f3Smrg         depctr->block = -1;
6727ec681f3Smrg         new_instructions.emplace_back(std::move(depctr));
6737ec681f3Smrg      }
6747ec681f3Smrg   } else if (instr->isVALU()) {
6757ec681f3Smrg      /* Hazard is mitigated by any VALU instruction */
6767ec681f3Smrg      ctx.sgprs_read_by_VMEM.reset();
6777ec681f3Smrg   }
6787ec681f3Smrg
6797ec681f3Smrg   /* VcmpxPermlaneHazard
6807ec681f3Smrg    * Handle any permlane following a VOPC instruction, insert v_mov between them.
6817ec681f3Smrg    */
6827ec681f3Smrg   if (instr->isVOPC()) {
6837ec681f3Smrg      ctx.has_VOPC = true;
6847ec681f3Smrg   } else if (ctx.has_VOPC && (instr->opcode == aco_opcode::v_permlane16_b32 ||
6857ec681f3Smrg                               instr->opcode == aco_opcode::v_permlanex16_b32)) {
6867ec681f3Smrg      ctx.has_VOPC = false;
6877ec681f3Smrg
6887ec681f3Smrg      /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
6897ec681f3Smrg      aco_ptr<VOP1_instruction> v_mov{
6907ec681f3Smrg         create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)};
6917ec681f3Smrg      v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1);
6927ec681f3Smrg      v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1);
6937ec681f3Smrg      new_instructions.emplace_back(std::move(v_mov));
6947ec681f3Smrg   } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) {
6957ec681f3Smrg      ctx.has_VOPC = false;
6967ec681f3Smrg   }
6977ec681f3Smrg
6987ec681f3Smrg   /* VcmpxExecWARHazard
6997ec681f3Smrg    * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
7007ec681f3Smrg    */
7017ec681f3Smrg   if (!instr->isVALU() && instr->reads_exec()) {
7027ec681f3Smrg      ctx.has_nonVALU_exec_read = true;
7037ec681f3Smrg   } else if (instr->isVALU()) {
7047ec681f3Smrg      if (instr_writes_exec(instr)) {
7057ec681f3Smrg         ctx.has_nonVALU_exec_read = false;
7067ec681f3Smrg
7077ec681f3Smrg         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
7087ec681f3Smrg         aco_ptr<SOPP_instruction> depctr{
7097ec681f3Smrg            create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
7107ec681f3Smrg         depctr->imm = 0xfffe;
7117ec681f3Smrg         depctr->block = -1;
7127ec681f3Smrg         new_instructions.emplace_back(std::move(depctr));
7137ec681f3Smrg      } else if (instr_writes_sgpr(instr)) {
7147ec681f3Smrg         /* Any VALU instruction that writes an SGPR mitigates the problem */
7157ec681f3Smrg         ctx.has_nonVALU_exec_read = false;
7167ec681f3Smrg      }
7177ec681f3Smrg   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
7187ec681f3Smrg      /* s_waitcnt_depctr can mitigate the problem if it has a magic imm */
7197ec681f3Smrg      if ((instr->sopp().imm & 0xfffe) == 0xfffe)
7207ec681f3Smrg         ctx.has_nonVALU_exec_read = false;
7217ec681f3Smrg   }
7227ec681f3Smrg
7237ec681f3Smrg   /* SMEMtoVectorWriteHazard
7247ec681f3Smrg    * Handle any VALU instruction writing an SGPR after an SMEM reads it.
7257ec681f3Smrg    */
7267ec681f3Smrg   if (instr->isSMEM()) {
7277ec681f3Smrg      /* Remember all SGPRs that are read by the SMEM instruction */
7287ec681f3Smrg      mark_read_regs(instr, ctx.sgprs_read_by_SMEM);
7297ec681f3Smrg   } else if (VALU_writes_sgpr(instr)) {
7307ec681f3Smrg      /* Check if VALU writes an SGPR that was previously read by SMEM */
7317ec681f3Smrg      if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) {
7327ec681f3Smrg         ctx.sgprs_read_by_SMEM.reset();
7337ec681f3Smrg
7347ec681f3Smrg         /* Insert s_mov to mitigate the problem */
7357ec681f3Smrg         aco_ptr<SOP1_instruction> s_mov{
7367ec681f3Smrg            create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
7377ec681f3Smrg         s_mov->definitions[0] = Definition(sgpr_null, s1);
7387ec681f3Smrg         s_mov->operands[0] = Operand::zero();
7397ec681f3Smrg         new_instructions.emplace_back(std::move(s_mov));
7407ec681f3Smrg      }
7417ec681f3Smrg   } else if (instr->isSALU()) {
7427ec681f3Smrg      if (instr->format != Format::SOPP) {
7437ec681f3Smrg         /* SALU can mitigate the hazard */
7447ec681f3Smrg         ctx.sgprs_read_by_SMEM.reset();
7457ec681f3Smrg      } else {
7467ec681f3Smrg         /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
7477ec681f3Smrg         const SOPP_instruction& sopp = instr->sopp();
7487ec681f3Smrg         if (sopp.opcode == aco_opcode::s_waitcnt_lgkmcnt) {
7497ec681f3Smrg            if (sopp.imm == 0 && sopp.definitions[0].physReg() == sgpr_null)
7507ec681f3Smrg               ctx.sgprs_read_by_SMEM.reset();
7517ec681f3Smrg         } else if (sopp.opcode == aco_opcode::s_waitcnt) {
7527ec681f3Smrg            unsigned lgkm = (sopp.imm >> 8) & 0x3f;
7537ec681f3Smrg            if (lgkm == 0)
7547ec681f3Smrg               ctx.sgprs_read_by_SMEM.reset();
7557ec681f3Smrg         }
7567ec681f3Smrg      }
7577ec681f3Smrg   }
7587ec681f3Smrg
7597ec681f3Smrg   /* LdsBranchVmemWARHazard
7607ec681f3Smrg    * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
7617ec681f3Smrg    */
7627ec681f3Smrg   if (instr->isVMEM() || instr->isGlobal() || instr->isScratch()) {
7637ec681f3Smrg      ctx.has_VMEM = true;
7647ec681f3Smrg      ctx.has_branch_after_VMEM = false;
7657ec681f3Smrg      /* Mitigation for DS is needed only if there was already a branch after */
7667ec681f3Smrg      ctx.has_DS = ctx.has_branch_after_DS;
7677ec681f3Smrg   } else if (instr->isDS()) {
7687ec681f3Smrg      ctx.has_DS = true;
7697ec681f3Smrg      ctx.has_branch_after_DS = false;
7707ec681f3Smrg      /* Mitigation for VMEM is needed only if there was already a branch after */
7717ec681f3Smrg      ctx.has_VMEM = ctx.has_branch_after_VMEM;
7727ec681f3Smrg   } else if (instr_is_branch(instr)) {
7737ec681f3Smrg      ctx.has_branch_after_VMEM = ctx.has_VMEM;
7747ec681f3Smrg      ctx.has_branch_after_DS = ctx.has_DS;
7757ec681f3Smrg   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
7767ec681f3Smrg      /* Only s_waitcnt_vscnt can mitigate the hazard */
7777ec681f3Smrg      const SOPK_instruction& sopk = instr->sopk();
7787ec681f3Smrg      if (sopk.definitions[0].physReg() == sgpr_null && sopk.imm == 0)
7797ec681f3Smrg         ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
7807ec681f3Smrg   }
7817ec681f3Smrg   if ((ctx.has_VMEM && ctx.has_branch_after_DS) || (ctx.has_DS && ctx.has_branch_after_VMEM)) {
7827ec681f3Smrg      ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
7837ec681f3Smrg
7847ec681f3Smrg      /* Insert s_waitcnt_vscnt to mitigate the problem */
7857ec681f3Smrg      aco_ptr<SOPK_instruction> wait{
7867ec681f3Smrg         create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)};
7877ec681f3Smrg      wait->definitions[0] = Definition(sgpr_null, s1);
7887ec681f3Smrg      wait->imm = 0;
7897ec681f3Smrg      new_instructions.emplace_back(std::move(wait));
7907ec681f3Smrg   }
7917ec681f3Smrg
7927ec681f3Smrg   /* NSAToVMEMBug
7937ec681f3Smrg    * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] !=
7947ec681f3Smrg    * 0).
7957ec681f3Smrg    */
7967ec681f3Smrg   if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) {
7977ec681f3Smrg      ctx.has_NSA_MIMG = true;
7987ec681f3Smrg   } else if (ctx.has_NSA_MIMG) {
7997ec681f3Smrg      ctx.has_NSA_MIMG = false;
8007ec681f3Smrg
8017ec681f3Smrg      if (instr->isMUBUF() || instr->isMTBUF()) {
8027ec681f3Smrg         uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset;
8037ec681f3Smrg         if (offset & 6)
8047ec681f3Smrg            Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0);
8057ec681f3Smrg      }
8067ec681f3Smrg   }
8077ec681f3Smrg
8087ec681f3Smrg   /* waNsaCannotFollowWritelane
8097ec681f3Smrg    * Handles NSA MIMG immediately following a v_writelane_b32.
8107ec681f3Smrg    */
8117ec681f3Smrg   if (instr->opcode == aco_opcode::v_writelane_b32_e64) {
8127ec681f3Smrg      ctx.has_writelane = true;
8137ec681f3Smrg   } else if (ctx.has_writelane) {
8147ec681f3Smrg      ctx.has_writelane = false;
8157ec681f3Smrg      if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
8167ec681f3Smrg         Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0);
8177ec681f3Smrg   }
8187ec681f3Smrg}
8197ec681f3Smrg
8207ec681f3Smrgtemplate <typename Ctx>
8217ec681f3Smrgusing HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&,
8227ec681f3Smrg                             std::vector<aco_ptr<Instruction>>&);
8237ec681f3Smrg
8247ec681f3Smrgtemplate <typename Ctx, HandleInstr<Ctx> Handle>
8257ec681f3Smrgvoid
8267ec681f3Smrghandle_block(Program* program, Ctx& ctx, Block& block)
8277ec681f3Smrg{
8287ec681f3Smrg   if (block.instructions.empty())
8297ec681f3Smrg      return;
8307ec681f3Smrg
8317ec681f3Smrg   State state;
8327ec681f3Smrg   state.program = program;
8337ec681f3Smrg   state.block = &block;
8347ec681f3Smrg   state.old_instructions = std::move(block.instructions);
8357ec681f3Smrg
8367ec681f3Smrg   block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning
8377ec681f3Smrg   block.instructions.reserve(state.old_instructions.size());
8387ec681f3Smrg
8397ec681f3Smrg   for (aco_ptr<Instruction>& instr : state.old_instructions) {
8407ec681f3Smrg      Handle(state, ctx, instr, block.instructions);
8417ec681f3Smrg      block.instructions.emplace_back(std::move(instr));
8427ec681f3Smrg   }
8437ec681f3Smrg}
8447ec681f3Smrg
8457ec681f3Smrgtemplate <typename Ctx, HandleInstr<Ctx> Handle>
8467ec681f3Smrgvoid
8477ec681f3Smrgmitigate_hazards(Program* program)
8487ec681f3Smrg{
8497ec681f3Smrg   std::vector<Ctx> all_ctx(program->blocks.size());
8507ec681f3Smrg   std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
8517ec681f3Smrg
8527ec681f3Smrg   for (unsigned i = 0; i < program->blocks.size(); i++) {
8537ec681f3Smrg      Block& block = program->blocks[i];
8547ec681f3Smrg      Ctx& ctx = all_ctx[i];
8557ec681f3Smrg
8567ec681f3Smrg      if (block.kind & block_kind_loop_header) {
8577ec681f3Smrg         loop_header_indices.push(i);
8587ec681f3Smrg      } else if (block.kind & block_kind_loop_exit) {
8597ec681f3Smrg         /* Go through the whole loop again */
8607ec681f3Smrg         for (unsigned idx = loop_header_indices.top(); idx < i; idx++) {
8617ec681f3Smrg            Ctx loop_block_ctx;
8627ec681f3Smrg            for (unsigned b : program->blocks[idx].linear_preds)
8637ec681f3Smrg               loop_block_ctx.join(all_ctx[b]);
8647ec681f3Smrg
8657ec681f3Smrg            handle_block<Ctx, Handle>(program, loop_block_ctx, program->blocks[idx]);
8667ec681f3Smrg
8677ec681f3Smrg            /* We only need to continue if the loop header context changed */
8687ec681f3Smrg            if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
8697ec681f3Smrg               break;
8707ec681f3Smrg
8717ec681f3Smrg            all_ctx[idx] = loop_block_ctx;
8727ec681f3Smrg         }
8737ec681f3Smrg
8747ec681f3Smrg         loop_header_indices.pop();
8757ec681f3Smrg      }
8767ec681f3Smrg
8777ec681f3Smrg      for (unsigned b : block.linear_preds)
8787ec681f3Smrg         ctx.join(all_ctx[b]);
8797ec681f3Smrg
8807ec681f3Smrg      handle_block<Ctx, Handle>(program, ctx, block);
8817ec681f3Smrg   }
8827ec681f3Smrg}
8837ec681f3Smrg
8847ec681f3Smrg} /* end namespace */
8857ec681f3Smrg
8867ec681f3Smrgvoid
8877ec681f3Smrginsert_NOPs(Program* program)
8887ec681f3Smrg{
8897ec681f3Smrg   if (program->chip_class >= GFX10_3)
8907ec681f3Smrg      ; /* no hazards/bugs to mitigate */
8917ec681f3Smrg   else if (program->chip_class >= GFX10)
8927ec681f3Smrg      mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10>(program);
8937ec681f3Smrg   else
8947ec681f3Smrg      mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6>(program);
8957ec681f3Smrg}
8967ec681f3Smrg
8977ec681f3Smrg} // namespace aco
898