Home | History | Annotate | Line # | Download | only in AMDGPU
      1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file implements hazard recognizers for scheduling on GCN processors.
     10 //
     11 //===----------------------------------------------------------------------===//
     12 
     13 #include "GCNHazardRecognizer.h"
     14 #include "GCNSubtarget.h"
     15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
     16 #include "llvm/CodeGen/MachineFunction.h"
     17 #include "llvm/CodeGen/ScheduleDAG.h"
     18 #include "llvm/Support/TargetParser.h"
     19 
     20 using namespace llvm;
     21 
     22 //===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
     24 //===----------------------------------------------------------------------===//
     25 
     26 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
     27   IsHazardRecognizerMode(false),
     28   CurrCycleInstr(nullptr),
     29   MF(MF),
     30   ST(MF.getSubtarget<GCNSubtarget>()),
     31   TII(*ST.getInstrInfo()),
     32   TRI(TII.getRegisterInfo()),
     33   ClauseUses(TRI.getNumRegUnits()),
     34   ClauseDefs(TRI.getNumRegUnits()) {
     35   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
     36   TSchedModel.init(&ST);
     37 }
     38 
     39 void GCNHazardRecognizer::Reset() {
     40   EmittedInstrs.clear();
     41 }
     42 
     43 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
     44   EmitInstruction(SU->getInstr());
     45 }
     46 
     47 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
     48   CurrCycleInstr = MI;
     49 }
     50 
     51 static bool isDivFMas(unsigned Opcode) {
     52   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
     53 }
     54 
     55 static bool isSGetReg(unsigned Opcode) {
     56   return Opcode == AMDGPU::S_GETREG_B32;
     57 }
     58 
     59 static bool isSSetReg(unsigned Opcode) {
     60   switch (Opcode) {
     61   case AMDGPU::S_SETREG_B32:
     62   case AMDGPU::S_SETREG_B32_mode:
     63   case AMDGPU::S_SETREG_IMM32_B32:
     64   case AMDGPU::S_SETREG_IMM32_B32_mode:
     65     return true;
     66   }
     67   return false;
     68 }
     69 
     70 static bool isRWLane(unsigned Opcode) {
     71   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
     72 }
     73 
     74 static bool isRFE(unsigned Opcode) {
     75   return Opcode == AMDGPU::S_RFE_B64;
     76 }
     77 
     78 static bool isSMovRel(unsigned Opcode) {
     79   switch (Opcode) {
     80   case AMDGPU::S_MOVRELS_B32:
     81   case AMDGPU::S_MOVRELS_B64:
     82   case AMDGPU::S_MOVRELD_B32:
     83   case AMDGPU::S_MOVRELD_B64:
     84     return true;
     85   default:
     86     return false;
     87   }
     88 }
     89 
     90 static bool isDGEMM(unsigned Opcode) {
     91   return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
     92          Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
     93          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
     94          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
     95 }
     96 
     97 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
     98   unsigned Opcode = MI.getOpcode();
     99 
    100   if (!SIInstrInfo::isMAI(MI) ||
    101       isDGEMM(Opcode) ||
    102       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
    103       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    104     return false;
    105 
    106   return true;
    107 }
    108 
    109 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
    110                                     const MachineInstr &MI) {
    111   if (TII.isAlwaysGDS(MI.getOpcode()))
    112     return true;
    113 
    114   switch (MI.getOpcode()) {
    115   case AMDGPU::S_SENDMSG:
    116   case AMDGPU::S_SENDMSGHALT:
    117   case AMDGPU::S_TTRACEDATA:
    118     return true;
    119   // These DS opcodes don't support GDS.
    120   case AMDGPU::DS_NOP:
    121   case AMDGPU::DS_PERMUTE_B32:
    122   case AMDGPU::DS_BPERMUTE_B32:
    123     return false;
    124   default:
    125     if (TII.isDS(MI.getOpcode())) {
    126       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
    127                                            AMDGPU::OpName::gds);
    128       if (MI.getOperand(GDS).getImm())
    129         return true;
    130     }
    131     return false;
    132   }
    133 }
    134 
    135 static bool isPermlane(const MachineInstr &MI) {
    136   unsigned Opcode = MI.getOpcode();
    137   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
    138          Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
    139 }
    140 
    141 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
    142   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
    143                                                      AMDGPU::OpName::simm16);
    144   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
    145 }
    146 
    147 ScheduleHazardRecognizer::HazardType
    148 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
    149   MachineInstr *MI = SU->getInstr();
    150   // If we are not in "HazardRecognizerMode" and therefore not being run from
    151   // the scheduler, track possible stalls from hazards but don't insert noops.
    152   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
    153 
    154   if (MI->isBundle())
    155    return NoHazard;
    156 
    157   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    158     return HazardType;
    159 
    160   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    161     return HazardType;
    162 
    163   if (checkFPAtomicToDenormModeHazard(MI) > 0)
    164     return HazardType;
    165 
    166   if (ST.hasNoDataDepHazard())
    167     return NoHazard;
    168 
    169   // FIXME: Should flat be considered vmem?
    170   if ((SIInstrInfo::isVMEM(*MI) ||
    171        SIInstrInfo::isFLAT(*MI))
    172       && checkVMEMHazards(MI) > 0)
    173     return HazardType;
    174 
    175   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    176     return HazardType;
    177 
    178   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    179     return HazardType;
    180 
    181   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    182     return HazardType;
    183 
    184   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    185     return HazardType;
    186 
    187   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
    188        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
    189        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    190     return HazardType;
    191 
    192   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    193     return HazardType;
    194 
    195   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    196     return HazardType;
    197 
    198   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    199     return HazardType;
    200 
    201   if (ST.hasReadM0MovRelInterpHazard() &&
    202       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
    203       checkReadM0Hazards(MI) > 0)
    204     return HazardType;
    205 
    206   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
    207       checkReadM0Hazards(MI) > 0)
    208     return HazardType;
    209 
    210   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    211     return HazardType;
    212 
    213   if ((SIInstrInfo::isVMEM(*MI) ||
    214        SIInstrInfo::isFLAT(*MI) ||
    215        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    216     return HazardType;
    217 
    218   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    219     return HazardType;
    220 
    221   return NoHazard;
    222 }
    223 
    224 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
    225                                 unsigned Quantity) {
    226   while (Quantity > 0) {
    227     unsigned Arg = std::min(Quantity, 8u);
    228     Quantity -= Arg;
    229     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
    230         .addImm(Arg - 1);
    231   }
    232 }
    233 
    234 void GCNHazardRecognizer::processBundle() {
    235   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
    236   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
    237   // Check bundled MachineInstr's for hazards.
    238   for (; MI != E && MI->isInsideBundle(); ++MI) {
    239     CurrCycleInstr = &*MI;
    240     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
    241 
    242     if (IsHazardRecognizerMode) {
    243       fixHazards(CurrCycleInstr);
    244 
    245       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    246     }
    247 
    248     // Its unnecessary to track more than MaxLookAhead instructions. Since we
    249     // include the bundled MI directly after, only add a maximum of
    250     // (MaxLookAhead - 1) noops to EmittedInstrs.
    251     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
    252       EmittedInstrs.push_front(nullptr);
    253 
    254     EmittedInstrs.push_front(CurrCycleInstr);
    255     EmittedInstrs.resize(MaxLookAhead);
    256   }
    257   CurrCycleInstr = nullptr;
    258 }
    259 
    260 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
    261   IsHazardRecognizerMode = true;
    262   CurrCycleInstr = MI;
    263   unsigned W = PreEmitNoopsCommon(MI);
    264   fixHazards(MI);
    265   CurrCycleInstr = nullptr;
    266   return W;
    267 }
    268 
    269 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
    270   if (MI->isBundle())
    271     return 0;
    272 
    273   int WaitStates = 0;
    274 
    275   if (SIInstrInfo::isSMRD(*MI))
    276     return std::max(WaitStates, checkSMRDHazards(MI));
    277 
    278   if (ST.hasNSAtoVMEMBug())
    279     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
    280 
    281   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
    282 
    283   if (ST.hasNoDataDepHazard())
    284     return WaitStates;
    285 
    286   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    287     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
    288 
    289   if (SIInstrInfo::isVALU(*MI))
    290     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
    291 
    292   if (SIInstrInfo::isDPP(*MI))
    293     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
    294 
    295   if (isDivFMas(MI->getOpcode()))
    296     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
    297 
    298   if (isRWLane(MI->getOpcode()))
    299     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
    300 
    301   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
    302        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
    303        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    304     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
    305 
    306   if (MI->isInlineAsm())
    307     return std::max(WaitStates, checkInlineAsmHazards(MI));
    308 
    309   if (isSGetReg(MI->getOpcode()))
    310     return std::max(WaitStates, checkGetRegHazards(MI));
    311 
    312   if (isSSetReg(MI->getOpcode()))
    313     return std::max(WaitStates, checkSetRegHazards(MI));
    314 
    315   if (isRFE(MI->getOpcode()))
    316     return std::max(WaitStates, checkRFEHazards(MI));
    317 
    318   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
    319                                            isSMovRel(MI->getOpcode())))
    320     return std::max(WaitStates, checkReadM0Hazards(MI));
    321 
    322   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    323     return std::max(WaitStates, checkReadM0Hazards(MI));
    324 
    325   if (SIInstrInfo::isMAI(*MI))
    326     return std::max(WaitStates, checkMAIHazards(MI));
    327 
    328   if (SIInstrInfo::isVMEM(*MI) ||
    329       SIInstrInfo::isFLAT(*MI) ||
    330       SIInstrInfo::isDS(*MI))
    331     return std::max(WaitStates, checkMAILdStHazards(MI));
    332 
    333   return WaitStates;
    334 }
    335 
    336 void GCNHazardRecognizer::EmitNoop() {
    337   EmittedInstrs.push_front(nullptr);
    338 }
    339 
    340 void GCNHazardRecognizer::AdvanceCycle() {
    341   // When the scheduler detects a stall, it will call AdvanceCycle() without
    342   // emitting any instructions.
    343   if (!CurrCycleInstr) {
    344     EmittedInstrs.push_front(nullptr);
    345     return;
    346   }
    347 
    348   // Do not track non-instructions which do not affect the wait states.
    349   // If included, these instructions can lead to buffer overflow such that
    350   // detectable hazards are missed.
    351   if (CurrCycleInstr->isMetaInstruction()) {
    352     CurrCycleInstr = nullptr;
    353     return;
    354   }
    355 
    356   if (CurrCycleInstr->isBundle()) {
    357     processBundle();
    358     return;
    359   }
    360 
    361   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
    362 
    363   // Keep track of emitted instructions
    364   EmittedInstrs.push_front(CurrCycleInstr);
    365 
    366   // Add a nullptr for each additional wait state after the first.  Make sure
    367   // not to add more than getMaxLookAhead() items to the list, since we
    368   // truncate the list to that size right after this loop.
    369   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
    370        i < e; ++i) {
    371     EmittedInstrs.push_front(nullptr);
    372   }
    373 
    374   // getMaxLookahead() is the largest number of wait states we will ever need
    375   // to insert, so there is no point in keeping track of more than that many
    376   // wait states.
    377   EmittedInstrs.resize(getMaxLookAhead());
    378 
    379   CurrCycleInstr = nullptr;
    380 }
    381 
    382 void GCNHazardRecognizer::RecedeCycle() {
    383   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
    384 }
    385 
    386 //===----------------------------------------------------------------------===//
    387 // Helper Functions
    388 //===----------------------------------------------------------------------===//
    389 
    390 typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
    391 
    392 // Returns a minimum wait states since \p I walking all predecessors.
    393 // Only scans until \p IsExpired does not return true.
    394 // Can only be run in a hazard recognizer mode.
    395 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
    396                               const MachineBasicBlock *MBB,
    397                               MachineBasicBlock::const_reverse_instr_iterator I,
    398                               int WaitStates, IsExpiredFn IsExpired,
    399                               DenseSet<const MachineBasicBlock *> &Visited) {
    400   for (auto E = MBB->instr_rend(); I != E; ++I) {
    401     // Don't add WaitStates for parent BUNDLE instructions.
    402     if (I->isBundle())
    403       continue;
    404 
    405     if (IsHazard(*I))
    406       return WaitStates;
    407 
    408     if (I->isInlineAsm() || I->isMetaInstruction())
    409       continue;
    410 
    411     WaitStates += SIInstrInfo::getNumWaitStates(*I);
    412 
    413     if (IsExpired(*I, WaitStates))
    414       return std::numeric_limits<int>::max();
    415   }
    416 
    417   int MinWaitStates = std::numeric_limits<int>::max();
    418   for (MachineBasicBlock *Pred : MBB->predecessors()) {
    419     if (!Visited.insert(Pred).second)
    420       continue;
    421 
    422     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
    423                                WaitStates, IsExpired, Visited);
    424 
    425     MinWaitStates = std::min(MinWaitStates, W);
    426   }
    427 
    428   return MinWaitStates;
    429 }
    430 
    431 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
    432                               const MachineInstr *MI, IsExpiredFn IsExpired) {
    433   DenseSet<const MachineBasicBlock *> Visited;
    434   return getWaitStatesSince(IsHazard, MI->getParent(),
    435                             std::next(MI->getReverseIterator()),
    436                             0, IsExpired, Visited);
    437 }
    438 
    439 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
    440   if (IsHazardRecognizerMode) {
    441     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
    442       return WaitStates >= Limit;
    443     };
    444     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
    445   }
    446 
    447   int WaitStates = 0;
    448   for (MachineInstr *MI : EmittedInstrs) {
    449     if (MI) {
    450       if (IsHazard(*MI))
    451         return WaitStates;
    452 
    453       if (MI->isInlineAsm())
    454         continue;
    455     }
    456     ++WaitStates;
    457 
    458     if (WaitStates >= Limit)
    459       break;
    460   }
    461   return std::numeric_limits<int>::max();
    462 }
    463 
    464 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
    465                                                IsHazardFn IsHazardDef,
    466                                                int Limit) {
    467   const SIRegisterInfo *TRI = ST.getRegisterInfo();
    468 
    469   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    470     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
    471   };
    472 
    473   return getWaitStatesSince(IsHazardFn, Limit);
    474 }
    475 
    476 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
    477                                                   int Limit) {
    478   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    479     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
    480   };
    481 
    482   return getWaitStatesSince(IsHazardFn, Limit);
    483 }
    484 
    485 //===----------------------------------------------------------------------===//
    486 // No-op Hazard Detection
    487 //===----------------------------------------------------------------------===//
    488 
    489 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
    490                         MCRegister Reg) {
    491   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    492     BV.set(*RUI);
    493 }
    494 
    495 static void addRegsToSet(const SIRegisterInfo &TRI,
    496                          iterator_range<MachineInstr::const_mop_iterator> Ops,
    497                          BitVector &Set) {
    498   for (const MachineOperand &Op : Ops) {
    499     if (Op.isReg())
    500       addRegUnits(TRI, Set, Op.getReg().asMCReg());
    501   }
    502 }
    503 
    504 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
    505   // XXX: Do we need to worry about implicit operands
    506   addRegsToSet(TRI, MI.defs(), ClauseDefs);
    507   addRegsToSet(TRI, MI.uses(), ClauseUses);
    508 }
    509 
    510 static bool breaksSMEMSoftClause(MachineInstr *MI) {
    511   return !SIInstrInfo::isSMRD(*MI);
    512 }
    513 
    514 static bool breaksVMEMSoftClause(MachineInstr *MI) {
    515   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
    516 }
    517 
    518 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
    519   // SMEM soft clause are only present on VI+, and only matter if xnack is
    520   // enabled.
    521   if (!ST.isXNACKEnabled())
    522     return 0;
    523 
    524   bool IsSMRD = TII.isSMRD(*MEM);
    525 
    526   resetClause();
    527 
    528   // A soft-clause is any group of consecutive SMEM instructions.  The
    529   // instructions in this group may return out of order and/or may be
    530   // replayed (i.e. the same instruction issued more than once).
    531   //
    532   // In order to handle these situations correctly we need to make sure that
    533   // when a clause has more than one instruction, no instruction in the clause
    534   // writes to a register that is read by another instruction in the clause
    535   // (including itself). If we encounter this situaion, we need to break the
    536   // clause by inserting a non SMEM instruction.
    537 
    538   for (MachineInstr *MI : EmittedInstrs) {
    539     // When we hit a non-SMEM instruction then we have passed the start of the
    540     // clause and we can stop.
    541     if (!MI)
    542       break;
    543 
    544     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
    545       break;
    546 
    547     addClauseInst(*MI);
    548   }
    549 
    550   if (ClauseDefs.none())
    551     return 0;
    552 
    553   // We need to make sure not to put loads and stores in the same clause if they
    554   // use the same address. For now, just start a new clause whenever we see a
    555   // store.
    556   if (MEM->mayStore())
    557     return 1;
    558 
    559   addClauseInst(*MEM);
    560 
    561   // If the set of defs and uses intersect then we cannot add this instruction
    562   // to the clause, so we have a hazard.
    563   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
    564 }
    565 
    566 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
    567   int WaitStatesNeeded = 0;
    568 
    569   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
    570 
    571   // This SMRD hazard only affects SI.
    572   if (!ST.hasSMRDReadVALUDefHazard())
    573     return WaitStatesNeeded;
    574 
    575   // A read of an SGPR by SMRD instruction requires 4 wait states when the
    576   // SGPR was written by a VALU instruction.
    577   int SmrdSgprWaitStates = 4;
    578   auto IsHazardDefFn = [this](const MachineInstr &MI) {
    579     return TII.isVALU(MI);
    580   };
    581   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    582     return TII.isSALU(MI);
    583   };
    584 
    585   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
    586 
    587   for (const MachineOperand &Use : SMRD->uses()) {
    588     if (!Use.isReg())
    589       continue;
    590     int WaitStatesNeededForUse =
    591         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
    592                                                    SmrdSgprWaitStates);
    593     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    594 
    595     // This fixes what appears to be undocumented hardware behavior in SI where
    596     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    597     // needs some number of nops in between. We don't know how many we need, but
    598     // let's use 4. This wasn't discovered before probably because the only
    599     // case when this happens is when we expand a 64-bit pointer into a full
    600     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    601     // probably never encountered in the closed-source land.
    602     if (IsBufferSMRD) {
    603       int WaitStatesNeededForUse =
    604         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
    605                                                    IsBufferHazardDefFn,
    606                                                    SmrdSgprWaitStates);
    607       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    608     }
    609   }
    610 
    611   return WaitStatesNeeded;
    612 }
    613 
    614 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
    615   if (!ST.hasVMEMReadSGPRVALUDefHazard())
    616     return 0;
    617 
    618   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
    619 
    620   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
    621   // SGPR was written by a VALU Instruction.
    622   const int VmemSgprWaitStates = 5;
    623   auto IsHazardDefFn = [this](const MachineInstr &MI) {
    624     return TII.isVALU(MI);
    625   };
    626   for (const MachineOperand &Use : VMEM->uses()) {
    627     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
    628       continue;
    629 
    630     int WaitStatesNeededForUse =
    631         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
    632                                                    VmemSgprWaitStates);
    633     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    634   }
    635   return WaitStatesNeeded;
    636 }
    637 
    638 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
    639   const SIRegisterInfo *TRI = ST.getRegisterInfo();
    640   const SIInstrInfo *TII = ST.getInstrInfo();
    641 
    642   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
    643   int DppVgprWaitStates = 2;
    644   int DppExecWaitStates = 5;
    645   int WaitStatesNeeded = 0;
    646   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    647     return TII->isVALU(MI);
    648   };
    649 
    650   for (const MachineOperand &Use : DPP->uses()) {
    651     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
    652       continue;
    653     int WaitStatesNeededForUse =
    654         DppVgprWaitStates - getWaitStatesSinceDef(
    655                                 Use.getReg(),
    656                                 [](const MachineInstr &) { return true; },
    657                                 DppVgprWaitStates);
    658     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    659   }
    660 
    661   WaitStatesNeeded = std::max(
    662       WaitStatesNeeded,
    663       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
    664                                                 DppExecWaitStates));
    665 
    666   return WaitStatesNeeded;
    667 }
    668 
    669 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
    670   const SIInstrInfo *TII = ST.getInstrInfo();
    671 
    672   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
    673   // instruction.
    674   const int DivFMasWaitStates = 4;
    675   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    676     return TII->isVALU(MI);
    677   };
    678   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
    679                                                DivFMasWaitStates);
    680 
    681   return DivFMasWaitStates - WaitStatesNeeded;
    682 }
    683 
    684 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
    685   const SIInstrInfo *TII = ST.getInstrInfo();
    686   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
    687 
    688   const int GetRegWaitStates = 2;
    689   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    690     return GetRegHWReg == getHWReg(TII, MI);
    691   };
    692   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
    693 
    694   return GetRegWaitStates - WaitStatesNeeded;
    695 }
    696 
    697 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
    698   const SIInstrInfo *TII = ST.getInstrInfo();
    699   unsigned HWReg = getHWReg(TII, *SetRegInstr);
    700 
    701   const int SetRegWaitStates = ST.getSetRegWaitStates();
    702   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    703     return HWReg == getHWReg(TII, MI);
    704   };
    705   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
    706   return SetRegWaitStates - WaitStatesNeeded;
    707 }
    708 
    709 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
    710   if (!MI.mayStore())
    711     return -1;
    712 
    713   const SIInstrInfo *TII = ST.getInstrInfo();
    714   unsigned Opcode = MI.getOpcode();
    715   const MCInstrDesc &Desc = MI.getDesc();
    716 
    717   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    718   int VDataRCID = -1;
    719   if (VDataIdx != -1)
    720     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
    721 
    722   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    723     // There is no hazard if the instruction does not use vector regs
    724     // (like wbinvl1)
    725     if (VDataIdx == -1)
    726       return -1;
    727     // For MUBUF/MTBUF instructions this hazard only exists if the
    728     // instruction is not using a register in the soffset field.
    729     const MachineOperand *SOffset =
    730         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    731     // If we have no soffset operand, then assume this field has been
    732     // hardcoded to zero.
    733     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
    734         (!SOffset || !SOffset->isReg()))
    735       return VDataIdx;
    736   }
    737 
    738   // MIMG instructions create a hazard if they don't use a 256-bit T# and
    739   // the store size is greater than 8 bytes and they have more than two bits
    740   // of their dmask set.
    741   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
    742   if (TII->isMIMG(MI)) {
    743     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    744     assert(SRsrcIdx != -1 &&
    745            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    746     (void)SRsrcIdx;
    747   }
    748 
    749   if (TII->isFLAT(MI)) {
    750     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    751     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
    752       return DataIdx;
    753   }
    754 
    755   return -1;
    756 }
    757 
    758 int
    759 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
    760                                             const MachineRegisterInfo &MRI) {
    761   // Helper to check for the hazard where VMEM instructions that store more than
    762   // 8 bytes can have there store data over written by the next instruction.
    763   const SIRegisterInfo *TRI = ST.getRegisterInfo();
    764 
    765   const int VALUWaitStates = 1;
    766   int WaitStatesNeeded = 0;
    767 
    768   if (!TRI->isVectorRegister(MRI, Def.getReg()))
    769     return WaitStatesNeeded;
    770   Register Reg = Def.getReg();
    771   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    772     int DataIdx = createsVALUHazard(MI);
    773     return DataIdx >= 0 &&
    774            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
    775   };
    776   int WaitStatesNeededForDef =
    777     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
    778   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    779 
    780   return WaitStatesNeeded;
    781 }
    782 
    783 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
    784   // This checks for the hazard where VMEM instructions that store more than
    785   // 8 bytes can have there store data over written by the next instruction.
    786   if (!ST.has12DWordStoreHazard())
    787     return 0;
    788 
    789   const MachineRegisterInfo &MRI = MF.getRegInfo();
    790   int WaitStatesNeeded = 0;
    791 
    792   for (const MachineOperand &Def : VALU->defs()) {
    793     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
    794   }
    795 
    796   return WaitStatesNeeded;
    797 }
    798 
    799 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
    800   // This checks for hazards associated with inline asm statements.
    801   // Since inline asms can contain just about anything, we use this
    802   // to call/leverage other check*Hazard routines. Note that
    803   // this function doesn't attempt to address all possible inline asm
    804   // hazards (good luck), but is a collection of what has been
    805   // problematic thus far.
    806 
    807   // see checkVALUHazards()
    808   if (!ST.has12DWordStoreHazard())
    809     return 0;
    810 
    811   const MachineRegisterInfo &MRI = MF.getRegInfo();
    812   int WaitStatesNeeded = 0;
    813 
    814   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
    815        I != E; ++I) {
    816     const MachineOperand &Op = IA->getOperand(I);
    817     if (Op.isReg() && Op.isDef()) {
    818       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    819     }
    820   }
    821 
    822   return WaitStatesNeeded;
    823 }
    824 
    825 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
    826   const SIInstrInfo *TII = ST.getInstrInfo();
    827   const SIRegisterInfo *TRI = ST.getRegisterInfo();
    828   const MachineRegisterInfo &MRI = MF.getRegInfo();
    829 
    830   const MachineOperand *LaneSelectOp =
    831       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
    832 
    833   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    834     return 0;
    835 
    836   Register LaneSelectReg = LaneSelectOp->getReg();
    837   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
    838 
    839   const int RWLaneWaitStates = 4;
    840   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
    841                                               RWLaneWaitStates);
    842   return RWLaneWaitStates - WaitStatesSince;
    843 }
    844 
    845 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
    846   if (!ST.hasRFEHazards())
    847     return 0;
    848 
    849   const SIInstrInfo *TII = ST.getInstrInfo();
    850 
    851   const int RFEWaitStates = 1;
    852 
    853   auto IsHazardFn = [TII](const MachineInstr &MI) {
    854     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
    855   };
    856   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
    857   return RFEWaitStates - WaitStatesNeeded;
    858 }
    859 
    860 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
    861   const SIInstrInfo *TII = ST.getInstrInfo();
    862   const int SMovRelWaitStates = 1;
    863   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
    864   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
    865                                                    SMovRelWaitStates);
    866 }
    867 
    868 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
    869   fixVMEMtoScalarWriteHazards(MI);
    870   fixVcmpxPermlaneHazards(MI);
    871   fixSMEMtoVectorWriteHazards(MI);
    872   fixVcmpxExecWARHazard(MI);
    873   fixLdsBranchVmemWARHazard(MI);
    874 }
    875 
    876 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
    877   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    878     return false;
    879 
    880   const SIInstrInfo *TII = ST.getInstrInfo();
    881   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };
    882 
    883   auto IsExpiredFn = [](const MachineInstr &MI, int) {
    884     unsigned Opc = MI.getOpcode();
    885     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
    886            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
    887   };
    888 
    889   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
    890       std::numeric_limits<int>::max())
    891     return false;
    892 
    893   // V_NOP will be discarded by SQ.
    894   // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
    895   // which is always a VGPR and available.
    896   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
    897   Register Reg = Src0->getReg();
    898   bool IsUndef = Src0->isUndef();
    899   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
    900           TII->get(AMDGPU::V_MOV_B32_e32))
    901     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
    902     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
    903 
    904   return true;
    905 }
    906 
    907 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
    908   if (!ST.hasVMEMtoScalarWriteHazard())
    909     return false;
    910 
    911   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    912     return false;
    913 
    914   if (MI->getNumDefs() == 0)
    915     return false;
    916 
    917   const SIRegisterInfo *TRI = ST.getRegisterInfo();
    918 
    919   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    920     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
    921         !SIInstrInfo::isFLAT(I))
    922       return false;
    923 
    924     for (const MachineOperand &Def : MI->defs()) {
    925       const MachineOperand *Op =
    926           I.findRegisterUseOperand(Def.getReg(), false, TRI);
    927       if (!Op)
    928         continue;
    929       return true;
    930     }
    931     return false;
    932   };
    933 
    934   auto IsExpiredFn = [](const MachineInstr &MI, int) {
    935     return SIInstrInfo::isVALU(MI) ||
    936            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
    937             !MI.getOperand(0).getImm()) ||
    938            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
    939             MI.getOperand(0).getImm() == 0xffe3);
    940   };
    941 
    942   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
    943       std::numeric_limits<int>::max())
    944     return false;
    945 
    946   const SIInstrInfo *TII = ST.getInstrInfo();
    947   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
    948           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
    949       .addImm(0xffe3);
    950   return true;
    951 }
    952 
    953 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
    954   if (!ST.hasSMEMtoVectorWriteHazard())
    955     return false;
    956 
    957   if (!SIInstrInfo::isVALU(*MI))
    958     return false;
    959 
    960   unsigned SDSTName;
    961   switch (MI->getOpcode()) {
    962   case AMDGPU::V_READLANE_B32:
    963   case AMDGPU::V_READFIRSTLANE_B32:
    964     SDSTName = AMDGPU::OpName::vdst;
    965     break;
    966   default:
    967     SDSTName = AMDGPU::OpName::sdst;
    968     break;
    969   }
    970 
    971   const SIInstrInfo *TII = ST.getInstrInfo();
    972   const SIRegisterInfo *TRI = ST.getRegisterInfo();
    973   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
    974   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
    975   if (!SDST) {
    976     for (const auto &MO : MI->implicit_operands()) {
    977       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
    978         SDST = &MO;
    979         break;
    980       }
    981     }
    982   }
    983 
    984   if (!SDST)
    985     return false;
    986 
    987   const Register SDSTReg = SDST->getReg();
    988   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    989     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
    990   };
    991 
    992   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    993     if (TII->isSALU(MI)) {
    994       switch (MI.getOpcode()) {
    995       case AMDGPU::S_SETVSKIP:
    996       case AMDGPU::S_VERSION:
    997       case AMDGPU::S_WAITCNT_VSCNT:
    998       case AMDGPU::S_WAITCNT_VMCNT:
    999       case AMDGPU::S_WAITCNT_EXPCNT:
   1000         // These instructions cannot not mitigate the hazard.
   1001         return false;
   1002       case AMDGPU::S_WAITCNT_LGKMCNT:
   1003         // Reducing lgkmcnt count to 0 always mitigates the hazard.
   1004         return (MI.getOperand(1).getImm() == 0) &&
   1005                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
   1006       case AMDGPU::S_WAITCNT: {
   1007         const int64_t Imm = MI.getOperand(0).getImm();
   1008         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
   1009         return (Decoded.LgkmCnt == 0);
   1010       }
   1011       default:
   1012         // SOPP instructions cannot mitigate the hazard.
   1013         if (TII->isSOPP(MI))
   1014           return false;
   1015         // At this point the SALU can be assumed to mitigate the hazard
   1016         // because either:
   1017         // (a) it is independent of the at risk SMEM (breaking chain),
   1018         // or
   1019         // (b) it is dependent on the SMEM, in which case an appropriate
   1020         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
   1021         //     SMEM instruction.
   1022         return true;
   1023       }
   1024     }
   1025     return false;
   1026   };
   1027 
   1028   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
   1029       std::numeric_limits<int>::max())
   1030     return false;
   1031 
   1032   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
   1033           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
   1034       .addImm(0);
   1035   return true;
   1036 }
   1037 
   1038 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
   1039   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
   1040     return false;
   1041 
   1042   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   1043   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
   1044     return false;
   1045 
   1046   auto IsHazardFn = [TRI](const MachineInstr &I) {
   1047     if (SIInstrInfo::isVALU(I))
   1048       return false;
   1049     return I.readsRegister(AMDGPU::EXEC, TRI);
   1050   };
   1051 
   1052   const SIInstrInfo *TII = ST.getInstrInfo();
   1053   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
   1054     if (SIInstrInfo::isVALU(MI)) {
   1055       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
   1056         return true;
   1057       for (auto MO : MI.implicit_operands())
   1058         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
   1059           return true;
   1060     }
   1061     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
   1062         (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
   1063       return true;
   1064     return false;
   1065   };
   1066 
   1067   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
   1068       std::numeric_limits<int>::max())
   1069     return false;
   1070 
   1071   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
   1072           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
   1073     .addImm(0xfffe);
   1074   return true;
   1075 }
   1076 
   1077 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
   1078   if (!ST.hasLdsBranchVmemWARHazard())
   1079     return false;
   1080 
   1081   auto IsHazardInst = [](const MachineInstr &MI) {
   1082     if (SIInstrInfo::isDS(MI))
   1083       return 1;
   1084     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
   1085       return 2;
   1086     return 0;
   1087   };
   1088 
   1089   auto InstType = IsHazardInst(*MI);
   1090   if (!InstType)
   1091     return false;
   1092 
   1093   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
   1094     return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
   1095                                I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
   1096                                !I.getOperand(1).getImm());
   1097   };
   1098 
   1099   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
   1100     if (!I.isBranch())
   1101       return false;
   1102 
   1103     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
   1104       auto InstType2 = IsHazardInst(I);
   1105       return InstType2 && InstType != InstType2;
   1106     };
   1107 
   1108     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
   1109       auto InstType2 = IsHazardInst(I);
   1110       if (InstType == InstType2)
   1111         return true;
   1112 
   1113       return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
   1114              I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
   1115              !I.getOperand(1).getImm();
   1116     };
   1117 
   1118     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
   1119            std::numeric_limits<int>::max();
   1120   };
   1121 
   1122   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
   1123       std::numeric_limits<int>::max())
   1124     return false;
   1125 
   1126   const SIInstrInfo *TII = ST.getInstrInfo();
   1127   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
   1128           TII->get(AMDGPU::S_WAITCNT_VSCNT))
   1129     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
   1130     .addImm(0);
   1131 
   1132   return true;
   1133 }
   1134 
   1135 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
   1136   int NSAtoVMEMWaitStates = 1;
   1137 
   1138   if (!ST.hasNSAtoVMEMBug())
   1139     return 0;
   1140 
   1141   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
   1142     return 0;
   1143 
   1144   const SIInstrInfo *TII = ST.getInstrInfo();
   1145   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
   1146   if (!Offset || (Offset->getImm() & 6) == 0)
   1147     return 0;
   1148 
   1149   auto IsHazardFn = [TII](const MachineInstr &I) {
   1150     if (!SIInstrInfo::isMIMG(I))
   1151       return false;
   1152     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
   1153     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
   1154            TII->getInstSizeInBytes(I) >= 16;
   1155   };
   1156 
   1157   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
   1158 }
   1159 
   1160 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
   1161   int FPAtomicToDenormModeWaitStates = 3;
   1162 
   1163   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
   1164     return 0;
   1165 
   1166   auto IsHazardFn = [](const MachineInstr &I) {
   1167     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
   1168       return false;
   1169     return SIInstrInfo::isFPAtomic(I);
   1170   };
   1171 
   1172   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
   1173     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
   1174       return true;
   1175 
   1176     switch (MI.getOpcode()) {
   1177     case AMDGPU::S_WAITCNT:
   1178     case AMDGPU::S_WAITCNT_VSCNT:
   1179     case AMDGPU::S_WAITCNT_VMCNT:
   1180     case AMDGPU::S_WAITCNT_EXPCNT:
   1181     case AMDGPU::S_WAITCNT_LGKMCNT:
   1182     case AMDGPU::S_WAIT_IDLE:
   1183       return true;
   1184     default:
   1185       break;
   1186     }
   1187 
   1188     return false;
   1189   };
   1190 
   1191   return FPAtomicToDenormModeWaitStates -
   1192          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
   1193 }
   1194 
   1195 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
   1196   assert(SIInstrInfo::isMAI(*MI));
   1197 
   1198   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
   1199 }
   1200 
   1201 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
   1202   int WaitStatesNeeded = 0;
   1203   unsigned Opc = MI->getOpcode();
   1204 
   1205   auto IsVALUFn = [](const MachineInstr &MI) {
   1206     return SIInstrInfo::isVALU(MI);
   1207   };
   1208 
   1209   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
   1210     const int LegacyVALUWritesVGPRWaitStates = 2;
   1211     const int VALUWritesExecWaitStates = 4;
   1212     const int MaxWaitStates = 4;
   1213 
   1214     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
   1215       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
   1216     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1217 
   1218     if (WaitStatesNeeded < MaxWaitStates) {
   1219       for (const MachineOperand &Use : MI->explicit_uses()) {
   1220         const int MaxWaitStates = 2;
   1221 
   1222         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
   1223           continue;
   1224 
   1225         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
   1226           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
   1227         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1228 
   1229         if (WaitStatesNeeded == MaxWaitStates)
   1230           break;
   1231       }
   1232     }
   1233   }
   1234 
   1235   auto IsMFMAFn = [](const MachineInstr &MI) {
   1236     return SIInstrInfo::isMAI(MI) &&
   1237            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
   1238            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
   1239   };
   1240 
   1241   for (const MachineOperand &Op : MI->explicit_operands()) {
   1242     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
   1243       continue;
   1244 
   1245     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
   1246       continue;
   1247 
   1248     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
   1249     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
   1250     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
   1251     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
   1252     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
   1253     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
   1254     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
   1255     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
   1256     const int MaxWaitStates = 18;
   1257     Register Reg = Op.getReg();
   1258     unsigned HazardDefLatency = 0;
   1259 
   1260     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
   1261                                this](const MachineInstr &MI) {
   1262       if (!IsMFMAFn(MI))
   1263         return false;
   1264       Register DstReg = MI.getOperand(0).getReg();
   1265       if (DstReg == Reg)
   1266         return false;
   1267       HazardDefLatency =
   1268           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
   1269       return TRI.regsOverlap(DstReg, Reg);
   1270     };
   1271 
   1272     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
   1273                                                    MaxWaitStates);
   1274     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
   1275     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
   1276     int OpNo = MI->getOperandNo(&Op);
   1277     if (OpNo == SrcCIdx) {
   1278       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
   1279     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
   1280       switch (HazardDefLatency) {
   1281       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
   1282                break;
   1283       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
   1284                break;
   1285       case 16: LLVM_FALLTHROUGH;
   1286       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
   1287                break;
   1288       }
   1289     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
   1290       switch (HazardDefLatency) {
   1291       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
   1292                break;
   1293       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
   1294                break;
   1295       case 16: LLVM_FALLTHROUGH;
   1296       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
   1297                break;
   1298       }
   1299     }
   1300 
   1301     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
   1302     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1303 
   1304     if (WaitStatesNeeded == MaxWaitStates)
   1305       return WaitStatesNeeded; // Early exit.
   1306 
   1307     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
   1308       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
   1309         return false;
   1310       Register DstReg = MI.getOperand(0).getReg();
   1311       return TRI.regsOverlap(Reg, DstReg);
   1312     };
   1313 
   1314     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
   1315     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
   1316     const int AccVGPRWriteAccVgprReadWaitStates = 3;
   1317     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
   1318     if (OpNo == SrcCIdx)
   1319       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
   1320     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
   1321       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
   1322 
   1323     WaitStatesNeededForUse = NeedWaitStates -
   1324       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
   1325     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1326 
   1327     if (WaitStatesNeeded == MaxWaitStates)
   1328       return WaitStatesNeeded; // Early exit.
   1329   }
   1330 
   1331   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
   1332     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
   1333     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
   1334     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
   1335     const int MaxWaitStates = 13;
   1336     Register DstReg = MI->getOperand(0).getReg();
   1337     unsigned HazardDefLatency = 0;
   1338 
   1339     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
   1340                          this](const MachineInstr &MI) {
   1341       if (!IsMFMAFn(MI))
   1342         return false;
   1343       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
   1344       HazardDefLatency =
   1345           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
   1346       return TRI.regsOverlap(Reg, DstReg);
   1347     };
   1348 
   1349     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
   1350     int NeedWaitStates;
   1351     switch (HazardDefLatency) {
   1352     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
   1353              break;
   1354     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
   1355              break;
   1356     case 16: LLVM_FALLTHROUGH;
   1357     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
   1358              break;
   1359     }
   1360 
   1361     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
   1362     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1363   }
   1364 
   1365   return WaitStatesNeeded;
   1366 }
   1367 
   1368 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
   1369   int WaitStatesNeeded = 0;
   1370   unsigned Opc = MI->getOpcode();
   1371 
   1372   auto IsMFMAFn = [](const MachineInstr &MI) {
   1373     return SIInstrInfo::isMAI(MI) &&
   1374            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
   1375            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
   1376   };
   1377 
   1378   auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
   1379     return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
   1380   };
   1381 
   1382   auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
   1383     return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
   1384   };
   1385 
   1386   if (!IsMFMAFn(*MI))
   1387     return WaitStatesNeeded;
   1388 
   1389   const int VALUWritesExecWaitStates = 4;
   1390   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
   1391     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
   1392                           VALUWritesExecWaitStates);
   1393   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1394 
   1395   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
   1396 
   1397   // Loop for both DGEMM and S/HGEMM 2nd instruction.
   1398   for (const MachineOperand &Use : MI->explicit_uses()) {
   1399     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
   1400     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
   1401     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
   1402     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
   1403     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
   1404     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
   1405     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
   1406     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
   1407     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
   1408     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
   1409     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
   1410     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
   1411     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
   1412     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
   1413     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
   1414     const int MaxWaitStates = 19;
   1415 
   1416     if (!Use.isReg())
   1417       continue;
   1418     unsigned Reg = Use.getReg();
   1419     bool FullReg;
   1420     const MachineInstr *MI1;
   1421 
   1422     auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
   1423                                      this](const MachineInstr &MI) {
   1424       if (!IsMFMAFn(MI))
   1425         return false;
   1426       if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
   1427         return false;
   1428       Register DstReg = MI.getOperand(0).getReg();
   1429       FullReg = (DstReg == Reg);
   1430       MI1 = &MI;
   1431       return TRI.regsOverlap(DstReg, Reg);
   1432     };
   1433 
   1434     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
   1435       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
   1436     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1437 
   1438     int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn,
   1439                                               MaxWaitStates);
   1440     if (NumWaitStates == std::numeric_limits<int>::max())
   1441       continue;
   1442 
   1443     int OpNo = MI->getOperandNo(&Use);
   1444     unsigned Opc1 = MI1->getOpcode();
   1445     int NeedWaitStates = 0;
   1446     if (OpNo == SrcCIdx) {
   1447       if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
   1448         NeedWaitStates = 0;
   1449       } else if (FullReg) {
   1450         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
   1451              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
   1452             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
   1453              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
   1454           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
   1455       } else {
   1456         switch (Opc1) {
   1457         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
   1458         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
   1459           if (!isXDL(ST, *MI))
   1460             NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
   1461           break;
   1462         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
   1463         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
   1464           if (!isXDL(ST, *MI))
   1465             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
   1466           break;
   1467         default:
   1468           switch (TSchedModel.computeInstrLatency(MI1)) {
   1469           case 2:
   1470             NeedWaitStates = isDGEMM(Opc)
   1471               ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
   1472               : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
   1473             break;
   1474           case 8:
   1475             NeedWaitStates = isDGEMM(Opc)
   1476               ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
   1477               : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
   1478             break;
   1479           case 16: LLVM_FALLTHROUGH;
   1480           default:
   1481             NeedWaitStates = isDGEMM(Opc)
   1482               ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
   1483               : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
   1484           }
   1485         }
   1486       }
   1487     } else {
   1488       switch (Opc1) {
   1489       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
   1490       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
   1491         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
   1492         break;
   1493       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
   1494       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
   1495         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
   1496         break;
   1497       default:
   1498         switch (TSchedModel.computeInstrLatency(MI1)) {
   1499         case 2:
   1500           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
   1501           break;
   1502         case 8:
   1503           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
   1504           break;
   1505         case 16: LLVM_FALLTHROUGH;
   1506         default:
   1507           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
   1508         }
   1509       }
   1510     }
   1511     if (WaitStatesNeeded >= NeedWaitStates)
   1512       continue;
   1513 
   1514     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
   1515     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1516 
   1517     if (WaitStatesNeeded == MaxWaitStates)
   1518       break;
   1519   }
   1520 
   1521   return WaitStatesNeeded;
   1522 }
   1523 
   1524 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
   1525   // On gfx90a+ releveant hazards are checked in checkMAIVALUHazards()
   1526   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
   1527     return 0;
   1528 
   1529   int WaitStatesNeeded = 0;
   1530 
   1531   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
   1532     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
   1533   };
   1534 
   1535   for (const MachineOperand &Op : MI->explicit_uses()) {
   1536     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
   1537       continue;
   1538 
   1539     Register Reg = Op.getReg();
   1540 
   1541     const int AccVgprReadLdStWaitStates = 2;
   1542     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
   1543     const int MaxWaitStates = 2;
   1544 
   1545     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
   1546       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
   1547     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1548 
   1549     if (WaitStatesNeeded == MaxWaitStates)
   1550       return WaitStatesNeeded; // Early exit.
   1551 
   1552     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
   1553       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
   1554           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
   1555         return false;
   1556       auto IsVALUFn = [](const MachineInstr &MI) {
   1557         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
   1558       };
   1559       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
   1560              std::numeric_limits<int>::max();
   1561     };
   1562 
   1563     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
   1564       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
   1565     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1566   }
   1567 
   1568   return WaitStatesNeeded;
   1569 }
   1570 
   1571 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
   1572   if (!ST.hasGFX90AInsts())
   1573     return 0;
   1574 
   1575   auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
   1576     return SIInstrInfo::isMAI(MI) &&
   1577            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
   1578            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
   1579   };
   1580 
   1581   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
   1582     return isDGEMM(MI.getOpcode());
   1583   };
   1584 
   1585   // This is checked in checkMAIHazards90A()
   1586   if (IsMFMAFn(*MI))
   1587     return 0;
   1588 
   1589   int WaitStatesNeeded = 0;
   1590 
   1591   bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
   1592                        SIInstrInfo::isFLAT(*MI) ||
   1593                        SIInstrInfo::isDS(*MI) ||
   1594                        SIInstrInfo::isEXP(*MI);
   1595   bool IsVALU = SIInstrInfo::isVALU(*MI);
   1596 
   1597   const MachineInstr *MFMA = nullptr;
   1598   unsigned Reg;
   1599   auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA,
   1600                               this](const MachineInstr &MI) {
   1601     if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
   1602       return false;
   1603     if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
   1604       return false;
   1605     MFMA = &MI;
   1606     return true;
   1607   };
   1608 
   1609   const MachineInstr *DOT = nullptr;
   1610   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
   1611     if (!SIInstrInfo::isDOT(MI) ||
   1612         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
   1613       return false;
   1614     DOT = &MI;
   1615     return true;
   1616   };
   1617 
   1618   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
   1619                                            AMDGPU::OpName::src2);
   1620 
   1621   if (IsMemOrExport || IsVALU) {
   1622     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
   1623     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
   1624     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
   1625     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
   1626     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
   1627     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
   1628     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
   1629     const int DotWriteSameDotReadSrcAB = 3;
   1630     const int DotWriteDifferentVALURead = 3;
   1631     const int MaxWaitStates = 19;
   1632 
   1633     for (const MachineOperand &Use : MI->explicit_uses()) {
   1634       if (!Use.isReg())
   1635         continue;
   1636       Reg = Use.getReg();
   1637 
   1638       DOT = nullptr;
   1639       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
   1640                                                      MaxWaitStates);
   1641       if (DOT) {
   1642         int NeedWaitStates = 0;
   1643         if (DOT->getOpcode() == MI->getOpcode()) {
   1644           if (&Use - &MI->getOperand(0) != SrcCIdx)
   1645             NeedWaitStates = DotWriteSameDotReadSrcAB;
   1646         } else {
   1647           NeedWaitStates = DotWriteDifferentVALURead;
   1648         }
   1649 
   1650         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
   1651         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1652       }
   1653 
   1654       MFMA = nullptr;
   1655       WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
   1656                                                  MaxWaitStates);
   1657       if (!MFMA)
   1658         continue;
   1659 
   1660       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
   1661       int NeedWaitStates = MaxWaitStates;
   1662       switch (HazardDefLatency) {
   1663       case 2:
   1664         NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
   1665         break;
   1666       case 4:
   1667         assert(isDGEMM(MFMA->getOpcode()));
   1668         NeedWaitStates =
   1669             IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
   1670                           : DMFMA4x4WriteVgprVALUReadWaitStates;
   1671         break;
   1672       case 8:
   1673         NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
   1674         break;
   1675       case 16: LLVM_FALLTHROUGH;
   1676       default:
   1677         NeedWaitStates =
   1678           isDGEMM(MFMA->getOpcode())
   1679             ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
   1680                             : DMFMA16x16WriteVgprVALUReadWaitStates
   1681             : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
   1682         break;
   1683       }
   1684 
   1685       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
   1686       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1687 
   1688       if (WaitStatesNeeded == MaxWaitStates)
   1689         break;
   1690     }
   1691   }
   1692 
   1693   unsigned Opc = MI->getOpcode();
   1694   const int DMFMAToFMA64WaitStates = 2;
   1695   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
   1696        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
   1697        Opc == AMDGPU::V_FMAC_F64_dpp) &&
   1698       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
   1699     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
   1700       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
   1701     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1702   }
   1703 
   1704   if (!IsVALU && !IsMemOrExport)
   1705     return WaitStatesNeeded;
   1706 
   1707   for (const MachineOperand &Def : MI->defs()) {
   1708     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
   1709     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
   1710     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
   1711     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
   1712     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
   1713     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
   1714     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
   1715     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
   1716     const int DotWriteDifferentVALUWrite = 3;
   1717     const int MaxWaitStates = 19;
   1718     const int MaxWarWaitStates = 15;
   1719 
   1720     Reg = Def.getReg();
   1721 
   1722     DOT = nullptr;
   1723     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
   1724                                                    MaxWaitStates);
   1725     if (DOT && DOT->getOpcode() != MI->getOpcode())
   1726       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
   1727                                                     WaitStatesSinceDef);
   1728 
   1729     MFMA = nullptr;
   1730     WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
   1731                                                MaxWaitStates);
   1732     if (MFMA) {
   1733       int NeedWaitStates = MaxWaitStates;
   1734       switch (TSchedModel.computeInstrLatency(MFMA)) {
   1735       case 2:
   1736         NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
   1737         break;
   1738       case 4:
   1739         assert(isDGEMM(MFMA->getOpcode()));
   1740         NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
   1741         break;
   1742       case 8:
   1743         NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
   1744         break;
   1745       case 16: LLVM_FALLTHROUGH;
   1746       default:
   1747         NeedWaitStates = isDGEMM(MFMA->getOpcode())
   1748                    ? DMFMA16x16WriteVgprVALUWriteWaitStates
   1749                    : SMFMA32x32WriteVgprVALUWawWaitStates;
   1750         break;
   1751       }
   1752 
   1753       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
   1754       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1755 
   1756       if (WaitStatesNeeded == MaxWaitStates)
   1757         break;
   1758     }
   1759 
   1760     auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
   1761                              this](const MachineInstr &MI) {
   1762       if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
   1763           !MI.readsRegister(Reg, &TRI))
   1764         return false;
   1765 
   1766       const MachineOperand *SrcC =
   1767           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
   1768       assert(SrcC);
   1769       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
   1770         return false;
   1771 
   1772       MFMA = &MI;
   1773       return true;
   1774     };
   1775 
   1776     MFMA = nullptr;
   1777     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
   1778                                                 MaxWarWaitStates);
   1779     if (!MFMA)
   1780       continue;
   1781 
   1782     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
   1783     int NeedWaitStates = MaxWaitStates;
   1784     switch (HazardDefLatency) {
   1785     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
   1786              break;
   1787     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
   1788              break;
   1789     case 16: LLVM_FALLTHROUGH;
   1790     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
   1791              break;
   1792     }
   1793 
   1794     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
   1795     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   1796   }
   1797 
   1798   return WaitStatesNeeded;
   1799 }
   1800 
   1801 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
   1802   if (!SU->isInstr())
   1803     return false;
   1804 
   1805   const MachineInstr *MAI = nullptr;
   1806   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
   1807     MAI = nullptr;
   1808     if (SIInstrInfo::isMAI(MI) &&
   1809         MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
   1810         MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
   1811       MAI = &MI;
   1812     return MAI != nullptr;
   1813   };
   1814 
   1815   MachineInstr *MI = SU->getInstr();
   1816   if (IsMFMAFn(*MI)) {
   1817     int W = getWaitStatesSince(IsMFMAFn, 16);
   1818     if (MAI)
   1819       return W < (int)TSchedModel.computeInstrLatency(MAI);
   1820   }
   1821 
   1822   return false;
   1823 }
   1824