Home | History | Annotate | Line # | Download | only in AMDGPU
      1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 /// \file
     10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
     11 /// for pixel shaders, and strict whole wavefront mode for all programs.
     12 ///
     13 /// The "strict" prefix indicates that inactive lanes do not take part in
     14 /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
     15 /// always be enabled irrespective of control flow decisions. Conversely in
     16 /// non-strict WQM inactive lanes may take part in control flow decisions.
     17 ///
     18 /// Whole quad mode is required for derivative computations, but it interferes
     19 /// with shader side effects (stores and atomics). This pass ensures that WQM
     20 /// is enabled when necessary, but disabled around stores and atomics.
     21 ///
     22 /// When necessary, this pass creates a function prolog
     23 ///
     24 ///   S_MOV_B64 LiveMask, EXEC
     25 ///   S_WQM_B64 EXEC, EXEC
     26 ///
     27 /// to enter WQM at the top of the function and surrounds blocks of Exact
     28 /// instructions by
     29 ///
     30 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
     31 ///   ...
     32 ///   S_MOV_B64 EXEC, Tmp
     33 ///
     34 /// We also compute when a sequence of instructions requires strict whole
     35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
     36 ///
     37 ///   S_OR_SAVEEXEC_B64 Tmp, -1
     38 ///   ...
     39 ///   S_MOV_B64 EXEC, Tmp
     40 ///
     41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
     42 /// we use a similar save and restore mechanism and force whole quad mode for
     43 /// those instructions:
     44 ///
     45 ///  S_MOV_B64 Tmp, EXEC
     46 ///  S_WQM_B64 EXEC, EXEC
     47 ///  ...
     48 ///  S_MOV_B64 EXEC, Tmp
     49 ///
     50 /// In order to avoid excessive switching during sequences of Exact
     51 /// instructions, the pass first analyzes which instructions must be run in WQM
     52 /// (aka which instructions produce values that lead to derivative
     53 /// computations).
     54 ///
     55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
     56 ///
     57 /// There is room for improvement given better control flow analysis:
     58 ///
     59 ///  (1) at the top level (outside of control flow statements, and as long as
     60 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
     61 ///      the LiveMask (this is implemented for the entry block).
     62 ///
     63 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
     64 ///      consist of exact and don't-care instructions, the switch only has to
     65 ///      be done at the entry and exit points rather than potentially in each
     66 ///      block of the region.
     67 ///
     68 //===----------------------------------------------------------------------===//
     69 
     70 #include "AMDGPU.h"
     71 #include "GCNSubtarget.h"
     72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
     73 #include "llvm/ADT/MapVector.h"
     74 #include "llvm/ADT/PostOrderIterator.h"
     75 #include "llvm/CodeGen/LiveIntervals.h"
     76 #include "llvm/CodeGen/MachineBasicBlock.h"
     77 #include "llvm/CodeGen/MachineDominators.h"
     78 #include "llvm/CodeGen/MachineFunctionPass.h"
     79 #include "llvm/CodeGen/MachineInstr.h"
     80 #include "llvm/CodeGen/MachinePostDominators.h"
     81 #include "llvm/IR/CallingConv.h"
     82 #include "llvm/InitializePasses.h"
     83 #include "llvm/Support/raw_ostream.h"
     84 
     85 using namespace llvm;
     86 
     87 #define DEBUG_TYPE "si-wqm"
     88 
     89 namespace {
     90 
/// Execution states tracked by this pass. Every state owns a distinct bit so
/// that a set of states fits into a single small bitmask.
enum {
  StateWQM = 1 << 0,
  StateStrictWWM = 1 << 1,
  StateStrictWQM = 1 << 2,
  StateExact = 1 << 3,
  /// Mask covering both strict modes.
  StateStrict = StateStrictWQM | StateStrictWWM,
};
     98 
/// Wraps a state bitmask so that streaming it selects the state-name printer
/// instead of printing a bare integer.
struct PrintState {
  int State;

  explicit PrintState(int State) : State{State} {}
};
    105 
    106 #ifndef NDEBUG
    107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
    108 
    109   static const std::pair<char, const char *> Mapping[] = {
    110       std::make_pair(StateWQM, "WQM"),
    111       std::make_pair(StateStrictWWM, "StrictWWM"),
    112       std::make_pair(StateStrictWQM, "StrictWQM"),
    113       std::make_pair(StateExact, "Exact")};
    114   char State = PS.State;
    115   for (auto M : Mapping) {
    116     if (State & M.first) {
    117       OS << M.second;
    118       State &= ~M.first;
    119 
    120       if (State)
    121         OS << '|';
    122     }
    123   }
    124   assert(State == 0);
    125   return OS;
    126 }
    127 #endif
    128 
/// Per-instruction analysis record. Each field is a bitmask of State* values.
struct InstrInfo {
  /// States this instruction has been marked as requiring.
  char Needs{0};
  /// States that are stripped from any request made of this instruction.
  char Disabled{0};
  /// States required after this instruction, accumulated from the needs of
  /// the instructions that follow it and from the block's exit needs.
  char OutNeeds{0};
};
    134 
/// Per-block analysis record. The char fields are bitmasks of State* values.
struct BlockInfo {
  /// States required by some instruction inside the block.
  char Needs{0};
  /// States required on entry to the block.
  char InNeeds{0};
  /// States required on exit from the block.
  char OutNeeds{0};
  /// NOTE(review): not written in the visible part of this file; presumably
  /// the state the block starts in -- confirm against processBlock.
  char InitialState{0};
  /// Set when the block contains a kill/demote pseudo that must be lowered.
  bool NeedsLowering{false};
};
    142 
    143 struct WorkItem {
    144   MachineBasicBlock *MBB = nullptr;
    145   MachineInstr *MI = nullptr;
    146 
    147   WorkItem() = default;
    148   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
    149   WorkItem(MachineInstr *MI) : MI(MI) {}
    150 };
    151 
    152 class SIWholeQuadMode : public MachineFunctionPass {
    153 private:
    154   const SIInstrInfo *TII;
    155   const SIRegisterInfo *TRI;
    156   const GCNSubtarget *ST;
    157   MachineRegisterInfo *MRI;
    158   LiveIntervals *LIS;
    159   MachineDominatorTree *MDT;
    160   MachinePostDominatorTree *PDT;
    161 
    162   unsigned AndOpc;
    163   unsigned AndN2Opc;
    164   unsigned XorOpc;
    165   unsigned AndSaveExecOpc;
    166   unsigned OrSaveExecOpc;
    167   unsigned WQMOpc;
    168   Register Exec;
    169   Register LiveMaskReg;
    170 
    171   DenseMap<const MachineInstr *, InstrInfo> Instructions;
    172   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
    173 
    174   // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
    175   DenseMap<const MachineInstr *, char> StateTransition;
    176 
    177   SmallVector<MachineInstr *, 2> LiveMaskQueries;
    178   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
    179   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
    180   SmallVector<MachineInstr *, 4> KillInstrs;
    181 
    182   void printInfo();
    183 
    184   void markInstruction(MachineInstr &MI, char Flag,
    185                        std::vector<WorkItem> &Worklist);
    186   void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
    187                 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
    188   void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
    189                    std::vector<WorkItem> &Worklist);
    190   void markInstructionUses(const MachineInstr &MI, char Flag,
    191                            std::vector<WorkItem> &Worklist);
    192   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
    193   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
    194   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
    195   char analyzeFunction(MachineFunction &MF);
    196 
    197   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
    198                                       MachineBasicBlock::iterator Before);
    199   MachineBasicBlock::iterator
    200   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    201                    MachineBasicBlock::iterator Last, bool PreferLast,
    202                    bool SaveSCC);
    203   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
    204                Register SaveWQM);
    205   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
    206              Register SavedWQM);
    207   void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
    208                     Register SaveOrig, char StrictStateNeeded);
    209   void fromStrictMode(MachineBasicBlock &MBB,
    210                       MachineBasicBlock::iterator Before, Register SavedOrig,
    211                       char NonStrictState, char CurrentStrictState);
    212 
    213   MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
    214 
    215   MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
    216                             bool IsWQM);
    217   MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
    218 
    219   void lowerBlock(MachineBasicBlock &MBB);
    220   void processBlock(MachineBasicBlock &MBB, bool IsEntry);
    221 
    222   void lowerLiveMaskQueries();
    223   void lowerCopyInstrs();
    224   void lowerKillInstrs(bool IsWQM);
    225 
    226 public:
    227   static char ID;
    228 
    229   SIWholeQuadMode() :
    230     MachineFunctionPass(ID) { }
    231 
    232   bool runOnMachineFunction(MachineFunction &MF) override;
    233 
    234   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
    235 
    236   void getAnalysisUsage(AnalysisUsage &AU) const override {
    237     AU.addRequired<LiveIntervals>();
    238     AU.addPreserved<SlotIndexes>();
    239     AU.addPreserved<LiveIntervals>();
    240     AU.addRequired<MachineDominatorTree>();
    241     AU.addPreserved<MachineDominatorTree>();
    242     AU.addRequired<MachinePostDominatorTree>();
    243     AU.addPreserved<MachinePostDominatorTree>();
    244     MachineFunctionPass::getAnalysisUsage(AU);
    245   }
    246 
    247   MachineFunctionProperties getClearedProperties() const override {
    248     return MachineFunctionProperties().set(
    249         MachineFunctionProperties::Property::IsSSA);
    250   }
    251 };
    252 
    253 } // end anonymous namespace
    254 
    255 char SIWholeQuadMode::ID = 0;
    256 
    257 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
    258                       false)
    259 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
    260 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
    261 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
    262 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
    263                     false)
    264 
    265 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
    266 
    267 FunctionPass *llvm::createSIWholeQuadModePass() {
    268   return new SIWholeQuadMode;
    269 }
    270 
    271 #ifndef NDEBUG
    272 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
    273   for (const auto &BII : Blocks) {
    274     dbgs() << "\n"
    275            << printMBBReference(*BII.first) << ":\n"
    276            << "  InNeeds = " << PrintState(BII.second.InNeeds)
    277            << ", Needs = " << PrintState(BII.second.Needs)
    278            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
    279 
    280     for (const MachineInstr &MI : *BII.first) {
    281       auto III = Instructions.find(&MI);
    282       if (III == Instructions.end())
    283         continue;
    284 
    285       dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
    286              << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    287     }
    288   }
    289 }
    290 #endif
    291 
    292 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
    293                                       std::vector<WorkItem> &Worklist) {
    294   InstrInfo &II = Instructions[&MI];
    295 
    296   assert(!(Flag & StateExact) && Flag != 0);
    297 
    298   // Remove any disabled states from the flag. The user that required it gets
    299   // an undefined value in the helper lanes. For example, this can happen if
    300   // the result of an atomic is used by instruction that requires WQM, where
    301   // ignoring the request for WQM is correct as per the relevant specs.
    302   Flag &= ~II.Disabled;
    303 
    304   // Ignore if the flag is already encompassed by the existing needs, or we
    305   // just disabled everything.
    306   if ((II.Needs & Flag) == Flag)
    307     return;
    308 
    309   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
    310   II.Needs |= Flag;
    311   Worklist.push_back(&MI);
    312 }
    313 
    314 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
    315 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
    316                                Register Reg, unsigned SubReg, char Flag,
    317                                std::vector<WorkItem> &Worklist) {
    318   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
    319 
    320   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
    321   const VNInfo *Value = UseLRQ.valueIn();
    322   if (!Value)
    323     return;
    324 
    325   // Note: this code assumes that lane masks on AMDGPU completely
    326   // cover registers.
    327   const LaneBitmask UseLanes =
    328       SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
    329              : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
    330                                 : LaneBitmask::getNone());
    331 
    332   // Perform a depth-first iteration of the LiveRange graph marking defs.
    333   // Stop processing of a given branch when all use lanes have been defined.
    334   // The first definition stops processing for a physical register.
    335   struct PhiEntry {
    336     const VNInfo *Phi;
    337     unsigned PredIdx;
    338     LaneBitmask DefinedLanes;
    339 
    340     PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
    341         : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
    342   };
    343   using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
    344   SmallVector<PhiEntry, 2> PhiStack;
    345   SmallSet<VisitKey, 4> Visited;
    346   LaneBitmask DefinedLanes;
    347   unsigned NextPredIdx = 0; // Only used for processing phi nodes
    348   do {
    349     const VNInfo *NextValue = nullptr;
    350     const VisitKey Key(Value, DefinedLanes);
    351 
    352     if (!Visited.count(Key)) {
    353       Visited.insert(Key);
    354       // On first visit to a phi then start processing first predecessor
    355       NextPredIdx = 0;
    356     }
    357 
    358     if (Value->isPHIDef()) {
    359       // Each predecessor node in the phi must be processed as a subgraph
    360       const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
    361       assert(MBB && "Phi-def has no defining MBB");
    362 
    363       // Find next predecessor to process
    364       unsigned Idx = NextPredIdx;
    365       auto PI = MBB->pred_begin() + Idx;
    366       auto PE = MBB->pred_end();
    367       for (; PI != PE && !NextValue; ++PI, ++Idx) {
    368         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
    369           if (!Visited.count(VisitKey(VN, DefinedLanes)))
    370             NextValue = VN;
    371         }
    372       }
    373 
    374       // If there are more predecessors to process; add phi to stack
    375       if (PI != PE)
    376         PhiStack.emplace_back(Value, Idx, DefinedLanes);
    377     } else {
    378       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
    379       assert(MI && "Def has no defining instruction");
    380 
    381       if (Reg.isVirtual()) {
    382         // Iterate over all operands to find relevant definitions
    383         bool HasDef = false;
    384         for (const MachineOperand &Op : MI->operands()) {
    385           if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
    386             continue;
    387 
    388           // Compute lanes defined and overlap with use
    389           LaneBitmask OpLanes =
    390               Op.isUndef() ? LaneBitmask::getAll()
    391                            : TRI->getSubRegIndexLaneMask(Op.getSubReg());
    392           LaneBitmask Overlap = (UseLanes & OpLanes);
    393 
    394           // Record if this instruction defined any of use
    395           HasDef |= Overlap.any();
    396 
    397           // Mark any lanes defined
    398           DefinedLanes |= OpLanes;
    399         }
    400 
    401         // Check if all lanes of use have been defined
    402         if ((DefinedLanes & UseLanes) != UseLanes) {
    403           // Definition not complete; need to process input value
    404           LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
    405           if (const VNInfo *VN = LRQ.valueIn()) {
    406             if (!Visited.count(VisitKey(VN, DefinedLanes)))
    407               NextValue = VN;
    408           }
    409         }
    410 
    411         // Only mark the instruction if it defines some part of the use
    412         if (HasDef)
    413           markInstruction(*MI, Flag, Worklist);
    414       } else {
    415         // For physical registers simply mark the defining instruction
    416         markInstruction(*MI, Flag, Worklist);
    417       }
    418     }
    419 
    420     if (!NextValue && !PhiStack.empty()) {
    421       // Reach end of chain; revert to processing last phi
    422       PhiEntry &Entry = PhiStack.back();
    423       NextValue = Entry.Phi;
    424       NextPredIdx = Entry.PredIdx;
    425       DefinedLanes = Entry.DefinedLanes;
    426       PhiStack.pop_back();
    427     }
    428 
    429     Value = NextValue;
    430   } while (Value);
    431 }
    432 
    433 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
    434                                   const MachineOperand &Op, char Flag,
    435                                   std::vector<WorkItem> &Worklist) {
    436   assert(Op.isReg());
    437   Register Reg = Op.getReg();
    438 
    439   // Ignore some hardware registers
    440   switch (Reg) {
    441   case AMDGPU::EXEC:
    442   case AMDGPU::EXEC_LO:
    443     return;
    444   default:
    445     break;
    446   }
    447 
    448   LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
    449                     << " for " << MI);
    450   if (Reg.isVirtual()) {
    451     LiveRange &LR = LIS->getInterval(Reg);
    452     markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
    453   } else {
    454     // Handle physical registers that we need to track; this is mostly relevant
    455     // for VCC, which can appear as the (implicit) input of a uniform branch,
    456     // e.g. when a loop counter is stored in a VGPR.
    457     for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
    458          ++RegUnit) {
    459       LiveRange &LR = LIS->getRegUnit(*RegUnit);
    460       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
    461       if (!Value)
    462         continue;
    463 
    464       markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
    465     }
    466   }
    467 }
    468 
    469 /// Mark all instructions defining the uses in \p MI with \p Flag.
    470 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
    471                                           std::vector<WorkItem> &Worklist) {
    472   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
    473                     << MI);
    474 
    475   for (const MachineOperand &Use : MI.uses()) {
    476     if (!Use.isReg() || !Use.isUse())
    477       continue;
    478     markOperand(MI, Use, Flag, Worklist);
    479   }
    480 }
    481 
    482 // Scan instructions to determine which ones require an Exact execmask and
    483 // which ones seed WQM requirements.
    484 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
    485                                        std::vector<WorkItem> &Worklist) {
    486   char GlobalFlags = 0;
    487   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
    488   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
    489   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
    490 
    491   // We need to visit the basic blocks in reverse post-order so that we visit
    492   // defs before uses, in particular so that we don't accidentally mark an
    493   // instruction as needing e.g. WQM before visiting it and realizing it needs
    494   // WQM disabled.
    495   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
    496   for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    497     MachineBasicBlock &MBB = **BI;
    498     BlockInfo &BBI = Blocks[&MBB];
    499 
    500     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
    501       MachineInstr &MI = *II;
    502       InstrInfo &III = Instructions[&MI];
    503       unsigned Opcode = MI.getOpcode();
    504       char Flags = 0;
    505 
    506       if (TII->isWQM(Opcode)) {
    507         // If LOD is not supported WQM is not needed.
    508         if (!ST->hasExtendedImageInsts())
    509           continue;
    510         // Sampling instructions don't need to produce results for all pixels
    511         // in a quad, they just require all inputs of a quad to have been
    512         // computed for derivatives.
    513         markInstructionUses(MI, StateWQM, Worklist);
    514         GlobalFlags |= StateWQM;
    515         continue;
    516       } else if (Opcode == AMDGPU::WQM) {
    517         // The WQM intrinsic requires its output to have all the helper lanes
    518         // correct, so we need it to be in WQM.
    519         Flags = StateWQM;
    520         LowerToCopyInstrs.push_back(&MI);
    521       } else if (Opcode == AMDGPU::SOFT_WQM) {
    522         LowerToCopyInstrs.push_back(&MI);
    523         SoftWQMInstrs.push_back(&MI);
    524         continue;
    525       } else if (Opcode == AMDGPU::STRICT_WWM) {
    526         // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
    527         // it needs to be executed in WQM or Exact so that its copy doesn't
    528         // clobber inactive lanes.
    529         markInstructionUses(MI, StateStrictWWM, Worklist);
    530         GlobalFlags |= StateStrictWWM;
    531         LowerToMovInstrs.push_back(&MI);
    532         continue;
    533       } else if (Opcode == AMDGPU::STRICT_WQM) {
    534         // STRICT_WQM is similar to STRICTWWM, but instead of enabling all
    535         // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in
    536         // quads that have at least one active thread.
    537         markInstructionUses(MI, StateStrictWQM, Worklist);
    538         GlobalFlags |= StateStrictWQM;
    539         LowerToMovInstrs.push_back(&MI);
    540         continue;
    541       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
    542                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
    543         III.Disabled = StateStrict;
    544         MachineOperand &Inactive = MI.getOperand(2);
    545         if (Inactive.isReg()) {
    546           if (Inactive.isUndef()) {
    547             LowerToCopyInstrs.push_back(&MI);
    548           } else {
    549             markOperand(MI, Inactive, StateStrictWWM, Worklist);
    550           }
    551         }
    552         SetInactiveInstrs.push_back(&MI);
    553         continue;
    554       } else if (TII->isDisableWQM(MI)) {
    555         BBI.Needs |= StateExact;
    556         if (!(BBI.InNeeds & StateExact)) {
    557           BBI.InNeeds |= StateExact;
    558           Worklist.push_back(&MBB);
    559         }
    560         GlobalFlags |= StateExact;
    561         III.Disabled = StateWQM | StateStrict;
    562         continue;
    563       } else {
    564         if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
    565           LiveMaskQueries.push_back(&MI);
    566         } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
    567                    Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
    568                    Opcode == AMDGPU::SI_DEMOTE_I1) {
    569           KillInstrs.push_back(&MI);
    570           BBI.NeedsLowering = true;
    571         } else if (WQMOutputs) {
    572           // The function is in machine SSA form, which means that physical
    573           // VGPRs correspond to shader inputs and outputs. Inputs are
    574           // only used, outputs are only defined.
    575           // FIXME: is this still valid?
    576           for (const MachineOperand &MO : MI.defs()) {
    577             if (!MO.isReg())
    578               continue;
    579 
    580             Register Reg = MO.getReg();
    581 
    582             if (!Reg.isVirtual() &&
    583                 TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
    584               Flags = StateWQM;
    585               break;
    586             }
    587           }
    588         }
    589 
    590         if (!Flags)
    591           continue;
    592       }
    593 
    594       markInstruction(MI, Flags, Worklist);
    595       GlobalFlags |= Flags;
    596     }
    597   }
    598 
    599   // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is
    600   // ever used anywhere in the function. This implements the corresponding
    601   // semantics of @llvm.amdgcn.set.inactive.
    602   // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
    603   if (GlobalFlags & StateWQM) {
    604     for (MachineInstr *MI : SetInactiveInstrs)
    605       markInstruction(*MI, StateWQM, Worklist);
    606     for (MachineInstr *MI : SoftWQMInstrs)
    607       markInstruction(*MI, StateWQM, Worklist);
    608   }
    609 
    610   return GlobalFlags;
    611 }
    612 
    613 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
    614                                            std::vector<WorkItem>& Worklist) {
    615   MachineBasicBlock *MBB = MI.getParent();
    616   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
    617   BlockInfo &BI = Blocks[MBB];
    618 
    619   // Control flow-type instructions and stores to temporary memory that are
    620   // followed by WQM computations must themselves be in WQM.
    621   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
    622       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    623     Instructions[&MI].Needs = StateWQM;
    624     II.Needs = StateWQM;
    625   }
    626 
    627   // Propagate to block level
    628   if (II.Needs & StateWQM) {
    629     BI.Needs |= StateWQM;
    630     if (!(BI.InNeeds & StateWQM)) {
    631       BI.InNeeds |= StateWQM;
    632       Worklist.push_back(MBB);
    633     }
    634   }
    635 
    636   // Propagate backwards within block
    637   if (MachineInstr *PrevMI = MI.getPrevNode()) {
    638     char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    639     if (!PrevMI->isPHI()) {
    640       InstrInfo &PrevII = Instructions[PrevMI];
    641       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
    642         PrevII.OutNeeds |= InNeeds;
    643         Worklist.push_back(PrevMI);
    644       }
    645     }
    646   }
    647 
    648   // Propagate WQM flag to instruction inputs
    649   assert(!(II.Needs & StateExact));
    650 
    651   if (II.Needs != 0)
    652     markInstructionUses(MI, II.Needs, Worklist);
    653 
    654   // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
    655   // not require any WQM transitions.
    656   if (II.Needs & StateStrictWWM)
    657     BI.Needs |= StateStrictWWM;
    658   if (II.Needs & StateStrictWQM)
    659     BI.Needs |= StateStrictWQM;
    660 }
    661 
    662 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
    663                                      std::vector<WorkItem>& Worklist) {
    664   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
    665 
    666   // Propagate through instructions
    667   if (!MBB.empty()) {
    668     MachineInstr *LastMI = &*MBB.rbegin();
    669     InstrInfo &LastII = Instructions[LastMI];
    670     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
    671       LastII.OutNeeds |= BI.OutNeeds;
    672       Worklist.push_back(LastMI);
    673     }
    674   }
    675 
    676   // Predecessor blocks must provide for our WQM/Exact needs.
    677   for (MachineBasicBlock *Pred : MBB.predecessors()) {
    678     BlockInfo &PredBI = Blocks[Pred];
    679     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
    680       continue;
    681 
    682     PredBI.OutNeeds |= BI.InNeeds;
    683     PredBI.InNeeds |= BI.InNeeds;
    684     Worklist.push_back(Pred);
    685   }
    686 
    687   // All successors must be prepared to accept the same set of WQM/Exact data.
    688   for (MachineBasicBlock *Succ : MBB.successors()) {
    689     BlockInfo &SuccBI = Blocks[Succ];
    690     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
    691       continue;
    692 
    693     SuccBI.InNeeds |= BI.OutNeeds;
    694     Worklist.push_back(Succ);
    695   }
    696 }
    697 
    698 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
    699   std::vector<WorkItem> Worklist;
    700   char GlobalFlags = scanInstructions(MF, Worklist);
    701 
    702   while (!Worklist.empty()) {
    703     WorkItem WI = Worklist.back();
    704     Worklist.pop_back();
    705 
    706     if (WI.MI)
    707       propagateInstruction(*WI.MI, Worklist);
    708     else
    709       propagateBlock(*WI.MBB, Worklist);
    710   }
    711 
    712   return GlobalFlags;
    713 }
    714 
    715 MachineBasicBlock::iterator
    716 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
    717                          MachineBasicBlock::iterator Before) {
    718   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    719 
    720   MachineInstr *Save =
    721       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
    722           .addReg(AMDGPU::SCC);
    723   MachineInstr *Restore =
    724       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
    725           .addReg(SaveReg);
    726 
    727   LIS->InsertMachineInstrInMaps(*Save);
    728   LIS->InsertMachineInstrInMaps(*Restore);
    729   LIS->createAndComputeVirtRegInterval(SaveReg);
    730 
    731   return Restore;
    732 }
    733 
    734 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
    735                                                MachineInstr *TermMI) {
    736   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
    737                     << *TermMI << "\n");
    738 
    739   MachineBasicBlock *SplitBB =
    740       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
    741 
    742   // Convert last instruction in block to a terminator.
    743   // Note: this only covers the expected patterns
    744   unsigned NewOpcode = 0;
    745   switch (TermMI->getOpcode()) {
    746   case AMDGPU::S_AND_B32:
    747     NewOpcode = AMDGPU::S_AND_B32_term;
    748     break;
    749   case AMDGPU::S_AND_B64:
    750     NewOpcode = AMDGPU::S_AND_B64_term;
    751     break;
    752   case AMDGPU::S_MOV_B32:
    753     NewOpcode = AMDGPU::S_MOV_B32_term;
    754     break;
    755   case AMDGPU::S_MOV_B64:
    756     NewOpcode = AMDGPU::S_MOV_B64_term;
    757     break;
    758   default:
    759     break;
    760   }
    761   if (NewOpcode)
    762     TermMI->setDesc(TII->get(NewOpcode));
    763 
    764   if (SplitBB != BB) {
    765     // Update dominator trees
    766     using DomTreeT = DomTreeBase<MachineBasicBlock>;
    767     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
    768     for (MachineBasicBlock *Succ : SplitBB->successors()) {
    769       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
    770       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    771     }
    772     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    773     if (MDT)
    774       MDT->getBase().applyUpdates(DTUpdates);
    775     if (PDT)
    776       PDT->getBase().applyUpdates(DTUpdates);
    777 
    778     // Link blocks
    779     MachineInstr *MI =
    780         BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
    781             .addMBB(SplitBB);
    782     LIS->InsertMachineInstrInMaps(*MI);
    783   }
    784 
    785   return SplitBB;
    786 }
    787 
    788 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
    789                                             MachineInstr &MI) {
    790   const DebugLoc &DL = MI.getDebugLoc();
    791   unsigned Opcode = 0;
    792 
    793   assert(MI.getOperand(0).isReg());
    794 
    795   // Comparison is for live lanes; however here we compute the inverse
    796   // (killed lanes).  This is because VCMP will always generate 0 bits
    797   // for inactive lanes so a mask of live lanes would not be correct
    798   // inside control flow.
    799   // Invert the comparison by swapping the operands and adjusting
    800   // the comparison codes.
    801 
    802   switch (MI.getOperand(2).getImm()) {
    803   case ISD::SETUEQ:
    804     Opcode = AMDGPU::V_CMP_LG_F32_e64;
    805     break;
    806   case ISD::SETUGT:
    807     Opcode = AMDGPU::V_CMP_GE_F32_e64;
    808     break;
    809   case ISD::SETUGE:
    810     Opcode = AMDGPU::V_CMP_GT_F32_e64;
    811     break;
    812   case ISD::SETULT:
    813     Opcode = AMDGPU::V_CMP_LE_F32_e64;
    814     break;
    815   case ISD::SETULE:
    816     Opcode = AMDGPU::V_CMP_LT_F32_e64;
    817     break;
    818   case ISD::SETUNE:
    819     Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    820     break;
    821   case ISD::SETO:
    822     Opcode = AMDGPU::V_CMP_O_F32_e64;
    823     break;
    824   case ISD::SETUO:
    825     Opcode = AMDGPU::V_CMP_U_F32_e64;
    826     break;
    827   case ISD::SETOEQ:
    828   case ISD::SETEQ:
    829     Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    830     break;
    831   case ISD::SETOGT:
    832   case ISD::SETGT:
    833     Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    834     break;
    835   case ISD::SETOGE:
    836   case ISD::SETGE:
    837     Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    838     break;
    839   case ISD::SETOLT:
    840   case ISD::SETLT:
    841     Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    842     break;
    843   case ISD::SETOLE:
    844   case ISD::SETLE:
    845     Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    846     break;
    847   case ISD::SETONE:
    848   case ISD::SETNE:
    849     Opcode = AMDGPU::V_CMP_NLG_F32_e64;
    850     break;
    851   default:
    852     llvm_unreachable("invalid ISD:SET cond code");
    853   }
    854 
    855   // Pick opcode based on comparison type.
    856   MachineInstr *VcmpMI;
    857   const MachineOperand &Op0 = MI.getOperand(0);
    858   const MachineOperand &Op1 = MI.getOperand(1);
    859   if (TRI->isVGPR(*MRI, Op0.getReg())) {
    860     Opcode = AMDGPU::getVOPe32(Opcode);
    861     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
    862   } else {
    863     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
    864                  .addReg(AMDGPU::VCC, RegState::Define)
    865                  .addImm(0) // src0 modifiers
    866                  .add(Op1)
    867                  .addImm(0) // src1 modifiers
    868                  .add(Op0)
    869                  .addImm(0); // omod
    870   }
    871 
    872   // VCC represents lanes killed.
    873   Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
    874 
    875   MachineInstr *MaskUpdateMI =
    876       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
    877           .addReg(LiveMaskReg)
    878           .addReg(VCC);
    879 
    880   // State of SCC represents whether any lanes are live in mask,
    881   // if SCC is 0 then no lanes will be alive anymore.
    882   MachineInstr *EarlyTermMI =
    883       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
    884 
    885   MachineInstr *ExecMaskMI =
    886       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
    887 
    888   assert(MBB.succ_size() == 1);
    889   MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
    890                               .addMBB(*MBB.succ_begin());
    891 
    892   // Update live intervals
    893   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
    894   MBB.remove(&MI);
    895 
    896   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
    897   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
    898   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
    899   LIS->InsertMachineInstrInMaps(*NewTerm);
    900 
    901   return NewTerm;
    902 }
    903 
/// Lower a boolean kill or demote pseudo (\p MI) in \p MBB.
///
/// Operand 0 of \p MI is the condition (immediate or register) and operand 1
/// is the immediate value of the condition for which lanes are killed.
/// \p IsWQM tells whether \p MI executes in whole quad mode.
///
/// The pseudo is replaced by a live mask update, SI_EARLY_TERMINATE_SCC0 and
/// an update of EXEC; LiveIntervals is kept up to date.
///
/// \returns the instruction that should become the terminator of the block
/// (the EXEC update, or an S_BRANCH for a kill that statically does nothing),
/// or nullptr when a demote that statically does nothing was simply removed.
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
                                           MachineInstr &MI, bool IsWQM) {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  // Demote handling is only applied while in WQM; otherwise a demote is
  // lowered like a kill.
  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  // Condition register (if any); its live interval is recomputed at the end.
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
    } else {
      // Static: kill does nothing
      MachineInstr *NewTerm = nullptr;
      if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
        // Demote: just drop the instruction.
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        // Kill terminator: replace it with a branch to the only successor.
        assert(MBB.succ_size() == 1);
        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                      .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return NewTerm;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // In the case we got this far some lanes are still live,
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demotes deactivate quads with only helper lanes
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
  } else {
    // Kills deactivate lanes
    if (Op.isImm()) {
      // Static kill of all active lanes: clear EXEC.
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      // Not in WQM: restrict EXEC to the updated live mask.
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      // In WQM: apply the condition itself to EXEC, removing the lanes for
      // which it equals KillVal.
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(EarlyTermMI);
  assert(MaskUpdateMI);
  assert(NewTerm);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  // The uses of the condition register changed, so rebuild its interval.
  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  // Newly created virtual registers need fresh intervals.
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}
   1014 
   1015 // Replace (or supplement) instructions accessing live mask.
   1016 // This can only happen once all the live mask registers have been created
   1017 // and the execute state (WQM/StrictWWM/Exact) of instructions is known.
   1018 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
   1019   auto BII = Blocks.find(&MBB);
   1020   if (BII == Blocks.end())
   1021     return;
   1022 
   1023   const BlockInfo &BI = BII->second;
   1024   if (!BI.NeedsLowering)
   1025     return;
   1026 
   1027   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
   1028 
   1029   SmallVector<MachineInstr *, 4> SplitPoints;
   1030   char State = BI.InitialState;
   1031 
   1032   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
   1033   while (II != IE) {
   1034     auto Next = std::next(II);
   1035     MachineInstr &MI = *II;
   1036 
   1037     if (StateTransition.count(&MI))
   1038       State = StateTransition[&MI];
   1039 
   1040     MachineInstr *SplitPoint = nullptr;
   1041     switch (MI.getOpcode()) {
   1042     case AMDGPU::SI_DEMOTE_I1:
   1043     case AMDGPU::SI_KILL_I1_TERMINATOR:
   1044       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
   1045       break;
   1046     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
   1047       SplitPoint = lowerKillF32(MBB, MI);
   1048       break;
   1049     default:
   1050       break;
   1051     }
   1052     if (SplitPoint)
   1053       SplitPoints.push_back(SplitPoint);
   1054 
   1055     II = Next;
   1056   }
   1057 
   1058   // Perform splitting after instruction scan to simplify iteration.
   1059   if (!SplitPoints.empty()) {
   1060     MachineBasicBlock *BB = &MBB;
   1061     for (MachineInstr *MI : SplitPoints) {
   1062       BB = splitBlock(BB, MI);
   1063     }
   1064   }
   1065 }
   1066 
// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
//
// PreferLast selects which end of the range the search starts from. When
// SaveSCC is false SCC liveness is ignored and the preferred end is returned
// directly. If no position with dead SCC is found, SCC is saved and restored
// around the returned insertion point via saveSCC().
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  // Live range of the register unit of SCC.
  LiveRange &LR =
      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
  auto MBBE = MBB.end();
  // The block end iterator has no instruction index; use the block end index.
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  // Move Idx through the range until it is outside every live segment of SCC
  // (S == nullptr) or the range is exhausted (S stays non-null).
  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      // Step backwards to the instruction at which the segment starts.
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      // Step forwards to the instruction following the end of the segment.
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  // Translate the slot index back into an iterator.
  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->operands()) {
      if (MO.isReg() && MO.isDef()) {
        IsExecDef |=
            MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
      }
    }
    if (!IsExecDef)
      break;
    MBBI++;
    // Per the assumption above SCC need not be preserved once an EXEC def
    // has been skipped, so drop the pending save.
    S = nullptr;
  }

  // SCC is still live at the chosen position: save and restore it around
  // the insertion point.
  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}
   1140 
   1141 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
   1142                               MachineBasicBlock::iterator Before,
   1143                               Register SaveWQM) {
   1144   MachineInstr *MI;
   1145 
   1146   if (SaveWQM) {
   1147     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
   1148              .addReg(LiveMaskReg);
   1149   } else {
   1150     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
   1151              .addReg(Exec)
   1152              .addReg(LiveMaskReg);
   1153   }
   1154 
   1155   LIS->InsertMachineInstrInMaps(*MI);
   1156   StateTransition[MI] = StateExact;
   1157 }
   1158 
   1159 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
   1160                             MachineBasicBlock::iterator Before,
   1161                             Register SavedWQM) {
   1162   MachineInstr *MI;
   1163 
   1164   if (SavedWQM) {
   1165     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
   1166              .addReg(SavedWQM);
   1167   } else {
   1168     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
   1169   }
   1170 
   1171   LIS->InsertMachineInstrInMaps(*MI);
   1172   StateTransition[MI] = StateWQM;
   1173 }
   1174 
   1175 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
   1176                                    MachineBasicBlock::iterator Before,
   1177                                    Register SaveOrig, char StrictStateNeeded) {
   1178   MachineInstr *MI;
   1179   assert(SaveOrig);
   1180   assert(StrictStateNeeded == StateStrictWWM ||
   1181          StrictStateNeeded == StateStrictWQM);
   1182 
   1183   if (StrictStateNeeded == StateStrictWWM) {
   1184     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
   1185                  SaveOrig)
   1186              .addImm(-1);
   1187   } else {
   1188     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
   1189                  SaveOrig)
   1190              .addImm(-1);
   1191   }
   1192   LIS->InsertMachineInstrInMaps(*MI);
   1193   StateTransition[MI] = StateStrictWWM;
   1194 }
   1195 
   1196 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
   1197                                      MachineBasicBlock::iterator Before,
   1198                                      Register SavedOrig, char NonStrictState,
   1199                                      char CurrentStrictState) {
   1200   MachineInstr *MI;
   1201 
   1202   assert(SavedOrig);
   1203   assert(CurrentStrictState == StateStrictWWM ||
   1204          CurrentStrictState == StateStrictWQM);
   1205 
   1206   if (CurrentStrictState == StateStrictWWM) {
   1207     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
   1208                  Exec)
   1209              .addReg(SavedOrig);
   1210   } else {
   1211     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
   1212                  Exec)
   1213              .addReg(SavedOrig);
   1214   }
   1215   LIS->InsertMachineInstrInMaps(*MI);
   1216   StateTransition[MI] = NonStrictState;
   1217 }
   1218 
/// Insert the execute state transitions required inside \p MBB.
///
/// Walks the instructions of the block (plus one extra step for the end of
/// the block), works out which states each position permits (Needs) and,
/// whenever the current state is not among them, inserts the instructions
/// that switch between Exact, WQM and the strict modes. The state the block
/// starts in is recorded in its BlockInfo.
///
/// \p IsEntry marks the entry block of the function, which always starts in
/// Exact and derives its WQM mask from EXEC.
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  // Register holding the WQM EXEC while in Exact (if it was saved).
  Register SavedWQMReg;
  // Register holding the non-strict EXEC while in a strict mode.
  Register SavedNonStrictReg;
  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  // State to return to when leaving a strict mode.
  char NonStrictState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY)
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from Strict
  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
  // be safe to switch to/from WQM as well.
  MachineBasicBlock::iterator FirstStrict = IE;

  // Record initial state in block information.
  BI.InitialState = State;

  // One iteration per instruction, plus a final one with II == IE that
  // handles the requirements at the end of the block.
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
    char OutNeeds = 0;

    // IE doubles as the "unset" marker for both insertion range starts.
    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstStrict == IE)
      FirstStrict = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          // Priority: StrictWWM, then StrictWQM, then WQM; otherwise only
          // the explicitly disabled states are excluded.
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      // A terminator whose outgoing needs are exactly Exact must itself run
      // in Exact.
      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from Strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC
        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
        // Exact/Strict -> Exact: no save
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
        break;
      }
      // Transitions into WQM prefer the latest possible insertion point, all
      // others the earliest.
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);

      // Leave the current strict mode first, returning to the state that was
      // active before it was entered.
      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        // Enter the required strict mode, remembering the state to return to.
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;

      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          // WQM -> Exact. Save the WQM EXEC only if WQM is needed again later
          // and cannot be regenerated from EXEC.
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          // Exact -> WQM, restoring the saved WQM EXEC if there is one.
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, but we
          // shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    // Reset the insertion range starts unless this position permits every
    // state (FirstStrict), or at least both Exact and WQM (FirstWQM).
    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  // Every saved register must have been consumed by a matching transition.
  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}
   1408 
   1409 void SIWholeQuadMode::lowerLiveMaskQueries() {
   1410   for (MachineInstr *MI : LiveMaskQueries) {
   1411     const DebugLoc &DL = MI->getDebugLoc();
   1412     Register Dest = MI->getOperand(0).getReg();
   1413 
   1414     MachineInstr *Copy =
   1415         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
   1416             .addReg(LiveMaskReg);
   1417 
   1418     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
   1419     MI->eraseFromParent();
   1420   }
   1421 }
   1422 
   1423 void SIWholeQuadMode::lowerCopyInstrs() {
   1424   for (MachineInstr *MI : LowerToMovInstrs) {
   1425     assert(MI->getNumExplicitOperands() == 2);
   1426 
   1427     const Register Reg = MI->getOperand(0).getReg();
   1428     const unsigned SubReg = MI->getOperand(0).getSubReg();
   1429 
   1430     if (TRI->isVGPR(*MRI, Reg)) {
   1431       const TargetRegisterClass *regClass =
   1432           Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
   1433       if (SubReg)
   1434         regClass = TRI->getSubRegClass(regClass, SubReg);
   1435 
   1436       const unsigned MovOp = TII->getMovOpcode(regClass);
   1437       MI->setDesc(TII->get(MovOp));
   1438 
   1439       // Check that it already implicitly depends on exec (like all VALU movs
   1440       // should do).
   1441       assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
   1442         return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
   1443       }));
   1444     } else {
   1445       // Remove early-clobber and exec dependency from simple SGPR copies.
   1446       // This allows some to be eliminated during/post RA.
   1447       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
   1448       if (MI->getOperand(0).isEarlyClobber()) {
   1449         LIS->removeInterval(Reg);
   1450         MI->getOperand(0).setIsEarlyClobber(false);
   1451         LIS->createAndComputeVirtRegInterval(Reg);
   1452       }
   1453       int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
   1454       while (Index >= 0) {
   1455         MI->RemoveOperand(Index);
   1456         Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
   1457       }
   1458       MI->setDesc(TII->get(AMDGPU::COPY));
   1459       LLVM_DEBUG(dbgs() << "  -> " << *MI);
   1460     }
   1461   }
   1462   for (MachineInstr *MI : LowerToCopyInstrs) {
   1463     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
   1464         MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
   1465       assert(MI->getNumExplicitOperands() == 3);
   1466       // the only reason we should be here is V_SET_INACTIVE has
   1467       // an undef input so it is being replaced by a simple copy.
   1468       // There should be a second undef source that we should remove.
   1469       assert(MI->getOperand(2).isUndef());
   1470       MI->RemoveOperand(2);
   1471       MI->untieRegOperand(1);
   1472     } else {
   1473       assert(MI->getNumExplicitOperands() == 2);
   1474     }
   1475 
   1476     MI->setDesc(TII->get(AMDGPU::COPY));
   1477   }
   1478 }
   1479 
   1480 void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
   1481   for (MachineInstr *MI : KillInstrs) {
   1482     MachineBasicBlock *MBB = MI->getParent();
   1483     MachineInstr *SplitPoint = nullptr;
   1484     switch (MI->getOpcode()) {
   1485     case AMDGPU::SI_DEMOTE_I1:
   1486     case AMDGPU::SI_KILL_I1_TERMINATOR:
   1487       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
   1488       break;
   1489     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
   1490       SplitPoint = lowerKillF32(*MBB, *MI);
   1491       break;
   1492     default:
   1493       continue;
   1494     }
   1495     if (SplitPoint)
   1496       splitBlock(MBB, SplitPoint);
   1497   }
   1498 }
   1499 
/// Pass entry point: analyze \p MF, then insert the live mask copy and the
/// execute state transitions and lower the pseudo instructions collected
/// during analysis.
///
/// \returns false only when the shader needs no state changes or complex
/// lowering and contains no live mask queries; true otherwise.
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  // Reset the per-function state.
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  StateTransition.clear();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();
  MDT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  // Select the 32-bit or 64-bit flavour of the scalar opcodes and of the
  // EXEC register according to the wave size.
  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  const char GlobalFlags = analyzeFunction(MF);
  const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());

  // Until a dedicated register is created below, EXEC serves as live mask.
  LiveMaskReg = Exec;

  // Shader is simple: it does not need any state changes or any complex
  // lowering.
  if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
      LowerToMovInstrs.empty() && KillInstrs.empty()) {
    lowerLiveMaskQueries();
    return !LiveMaskQueries.empty();
  }

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

  // Store a copy of the original live mask when required
  if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  LLVM_DEBUG(printInfo());

  lowerLiveMaskQueries();
  lowerCopyInstrs();

  // Shader only needs WQM
  if (GlobalFlags == StateWQM) {
    // Enter WQM once at the top of the entry block, after the live mask copy.
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
  } else {
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));

  // If we performed any kills then drop the cached range of EXEC so that it
  // is recomputed when next needed.
  if (!KillInstrs.empty())
    LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));

  return true;
}
   1598