Home | History | Annotate | Line # | Download | only in AMDGPU
      1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 /// \file
     10 /// SI implementation of the TargetRegisterInfo class.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "SIRegisterInfo.h"
     15 #include "AMDGPU.h"
     16 #include "AMDGPURegisterBankInfo.h"
     17 #include "GCNSubtarget.h"
     18 #include "MCTargetDesc/AMDGPUInstPrinter.h"
     19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
     20 #include "SIMachineFunctionInfo.h"
     21 #include "llvm/CodeGen/LiveIntervals.h"
     22 #include "llvm/CodeGen/MachineDominators.h"
     23 #include "llvm/CodeGen/RegisterScavenging.h"
     24 
     25 using namespace llvm;
     26 
     27 #define GET_REGINFO_TARGET_DESC
     28 #include "AMDGPUGenRegisterInfo.inc"
     29 
     30 static cl::opt<bool> EnableSpillSGPRToVGPR(
     31   "amdgpu-spill-sgpr-to-vgpr",
     32   cl::desc("Enable spilling VGPRs to SGPRs"),
     33   cl::ReallyHidden,
     34   cl::init(true));
     35 
     36 std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
     37 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
     38 
     39 // Map numbers of DWORDs to indexes in SubRegFromChannelTable.
     40 // Valid indexes are shifted 1, such that a 0 mapping means unsupported.
     41 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
     42 //      meaning index 7 in SubRegFromChannelTable.
     43 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
     44     0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
     45 
     46 namespace llvm {
     47 
     48 // A temporary struct to spill SGPRs.
     49 // This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
     50 // just v_writelane and v_readlane.
     51 //
     52 // When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
     53 // is saved to scratch (or the other way around for loads).
     54 // For this, a VGPR is required where the needed lanes can be clobbered. The
     55 // RegScavenger can provide a VGPR where currently active lanes can be
     56 // clobbered, but we still need to save inactive lanes.
     57 // The high-level steps are:
     58 // - Try to scavenge SGPR(s) to save exec
     59 // - Try to scavenge VGPR
     60 // - Save needed, all or inactive lanes of a TmpVGPR
     61 // - Spill/Restore SGPRs using TmpVGPR
     62 // - Restore TmpVGPR
     63 //
     64 // To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
     65 // cannot scavenge temporary SGPRs to save exec, we use the following code:
     66 // buffer_store_dword TmpVGPR ; only if active lanes need to be saved
     67 // s_not exec, exec
     68 // buffer_store_dword TmpVGPR ; save inactive lanes
     69 // s_not exec, exec
     70 struct SGPRSpillBuilder {
     71   struct PerVGPRData {
     72     unsigned PerVGPR;
     73     unsigned NumVGPRs;
     74     int64_t VGPRLanes;
     75   };
     76 
     77   // The SGPR to save
     78   Register SuperReg;
     79   MachineBasicBlock::iterator MI;
     80   ArrayRef<int16_t> SplitParts;
     81   unsigned NumSubRegs;
     82   bool IsKill;
     83   const DebugLoc &DL;
     84 
     85   /* When spilling to stack */
     86   // The SGPRs are written into this VGPR, which is then written to scratch
     87   // (or vice versa for loads).
     88   Register TmpVGPR = AMDGPU::NoRegister;
     89   // Temporary spill slot to save TmpVGPR to.
     90   int TmpVGPRIndex = 0;
     91   // If TmpVGPR is live before the spill or if it is scavenged.
     92   bool TmpVGPRLive = false;
     93   // Scavenged SGPR to save EXEC.
     94   Register SavedExecReg = AMDGPU::NoRegister;
     95   // Stack index to write the SGPRs to.
     96   int Index;
     97   unsigned EltSize = 4;
     98 
     99   RegScavenger *RS;
    100   MachineBasicBlock &MBB;
    101   MachineFunction &MF;
    102   SIMachineFunctionInfo &MFI;
    103   const SIInstrInfo &TII;
    104   const SIRegisterInfo &TRI;
    105   bool IsWave32;
    106   Register ExecReg;
    107   unsigned MovOpc;
    108   unsigned NotOpc;
    109 
    110   SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
    111                    bool IsWave32, MachineBasicBlock::iterator MI, int Index,
    112                    RegScavenger *RS)
    113       : SuperReg(MI->getOperand(0).getReg()), MI(MI),
    114         IsKill(MI->getOperand(0).isKill()), DL(MI->getDebugLoc()), Index(Index),
    115         RS(RS), MBB(*MI->getParent()), MF(*MBB.getParent()),
    116         MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
    117         IsWave32(IsWave32) {
    118     const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg);
    119     SplitParts = TRI.getRegSplitParts(RC, EltSize);
    120     NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
    121 
    122     if (IsWave32) {
    123       ExecReg = AMDGPU::EXEC_LO;
    124       MovOpc = AMDGPU::S_MOV_B32;
    125       NotOpc = AMDGPU::S_NOT_B32;
    126     } else {
    127       ExecReg = AMDGPU::EXEC;
    128       MovOpc = AMDGPU::S_MOV_B64;
    129       NotOpc = AMDGPU::S_NOT_B64;
    130     }
    131 
    132     assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
    133     assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
    134            SuperReg != AMDGPU::EXEC && "exec should never spill");
    135   }
    136 
    137   PerVGPRData getPerVGPRData() {
    138     PerVGPRData Data;
    139     Data.PerVGPR = IsWave32 ? 32 : 64;
    140     Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
    141     Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
    142     return Data;
    143   }
    144 
    145   // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
    146   // free.
    147   // Writes these instructions if an SGPR can be scavenged:
    148   // s_mov_b64 s[6:7], exec   ; Save exec
    149   // s_mov_b64 exec, 3        ; Wanted lanemask
    150   // buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
    151   //
    152   // Writes these instructions if no SGPR can be scavenged:
    153   // buffer_store_dword v0    ; Only if no free VGPR was found
    154   // s_not_b64 exec, exec
    155   // buffer_store_dword v0    ; Save inactive lanes
    156   //                          ; exec stays inverted, it is flipped back in
    157   //                          ; restore.
    158   void prepare() {
    159     // Scavenged temporary VGPR to use. It must be scavenged once for any number
    160     // of spilled subregs.
    161     // FIXME: The liveness analysis is limited and does not tell if a register
    162     // is in use in lanes that are currently inactive. We can never be sure if
    163     // a register as actually in use in another lane, so we need to save all
    164     // used lanes of the chosen VGPR.
    165     assert(RS && "Cannot spill SGPR to memory without RegScavenger");
    166     TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false);
    167 
    168     // Reserve temporary stack slot
    169     TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
    170     if (TmpVGPR) {
    171       // Found a register that is dead in the currently active lanes, we only
    172       // need to spill inactive lanes.
    173       TmpVGPRLive = false;
    174     } else {
    175       // Pick v0 because it doesn't make a difference.
    176       TmpVGPR = AMDGPU::VGPR0;
    177       TmpVGPRLive = true;
    178     }
    179 
    180     // Try to scavenge SGPRs to save exec
    181     assert(!SavedExecReg && "Exec is already saved, refuse to save again");
    182     const TargetRegisterClass &RC =
    183         IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    184     RS->setRegUsed(SuperReg);
    185     SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false);
    186 
    187     int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
    188 
    189     if (SavedExecReg) {
    190       RS->setRegUsed(SavedExecReg);
    191       // Set exec to needed lanes
    192       BuildMI(MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
    193       auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
    194       if (!TmpVGPRLive)
    195         I.addReg(TmpVGPR, RegState::ImplicitDefine);
    196       // Spill needed lanes
    197       TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    198     } else {
    199       // Spill active lanes
    200       if (TmpVGPRLive)
    201         TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
    202                                     /*IsKill*/ false);
    203       // Spill inactive lanes
    204       auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
    205       if (!TmpVGPRLive)
    206         I.addReg(TmpVGPR, RegState::ImplicitDefine);
    207       TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    208     }
    209   }
    210 
    211   // Writes these instructions if an SGPR can be scavenged:
    212   // buffer_load_dword v1     ; Write scavenged VGPR to emergency slot
    213   // s_waitcnt vmcnt(0)       ; If a free VGPR was found
    214   // s_mov_b64 exec, s[6:7]   ; Save exec
    215   //
    216   // Writes these instructions if no SGPR can be scavenged:
    217   // buffer_load_dword v0     ; Restore inactive lanes
    218   // s_waitcnt vmcnt(0)       ; If a free VGPR was found
    219   // s_not_b64 exec, exec
    220   // buffer_load_dword v0     ; Only if no free VGPR was found
    221   void restore() {
    222     if (SavedExecReg) {
    223       // Restore used lanes
    224       TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
    225                                   /*IsKill*/ false);
    226       // Restore exec
    227       auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg)
    228                    .addReg(SavedExecReg, RegState::Kill);
    229       // Add an implicit use of the load so it is not dead.
    230       // FIXME This inserts an unnecessary waitcnt
    231       if (!TmpVGPRLive) {
    232         I.addReg(TmpVGPR, RegState::ImplicitKill);
    233       }
    234     } else {
    235       // Restore inactive lanes
    236       TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
    237                                   /*IsKill*/ false);
    238       auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
    239       if (!TmpVGPRLive) {
    240         I.addReg(TmpVGPR, RegState::ImplicitKill);
    241       }
    242       // Restore active lanes
    243       if (TmpVGPRLive)
    244         TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
    245     }
    246   }
    247 
    248   // Write TmpVGPR to memory or read TmpVGPR from memory.
    249   // Either using a single buffer_load/store if exec is set to the needed mask
    250   // or using
    251   // buffer_load
    252   // s_not exec, exec
    253   // buffer_load
    254   // s_not exec, exec
    255   void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
    256     if (SavedExecReg) {
    257       // Spill needed lanes
    258       TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
    259     } else {
    260       // Spill active lanes
    261       TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
    262                                   /*IsKill*/ false);
    263       // Spill inactive lanes
    264       BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
    265       TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
    266       BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
    267     }
    268   }
    269 };
    270 
    271 } // namespace llvm
    272 
    273 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
    274     : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
    275       SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
    276 
    277   assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
    278          getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
    279          (getSubRegIndexLaneMask(AMDGPU::lo16) |
    280           getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
    281            getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
    282          "getNumCoveredRegs() will not work with generated subreg masks!");
    283 
    284   RegPressureIgnoredUnits.resize(getNumRegUnits());
    285   RegPressureIgnoredUnits.set(
    286       *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this));
    287   for (auto Reg : AMDGPU::VGPR_HI16RegClass)
    288     RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));
    289 
    290   // HACK: Until this is fully tablegen'd.
    291   static llvm::once_flag InitializeRegSplitPartsFlag;
    292 
    293   static auto InitializeRegSplitPartsOnce = [this]() {
    294     for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
    295       unsigned Size = getSubRegIdxSize(Idx);
    296       if (Size & 31)
    297         continue;
    298       std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
    299       unsigned Pos = getSubRegIdxOffset(Idx);
    300       if (Pos % Size)
    301         continue;
    302       Pos /= Size;
    303       if (Vec.empty()) {
    304         unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
    305         Vec.resize(MaxNumParts);
    306       }
    307       Vec[Pos] = Idx;
    308     }
    309   };
    310 
    311   static llvm::once_flag InitializeSubRegFromChannelTableFlag;
    312 
    313   static auto InitializeSubRegFromChannelTableOnce = [this]() {
    314     for (auto &Row : SubRegFromChannelTable)
    315       Row.fill(AMDGPU::NoSubRegister);
    316     for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
    317       unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
    318       unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
    319       assert(Width < SubRegFromChannelTableWidthMap.size());
    320       Width = SubRegFromChannelTableWidthMap[Width];
    321       if (Width == 0)
    322         continue;
    323       unsigned TableIdx = Width - 1;
    324       assert(TableIdx < SubRegFromChannelTable.size());
    325       assert(Offset < SubRegFromChannelTable[TableIdx].size());
    326       SubRegFromChannelTable[TableIdx][Offset] = Idx;
    327     }
    328   };
    329 
    330   llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
    331   llvm::call_once(InitializeSubRegFromChannelTableFlag,
    332                   InitializeSubRegFromChannelTableOnce);
    333 }
    334 
    335 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
    336                                            MCRegister Reg) const {
    337   MCRegAliasIterator R(Reg, this, true);
    338 
    339   for (; R.isValid(); ++R)
    340     Reserved.set(*R);
    341 }
    342 
    343 // Forced to be here by one .inc
    344 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
    345   const MachineFunction *MF) const {
    346   CallingConv::ID CC = MF->getFunction().getCallingConv();
    347   switch (CC) {
    348   case CallingConv::C:
    349   case CallingConv::Fast:
    350   case CallingConv::Cold:
    351   case CallingConv::AMDGPU_Gfx:
    352     return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
    353         ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList
    354         : CSR_AMDGPU_HighRegs_SaveList;
    355   default: {
    356     // Dummy to not crash RegisterClassInfo.
    357     static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
    358     return &NoCalleeSavedReg;
    359   }
    360   }
    361 }
    362 
    363 const MCPhysReg *
    364 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
    365   return nullptr;
    366 }
    367 
    368 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
    369                                                      CallingConv::ID CC) const {
    370   switch (CC) {
    371   case CallingConv::C:
    372   case CallingConv::Fast:
    373   case CallingConv::Cold:
    374   case CallingConv::AMDGPU_Gfx:
    375     return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
    376         ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask
    377         : CSR_AMDGPU_HighRegs_RegMask;
    378   default:
    379     return nullptr;
    380   }
    381 }
    382 
    383 const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
    384   return CSR_AMDGPU_NoRegs_RegMask;
    385 }
    386 
    387 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
    388   const SIFrameLowering *TFI =
    389       MF.getSubtarget<GCNSubtarget>().getFrameLowering();
    390   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
    391   // During ISel lowering we always reserve the stack pointer in entry
    392   // functions, but never actually want to reference it when accessing our own
    393   // frame. If we need a frame pointer we use it, but otherwise we can just use
    394   // an immediate "0" which we represent by returning NoRegister.
    395   if (FuncInfo->isEntryFunction()) {
    396     return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
    397   }
    398   return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
    399                         : FuncInfo->getStackPtrOffsetReg();
    400 }
    401 
    402 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
    403   // When we need stack realignment, we can't reference off of the
    404   // stack pointer, so we reserve a base pointer.
    405   const MachineFrameInfo &MFI = MF.getFrameInfo();
    406   return MFI.getNumFixedObjects() && shouldRealignStack(MF);
    407 }
    408 
    409 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
    410 
    411 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
    412   return CSR_AMDGPU_AllVGPRs_RegMask;
    413 }
    414 
    415 const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
    416   return CSR_AMDGPU_AllAGPRs_RegMask;
    417 }
    418 
    419 const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
    420   return CSR_AMDGPU_AllVectorRegs_RegMask;
    421 }
    422 
    423 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
    424   return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
    425 }
    426 
    427 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
    428                                               unsigned NumRegs) {
    429   assert(NumRegs < SubRegFromChannelTableWidthMap.size());
    430   unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
    431   assert(NumRegIndex && "Not implemented");
    432   assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
    433   return SubRegFromChannelTable[NumRegIndex - 1][Channel];
    434 }
    435 
    436 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
    437   const MachineFunction &MF) const {
    438   unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
    439   MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
    440   return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
    441 }
    442 
    443 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
    444   BitVector Reserved(getNumRegs());
    445   Reserved.set(AMDGPU::MODE);
    446 
    447   // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
    448   // this seems likely to result in bugs, so I'm marking them as reserved.
    449   reserveRegisterTuples(Reserved, AMDGPU::EXEC);
    450   reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
    451 
    452   // M0 has to be reserved so that llvm accepts it as a live-in into a block.
    453   reserveRegisterTuples(Reserved, AMDGPU::M0);
    454 
    455   // Reserve src_vccz, src_execz, src_scc.
    456   reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
    457   reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
    458   reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
    459 
    460   // Reserve the memory aperture registers.
    461   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
    462   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
    463   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
    464   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
    465 
    466   // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
    467   reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
    468 
    469   // Reserve xnack_mask registers - support is not implemented in Codegen.
    470   reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
    471 
    472   // Reserve lds_direct register - support is not implemented in Codegen.
    473   reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
    474 
    475   // Reserve Trap Handler registers - support is not implemented in Codegen.
    476   reserveRegisterTuples(Reserved, AMDGPU::TBA);
    477   reserveRegisterTuples(Reserved, AMDGPU::TMA);
    478   reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
    479   reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
    480   reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
    481   reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
    482   reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
    483   reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
    484   reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
    485   reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
    486 
    487   // Reserve null register - it shall never be allocated
    488   reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);
    489 
    490   // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
    491   // will result in bugs.
    492   if (isWave32) {
    493     Reserved.set(AMDGPU::VCC);
    494     Reserved.set(AMDGPU::VCC_HI);
    495   }
    496 
    497   unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
    498   unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
    499   for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
    500     unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    501     reserveRegisterTuples(Reserved, Reg);
    502   }
    503 
    504   unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
    505   // TODO: In an entry function without calls and AGPRs used it is possible
    506   //       to use the whole register budget for VGPRs. Even more it shall
    507   //       be possible to estimate maximum AGPR/VGPR pressure and split
    508   //       register file accordingly.
    509   if (ST.hasGFX90AInsts())
    510     MaxNumVGPRs /= 2;
    511   unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
    512   for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    513     unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    514     reserveRegisterTuples(Reserved, Reg);
    515     Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
    516     reserveRegisterTuples(Reserved, Reg);
    517   }
    518 
    519   for (auto Reg : AMDGPU::SReg_32RegClass) {
    520     Reserved.set(getSubReg(Reg, AMDGPU::hi16));
    521     Register Low = getSubReg(Reg, AMDGPU::lo16);
    522     // This is to prevent BB vcc liveness errors.
    523     if (!AMDGPU::SGPR_LO16RegClass.contains(Low))
    524       Reserved.set(Low);
    525   }
    526 
    527   for (auto Reg : AMDGPU::AGPR_32RegClass) {
    528     Reserved.set(getSubReg(Reg, AMDGPU::hi16));
    529   }
    530 
    531   // Reserve all the rest AGPRs if there are no instructions to use it.
    532   if (!ST.hasMAIInsts()) {
    533     for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
    534       unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
    535       reserveRegisterTuples(Reserved, Reg);
    536     }
    537   }
    538 
    539   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    540 
    541   Register ScratchRSrcReg = MFI->getScratchRSrcReg();
    542   if (ScratchRSrcReg != AMDGPU::NoRegister) {
    543     // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
    544     // to spill.
    545     // TODO: May need to reserve a VGPR if doing LDS spilling.
    546     reserveRegisterTuples(Reserved, ScratchRSrcReg);
    547   }
    548 
    549   // We have to assume the SP is needed in case there are calls in the function,
    550   // which is detected after the function is lowered. If we aren't really going
    551   // to need SP, don't bother reserving it.
    552   MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
    553 
    554   if (StackPtrReg) {
    555     reserveRegisterTuples(Reserved, StackPtrReg);
    556     assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
    557   }
    558 
    559   MCRegister FrameReg = MFI->getFrameOffsetReg();
    560   if (FrameReg) {
    561     reserveRegisterTuples(Reserved, FrameReg);
    562     assert(!isSubRegister(ScratchRSrcReg, FrameReg));
    563   }
    564 
    565   if (hasBasePointer(MF)) {
    566     MCRegister BasePtrReg = getBaseRegister();
    567     reserveRegisterTuples(Reserved, BasePtrReg);
    568     assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
    569   }
    570 
    571   for (auto Reg : MFI->WWMReservedRegs) {
    572     reserveRegisterTuples(Reserved, Reg.first);
    573   }
    574 
    575   // FIXME: Stop using reserved registers for this.
    576   for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    577     reserveRegisterTuples(Reserved, Reg);
    578 
    579   for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    580     reserveRegisterTuples(Reserved, Reg);
    581 
    582   for (auto SSpill : MFI->getSGPRSpillVGPRs())
    583     reserveRegisterTuples(Reserved, SSpill.VGPR);
    584 
    585   return Reserved;
    586 }
    587 
    588 bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
    589   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    590   // On entry, the base address is 0, so it can't possibly need any more
    591   // alignment.
    592 
    593   // FIXME: Should be able to specify the entry frame alignment per calling
    594   // convention instead.
    595   if (Info->isEntryFunction())
    596     return false;
    597 
    598   return TargetRegisterInfo::shouldRealignStack(MF);
    599 }
    600 
    601 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
    602   const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
    603   if (Info->isEntryFunction()) {
    604     const MachineFrameInfo &MFI = Fn.getFrameInfo();
    605     return MFI.hasStackObjects() || MFI.hasCalls();
    606   }
    607 
    608   // May need scavenger for dealing with callee saved registers.
    609   return true;
    610 }
    611 
    612 bool SIRegisterInfo::requiresFrameIndexScavenging(
    613   const MachineFunction &MF) const {
    614   // Do not use frame virtual registers. They used to be used for SGPRs, but
    615   // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
    616   // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
    617   // spill.
    618   return false;
    619 }
    620 
    621 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
    622   const MachineFunction &MF) const {
    623   const MachineFrameInfo &MFI = MF.getFrameInfo();
    624   return MFI.hasStackObjects();
    625 }
    626 
    627 bool SIRegisterInfo::requiresVirtualBaseRegisters(
    628   const MachineFunction &) const {
    629   // There are no special dedicated stack or frame pointers.
    630   return true;
    631 }
    632 
    633 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
    634   assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
    635 
    636   int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
    637                                           AMDGPU::OpName::offset);
    638   return MI->getOperand(OffIdx).getImm();
    639 }
    640 
    641 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
    642                                                  int Idx) const {
    643   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    644     return 0;
    645 
    646   assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
    647                                             AMDGPU::OpName::vaddr) ||
    648          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
    649                                             AMDGPU::OpName::saddr))) &&
    650          "Should never see frame index on non-address operand");
    651 
    652   return getScratchInstrOffset(MI);
    653 }
    654 
    655 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
    656   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    657     return false;
    658 
    659   int64_t FullOffset = Offset + getScratchInstrOffset(MI);
    660 
    661   if (SIInstrInfo::isMUBUF(*MI))
    662     return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
    663 
    664   const SIInstrInfo *TII = ST.getInstrInfo();
    665   return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
    666                                  SIInstrFlags::FlatScratch);
    667 }
    668 
    669 Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
    670                                                       int FrameIdx,
    671                                                       int64_t Offset) const {
    672   MachineBasicBlock::iterator Ins = MBB->begin();
    673   DebugLoc DL; // Defaults to "unknown"
    674 
    675   if (Ins != MBB->end())
    676     DL = Ins->getDebugLoc();
    677 
    678   MachineFunction *MF = MBB->getParent();
    679   const SIInstrInfo *TII = ST.getInstrInfo();
    680   MachineRegisterInfo &MRI = MF->getRegInfo();
    681   unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
    682                                            : AMDGPU::V_MOV_B32_e32;
    683 
    684   Register BaseReg = MRI.createVirtualRegister(
    685       ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
    686                              : &AMDGPU::VGPR_32RegClass);
    687 
    688   if (Offset == 0) {
    689     BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
    690       .addFrameIndex(FrameIdx);
    691     return BaseReg;
    692   }
    693 
    694   Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    695 
    696   Register FIReg = MRI.createVirtualRegister(
    697       ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
    698                              : &AMDGPU::VGPR_32RegClass);
    699 
    700   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
    701     .addImm(Offset);
    702   BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
    703     .addFrameIndex(FrameIdx);
    704 
    705   if (ST.enableFlatScratch() ) {
    706     BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg)
    707         .addReg(OffsetReg, RegState::Kill)
    708         .addReg(FIReg);
    709     return BaseReg;
    710   }
    711 
    712   TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
    713     .addReg(OffsetReg, RegState::Kill)
    714     .addReg(FIReg)
    715     .addImm(0); // clamp bit
    716 
    717   return BaseReg;
    718 }
    719 
    720 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
    721                                        int64_t Offset) const {
    722   const SIInstrInfo *TII = ST.getInstrInfo();
    723   bool IsFlat = TII->isFLATScratch(MI);
    724 
    725 #ifndef NDEBUG
    726   // FIXME: Is it possible to be storing a frame index to itself?
    727   bool SeenFI = false;
    728   for (const MachineOperand &MO: MI.operands()) {
    729     if (MO.isFI()) {
    730       if (SeenFI)
    731         llvm_unreachable("should not see multiple frame indices");
    732 
    733       SeenFI = true;
    734     }
    735   }
    736 #endif
    737 
    738   MachineOperand *FIOp =
    739       TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
    740                                       : AMDGPU::OpName::vaddr);
    741 
    742   MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
    743   int64_t NewOffset = OffsetOp->getImm() + Offset;
    744 
    745   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
    746   assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
    747 
    748   if (IsFlat) {
    749     assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
    750                                   SIInstrFlags::FlatScratch) &&
    751            "offset should be legal");
    752     FIOp->ChangeToRegister(BaseReg, false);
    753     OffsetOp->setImm(NewOffset);
    754     return;
    755   }
    756 
    757 #ifndef NDEBUG
    758   MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    759   assert(SOffset->isImm() && SOffset->getImm() == 0);
    760 #endif
    761 
    762   assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
    763          "offset should be legal");
    764 
    765   FIOp->ChangeToRegister(BaseReg, false);
    766   OffsetOp->setImm(NewOffset);
    767 }
    768 
    769 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
    770                                         Register BaseReg,
    771                                         int64_t Offset) const {
    772   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    773     return false;
    774 
    775   int64_t NewOffset = Offset + getScratchInstrOffset(MI);
    776 
    777   if (SIInstrInfo::isMUBUF(*MI))
    778     return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
    779 
    780   const SIInstrInfo *TII = ST.getInstrInfo();
    781   return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
    782                                 SIInstrFlags::FlatScratch);
    783 }
    784 
    785 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
    786   const MachineFunction &MF, unsigned Kind) const {
    787   // This is inaccurate. It depends on the instruction and address space. The
    788   // only place where we should hit this is for dealing with frame indexes /
    789   // private accesses, so this is correct in that case.
    790   return &AMDGPU::VGPR_32RegClass;
    791 }
    792 
    793 static unsigned getNumSubRegsForSpillOp(unsigned Op) {
    794 
    795   switch (Op) {
    796   case AMDGPU::SI_SPILL_S1024_SAVE:
    797   case AMDGPU::SI_SPILL_S1024_RESTORE:
    798   case AMDGPU::SI_SPILL_V1024_SAVE:
    799   case AMDGPU::SI_SPILL_V1024_RESTORE:
    800   case AMDGPU::SI_SPILL_A1024_SAVE:
    801   case AMDGPU::SI_SPILL_A1024_RESTORE:
    802     return 32;
    803   case AMDGPU::SI_SPILL_S512_SAVE:
    804   case AMDGPU::SI_SPILL_S512_RESTORE:
    805   case AMDGPU::SI_SPILL_V512_SAVE:
    806   case AMDGPU::SI_SPILL_V512_RESTORE:
    807   case AMDGPU::SI_SPILL_A512_SAVE:
    808   case AMDGPU::SI_SPILL_A512_RESTORE:
    809     return 16;
    810   case AMDGPU::SI_SPILL_S256_SAVE:
    811   case AMDGPU::SI_SPILL_S256_RESTORE:
    812   case AMDGPU::SI_SPILL_V256_SAVE:
    813   case AMDGPU::SI_SPILL_V256_RESTORE:
    814   case AMDGPU::SI_SPILL_A256_SAVE:
    815   case AMDGPU::SI_SPILL_A256_RESTORE:
    816     return 8;
    817   case AMDGPU::SI_SPILL_S192_SAVE:
    818   case AMDGPU::SI_SPILL_S192_RESTORE:
    819   case AMDGPU::SI_SPILL_V192_SAVE:
    820   case AMDGPU::SI_SPILL_V192_RESTORE:
    821   case AMDGPU::SI_SPILL_A192_SAVE:
    822   case AMDGPU::SI_SPILL_A192_RESTORE:
    823     return 6;
    824   case AMDGPU::SI_SPILL_S160_SAVE:
    825   case AMDGPU::SI_SPILL_S160_RESTORE:
    826   case AMDGPU::SI_SPILL_V160_SAVE:
    827   case AMDGPU::SI_SPILL_V160_RESTORE:
    828   case AMDGPU::SI_SPILL_A160_SAVE:
    829   case AMDGPU::SI_SPILL_A160_RESTORE:
    830     return 5;
    831   case AMDGPU::SI_SPILL_S128_SAVE:
    832   case AMDGPU::SI_SPILL_S128_RESTORE:
    833   case AMDGPU::SI_SPILL_V128_SAVE:
    834   case AMDGPU::SI_SPILL_V128_RESTORE:
    835   case AMDGPU::SI_SPILL_A128_SAVE:
    836   case AMDGPU::SI_SPILL_A128_RESTORE:
    837     return 4;
    838   case AMDGPU::SI_SPILL_S96_SAVE:
    839   case AMDGPU::SI_SPILL_S96_RESTORE:
    840   case AMDGPU::SI_SPILL_V96_SAVE:
    841   case AMDGPU::SI_SPILL_V96_RESTORE:
    842   case AMDGPU::SI_SPILL_A96_SAVE:
    843   case AMDGPU::SI_SPILL_A96_RESTORE:
    844     return 3;
    845   case AMDGPU::SI_SPILL_S64_SAVE:
    846   case AMDGPU::SI_SPILL_S64_RESTORE:
    847   case AMDGPU::SI_SPILL_V64_SAVE:
    848   case AMDGPU::SI_SPILL_V64_RESTORE:
    849   case AMDGPU::SI_SPILL_A64_SAVE:
    850   case AMDGPU::SI_SPILL_A64_RESTORE:
    851     return 2;
    852   case AMDGPU::SI_SPILL_S32_SAVE:
    853   case AMDGPU::SI_SPILL_S32_RESTORE:
    854   case AMDGPU::SI_SPILL_V32_SAVE:
    855   case AMDGPU::SI_SPILL_V32_RESTORE:
    856   case AMDGPU::SI_SPILL_A32_SAVE:
    857   case AMDGPU::SI_SPILL_A32_RESTORE:
    858     return 1;
    859   default: llvm_unreachable("Invalid spill opcode");
    860   }
    861 }
    862 
    863 static int getOffsetMUBUFStore(unsigned Opc) {
    864   switch (Opc) {
    865   case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    866     return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
    867   case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    868     return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
    869   case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    870     return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
    871   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    872     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
    873   case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    874     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
    875   case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    876     return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
    877   case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    878     return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
    879   default:
    880     return -1;
    881   }
    882 }
    883 
    884 static int getOffsetMUBUFLoad(unsigned Opc) {
    885   switch (Opc) {
    886   case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    887     return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
    888   case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    889     return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
    890   case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    891     return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
    892   case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    893     return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
    894   case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    895     return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
    896   case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    897     return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
    898   case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    899     return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
    900   case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    901     return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
    902   case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    903     return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
    904   case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    905     return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
    906   case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    907     return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
    908   case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    909     return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
    910   case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    911     return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
    912   default:
    913     return -1;
    914   }
    915 }
    916 
    917 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
    918                                            MachineBasicBlock &MBB,
    919                                            MachineBasicBlock::iterator MI,
    920                                            int Index, unsigned Lane,
    921                                            unsigned ValueReg, bool IsKill) {
    922   MachineFunction *MF = MBB.getParent();
    923   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    924   const SIInstrInfo *TII = ST.getInstrInfo();
    925 
    926   MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
    927 
    928   if (Reg == AMDGPU::NoRegister)
    929     return MachineInstrBuilder();
    930 
    931   bool IsStore = MI->mayStore();
    932   MachineRegisterInfo &MRI = MF->getRegInfo();
    933   auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
    934 
    935   unsigned Dst = IsStore ? Reg : ValueReg;
    936   unsigned Src = IsStore ? ValueReg : Reg;
    937   unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
    938                                                    : AMDGPU::V_ACCVGPR_READ_B32_e64;
    939 
    940   auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
    941                  .addReg(Src, getKillRegState(IsKill));
    942   MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    943   return MIB;
    944 }
    945 
    946 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
    947 // need to handle the case where an SGPR may need to be spilled while spilling.
    948 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
    949                                       MachineFrameInfo &MFI,
    950                                       MachineBasicBlock::iterator MI,
    951                                       int Index,
    952                                       int64_t Offset) {
    953   const SIInstrInfo *TII = ST.getInstrInfo();
    954   MachineBasicBlock *MBB = MI->getParent();
    955   const DebugLoc &DL = MI->getDebugLoc();
    956   bool IsStore = MI->mayStore();
    957 
    958   unsigned Opc = MI->getOpcode();
    959   int LoadStoreOp = IsStore ?
    960     getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
    961   if (LoadStoreOp == -1)
    962     return false;
    963 
    964   const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
    965   if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
    966     return true;
    967 
    968   MachineInstrBuilder NewMI =
    969       BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
    970           .add(*Reg)
    971           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
    972           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
    973           .addImm(Offset)
    974           .addImm(0) // cpol
    975           .addImm(0) // tfe
    976           .addImm(0) // swz
    977           .cloneMemRefs(*MI);
    978 
    979   const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
    980                                                        AMDGPU::OpName::vdata_in);
    981   if (VDataIn)
    982     NewMI.add(*VDataIn);
    983   return true;
    984 }
    985 
    986 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
    987                                           unsigned LoadStoreOp,
    988                                           unsigned EltSize) {
    989   bool IsStore = TII->get(LoadStoreOp).mayStore();
    990   bool UseST =
    991     AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
    992     AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0;
    993 
    994   switch (EltSize) {
    995   case 4:
    996     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
    997                           : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
    998     break;
    999   case 8:
   1000     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
   1001                           : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
   1002     break;
   1003   case 12:
   1004     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
   1005                           : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
   1006     break;
   1007   case 16:
   1008     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
   1009                           : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
   1010     break;
   1011   default:
   1012     llvm_unreachable("Unexpected spill load/store size!");
   1013   }
   1014 
   1015   if (UseST)
   1016     LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
   1017 
   1018   return LoadStoreOp;
   1019 }
   1020 
/// Emit, before \p MI, the instructions that spill \p ValueReg to (or reload
/// it from) the stack object \p Index.
///
/// The register is handled in EltSize-byte pieces. Each piece is first offered
/// lane by lane to spillVGPRtoAGPR; only the lanes it does not handle get a
/// memory instruction. If the required offset cannot be encoded in the
/// instruction, an SGPR is obtained (scavenged, or as a last resort the
/// scratch/stack-pointer offset register is temporarily adjusted) to hold it.
///
/// \param LoadStoreOp      Opcode of the memory instruction; decides load vs.
///                         store and MUBUF vs. flat scratch. It may be
///                         replaced internally by a wider or ST-mode
///                         flat-scratch opcode.
/// \param Index            Frame index of the spill slot.
/// \param ValueReg         Register being stored or loaded.
/// \param IsKill           Whether \p ValueReg is killed by the spill.
/// \param ScratchOffsetReg Register used as the scalar offset operand; may be
///                         no register.
/// \param InstOffset       Byte offset added to the frame object's offset.
/// \param MMO              Memory operand whose pointer info and flags are
///                         used to build the per-piece memory operands.
/// \param RS               Optional scavenger; required when spilling AGPRs
///                         through a temporary VGPR.
/// \param LiveRegs         Optional live-register set used instead of \p RS to
///                         find a free SGPR. At most one of \p RS and
///                         \p LiveRegs may be set.
void SIRegisterInfo::buildSpillLoadStore(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
    MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
    RegScavenger *RS, LivePhysRegs *LiveRegs) const {
  assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both");

  MachineFunction *MF = MBB.getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
  const DebugLoc &DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
  bool IsStore = Desc->mayStore();
  bool IsFlat = TII->isFLATScratch(LoadStoreOp);

  // Set once SOffset is a scavenged SGPR, so it can be marked killed on the
  // last access.
  bool Scavenged = false;
  MCRegister SOffset = ScratchOffsetReg;

  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
  const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC);
  // Register width in bytes.
  const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;

  // Always use 4 byte operations for AGPRs because we need to scavenge
  // a temporary VGPR.
  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  // Size covers the whole EltSize-byte pieces; RemSize is the tail left over
  // when RegWidth is not a multiple of EltSize, handled as one extra piece.
  unsigned NumSubRegs = RegWidth / EltSize;
  unsigned Size = NumSubRegs * EltSize;
  unsigned RemSize = RegWidth - Size;
  unsigned NumRemSubRegs = RemSize ? 1 : 0;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  // Largest offset checked for legality below: the start offset plus the
  // register width minus one element.
  int64_t MaxOffset = Offset + Size + RemSize - EltSize;
  // Non-zero when the offset was added directly to the scratch offset
  // register and has to be subtracted again at the end.
  int64_t ScratchOffsetRegDelta = 0;

  if (IsFlat && EltSize > 4) {
    LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    Desc = &TII->get(LoadStoreOp);
  }

  Align Alignment = MFI.getObjectAlign(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  assert((IsFlat || ((Offset % EltSize) == 0)) &&
         "unexpected VGPR spill offset");

  bool IsOffsetLegal =
      IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                      SIInstrFlags::FlatScratch)
             : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
  // Move the offset into a register when it cannot be encoded as an
  // immediate, or when a flat access has no scalar offset register and the
  // subtarget lacks flat-scratch ST mode.
  if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
    SOffset = MCRegister();

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
    if (!IsFlat)
      Offset *= ST.getWavefrontSize();

    // We don't have access to the register scavenger if this function is called
    // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case.
    if (RS) {
      SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
    } else if (LiveRegs) {
      for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
        if (LiveRegs->available(MF->getRegInfo(), Reg)) {
          SOffset = Reg;
          break;
        }
      }
    }

    if (!SOffset) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset or StackPtrOffset
      // register, and then subtract the offset after the spill to return the
      // register to its original value.
      if (!ScratchOffsetReg)
        ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    if (!SOffset)
      report_fatal_error("could not scavenge SGPR to spill in entry function");

    // Without a base register the offset is simply moved into SOffset;
    // otherwise SOffset becomes base + offset.
    if (ScratchOffsetReg == AMDGPU::NoRegister) {
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
    } else {
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
          .addReg(ScratchOffsetReg)
          .addImm(Offset);
    }

    // The whole offset now lives in SOffset.
    Offset = 0;
  }

  // A flat access without any scalar offset register uses the ST form of the
  // opcode.
  if (IsFlat && SOffset == AMDGPU::NoRegister) {
    assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
           && "Unexpected vaddr for flat scratch with a FI operand");

    assert(ST.hasFlatScratchSTMode());
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
    Desc = &TII->get(LoadStoreOp);
  }

  // Temporary VGPR used to shuttle AGPR contents; scavenged lazily and reused
  // for every piece.
  Register TmpReg;

  for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
       ++i, RegOffset += EltSize) {
    // The final iteration, if there is a remainder, uses the smaller size.
    if (i == NumSubRegs) {
      EltSize = RemSize;
      LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    }
    Desc = &TII->get(LoadStoreOp);

    unsigned NumRegs = EltSize / 4;
    Register SubReg = e == 1
            ? ValueReg
            : Register(getSubReg(ValueReg,
                                 getSubRegFromChannel(RegOffset / 4, NumRegs)));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    if (i + 1 == e) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    // Make sure the whole register is defined if there are undef components by
    // adding an implicit def of the super-reg on the first instruction.
    bool NeedSuperRegDef = e > 1 && IsStore && i == 0;
    bool NeedSuperRegImpOperand = e > 1;

    // First try to move each 32-bit lane of this piece with a register copy.
    // The loop stops at the first lane that has no such copy available.
    unsigned Lane = RegOffset / 4;
    unsigned LaneE = (RegOffset + EltSize) / 4;
    for ( ; Lane != LaneE; ++Lane) {
      bool IsSubReg = e > 1 || EltSize > 4;
      Register Sub = IsSubReg
             ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
             : ValueReg;
      auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
      if (!MIB.getInstr())
        break;
      if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) {
        MIB.addReg(ValueReg, RegState::ImplicitDefine);
        NeedSuperRegDef = false;
      }
      if (IsSubReg || NeedSuperRegImpOperand) {
        NeedSuperRegImpOperand = true;
        unsigned State = SrcDstRegState;
        // Only the copy for the last lane of the piece may carry Kill.
        if (Lane + 1 != LaneE)
          State &= ~RegState::Kill;
        MIB.addReg(ValueReg, RegState::Implicit | State);
      }
    }

    if (Lane == LaneE) // Fully spilled into AGPRs.
      continue;

    // Offset in bytes from the beginning of the ValueReg to its portion we
    // still need to spill. It may differ from RegOffset if a portion of
    // current SubReg has been already spilled into AGPRs by the loop above.
    unsigned RemRegOffset = Lane * 4;
    unsigned RemEltSize = EltSize - (RemRegOffset - RegOffset);
    if (RemEltSize != EltSize) { // Partially spilled to AGPRs
      assert(IsFlat && EltSize > 4);

      unsigned NumRegs = RemEltSize / 4;
      SubReg = Register(getSubReg(ValueReg,
                        getSubRegFromChannel(RemRegOffset / 4, NumRegs)));
      unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
      Desc = &TII->get(Opc);
    }

    // Remember the real register; for AGPRs SubReg is redirected to TmpReg
    // below and a reload is copied into FinalReg afterwards.
    unsigned FinalReg = SubReg;

    if (IsAGPR) {
      assert(EltSize == 4);

      if (!TmpReg) {
        assert(RS && "Needs to have RegScavenger to spill an AGPR!");
        // FIXME: change to scavengeRegisterBackwards()
        TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
        RS->setRegUsed(TmpReg);
      }
      // For a store, read the AGPR into the temporary VGPR first.
      if (IsStore) {
        auto AccRead = BuildMI(MBB, MI, DL,
                               TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg)
                           .addReg(SubReg, getKillRegState(IsKill));
        if (NeedSuperRegDef)
          AccRead.addReg(ValueReg, RegState::ImplicitDefine);
        AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
      }
      SubReg = TmpReg;
    }

    // Memory operand describing just the part accessed by this instruction.
    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RemRegOffset);
    MachineMemOperand *NewMMO =
        MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
                                 commonAlignment(Alignment, RemRegOffset));

    auto MIB =
        BuildMI(MBB, MI, DL, *Desc)
            .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
    // MUBUF accesses take the scratch resource descriptor.
    if (!IsFlat)
      MIB.addReg(FuncInfo->getScratchRSrcReg());

    // Scalar offset operand: a register if there is one; MUBUF takes an
    // immediate 0 otherwise, while flat scratch adds no operand here.
    if (SOffset == AMDGPU::NoRegister) {
      if (!IsFlat)
        MIB.addImm(0);
    } else {
      MIB.addReg(SOffset, SOffsetRegState);
    }
    MIB.addImm(Offset + RemRegOffset)
       .addImm(0); // cpol
    if (!IsFlat)
      MIB.addImm(0)  // tfe
         .addImm(0); // swz
    MIB.addMemOperand(NewMMO);

    if (!IsAGPR && NeedSuperRegDef)
      MIB.addReg(ValueReg, RegState::ImplicitDefine);

    // For a reload through the temporary VGPR, copy the loaded value into the
    // real destination register.
    if (!IsStore && TmpReg != AMDGPU::NoRegister) {
      MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
                    FinalReg)
                .addReg(TmpReg, RegState::Kill);
      MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    }

    if (NeedSuperRegImpOperand)
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
  }

  if (ScratchOffsetRegDelta != 0) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), SOffset)
        .addReg(SOffset)
        .addImm(ScratchOffsetRegDelta);
  }
}
   1270 
   1271 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
   1272                                              int Offset, bool IsLoad,
   1273                                              bool IsKill) const {
   1274   // Load/store VGPR
   1275   MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
   1276   assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
   1277 
   1278   Register FrameReg =
   1279       FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
   1280           ? getBaseRegister()
   1281           : getFrameRegister(SB.MF);
   1282 
   1283   Align Alignment = FrameInfo.getObjectAlign(Index);
   1284   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
   1285   MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
   1286       PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
   1287       SB.EltSize, Alignment);
   1288 
   1289   if (IsLoad) {
   1290     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
   1291                                           : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
   1292     buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, false, FrameReg,
   1293                         Offset * SB.EltSize, MMO, SB.RS);
   1294   } else {
   1295     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
   1296                                           : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
   1297     buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, IsKill, FrameReg,
   1298                         Offset * SB.EltSize, MMO, SB.RS);
   1299     // This only ever adds one VGPR spill
   1300     SB.MFI.addToSpilledVGPRs(1);
   1301   }
   1302 }
   1303 
   1304 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
   1305                                int Index,
   1306                                RegScavenger *RS,
   1307                                bool OnlyToVGPR) const {
   1308   SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
   1309 
   1310   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
   1311       SB.MFI.getSGPRToVGPRSpills(Index);
   1312   bool SpillToVGPR = !VGPRSpills.empty();
   1313   if (OnlyToVGPR && !SpillToVGPR)
   1314     return false;
   1315 
   1316   assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
   1317                          SB.SuperReg != SB.MFI.getFrameOffsetReg()));
   1318 
   1319   if (SpillToVGPR) {
   1320     for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
   1321       Register SubReg =
   1322           SB.NumSubRegs == 1
   1323               ? SB.SuperReg
   1324               : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
   1325       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
   1326 
   1327       bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1;
   1328 
   1329       // Mark the "old value of vgpr" input undef only if this is the first sgpr
   1330       // spill to this specific vgpr in the first basic block.
   1331       auto MIB = BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
   1332                          Spill.VGPR)
   1333                      .addReg(SubReg, getKillRegState(UseKill))
   1334                      .addImm(Spill.Lane)
   1335                      .addReg(Spill.VGPR);
   1336 
   1337       if (i == 0 && SB.NumSubRegs > 1) {
   1338         // We may be spilling a super-register which is only partially defined,
   1339         // and need to ensure later spills think the value is defined.
   1340         MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
   1341       }
   1342 
   1343       if (SB.NumSubRegs > 1)
   1344         MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
   1345 
   1346       // FIXME: Since this spills to another register instead of an actual
   1347       // frame index, we should delete the frame index when all references to
   1348       // it are fixed.
   1349     }
   1350   } else {
   1351     SB.prepare();
   1352 
   1353     // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
   1354     unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
   1355 
   1356     // Per VGPR helper data
   1357     auto PVD = SB.getPerVGPRData();
   1358 
   1359     for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
   1360       unsigned TmpVGPRFlags = RegState::Undef;
   1361 
   1362       // Write sub registers into the VGPR
   1363       for (unsigned i = Offset * PVD.PerVGPR,
   1364                     e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
   1365            i < e; ++i) {
   1366         Register SubReg =
   1367             SB.NumSubRegs == 1
   1368                 ? SB.SuperReg
   1369                 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
   1370 
   1371         MachineInstrBuilder WriteLane =
   1372             BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
   1373                     SB.TmpVGPR)
   1374                 .addReg(SubReg, SubKillState)
   1375                 .addImm(i % PVD.PerVGPR)
   1376                 .addReg(SB.TmpVGPR, TmpVGPRFlags);
   1377         TmpVGPRFlags = 0;
   1378 
   1379         // There could be undef components of a spilled super register.
   1380         // TODO: Can we detect this and skip the spill?
   1381         if (SB.NumSubRegs > 1) {
   1382           // The last implicit use of the SB.SuperReg carries the "Kill" flag.
   1383           unsigned SuperKillState = 0;
   1384           if (i + 1 == SB.NumSubRegs)
   1385             SuperKillState |= getKillRegState(SB.IsKill);
   1386           WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
   1387         }
   1388       }
   1389 
   1390       // Write out VGPR
   1391       SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
   1392     }
   1393 
   1394     SB.restore();
   1395   }
   1396 
   1397   MI->eraseFromParent();
   1398   SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
   1399   return true;
   1400 }
   1401 
   1402 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
   1403                                  int Index,
   1404                                  RegScavenger *RS,
   1405                                  bool OnlyToVGPR) const {
   1406   SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
   1407 
   1408   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
   1409       SB.MFI.getSGPRToVGPRSpills(Index);
   1410   bool SpillToVGPR = !VGPRSpills.empty();
   1411   if (OnlyToVGPR && !SpillToVGPR)
   1412     return false;
   1413 
   1414   if (SpillToVGPR) {
   1415     for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
   1416       Register SubReg =
   1417           SB.NumSubRegs == 1
   1418               ? SB.SuperReg
   1419               : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
   1420 
   1421       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
   1422       auto MIB =
   1423           BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
   1424               .addReg(Spill.VGPR)
   1425               .addImm(Spill.Lane);
   1426       if (SB.NumSubRegs > 1 && i == 0)
   1427         MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
   1428     }
   1429   } else {
   1430     SB.prepare();
   1431 
   1432     // Per VGPR helper data
   1433     auto PVD = SB.getPerVGPRData();
   1434 
   1435     for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
   1436       // Load in VGPR data
   1437       SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
   1438 
   1439       // Unpack lanes
   1440       for (unsigned i = Offset * PVD.PerVGPR,
   1441                     e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
   1442            i < e; ++i) {
   1443         Register SubReg =
   1444             SB.NumSubRegs == 1
   1445                 ? SB.SuperReg
   1446                 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
   1447 
   1448         bool LastSubReg = (i + 1 == e);
   1449         auto MIB = BuildMI(SB.MBB, MI, SB.DL,
   1450                            SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
   1451                        .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
   1452                        .addImm(i);
   1453         if (SB.NumSubRegs > 1 && i == 0)
   1454           MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
   1455       }
   1456     }
   1457 
   1458     SB.restore();
   1459   }
   1460 
   1461   MI->eraseFromParent();
   1462   return true;
   1463 }
   1464 
   1465 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
   1466 /// a VGPR and the stack slot can be safely eliminated when all other users are
   1467 /// handled.
   1468 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
   1469   MachineBasicBlock::iterator MI,
   1470   int FI,
   1471   RegScavenger *RS) const {
   1472   switch (MI->getOpcode()) {
   1473   case AMDGPU::SI_SPILL_S1024_SAVE:
   1474   case AMDGPU::SI_SPILL_S512_SAVE:
   1475   case AMDGPU::SI_SPILL_S256_SAVE:
   1476   case AMDGPU::SI_SPILL_S192_SAVE:
   1477   case AMDGPU::SI_SPILL_S160_SAVE:
   1478   case AMDGPU::SI_SPILL_S128_SAVE:
   1479   case AMDGPU::SI_SPILL_S96_SAVE:
   1480   case AMDGPU::SI_SPILL_S64_SAVE:
   1481   case AMDGPU::SI_SPILL_S32_SAVE:
   1482     return spillSGPR(MI, FI, RS, true);
   1483   case AMDGPU::SI_SPILL_S1024_RESTORE:
   1484   case AMDGPU::SI_SPILL_S512_RESTORE:
   1485   case AMDGPU::SI_SPILL_S256_RESTORE:
   1486   case AMDGPU::SI_SPILL_S192_RESTORE:
   1487   case AMDGPU::SI_SPILL_S160_RESTORE:
   1488   case AMDGPU::SI_SPILL_S128_RESTORE:
   1489   case AMDGPU::SI_SPILL_S96_RESTORE:
   1490   case AMDGPU::SI_SPILL_S64_RESTORE:
   1491   case AMDGPU::SI_SPILL_S32_RESTORE:
   1492     return restoreSGPR(MI, FI, RS, true);
   1493   default:
   1494     llvm_unreachable("not an SGPR spill instruction");
   1495   }
   1496 }
   1497 
   /// Replace the frame-index operand \p FIOperandNum of \p MI with a concrete
   /// register and/or immediate.
   ///
   /// SGPR spill/restore pseudos are forwarded to spillSGPR / restoreSGPR.
   /// VGPR and AGPR spill/restore pseudos are expanded with buildSpillLoadStore
   /// and the pseudo is erased. Any other instruction has its operand rewritten
   /// in place; temporaries are obtained from \p RS when a register is needed.
   ///
   /// \p SPAdj must be 0 (asserted below).
   1498 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   1499                                         int SPAdj, unsigned FIOperandNum,
   1500                                         RegScavenger *RS) const {
   1501   MachineFunction *MF = MI->getParent()->getParent();
   1502   MachineBasicBlock *MBB = MI->getParent();
   1503   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   1504   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
   1505   const SIInstrInfo *TII = ST.getInstrInfo();
   1506   DebugLoc DL = MI->getDebugLoc();
   1507 
   1508   assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
   1509 
   1510   MachineOperand &FIOp = MI->getOperand(FIOperandNum);
   1511   int Index = MI->getOperand(FIOperandNum).getIndex();
   1512 
          // Fixed objects are addressed through the base register when the function
          // has a base pointer; every other object uses the frame register.
   1513   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
   1514                           ? getBaseRegister()
   1515                           : getFrameRegister(*MF);
   1516 
   1517   switch (MI->getOpcode()) {
   1518     // SGPR register spill
   1519     case AMDGPU::SI_SPILL_S1024_SAVE:
   1520     case AMDGPU::SI_SPILL_S512_SAVE:
   1521     case AMDGPU::SI_SPILL_S256_SAVE:
   1522     case AMDGPU::SI_SPILL_S192_SAVE:
   1523     case AMDGPU::SI_SPILL_S160_SAVE:
   1524     case AMDGPU::SI_SPILL_S128_SAVE:
   1525     case AMDGPU::SI_SPILL_S96_SAVE:
   1526     case AMDGPU::SI_SPILL_S64_SAVE:
   1527     case AMDGPU::SI_SPILL_S32_SAVE: {
              // Called without the fourth argument; the result is not checked.
   1528       spillSGPR(MI, Index, RS);
   1529       break;
   1530     }
   1531 
   1532     // SGPR register restore
   1533     case AMDGPU::SI_SPILL_S1024_RESTORE:
   1534     case AMDGPU::SI_SPILL_S512_RESTORE:
   1535     case AMDGPU::SI_SPILL_S256_RESTORE:
   1536     case AMDGPU::SI_SPILL_S192_RESTORE:
   1537     case AMDGPU::SI_SPILL_S160_RESTORE:
   1538     case AMDGPU::SI_SPILL_S128_RESTORE:
   1539     case AMDGPU::SI_SPILL_S96_RESTORE:
   1540     case AMDGPU::SI_SPILL_S64_RESTORE:
   1541     case AMDGPU::SI_SPILL_S32_RESTORE: {
              // OnlyToVGPR is left at its default; the result is not checked.
   1542       restoreSGPR(MI, Index, RS);
   1543       break;
   1544     }
   1545 
   1546     // VGPR register spill
   1547     case AMDGPU::SI_SPILL_V1024_SAVE:
   1548     case AMDGPU::SI_SPILL_V512_SAVE:
   1549     case AMDGPU::SI_SPILL_V256_SAVE:
   1550     case AMDGPU::SI_SPILL_V192_SAVE:
   1551     case AMDGPU::SI_SPILL_V160_SAVE:
   1552     case AMDGPU::SI_SPILL_V128_SAVE:
   1553     case AMDGPU::SI_SPILL_V96_SAVE:
   1554     case AMDGPU::SI_SPILL_V64_SAVE:
   1555     case AMDGPU::SI_SPILL_V32_SAVE:
   1556     case AMDGPU::SI_SPILL_A1024_SAVE:
   1557     case AMDGPU::SI_SPILL_A512_SAVE:
   1558     case AMDGPU::SI_SPILL_A256_SAVE:
   1559     case AMDGPU::SI_SPILL_A192_SAVE:
   1560     case AMDGPU::SI_SPILL_A160_SAVE:
   1561     case AMDGPU::SI_SPILL_A128_SAVE:
   1562     case AMDGPU::SI_SPILL_A96_SAVE:
   1563     case AMDGPU::SI_SPILL_A64_SAVE:
   1564     case AMDGPU::SI_SPILL_A32_SAVE: {
   1565       const MachineOperand *VData = TII->getNamedOperand(*MI,
   1566                                                          AMDGPU::OpName::vdata);
   1567       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
   1568              MFI->getStackPtrOffsetReg());
   1569 
              // Dword store opcode handed to buildSpillLoadStore: the scratch form
              // when the subtarget enables flat scratch, the buffer form otherwise.
   1570       unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
   1571                                             : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
              // NOTE(review): shadows the function-level MBB, which already holds
              // MI->getParent().
   1572       auto *MBB = MI->getParent();
   1573       buildSpillLoadStore(
   1574           *MBB, MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
   1575           TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
   1576           *MI->memoperands_begin(), RS);
              // Account for the spilled sub-registers in MFI, then drop the pseudo.
   1577       MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
   1578       MI->eraseFromParent();
   1579       break;
   1580     }
            // VGPR / AGPR register restore
   1581     case AMDGPU::SI_SPILL_V32_RESTORE:
   1582     case AMDGPU::SI_SPILL_V64_RESTORE:
   1583     case AMDGPU::SI_SPILL_V96_RESTORE:
   1584     case AMDGPU::SI_SPILL_V128_RESTORE:
   1585     case AMDGPU::SI_SPILL_V160_RESTORE:
   1586     case AMDGPU::SI_SPILL_V192_RESTORE:
   1587     case AMDGPU::SI_SPILL_V256_RESTORE:
   1588     case AMDGPU::SI_SPILL_V512_RESTORE:
   1589     case AMDGPU::SI_SPILL_V1024_RESTORE:
   1590     case AMDGPU::SI_SPILL_A32_RESTORE:
   1591     case AMDGPU::SI_SPILL_A64_RESTORE:
   1592     case AMDGPU::SI_SPILL_A96_RESTORE:
   1593     case AMDGPU::SI_SPILL_A128_RESTORE:
   1594     case AMDGPU::SI_SPILL_A160_RESTORE:
   1595     case AMDGPU::SI_SPILL_A192_RESTORE:
   1596     case AMDGPU::SI_SPILL_A256_RESTORE:
   1597     case AMDGPU::SI_SPILL_A512_RESTORE:
   1598     case AMDGPU::SI_SPILL_A1024_RESTORE: {
   1599       const MachineOperand *VData = TII->getNamedOperand(*MI,
   1600                                                          AMDGPU::OpName::vdata);
   1601       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
   1602              MFI->getStackPtrOffsetReg());
   1603 
              // Load counterpart of the opcode selection in the spill case above.
   1604       unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
   1605                                             : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
              // NOTE(review): shadows the function-level MBB (same value).
   1606       auto *MBB = MI->getParent();
   1607       buildSpillLoadStore(
   1608           *MBB, MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
   1609           TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
   1610           *MI->memoperands_begin(), RS);
   1611       MI->eraseFromParent();
   1612       break;
   1613     }
   1614 
   1615     default: {
   1616       // Other access to frame index
              // NOTE(review): this DL shadows the function-level DL copy.
   1617       const DebugLoc &DL = MI->getDebugLoc();
   1618 
   1619       int64_t Offset = FrameInfo.getObjectOffset(Index);
              // Flat-scratch lowering. Every path through this block returns (or
              // reports a fatal error), so the code after it is non-flat-scratch
              // only.
   1620       if (ST.enableFlatScratch()) {
   1621         if (TII->isFLATScratch(*MI)) {
   1622           assert((int16_t)FIOperandNum ==
   1623                  AMDGPU::getNamedOperandIdx(MI->getOpcode(),
   1624                                             AMDGPU::OpName::saddr));
   1625 
   1626           // The offset is always swizzled, just replace it
   1627           if (FrameReg)
   1628             FIOp.ChangeToRegister(FrameReg, false);
   1629 
                  // No object offset to fold into the instruction.
   1630           if (!Offset)
   1631             return;
   1632 
   1633           MachineOperand *OffsetOp =
   1634             TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
   1635           int64_t NewOffset = Offset + OffsetOp->getImm();
   1636           if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
   1637                                      SIInstrFlags::FlatScratch)) {
                    // The object offset now lives in the immediate field. With a
                    // frame register already in saddr nothing else is needed;
                    // without one, no offset remains to be materialized.
   1638             OffsetOp->setImm(NewOffset);
   1639             if (FrameReg)
   1640               return;
   1641             Offset = 0;
   1642           }
   1643 
   1644           assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) &&
   1645                  "Unexpected vaddr for flat scratch with a FI operand");
   1646 
   1647           // On GFX10 we have ST mode to use no registers for an address.
   1648           // Otherwise we need to materialize 0 into an SGPR.
   1649           if (!Offset && ST.hasFlatScratchSTMode()) {
   1650             unsigned Opc = MI->getOpcode();
   1651             unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
   1652             MI->RemoveOperand(
   1653                 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
   1654             MI->setDesc(TII->get(NewOpc));
   1655             return;
   1656           }
   1657         }
   1658 
                // Without a frame register, try the plain immediate first.
   1659         if (!FrameReg) {
   1660           FIOp.ChangeToImmediate(Offset);
   1661           if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
   1662             return;
   1663         }
   1664 
   1665         // We need to use register here. Check if we can use an SGPR or need
   1666         // a VGPR.
                // M0 is installed only so isOperandLegal can be asked about an SGPR
                // operand; every continuing path below overwrites the register.
   1667         FIOp.ChangeToRegister(AMDGPU::M0, false);
   1668         bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
   1669 
                // No offset and an SGPR is acceptable: use FrameReg directly.
   1670         if (!Offset && FrameReg && UseSGPR) {
   1671           FIOp.setReg(FrameReg);
   1672           return;
   1673         }
   1674 
   1675         const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
   1676                                                 : &AMDGPU::VGPR_32RegClass;
   1677 
   1678         Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
   1679         FIOp.setReg(TmpReg);
   1680         FIOp.setIsKill(true);
   1681 
                // Only one of FrameReg / Offset contributes here, so a single move
                // into the scavenged register is enough.
   1682         if ((!FrameReg || !Offset) && TmpReg) {
   1683           unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
   1684           auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
   1685           if (FrameReg)
   1686             MIB.addReg(FrameReg);
   1687           else
   1688             MIB.addImm(Offset);
   1689 
   1690           return;
   1691         }
   1692 
                // The S_ADD_U32 below needs an SGPR destination: reuse TmpReg when it
                // already is an SGPR, otherwise scavenge a separate one.
   1693         Register TmpSReg =
   1694             UseSGPR ? TmpReg
   1695                     : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
   1696                                            !UseSGPR);
   1697 
   1698         // TODO: for flat scratch another attempt can be made with a VGPR index
   1699         //       if no SGPRs can be scavenged.
   1700         if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
   1701           report_fatal_error("Cannot scavenge register in FI elimination!");
   1702 
   1703         if (!TmpSReg) {
   1704           // Use frame register and restore it after.
   1705           TmpSReg = FrameReg;
   1706           FIOp.setReg(FrameReg);
   1707           FIOp.setIsKill(false);
   1708         }
   1709 
   1710         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg)
   1711           .addReg(FrameReg)
   1712           .addImm(Offset);
   1713 
                // When the operand must be a VGPR, copy the SGPR sum across.
   1714         if (!UseSGPR)
   1715           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
   1716             .addReg(TmpSReg, RegState::Kill);
   1717 
   1718         if (TmpSReg == FrameReg) {
   1719           // Undo frame register modification.
                  // Inserted at std::next(MI), i.e. after MI, whereas the add above
                  // is inserted before it.
   1720           BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_SUB_U32),
   1721                   FrameReg)
   1722             .addReg(FrameReg)
   1723             .addImm(Offset);
   1724         }
   1725 
   1726         return;
   1727       }
   1728 
   1729       bool IsMUBUF = TII->isMUBUF(*MI);
   1730 
   1731       if (!IsMUBUF && !MFI->isEntryFunction()) {
   1732         // Convert to a swizzled stack address by scaling by the wave size.
   1733         //
   1734         // In an entry function/kernel the offset is already swizzled.
   1735 
                // A plain V_MOV_B32 is replaced outright: the address is written into
                // its destination instead of a scavenged VGPR.
   1736         bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
   1737         Register ResultReg =
   1738             IsCopy ? MI->getOperand(0).getReg()
   1739                    : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
   1740 
                // NOTE(review): shadows the case-level Offset, which was computed by
                // the identical getObjectOffset(Index) call.
   1741         int64_t Offset = FrameInfo.getObjectOffset(Index);
   1742         if (Offset == 0) {
   1743           // XXX - This never happens because of emergency scavenging slot at 0?
   1744           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
   1745             .addImm(ST.getWavefrontSizeLog2())
   1746             .addReg(FrameReg);
   1747         } else {
                  // A non-null builder means an add instruction is available; the
                  // else branch handles the case where none could be created.
   1748           if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
   1749             // Reuse ResultReg in intermediate step.
   1750             Register ScaledReg = ResultReg;
   1751 
                    // Emitted in front of the add returned by getAddNoCarry.
   1752             BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
   1753                     ScaledReg)
   1754               .addImm(ST.getWavefrontSizeLog2())
   1755               .addReg(FrameReg);
   1756 
   1757             const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
   1758 
   1759             // TODO: Fold if use instruction is another add of a constant.
   1760             if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
   1761               // FIXME: This can fail
   1762               MIB.addImm(Offset);
   1763               MIB.addReg(ScaledReg, RegState::Kill);
   1764               if (!IsVOP2)
   1765                 MIB.addImm(0); // clamp bit
   1766             } else {
   1767               assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
   1768                      "Need to reuse carry out register");
   1769 
   1770               // Use scavenged unused carry out as offset register.
                      // Operand 1 of the add is used as the carry-out register; when
                      // not in wave32 only its sub0 half is taken.
   1771               Register ConstOffsetReg;
   1772               if (!isWave32)
   1773                 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
   1774               else
   1775                 ConstOffsetReg = MIB.getReg(1);
   1776 
   1777               BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
   1778                 .addImm(Offset);
   1779               MIB.addReg(ConstOffsetReg, RegState::Kill);
   1780               MIB.addReg(ScaledReg, RegState::Kill);
   1781               MIB.addImm(0); // clamp bit
   1782             }
   1783           } else {
   1784             // We have to produce a carry out, and there isn't a free SGPR pair
   1785             // for it. We can keep the whole computation on the SALU to avoid
   1786             // clobbering an additional register at the cost of an extra mov.
   1787 
   1788             // We may have 1 free scratch SGPR even though a carry out is
   1789             // unavailable. Only one additional mov is needed.
   1790             Register TmpScaledReg =
   1791                 RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
                    // Fall back to computing in FrameReg itself when nothing could be
                    // scavenged; it is restored by the undo sequence below.
   1792             Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
   1793 
   1794             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
   1795               .addReg(FrameReg)
   1796               .addImm(ST.getWavefrontSizeLog2());
   1797             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
   1798               .addReg(ScaledReg, RegState::Kill)
   1799               .addImm(Offset);
   1800             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
   1801               .addReg(ScaledReg, RegState::Kill);
   1802 
   1803             // If there were truly no free SGPRs, we need to undo everything.
                    // NOTE(review): in this case ScaledReg == FrameReg, and it is read
                    // again here after the COPY above marked it killed - confirm the
                    // kill flag is intended.
   1804             if (!TmpScaledReg.isValid()) {
   1805               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
   1806                 .addReg(ScaledReg, RegState::Kill)
   1807                 .addImm(Offset);
   1808               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
   1809                 .addReg(FrameReg)
   1810                 .addImm(ST.getWavefrontSizeLog2());
   1811             }
   1812           }
   1813         }
   1814 
   1815         // Don't introduce an extra copy if we're just materializing in a mov.
   1816         if (IsCopy)
   1817           MI->eraseFromParent();
   1818         else
   1819           FIOp.ChangeToRegister(ResultReg, false, false, true);
   1820         return;
   1821       }
   1822 
   1823       if (IsMUBUF) {
   1824         // Disable offen so we don't need a 0 vgpr base.
   1825         assert(static_cast<int>(FIOperandNum) ==
   1826                AMDGPU::getNamedOperandIdx(MI->getOpcode(),
   1827                                           AMDGPU::OpName::vaddr));
   1828 
   1829         auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
   1830         assert((SOffset.isImm() && SOffset.getImm() == 0));
   1831 
                // soffset is switched to FrameReg before the rewrite below is
                // attempted, so it stays changed even if that rewrite fails.
   1832         if (FrameReg != AMDGPU::NoRegister)
   1833           SOffset.ChangeToRegister(FrameReg, false);
   1834 
                // NOTE(review): shadows the case-level Offset (same value).
   1835         int64_t Offset = FrameInfo.getObjectOffset(Index);
   1836         int64_t OldImm
   1837           = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
   1838         int64_t NewOffset = OldImm + Offset;
   1839 
                // On success buildMUBUFOffsetLoadStore has emitted the replacement,
                // so the original instruction is erased.
   1840         if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
   1841             buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
   1842           MI->eraseFromParent();
   1843           return;
   1844         }
   1845       }
   1846 
   1847       // If the offset is simply too big, don't convert to a scratch wave offset
   1848       // relative index.
   1849 
              // Reached by MUBUF instructions that were not rewritten above and by
              // non-MUBUF instructions in entry functions: use the raw object offset
              // as an immediate, or move it into a scavenged VGPR if that is illegal.
   1850       FIOp.ChangeToImmediate(Offset);
   1851       if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
   1852         Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
   1853         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
   1854           .addImm(Offset);
   1855         FIOp.ChangeToRegister(TmpReg, false, false, true);
   1856       }
   1857     }
   1858   }
   1859 }
   1860 
   1861 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
   1862   return AMDGPUInstPrinter::getRegisterName(Reg);
   1863 }
   1864 
   1865 static const TargetRegisterClass *
   1866 getAnyVGPRClassForBitWidth(unsigned BitWidth) {
   1867   if (BitWidth <= 64)
   1868     return &AMDGPU::VReg_64RegClass;
   1869   if (BitWidth <= 96)
   1870     return &AMDGPU::VReg_96RegClass;
   1871   if (BitWidth <= 128)
   1872     return &AMDGPU::VReg_128RegClass;
   1873   if (BitWidth <= 160)
   1874     return &AMDGPU::VReg_160RegClass;
   1875   if (BitWidth <= 192)
   1876     return &AMDGPU::VReg_192RegClass;
   1877   if (BitWidth <= 256)
   1878     return &AMDGPU::VReg_256RegClass;
   1879   if (BitWidth <= 512)
   1880     return &AMDGPU::VReg_512RegClass;
   1881   if (BitWidth <= 1024)
   1882     return &AMDGPU::VReg_1024RegClass;
   1883 
   1884   return nullptr;
   1885 }
   1886 
   1887 static const TargetRegisterClass *
   1888 getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
   1889   if (BitWidth <= 64)
   1890     return &AMDGPU::VReg_64_Align2RegClass;
   1891   if (BitWidth <= 96)
   1892     return &AMDGPU::VReg_96_Align2RegClass;
   1893   if (BitWidth <= 128)
   1894     return &AMDGPU::VReg_128_Align2RegClass;
   1895   if (BitWidth <= 160)
   1896     return &AMDGPU::VReg_160_Align2RegClass;
   1897   if (BitWidth <= 192)
   1898     return &AMDGPU::VReg_192_Align2RegClass;
   1899   if (BitWidth <= 256)
   1900     return &AMDGPU::VReg_256_Align2RegClass;
   1901   if (BitWidth <= 512)
   1902     return &AMDGPU::VReg_512_Align2RegClass;
   1903   if (BitWidth <= 1024)
   1904     return &AMDGPU::VReg_1024_Align2RegClass;
   1905 
   1906   return nullptr;
   1907 }
   1908 
   1909 const TargetRegisterClass *
   1910 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
   1911   if (BitWidth == 1)
   1912     return &AMDGPU::VReg_1RegClass;
   1913   if (BitWidth <= 16)
   1914     return &AMDGPU::VGPR_LO16RegClass;
   1915   if (BitWidth <= 32)
   1916     return &AMDGPU::VGPR_32RegClass;
   1917   return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
   1918                                 : getAnyVGPRClassForBitWidth(BitWidth);
   1919 }
   1920 
   1921 static const TargetRegisterClass *
   1922 getAnyAGPRClassForBitWidth(unsigned BitWidth) {
   1923   if (BitWidth <= 64)
   1924     return &AMDGPU::AReg_64RegClass;
   1925   if (BitWidth <= 96)
   1926     return &AMDGPU::AReg_96RegClass;
   1927   if (BitWidth <= 128)
   1928     return &AMDGPU::AReg_128RegClass;
   1929   if (BitWidth <= 160)
   1930     return &AMDGPU::AReg_160RegClass;
   1931   if (BitWidth <= 192)
   1932     return &AMDGPU::AReg_192RegClass;
   1933   if (BitWidth <= 256)
   1934     return &AMDGPU::AReg_256RegClass;
   1935   if (BitWidth <= 512)
   1936     return &AMDGPU::AReg_512RegClass;
   1937   if (BitWidth <= 1024)
   1938     return &AMDGPU::AReg_1024RegClass;
   1939 
   1940   return nullptr;
   1941 }
   1942 
   1943 static const TargetRegisterClass *
   1944 getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
   1945   if (BitWidth <= 64)
   1946     return &AMDGPU::AReg_64_Align2RegClass;
   1947   if (BitWidth <= 96)
   1948     return &AMDGPU::AReg_96_Align2RegClass;
   1949   if (BitWidth <= 128)
   1950     return &AMDGPU::AReg_128_Align2RegClass;
   1951   if (BitWidth <= 160)
   1952     return &AMDGPU::AReg_160_Align2RegClass;
   1953   if (BitWidth <= 192)
   1954     return &AMDGPU::AReg_192_Align2RegClass;
   1955   if (BitWidth <= 256)
   1956     return &AMDGPU::AReg_256_Align2RegClass;
   1957   if (BitWidth <= 512)
   1958     return &AMDGPU::AReg_512_Align2RegClass;
   1959   if (BitWidth <= 1024)
   1960     return &AMDGPU::AReg_1024_Align2RegClass;
   1961 
   1962   return nullptr;
   1963 }
   1964 
   1965 const TargetRegisterClass *
   1966 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
   1967   if (BitWidth <= 16)
   1968     return &AMDGPU::AGPR_LO16RegClass;
   1969   if (BitWidth <= 32)
   1970     return &AMDGPU::AGPR_32RegClass;
   1971   return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
   1972                                 : getAnyAGPRClassForBitWidth(BitWidth);
   1973 }
   1974 
   1975 const TargetRegisterClass *
   1976 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
   1977   if (BitWidth <= 16)
   1978     return &AMDGPU::SGPR_LO16RegClass;
   1979   if (BitWidth <= 32)
   1980     return &AMDGPU::SReg_32RegClass;
   1981   if (BitWidth <= 64)
   1982     return &AMDGPU::SReg_64RegClass;
   1983   if (BitWidth <= 96)
   1984     return &AMDGPU::SGPR_96RegClass;
   1985   if (BitWidth <= 128)
   1986     return &AMDGPU::SGPR_128RegClass;
   1987   if (BitWidth <= 160)
   1988     return &AMDGPU::SGPR_160RegClass;
   1989   if (BitWidth <= 192)
   1990     return &AMDGPU::SGPR_192RegClass;
   1991   if (BitWidth <= 256)
   1992     return &AMDGPU::SGPR_256RegClass;
   1993   if (BitWidth <= 512)
   1994     return &AMDGPU::SGPR_512RegClass;
   1995   if (BitWidth <= 1024)
   1996     return &AMDGPU::SGPR_1024RegClass;
   1997 
   1998   return nullptr;
   1999 }
   2000 
   2001 // FIXME: This is very slow. It might be worth creating a map from physreg to
   2002 // register class.
   2003 const TargetRegisterClass *
   2004 SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
   2005   static const TargetRegisterClass *const BaseClasses[] = {
   2006     &AMDGPU::VGPR_LO16RegClass,
   2007     &AMDGPU::VGPR_HI16RegClass,
   2008     &AMDGPU::SReg_LO16RegClass,
   2009     &AMDGPU::AGPR_LO16RegClass,
   2010     &AMDGPU::VGPR_32RegClass,
   2011     &AMDGPU::SReg_32RegClass,
   2012     &AMDGPU::AGPR_32RegClass,
   2013     &AMDGPU::AGPR_32RegClass,
   2014     &AMDGPU::VReg_64_Align2RegClass,
   2015     &AMDGPU::VReg_64RegClass,
   2016     &AMDGPU::SReg_64RegClass,
   2017     &AMDGPU::AReg_64_Align2RegClass,
   2018     &AMDGPU::AReg_64RegClass,
   2019     &AMDGPU::VReg_96_Align2RegClass,
   2020     &AMDGPU::VReg_96RegClass,
   2021     &AMDGPU::SReg_96RegClass,
   2022     &AMDGPU::AReg_96_Align2RegClass,
   2023     &AMDGPU::AReg_96RegClass,
   2024     &AMDGPU::VReg_128_Align2RegClass,
   2025     &AMDGPU::VReg_128RegClass,
   2026     &AMDGPU::SReg_128RegClass,
   2027     &AMDGPU::AReg_128_Align2RegClass,
   2028     &AMDGPU::AReg_128RegClass,
   2029     &AMDGPU::VReg_160_Align2RegClass,
   2030     &AMDGPU::VReg_160RegClass,
   2031     &AMDGPU::SReg_160RegClass,
   2032     &AMDGPU::AReg_160_Align2RegClass,
   2033     &AMDGPU::AReg_160RegClass,
   2034     &AMDGPU::VReg_192_Align2RegClass,
   2035     &AMDGPU::VReg_192RegClass,
   2036     &AMDGPU::SReg_192RegClass,
   2037     &AMDGPU::AReg_192_Align2RegClass,
   2038     &AMDGPU::AReg_192RegClass,
   2039     &AMDGPU::VReg_256_Align2RegClass,
   2040     &AMDGPU::VReg_256RegClass,
   2041     &AMDGPU::SReg_256RegClass,
   2042     &AMDGPU::AReg_256_Align2RegClass,
   2043     &AMDGPU::AReg_256RegClass,
   2044     &AMDGPU::VReg_512_Align2RegClass,
   2045     &AMDGPU::VReg_512RegClass,
   2046     &AMDGPU::SReg_512RegClass,
   2047     &AMDGPU::AReg_512_Align2RegClass,
   2048     &AMDGPU::AReg_512RegClass,
   2049     &AMDGPU::SReg_1024RegClass,
   2050     &AMDGPU::VReg_1024_Align2RegClass,
   2051     &AMDGPU::VReg_1024RegClass,
   2052     &AMDGPU::AReg_1024_Align2RegClass,
   2053     &AMDGPU::AReg_1024RegClass,
   2054     &AMDGPU::SCC_CLASSRegClass,
   2055     &AMDGPU::Pseudo_SReg_32RegClass,
   2056     &AMDGPU::Pseudo_SReg_128RegClass,
   2057   };
   2058 
   2059   for (const TargetRegisterClass *BaseClass : BaseClasses) {
   2060     if (BaseClass->contains(Reg)) {
   2061       return BaseClass;
   2062     }
   2063   }
   2064   return nullptr;
   2065 }
   2066 
   2067 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
   2068                                Register Reg) const {
   2069   const TargetRegisterClass *RC;
   2070   if (Reg.isVirtual())
   2071     RC = MRI.getRegClass(Reg);
   2072   else
   2073     RC = getPhysRegClass(Reg);
   2074   return isSGPRClass(RC);
   2075 }
   2076 
   2077 // TODO: It might be helpful to have some target specific flags in
   2078 // TargetRegisterClass to mark which classes are VGPRs to make this trivial.
   2079 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
   2080   unsigned Size = getRegSizeInBits(*RC);
   2081   if (Size == 16) {
   2082     return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
   2083            getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;
   2084   }
   2085   const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
   2086   if (!VRC) {
   2087     assert(Size < 32 && "Invalid register class size");
   2088     return false;
   2089   }
   2090   return getCommonSubClass(VRC, RC) != nullptr;
   2091 }
   2092 
   2093 bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
   2094   unsigned Size = getRegSizeInBits(*RC);
   2095   if (Size < 16)
   2096     return false;
   2097   const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
   2098   if (!ARC) {
   2099     assert(getVGPRClassForBitWidth(Size) && "Invalid register class size");
   2100     return false;
   2101   }
   2102   return getCommonSubClass(ARC, RC) != nullptr;
   2103 }
   2104 
   2105 const TargetRegisterClass *
   2106 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
   2107   unsigned Size = getRegSizeInBits(*SRC);
   2108   const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
   2109   assert(VRC && "Invalid register class size");
   2110   return VRC;
   2111 }
   2112 
   2113 const TargetRegisterClass *
   2114 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
   2115   unsigned Size = getRegSizeInBits(*SRC);
   2116   const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
   2117   assert(ARC && "Invalid register class size");
   2118   return ARC;
   2119 }
   2120 
   2121 const TargetRegisterClass *
   2122 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
   2123   unsigned Size = getRegSizeInBits(*VRC);
   2124   if (Size == 32)
   2125     return &AMDGPU::SGPR_32RegClass;
   2126   const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
   2127   assert(SRC && "Invalid register class size");
   2128   return SRC;
   2129 }
   2130 
   2131 const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
   2132                          const TargetRegisterClass *RC, unsigned SubIdx) const {
   2133   if (SubIdx == AMDGPU::NoSubRegister)
   2134     return RC;
   2135 
   2136   // We can assume that each lane corresponds to one 32-bit register.
   2137   unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32;
   2138   if (isSGPRClass(RC)) {
   2139     if (Size == 32)
   2140       RC = &AMDGPU::SGPR_32RegClass;
   2141     else
   2142       RC = getSGPRClassForBitWidth(Size);
   2143   } else if (hasAGPRs(RC)) {
   2144     RC = getAGPRClassForBitWidth(Size);
   2145   } else {
   2146     RC = getVGPRClassForBitWidth(Size);
   2147   }
   2148   assert(RC && "Invalid sub-register class size");
   2149   return RC;
   2150 }
   2151 
   2152 const TargetRegisterClass *
   2153 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
   2154                                          const TargetRegisterClass *SubRC,
   2155                                          unsigned SubIdx) const {
   2156   // Ensure this subregister index is aligned in the super register.
   2157   const TargetRegisterClass *MatchRC =
   2158       getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
   2159   return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
   2160 }
   2161 
   2162 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
   2163   if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
   2164       OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
   2165     return !ST.hasMFMAInlineLiteralBug();
   2166 
   2167   return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
   2168          OpType <= AMDGPU::OPERAND_SRC_LAST;
   2169 }
   2170 
   2171 bool SIRegisterInfo::shouldRewriteCopySrc(
   2172   const TargetRegisterClass *DefRC,
   2173   unsigned DefSubReg,
   2174   const TargetRegisterClass *SrcRC,
   2175   unsigned SrcSubReg) const {
   2176   // We want to prefer the smallest register class possible, so we don't want to
   2177   // stop and rewrite on anything that looks like a subregister
   2178   // extract. Operations mostly don't care about the super register class, so we
   2179   // only want to stop on the most basic of copies between the same register
   2180   // class.
   2181   //
   2182   // e.g. if we have something like
   2183   // %0 = ...
   2184   // %1 = ...
   2185   // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
   2186   // %3 = COPY %2, sub0
   2187   //
   2188   // We want to look through the COPY to find:
   2189   //  => %3 = COPY %0
   2190 
   2191   // Plain copy.
   2192   return getCommonSubClass(DefRC, SrcRC) != nullptr;
   2193 }
   2194 
   2195 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
   2196   // TODO: 64-bit operands have extending behavior from 32-bit literal.
   2197   return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
   2198          OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
   2199 }
   2200 
   2201 /// Returns a lowest register that is not used at any point in the function.
   2202 ///        If all registers are used, then this function will return
   2203 ///         AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return
   2204 ///         highest unused register.
   2205 MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
   2206                                               const TargetRegisterClass *RC,
   2207                                               const MachineFunction &MF,
   2208                                               bool ReserveHighestVGPR) const {
   2209   if (ReserveHighestVGPR) {
   2210     for (MCRegister Reg : reverse(*RC))
   2211       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
   2212         return Reg;
   2213   } else {
   2214     for (MCRegister Reg : *RC)
   2215       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
   2216         return Reg;
   2217   }
   2218   return MCRegister();
   2219 }
   2220 
   2221 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
   2222                                                    unsigned EltSize) const {
   2223   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
   2224   assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
   2225 
   2226   const unsigned RegDWORDs = RegBitWidth / 32;
   2227   const unsigned EltDWORDs = EltSize / 4;
   2228   assert(RegSplitParts.size() + 1 >= EltDWORDs);
   2229 
   2230   const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
   2231   const unsigned NumParts = RegDWORDs / EltDWORDs;
   2232 
   2233   return makeArrayRef(Parts.data(), NumParts);
   2234 }
   2235 
   2236 const TargetRegisterClass*
   2237 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
   2238                                   Register Reg) const {
   2239   return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
   2240 }
   2241 
   2242 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
   2243                             Register Reg) const {
   2244   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
   2245   // Registers without classes are unaddressable, SGPR-like registers.
   2246   return RC && hasVGPRs(RC);
   2247 }
   2248 
   2249 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
   2250                             Register Reg) const {
   2251   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
   2252 
   2253   // Registers without classes are unaddressable, SGPR-like registers.
   2254   return RC && hasAGPRs(RC);
   2255 }
   2256 
   2257 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
   2258                                     const TargetRegisterClass *SrcRC,
   2259                                     unsigned SubReg,
   2260                                     const TargetRegisterClass *DstRC,
   2261                                     unsigned DstSubReg,
   2262                                     const TargetRegisterClass *NewRC,
   2263                                     LiveIntervals &LIS) const {
   2264   unsigned SrcSize = getRegSizeInBits(*SrcRC);
   2265   unsigned DstSize = getRegSizeInBits(*DstRC);
   2266   unsigned NewSize = getRegSizeInBits(*NewRC);
   2267 
   2268   // Do not increase size of registers beyond dword, we would need to allocate
   2269   // adjacent registers and constraint regalloc more than needed.
   2270 
   2271   // Always allow dword coalescing.
   2272   if (SrcSize <= 32 || DstSize <= 32)
   2273     return true;
   2274 
   2275   return NewSize <= DstSize || NewSize <= SrcSize;
   2276 }
   2277 
   2278 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   2279                                              MachineFunction &MF) const {
   2280   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   2281 
   2282   unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
   2283                                                        MF.getFunction());
   2284   switch (RC->getID()) {
   2285   default:
   2286     return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
   2287   case AMDGPU::VGPR_32RegClassID:
   2288   case AMDGPU::VGPR_LO16RegClassID:
   2289   case AMDGPU::VGPR_HI16RegClassID:
   2290     return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
   2291   case AMDGPU::SGPR_32RegClassID:
   2292   case AMDGPU::SGPR_LO16RegClassID:
   2293     return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
   2294   }
   2295 }
   2296 
   2297 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
   2298                                                 unsigned Idx) const {
   2299   if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
   2300       Idx == AMDGPU::RegisterPressureSets::AGPR_32)
   2301     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
   2302                                const_cast<MachineFunction &>(MF));
   2303 
   2304   if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
   2305     return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
   2306                                const_cast<MachineFunction &>(MF));
   2307 
   2308   llvm_unreachable("Unexpected register pressure set!");
   2309 }
   2310 
   2311 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
   2312   static const int Empty[] = { -1 };
   2313 
   2314   if (RegPressureIgnoredUnits[RegUnit])
   2315     return Empty;
   2316 
   2317   return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
   2318 }
   2319 
   2320 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
   2321   // Not a callee saved register.
   2322   return AMDGPU::SGPR30_SGPR31;
   2323 }
   2324 
   2325 const TargetRegisterClass *
   2326 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
   2327                                          const RegisterBank &RB,
   2328                                          const MachineRegisterInfo &MRI) const {
   2329   switch (RB.getID()) {
   2330   case AMDGPU::VGPRRegBankID:
   2331     return getVGPRClassForBitWidth(std::max(32u, Size));
   2332   case AMDGPU::VCCRegBankID:
   2333     assert(Size == 1);
   2334     return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
   2335                     : &AMDGPU::SReg_64_XEXECRegClass;
   2336   case AMDGPU::SGPRRegBankID:
   2337     return getSGPRClassForBitWidth(std::max(32u, Size));
   2338   case AMDGPU::AGPRRegBankID:
   2339     return getAGPRClassForBitWidth(std::max(32u, Size));
   2340   default:
   2341     llvm_unreachable("unknown register bank");
   2342   }
   2343 }
   2344 
   2345 const TargetRegisterClass *
   2346 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
   2347                                          const MachineRegisterInfo &MRI) const {
   2348   const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
   2349   if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
   2350     return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
   2351 
   2352   const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
   2353   return getAllocatableClass(RC);
   2354 }
   2355 
   2356 MCRegister SIRegisterInfo::getVCC() const {
   2357   return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
   2358 }
   2359 
   2360 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
   2361   // VGPR tuples have an alignment requirement on gfx90a variants.
   2362   return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
   2363                                 : &AMDGPU::VReg_64RegClass;
   2364 }
   2365 
   2366 const TargetRegisterClass *
   2367 SIRegisterInfo::getRegClass(unsigned RCID) const {
   2368   switch ((int)RCID) {
   2369   case AMDGPU::SReg_1RegClassID:
   2370     return getBoolRC();
   2371   case AMDGPU::SReg_1_XEXECRegClassID:
   2372     return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
   2373       : &AMDGPU::SReg_64_XEXECRegClass;
   2374   case -1:
   2375     return nullptr;
   2376   default:
   2377     return AMDGPUGenRegisterInfo::getRegClass(RCID);
   2378   }
   2379 }
   2380 
   2381 // Find reaching register definition
   2382 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
   2383                                               MachineInstr &Use,
   2384                                               MachineRegisterInfo &MRI,
   2385                                               LiveIntervals *LIS) const {
   2386   auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
   2387   SlotIndex UseIdx = LIS->getInstructionIndex(Use);
   2388   SlotIndex DefIdx;
   2389 
   2390   if (Reg.isVirtual()) {
   2391     if (!LIS->hasInterval(Reg))
   2392       return nullptr;
   2393     LiveInterval &LI = LIS->getInterval(Reg);
   2394     LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
   2395                                   : MRI.getMaxLaneMaskForVReg(Reg);
   2396     VNInfo *V = nullptr;
   2397     if (LI.hasSubRanges()) {
   2398       for (auto &S : LI.subranges()) {
   2399         if ((S.LaneMask & SubLanes) == SubLanes) {
   2400           V = S.getVNInfoAt(UseIdx);
   2401           break;
   2402         }
   2403       }
   2404     } else {
   2405       V = LI.getVNInfoAt(UseIdx);
   2406     }
   2407     if (!V)
   2408       return nullptr;
   2409     DefIdx = V->def;
   2410   } else {
   2411     // Find last def.
   2412     for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid();
   2413          ++Units) {
   2414       LiveRange &LR = LIS->getRegUnit(*Units);
   2415       if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
   2416         if (!DefIdx.isValid() ||
   2417             MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
   2418                           LIS->getInstructionFromIndex(V->def)))
   2419           DefIdx = V->def;
   2420       } else {
   2421         return nullptr;
   2422       }
   2423     }
   2424   }
   2425 
   2426   MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
   2427 
   2428   if (!Def || !MDT.dominates(Def, &Use))
   2429     return nullptr;
   2430 
   2431   assert(Def->modifiesRegister(Reg, this));
   2432 
   2433   return Def;
   2434 }
   2435 
   2436 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
   2437   assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32);
   2438 
   2439   for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
   2440                                          AMDGPU::SReg_32RegClass,
   2441                                          AMDGPU::AGPR_32RegClass } ) {
   2442     if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
   2443       return Super;
   2444   }
   2445   if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
   2446                                             &AMDGPU::VGPR_32RegClass)) {
   2447       return Super;
   2448   }
   2449 
   2450   return AMDGPU::NoRegister;
   2451 }
   2452 
   2453 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
   2454   if (!ST.needsAlignedVGPRs())
   2455     return true;
   2456 
   2457   if (hasVGPRs(&RC))
   2458     return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
   2459   if (hasAGPRs(&RC))
   2460     return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
   2461 
   2462   return true;
   2463 }
   2464 
   2465 bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
   2466   switch (PhysReg) {
   2467   case AMDGPU::SGPR_NULL:
   2468   case AMDGPU::SRC_SHARED_BASE:
   2469   case AMDGPU::SRC_PRIVATE_BASE:
   2470   case AMDGPU::SRC_SHARED_LIMIT:
   2471   case AMDGPU::SRC_PRIVATE_LIMIT:
   2472     return true;
   2473   default:
   2474     return false;
   2475   }
   2476 }
   2477 
   2478 ArrayRef<MCPhysReg>
   2479 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
   2480   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
   2481                       ST.getMaxNumSGPRs(MF) / 4);
   2482 }
   2483 
   2484 ArrayRef<MCPhysReg>
   2485 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
   2486   return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(),
   2487                       ST.getMaxNumSGPRs(MF) / 2);
   2488 }
   2489 
   2490 ArrayRef<MCPhysReg>
   2491 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
   2492   return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
   2493 }
   2494