Home | History | Annotate | Line # | Download | only in AMDGPU
      1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 /// \file
      9 /// This file implements the targeting of the RegisterBankInfo class for
     10 /// AMDGPU.
     11 ///
     12 /// \par
     13 ///
     14 /// AMDGPU has unique register bank constraints that require special high level
     15 /// strategies to deal with. There are two main true physical register banks
     16 /// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
     17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
     18 /// boolean context. There is also the AGPR bank, which is a special purpose
     19 /// physical register bank present on some subtargets.
     20 ///
     21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
     22 /// be uniform. It is generally not valid to legalize operands by inserting
     23 /// copies as on other targets. Operations which require uniform, SGPR operands
     24 /// generally require scalarization by repeatedly executing the instruction,
     25 /// activating each set of lanes using a unique set of input values. This is
     26 /// referred to as a waterfall loop.
     27 ///
     28 /// \par Booleans
     29 ///
     30 /// Booleans (s1 values) require special consideration. A vector compare result
     31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
     32 /// register. These are represented with the VCC bank. During selection, we need
     33 /// to be able to unambiguously go back from a register class to a register
     34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
     35 /// bank, we need to know the use context type. An SGPR s1 value always means a
     36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
     37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
     38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
     39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
     40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
     41 ///
     42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
     43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
     44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
     45 /// memory) will require a copy to the VCC bank which will require clearing the
     46 /// high bits and inserting a compare.
     47 ///
     48 /// \par Constant bus restriction
     49 ///
     50 /// VALU instructions have a limitation known as the constant bus
     51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
     52 /// most 1 SGPR or constant literal value (this rises to 2 in gfx10 for most
     53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
     54 /// multiple operands. From a register bank perspective, any combination of
     55 /// operands should be legal as an SGPR, but this is contextually dependent on
     56 /// the SGPR operands all being the same register. It is therefore optimal to
     57 /// choose the SGPR with the most uses to minimize the number of copies.
     58 ///
     59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
     60 /// operation should have its source operands all mapped to VGPRs (except for
     61 /// VCC), inserting copies from any SGPR operands. This is the most trivial legal
     62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
     63 /// complicated to solve here. Every optimization pattern or instruction
     64 /// selected to multiple outputs would have to enforce this rule, and there
     65 /// would be additional complexity in tracking this rule for every G_*
     66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
     67 /// picking the optimal operand combination from a post-isel optimization pass.
     68 ///
     69 //===----------------------------------------------------------------------===//
     70 
     71 #include "AMDGPURegisterBankInfo.h"
     72 
     73 #include "AMDGPU.h"
     74 #include "AMDGPUGlobalISelUtils.h"
     75 #include "AMDGPUInstrInfo.h"
     76 #include "GCNSubtarget.h"
     77 #include "SIMachineFunctionInfo.h"
     78 #include "SIRegisterInfo.h"
     79 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
     80 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
     81 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
     82 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
     83 #include "llvm/IR/IntrinsicsAMDGPU.h"
     84 
     85 #define GET_TARGET_REGBANK_IMPL
     86 #include "AMDGPUGenRegisterBank.inc"
     87 
     88 // This file will be TableGen'ed at some point.
     89 #include "AMDGPUGenRegisterBankInfo.def"
     90 
     91 using namespace llvm;
     92 using namespace MIPatternMatch;
     93 
     94 namespace {
     95 
     96 // Observer to apply a register bank to new registers created by LegalizerHelper.
     97 class ApplyRegBankMapping final : public GISelChangeObserver {
     98 private:
     99   const AMDGPURegisterBankInfo &RBI;
    100   MachineRegisterInfo &MRI;
    101   const RegisterBank *NewBank;
    102   SmallVector<MachineInstr *, 4> NewInsts;
    103 
    104 public:
    105   ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
    106                       MachineRegisterInfo &MRI_, const RegisterBank *RB)
    107     : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
    108 
    109   ~ApplyRegBankMapping() {
    110     for (MachineInstr *MI : NewInsts)
    111       applyBank(*MI);
    112   }
    113 
    114   /// Set any registers that don't have a set register class or bank to SALU.
    115   void applyBank(MachineInstr &MI) {
    116     const unsigned Opc = MI.getOpcode();
    117     if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
    118         Opc == AMDGPU::G_SEXT) {
    119       // LegalizerHelper wants to use the basic legalization artifacts when
    120       // widening etc. We don't handle selection with vcc in artifact sources,
    121       // so we need to use a sslect instead to handle these properly.
    122       Register DstReg = MI.getOperand(0).getReg();
    123       Register SrcReg = MI.getOperand(1).getReg();
    124       const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
    125       if (SrcBank == &AMDGPU::VCCRegBank) {
    126         const LLT S32 = LLT::scalar(32);
    127         assert(MRI.getType(SrcReg) == LLT::scalar(1));
    128         assert(MRI.getType(DstReg) == S32);
    129         assert(NewBank == &AMDGPU::VGPRRegBank);
    130 
    131         // Replace the extension with a select, which really uses the boolean
    132         // source.
    133         MachineIRBuilder B(MI);
    134         auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
    135         auto False = B.buildConstant(S32, 0);
    136         B.buildSelect(DstReg, SrcReg, True, False);
    137         MRI.setRegBank(True.getReg(0), *NewBank);
    138         MRI.setRegBank(False.getReg(0), *NewBank);
    139         MI.eraseFromParent();
    140       }
    141 
    142       assert(!MRI.getRegClassOrRegBank(DstReg));
    143       MRI.setRegBank(DstReg, *NewBank);
    144       return;
    145     }
    146 
    147 #ifndef NDEBUG
    148     if (Opc == AMDGPU::G_TRUNC) {
    149       Register DstReg = MI.getOperand(0).getReg();
    150       const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
    151       assert(DstBank != &AMDGPU::VCCRegBank);
    152     }
    153 #endif
    154 
    155     for (MachineOperand &Op : MI.operands()) {
    156       if (!Op.isReg())
    157         continue;
    158 
    159       // We may see physical registers if building a real MI
    160       Register Reg = Op.getReg();
    161       if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
    162         continue;
    163 
    164       const RegisterBank *RB = NewBank;
    165       if (MRI.getType(Reg) == LLT::scalar(1)) {
    166         assert(NewBank == &AMDGPU::VGPRRegBank &&
    167                "s1 operands should only be used for vector bools");
    168         assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
    169                 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
    170                "not expecting legalization artifacts here");
    171         RB = &AMDGPU::VCCRegBank;
    172       }
    173 
    174       MRI.setRegBank(Reg, *RB);
    175     }
    176   }
    177 
    178   void erasingInstr(MachineInstr &MI) override {}
    179 
    180   void createdInstr(MachineInstr &MI) override {
    181     // At this point, the instruction was just inserted and has no operands.
    182     NewInsts.push_back(&MI);
    183   }
    184 
    185   void changingInstr(MachineInstr &MI) override {}
    186   void changedInstr(MachineInstr &MI) override {
    187     // FIXME: In principle we should probably add the instruction to NewInsts,
    188     // but the way the LegalizerHelper uses the observer, we will always see the
    189     // registers we need to set the regbank on also referenced in a new
    190     // instruction.
    191   }
    192 };
    193 
    194 }
    195 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    196     : AMDGPUGenRegisterBankInfo(),
    197       Subtarget(ST),
    198       TRI(Subtarget.getRegisterInfo()),
    199       TII(Subtarget.getInstrInfo()) {
    200 
    201   // HACK: Until this is fully tablegen'd.
    202   static llvm::once_flag InitializeRegisterBankFlag;
    203 
    204   static auto InitializeRegisterBankOnce = [this]() {
    205     assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
    206            &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
    207            &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    208     (void)this;
    209   };
    210 
    211   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
    212 }
    213 
    214 static bool isVectorRegisterBank(const RegisterBank &Bank) {
    215   unsigned BankID = Bank.getID();
    216   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
    217 }
    218 
    219 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
    220                                           const RegisterBank &Src,
    221                                           unsigned Size) const {
    222   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
    223   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
    224       (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    225     return std::numeric_limits<unsigned>::max();
    226   }
    227 
    228   // Bool values are tricky, because the meaning is based on context. The SCC
    229   // and VCC banks are for the natural scalar and vector conditions produced by
    230   // a compare.
    231   //
    232   // Legalization doesn't know about the necessary context, so an s1 use may
    233   // have been a truncate from an arbitrary value, in which case a copy (lowered
    234   // as a compare with 0) needs to be inserted.
    235   if (Size == 1 &&
    236       (Dst.getID() == AMDGPU::SGPRRegBankID) &&
    237       (isVectorRegisterBank(Src) ||
    238        Src.getID() == AMDGPU::SGPRRegBankID ||
    239        Src.getID() == AMDGPU::VCCRegBankID))
    240     return std::numeric_limits<unsigned>::max();
    241 
    242   // There is no direct copy between AGPRs.
    243   if (Dst.getID() == AMDGPU::AGPRRegBankID &&
    244       Src.getID() == AMDGPU::AGPRRegBankID)
    245     return 4;
    246 
    247   return RegisterBankInfo::copyCost(Dst, Src, Size);
    248 }
    249 
    250 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
    251   const ValueMapping &ValMapping,
    252   const RegisterBank *CurBank) const {
    253   // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
    254   // VGPR.
    255   // FIXME: Is there a better way to do this?
    256   if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    257     return 10; // This is expensive.
    258 
    259   assert(ValMapping.NumBreakDowns == 2 &&
    260          ValMapping.BreakDown[0].Length == 32 &&
    261          ValMapping.BreakDown[0].StartIdx == 0 &&
    262          ValMapping.BreakDown[1].Length == 32 &&
    263          ValMapping.BreakDown[1].StartIdx == 32 &&
    264          ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
    265 
    266   // 32-bit extract of a 64-bit value is just access of a subregister, so free.
    267   // TODO: Cost of 0 hits assert, though it's not clear it's what we really
    268   // want.
    269 
    270   // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
    271   // alignment restrictions, but this probably isn't important.
    272   return 1;
    273 }
    274 
    275 const RegisterBank &
    276 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
    277                                                LLT Ty) const {
    278   if (&RC == &AMDGPU::SReg_1RegClass)
    279     return AMDGPU::VCCRegBank;
    280 
    281   // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
    282   // VCC-like use.
    283   if (TRI->isSGPRClass(&RC)) {
    284     // FIXME: This probably came from a copy from a physical register, which
    285     // should be inferrrable from the copied to-type. We don't have many boolean
    286     // physical register constraints so just assume a normal SGPR for now.
    287     if (!Ty.isValid())
    288       return AMDGPU::SGPRRegBank;
    289 
    290     return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
    291   }
    292 
    293   return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
    294 }
    295 
    296 template <unsigned NumOps>
    297 RegisterBankInfo::InstructionMappings
    298 AMDGPURegisterBankInfo::addMappingFromTable(
    299     const MachineInstr &MI, const MachineRegisterInfo &MRI,
    300     const std::array<unsigned, NumOps> RegSrcOpIdx,
    301     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
    302 
    303   InstructionMappings AltMappings;
    304 
    305   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
    306 
    307   unsigned Sizes[NumOps];
    308   for (unsigned I = 0; I < NumOps; ++I) {
    309     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    310     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
    311   }
    312 
    313   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    314     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    315     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
    316   }
    317 
    318   // getInstrMapping's default mapping uses ID 1, so start at 2.
    319   unsigned MappingID = 2;
    320   for (const auto &Entry : Table) {
    321     for (unsigned I = 0; I < NumOps; ++I) {
    322       int OpIdx = RegSrcOpIdx[I];
    323       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    324     }
    325 
    326     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
    327                                                  getOperandsMapping(Operands),
    328                                                  Operands.size()));
    329   }
    330 
    331   return AltMappings;
    332 }
    333 
    334 RegisterBankInfo::InstructionMappings
    335 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    336     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
    337   switch (MI.getIntrinsicID()) {
    338   case Intrinsic::amdgcn_readlane: {
    339     static const OpRegBankEntry<3> Table[2] = {
    340       // Perfectly legal.
    341       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
    342 
    343       // Need a readfirstlane for the index.
    344       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    345     };
    346 
    347     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    348     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
    349   }
    350   case Intrinsic::amdgcn_writelane: {
    351     static const OpRegBankEntry<4> Table[4] = {
    352       // Perfectly legal.
    353       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
    354 
    355       // Need readfirstlane of first op
    356       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
    357 
    358       // Need readfirstlane of second op
    359       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
    360 
    361       // Need readfirstlane of both ops
    362       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    363     };
    364 
    365     // rsrc, voffset, offset
    366     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    367     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
    368   }
    369   default:
    370     return RegisterBankInfo::getInstrAlternativeMappings(MI);
    371   }
    372 }
    373 
    374 RegisterBankInfo::InstructionMappings
    375 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    376     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
    377 
    378   switch (MI.getIntrinsicID()) {
    379   case Intrinsic::amdgcn_s_buffer_load: {
    380     static const OpRegBankEntry<2> Table[4] = {
    381       // Perfectly legal.
    382       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
    383 
    384       // Only need 1 register in loop
    385       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
    386 
    387       // Have to waterfall the resource.
    388       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
    389 
    390       // Have to waterfall the resource, and the offset.
    391       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    392     };
    393 
    394     // rsrc, offset
    395     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    396     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
    397   }
    398   case Intrinsic::amdgcn_ds_ordered_add:
    399   case Intrinsic::amdgcn_ds_ordered_swap: {
    400     // VGPR = M0, VGPR
    401     static const OpRegBankEntry<3> Table[2] = {
    402       // Perfectly legal.
    403       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
    404 
    405       // Need a readfirstlane for m0
    406       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    407     };
    408 
    409     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    410     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
    411   }
    412   case Intrinsic::amdgcn_s_sendmsg:
    413   case Intrinsic::amdgcn_s_sendmsghalt: {
    414     // FIXME: Should have no register for immediate
    415     static const OpRegBankEntry<1> Table[2] = {
    416       // Perfectly legal.
    417       { { AMDGPU::SGPRRegBankID }, 1 },
    418 
    419       // Need readlane
    420       { { AMDGPU::VGPRRegBankID }, 3 }
    421     };
    422 
    423     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    424     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
    425   }
    426   default:
    427     return RegisterBankInfo::getInstrAlternativeMappings(MI);
    428   }
    429 }
    430 
    431 static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
    432   const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
    433   return I && I->getMetadata("amdgpu.noclobber");
    434 }
    435 
    436 // FIXME: Returns uniform if there's no source value information. This is
    437 // probably wrong.
    438 static bool isScalarLoadLegal(const MachineInstr &MI) {
    439   if (!MI.hasOneMemOperand())
    440     return false;
    441 
    442   const MachineMemOperand *MMO = *MI.memoperands_begin();
    443   const unsigned AS = MMO->getAddrSpace();
    444   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
    445                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
    446   // Require 4-byte alignment.
    447   return MMO->getAlign() >= Align(4) &&
    448          // Can't do a scalar atomic load.
    449          !MMO->isAtomic() &&
    450          // Don't use scalar loads for volatile accesses to non-constant address
    451          // spaces.
    452          (IsConst || !MMO->isVolatile()) &&
    453          // Memory must be known constant, or not written before this load.
    454          (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
    455          AMDGPUInstrInfo::isUniformMMO(MMO);
    456 }
    457 
/// Enumerate the alternative register bank mappings for \p MI.
///
/// Constants, 1-bit and 64-bit bitwise ops, loads, selects, carry-in add/sub,
/// G_BRCOND and the two intrinsic opcodes get target-specific alternatives.
/// Anything that leaves the switch with `break` (unhandled opcodes and bitwise
/// ops of other sizes) returns the generic RegisterBankInfo alternatives.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();


  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    // A 1-bit constant may additionally live in the VCC bank.
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    // Wider constants share the VGPR/SGPR table below.
    LLVM_FALLTHROUGH;
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      // The scalar alternative therefore maps all three operands as 32-bit
      // SGPR values.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      // The vector alternative keeps all three operands as 1-bit VCC values.
      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    // Only 64-bit ops get the alternatives below; other sizes use the generic
    // result.
    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    // NOTE(review): this `break` leaves the switch, so the function returns
    // the generic alternatives and the two mappings pushed into AltMappings
    // above are not returned. Confirm whether `return AltMappings;` was
    // intended.
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    // The all-SGPR alternative is only offered outside the local, region and
    // private address spaces, and only when isScalarLoadLegal() accepts MI.
    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    // The all-VGPR alternative is always offered.
    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1,
        getOperandsMapping(
            {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
             AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older.  However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;

  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    // Scalar form: result, 1-bit condition and both inputs in SGPRs.
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    // Vector form: VCC condition, result and both inputs in VGPRs.
    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    // Scalar form: every operand, including both 1-bit operands, in SGPRs.
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    // Vector form: VGPR values with both 1-bit operands in the VCC bank.
    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    // NOTE(review): this alternative reuses mapping ID 1, whereas every other
    // case in this function numbers its second alternative 2. Confirm whether
    // the duplicate ID is intentional.
    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
    644 
    645 void AMDGPURegisterBankInfo::split64BitValueForMapping(
    646   MachineIRBuilder &B,
    647   SmallVector<Register, 2> &Regs,
    648   LLT HalfTy,
    649   Register Reg) const {
    650   assert(HalfTy.getSizeInBits() == 32);
    651   MachineRegisterInfo *MRI = B.getMRI();
    652   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
    653   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
    654   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
    655   MRI->setRegBank(LoLHS, *Bank);
    656   MRI->setRegBank(HiLHS, *Bank);
    657 
    658   Regs.push_back(LoLHS);
    659   Regs.push_back(HiLHS);
    660 
    661   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    662     .addDef(LoLHS)
    663     .addDef(HiLHS)
    664     .addUse(Reg);
    665 }
    666 
    667 /// Replace the current type each register in \p Regs has with \p NewTy
    668 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
    669                           LLT NewTy) {
    670   for (Register Reg : Regs) {
    671     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    672     MRI.setType(Reg, NewTy);
    673   }
    674 }
    675 
    676 static LLT getHalfSizedType(LLT Ty) {
    677   if (Ty.isVector()) {
    678     assert(Ty.getNumElements() % 2 == 0);
    679     return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
    680   }
    681 
    682   assert(Ty.getSizeInBits() % 2 == 0);
    683   return LLT::scalar(Ty.getSizeInBits() / 2);
    684 }
    685 
    686 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
    687 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
    688 /// execute the instruction for each unique combination of values in all lanes
    689 /// in the wave. The block will be split such that rest of the instructions are
    690 /// moved to a new block.
    691 ///
    692 /// Essentially performs this loop:
    693 //
    694 /// Save Execution Mask
    695 /// For (Lane : Wavefront) {
    696 ///   Enable Lane, Disable all other lanes
    697 ///   SGPR = read SGPR value for current lane from VGPR
    698 ///   VGPRResult[Lane] = use_op SGPR
    699 /// }
    700 /// Restore Execution Mask
    701 ///
    702 /// There is additional complexity to try for compare values to identify the
    703 /// unique values used.
    704 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    705   MachineIRBuilder &B,
    706   iterator_range<MachineBasicBlock::iterator> Range,
    707   SmallSet<Register, 4> &SGPROperandRegs,
    708   MachineRegisterInfo &MRI) const {
    709   SmallVector<Register, 4> ResultRegs;
    710   SmallVector<Register, 4> InitResultRegs;
    711   SmallVector<Register, 4> PhiRegs;
    712 
    713   // Track use registers which have already been expanded with a readfirstlane
    714   // sequence. This may have multiple uses if moving a sequence.
    715   DenseMap<Register, Register> WaterfalledRegMap;
    716 
    717   MachineBasicBlock &MBB = B.getMBB();
    718   MachineFunction *MF = &B.getMF();
    719 
    720   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
    721   const unsigned WaveAndOpc = Subtarget.isWave32() ?
    722     AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
    723   const unsigned MovTermOpc = Subtarget.isWave32() ?
    724     AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
    725   const unsigned XorTermOpc = Subtarget.isWave32() ?
    726     AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
    727   const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
    728     AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
    729   const unsigned ExecReg =  Subtarget.isWave32() ?
    730     AMDGPU::EXEC_LO : AMDGPU::EXEC;
    731 
    732 #ifndef NDEBUG
    733   const int OrigRangeSize = std::distance(Range.begin(), Range.end());
    734 #endif
    735 
    736   for (MachineInstr &MI : Range) {
    737     for (MachineOperand &Def : MI.defs()) {
    738       if (MRI.use_nodbg_empty(Def.getReg()))
    739         continue;
    740 
    741       LLT ResTy = MRI.getType(Def.getReg());
    742       const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
    743       ResultRegs.push_back(Def.getReg());
    744       Register InitReg = B.buildUndef(ResTy).getReg(0);
    745       Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
    746       InitResultRegs.push_back(InitReg);
    747       PhiRegs.push_back(PhiReg);
    748       MRI.setRegBank(PhiReg, *DefBank);
    749       MRI.setRegBank(InitReg, *DefBank);
    750     }
    751   }
    752 
    753   Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
    754   Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
    755 
    756   // Don't bother using generic instructions/registers for the exec mask.
    757   B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    758     .addDef(InitSaveExecReg);
    759 
    760   Register PhiExec = MRI.createVirtualRegister(WaveRC);
    761   Register NewExec = MRI.createVirtualRegister(WaveRC);
    762 
    763   // To insert the loop we need to split the block. Move everything before this
    764   // point to a new block, and insert a new empty block before this instruction.
    765   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
    766   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
    767   MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
    768   MachineFunction::iterator MBBI(MBB);
    769   ++MBBI;
    770   MF->insert(MBBI, LoopBB);
    771   MF->insert(MBBI, RestoreExecBB);
    772   MF->insert(MBBI, RemainderBB);
    773 
    774   LoopBB->addSuccessor(RestoreExecBB);
    775   LoopBB->addSuccessor(LoopBB);
    776 
    777   // Move the rest of the block into a new block.
    778   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
    779   RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
    780 
    781   MBB.addSuccessor(LoopBB);
    782   RestoreExecBB->addSuccessor(RemainderBB);
    783 
    784   B.setInsertPt(*LoopBB, LoopBB->end());
    785 
    786   B.buildInstr(TargetOpcode::PHI)
    787     .addDef(PhiExec)
    788     .addReg(InitSaveExecReg)
    789     .addMBB(&MBB)
    790     .addReg(NewExec)
    791     .addMBB(LoopBB);
    792 
    793   for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    794     B.buildInstr(TargetOpcode::G_PHI)
    795       .addDef(std::get<2>(Result))
    796       .addReg(std::get<0>(Result)) // Initial value / implicit_def
    797       .addMBB(&MBB)
    798       .addReg(std::get<1>(Result)) // Mid-loop value.
    799       .addMBB(LoopBB);
    800   }
    801 
    802   const DebugLoc &DL = B.getDL();
    803 
    804   MachineInstr &FirstInst = *Range.begin();
    805 
    806   // Move the instruction into the loop. Note we moved everything after
    807   // Range.end() already into a new block, so Range.end() is no longer valid.
    808   LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
    809 
    810   // Figure out the iterator range after splicing the instructions.
    811   MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
    812   auto NewEnd = LoopBB->end();
    813 
    814   MachineBasicBlock::iterator I = Range.begin();
    815   B.setInsertPt(*LoopBB, I);
    816 
    817   Register CondReg;
    818 
    819   assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
    820 
    821   for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    822     for (MachineOperand &Op : MI.uses()) {
    823       if (!Op.isReg() || Op.isDef())
    824         continue;
    825 
    826       Register OldReg = Op.getReg();
    827       if (!SGPROperandRegs.count(OldReg))
    828         continue;
    829 
    830       // See if we already processed this register in another instruction in the
    831       // sequence.
    832       auto OldVal = WaterfalledRegMap.find(OldReg);
    833       if (OldVal != WaterfalledRegMap.end()) {
    834         Op.setReg(OldVal->second);
    835         continue;
    836       }
    837 
    838       Register OpReg = Op.getReg();
    839       LLT OpTy = MRI.getType(OpReg);
    840 
    841       const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
    842       if (OpBank != &AMDGPU::VGPRRegBank) {
    843         // Insert copy from AGPR to VGPR before the loop.
    844         B.setMBB(MBB);
    845         OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
    846         MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
    847         B.setInstr(*I);
    848       }
    849 
    850       unsigned OpSize = OpTy.getSizeInBits();
    851 
    852       // Can only do a readlane of 32-bit pieces.
    853       if (OpSize == 32) {
    854         // Avoid extra copies in the simple case of one 32-bit register.
    855         Register CurrentLaneOpReg
    856           = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    857         MRI.setType(CurrentLaneOpReg, OpTy);
    858 
    859         constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
    860         // Read the next variant <- also loop target.
    861         BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
    862                 CurrentLaneOpReg)
    863           .addReg(OpReg);
    864 
    865         Register NewCondReg = MRI.createVirtualRegister(WaveRC);
    866         bool First = CondReg == AMDGPU::NoRegister;
    867         if (First)
    868           CondReg = NewCondReg;
    869 
    870         // Compare the just read M0 value to all possible Idx values.
    871         B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
    872           .addDef(NewCondReg)
    873           .addReg(CurrentLaneOpReg)
    874           .addReg(OpReg);
    875         Op.setReg(CurrentLaneOpReg);
    876 
    877         if (!First) {
    878           Register AndReg = MRI.createVirtualRegister(WaveRC);
    879 
    880           // If there are multiple operands to consider, and the conditions.
    881           B.buildInstr(WaveAndOpc)
    882             .addDef(AndReg)
    883             .addReg(NewCondReg)
    884             .addReg(CondReg);
    885           CondReg = AndReg;
    886         }
    887       } else {
    888         LLT S32 = LLT::scalar(32);
    889         SmallVector<Register, 8> ReadlanePieces;
    890 
    891         // The compares can be done as 64-bit, but the extract needs to be done
    892         // in 32-bit pieces.
    893 
    894         bool Is64 = OpSize % 64 == 0;
    895 
    896         LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
    897         unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
    898           : AMDGPU::V_CMP_EQ_U32_e64;
    899 
    900         // The compares can be done as 64-bit, but the extract needs to be done
    901         // in 32-bit pieces.
    902 
    903         // Insert the unmerge before the loop.
    904 
    905         B.setMBB(MBB);
    906         auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
    907         B.setInstr(*I);
    908 
    909         unsigned NumPieces = Unmerge->getNumOperands() - 1;
    910         for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
    911           Register UnmergePiece = Unmerge.getReg(PieceIdx);
    912 
    913           Register CurrentLaneOpReg;
    914           if (Is64) {
    915             Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
    916             Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
    917 
    918             MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
    919             MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
    920             MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
    921 
    922             // Read the next variant <- also loop target.
    923             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
    924                     CurrentLaneOpRegLo)
    925               .addReg(UnmergePiece, 0, AMDGPU::sub0);
    926 
    927             // Read the next variant <- also loop target.
    928             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
    929                     CurrentLaneOpRegHi)
    930               .addReg(UnmergePiece, 0, AMDGPU::sub1);
    931 
    932             CurrentLaneOpReg =
    933               B.buildMerge(LLT::scalar(64),
    934                            {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
    935               .getReg(0);
    936 
    937             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
    938 
    939             if (OpTy.getScalarSizeInBits() == 64) {
    940               // If we need to produce a 64-bit element vector, so use the
    941               // merged pieces
    942               ReadlanePieces.push_back(CurrentLaneOpReg);
    943             } else {
    944               // 32-bit element type.
    945               ReadlanePieces.push_back(CurrentLaneOpRegLo);
    946               ReadlanePieces.push_back(CurrentLaneOpRegHi);
    947             }
    948           } else {
    949             CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
    950             MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
    951             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
    952 
    953             // Read the next variant <- also loop target.
    954             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
    955                     CurrentLaneOpReg)
    956               .addReg(UnmergePiece);
    957             ReadlanePieces.push_back(CurrentLaneOpReg);
    958           }
    959 
    960           Register NewCondReg = MRI.createVirtualRegister(WaveRC);
    961           bool First = CondReg == AMDGPU::NoRegister;
    962           if (First)
    963             CondReg = NewCondReg;
    964 
    965           B.buildInstr(CmpOp)
    966             .addDef(NewCondReg)
    967             .addReg(CurrentLaneOpReg)
    968             .addReg(UnmergePiece);
    969 
    970           if (!First) {
    971             Register AndReg = MRI.createVirtualRegister(WaveRC);
    972 
    973             // If there are multiple operands to consider, and the conditions.
    974             B.buildInstr(WaveAndOpc)
    975               .addDef(AndReg)
    976               .addReg(NewCondReg)
    977               .addReg(CondReg);
    978             CondReg = AndReg;
    979           }
    980         }
    981 
    982         // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
    983         // BUILD_VECTOR
    984         if (OpTy.isVector()) {
    985           auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
    986           Op.setReg(Merge.getReg(0));
    987         } else {
    988           auto Merge = B.buildMerge(OpTy, ReadlanePieces);
    989           Op.setReg(Merge.getReg(0));
    990         }
    991 
    992         MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
    993       }
    994 
    995       // Make sure we don't re-process this register again.
    996       WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
    997     }
    998   }
    999 
   1000   B.setInsertPt(*LoopBB, LoopBB->end());
   1001 
   1002   // Update EXEC, save the original EXEC value to VCC.
   1003   B.buildInstr(AndSaveExecOpc)
   1004     .addDef(NewExec)
   1005     .addReg(CondReg, RegState::Kill);
   1006 
   1007   MRI.setSimpleHint(NewExec, CondReg);
   1008 
   1009   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
   1010   B.buildInstr(XorTermOpc)
   1011     .addDef(ExecReg)
   1012     .addReg(ExecReg)
   1013     .addReg(NewExec);
   1014 
   1015   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
   1016   // s_cbranch_scc0?
   1017 
   1018   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
   1019   B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
   1020     .addMBB(LoopBB);
   1021 
   1022   // Save the EXEC mask before the loop.
   1023   BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
   1024     .addReg(ExecReg);
   1025 
   1026   // Restore the EXEC mask after the loop.
   1027   B.setMBB(*RestoreExecBB);
   1028   B.buildInstr(MovTermOpc)
   1029     .addDef(ExecReg)
   1030     .addReg(SaveExecReg);
   1031 
   1032   // Set the insert point after the original instruction, so any new
   1033   // instructions will be in the remainder.
   1034   B.setInsertPt(*RemainderBB, RemainderBB->begin());
   1035 
   1036   return true;
   1037 }
   1038 
   1039 // Return any unique registers used by \p MI at \p OpIndices that need to be
   1040 // handled in a waterfall loop. Returns these registers in \p
   1041 // SGPROperandRegs. Returns true if there are any operands to handle and a
   1042 // waterfall loop is necessary.
   1043 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
   1044   SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
   1045   MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
   1046   for (unsigned Op : OpIndices) {
   1047     assert(MI.getOperand(Op).isUse());
   1048     Register Reg = MI.getOperand(Op).getReg();
   1049     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
   1050     if (OpBank->getID() != AMDGPU::SGPRRegBankID)
   1051       SGPROperandRegs.insert(Reg);
   1052   }
   1053 
   1054   // No operands need to be replaced, so no need to loop.
   1055   return !SGPROperandRegs.empty();
   1056 }
   1057 
   1058 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
   1059   MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
   1060   ArrayRef<unsigned> OpIndices) const {
   1061   // Use a set to avoid extra readfirstlanes in the case where multiple operands
   1062   // are the same register.
   1063   SmallSet<Register, 4> SGPROperandRegs;
   1064 
   1065   if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
   1066     return false;
   1067 
   1068   MachineBasicBlock::iterator I = MI.getIterator();
   1069   return executeInWaterfallLoop(B, make_range(I, std::next(I)),
   1070                                 SGPROperandRegs, MRI);
   1071 }
   1072 
   1073 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
   1074   MachineInstr &MI, MachineRegisterInfo &MRI,
   1075   ArrayRef<unsigned> OpIndices) const {
   1076   MachineIRBuilder B(MI);
   1077   return executeInWaterfallLoop(B, MI, MRI, OpIndices);
   1078 }
   1079 
   1080 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
   1081 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
   1082     MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
   1083   Register Reg = MI.getOperand(OpIdx).getReg();
   1084   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
   1085   if (Bank == &AMDGPU::SGPRRegBank)
   1086     return;
   1087 
   1088   LLT Ty = MRI.getType(Reg);
   1089   MachineIRBuilder B(MI);
   1090 
   1091   if (Bank != &AMDGPU::VGPRRegBank) {
   1092     // We need to copy from AGPR to VGPR
   1093     Reg = B.buildCopy(Ty, Reg).getReg(0);
   1094     MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
   1095   }
   1096 
   1097   Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
   1098   B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
   1099     .addDef(SGPR)
   1100     .addReg(Reg);
   1101 
   1102   MRI.setType(SGPR, Ty);
   1103 
   1104   const TargetRegisterClass *Constrained =
   1105       constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
   1106   (void)Constrained;
   1107   assert(Constrained && "Failed to constrain readfirstlane src reg");
   1108 
   1109   MI.getOperand(OpIdx).setReg(SGPR);
   1110 }
   1111 
   1112 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
   1113 /// rest will be in the remainder.
   1114 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
   1115   unsigned TotalSize = Ty.getSizeInBits();
   1116   if (!Ty.isVector())
   1117     return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
   1118 
   1119   LLT EltTy = Ty.getElementType();
   1120   unsigned EltSize = EltTy.getSizeInBits();
   1121   assert(FirstSize % EltSize == 0);
   1122 
   1123   unsigned FirstPartNumElts = FirstSize / EltSize;
   1124   unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
   1125 
   1126   return {LLT::scalarOrVector(FirstPartNumElts, EltTy),
   1127           LLT::scalarOrVector(RemainderElts, EltTy)};
   1128 }
   1129 
   1130 static LLT widen96To128(LLT Ty) {
   1131   if (!Ty.isVector())
   1132     return LLT::scalar(128);
   1133 
   1134   LLT EltTy = Ty.getElementType();
   1135   assert(128 % EltTy.getSizeInBits() == 0);
   1136   return LLT::vector(128 / EltTy.getSizeInBits(), EltTy);
   1137 }
   1138 
   1139 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
   1140                         const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
   1141                                               MachineRegisterInfo &MRI) const {
   1142   Register DstReg = MI.getOperand(0).getReg();
   1143   const LLT LoadTy = MRI.getType(DstReg);
   1144   unsigned LoadSize = LoadTy.getSizeInBits();
   1145   const unsigned MaxNonSmrdLoadSize = 128;
   1146 
   1147   const RegisterBank *DstBank =
   1148       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   1149   if (DstBank == &AMDGPU::SGPRRegBank) {
   1150     // There are some special cases that we need to look at for 32 bit and 96
   1151     // bit SGPR loads otherwise we have nothing to do.
   1152     if (LoadSize != 32 && LoadSize != 96)
   1153       return false;
   1154 
   1155     MachineMemOperand *MMO = *MI.memoperands_begin();
   1156     const unsigned MemSize = 8 * MMO->getSize();
   1157     // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
   1158     // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
   1159     // scalar loads should have a load size of 32 but memory access size of less
   1160     // than 32.
   1161     if (LoadSize == 32 &&
   1162         (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
   1163       return false;
   1164 
   1165     Register PtrReg = MI.getOperand(1).getReg();
   1166 
   1167     ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
   1168     MachineIRBuilder B(MI, O);
   1169 
   1170     if (LoadSize == 32) {
   1171       // This is an extending load from a sub-dword size. Widen the memory
   1172       // access size to 4 bytes and clear the extra high bits appropriately
   1173       const LLT S32 = LLT::scalar(32);
   1174       if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
   1175         // Must extend the sign bit into higher bits for a G_SEXTLOAD
   1176         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
   1177         B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
   1178       } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
   1179         // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
   1180         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
   1181         B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
   1182       } else
   1183         // We do not need to touch the higher bits for regular loads.
   1184         B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
   1185     } else {
   1186       // 96-bit loads are only available for vector loads. We need to split this
   1187       // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
   1188       if (MMO->getAlign() < Align(16)) {
   1189         LLT Part64, Part32;
   1190         std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
   1191         auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
   1192         auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
   1193 
   1194         auto Undef = B.buildUndef(LoadTy);
   1195         auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
   1196         B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
   1197       } else {
   1198         LLT WiderTy = widen96To128(LoadTy);
   1199         auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
   1200         B.buildExtract(MI.getOperand(0), WideLoad, 0);
   1201       }
   1202     }
   1203 
   1204     MI.eraseFromParent();
   1205     return true;
   1206   }
   1207 
   1208   // 128-bit loads are supported for all instruction types.
   1209   if (LoadSize <= MaxNonSmrdLoadSize)
   1210     return false;
   1211 
   1212   SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
   1213   SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
   1214 
   1215   if (SrcRegs.empty())
   1216     SrcRegs.push_back(MI.getOperand(1).getReg());
   1217 
   1218   assert(LoadSize % MaxNonSmrdLoadSize == 0);
   1219 
   1220   // RegBankSelect only emits scalar types, so we need to reset the pointer
   1221   // operand to a pointer type.
   1222   Register BasePtrReg = SrcRegs[0];
   1223   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
   1224   MRI.setType(BasePtrReg, PtrTy);
   1225 
   1226   unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
   1227   const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
   1228   ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
   1229   MachineIRBuilder B(MI, Observer);
   1230   LegalizerHelper Helper(B.getMF(), Observer, B);
   1231 
   1232   if (LoadTy.isVector()) {
   1233     if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
   1234       return false;
   1235   } else {
   1236     if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
   1237       return false;
   1238   }
   1239 
   1240   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
   1241   return true;
   1242 }
   1243 
   1244 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
   1245   MachineInstr &MI,
   1246   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
   1247   MachineRegisterInfo &MRI) const {
   1248   const MachineFunction &MF = *MI.getMF();
   1249   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   1250   const auto &TFI = *ST.getFrameLowering();
   1251 
   1252   // Guard in case the stack growth direction ever changes with scratch
   1253   // instructions.
   1254   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
   1255     return false;
   1256 
   1257   Register Dst = MI.getOperand(0).getReg();
   1258   Register AllocSize = MI.getOperand(1).getReg();
   1259   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
   1260 
   1261   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
   1262 
   1263   // TODO: Need to emit a wave reduction to get the maximum size.
   1264   if (SizeBank != &AMDGPU::SGPRRegBank)
   1265     return false;
   1266 
   1267   LLT PtrTy = MRI.getType(Dst);
   1268   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
   1269 
   1270   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   1271   Register SPReg = Info->getStackPtrOffsetReg();
   1272   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
   1273   MachineIRBuilder B(MI, ApplyBank);
   1274 
   1275   auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
   1276   auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
   1277 
   1278   auto SPCopy = B.buildCopy(PtrTy, SPReg);
   1279   if (Alignment > TFI.getStackAlign()) {
   1280     auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
   1281     B.buildMaskLowPtrBits(Dst, PtrAdd,
   1282                           Log2(Alignment) + ST.getWavefrontSizeLog2());
   1283   } else {
   1284     B.buildPtrAdd(Dst, SPCopy, ScaledSize);
   1285   }
   1286 
   1287   MI.eraseFromParent();
   1288   return true;
   1289 }
   1290 
   1291 bool AMDGPURegisterBankInfo::applyMappingImage(
   1292     MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
   1293     MachineRegisterInfo &MRI, int RsrcIdx) const {
   1294   const int NumDefs = MI.getNumExplicitDefs();
   1295 
   1296   // The reported argument index is relative to the IR intrinsic call arguments,
   1297   // so we need to shift by the number of defs and the intrinsic ID.
   1298   RsrcIdx += NumDefs + 1;
   1299 
   1300   // Insert copies to VGPR arguments.
   1301   applyDefaultMapping(OpdMapper);
   1302 
   1303   // Fixup any SGPR arguments.
   1304   SmallVector<unsigned, 4> SGPRIndexes;
   1305   for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
   1306     if (!MI.getOperand(I).isReg())
   1307       continue;
   1308 
   1309     // If this intrinsic has a sampler, it immediately follows rsrc.
   1310     if (I == RsrcIdx || I == RsrcIdx + 1)
   1311       SGPRIndexes.push_back(I);
   1312   }
   1313 
   1314   executeInWaterfallLoop(MI, MRI, SGPRIndexes);
   1315   return true;
   1316 }
   1317 
   1318 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
   1319                                         Register Reg) {
   1320   MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
   1321   if (!Def)
   1322     return Reg;
   1323 
   1324   // TODO: Guard against this being an implicit def
   1325   return Def->getOperand(0).getReg();
   1326 }
   1327 
   1328 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
   1329 // the three offsets (voffset, soffset and instoffset)
   1330 static unsigned setBufferOffsets(MachineIRBuilder &B,
   1331                                  const AMDGPURegisterBankInfo &RBI,
   1332                                  Register CombinedOffset, Register &VOffsetReg,
   1333                                  Register &SOffsetReg, int64_t &InstOffsetVal,
   1334                                  Align Alignment) {
   1335   const LLT S32 = LLT::scalar(32);
   1336   MachineRegisterInfo *MRI = B.getMRI();
   1337 
   1338   if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) {
   1339     uint32_t SOffset, ImmOffset;
   1340     if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
   1341                                  Alignment)) {
   1342       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
   1343       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
   1344       InstOffsetVal = ImmOffset;
   1345 
   1346       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
   1347       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
   1348       return SOffset + ImmOffset;
   1349     }
   1350   }
   1351 
   1352   Register Base;
   1353   unsigned Offset;
   1354 
   1355   std::tie(Base, Offset) =
   1356       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
   1357 
   1358   uint32_t SOffset, ImmOffset;
   1359   if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
   1360                                                   &RBI.Subtarget, Alignment)) {
   1361     if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
   1362       VOffsetReg = Base;
   1363       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
   1364       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
   1365       InstOffsetVal = ImmOffset;
   1366       return 0; // XXX - Why is this 0?
   1367     }
   1368 
   1369     // If we have SGPR base, we can use it for soffset.
   1370     if (SOffset == 0) {
   1371       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
   1372       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
   1373       SOffsetReg = Base;
   1374       InstOffsetVal = ImmOffset;
   1375       return 0; // XXX - Why is this 0?
   1376     }
   1377   }
   1378 
   1379   // Handle the variable sgpr + vgpr case.
   1380   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
   1381   if (Add && (int)Offset >= 0) {
   1382     Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
   1383     Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
   1384 
   1385     const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
   1386     const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
   1387 
   1388     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
   1389       VOffsetReg = Src0;
   1390       SOffsetReg = Src1;
   1391       return 0;
   1392     }
   1393 
   1394     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
   1395       VOffsetReg = Src1;
   1396       SOffsetReg = Src0;
   1397       return 0;
   1398     }
   1399   }
   1400 
   1401   // Ensure we have a VGPR for the combined offset. This could be an issue if we
   1402   // have an SGPR offset and a VGPR resource.
   1403   if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
   1404     VOffsetReg = CombinedOffset;
   1405   } else {
   1406     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
   1407     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
   1408   }
   1409 
   1410   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
   1411   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
   1412   return 0;
   1413 }
   1414 
   1415 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
   1416   const OperandsMapper &OpdMapper) const {
   1417   MachineInstr &MI = OpdMapper.getMI();
   1418   MachineRegisterInfo &MRI = OpdMapper.getMRI();
   1419 
   1420   const LLT S32 = LLT::scalar(32);
   1421   Register Dst = MI.getOperand(0).getReg();
   1422   LLT Ty = MRI.getType(Dst);
   1423 
   1424   const RegisterBank *RSrcBank =
   1425     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
   1426   const RegisterBank *OffsetBank =
   1427     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
   1428   if (RSrcBank == &AMDGPU::SGPRRegBank &&
   1429       OffsetBank == &AMDGPU::SGPRRegBank)
   1430     return true; // Legal mapping
   1431 
   1432   // FIXME: 96-bit case was widened during legalize. We neeed to narrow it back
   1433   // here but don't have an MMO.
   1434 
   1435   unsigned LoadSize = Ty.getSizeInBits();
   1436   int NumLoads = 1;
   1437   if (LoadSize == 256 || LoadSize == 512) {
   1438     NumLoads = LoadSize / 128;
   1439     Ty = Ty.divide(NumLoads);
   1440   }
   1441 
   1442   // Use the alignment to ensure that the required offsets will fit into the
   1443   // immediate offsets.
   1444   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
   1445 
   1446   MachineIRBuilder B(MI);
   1447   MachineFunction &MF = B.getMF();
   1448 
   1449   Register SOffset;
   1450   Register VOffset;
   1451   int64_t ImmOffset = 0;
   1452 
   1453   unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
   1454                                         VOffset, SOffset, ImmOffset, Alignment);
   1455 
   1456   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
   1457   // can, but we neeed to track an MMO for that.
   1458   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
   1459   const Align MemAlign(4); // FIXME: ABI type alignment?
   1460   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
   1461     MachinePointerInfo(),
   1462     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
   1463     MachineMemOperand::MOInvariant,
   1464     MemSize, MemAlign);
   1465   if (MMOOffset != 0)
   1466     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
   1467 
   1468   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
   1469   // assume that the buffer is unswizzled.
   1470 
   1471   Register RSrc = MI.getOperand(1).getReg();
   1472   Register VIndex = B.buildConstant(S32, 0).getReg(0);
   1473   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
   1474 
   1475   SmallVector<Register, 4> LoadParts(NumLoads);
   1476 
   1477   MachineBasicBlock::iterator MII = MI.getIterator();
   1478   MachineInstrSpan Span(MII, &B.getMBB());
   1479 
   1480   for (int i = 0; i < NumLoads; ++i) {
   1481     if (NumLoads == 1) {
   1482       LoadParts[i] = Dst;
   1483     } else {
   1484       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
   1485       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
   1486     }
   1487 
   1488     MachineMemOperand *MMO = BaseMMO;
   1489     if (i != 0)
   1490       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
   1491 
   1492     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
   1493       .addDef(LoadParts[i])       // vdata
   1494       .addUse(RSrc)               // rsrc
   1495       .addUse(VIndex)             // vindex
   1496       .addUse(VOffset)            // voffset
   1497       .addUse(SOffset)            // soffset
   1498       .addImm(ImmOffset + 16 * i) // offset(imm)
   1499       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
   1500       .addImm(0)                  // idxen(imm)
   1501       .addMemOperand(MMO);
   1502   }
   1503 
   1504   // TODO: If only the resource is a VGPR, it may be better to execute the
   1505   // scalar load in the waterfall loop if the resource is expected to frequently
   1506   // be dynamically uniform.
   1507   if (RSrcBank != &AMDGPU::SGPRRegBank) {
   1508     // Remove the original instruction to avoid potentially confusing the
   1509     // waterfall loop logic.
   1510     B.setInstr(*Span.begin());
   1511     MI.eraseFromParent();
   1512 
   1513     SmallSet<Register, 4> OpsToWaterfall;
   1514 
   1515     OpsToWaterfall.insert(RSrc);
   1516     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
   1517                            OpsToWaterfall, MRI);
   1518   }
   1519 
   1520   if (NumLoads != 1) {
   1521     if (Ty.isVector())
   1522       B.buildConcatVectors(Dst, LoadParts);
   1523     else
   1524       B.buildMerge(Dst, LoadParts);
   1525   }
   1526 
   1527   // We removed the instruction earlier with a waterfall loop.
   1528   if (RSrcBank == &AMDGPU::SGPRRegBank)
   1529     MI.eraseFromParent();
   1530 
   1531   return true;
   1532 }
   1533 
   1534 bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
   1535   const OperandsMapper &OpdMapper, bool Signed) const {
   1536   MachineInstr &MI = OpdMapper.getMI();
   1537   MachineRegisterInfo &MRI = OpdMapper.getMRI();
   1538 
   1539   // Insert basic copies
   1540   applyDefaultMapping(OpdMapper);
   1541 
   1542   Register DstReg = MI.getOperand(0).getReg();
   1543   LLT Ty = MRI.getType(DstReg);
   1544 
   1545   const LLT S32 = LLT::scalar(32);
   1546 
   1547   const RegisterBank *DstBank =
   1548     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   1549   if (DstBank == &AMDGPU::VGPRRegBank) {
   1550     if (Ty == S32)
   1551       return true;
   1552 
   1553     // TODO: 64-bit version is scalar only, so we need to expand this.
   1554     return false;
   1555   }
   1556 
   1557   Register SrcReg = MI.getOperand(2).getReg();
   1558   Register OffsetReg = MI.getOperand(3).getReg();
   1559   Register WidthReg = MI.getOperand(4).getReg();
   1560 
   1561   // The scalar form packs the offset and width in a single operand.
   1562 
   1563   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
   1564   MachineIRBuilder B(MI, ApplyBank);
   1565 
   1566   // Ensure the high bits are clear to insert the offset.
   1567   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
   1568   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
   1569 
   1570   // Zeros out the low bits, so don't bother clamping the input value.
   1571   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
   1572 
   1573   // Transformation function, pack the offset and width of a BFE into
   1574   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
   1575   // source, bits [5:0] contain the offset and bits [22:16] the width.
   1576   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
   1577 
   1578   // TODO: It might be worth using a pseudo here to avoid scc clobber and
   1579   // register class constraints.
   1580   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
   1581                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
   1582 
   1583   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
   1584   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
   1585     llvm_unreachable("failed to constrain BFE");
   1586 
   1587   MI.eraseFromParent();
   1588   return true;
   1589 }
   1590 
   1591 // Return a suitable opcode for extending the operands of Opc when widening.
   1592 static unsigned getExtendOp(unsigned Opc) {
   1593   switch (Opc) {
   1594   case TargetOpcode::G_ASHR:
   1595   case TargetOpcode::G_SMIN:
   1596   case TargetOpcode::G_SMAX:
   1597     return TargetOpcode::G_SEXT;
   1598   case TargetOpcode::G_LSHR:
   1599   case TargetOpcode::G_UMIN:
   1600   case TargetOpcode::G_UMAX:
   1601     return TargetOpcode::G_ZEXT;
   1602   default:
   1603     return TargetOpcode::G_ANYEXT;
   1604   }
   1605 }
   1606 
   1607 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
   1608 // any illegal vector extend or unmerge operations.
   1609 static std::pair<Register, Register>
   1610 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
   1611   const LLT S32 = LLT::scalar(32);
   1612   auto Bitcast = B.buildBitcast(S32, Src);
   1613 
   1614   if (ExtOpcode == TargetOpcode::G_SEXT) {
   1615     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
   1616     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
   1617     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
   1618   }
   1619 
   1620   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
   1621   if (ExtOpcode == TargetOpcode::G_ZEXT) {
   1622     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
   1623     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
   1624   }
   1625 
   1626   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
   1627   return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
   1628 }
   1629 
   1630 // For cases where only a single copy is inserted for matching register banks.
   1631 // Replace the register in the instruction operand
   1632 static bool substituteSimpleCopyRegs(
   1633   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
   1634   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
   1635   if (!SrcReg.empty()) {
   1636     assert(SrcReg.size() == 1);
   1637     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
   1638     return true;
   1639   }
   1640 
   1641   return false;
   1642 }
   1643 
   1644 /// Handle register layout difference for f16 images for some subtargets.
   1645 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
   1646                                                 MachineRegisterInfo &MRI,
   1647                                                 Register Reg) const {
   1648   if (!Subtarget.hasUnpackedD16VMem())
   1649     return Reg;
   1650 
   1651   const LLT S16 = LLT::scalar(16);
   1652   LLT StoreVT = MRI.getType(Reg);
   1653   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
   1654     return Reg;
   1655 
   1656   auto Unmerge = B.buildUnmerge(S16, Reg);
   1657 
   1658 
   1659   SmallVector<Register, 4> WideRegs;
   1660   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
   1661     WideRegs.push_back(Unmerge.getReg(I));
   1662 
   1663   const LLT S32 = LLT::scalar(32);
   1664   int NumElts = StoreVT.getNumElements();
   1665 
   1666   return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
   1667 }
   1668 
   1669 static std::pair<Register, unsigned>
   1670 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
   1671   int64_t Const;
   1672   if (mi_match(Reg, MRI, m_ICst(Const)))
   1673     return std::make_pair(Register(), Const);
   1674 
   1675   Register Base;
   1676   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
   1677     return std::make_pair(Base, Const);
   1678 
   1679   // TODO: Handle G_OR used for add case
   1680   return std::make_pair(Reg, 0);
   1681 }
   1682 
   1683 std::pair<Register, unsigned>
   1684 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
   1685                                            Register OrigOffset) const {
   1686   const unsigned MaxImm = 4095;
   1687   Register BaseReg;
   1688   unsigned ImmOffset;
   1689   const LLT S32 = LLT::scalar(32);
   1690 
   1691   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
   1692                                                            OrigOffset);
   1693 
   1694   unsigned C1 = 0;
   1695   if (ImmOffset != 0) {
   1696     // If the immediate value is too big for the immoffset field, put the value
   1697     // and -4096 into the immoffset field so that the value that is copied/added
   1698     // for the voffset field is a multiple of 4096, and it stands more chance
   1699     // of being CSEd with the copy/add for another similar load/store.
   1700     // However, do not do that rounding down to a multiple of 4096 if that is a
   1701     // negative number, as it appears to be illegal to have a negative offset
   1702     // in the vgpr, even if adding the immediate offset makes it positive.
   1703     unsigned Overflow = ImmOffset & ~MaxImm;
   1704     ImmOffset -= Overflow;
   1705     if ((int32_t)Overflow < 0) {
   1706       Overflow += ImmOffset;
   1707       ImmOffset = 0;
   1708     }
   1709 
   1710     C1 = ImmOffset;
   1711     if (Overflow != 0) {
   1712       if (!BaseReg)
   1713         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
   1714       else {
   1715         auto OverflowVal = B.buildConstant(S32, Overflow);
   1716         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
   1717       }
   1718     }
   1719   }
   1720 
   1721   if (!BaseReg)
   1722     BaseReg = B.buildConstant(S32, 0).getReg(0);
   1723 
   1724   return {BaseReg, C1};
   1725 }
   1726 
   1727 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
   1728   int64_t C;
   1729   return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
   1730 }
   1731 
   1732 static unsigned extractCPol(unsigned CachePolicy) {
   1733   return CachePolicy & AMDGPU::CPol::ALL;
   1734 }
   1735 
// Return bit 3 of CachePolicy (0 or 1), used as the swizzle flag.
static unsigned extractSWZ(unsigned CachePolicy) {
  const unsigned SwizzleBitPos = 3;
  const unsigned Shifted = CachePolicy >> SwizzleBitPos;
  return Shifted & 1u;
}
   1739 
   1740 
   1741 MachineInstr *
   1742 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
   1743                                              MachineInstr &MI) const {
   1744    MachineRegisterInfo &MRI = *B.getMRI();
   1745   executeInWaterfallLoop(B, MI, MRI, {2, 4});
   1746 
   1747   // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
   1748 
   1749   Register VData = MI.getOperand(1).getReg();
   1750   LLT Ty = MRI.getType(VData);
   1751 
   1752   int EltSize = Ty.getScalarSizeInBits();
   1753   int Size = Ty.getSizeInBits();
   1754 
   1755   // FIXME: Broken integer truncstore.
   1756   if (EltSize != 32)
   1757     report_fatal_error("unhandled intrinsic store");
   1758 
   1759   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
   1760   const int MemSize = (*MI.memoperands_begin())->getSize();
   1761 
   1762 
   1763   Register RSrc = MI.getOperand(2).getReg();
   1764   Register VOffset = MI.getOperand(3).getReg();
   1765   Register SOffset = MI.getOperand(4).getReg();
   1766   unsigned CachePolicy = MI.getOperand(5).getImm();
   1767 
   1768   unsigned ImmOffset;
   1769   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
   1770 
   1771   const bool Offen = !isZero(VOffset, MRI);
   1772 
   1773   unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
   1774   switch (8 * MemSize) {
   1775   case 8:
   1776     Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
   1777                   AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
   1778     break;
   1779   case 16:
   1780     Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
   1781                   AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
   1782     break;
   1783   default:
   1784     Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
   1785                   AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
   1786     if (Size > 32)
   1787       Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
   1788     break;
   1789   }
   1790 
   1791 
   1792   // Set the insertion point back to the instruction in case it was moved into a
   1793   // loop.
   1794   B.setInstr(MI);
   1795 
   1796   MachineInstrBuilder MIB = B.buildInstr(Opc)
   1797     .addUse(VData);
   1798 
   1799   if (Offen)
   1800     MIB.addUse(VOffset);
   1801 
   1802   MIB.addUse(RSrc)
   1803      .addUse(SOffset)
   1804      .addImm(ImmOffset)
   1805      .addImm(extractCPol(CachePolicy))
   1806      .addImm(0) // tfe: FIXME: Remove from inst
   1807      .addImm(extractSWZ(CachePolicy))
   1808      .cloneMemRefs(MI);
   1809 
   1810   // FIXME: We need a way to report failure from applyMappingImpl.
   1811   // Insert constrain copies before inserting the loop.
   1812   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
   1813     report_fatal_error("failed to constrain selected store intrinsic");
   1814 
   1815   return MIB;
   1816 }
   1817 
   1818 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
   1819                                         Register SrcReg) const {
   1820   MachineRegisterInfo &MRI = *B.getMRI();
   1821   LLT SrcTy = MRI.getType(SrcReg);
   1822   if (SrcTy.getSizeInBits() == 32) {
   1823     // Use a v_mov_b32 here to make the exec dependency explicit.
   1824     B.buildInstr(AMDGPU::V_MOV_B32_e32)
   1825       .addDef(DstReg)
   1826       .addUse(SrcReg);
   1827     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
   1828            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
   1829   }
   1830 
   1831   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   1832   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   1833 
   1834   B.buildInstr(AMDGPU::V_MOV_B32_e32)
   1835     .addDef(TmpReg0)
   1836     .addUse(SrcReg, 0, AMDGPU::sub0);
   1837   B.buildInstr(AMDGPU::V_MOV_B32_e32)
   1838     .addDef(TmpReg1)
   1839     .addUse(SrcReg, 0, AMDGPU::sub1);
   1840   B.buildInstr(AMDGPU::REG_SEQUENCE)
   1841     .addDef(DstReg)
   1842     .addUse(TmpReg0)
   1843     .addImm(AMDGPU::sub0)
   1844     .addUse(TmpReg1)
   1845     .addImm(AMDGPU::sub1);
   1846 
   1847   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
   1848          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
   1849 }
   1850 
   1851 /// Utility function for pushing dynamic vector indexes with a constant offset
   1852 /// into waterwall loops.
   1853 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
   1854                                    MachineInstr &IdxUseInstr,
   1855                                    unsigned OpIdx,
   1856                                    unsigned ConstOffset) {
   1857   MachineRegisterInfo &MRI = *B.getMRI();
   1858   const LLT S32 = LLT::scalar(32);
   1859   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
   1860   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
   1861 
   1862   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
   1863 
   1864   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
   1865   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
   1866   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
   1867   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
   1868 }
   1869 
   1870 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
   1871 /// original 32-bit source value (to be inserted in the low part of the combined
   1872 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
   1873 /// value.
   1874 static void extendLow32IntoHigh32(MachineIRBuilder &B,
   1875                                   Register Hi32Reg, Register Lo32Reg,
   1876                                   unsigned ExtOpc,
   1877                                   const RegisterBank &RegBank,
   1878                                   bool IsBooleanSrc = false) {
   1879   if (ExtOpc == AMDGPU::G_ZEXT) {
   1880     B.buildConstant(Hi32Reg, 0);
   1881   } else if (ExtOpc == AMDGPU::G_SEXT) {
   1882     if (IsBooleanSrc) {
   1883       // If we know the original source was an s1, the high half is the same as
   1884       // the low.
   1885       B.buildCopy(Hi32Reg, Lo32Reg);
   1886     } else {
   1887       // Replicate sign bit from 32-bit extended part.
   1888       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
   1889       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
   1890       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
   1891     }
   1892   } else {
   1893     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
   1894     B.buildUndef(Hi32Reg);
   1895   }
   1896 }
   1897 
   1898 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
   1899   MachineInstr &MI, MachineRegisterInfo &MRI,
   1900   const OperandsMapper &OpdMapper) const {
   1901 
   1902   Register VecReg = MI.getOperand(1).getReg();
   1903   Register Idx = MI.getOperand(2).getReg();
   1904 
   1905   const RegisterBank &IdxBank =
   1906     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
   1907 
   1908   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
   1909 
   1910   LLT VecTy = MRI.getType(VecReg);
   1911   unsigned EltSize = VecTy.getScalarSizeInBits();
   1912   unsigned NumElem = VecTy.getNumElements();
   1913 
   1914   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
   1915                                                   IsDivergentIdx))
   1916     return false;
   1917 
   1918   MachineIRBuilder B(MI);
   1919   LLT S32 = LLT::scalar(32);
   1920 
   1921   const RegisterBank &DstBank =
   1922     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   1923   const RegisterBank &SrcBank =
   1924     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
   1925 
   1926   const RegisterBank &CCBank =
   1927     (DstBank == AMDGPU::SGPRRegBank &&
   1928      SrcBank == AMDGPU::SGPRRegBank &&
   1929      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
   1930                                      : AMDGPU::VCCRegBank;
   1931   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
   1932 
   1933   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
   1934     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
   1935     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
   1936   }
   1937 
   1938   LLT EltTy = VecTy.getScalarType();
   1939   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
   1940   unsigned NumLanes = DstRegs.size();
   1941   if (!NumLanes)
   1942     NumLanes = 1;
   1943   else
   1944     EltTy = MRI.getType(DstRegs[0]);
   1945 
   1946   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
   1947   SmallVector<Register, 2> Res(NumLanes);
   1948   for (unsigned L = 0; L < NumLanes; ++L)
   1949     Res[L] = UnmergeToEltTy.getReg(L);
   1950 
   1951   for (unsigned I = 1; I < NumElem; ++I) {
   1952     auto IC = B.buildConstant(S32, I);
   1953     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
   1954     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
   1955     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
   1956 
   1957     for (unsigned L = 0; L < NumLanes; ++L) {
   1958       auto S = B.buildSelect(EltTy, Cmp,
   1959                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
   1960 
   1961       for (unsigned N : { 0, 2, 3 })
   1962         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
   1963 
   1964       Res[L] = S->getOperand(0).getReg();
   1965     }
   1966   }
   1967 
   1968   for (unsigned L = 0; L < NumLanes; ++L) {
   1969     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
   1970     B.buildCopy(DstReg, Res[L]);
   1971     MRI.setRegBank(DstReg, DstBank);
   1972   }
   1973 
   1974   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
   1975   MI.eraseFromParent();
   1976 
   1977   return true;
   1978 }
   1979 
   1980 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
   1981   MachineInstr &MI, MachineRegisterInfo &MRI,
   1982   const OperandsMapper &OpdMapper) const {
   1983 
   1984   Register VecReg = MI.getOperand(1).getReg();
   1985   Register Idx = MI.getOperand(3).getReg();
   1986 
   1987   const RegisterBank &IdxBank =
   1988     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
   1989 
   1990   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
   1991 
   1992   LLT VecTy = MRI.getType(VecReg);
   1993   unsigned EltSize = VecTy.getScalarSizeInBits();
   1994   unsigned NumElem = VecTy.getNumElements();
   1995 
   1996   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
   1997                                                   IsDivergentIdx))
   1998     return false;
   1999 
   2000   MachineIRBuilder B(MI);
   2001   LLT S32 = LLT::scalar(32);
   2002 
   2003   const RegisterBank &DstBank =
   2004     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   2005   const RegisterBank &SrcBank =
   2006     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
   2007   const RegisterBank &InsBank =
   2008     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
   2009 
   2010   const RegisterBank &CCBank =
   2011     (DstBank == AMDGPU::SGPRRegBank &&
   2012      SrcBank == AMDGPU::SGPRRegBank &&
   2013      InsBank == AMDGPU::SGPRRegBank &&
   2014      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
   2015                                      : AMDGPU::VCCRegBank;
   2016   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
   2017 
   2018   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
   2019     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
   2020     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
   2021   }
   2022 
   2023   LLT EltTy = VecTy.getScalarType();
   2024   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
   2025   unsigned NumLanes = InsRegs.size();
   2026   if (!NumLanes) {
   2027     NumLanes = 1;
   2028     InsRegs.push_back(MI.getOperand(2).getReg());
   2029   } else {
   2030     EltTy = MRI.getType(InsRegs[0]);
   2031   }
   2032 
   2033   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
   2034   SmallVector<Register, 16> Ops(NumElem * NumLanes);
   2035 
   2036   for (unsigned I = 0; I < NumElem; ++I) {
   2037     auto IC = B.buildConstant(S32, I);
   2038     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
   2039     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
   2040     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
   2041 
   2042     for (unsigned L = 0; L < NumLanes; ++L) {
   2043       auto S = B.buildSelect(EltTy, Cmp, InsRegs[L],
   2044                              UnmergeToEltTy.getReg(I * NumLanes + L));
   2045 
   2046       for (unsigned N : { 0, 2, 3 })
   2047         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
   2048 
   2049       Ops[I * NumLanes + L] = S->getOperand(0).getReg();
   2050     }
   2051   }
   2052 
   2053   LLT MergeTy = LLT::vector(Ops.size(), EltTy);
   2054   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
   2055     B.buildBuildVector(MI.getOperand(0), Ops);
   2056   } else {
   2057     auto Vec = B.buildBuildVector(MergeTy, Ops);
   2058     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
   2059     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
   2060   }
   2061 
   2062   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
   2063   MI.eraseFromParent();
   2064 
   2065   return true;
   2066 }
   2067 
   2068 void AMDGPURegisterBankInfo::applyMappingImpl(
   2069     const OperandsMapper &OpdMapper) const {
   2070   MachineInstr &MI = OpdMapper.getMI();
   2071   unsigned Opc = MI.getOpcode();
   2072   MachineRegisterInfo &MRI = OpdMapper.getMRI();
   2073   switch (Opc) {
   2074   case AMDGPU::G_PHI: {
   2075     Register DstReg = MI.getOperand(0).getReg();
   2076     LLT DstTy = MRI.getType(DstReg);
   2077     if (DstTy != LLT::scalar(1))
   2078       break;
   2079 
   2080     const LLT S32 = LLT::scalar(32);
   2081     const RegisterBank *DstBank =
   2082       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   2083     if (DstBank == &AMDGPU::VCCRegBank) {
   2084       applyDefaultMapping(OpdMapper);
   2085       // The standard handling only considers the result register bank for
   2086       // phis. For VCC, blindly inserting a copy when the phi is lowered will
   2087       // produce an invalid copy. We can only copy with some kind of compare to
   2088       // get a vector boolean result. Insert a register bank copy that will be
   2089       // correctly lowered to a compare.
   2090       MachineIRBuilder B(*MI.getParent()->getParent());
   2091 
   2092       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
   2093         Register SrcReg = MI.getOperand(I).getReg();
   2094         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
   2095 
   2096         if (SrcBank != &AMDGPU::VCCRegBank) {
   2097           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
   2098           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
   2099 
   2100           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
   2101           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
   2102           MI.getOperand(I).setReg(Copy.getReg(0));
   2103         }
   2104       }
   2105 
   2106       return;
   2107     }
   2108 
   2109     // Phi handling is strange and only considers the bank of the destination.
   2110     substituteSimpleCopyRegs(OpdMapper, 0);
   2111 
   2112     // Promote SGPR/VGPR booleans to s32
   2113     MachineFunction *MF = MI.getParent()->getParent();
   2114     ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
   2115     MachineIRBuilder B(MI, ApplyBank);
   2116     LegalizerHelper Helper(*MF, ApplyBank, B);
   2117 
   2118     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
   2119       llvm_unreachable("widen scalar should have succeeded");
   2120 
   2121     return;
   2122   }
   2123   case AMDGPU::G_ICMP:
   2124   case AMDGPU::G_UADDO:
   2125   case AMDGPU::G_USUBO:
   2126   case AMDGPU::G_UADDE:
   2127   case AMDGPU::G_SADDE:
   2128   case AMDGPU::G_USUBE:
   2129   case AMDGPU::G_SSUBE: {
   2130     unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
   2131     Register DstReg = MI.getOperand(BoolDstOp).getReg();
   2132 
   2133     const RegisterBank *DstBank =
   2134       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   2135     if (DstBank != &AMDGPU::SGPRRegBank)
   2136       break;
   2137 
   2138     const bool HasCarryIn = MI.getNumOperands() == 5;
   2139 
   2140     // If this is a scalar compare, promote the result to s32, as the selection
   2141     // will end up using a copy to a 32-bit vreg.
   2142     const LLT S32 = LLT::scalar(32);
   2143     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
   2144     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
   2145     MI.getOperand(BoolDstOp).setReg(NewDstReg);
   2146     MachineIRBuilder B(MI);
   2147 
   2148     if (HasCarryIn) {
   2149       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
   2150       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
   2151       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
   2152       MI.getOperand(4).setReg(NewSrcReg);
   2153     }
   2154 
   2155     MachineBasicBlock *MBB = MI.getParent();
   2156     B.setInsertPt(*MBB, std::next(MI.getIterator()));
   2157 
   2158     // If we had a constrained VCC result register, a copy was inserted to VCC
   2159     // from SGPR.
   2160     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
   2161     if (DefRegs.empty())
   2162       DefRegs.push_back(DstReg);
   2163     B.buildTrunc(DefRegs[0], NewDstReg);
   2164     return;
   2165   }
   2166   case AMDGPU::G_SELECT: {
   2167     Register DstReg = MI.getOperand(0).getReg();
   2168     LLT DstTy = MRI.getType(DstReg);
   2169 
   2170     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
   2171     if (CondRegs.empty())
   2172       CondRegs.push_back(MI.getOperand(1).getReg());
   2173     else {
   2174       assert(CondRegs.size() == 1);
   2175     }
   2176 
   2177     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
   2178     if (CondBank == &AMDGPU::SGPRRegBank) {
   2179       MachineIRBuilder B(MI);
   2180       const LLT S32 = LLT::scalar(32);
   2181       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
   2182       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
   2183 
   2184       MI.getOperand(1).setReg(NewCondReg);
   2185       B.buildZExt(NewCondReg, CondRegs[0]);
   2186     }
   2187 
   2188     if (DstTy.getSizeInBits() != 64)
   2189       break;
   2190 
   2191     MachineIRBuilder B(MI);
   2192     LLT HalfTy = getHalfSizedType(DstTy);
   2193 
   2194     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
   2195     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
   2196     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
   2197 
   2198     // All inputs are SGPRs, nothing special to do.
   2199     if (DefRegs.empty()) {
   2200       assert(Src1Regs.empty() && Src2Regs.empty());
   2201       break;
   2202     }
   2203 
   2204     if (Src1Regs.empty())
   2205       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
   2206     else {
   2207       setRegsToType(MRI, Src1Regs, HalfTy);
   2208     }
   2209 
   2210     if (Src2Regs.empty())
   2211       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
   2212     else
   2213       setRegsToType(MRI, Src2Regs, HalfTy);
   2214 
   2215     setRegsToType(MRI, DefRegs, HalfTy);
   2216 
   2217     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
   2218     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
   2219 
   2220     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
   2221     MI.eraseFromParent();
   2222     return;
   2223   }
   2224   case AMDGPU::G_BRCOND: {
   2225     Register CondReg = MI.getOperand(0).getReg();
   2226     // FIXME: Should use legalizer helper, but should change bool ext type.
   2227     const RegisterBank *CondBank =
   2228       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   2229 
   2230     if (CondBank == &AMDGPU::SGPRRegBank) {
   2231       MachineIRBuilder B(MI);
   2232       const LLT S32 = LLT::scalar(32);
   2233       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
   2234       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
   2235 
   2236       MI.getOperand(0).setReg(NewCondReg);
   2237       B.buildZExt(NewCondReg, CondReg);
   2238       return;
   2239     }
   2240 
   2241     break;
   2242   }
   2243   case AMDGPU::G_AND:
   2244   case AMDGPU::G_OR:
   2245   case AMDGPU::G_XOR: {
   2246     // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
   2247     // there is a VGPR input.
   2248     Register DstReg = MI.getOperand(0).getReg();
   2249     LLT DstTy = MRI.getType(DstReg);
   2250 
   2251     if (DstTy.getSizeInBits() == 1) {
   2252       const RegisterBank *DstBank =
   2253         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   2254       if (DstBank == &AMDGPU::VCCRegBank)
   2255         break;
   2256 
   2257       MachineFunction *MF = MI.getParent()->getParent();
   2258       ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
   2259       MachineIRBuilder B(MI, ApplyBank);
   2260       LegalizerHelper Helper(*MF, ApplyBank, B);
   2261 
   2262       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
   2263           LegalizerHelper::Legalized)
   2264         llvm_unreachable("widen scalar should have succeeded");
   2265       return;
   2266     }
   2267 
   2268     if (DstTy.getSizeInBits() != 64)
   2269       break;
   2270 
   2271     LLT HalfTy = getHalfSizedType(DstTy);
   2272     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
   2273     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
   2274     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
   2275 
   2276     // All inputs are SGPRs, nothing special to do.
   2277     if (DefRegs.empty()) {
   2278       assert(Src0Regs.empty() && Src1Regs.empty());
   2279       break;
   2280     }
   2281 
   2282     assert(DefRegs.size() == 2);
   2283     assert(Src0Regs.size() == Src1Regs.size() &&
   2284            (Src0Regs.empty() || Src0Regs.size() == 2));
   2285 
   2286     // Depending on where the source registers came from, the generic code may
   2287     // have decided to split the inputs already or not. If not, we still need to
   2288     // extract the values.
   2289     MachineIRBuilder B(MI);
   2290 
   2291     if (Src0Regs.empty())
   2292       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
   2293     else
   2294       setRegsToType(MRI, Src0Regs, HalfTy);
   2295 
   2296     if (Src1Regs.empty())
   2297       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
   2298     else
   2299       setRegsToType(MRI, Src1Regs, HalfTy);
   2300 
   2301     setRegsToType(MRI, DefRegs, HalfTy);
   2302 
   2303     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
   2304     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
   2305 
   2306     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
   2307     MI.eraseFromParent();
   2308     return;
   2309   }
   2310   case AMDGPU::G_ADD:
   2311   case AMDGPU::G_SUB:
   2312   case AMDGPU::G_MUL:
   2313   case AMDGPU::G_SHL:
   2314   case AMDGPU::G_LSHR:
   2315   case AMDGPU::G_ASHR:
   2316   case AMDGPU::G_SMIN:
   2317   case AMDGPU::G_SMAX:
   2318   case AMDGPU::G_UMIN:
   2319   case AMDGPU::G_UMAX: {
   2320     Register DstReg = MI.getOperand(0).getReg();
   2321     LLT DstTy = MRI.getType(DstReg);
   2322 
   2323     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
   2324     // Packed 16-bit operations need to be scalarized and promoted.
   2325     if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16))
   2326       break;
   2327 
   2328     const RegisterBank *DstBank =
   2329       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   2330     if (DstBank == &AMDGPU::VGPRRegBank)
   2331       break;
   2332 
   2333     const LLT S32 = LLT::scalar(32);
   2334     MachineBasicBlock *MBB = MI.getParent();
   2335     MachineFunction *MF = MBB->getParent();
   2336     ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
   2337     MachineIRBuilder B(MI, ApplySALU);
   2338 
   2339     if (DstTy.isVector()) {
   2340       Register WideSrc0Lo, WideSrc0Hi;
   2341       Register WideSrc1Lo, WideSrc1Hi;
   2342 
   2343       unsigned ExtendOp = getExtendOp(MI.getOpcode());
   2344       std::tie(WideSrc0Lo, WideSrc0Hi)
   2345         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
   2346       std::tie(WideSrc1Lo, WideSrc1Hi)
   2347         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
   2348       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
   2349       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
   2350       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
   2351       MI.eraseFromParent();
   2352     } else {
   2353       LegalizerHelper Helper(*MF, ApplySALU, B);
   2354 
   2355       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
   2356         llvm_unreachable("widen scalar should have succeeded");
   2357 
   2358       // FIXME: s16 shift amounts should be legal.
   2359       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
   2360           Opc == AMDGPU::G_ASHR) {
   2361         B.setInsertPt(*MBB, MI.getIterator());
   2362         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
   2363           llvm_unreachable("widen scalar should have succeeded");
   2364       }
   2365     }
   2366 
   2367     return;
   2368   }
   2369   case AMDGPU::G_SEXT_INREG: {
   2370     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
   2371     if (SrcRegs.empty())
   2372       break; // Nothing to repair
   2373 
   2374     const LLT S32 = LLT::scalar(32);
   2375     MachineIRBuilder B(MI);
   2376     ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
   2377     GISelObserverWrapper Observer(&O);
   2378     B.setChangeObserver(Observer);
   2379 
   2380     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
   2381     // we would need to further expand, and doesn't let us directly set the
   2382     // result registers.
   2383     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
   2384 
   2385     int Amt = MI.getOperand(2).getImm();
   2386     if (Amt <= 32) {
   2387       if (Amt == 32) {
   2388         // The low bits are unchanged.
   2389         B.buildCopy(DstRegs[0], SrcRegs[0]);
   2390       } else {
   2391         // Extend in the low bits and propagate the sign bit to the high half.
   2392         B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
   2393       }
   2394 
   2395       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
   2396     } else {
   2397       // The low bits are unchanged, and extend in the high bits.
   2398       B.buildCopy(DstRegs[0], SrcRegs[0]);
   2399       B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
   2400     }
   2401 
   2402     Register DstReg = MI.getOperand(0).getReg();
   2403     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
   2404     MI.eraseFromParent();
   2405     return;
   2406   }
   2407   case AMDGPU::G_CTPOP:
   2408   case AMDGPU::G_BITREVERSE:
   2409   case AMDGPU::G_CTLZ_ZERO_UNDEF:
   2410   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
   2411     const RegisterBank *DstBank =
   2412       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   2413     if (DstBank == &AMDGPU::SGPRRegBank)
   2414       break;
   2415 
   2416     Register SrcReg = MI.getOperand(1).getReg();
   2417     const LLT S32 = LLT::scalar(32);
   2418     LLT Ty = MRI.getType(SrcReg);
   2419     if (Ty == S32)
   2420       break;
   2421 
   2422     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
   2423     MachineIRBuilder B(MI, ApplyVALU);
   2424 
   2425     MachineFunction &MF = B.getMF();
   2426     LegalizerHelper Helper(MF, ApplyVALU, B);
   2427 
   2428     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
   2429       llvm_unreachable("narrowScalar should have succeeded");
   2430     return;
   2431   }
   2432   case AMDGPU::G_SEXT:
   2433   case AMDGPU::G_ZEXT:
   2434   case AMDGPU::G_ANYEXT: {
   2435     Register SrcReg = MI.getOperand(1).getReg();
   2436     LLT SrcTy = MRI.getType(SrcReg);
   2437     const bool Signed = Opc == AMDGPU::G_SEXT;
   2438 
   2439     assert(empty(OpdMapper.getVRegs(1)));
   2440 
   2441     MachineIRBuilder B(MI);
   2442     const RegisterBank *SrcBank =
   2443       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
   2444 
   2445     Register DstReg = MI.getOperand(0).getReg();
   2446     LLT DstTy = MRI.getType(DstReg);
   2447     if (DstTy.isScalar() &&
   2448         SrcBank != &AMDGPU::SGPRRegBank &&
   2449         SrcBank != &AMDGPU::VCCRegBank &&
   2450         // FIXME: Should handle any type that round to s64 when irregular
   2451         // breakdowns supported.
   2452         DstTy.getSizeInBits() == 64 &&
   2453         SrcTy.getSizeInBits() <= 32) {
   2454       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
   2455 
   2456       // Extend to 32-bit, and then extend the low half.
   2457       if (Signed) {
   2458         // TODO: Should really be buildSExtOrCopy
   2459         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
   2460       } else if (Opc == AMDGPU::G_ZEXT) {
   2461         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
   2462       } else {
   2463         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
   2464       }
   2465 
   2466       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
   2467       MRI.setRegBank(DstReg, *SrcBank);
   2468       MI.eraseFromParent();
   2469       return;
   2470     }
   2471 
   2472     if (SrcTy != LLT::scalar(1))
   2473       return;
   2474 
   2475     // It is not legal to have a legalization artifact with a VCC source. Rather
   2476     // than introducing a copy, insert the select we would have to select the
   2477     // copy to.
   2478     if (SrcBank == &AMDGPU::VCCRegBank) {
   2479       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
   2480 
   2481       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
   2482 
   2483       unsigned DstSize = DstTy.getSizeInBits();
   2484       // 64-bit select is SGPR only
   2485       const bool UseSel64 = DstSize > 32 &&
   2486         SrcBank->getID() == AMDGPU::SGPRRegBankID;
   2487 
   2488       // TODO: Should s16 select be legal?
   2489       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
   2490       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
   2491       auto False = B.buildConstant(SelType, 0);
   2492 
   2493       MRI.setRegBank(True.getReg(0), *DstBank);
   2494       MRI.setRegBank(False.getReg(0), *DstBank);
   2495       MRI.setRegBank(DstReg, *DstBank);
   2496 
   2497       if (DstSize > 32) {
   2498         B.buildSelect(DefRegs[0], SrcReg, True, False);
   2499         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
   2500       } else if (DstSize < 32) {
   2501         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
   2502         MRI.setRegBank(Sel.getReg(0), *DstBank);
   2503         B.buildTrunc(DstReg, Sel);
   2504       } else {
   2505         B.buildSelect(DstReg, SrcReg, True, False);
   2506       }
   2507 
   2508       MI.eraseFromParent();
   2509       return;
   2510     }
   2511 
   2512     break;
   2513   }
   2514   case AMDGPU::G_BUILD_VECTOR:
   2515   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
   2516     Register DstReg = MI.getOperand(0).getReg();
   2517     LLT DstTy = MRI.getType(DstReg);
   2518     if (DstTy != LLT::vector(2, 16))
   2519       break;
   2520 
   2521     assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
   2522     substituteSimpleCopyRegs(OpdMapper, 1);
   2523     substituteSimpleCopyRegs(OpdMapper, 2);
   2524 
   2525     const RegisterBank *DstBank =
   2526       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   2527     if (DstBank == &AMDGPU::SGPRRegBank)
   2528       break; // Can use S_PACK_* instructions.
   2529 
   2530     MachineIRBuilder B(MI);
   2531 
   2532     Register Lo = MI.getOperand(1).getReg();
   2533     Register Hi = MI.getOperand(2).getReg();
   2534     const LLT S32 = LLT::scalar(32);
   2535 
   2536     const RegisterBank *BankLo =
   2537       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
   2538     const RegisterBank *BankHi =
   2539       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
   2540 
   2541     Register ZextLo;
   2542     Register ShiftHi;
   2543 
   2544     if (Opc == AMDGPU::G_BUILD_VECTOR) {
   2545       ZextLo = B.buildZExt(S32, Lo).getReg(0);
   2546       MRI.setRegBank(ZextLo, *BankLo);
   2547 
   2548       Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
   2549       MRI.setRegBank(ZextHi, *BankHi);
   2550 
   2551       auto ShiftAmt = B.buildConstant(S32, 16);
   2552       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
   2553 
   2554       ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
   2555       MRI.setRegBank(ShiftHi, *BankHi);
   2556     } else {
   2557       Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
   2558       MRI.setRegBank(MaskLo, *BankLo);
   2559 
   2560       auto ShiftAmt = B.buildConstant(S32, 16);
   2561       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
   2562 
   2563       ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
   2564       MRI.setRegBank(ShiftHi, *BankHi);
   2565 
   2566       ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
   2567       MRI.setRegBank(ZextLo, *BankLo);
   2568     }
   2569 
   2570     auto Or = B.buildOr(S32, ZextLo, ShiftHi);
   2571     MRI.setRegBank(Or.getReg(0), *DstBank);
   2572 
   2573     B.buildBitcast(DstReg, Or);
   2574     MI.eraseFromParent();
   2575     return;
   2576   }
   2577   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
   2578     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
   2579 
   2580     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
   2581 
   2582     Register DstReg = MI.getOperand(0).getReg();
   2583     Register SrcReg = MI.getOperand(1).getReg();
   2584 
   2585     const LLT S32 = LLT::scalar(32);
   2586     LLT DstTy = MRI.getType(DstReg);
   2587     LLT SrcTy = MRI.getType(SrcReg);
   2588 
   2589     if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
   2590       return;
   2591 
   2592     MachineIRBuilder B(MI);
   2593 
   2594     const ValueMapping &DstMapping
   2595       = OpdMapper.getInstrMapping().getOperandMapping(0);
   2596     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
   2597     const RegisterBank *SrcBank =
   2598       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
   2599     const RegisterBank *IdxBank =
   2600         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
   2601 
   2602     Register BaseIdxReg;
   2603     unsigned ConstOffset;
   2604     std::tie(BaseIdxReg, ConstOffset) =
   2605         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
   2606 
   2607     // See if the index is an add of a constant which will be foldable by moving
   2608     // the base register of the index later if this is going to be executed in a
   2609     // waterfall loop. This is essentially to reassociate the add of a constant
   2610     // with the readfirstlane.
   2611     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
   2612                                    ConstOffset > 0 &&
   2613                                    ConstOffset < SrcTy.getNumElements();
   2614 
   2615     // Move the base register. We'll re-insert the add later.
   2616     if (ShouldMoveIndexIntoLoop)
   2617       MI.getOperand(2).setReg(BaseIdxReg);
   2618 
   2619     // If this is a VGPR result only because the index was a VGPR result, the
   2620     // actual indexing will be done on the SGPR source vector, which will
   2621     // produce a scalar result. We need to copy to the VGPR result inside the
   2622     // waterfall loop.
   2623     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
   2624                                 SrcBank == &AMDGPU::SGPRRegBank;
   2625     if (DstRegs.empty()) {
   2626       applyDefaultMapping(OpdMapper);
   2627 
   2628       executeInWaterfallLoop(MI, MRI, { 2 });
   2629 
   2630       if (NeedCopyToVGPR) {
   2631         // We don't want a phi for this temporary reg.
   2632         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
   2633         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
   2634         MI.getOperand(0).setReg(TmpReg);
   2635         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
   2636 
   2637         // Use a v_mov_b32 here to make the exec dependency explicit.
   2638         buildVCopy(B, DstReg, TmpReg);
   2639       }
   2640 
   2641       // Re-insert the constant offset add inside the waterfall loop.
   2642       if (ShouldMoveIndexIntoLoop)
   2643         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
   2644 
   2645       return;
   2646     }
   2647 
   2648     assert(DstTy.getSizeInBits() == 64);
   2649 
   2650     LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
   2651 
   2652     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
   2653     auto One = B.buildConstant(S32, 1);
   2654 
   2655     MachineBasicBlock::iterator MII = MI.getIterator();
   2656 
   2657     // Split the vector index into 32-bit pieces. Prepare to move all of the
   2658     // new instructions into a waterfall loop if necessary.
   2659     //
   2660     // Don't put the bitcast or constant in the loop.
   2661     MachineInstrSpan Span(MII, &B.getMBB());
   2662 
   2663     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
   2664     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
   2665     auto IdxHi = B.buildAdd(S32, IdxLo, One);
   2666 
   2667     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
   2668     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
   2669 
   2670     MRI.setRegBank(DstReg, *DstBank);
   2671     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
   2672     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
   2673     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
   2674     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
   2675 
   2676     SmallSet<Register, 4> OpsToWaterfall;
   2677     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
   2678       MI.eraseFromParent();
   2679       return;
   2680     }
   2681 
   2682     // Remove the original instruction to avoid potentially confusing the
   2683     // waterfall loop logic.
   2684     B.setInstr(*Span.begin());
   2685     MI.eraseFromParent();
   2686     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
   2687                            OpsToWaterfall, MRI);
   2688 
   2689     if (NeedCopyToVGPR) {
   2690       MachineBasicBlock *LoopBB = Extract1->getParent();
   2691       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
   2692       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
   2693       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
   2694       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
   2695 
   2696       Extract0->getOperand(0).setReg(TmpReg0);
   2697       Extract1->getOperand(0).setReg(TmpReg1);
   2698 
   2699       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
   2700 
   2701       buildVCopy(B, DstRegs[0], TmpReg0);
   2702       buildVCopy(B, DstRegs[1], TmpReg1);
   2703     }
   2704 
   2705     if (ShouldMoveIndexIntoLoop)
   2706       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
   2707 
   2708     return;
   2709   }
   2710   case AMDGPU::G_INSERT_VECTOR_ELT: {
   2711     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
   2712 
   2713     Register DstReg = MI.getOperand(0).getReg();
   2714     LLT VecTy = MRI.getType(DstReg);
   2715 
   2716     assert(OpdMapper.getVRegs(0).empty());
   2717     assert(OpdMapper.getVRegs(3).empty());
   2718 
   2719     if (substituteSimpleCopyRegs(OpdMapper, 1))
   2720       MRI.setType(MI.getOperand(1).getReg(), VecTy);
   2721 
   2722     if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
   2723       return;
   2724 
   2725     const RegisterBank *IdxBank =
   2726       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
   2727 
   2728     Register SrcReg = MI.getOperand(1).getReg();
   2729     Register InsReg = MI.getOperand(2).getReg();
   2730     LLT InsTy = MRI.getType(InsReg);
   2731     (void)InsTy;
   2732 
   2733     Register BaseIdxReg;
   2734     unsigned ConstOffset;
   2735     std::tie(BaseIdxReg, ConstOffset) =
   2736         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
   2737 
   2738     // See if the index is an add of a constant which will be foldable by moving
   2739     // the base register of the index later if this is going to be executed in a
   2740     // waterfall loop. This is essentially to reassociate the add of a constant
   2741     // with the readfirstlane.
   2742     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
   2743       ConstOffset > 0 &&
   2744       ConstOffset < VecTy.getNumElements();
   2745 
   2746     // Move the base register. We'll re-insert the add later.
   2747     if (ShouldMoveIndexIntoLoop)
   2748       MI.getOperand(3).setReg(BaseIdxReg);
   2749 
   2750 
   2751     if (InsRegs.empty()) {
   2752       executeInWaterfallLoop(MI, MRI, { 3 });
   2753 
   2754       // Re-insert the constant offset add inside the waterfall loop.
   2755       if (ShouldMoveIndexIntoLoop) {
   2756         MachineIRBuilder B(MI);
   2757         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
   2758       }
   2759 
   2760       return;
   2761     }
   2762 
   2763 
   2764     assert(InsTy.getSizeInBits() == 64);
   2765 
   2766     const LLT S32 = LLT::scalar(32);
   2767     LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
   2768 
   2769     MachineIRBuilder B(MI);
   2770     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
   2771     auto One = B.buildConstant(S32, 1);
   2772 
   2773     // Split the vector index into 32-bit pieces. Prepare to move all of the
   2774     // new instructions into a waterfall loop if necessary.
   2775     //
   2776     // Don't put the bitcast or constant in the loop.
   2777     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
   2778 
   2779     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
   2780     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
   2781     auto IdxHi = B.buildAdd(S32, IdxLo, One);
   2782 
   2783     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
   2784     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
   2785 
   2786     const RegisterBank *DstBank =
   2787       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
   2788     const RegisterBank *SrcBank =
   2789       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
   2790     const RegisterBank *InsSrcBank =
   2791       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
   2792 
   2793     MRI.setRegBank(InsReg, *InsSrcBank);
   2794     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
   2795     MRI.setRegBank(InsLo.getReg(0), *DstBank);
   2796     MRI.setRegBank(InsHi.getReg(0), *DstBank);
   2797     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
   2798     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
   2799     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
   2800 
   2801 
   2802     SmallSet<Register, 4> OpsToWaterfall;
   2803     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
   2804       B.setInsertPt(B.getMBB(), MI);
   2805       B.buildBitcast(DstReg, InsHi);
   2806       MI.eraseFromParent();
   2807       return;
   2808     }
   2809 
   2810     B.setInstr(*Span.begin());
   2811     MI.eraseFromParent();
   2812 
   2813     // Figure out the point after the waterfall loop before mangling the control
   2814     // flow.
   2815     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
   2816                            OpsToWaterfall, MRI);
   2817 
   2818     // The insertion point is now right after the original instruction.
   2819     //
   2820     // Keep the bitcast to the original vector type out of the loop. Doing this
   2821     // saved an extra phi we don't need inside the loop.
   2822     B.buildBitcast(DstReg, InsHi);
   2823 
   2824     // Re-insert the constant offset add inside the waterfall loop.
   2825     if (ShouldMoveIndexIntoLoop)
   2826       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
   2827 
   2828     return;
   2829   }
   2830   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
   2831   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
   2832   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
   2833   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
   2834   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
   2835   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
   2836   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
   2837   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
   2838   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
   2839   case AMDGPU::G_AMDGPU_BUFFER_STORE:
   2840   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
   2841   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
   2842   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
   2843   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
   2844   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
   2845   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
   2846     applyDefaultMapping(OpdMapper);
   2847     executeInWaterfallLoop(MI, MRI, {1, 4});
   2848     return;
   2849   }
   2850   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
   2851   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
   2852   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
   2853   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
   2854   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
   2855   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
   2856   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
   2857   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
   2858   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
   2859   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
   2860   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
   2861   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
   2862     applyDefaultMapping(OpdMapper);
   2863     executeInWaterfallLoop(MI, MRI, {2, 5});
   2864     return;
   2865   }
   2866   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
   2867   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
   2868   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
   2869     applyDefaultMapping(OpdMapper);
   2870     executeInWaterfallLoop(MI, MRI, {2, 5});
   2871     return;
   2872   }
   2873   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
   2874     applyDefaultMapping(OpdMapper);
   2875     executeInWaterfallLoop(MI, MRI, {3, 6});
   2876     return;
   2877   }
   2878   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
   2879     applyMappingSBufferLoad(OpdMapper);
   2880     return;
   2881   }
   2882   case AMDGPU::G_INTRINSIC: {
   2883     switch (MI.getIntrinsicID()) {
   2884     case Intrinsic::amdgcn_readlane: {
   2885       substituteSimpleCopyRegs(OpdMapper, 2);
   2886 
   2887       assert(OpdMapper.getVRegs(0).empty());
   2888       assert(OpdMapper.getVRegs(3).empty());
   2889 
   2890       // Make sure the index is an SGPR. It doesn't make sense to run this in a
   2891       // waterfall loop, so assume it's a uniform value.
   2892       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
   2893       return;
   2894     }
   2895     case Intrinsic::amdgcn_writelane: {
   2896       assert(OpdMapper.getVRegs(0).empty());
   2897       assert(OpdMapper.getVRegs(2).empty());
   2898       assert(OpdMapper.getVRegs(3).empty());
   2899 
   2900       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
   2901       constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
   2902       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
   2903       return;
   2904     }
   2905     case Intrinsic::amdgcn_interp_p1:
   2906     case Intrinsic::amdgcn_interp_p2:
   2907     case Intrinsic::amdgcn_interp_mov:
   2908     case Intrinsic::amdgcn_interp_p1_f16:
   2909     case Intrinsic::amdgcn_interp_p2_f16: {
   2910       applyDefaultMapping(OpdMapper);
   2911 
   2912       // Readlane for m0 value, which is always the last operand.
   2913       // FIXME: Should this be a waterfall loop instead?
   2914       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
   2915       return;
   2916     }
   2917     case Intrinsic::amdgcn_permlane16:
   2918     case Intrinsic::amdgcn_permlanex16: {
   2919       // Doing a waterfall loop over these wouldn't make any sense.
   2920       substituteSimpleCopyRegs(OpdMapper, 2);
   2921       substituteSimpleCopyRegs(OpdMapper, 3);
   2922       constrainOpWithReadfirstlane(MI, MRI, 4);
   2923       constrainOpWithReadfirstlane(MI, MRI, 5);
   2924       return;
   2925     }
   2926     case Intrinsic::amdgcn_sbfe:
   2927       applyMappingBFEIntrinsic(OpdMapper, true);
   2928       return;
   2929     case Intrinsic::amdgcn_ubfe:
   2930       applyMappingBFEIntrinsic(OpdMapper, false);
   2931       return;
   2932     case Intrinsic::amdgcn_ballot:
   2933       // Use default handling and insert copy to vcc source.
   2934       break;
   2935     }
   2936     break;
   2937   }
   2938   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
   2939   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
   2940     const AMDGPU::RsrcIntrinsic *RSrcIntrin
   2941       = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
   2942     assert(RSrcIntrin && RSrcIntrin->IsImage);
   2943     // Non-images can have complications from operands that allow both SGPR
   2944     // and VGPR. For now it's too complicated to figure out the final opcode
   2945     // to derive the register bank from the MCInstrDesc.
   2946     applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
   2947     return;
   2948   }
   2949   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
   2950     unsigned N = MI.getNumExplicitOperands() - 2;
   2951     executeInWaterfallLoop(MI, MRI, { N });
   2952     return;
   2953   }
   2954   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
   2955     auto IntrID = MI.getIntrinsicID();
   2956     switch (IntrID) {
   2957     case Intrinsic::amdgcn_ds_ordered_add:
   2958     case Intrinsic::amdgcn_ds_ordered_swap: {
   2959       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
   2960       assert(OpdMapper.getVRegs(0).empty());
   2961       substituteSimpleCopyRegs(OpdMapper, 3);
   2962       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
   2963       return;
   2964     }
   2965     case Intrinsic::amdgcn_ds_gws_init:
   2966     case Intrinsic::amdgcn_ds_gws_barrier:
   2967     case Intrinsic::amdgcn_ds_gws_sema_br: {
   2968       // Only the first lane is executes, so readfirstlane is safe.
   2969       substituteSimpleCopyRegs(OpdMapper, 1);
   2970       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
   2971       return;
   2972     }
   2973     case Intrinsic::amdgcn_ds_gws_sema_v:
   2974     case Intrinsic::amdgcn_ds_gws_sema_p:
   2975     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
   2976       // Only the first lane is executes, so readfirstlane is safe.
   2977       constrainOpWithReadfirstlane(MI, MRI, 1); // M0
   2978       return;
   2979     }
   2980     case Intrinsic::amdgcn_ds_append:
   2981     case Intrinsic::amdgcn_ds_consume: {
   2982       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
   2983       return;
   2984     }
   2985     case Intrinsic::amdgcn_s_sendmsg:
   2986     case Intrinsic::amdgcn_s_sendmsghalt: {
   2987       // FIXME: Should this use a waterfall loop?
   2988       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
   2989       return;
   2990     }
   2991     case Intrinsic::amdgcn_s_setreg: {
   2992       constrainOpWithReadfirstlane(MI, MRI, 2);
   2993       return;
   2994     }
   2995     default: {
   2996       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
   2997               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
   2998         // Non-images can have complications from operands that allow both SGPR
   2999         // and VGPR. For now it's too complicated to figure out the final opcode
   3000         // to derive the register bank from the MCInstrDesc.
   3001         if (RSrcIntrin->IsImage) {
   3002           applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
   3003           return;
   3004         }
   3005       }
   3006 
   3007       break;
   3008     }
   3009     }
   3010     break;
   3011   }
   3012   case AMDGPU::G_LOAD:
   3013   case AMDGPU::G_ZEXTLOAD:
   3014   case AMDGPU::G_SEXTLOAD: {
   3015     if (applyMappingLoad(MI, OpdMapper, MRI))
   3016       return;
   3017     break;
   3018   }
   3019   case AMDGPU::G_DYN_STACKALLOC:
   3020     applyMappingDynStackAlloc(MI, OpdMapper, MRI);
   3021     return;
   3022   default:
   3023     break;
   3024   }
   3025 
   3026   return applyDefaultMapping(OpdMapper);
   3027 }
   3028 
   3029 // vgpr, sgpr -> vgpr
   3030 // vgpr, agpr -> vgpr
   3031 // agpr, agpr -> agpr
   3032 // agpr, sgpr -> vgpr
   3033 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
   3034   if (RB0 == AMDGPU::InvalidRegBankID)
   3035     return RB1;
   3036   if (RB1 == AMDGPU::InvalidRegBankID)
   3037     return RB0;
   3038 
   3039   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
   3040     return AMDGPU::SGPRRegBankID;
   3041 
   3042   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
   3043     return AMDGPU::AGPRRegBankID;
   3044 
   3045   return AMDGPU::VGPRRegBankID;
   3046 }
   3047 
   3048 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
   3049   if (RB0 == AMDGPU::InvalidRegBankID)
   3050     return RB1;
   3051   if (RB1 == AMDGPU::InvalidRegBankID)
   3052     return RB0;
   3053 
   3054   // vcc, vcc -> vcc
   3055   // vcc, sgpr -> vcc
   3056   // vcc, vgpr -> vcc
   3057   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
   3058     return AMDGPU::VCCRegBankID;
   3059 
   3060   // vcc, vgpr -> vgpr
   3061   return regBankUnion(RB0, RB1);
   3062 }
   3063 
   3064 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
   3065                                                 const MachineInstr &MI) const {
   3066   unsigned RegBank = AMDGPU::InvalidRegBankID;
   3067 
   3068   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
   3069     if (!MI.getOperand(i).isReg())
   3070       continue;
   3071     Register Reg = MI.getOperand(i).getReg();
   3072     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
   3073       RegBank = regBankUnion(RegBank, Bank->getID());
   3074       if (RegBank == AMDGPU::VGPRRegBankID)
   3075         break;
   3076     }
   3077   }
   3078 
   3079   return RegBank;
   3080 }
   3081 
   3082 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
   3083   const MachineFunction &MF = *MI.getParent()->getParent();
   3084   const MachineRegisterInfo &MRI = MF.getRegInfo();
   3085   for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
   3086     if (!MI.getOperand(i).isReg())
   3087       continue;
   3088     Register Reg = MI.getOperand(i).getReg();
   3089     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
   3090       if (Bank->getID() != AMDGPU::SGPRRegBankID)
   3091         return false;
   3092     }
   3093   }
   3094   return true;
   3095 }
   3096 
   3097 const RegisterBankInfo::InstructionMapping &
   3098 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
   3099   const MachineFunction &MF = *MI.getParent()->getParent();
   3100   const MachineRegisterInfo &MRI = MF.getRegInfo();
   3101   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
   3102 
   3103   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
   3104     const MachineOperand &SrcOp = MI.getOperand(i);
   3105     if (!SrcOp.isReg())
   3106       continue;
   3107 
   3108     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
   3109     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
   3110   }
   3111   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
   3112                                MI.getNumOperands());
   3113 }
   3114 
   3115 const RegisterBankInfo::InstructionMapping &
   3116 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
   3117   const MachineFunction &MF = *MI.getParent()->getParent();
   3118   const MachineRegisterInfo &MRI = MF.getRegInfo();
   3119   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
   3120 
   3121   // Even though we technically could use SGPRs, this would require knowledge of
   3122   // the constant bus restriction. Force all sources to VGPR (except for VCC).
   3123   //
   3124   // TODO: Unary ops are trivially OK, so accept SGPRs?
   3125   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
   3126     const MachineOperand &Src = MI.getOperand(i);
   3127     if (!Src.isReg())
   3128       continue;
   3129 
   3130     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
   3131     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
   3132     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
   3133   }
   3134 
   3135   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
   3136                                MI.getNumOperands());
   3137 }
   3138 
   3139 const RegisterBankInfo::InstructionMapping &
   3140 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
   3141   const MachineFunction &MF = *MI.getParent()->getParent();
   3142   const MachineRegisterInfo &MRI = MF.getRegInfo();
   3143   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
   3144 
   3145   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
   3146     const MachineOperand &Op = MI.getOperand(I);
   3147     if (!Op.isReg())
   3148       continue;
   3149 
   3150     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
   3151     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
   3152   }
   3153 
   3154   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
   3155                                MI.getNumOperands());
   3156 }
   3157 
   3158 const RegisterBankInfo::InstructionMapping &
   3159 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
   3160                                         const MachineInstr &MI,
   3161                                         int RsrcIdx) const {
   3162   // The reported argument index is relative to the IR intrinsic call arguments,
   3163   // so we need to shift by the number of defs and the intrinsic ID.
   3164   RsrcIdx += MI.getNumExplicitDefs() + 1;
   3165 
   3166   const int NumOps = MI.getNumOperands();
   3167   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
   3168 
   3169   // TODO: Should packed/unpacked D16 difference be reported here as part of
   3170   // the value mapping?
   3171   for (int I = 0; I != NumOps; ++I) {
   3172     if (!MI.getOperand(I).isReg())
   3173       continue;
   3174 
   3175     Register OpReg = MI.getOperand(I).getReg();
   3176     // We replace some dead address operands with $noreg
   3177     if (!OpReg)
   3178       continue;
   3179 
   3180     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
   3181 
   3182     // FIXME: Probably need a new intrinsic register bank searchable table to
   3183     // handle arbitrary intrinsics easily.
   3184     //
   3185     // If this has a sampler, it immediately follows rsrc.
   3186     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
   3187 
   3188     if (MustBeSGPR) {
   3189       // If this must be an SGPR, so we must report whatever it is as legal.
   3190       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
   3191       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
   3192     } else {
   3193       // Some operands must be VGPR, and these are easy to copy to.
   3194       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
   3195     }
   3196   }
   3197 
   3198   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
   3199 }
   3200 
   3201 /// Return the mapping for a pointer arugment.
   3202 const RegisterBankInfo::ValueMapping *
   3203 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
   3204                                               Register PtrReg) const {
   3205   LLT PtrTy = MRI.getType(PtrReg);
   3206   unsigned Size = PtrTy.getSizeInBits();
   3207   if (Subtarget.useFlatForGlobal() ||
   3208       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
   3209     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
   3210 
   3211   // If we're using MUBUF instructions for global memory, an SGPR base register
   3212   // is possible. Otherwise this needs to be a VGPR.
   3213   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
   3214   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
   3215 }
   3216 
   3217 const RegisterBankInfo::InstructionMapping &
   3218 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
   3219 
   3220   const MachineFunction &MF = *MI.getParent()->getParent();
   3221   const MachineRegisterInfo &MRI = MF.getRegInfo();
   3222   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
   3223   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
   3224   Register PtrReg = MI.getOperand(1).getReg();
   3225   LLT PtrTy = MRI.getType(PtrReg);
   3226   unsigned AS = PtrTy.getAddressSpace();
   3227   unsigned PtrSize = PtrTy.getSizeInBits();
   3228 
   3229   const ValueMapping *ValMapping;
   3230   const ValueMapping *PtrMapping;
   3231 
   3232   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
   3233 
   3234   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
   3235     if (isScalarLoadLegal(MI)) {
   3236       // We have a uniform instruction so we want to use an SMRD load
   3237       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
   3238       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
   3239     } else {
   3240       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
   3241 
   3242       // If we're using MUBUF instructions for global memory, an SGPR base
   3243       // register is possible. Otherwise this needs to be a VGPR.
   3244       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
   3245         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
   3246 
   3247       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
   3248     }
   3249   } else {
   3250     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
   3251     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
   3252   }
   3253 
   3254   OpdsMapping[0] = ValMapping;
   3255   OpdsMapping[1] = PtrMapping;
   3256   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
   3257       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
   3258   return Mapping;
   3259 
   3260   // FIXME: Do we want to add a mapping for FLAT load, or should we just
   3261   // handle that during instruction selection?
   3262 }
   3263 
   3264 unsigned
   3265 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
   3266                                      const MachineRegisterInfo &MRI,
   3267                                      unsigned Default) const {
   3268   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
   3269   return Bank ? Bank->getID() : Default;
   3270 }
   3271 
   3272 const RegisterBankInfo::ValueMapping *
   3273 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
   3274                                          const MachineRegisterInfo &MRI,
   3275                                          const TargetRegisterInfo &TRI) const {
   3276   // Lie and claim anything is legal, even though this needs to be an SGPR
   3277   // applyMapping will have to deal with it as a waterfall loop.
   3278   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
   3279   unsigned Size = getSizeInBits(Reg, MRI, TRI);
   3280   return AMDGPU::getValueMapping(Bank, Size);
   3281 }
   3282 
   3283 const RegisterBankInfo::ValueMapping *
   3284 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
   3285                                          const MachineRegisterInfo &MRI,
   3286                                          const TargetRegisterInfo &TRI) const {
   3287   unsigned Size = getSizeInBits(Reg, MRI, TRI);
   3288   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
   3289 }
   3290 
   3291 const RegisterBankInfo::ValueMapping *
   3292 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
   3293                                          const MachineRegisterInfo &MRI,
   3294                                          const TargetRegisterInfo &TRI) const {
   3295   unsigned Size = getSizeInBits(Reg, MRI, TRI);
   3296   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
   3297 }
   3298 
   3299 ///
   3300 /// This function must return a legal mapping, because
   3301 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
   3302 /// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
   3303 /// VGPR to SGPR copy to be generated is illegal.
   3304 ///
   3305 // Operands that must be SGPRs must accept potentially divergent VGPRs as
   3306 // legal. These will be dealt with in applyMappingImpl.
   3307 //
   3308 const RegisterBankInfo::InstructionMapping &
   3309 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   3310   const MachineFunction &MF = *MI.getParent()->getParent();
   3311   const MachineRegisterInfo &MRI = MF.getRegInfo();
   3312 
   3313   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
   3314     // The default logic bothers to analyze impossible alternative mappings. We
   3315     // want the most straightforward mapping, so just directly handle this.
   3316     const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
   3317                                              *TRI);
   3318     const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
   3319                                              *TRI);
   3320     assert(SrcBank && "src bank should have been assigned already");
   3321     if (!DstBank)
   3322       DstBank = SrcBank;
   3323 
   3324     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
   3325     if (cannotCopy(*DstBank, *SrcBank, Size))
   3326       return getInvalidInstructionMapping();
   3327 
   3328     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
   3329     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
   3330     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
   3331     OpdsMapping[0] = &ValMap;
   3332     if (MI.getOpcode() == AMDGPU::G_FREEZE)
   3333       OpdsMapping[1] = &ValMap;
   3334 
   3335     return getInstructionMapping(
   3336         1, /*Cost*/ 1,
   3337         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
   3338   }
   3339 
   3340   if (MI.isRegSequence()) {
   3341     // If any input is a VGPR, the result must be a VGPR. The default handling
   3342     // assumes any copy between banks is legal.
   3343     unsigned BankID = AMDGPU::SGPRRegBankID;
   3344 
   3345     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
   3346       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
   3347       // It doesn't make sense to use vcc or scc banks here, so just ignore
   3348       // them.
   3349       if (OpBank != AMDGPU::SGPRRegBankID) {
   3350         BankID = AMDGPU::VGPRRegBankID;
   3351         break;
   3352       }
   3353     }
   3354     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
   3355 
   3356     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
   3357     return getInstructionMapping(
   3358         1, /*Cost*/ 1,
   3359         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
   3360   }
   3361 
   3362   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
   3363   // properly.
   3364   //
   3365   // TODO: There are additional exec masking dependencies to analyze.
   3366   if (MI.getOpcode() == TargetOpcode::G_PHI) {
   3367     unsigned ResultBank = AMDGPU::InvalidRegBankID;
   3368     Register DstReg = MI.getOperand(0).getReg();
   3369 
   3370     // Sometimes the result may have already been assigned a bank.
   3371     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
   3372       ResultBank = DstBank->getID();
   3373 
   3374     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
   3375       Register Reg = MI.getOperand(I).getReg();
   3376       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
   3377 
   3378       // FIXME: Assuming VGPR for any undetermined inputs.
   3379       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
   3380         ResultBank = AMDGPU::VGPRRegBankID;
   3381         break;
   3382       }
   3383 
   3384       // FIXME: Need to promote SGPR case to s32
   3385       unsigned OpBank = Bank->getID();
   3386       ResultBank = regBankBoolUnion(ResultBank, OpBank);
   3387     }
   3388 
   3389     assert(ResultBank != AMDGPU::InvalidRegBankID);
   3390 
   3391     unsigned Size = MRI.getType(DstReg).getSizeInBits();
   3392 
   3393     const ValueMapping &ValMap =
   3394         getValueMapping(0, Size, getRegBank(ResultBank));
   3395     return getInstructionMapping(
   3396         1, /*Cost*/ 1,
   3397         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
   3398   }
   3399 
   3400   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
   3401   if (Mapping.isValid())
   3402     return Mapping;
   3403 
   3404   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
   3405 
   3406   switch (MI.getOpcode()) {
   3407   default:
   3408     return getInvalidInstructionMapping();
   3409 
   3410   case AMDGPU::G_AND:
   3411   case AMDGPU::G_OR:
   3412   case AMDGPU::G_XOR: {
   3413     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   3414     if (Size == 1) {
   3415       const RegisterBank *DstBank
   3416         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
   3417 
   3418       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
   3419       unsigned BankLHS = AMDGPU::InvalidRegBankID;
   3420       unsigned BankRHS = AMDGPU::InvalidRegBankID;
   3421       if (DstBank) {
   3422         TargetBankID = DstBank->getID();
   3423         if (DstBank == &AMDGPU::VCCRegBank) {
   3424           TargetBankID = AMDGPU::VCCRegBankID;
   3425           BankLHS = AMDGPU::VCCRegBankID;
   3426           BankRHS = AMDGPU::VCCRegBankID;
   3427         } else {
   3428           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
   3429                                  AMDGPU::SGPRRegBankID);
   3430           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
   3431                                  AMDGPU::SGPRRegBankID);
   3432         }
   3433       } else {
   3434         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
   3435                                AMDGPU::VCCRegBankID);
   3436         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
   3437                                AMDGPU::VCCRegBankID);
   3438 
   3439         // Both inputs should be true booleans to produce a boolean result.
   3440         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
   3441           TargetBankID = AMDGPU::VGPRRegBankID;
   3442         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
   3443           TargetBankID = AMDGPU::VCCRegBankID;
   3444           BankLHS = AMDGPU::VCCRegBankID;
   3445           BankRHS = AMDGPU::VCCRegBankID;
   3446         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
   3447           TargetBankID = AMDGPU::SGPRRegBankID;
   3448         }
   3449       }
   3450 
   3451       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
   3452       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
   3453       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
   3454       break;
   3455     }
   3456 
   3457     if (Size == 64) {
   3458 
   3459       if (isSALUMapping(MI)) {
   3460         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
   3461         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
   3462       } else {
   3463         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
   3464         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
   3465         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
   3466 
   3467         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
   3468         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
   3469       }
   3470 
   3471       break;
   3472     }
   3473 
   3474     LLVM_FALLTHROUGH;
   3475   }
   3476   case AMDGPU::G_PTR_ADD:
   3477   case AMDGPU::G_PTRMASK:
   3478   case AMDGPU::G_ADD:
   3479   case AMDGPU::G_SUB:
   3480   case AMDGPU::G_MUL:
   3481   case AMDGPU::G_SHL:
   3482   case AMDGPU::G_LSHR:
   3483   case AMDGPU::G_ASHR:
   3484   case AMDGPU::G_UADDO:
   3485   case AMDGPU::G_USUBO:
   3486   case AMDGPU::G_UADDE:
   3487   case AMDGPU::G_SADDE:
   3488   case AMDGPU::G_USUBE:
   3489   case AMDGPU::G_SSUBE:
   3490   case AMDGPU::G_SMIN:
   3491   case AMDGPU::G_SMAX:
   3492   case AMDGPU::G_UMIN:
   3493   case AMDGPU::G_UMAX:
   3494   case AMDGPU::G_SHUFFLE_VECTOR:
   3495     if (isSALUMapping(MI))
   3496       return getDefaultMappingSOP(MI);
   3497     LLVM_FALLTHROUGH;
   3498 
   3499   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
   3500   case AMDGPU::G_SSUBSAT:
   3501   case AMDGPU::G_UADDSAT:
   3502   case AMDGPU::G_USUBSAT:
   3503   case AMDGPU::G_FADD:
   3504   case AMDGPU::G_FSUB:
   3505   case AMDGPU::G_FPTOSI:
   3506   case AMDGPU::G_FPTOUI:
   3507   case AMDGPU::G_FMUL:
   3508   case AMDGPU::G_FMA:
   3509   case AMDGPU::G_FMAD:
   3510   case AMDGPU::G_FSQRT:
   3511   case AMDGPU::G_FFLOOR:
   3512   case AMDGPU::G_FCEIL:
   3513   case AMDGPU::G_FRINT:
   3514   case AMDGPU::G_SITOFP:
   3515   case AMDGPU::G_UITOFP:
   3516   case AMDGPU::G_FPTRUNC:
   3517   case AMDGPU::G_FPEXT:
   3518   case AMDGPU::G_FEXP2:
   3519   case AMDGPU::G_FLOG2:
   3520   case AMDGPU::G_FMINNUM:
   3521   case AMDGPU::G_FMAXNUM:
   3522   case AMDGPU::G_FMINNUM_IEEE:
   3523   case AMDGPU::G_FMAXNUM_IEEE:
   3524   case AMDGPU::G_FCANONICALIZE:
   3525   case AMDGPU::G_INTRINSIC_TRUNC:
   3526   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
   3527   case AMDGPU::G_FSHR: // TODO: Expand for scalar
   3528   case AMDGPU::G_AMDGPU_FFBH_U32:
   3529   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
   3530   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
   3531   case AMDGPU::G_AMDGPU_RCP_IFLAG:
   3532   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
   3533   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
   3534   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
   3535   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
   3536   case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
   3537   case AMDGPU::G_AMDGPU_SMED3:
   3538     return getDefaultMappingVOP(MI);
   3539   case AMDGPU::G_UMULH:
   3540   case AMDGPU::G_SMULH: {
   3541     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
   3542       return getDefaultMappingSOP(MI);
   3543     return getDefaultMappingVOP(MI);
   3544   }
   3545   case AMDGPU::G_IMPLICIT_DEF: {
   3546     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   3547     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
   3548     break;
   3549   }
   3550   case AMDGPU::G_FCONSTANT:
   3551   case AMDGPU::G_CONSTANT:
   3552   case AMDGPU::G_GLOBAL_VALUE:
   3553   case AMDGPU::G_BLOCK_ADDR:
   3554   case AMDGPU::G_READCYCLECOUNTER: {
   3555     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   3556     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
   3557     break;
   3558   }
   3559   case AMDGPU::G_FRAME_INDEX: {
   3560     // TODO: This should be the same as other constants, but eliminateFrameIndex
   3561     // currently assumes VALU uses.
   3562     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   3563     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
   3564     break;
   3565   }
   3566   case AMDGPU::G_DYN_STACKALLOC: {
   3567     // Result is always uniform, and a wave reduction is needed for the source.
   3568     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
   3569     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
   3570     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
   3571     break;
   3572   }
   3573   case AMDGPU::G_INSERT: {
   3574     unsigned BankID = getMappingType(MRI, MI);
   3575     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
   3576     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
   3577     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
   3578     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
   3579     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
   3580     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
   3581     OpdsMapping[3] = nullptr;
   3582     break;
   3583   }
   3584   case AMDGPU::G_EXTRACT: {
   3585     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
   3586     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
   3587     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
   3588     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
   3589     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
   3590     OpdsMapping[2] = nullptr;
   3591     break;
   3592   }
   3593   case AMDGPU::G_BUILD_VECTOR:
   3594   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
   3595     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
   3596     if (DstTy == LLT::vector(2, 16)) {
   3597       unsigned DstSize = DstTy.getSizeInBits();
   3598       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
   3599       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
   3600       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
   3601       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
   3602 
   3603       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
   3604       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
   3605       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
   3606       break;
   3607     }
   3608 
   3609     LLVM_FALLTHROUGH;
   3610   }
   3611   case AMDGPU::G_MERGE_VALUES:
   3612   case AMDGPU::G_CONCAT_VECTORS: {
   3613     unsigned Bank = getMappingType(MRI, MI);
   3614     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   3615     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
   3616 
   3617     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
   3618     // Op1 and Dst should use the same register bank.
   3619     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
   3620       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
   3621     break;
   3622   }
   3623   case AMDGPU::G_BITREVERSE:
   3624   case AMDGPU::G_BITCAST:
   3625   case AMDGPU::G_INTTOPTR:
   3626   case AMDGPU::G_PTRTOINT:
   3627   case AMDGPU::G_FABS:
   3628   case AMDGPU::G_FNEG: {
   3629     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   3630     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
   3631     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
   3632     break;
   3633   }
   3634   case AMDGPU::G_CTLZ_ZERO_UNDEF:
   3635   case AMDGPU::G_CTTZ_ZERO_UNDEF:
   3636   case AMDGPU::G_CTPOP: {
   3637     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
   3638     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
   3639     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
   3640 
   3641     // This should really be getValueMappingSGPR64Only, but allowing the generic
   3642     // code to handle the register split just makes using LegalizerHelper more
   3643     // difficult.
   3644     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
   3645     break;
   3646   }
   3647   case AMDGPU::G_TRUNC: {
   3648     Register Dst = MI.getOperand(0).getReg();
   3649     Register Src = MI.getOperand(1).getReg();
   3650     unsigned Bank = getRegBankID(Src, MRI);
   3651     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
   3652     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
   3653     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
   3654     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
   3655     break;
   3656   }
   3657   case AMDGPU::G_ZEXT:
   3658   case AMDGPU::G_SEXT:
   3659   case AMDGPU::G_ANYEXT:
   3660   case AMDGPU::G_SEXT_INREG: {
   3661     Register Dst = MI.getOperand(0).getReg();
   3662     Register Src = MI.getOperand(1).getReg();
   3663     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
   3664     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
   3665 
   3666     unsigned DstBank;
   3667     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
   3668     assert(SrcBank);
   3669     switch (SrcBank->getID()) {
   3670     case AMDGPU::SGPRRegBankID:
   3671       DstBank = AMDGPU::SGPRRegBankID;
   3672       break;
   3673     default:
   3674       DstBank = AMDGPU::VGPRRegBankID;
   3675       break;
   3676     }
   3677 
   3678     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
   3679     // 32-bits, and then to 64.
   3680     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
   3681     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
   3682                                                        SrcSize);
   3683     break;
   3684   }
   3685   case AMDGPU::G_FCMP: {
   3686     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
   3687     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
   3688     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
   3689     OpdsMapping[1] = nullptr; // Predicate Operand.
   3690     OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
   3691     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
   3692     break;
   3693   }
   3694   case AMDGPU::G_STORE: {
   3695     assert(MI.getOperand(0).isReg());
   3696     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   3697 
   3698     // FIXME: We need to specify a different reg bank once scalar stores are
   3699     // supported.
   3700     const ValueMapping *ValMapping =
   3701         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
   3702     OpdsMapping[0] = ValMapping;
   3703     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
   3704     break;
   3705   }
   3706   case AMDGPU::G_ICMP: {
   3707     auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
   3708     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
   3709 
   3710     // See if the result register has already been constrained to vcc, which may
   3711     // happen due to control flow intrinsic lowering.
   3712     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
   3713                                     AMDGPU::SGPRRegBankID);
   3714     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
   3715     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
   3716 
   3717     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
   3718                      Op2Bank == AMDGPU::SGPRRegBankID &&
   3719                      Op3Bank == AMDGPU::SGPRRegBankID &&
   3720       (Size == 32 || (Size == 64 &&
   3721                       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
   3722                       Subtarget.hasScalarCompareEq64()));
   3723 
   3724     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
   3725     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
   3726 
   3727     // TODO: Use 32-bit for scalar output size.
   3728     // SCC results will need to be copied to a 32-bit SGPR virtual register.
   3729     const unsigned ResultSize = 1;
   3730 
   3731     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
   3732     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
   3733     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
   3734     break;
   3735   }
   3736   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
   3737     // VGPR index can be used for waterfall when indexing a SGPR vector.
   3738     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
   3739     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   3740     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
   3741     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
   3742     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
   3743     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
   3744 
   3745     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
   3746     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
   3747 
   3748     // The index can be in either bank if the source vector is VGPR.
   3749     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
   3750     break;
   3751   }
   3752   case AMDGPU::G_INSERT_VECTOR_ELT: {
   3753     unsigned OutputBankID = isSALUMapping(MI) ?
   3754       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
   3755 
   3756     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   3757     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
   3758     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
   3759     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
   3760     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
   3761 
   3762     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
   3763     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
   3764 
   3765     // This is a weird case, because we need to break down the mapping based on
   3766     // the register bank of a different operand.
   3767     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
   3768       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
   3769                                                       InsertSize);
   3770     } else {
   3771       assert(InsertSize == 32 || InsertSize == 64);
   3772       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
   3773     }
   3774 
   3775     // The index can be in either bank if the source vector is VGPR.
   3776     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
   3777     break;
   3778   }
   3779   case AMDGPU::G_UNMERGE_VALUES: {
   3780     unsigned Bank = getMappingType(MRI, MI);
   3781 
   3782     // Op1 and Dst should use the same register bank.
   3783     // FIXME: Shouldn't this be the default? Why do we need to handle this?
   3784     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
   3785       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
   3786       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
   3787     }
   3788     break;
   3789   }
   3790   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
   3791   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
   3792   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
   3793   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
   3794   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
   3795   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
   3796   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
   3797   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
   3798   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
   3799   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
   3800   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
   3801   case AMDGPU::G_AMDGPU_BUFFER_STORE:
   3802   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
   3803   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
   3804   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
   3805   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
   3806     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
   3807 
   3808     // rsrc
   3809     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
   3810 
   3811     // vindex
   3812     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
   3813 
   3814     // voffset
   3815     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
   3816 
   3817     // soffset
   3818     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
   3819 
   3820     // Any remaining operands are immediates and were correctly null
   3821     // initialized.
   3822     break;
   3823   }
   3824   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
   3825   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
   3826   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
   3827   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
   3828   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
   3829   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
   3830   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
   3831   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
   3832   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
   3833   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
   3834   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
   3835   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
   3836   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
   3837   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
   3838   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
   3839     // vdata_out
   3840     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
   3841 
   3842     // vdata_in
   3843     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
   3844 
   3845     // rsrc
   3846     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
   3847 
   3848     // vindex
   3849     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
   3850 
   3851     // voffset
   3852     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
   3853 
   3854     // soffset
   3855     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
   3856 
   3857     // Any remaining operands are immediates and were correctly null
   3858     // initialized.
   3859     break;
   3860   }
   3861   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
   3862     // vdata_out
   3863     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
   3864 
   3865     // vdata_in
   3866     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
   3867 
   3868     // cmp
   3869     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
   3870 
   3871     // rsrc
   3872     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
   3873 
   3874     // vindex
   3875     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
   3876 
   3877     // voffset
   3878     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
   3879 
   3880     // soffset
   3881     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
   3882 
   3883     // Any remaining operands are immediates and were correctly null
   3884     // initialized.
   3885     break;
   3886   }
   3887   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
   3888     // Lie and claim everything is legal, even though some need to be
   3889     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
   3890     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
   3891     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
   3892 
   3893     // We need to convert this to a MUBUF if either the resource or offset is
   3894     // VGPR.
   3895     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
   3896     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
   3897     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
   3898 
   3899     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   3900     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
   3901     break;
   3902   }
   3903   case AMDGPU::G_INTRINSIC: {
   3904     switch (MI.getIntrinsicID()) {
   3905     default:
   3906       return getInvalidInstructionMapping();
   3907     case Intrinsic::amdgcn_div_fmas:
   3908     case Intrinsic::amdgcn_div_fixup:
   3909     case Intrinsic::amdgcn_trig_preop:
   3910     case Intrinsic::amdgcn_sin:
   3911     case Intrinsic::amdgcn_cos:
   3912     case Intrinsic::amdgcn_log_clamp:
   3913     case Intrinsic::amdgcn_rcp:
   3914     case Intrinsic::amdgcn_rcp_legacy:
   3915     case Intrinsic::amdgcn_sqrt:
   3916     case Intrinsic::amdgcn_rsq:
   3917     case Intrinsic::amdgcn_rsq_legacy:
   3918     case Intrinsic::amdgcn_rsq_clamp:
   3919     case Intrinsic::amdgcn_fmul_legacy:
   3920     case Intrinsic::amdgcn_fma_legacy:
   3921     case Intrinsic::amdgcn_ldexp:
   3922     case Intrinsic::amdgcn_frexp_mant:
   3923     case Intrinsic::amdgcn_frexp_exp:
   3924     case Intrinsic::amdgcn_fract:
   3925     case Intrinsic::amdgcn_cvt_pkrtz:
   3926     case Intrinsic::amdgcn_cvt_pknorm_i16:
   3927     case Intrinsic::amdgcn_cvt_pknorm_u16:
   3928     case Intrinsic::amdgcn_cvt_pk_i16:
   3929     case Intrinsic::amdgcn_cvt_pk_u16:
   3930     case Intrinsic::amdgcn_fmed3:
   3931     case Intrinsic::amdgcn_cubeid:
   3932     case Intrinsic::amdgcn_cubema:
   3933     case Intrinsic::amdgcn_cubesc:
   3934     case Intrinsic::amdgcn_cubetc:
   3935     case Intrinsic::amdgcn_sffbh:
   3936     case Intrinsic::amdgcn_fmad_ftz:
   3937     case Intrinsic::amdgcn_mbcnt_lo:
   3938     case Intrinsic::amdgcn_mbcnt_hi:
   3939     case Intrinsic::amdgcn_mul_u24:
   3940     case Intrinsic::amdgcn_mul_i24:
   3941     case Intrinsic::amdgcn_lerp:
   3942     case Intrinsic::amdgcn_sad_u8:
   3943     case Intrinsic::amdgcn_msad_u8:
   3944     case Intrinsic::amdgcn_sad_hi_u8:
   3945     case Intrinsic::amdgcn_sad_u16:
   3946     case Intrinsic::amdgcn_qsad_pk_u16_u8:
   3947     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
   3948     case Intrinsic::amdgcn_mqsad_u32_u8:
   3949     case Intrinsic::amdgcn_cvt_pk_u8_f32:
   3950     case Intrinsic::amdgcn_alignbit:
   3951     case Intrinsic::amdgcn_alignbyte:
   3952     case Intrinsic::amdgcn_perm:
   3953     case Intrinsic::amdgcn_fdot2:
   3954     case Intrinsic::amdgcn_sdot2:
   3955     case Intrinsic::amdgcn_udot2:
   3956     case Intrinsic::amdgcn_sdot4:
   3957     case Intrinsic::amdgcn_udot4:
   3958     case Intrinsic::amdgcn_sdot8:
   3959     case Intrinsic::amdgcn_udot8:
   3960       return getDefaultMappingVOP(MI);
   3961     case Intrinsic::amdgcn_sbfe:
   3962     case Intrinsic::amdgcn_ubfe:
   3963       if (isSALUMapping(MI))
   3964         return getDefaultMappingSOP(MI);
   3965       return getDefaultMappingVOP(MI);
   3966     case Intrinsic::amdgcn_ds_swizzle:
   3967     case Intrinsic::amdgcn_ds_permute:
   3968     case Intrinsic::amdgcn_ds_bpermute:
   3969     case Intrinsic::amdgcn_update_dpp:
   3970     case Intrinsic::amdgcn_mov_dpp8:
   3971     case Intrinsic::amdgcn_mov_dpp:
   3972     case Intrinsic::amdgcn_strict_wwm:
   3973     case Intrinsic::amdgcn_wwm:
   3974     case Intrinsic::amdgcn_strict_wqm:
   3975     case Intrinsic::amdgcn_wqm:
   3976     case Intrinsic::amdgcn_softwqm:
   3977     case Intrinsic::amdgcn_set_inactive:
   3978       return getDefaultMappingAllVGPR(MI);
   3979     case Intrinsic::amdgcn_kernarg_segment_ptr:
   3980     case Intrinsic::amdgcn_s_getpc:
   3981     case Intrinsic::amdgcn_groupstaticsize:
   3982     case Intrinsic::amdgcn_reloc_constant:
   3983     case Intrinsic::returnaddress: {
   3984       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   3985       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
   3986       break;
   3987     }
   3988     case Intrinsic::amdgcn_wqm_vote: {
   3989       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   3990       OpdsMapping[0] = OpdsMapping[2]
   3991         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
   3992       break;
   3993     }
   3994     case Intrinsic::amdgcn_ps_live: {
   3995       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
   3996       break;
   3997     }
   3998     case Intrinsic::amdgcn_div_scale: {
   3999       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   4000       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
   4001       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
   4002       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
   4003 
   4004       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
   4005       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
   4006       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
   4007       break;
   4008     }
   4009     case Intrinsic::amdgcn_class: {
   4010       Register Src0Reg = MI.getOperand(2).getReg();
   4011       Register Src1Reg = MI.getOperand(3).getReg();
   4012       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
   4013       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
   4014       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   4015       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
   4016       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
   4017       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
   4018       break;
   4019     }
   4020     case Intrinsic::amdgcn_icmp:
   4021     case Intrinsic::amdgcn_fcmp: {
   4022       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   4023       // This is not VCCRegBank because this is not used in boolean contexts.
   4024       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
   4025       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
   4026       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
   4027       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
   4028       break;
   4029     }
   4030     case Intrinsic::amdgcn_readlane: {
   4031       // This must be an SGPR, but accept a VGPR.
   4032       Register IdxReg = MI.getOperand(3).getReg();
   4033       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
   4034       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
   4035       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
   4036       LLVM_FALLTHROUGH;
   4037     }
   4038     case Intrinsic::amdgcn_readfirstlane: {
   4039       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   4040       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
   4041       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
   4042       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
   4043       break;
   4044     }
   4045     case Intrinsic::amdgcn_writelane: {
   4046       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   4047       Register SrcReg = MI.getOperand(2).getReg();
   4048       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
   4049       unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
   4050       Register IdxReg = MI.getOperand(3).getReg();
   4051       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
   4052       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
   4053       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
   4054 
   4055       // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
   4056       // to legalize.
   4057       OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
   4058       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
   4059       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
   4060       break;
   4061     }
   4062     case Intrinsic::amdgcn_if_break: {
   4063       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
   4064       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
   4065       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
   4066       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
   4067       break;
   4068     }
   4069     case Intrinsic::amdgcn_permlane16:
   4070     case Intrinsic::amdgcn_permlanex16: {
   4071       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
   4072       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
   4073       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
   4074       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
   4075       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
   4076       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
   4077       break;
   4078     }
   4079     case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
   4080     case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
   4081     case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
   4082     case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
   4083     case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
   4084     case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
   4085     case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
   4086     case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
   4087     case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
   4088     case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
   4089     case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
   4090     case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
   4091     case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
   4092     case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
   4093     case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
   4094     case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
   4095     case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
   4096     case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
   4097     case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
   4098     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
   4099     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
   4100     case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
   4101     case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
   4102     case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
   4103     case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
   4104     case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
   4105     case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
   4106       // Default for MAI intrinsics.
   4107       // srcC can also be an immediate which can be folded later.
   4108       // FIXME: Should we eventually add an alternative mapping with AGPR src
   4109       // for srcA/srcB?
   4110       //
   4111       // vdst, srcA, srcB, srcC
   4112       OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
   4113       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
   4114       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
   4115       OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
   4116       break;
   4117     }
   4118     case Intrinsic::amdgcn_interp_p1:
   4119     case Intrinsic::amdgcn_interp_p2:
   4120     case Intrinsic::amdgcn_interp_mov:
   4121     case Intrinsic::amdgcn_interp_p1_f16:
   4122     case Intrinsic::amdgcn_interp_p2_f16: {
   4123       const int M0Idx = MI.getNumOperands() - 1;
   4124       Register M0Reg = MI.getOperand(M0Idx).getReg();
   4125       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
   4126       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   4127 
   4128       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
   4129       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
   4130         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
   4131 
   4132       // Must be SGPR, but we must take whatever the original bank is and fix it
   4133       // later.
   4134       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
   4135       break;
   4136     }
   4137     case Intrinsic::amdgcn_ballot: {
   4138       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   4139       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
   4140       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
   4141       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
   4142       break;
   4143     }
   4144     }
   4145     break;
   4146   }
   4147   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
   4148   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
   4149     auto IntrID = MI.getIntrinsicID();
   4150     const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
   4151     assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
   4152     // Non-images can have complications from operands that allow both SGPR
   4153     // and VGPR. For now it's too complicated to figure out the final opcode
   4154     // to derive the register bank from the MCInstrDesc.
   4155     assert(RSrcIntrin->IsImage);
   4156     return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
   4157   }
   4158   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
   4159     unsigned N = MI.getNumExplicitOperands() - 2;
   4160     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
   4161     OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
   4162     for (unsigned I = 2; I < N; ++I)
   4163       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
   4164     break;
   4165   }
   4166   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
   4167     auto IntrID = MI.getIntrinsicID();
   4168     switch (IntrID) {
   4169     case Intrinsic::amdgcn_s_getreg:
   4170     case Intrinsic::amdgcn_s_memtime:
   4171     case Intrinsic::amdgcn_s_memrealtime:
   4172     case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
   4173       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   4174       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
   4175       break;
   4176     }
   4177     case Intrinsic::amdgcn_global_atomic_fadd:
   4178     case Intrinsic::amdgcn_global_atomic_csub:
   4179     case Intrinsic::amdgcn_global_atomic_fmin:
   4180     case Intrinsic::amdgcn_global_atomic_fmax:
   4181     case Intrinsic::amdgcn_flat_atomic_fadd:
   4182     case Intrinsic::amdgcn_flat_atomic_fmin:
   4183     case Intrinsic::amdgcn_flat_atomic_fmax:
   4184       return getDefaultMappingAllVGPR(MI);
   4185     case Intrinsic::amdgcn_ds_ordered_add:
   4186     case Intrinsic::amdgcn_ds_ordered_swap: {
   4187       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   4188       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
   4189       unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
   4190                                  AMDGPU::SGPRRegBankID);
   4191       OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
   4192       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
   4193       break;
   4194     }
   4195     case Intrinsic::amdgcn_ds_append:
   4196     case Intrinsic::amdgcn_ds_consume: {
   4197       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   4198       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
   4199       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
   4200       break;
   4201     }
   4202     case Intrinsic::amdgcn_exp_compr:
   4203       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
   4204       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
   4205       break;
   4206     case Intrinsic::amdgcn_exp:
   4207       // FIXME: Could we support packed types here?
   4208       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
   4209       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
   4210       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
   4211       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
   4212       break;
   4213     case Intrinsic::amdgcn_s_sendmsg:
   4214     case Intrinsic::amdgcn_s_sendmsghalt: {
   4215       // This must be an SGPR, but accept a VGPR.
   4216       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
   4217                                    AMDGPU::SGPRRegBankID);
   4218       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
   4219       break;
   4220     }
   4221     case Intrinsic::amdgcn_s_setreg: {
   4222       // This must be an SGPR, but accept a VGPR.
   4223       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
   4224                                    AMDGPU::SGPRRegBankID);
   4225       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
   4226       break;
   4227     }
   4228     case Intrinsic::amdgcn_end_cf: {
   4229       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
   4230       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
   4231       break;
   4232     }
   4233     case Intrinsic::amdgcn_else: {
   4234       unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
   4235       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
   4236       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
   4237       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
   4238       break;
   4239     }
   4240     case Intrinsic::amdgcn_live_mask: {
   4241       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
   4242       break;
   4243     }
   4244     case Intrinsic::amdgcn_wqm_demote:
   4245     case Intrinsic::amdgcn_kill: {
   4246       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
   4247       break;
   4248     }
   4249     case Intrinsic::amdgcn_raw_buffer_load:
   4250     case Intrinsic::amdgcn_raw_tbuffer_load: {
   4251       // FIXME: Should make intrinsic ID the last operand of the instruction,
   4252       // then this would be the same as store
   4253       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
   4254       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
   4255       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
   4256       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
   4257       break;
   4258     }
   4259     case Intrinsic::amdgcn_raw_buffer_store:
   4260     case Intrinsic::amdgcn_raw_buffer_store_format:
   4261     case Intrinsic::amdgcn_raw_tbuffer_store: {
   4262       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
   4263       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
   4264       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
   4265       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
   4266       break;
   4267     }
   4268     case Intrinsic::amdgcn_struct_buffer_load:
   4269     case Intrinsic::amdgcn_struct_tbuffer_load: {
   4270       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
   4271       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
   4272       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
   4273       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
   4274       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
   4275       break;
   4276     }
   4277     case Intrinsic::amdgcn_struct_buffer_store:
   4278     case Intrinsic::amdgcn_struct_tbuffer_store: {
   4279       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
   4280       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
   4281       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
   4282       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
   4283       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
   4284       break;
   4285     }
   4286     case Intrinsic::amdgcn_init_exec_from_input: {
   4287       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
   4288       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
   4289       break;
   4290     }
   4291     case Intrinsic::amdgcn_ds_gws_init:
   4292     case Intrinsic::amdgcn_ds_gws_barrier:
   4293     case Intrinsic::amdgcn_ds_gws_sema_br: {
   4294       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
   4295 
   4296       // This must be an SGPR, but accept a VGPR.
   4297       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
   4298                                    AMDGPU::SGPRRegBankID);
   4299       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
   4300       break;
   4301     }
   4302     case Intrinsic::amdgcn_ds_gws_sema_v:
   4303     case Intrinsic::amdgcn_ds_gws_sema_p:
   4304     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
   4305       // This must be an SGPR, but accept a VGPR.
   4306       unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
   4307                                    AMDGPU::SGPRRegBankID);
   4308       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
   4309       break;
   4310     }
   4311     default:
   4312       return getInvalidInstructionMapping();
   4313     }
   4314     break;
   4315   }
   4316   case AMDGPU::G_SELECT: {
   4317     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
   4318     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
   4319                                     AMDGPU::SGPRRegBankID);
   4320     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
   4321                                     AMDGPU::SGPRRegBankID);
   4322     bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
   4323                     Op3Bank == AMDGPU::SGPRRegBankID;
   4324 
   4325     unsigned CondBankDefault = SGPRSrcs ?
   4326       AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
   4327     unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
   4328                                      CondBankDefault);
   4329     if (CondBank == AMDGPU::SGPRRegBankID)
   4330       CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
   4331     else if (CondBank == AMDGPU::VGPRRegBankID)
   4332       CondBank = AMDGPU::VCCRegBankID;
   4333 
   4334     unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
   4335       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
   4336 
   4337     assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
   4338 
   4339     // TODO: Should report 32-bit for scalar condition type.
   4340     if (Size == 64) {
   4341       OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
   4342       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
   4343       OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
   4344       OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
   4345     } else {
   4346       OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
   4347       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
   4348       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
   4349       OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
   4350     }
   4351 
   4352     break;
   4353   }
   4354 
   4355   case AMDGPU::G_LOAD:
   4356   case AMDGPU::G_ZEXTLOAD:
   4357   case AMDGPU::G_SEXTLOAD:
   4358     return getInstrMappingForLoad(MI);
   4359 
   4360   case AMDGPU::G_ATOMICRMW_XCHG:
   4361   case AMDGPU::G_ATOMICRMW_ADD:
   4362   case AMDGPU::G_ATOMICRMW_SUB:
   4363   case AMDGPU::G_ATOMICRMW_AND:
   4364   case AMDGPU::G_ATOMICRMW_OR:
   4365   case AMDGPU::G_ATOMICRMW_XOR:
   4366   case AMDGPU::G_ATOMICRMW_MAX:
   4367   case AMDGPU::G_ATOMICRMW_MIN:
   4368   case AMDGPU::G_ATOMICRMW_UMAX:
   4369   case AMDGPU::G_ATOMICRMW_UMIN:
   4370   case AMDGPU::G_ATOMICRMW_FADD:
   4371   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
   4372   case AMDGPU::G_AMDGPU_ATOMIC_INC:
   4373   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
   4374   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
   4375   case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
   4376     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
   4377     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
   4378     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
   4379     break;
   4380   }
   4381   case AMDGPU::G_ATOMIC_CMPXCHG: {
   4382     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
   4383     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
   4384     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
   4385     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
   4386     break;
   4387   }
   4388   case AMDGPU::G_BRCOND: {
   4389     unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
   4390                                  AMDGPU::SGPRRegBankID);
   4391     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
   4392     if (Bank != AMDGPU::SGPRRegBankID)
   4393       Bank = AMDGPU::VCCRegBankID;
   4394 
   4395     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
   4396     break;
   4397   }
   4398   }
   4399 
   4400   return getInstructionMapping(/*ID*/1, /*Cost*/1,
   4401                                getOperandsMapping(OpdsMapping),
   4402                                MI.getNumOperands());
   4403 }
   4404