Home | History | Annotate | Line # | Download | only in AMDGPU
      1 //===- SIInstrInfo.cpp - SI Instruction Information  ----------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 /// \file
     10 /// SI Implementation of TargetInstrInfo.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "SIInstrInfo.h"
     15 #include "AMDGPU.h"
     16 #include "AMDGPUInstrInfo.h"
     17 #include "GCNHazardRecognizer.h"
     18 #include "GCNSubtarget.h"
     19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
     20 #include "SIMachineFunctionInfo.h"
     21 #include "llvm/Analysis/ValueTracking.h"
     22 #include "llvm/CodeGen/LiveVariables.h"
     23 #include "llvm/CodeGen/MachineDominators.h"
     24 #include "llvm/CodeGen/RegisterScavenging.h"
     25 #include "llvm/CodeGen/ScheduleDAG.h"
     26 #include "llvm/IR/DiagnosticInfo.h"
     27 #include "llvm/IR/IntrinsicsAMDGPU.h"
     28 #include "llvm/Support/CommandLine.h"
     29 #include "llvm/Target/TargetMachine.h"
     30 
     31 using namespace llvm;
     32 
     33 #define DEBUG_TYPE "si-instr-info"
     34 
     35 #define GET_INSTRINFO_CTOR_DTOR
     36 #include "AMDGPUGenInstrInfo.inc"
     37 
namespace llvm {

// Forward declaration; this part of the file only refers to AAResults through
// a pointer parameter.
class AAResults;

namespace AMDGPU {
// The GET_*_IMPL macros are defined immediately before including the generated
// searchable-tables file, presumably to select which table implementations
// that file emits into this translation unit (confirm against the generated
// .inc).
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
} // end namespace AMDGPU
} // end namespace llvm
     49 
     50 
// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
//
// Registered as the hidden command-line option "amdgpu-s-branch-bits" with a
// default value of 16.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));
     57 
// Hidden command-line option "amdgpu-fix-16-bit-physreg-copies", enabled by
// default. It is consulted by SIInstrInfo::copyPhysReg, which widens the 16-bit
// side of a mixed 16/32-bit physical register copy to its 32-bit register when
// the option is set.
static cl::opt<bool> Fix16BitCopies(
  "amdgpu-fix-16-bit-physreg-copies",
  cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
  cl::init(true),
  cl::ReallyHidden);
     63 
/// Constructs the SI instruction info for subtarget \p ST.
///
/// Passes the ADJCALLSTACKUP / ADJCALLSTACKDOWN opcodes to the generated base
/// class, initializes the register info (RI) and the stored subtarget
/// reference from \p ST, and initializes the scheduling model with the
/// subtarget.
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
    RI(ST), ST(ST) {
  SchedModel.init(&ST);
}
     69 
     70 //===----------------------------------------------------------------------===//
     71 // TargetInstrInfo callbacks
     72 //===----------------------------------------------------------------------===//
     73 
     74 static unsigned getNumOperandsNoGlue(SDNode *Node) {
     75   unsigned N = Node->getNumOperands();
     76   while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
     77     --N;
     78   return N;
     79 }
     80 
     81 /// Returns true if both nodes have the same value for the given
     82 ///        operand \p Op, or if both nodes do not have this operand.
     83 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
     84   unsigned Opc0 = N0->getMachineOpcode();
     85   unsigned Opc1 = N1->getMachineOpcode();
     86 
     87   int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
     88   int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
     89 
     90   if (Op0Idx == -1 && Op1Idx == -1)
     91     return true;
     92 
     93 
     94   if ((Op0Idx == -1 && Op1Idx != -1) ||
     95       (Op1Idx == -1 && Op0Idx != -1))
     96     return false;
     97 
     98   // getNamedOperandIdx returns the index for the MachineInstr's operands,
     99   // which includes the result as the first operand. We are indexing into the
    100   // MachineSDNode's operands, so we need to skip the result operand to get
    101   // the real index.
    102   --Op0Idx;
    103   --Op1Idx;
    104 
    105   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
    106 }
    107 
    108 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
    109                                                     AAResults *AA) const {
    110   // TODO: The generic check fails for VALU instructions that should be
    111   // rematerializable due to implicit reads of exec. We really want all of the
    112   // generic logic for this except for this.
    113   switch (MI.getOpcode()) {
    114   case AMDGPU::V_MOV_B32_e32:
    115   case AMDGPU::V_MOV_B32_e64:
    116   case AMDGPU::V_MOV_B64_PSEUDO:
    117   case AMDGPU::V_ACCVGPR_READ_B32_e64:
    118   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
    119     // No non-standard implicit operands.
    120     assert(MI.getDesc().getNumOperands() == 2);
    121     assert(MI.getDesc().getNumImplicitDefs() == 0);
    122     assert(MI.getDesc().getNumImplicitUses() == 1);
    123     return MI.getNumOperands() == 3;
    124   default:
    125     return false;
    126   }
    127 }
    128 
    129 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
    130                                           int64_t &Offset0,
    131                                           int64_t &Offset1) const {
    132   if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    133     return false;
    134 
    135   unsigned Opc0 = Load0->getMachineOpcode();
    136   unsigned Opc1 = Load1->getMachineOpcode();
    137 
    138   // Make sure both are actually loads.
    139   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    140     return false;
    141 
    142   if (isDS(Opc0) && isDS(Opc1)) {
    143 
    144     // FIXME: Handle this case:
    145     if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
    146       return false;
    147 
    148     // Check base reg.
    149     if (Load0->getOperand(0) != Load1->getOperand(0))
    150       return false;
    151 
    152     // Skip read2 / write2 variants for simplicity.
    153     // TODO: We should report true if the used offsets are adjacent (excluded
    154     // st64 versions).
    155     int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    156     int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    157     if (Offset0Idx == -1 || Offset1Idx == -1)
    158       return false;
    159 
    160     // XXX - be careful of datalesss loads
    161     // getNamedOperandIdx returns the index for MachineInstrs.  Since they
    162     // include the output in the operand list, but SDNodes don't, we need to
    163     // subtract the index by one.
    164     Offset0Idx -= get(Opc0).NumDefs;
    165     Offset1Idx -= get(Opc1).NumDefs;
    166     Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
    167     Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
    168     return true;
    169   }
    170 
    171   if (isSMRD(Opc0) && isSMRD(Opc1)) {
    172     // Skip time and cache invalidation instructions.
    173     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
    174         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
    175       return false;
    176 
    177     assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
    178 
    179     // Check base reg.
    180     if (Load0->getOperand(0) != Load1->getOperand(0))
    181       return false;
    182 
    183     const ConstantSDNode *Load0Offset =
    184         dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    185     const ConstantSDNode *Load1Offset =
    186         dyn_cast<ConstantSDNode>(Load1->getOperand(1));
    187 
    188     if (!Load0Offset || !Load1Offset)
    189       return false;
    190 
    191     Offset0 = Load0Offset->getZExtValue();
    192     Offset1 = Load1Offset->getZExtValue();
    193     return true;
    194   }
    195 
    196   // MUBUF and MTBUF can access the same addresses.
    197   if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
    198 
    199     // MUBUF and MTBUF have vaddr at different indices.
    200     if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
    201         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
    202         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
    203       return false;
    204 
    205     int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    206     int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    207 
    208     if (OffIdx0 == -1 || OffIdx1 == -1)
    209       return false;
    210 
    211     // getNamedOperandIdx returns the index for MachineInstrs.  Since they
    212     // include the output in the operand list, but SDNodes don't, we need to
    213     // subtract the index by one.
    214     OffIdx0 -= get(Opc0).NumDefs;
    215     OffIdx1 -= get(Opc1).NumDefs;
    216 
    217     SDValue Off0 = Load0->getOperand(OffIdx0);
    218     SDValue Off1 = Load1->getOperand(OffIdx1);
    219 
    220     // The offset might be a FrameIndexSDNode.
    221     if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
    222       return false;
    223 
    224     Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    225     Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    226     return true;
    227   }
    228 
    229   return false;
    230 }
    231 
    232 static bool isStride64(unsigned Opc) {
    233   switch (Opc) {
    234   case AMDGPU::DS_READ2ST64_B32:
    235   case AMDGPU::DS_READ2ST64_B64:
    236   case AMDGPU::DS_WRITE2ST64_B32:
    237   case AMDGPU::DS_WRITE2ST64_B64:
    238     return true;
    239   default:
    240     return false;
    241   }
    242 }
    243 
    244 bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    245     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    246     int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
    247     const TargetRegisterInfo *TRI) const {
    248   if (!LdSt.mayLoadOrStore())
    249     return false;
    250 
    251   unsigned Opc = LdSt.getOpcode();
    252   OffsetIsScalable = false;
    253   const MachineOperand *BaseOp, *OffsetOp;
    254   int DataOpIdx;
    255 
    256   if (isDS(LdSt)) {
    257     BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
    258     OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    259     if (OffsetOp) {
    260       // Normal, single offset LDS instruction.
    261       if (!BaseOp) {
    262         // DS_CONSUME/DS_APPEND use M0 for the base address.
    263         // TODO: find the implicit use operand for M0 and use that as BaseOp?
    264         return false;
    265       }
    266       BaseOps.push_back(BaseOp);
    267       Offset = OffsetOp->getImm();
    268       // Get appropriate operand, and compute width accordingly.
    269       DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    270       if (DataOpIdx == -1)
    271         DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
    272       Width = getOpSize(LdSt, DataOpIdx);
    273     } else {
    274       // The 2 offset instructions use offset0 and offset1 instead. We can treat
    275       // these as a load with a single offset if the 2 offsets are consecutive.
    276       // We will use this for some partially aligned loads.
    277       const MachineOperand *Offset0Op =
    278           getNamedOperand(LdSt, AMDGPU::OpName::offset0);
    279       const MachineOperand *Offset1Op =
    280           getNamedOperand(LdSt, AMDGPU::OpName::offset1);
    281 
    282       unsigned Offset0 = Offset0Op->getImm();
    283       unsigned Offset1 = Offset1Op->getImm();
    284       if (Offset0 + 1 != Offset1)
    285         return false;
    286 
    287       // Each of these offsets is in element sized units, so we need to convert
    288       // to bytes of the individual reads.
    289 
    290       unsigned EltSize;
    291       if (LdSt.mayLoad())
    292         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
    293       else {
    294         assert(LdSt.mayStore());
    295         int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
    296         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
    297       }
    298 
    299       if (isStride64(Opc))
    300         EltSize *= 64;
    301 
    302       BaseOps.push_back(BaseOp);
    303       Offset = EltSize * Offset0;
    304       // Get appropriate operand(s), and compute width accordingly.
    305       DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    306       if (DataOpIdx == -1) {
    307         DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
    308         Width = getOpSize(LdSt, DataOpIdx);
    309         DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
    310         Width += getOpSize(LdSt, DataOpIdx);
    311       } else {
    312         Width = getOpSize(LdSt, DataOpIdx);
    313       }
    314     }
    315     return true;
    316   }
    317 
    318   if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    319     const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
    320     if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
    321       return false;
    322     BaseOps.push_back(RSrc);
    323     BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    324     if (BaseOp && !BaseOp->isFI())
    325       BaseOps.push_back(BaseOp);
    326     const MachineOperand *OffsetImm =
    327         getNamedOperand(LdSt, AMDGPU::OpName::offset);
    328     Offset = OffsetImm->getImm();
    329     const MachineOperand *SOffset =
    330         getNamedOperand(LdSt, AMDGPU::OpName::soffset);
    331     if (SOffset) {
    332       if (SOffset->isReg())
    333         BaseOps.push_back(SOffset);
    334       else
    335         Offset += SOffset->getImm();
    336     }
    337     // Get appropriate operand, and compute width accordingly.
    338     DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    339     if (DataOpIdx == -1)
    340       DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    341     Width = getOpSize(LdSt, DataOpIdx);
    342     return true;
    343   }
    344 
    345   if (isMIMG(LdSt)) {
    346     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
    347     BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
    348     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    349     if (VAddr0Idx >= 0) {
    350       // GFX10 possible NSA encoding.
    351       for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
    352         BaseOps.push_back(&LdSt.getOperand(I));
    353     } else {
    354       BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
    355     }
    356     Offset = 0;
    357     // Get appropriate operand, and compute width accordingly.
    358     DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    359     Width = getOpSize(LdSt, DataOpIdx);
    360     return true;
    361   }
    362 
    363   if (isSMRD(LdSt)) {
    364     BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    365     if (!BaseOp) // e.g. S_MEMTIME
    366       return false;
    367     BaseOps.push_back(BaseOp);
    368     OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    369     Offset = OffsetOp ? OffsetOp->getImm() : 0;
    370     // Get appropriate operand, and compute width accordingly.
    371     DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
    372     Width = getOpSize(LdSt, DataOpIdx);
    373     return true;
    374   }
    375 
    376   if (isFLAT(LdSt)) {
    377     // Instructions have either vaddr or saddr or both or none.
    378     BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    379     if (BaseOp)
    380       BaseOps.push_back(BaseOp);
    381     BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    382     if (BaseOp)
    383       BaseOps.push_back(BaseOp);
    384     Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    385     // Get appropriate operand, and compute width accordingly.
    386     DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    387     if (DataOpIdx == -1)
    388       DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    389     Width = getOpSize(LdSt, DataOpIdx);
    390     return true;
    391   }
    392 
    393   return false;
    394 }
    395 
    396 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
    397                                   ArrayRef<const MachineOperand *> BaseOps1,
    398                                   const MachineInstr &MI2,
    399                                   ArrayRef<const MachineOperand *> BaseOps2) {
    400   // Only examine the first "base" operand of each instruction, on the
    401   // assumption that it represents the real base address of the memory access.
    402   // Other operands are typically offsets or indices from this base address.
    403   if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
    404     return true;
    405 
    406   if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    407     return false;
    408 
    409   auto MO1 = *MI1.memoperands_begin();
    410   auto MO2 = *MI2.memoperands_begin();
    411   if (MO1->getAddrSpace() != MO2->getAddrSpace())
    412     return false;
    413 
    414   auto Base1 = MO1->getValue();
    415   auto Base2 = MO2->getValue();
    416   if (!Base1 || !Base2)
    417     return false;
    418   Base1 = getUnderlyingObject(Base1);
    419   Base2 = getUnderlyingObject(Base2);
    420 
    421   if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    422     return false;
    423 
    424   return Base1 == Base2;
    425 }
    426 
    427 bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
    428                                       ArrayRef<const MachineOperand *> BaseOps2,
    429                                       unsigned NumLoads,
    430                                       unsigned NumBytes) const {
    431   // If the mem ops (to be clustered) do not have the same base ptr, then they
    432   // should not be clustered
    433   if (!BaseOps1.empty() && !BaseOps2.empty()) {
    434     const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
    435     const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
    436     if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
    437       return false;
    438   } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
    439     // If only one base op is empty, they do not have the same base ptr
    440     return false;
    441   }
    442 
    443   // In order to avoid regester pressure, on an average, the number of DWORDS
    444   // loaded together by all clustered mem ops should not exceed 8. This is an
    445   // empirical value based on certain observations and performance related
    446   // experiments.
    447   // The good thing about this heuristic is - it avoids clustering of too many
    448   // sub-word loads, and also avoids clustering of wide loads. Below is the
    449   // brief summary of how the heuristic behaves for various `LoadSize`.
    450   // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
    451   // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
    452   // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
    453   // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
    454   // (5) LoadSize >= 17: do not cluster
    455   const unsigned LoadSize = NumBytes / NumLoads;
    456   const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
    457   return NumDWORDs <= 8;
    458 }
    459 
    460 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
    461 // the first 16 loads will be interleaved with the stores, and the next 16 will
    462 // be clustered as expected. It should really split into 2 16 store batches.
    463 //
    464 // Loads are clustered until this returns false, rather than trying to schedule
    465 // groups of stores. This also means we have to deal with saying different
    466 // address space loads should be clustered, and ones which might cause bank
    467 // conflicts.
    468 //
    469 // This might be deprecated so it might not be worth that much effort to fix.
    470 bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
    471                                           int64_t Offset0, int64_t Offset1,
    472                                           unsigned NumLoads) const {
    473   assert(Offset1 > Offset0 &&
    474          "Second offset should be larger than first offset!");
    475   // If we have less than 16 loads in a row, and the offsets are within 64
    476   // bytes, then schedule together.
    477 
    478   // A cacheline is 64 bytes (for global memory).
    479   return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
    480 }
    481 
    482 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
    483                               MachineBasicBlock::iterator MI,
    484                               const DebugLoc &DL, MCRegister DestReg,
    485                               MCRegister SrcReg, bool KillSrc,
    486                               const char *Msg = "illegal SGPR to VGPR copy") {
    487   MachineFunction *MF = MBB.getParent();
    488   DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
    489   LLVMContext &C = MF->getFunction().getContext();
    490   C.diagnose(IllegalCopy);
    491 
    492   BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
    493     .addReg(SrcReg, getKillRegState(KillSrc));
    494 }
    495 
    496 /// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible
    497 /// to directly copy, so an intermediate VGPR needs to be used.
    498 static void indirectCopyToAGPR(const SIInstrInfo &TII,
    499                                MachineBasicBlock &MBB,
    500                                MachineBasicBlock::iterator MI,
    501                                const DebugLoc &DL, MCRegister DestReg,
    502                                MCRegister SrcReg, bool KillSrc,
    503                                RegScavenger &RS,
    504                                Register ImpDefSuperReg = Register(),
    505                                Register ImpUseSuperReg = Register()) {
    506   const SIRegisterInfo &RI = TII.getRegisterInfo();
    507 
    508   assert(AMDGPU::SReg_32RegClass.contains(SrcReg) ||
    509          AMDGPU::AGPR_32RegClass.contains(SrcReg));
    510 
    511   // First try to find defining accvgpr_write to avoid temporary registers.
    512   for (auto Def = MI, E = MBB.begin(); Def != E; ) {
    513     --Def;
    514     if (!Def->definesRegister(SrcReg, &RI))
    515       continue;
    516     if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
    517       break;
    518 
    519     MachineOperand &DefOp = Def->getOperand(1);
    520     assert(DefOp.isReg() || DefOp.isImm());
    521 
    522     if (DefOp.isReg()) {
    523       // Check that register source operand if not clobbered before MI.
    524       // Immediate operands are always safe to propagate.
    525       bool SafeToPropagate = true;
    526       for (auto I = Def; I != MI && SafeToPropagate; ++I)
    527         if (I->modifiesRegister(DefOp.getReg(), &RI))
    528           SafeToPropagate = false;
    529 
    530       if (!SafeToPropagate)
    531         break;
    532 
    533       DefOp.setIsKill(false);
    534     }
    535 
    536     MachineInstrBuilder Builder =
    537       BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
    538       .add(DefOp);
    539     if (ImpDefSuperReg)
    540       Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
    541 
    542     if (ImpUseSuperReg) {
    543       Builder.addReg(ImpUseSuperReg,
    544                      getKillRegState(KillSrc) | RegState::Implicit);
    545     }
    546 
    547     return;
    548   }
    549 
    550   RS.enterBasicBlock(MBB);
    551   RS.forward(MI);
    552 
    553   // Ideally we want to have three registers for a long reg_sequence copy
    554   // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
    555   unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
    556                                              *MBB.getParent());
    557 
    558   // Registers in the sequence are allocated contiguously so we can just
    559   // use register number to pick one of three round-robin temps.
    560   unsigned RegNo = DestReg % 3;
    561   Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
    562   if (!Tmp)
    563     report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
    564   RS.setRegUsed(Tmp);
    565 
    566   if (!TII.getSubtarget().hasGFX90AInsts()) {
    567     // Only loop through if there are any free registers left, otherwise
    568     // scavenger may report a fatal error without emergency spill slot
    569     // or spill with the slot.
    570     while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
    571       Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
    572       if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
    573         break;
    574       Tmp = Tmp2;
    575       RS.setRegUsed(Tmp);
    576     }
    577   }
    578 
    579   // Insert copy to temporary VGPR.
    580   unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
    581   if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
    582     TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
    583   } else {
    584     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    585   }
    586 
    587   MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
    588     .addReg(SrcReg, getKillRegState(KillSrc));
    589   if (ImpUseSuperReg) {
    590     UseBuilder.addReg(ImpUseSuperReg,
    591                       getKillRegState(KillSrc) | RegState::Implicit);
    592   }
    593 
    594   MachineInstrBuilder DefBuilder
    595     = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
    596     .addReg(Tmp, RegState::Kill);
    597 
    598   if (ImpDefSuperReg)
    599     DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
    600 }
    601 
    602 static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
    603                            MachineBasicBlock::iterator MI, const DebugLoc &DL,
    604                            MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
    605                            const TargetRegisterClass *RC, bool Forward) {
    606   const SIRegisterInfo &RI = TII.getRegisterInfo();
    607   ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
    608   MachineBasicBlock::iterator I = MI;
    609   MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
    610 
    611   for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
    612     int16_t SubIdx = BaseIndices[Idx];
    613     Register Reg = RI.getSubReg(DestReg, SubIdx);
    614     unsigned Opcode = AMDGPU::S_MOV_B32;
    615 
    616     // Is SGPR aligned? If so try to combine with next.
    617     Register Src = RI.getSubReg(SrcReg, SubIdx);
    618     bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0;
    619     bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0;
    620     if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
    621       // Can use SGPR64 copy
    622       unsigned Channel = RI.getChannelFromSubReg(SubIdx);
    623       SubIdx = RI.getSubRegFromChannel(Channel, 2);
    624       Opcode = AMDGPU::S_MOV_B64;
    625       Idx++;
    626     }
    627 
    628     LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx))
    629                  .addReg(RI.getSubReg(SrcReg, SubIdx))
    630                  .addReg(SrcReg, RegState::Implicit);
    631 
    632     if (!FirstMI)
    633       FirstMI = LastMI;
    634 
    635     if (!Forward)
    636       I--;
    637   }
    638 
    639   assert(FirstMI && LastMI);
    640   if (!Forward)
    641     std::swap(FirstMI, LastMI);
    642 
    643   FirstMI->addOperand(
    644       MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
    645 
    646   if (KillSrc)
    647     LastMI->addRegisterKilled(SrcReg, &RI);
    648 }
    649 
    650 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
    651                               MachineBasicBlock::iterator MI,
    652                               const DebugLoc &DL, MCRegister DestReg,
    653                               MCRegister SrcReg, bool KillSrc) const {
    654   const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
    655 
    656   // FIXME: This is hack to resolve copies between 16 bit and 32 bit
    657   // registers until all patterns are fixed.
    658   if (Fix16BitCopies &&
    659       ((RI.getRegSizeInBits(*RC) == 16) ^
    660        (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) {
    661     MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
    662     MCRegister Super = RI.get32BitRegister(RegToFix);
    663     assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
    664     RegToFix = Super;
    665 
    666     if (DestReg == SrcReg) {
    667       // Insert empty bundle since ExpandPostRA expects an instruction here.
    668       BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
    669       return;
    670     }
    671 
    672     RC = RI.getPhysRegClass(DestReg);
    673   }
    674 
    675   if (RC == &AMDGPU::VGPR_32RegClass) {
    676     assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
    677            AMDGPU::SReg_32RegClass.contains(SrcReg) ||
    678            AMDGPU::AGPR_32RegClass.contains(SrcReg));
    679     unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
    680                      AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
    681     BuildMI(MBB, MI, DL, get(Opc), DestReg)
    682       .addReg(SrcReg, getKillRegState(KillSrc));
    683     return;
    684   }
    685 
    686   if (RC == &AMDGPU::SReg_32_XM0RegClass ||
    687       RC == &AMDGPU::SReg_32RegClass) {
    688     if (SrcReg == AMDGPU::SCC) {
    689       BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
    690           .addImm(1)
    691           .addImm(0);
    692       return;
    693     }
    694 
    695     if (DestReg == AMDGPU::VCC_LO) {
    696       if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
    697         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
    698           .addReg(SrcReg, getKillRegState(KillSrc));
    699       } else {
    700         // FIXME: Hack until VReg_1 removed.
    701         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
    702         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
    703           .addImm(0)
    704           .addReg(SrcReg, getKillRegState(KillSrc));
    705       }
    706 
    707       return;
    708     }
    709 
    710     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
    711       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
    712       return;
    713     }
    714 
    715     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
    716             .addReg(SrcReg, getKillRegState(KillSrc));
    717     return;
    718   }
    719 
    720   if (RC == &AMDGPU::SReg_64RegClass) {
    721     if (SrcReg == AMDGPU::SCC) {
    722       BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
    723           .addImm(1)
    724           .addImm(0);
    725       return;
    726     }
    727 
    728     if (DestReg == AMDGPU::VCC) {
    729       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
    730         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
    731           .addReg(SrcReg, getKillRegState(KillSrc));
    732       } else {
    733         // FIXME: Hack until VReg_1 removed.
    734         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
    735         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
    736           .addImm(0)
    737           .addReg(SrcReg, getKillRegState(KillSrc));
    738       }
    739 
    740       return;
    741     }
    742 
    743     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
    744       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
    745       return;
    746     }
    747 
    748     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
    749             .addReg(SrcReg, getKillRegState(KillSrc));
    750     return;
    751   }
    752 
    753   if (DestReg == AMDGPU::SCC) {
    754     // Copying 64-bit or 32-bit sources to SCC barely makes sense,
    755     // but SelectionDAG emits such copies for i1 sources.
    756     if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
    757       // This copy can only be produced by patterns
    758       // with explicit SCC, which are known to be enabled
    759       // only for subtargets with S_CMP_LG_U64 present.
    760       assert(ST.hasScalarCompareEq64());
    761       BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
    762           .addReg(SrcReg, getKillRegState(KillSrc))
    763           .addImm(0);
    764     } else {
    765       assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    766       BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
    767           .addReg(SrcReg, getKillRegState(KillSrc))
    768           .addImm(0);
    769     }
    770 
    771     return;
    772   }
    773 
    774   if (RC == &AMDGPU::AGPR_32RegClass) {
    775     if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
    776       BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
    777         .addReg(SrcReg, getKillRegState(KillSrc));
    778       return;
    779     }
    780 
    781     if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
    782       BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
    783         .addReg(SrcReg, getKillRegState(KillSrc));
    784       return;
    785     }
    786 
    787     // FIXME: Pass should maintain scavenger to avoid scan through the block on
    788     // every AGPR spill.
    789     RegScavenger RS;
    790     indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS);
    791     return;
    792   }
    793 
    794   const unsigned Size = RI.getRegSizeInBits(*RC);
    795   if (Size == 16) {
    796     assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
    797            AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
    798            AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
    799            AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
    800 
    801     bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
    802     bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
    803     bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    804     bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    805     bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
    806                   AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
    807                   AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    808     bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
    809                   AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
    810                   AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    811     MCRegister NewDestReg = RI.get32BitRegister(DestReg);
    812     MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
    813 
    814     if (IsSGPRDst) {
    815       if (!IsSGPRSrc) {
    816         reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
    817         return;
    818       }
    819 
    820       BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
    821         .addReg(NewSrcReg, getKillRegState(KillSrc));
    822       return;
    823     }
    824 
    825     if (IsAGPRDst || IsAGPRSrc) {
    826       if (!DstLow || !SrcLow) {
    827         reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
    828                           "Cannot use hi16 subreg with an AGPR!");
    829       }
    830 
    831       copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
    832       return;
    833     }
    834 
    835     if (IsSGPRSrc && !ST.hasSDWAScalar()) {
    836       if (!DstLow || !SrcLow) {
    837         reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
    838                           "Cannot use hi16 subreg on VI!");
    839       }
    840 
    841       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
    842         .addReg(NewSrcReg, getKillRegState(KillSrc));
    843       return;
    844     }
    845 
    846     auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
    847       .addImm(0) // src0_modifiers
    848       .addReg(NewSrcReg)
    849       .addImm(0) // clamp
    850       .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
    851                      : AMDGPU::SDWA::SdwaSel::WORD_1)
    852       .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
    853       .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
    854                      : AMDGPU::SDWA::SdwaSel::WORD_1)
    855       .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
    856     // First implicit operand is $exec.
    857     MIB->tieOperands(0, MIB->getNumOperands() - 1);
    858     return;
    859   }
    860 
    861   const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg);
    862   if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
    863     if (ST.hasPackedFP32Ops()) {
    864       BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
    865         .addImm(SISrcMods::OP_SEL_1)
    866         .addReg(SrcReg)
    867         .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
    868         .addReg(SrcReg)
    869         .addImm(0) // op_sel_lo
    870         .addImm(0) // op_sel_hi
    871         .addImm(0) // neg_lo
    872         .addImm(0) // neg_hi
    873         .addImm(0) // clamp
    874         .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
    875       return;
    876     }
    877   }
    878 
    879   const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
    880   if (RI.isSGPRClass(RC)) {
    881     if (!RI.isSGPRClass(SrcRC)) {
    882       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
    883       return;
    884     }
    885     expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward);
    886     return;
    887   }
    888 
    889   unsigned EltSize = 4;
    890   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
    891   if (RI.hasAGPRs(RC)) {
    892     Opcode = (RI.hasVGPRs(SrcRC)) ?
    893       AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
    894   } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(SrcRC)) {
    895     Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
    896   } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
    897              (RI.isProperlyAlignedRC(*RC) &&
    898               (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
    899     // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
    900     if (ST.hasPackedFP32Ops()) {
    901       Opcode = AMDGPU::V_PK_MOV_B32;
    902       EltSize = 8;
    903     }
    904   }
    905 
    906   // For the cases where we need an intermediate instruction/temporary register
    907   // (destination is an AGPR), we need a scavenger.
    908   //
    909   // FIXME: The pass should maintain this for us so we don't have to re-scan the
    910   // whole block for every handled copy.
    911   std::unique_ptr<RegScavenger> RS;
    912   if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
    913     RS.reset(new RegScavenger());
    914 
    915   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
    916 
    917   // If there is an overlap, we can't kill the super-register on the last
    918   // instruction, since it will also kill the components made live by this def.
    919   const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
    920 
    921   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    922     unsigned SubIdx;
    923     if (Forward)
    924       SubIdx = SubIndices[Idx];
    925     else
    926       SubIdx = SubIndices[SubIndices.size() - Idx - 1];
    927 
    928     bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
    929 
    930     if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
    931       Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register();
    932       Register ImpUseSuper = SrcReg;
    933       indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
    934                          RI.getSubReg(SrcReg, SubIdx), UseKill, *RS,
    935                          ImpDefSuper, ImpUseSuper);
    936     } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
    937       Register DstSubReg = RI.getSubReg(DestReg, SubIdx);
    938       Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
    939       MachineInstrBuilder MIB =
    940         BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg)
    941         .addImm(SISrcMods::OP_SEL_1)
    942         .addReg(SrcSubReg)
    943         .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
    944         .addReg(SrcSubReg)
    945         .addImm(0) // op_sel_lo
    946         .addImm(0) // op_sel_hi
    947         .addImm(0) // neg_lo
    948         .addImm(0) // neg_hi
    949         .addImm(0) // clamp
    950         .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
    951       if (Idx == 0)
    952         MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
    953     } else {
    954       MachineInstrBuilder Builder =
    955         BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
    956         .addReg(RI.getSubReg(SrcReg, SubIdx));
    957       if (Idx == 0)
    958         Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
    959 
    960       Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
    961     }
    962   }
    963 }
    964 
    965 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
    966   int NewOpc;
    967 
    968   // Try to map original to commuted opcode
    969   NewOpc = AMDGPU::getCommuteRev(Opcode);
    970   if (NewOpc != -1)
    971     // Check if the commuted (REV) opcode exists on the target.
    972     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
    973 
    974   // Try to map commuted to original opcode
    975   NewOpc = AMDGPU::getCommuteOrig(Opcode);
    976   if (NewOpc != -1)
    977     // Check if the original (non-REV) opcode exists on the target.
    978     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
    979 
    980   return Opcode;
    981 }
    982 
    983 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
    984                                        MachineBasicBlock::iterator MI,
    985                                        const DebugLoc &DL, unsigned DestReg,
    986                                        int64_t Value) const {
    987   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    988   const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
    989   if (RegClass == &AMDGPU::SReg_32RegClass ||
    990       RegClass == &AMDGPU::SGPR_32RegClass ||
    991       RegClass == &AMDGPU::SReg_32_XM0RegClass ||
    992       RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
    993     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
    994       .addImm(Value);
    995     return;
    996   }
    997 
    998   if (RegClass == &AMDGPU::SReg_64RegClass ||
    999       RegClass == &AMDGPU::SGPR_64RegClass ||
   1000       RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
   1001     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
   1002       .addImm(Value);
   1003     return;
   1004   }
   1005 
   1006   if (RegClass == &AMDGPU::VGPR_32RegClass) {
   1007     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
   1008       .addImm(Value);
   1009     return;
   1010   }
   1011   if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
   1012     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
   1013       .addImm(Value);
   1014     return;
   1015   }
   1016 
   1017   unsigned EltSize = 4;
   1018   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
   1019   if (RI.isSGPRClass(RegClass)) {
   1020     if (RI.getRegSizeInBits(*RegClass) > 32) {
   1021       Opcode =  AMDGPU::S_MOV_B64;
   1022       EltSize = 8;
   1023     } else {
   1024       Opcode = AMDGPU::S_MOV_B32;
   1025       EltSize = 4;
   1026     }
   1027   }
   1028 
   1029   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
   1030   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
   1031     int64_t IdxValue = Idx == 0 ? Value : 0;
   1032 
   1033     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
   1034       get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
   1035     Builder.addImm(IdxValue);
   1036   }
   1037 }
   1038 
   1039 const TargetRegisterClass *
   1040 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
   1041   return &AMDGPU::VGPR_32RegClass;
   1042 }
   1043 
   1044 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
   1045                                      MachineBasicBlock::iterator I,
   1046                                      const DebugLoc &DL, Register DstReg,
   1047                                      ArrayRef<MachineOperand> Cond,
   1048                                      Register TrueReg,
   1049                                      Register FalseReg) const {
   1050   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   1051   const TargetRegisterClass *BoolXExecRC =
   1052     RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
   1053   assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
   1054          "Not a VGPR32 reg");
   1055 
   1056   if (Cond.size() == 1) {
   1057     Register SReg = MRI.createVirtualRegister(BoolXExecRC);
   1058     BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
   1059       .add(Cond[0]);
   1060     BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
   1061       .addImm(0)
   1062       .addReg(FalseReg)
   1063       .addImm(0)
   1064       .addReg(TrueReg)
   1065       .addReg(SReg);
   1066   } else if (Cond.size() == 2) {
   1067     assert(Cond[0].isImm() && "Cond[0] is not an immediate");
   1068     switch (Cond[0].getImm()) {
   1069     case SIInstrInfo::SCC_TRUE: {
   1070       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
   1071       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
   1072                                             : AMDGPU::S_CSELECT_B64), SReg)
   1073         .addImm(1)
   1074         .addImm(0);
   1075       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
   1076         .addImm(0)
   1077         .addReg(FalseReg)
   1078         .addImm(0)
   1079         .addReg(TrueReg)
   1080         .addReg(SReg);
   1081       break;
   1082     }
   1083     case SIInstrInfo::SCC_FALSE: {
   1084       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
   1085       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
   1086                                             : AMDGPU::S_CSELECT_B64), SReg)
   1087         .addImm(0)
   1088         .addImm(1);
   1089       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
   1090         .addImm(0)
   1091         .addReg(FalseReg)
   1092         .addImm(0)
   1093         .addReg(TrueReg)
   1094         .addReg(SReg);
   1095       break;
   1096     }
   1097     case SIInstrInfo::VCCNZ: {
   1098       MachineOperand RegOp = Cond[1];
   1099       RegOp.setImplicit(false);
   1100       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
   1101       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
   1102         .add(RegOp);
   1103       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
   1104           .addImm(0)
   1105           .addReg(FalseReg)
   1106           .addImm(0)
   1107           .addReg(TrueReg)
   1108           .addReg(SReg);
   1109       break;
   1110     }
   1111     case SIInstrInfo::VCCZ: {
   1112       MachineOperand RegOp = Cond[1];
   1113       RegOp.setImplicit(false);
   1114       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
   1115       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
   1116         .add(RegOp);
   1117       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
   1118           .addImm(0)
   1119           .addReg(TrueReg)
   1120           .addImm(0)
   1121           .addReg(FalseReg)
   1122           .addReg(SReg);
   1123       break;
   1124     }
   1125     case SIInstrInfo::EXECNZ: {
   1126       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
   1127       Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
   1128       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
   1129                                             : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
   1130         .addImm(0);
   1131       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
   1132                                             : AMDGPU::S_CSELECT_B64), SReg)
   1133         .addImm(1)
   1134         .addImm(0);
   1135       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
   1136         .addImm(0)
   1137         .addReg(FalseReg)
   1138         .addImm(0)
   1139         .addReg(TrueReg)
   1140         .addReg(SReg);
   1141       break;
   1142     }
   1143     case SIInstrInfo::EXECZ: {
   1144       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
   1145       Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
   1146       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
   1147                                             : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
   1148         .addImm(0);
   1149       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
   1150                                             : AMDGPU::S_CSELECT_B64), SReg)
   1151         .addImm(0)
   1152         .addImm(1);
   1153       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
   1154         .addImm(0)
   1155         .addReg(FalseReg)
   1156         .addImm(0)
   1157         .addReg(TrueReg)
   1158         .addReg(SReg);
   1159       llvm_unreachable("Unhandled branch predicate EXECZ");
   1160       break;
   1161     }
   1162     default:
   1163       llvm_unreachable("invalid branch predicate");
   1164     }
   1165   } else {
   1166     llvm_unreachable("Can only handle Cond size 1 or 2");
   1167   }
   1168 }
   1169 
   1170 Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
   1171                                MachineBasicBlock::iterator I,
   1172                                const DebugLoc &DL,
   1173                                Register SrcReg, int Value) const {
   1174   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   1175   Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
   1176   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
   1177     .addImm(Value)
   1178     .addReg(SrcReg);
   1179 
   1180   return Reg;
   1181 }
   1182 
   1183 Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
   1184                                MachineBasicBlock::iterator I,
   1185                                const DebugLoc &DL,
   1186                                Register SrcReg, int Value) const {
   1187   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   1188   Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
   1189   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
   1190     .addImm(Value)
   1191     .addReg(SrcReg);
   1192 
   1193   return Reg;
   1194 }
   1195 
   1196 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
   1197 
   1198   if (RI.hasAGPRs(DstRC))
   1199     return AMDGPU::COPY;
   1200   if (RI.getRegSizeInBits(*DstRC) == 32) {
   1201     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
   1202   } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
   1203     return AMDGPU::S_MOV_B64;
   1204   } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
   1205     return  AMDGPU::V_MOV_B64_PSEUDO;
   1206   }
   1207   return AMDGPU::COPY;
   1208 }
   1209 
   1210 const MCInstrDesc &
   1211 SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
   1212                                      bool IsIndirectSrc) const {
   1213   if (IsIndirectSrc) {
   1214     if (VecSize <= 32) // 4 bytes
   1215       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
   1216     if (VecSize <= 64) // 8 bytes
   1217       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
   1218     if (VecSize <= 96) // 12 bytes
   1219       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
   1220     if (VecSize <= 128) // 16 bytes
   1221       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
   1222     if (VecSize <= 160) // 20 bytes
   1223       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
   1224     if (VecSize <= 256) // 32 bytes
   1225       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
   1226     if (VecSize <= 512) // 64 bytes
   1227       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
   1228     if (VecSize <= 1024) // 128 bytes
   1229       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
   1230 
   1231     llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
   1232   }
   1233 
   1234   if (VecSize <= 32) // 4 bytes
   1235     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
   1236   if (VecSize <= 64) // 8 bytes
   1237     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
   1238   if (VecSize <= 96) // 12 bytes
   1239     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
   1240   if (VecSize <= 128) // 16 bytes
   1241     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
   1242   if (VecSize <= 160) // 20 bytes
   1243     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
   1244   if (VecSize <= 256) // 32 bytes
   1245     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
   1246   if (VecSize <= 512) // 64 bytes
   1247     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
   1248   if (VecSize <= 1024) // 128 bytes
   1249     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
   1250 
   1251   llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
   1252 }
   1253 
   1254 static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
   1255   if (VecSize <= 32) // 4 bytes
   1256     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
   1257   if (VecSize <= 64) // 8 bytes
   1258     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
   1259   if (VecSize <= 96) // 12 bytes
   1260     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
   1261   if (VecSize <= 128) // 16 bytes
   1262     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
   1263   if (VecSize <= 160) // 20 bytes
   1264     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
   1265   if (VecSize <= 256) // 32 bytes
   1266     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
   1267   if (VecSize <= 512) // 64 bytes
   1268     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
   1269   if (VecSize <= 1024) // 128 bytes
   1270     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
   1271 
   1272   llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
   1273 }
   1274 
   1275 static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
   1276   if (VecSize <= 32) // 4 bytes
   1277     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
   1278   if (VecSize <= 64) // 8 bytes
   1279     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
   1280   if (VecSize <= 96) // 12 bytes
   1281     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
   1282   if (VecSize <= 128) // 16 bytes
   1283     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
   1284   if (VecSize <= 160) // 20 bytes
   1285     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
   1286   if (VecSize <= 256) // 32 bytes
   1287     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
   1288   if (VecSize <= 512) // 64 bytes
   1289     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
   1290   if (VecSize <= 1024) // 128 bytes
   1291     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
   1292 
   1293   llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
   1294 }
   1295 
   1296 static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
   1297   if (VecSize <= 64) // 8 bytes
   1298     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
   1299   if (VecSize <= 128) // 16 bytes
   1300     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
   1301   if (VecSize <= 256) // 32 bytes
   1302     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
   1303   if (VecSize <= 512) // 64 bytes
   1304     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
   1305   if (VecSize <= 1024) // 128 bytes
   1306     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
   1307 
   1308   llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
   1309 }
   1310 
   1311 const MCInstrDesc &
   1312 SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
   1313                                              bool IsSGPR) const {
   1314   if (IsSGPR) {
   1315     switch (EltSize) {
   1316     case 32:
   1317       return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
   1318     case 64:
   1319       return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
   1320     default:
   1321       llvm_unreachable("invalid reg indexing elt size");
   1322     }
   1323   }
   1324 
   1325   assert(EltSize == 32 && "invalid reg indexing elt size");
   1326   return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
   1327 }
   1328 
   1329 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
   1330   switch (Size) {
   1331   case 4:
   1332     return AMDGPU::SI_SPILL_S32_SAVE;
   1333   case 8:
   1334     return AMDGPU::SI_SPILL_S64_SAVE;
   1335   case 12:
   1336     return AMDGPU::SI_SPILL_S96_SAVE;
   1337   case 16:
   1338     return AMDGPU::SI_SPILL_S128_SAVE;
   1339   case 20:
   1340     return AMDGPU::SI_SPILL_S160_SAVE;
   1341   case 24:
   1342     return AMDGPU::SI_SPILL_S192_SAVE;
   1343   case 32:
   1344     return AMDGPU::SI_SPILL_S256_SAVE;
   1345   case 64:
   1346     return AMDGPU::SI_SPILL_S512_SAVE;
   1347   case 128:
   1348     return AMDGPU::SI_SPILL_S1024_SAVE;
   1349   default:
   1350     llvm_unreachable("unknown register size");
   1351   }
   1352 }
   1353 
   1354 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
   1355   switch (Size) {
   1356   case 4:
   1357     return AMDGPU::SI_SPILL_V32_SAVE;
   1358   case 8:
   1359     return AMDGPU::SI_SPILL_V64_SAVE;
   1360   case 12:
   1361     return AMDGPU::SI_SPILL_V96_SAVE;
   1362   case 16:
   1363     return AMDGPU::SI_SPILL_V128_SAVE;
   1364   case 20:
   1365     return AMDGPU::SI_SPILL_V160_SAVE;
   1366   case 24:
   1367     return AMDGPU::SI_SPILL_V192_SAVE;
   1368   case 32:
   1369     return AMDGPU::SI_SPILL_V256_SAVE;
   1370   case 64:
   1371     return AMDGPU::SI_SPILL_V512_SAVE;
   1372   case 128:
   1373     return AMDGPU::SI_SPILL_V1024_SAVE;
   1374   default:
   1375     llvm_unreachable("unknown register size");
   1376   }
   1377 }
   1378 
   1379 static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
   1380   switch (Size) {
   1381   case 4:
   1382     return AMDGPU::SI_SPILL_A32_SAVE;
   1383   case 8:
   1384     return AMDGPU::SI_SPILL_A64_SAVE;
   1385   case 12:
   1386     return AMDGPU::SI_SPILL_A96_SAVE;
   1387   case 16:
   1388     return AMDGPU::SI_SPILL_A128_SAVE;
   1389   case 20:
   1390     return AMDGPU::SI_SPILL_A160_SAVE;
   1391   case 24:
   1392     return AMDGPU::SI_SPILL_A192_SAVE;
   1393   case 32:
   1394     return AMDGPU::SI_SPILL_A256_SAVE;
   1395   case 64:
   1396     return AMDGPU::SI_SPILL_A512_SAVE;
   1397   case 128:
   1398     return AMDGPU::SI_SPILL_A1024_SAVE;
   1399   default:
   1400     llvm_unreachable("unknown register size");
   1401   }
   1402 }
   1403 
   1404 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   1405                                       MachineBasicBlock::iterator MI,
   1406                                       Register SrcReg, bool isKill,
   1407                                       int FrameIndex,
   1408                                       const TargetRegisterClass *RC,
   1409                                       const TargetRegisterInfo *TRI) const {
   1410   MachineFunction *MF = MBB.getParent();
   1411   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   1412   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
   1413   const DebugLoc &DL = MBB.findDebugLoc(MI);
   1414 
   1415   MachinePointerInfo PtrInfo
   1416     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
   1417   MachineMemOperand *MMO = MF->getMachineMemOperand(
   1418       PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
   1419       FrameInfo.getObjectAlign(FrameIndex));
   1420   unsigned SpillSize = TRI->getSpillSize(*RC);
   1421 
   1422   if (RI.isSGPRClass(RC)) {
   1423     MFI->setHasSpilledSGPRs();
   1424     assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
   1425     assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
   1426            SrcReg != AMDGPU::EXEC && "exec should not be spilled");
   1427 
   1428     // We are only allowed to create one new instruction when spilling
   1429     // registers, so we need to use pseudo instruction for spilling SGPRs.
   1430     const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
   1431 
   1432     // The SGPR spill/restore instructions only work on number sgprs, so we need
   1433     // to make sure we are using the correct register class.
   1434     if (SrcReg.isVirtual() && SpillSize == 4) {
   1435       MachineRegisterInfo &MRI = MF->getRegInfo();
   1436       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
   1437     }
   1438 
   1439     BuildMI(MBB, MI, DL, OpDesc)
   1440       .addReg(SrcReg, getKillRegState(isKill)) // data
   1441       .addFrameIndex(FrameIndex)               // addr
   1442       .addMemOperand(MMO)
   1443       .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
   1444 
   1445     if (RI.spillSGPRToVGPR())
   1446       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
   1447     return;
   1448   }
   1449 
   1450   unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
   1451                                     : getVGPRSpillSaveOpcode(SpillSize);
   1452   MFI->setHasSpilledVGPRs();
   1453 
   1454   BuildMI(MBB, MI, DL, get(Opcode))
   1455     .addReg(SrcReg, getKillRegState(isKill)) // data
   1456     .addFrameIndex(FrameIndex)               // addr
   1457     .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
   1458     .addImm(0)                               // offset
   1459     .addMemOperand(MMO);
   1460 }
   1461 
   1462 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
   1463   switch (Size) {
   1464   case 4:
   1465     return AMDGPU::SI_SPILL_S32_RESTORE;
   1466   case 8:
   1467     return AMDGPU::SI_SPILL_S64_RESTORE;
   1468   case 12:
   1469     return AMDGPU::SI_SPILL_S96_RESTORE;
   1470   case 16:
   1471     return AMDGPU::SI_SPILL_S128_RESTORE;
   1472   case 20:
   1473     return AMDGPU::SI_SPILL_S160_RESTORE;
   1474   case 24:
   1475     return AMDGPU::SI_SPILL_S192_RESTORE;
   1476   case 32:
   1477     return AMDGPU::SI_SPILL_S256_RESTORE;
   1478   case 64:
   1479     return AMDGPU::SI_SPILL_S512_RESTORE;
   1480   case 128:
   1481     return AMDGPU::SI_SPILL_S1024_RESTORE;
   1482   default:
   1483     llvm_unreachable("unknown register size");
   1484   }
   1485 }
   1486 
   1487 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
   1488   switch (Size) {
   1489   case 4:
   1490     return AMDGPU::SI_SPILL_V32_RESTORE;
   1491   case 8:
   1492     return AMDGPU::SI_SPILL_V64_RESTORE;
   1493   case 12:
   1494     return AMDGPU::SI_SPILL_V96_RESTORE;
   1495   case 16:
   1496     return AMDGPU::SI_SPILL_V128_RESTORE;
   1497   case 20:
   1498     return AMDGPU::SI_SPILL_V160_RESTORE;
   1499   case 24:
   1500     return AMDGPU::SI_SPILL_V192_RESTORE;
   1501   case 32:
   1502     return AMDGPU::SI_SPILL_V256_RESTORE;
   1503   case 64:
   1504     return AMDGPU::SI_SPILL_V512_RESTORE;
   1505   case 128:
   1506     return AMDGPU::SI_SPILL_V1024_RESTORE;
   1507   default:
   1508     llvm_unreachable("unknown register size");
   1509   }
   1510 }
   1511 
   1512 static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
   1513   switch (Size) {
   1514   case 4:
   1515     return AMDGPU::SI_SPILL_A32_RESTORE;
   1516   case 8:
   1517     return AMDGPU::SI_SPILL_A64_RESTORE;
   1518   case 12:
   1519     return AMDGPU::SI_SPILL_A96_RESTORE;
   1520   case 16:
   1521     return AMDGPU::SI_SPILL_A128_RESTORE;
   1522   case 20:
   1523     return AMDGPU::SI_SPILL_A160_RESTORE;
   1524   case 24:
   1525     return AMDGPU::SI_SPILL_A192_RESTORE;
   1526   case 32:
   1527     return AMDGPU::SI_SPILL_A256_RESTORE;
   1528   case 64:
   1529     return AMDGPU::SI_SPILL_A512_RESTORE;
   1530   case 128:
   1531     return AMDGPU::SI_SPILL_A1024_RESTORE;
   1532   default:
   1533     llvm_unreachable("unknown register size");
   1534   }
   1535 }
   1536 
   1537 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   1538                                        MachineBasicBlock::iterator MI,
   1539                                        Register DestReg, int FrameIndex,
   1540                                        const TargetRegisterClass *RC,
   1541                                        const TargetRegisterInfo *TRI) const {
   1542   MachineFunction *MF = MBB.getParent();
   1543   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   1544   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
   1545   const DebugLoc &DL = MBB.findDebugLoc(MI);
   1546   unsigned SpillSize = TRI->getSpillSize(*RC);
   1547 
   1548   MachinePointerInfo PtrInfo
   1549     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
   1550 
   1551   MachineMemOperand *MMO = MF->getMachineMemOperand(
   1552       PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
   1553       FrameInfo.getObjectAlign(FrameIndex));
   1554 
   1555   if (RI.isSGPRClass(RC)) {
   1556     MFI->setHasSpilledSGPRs();
   1557     assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
   1558     assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
   1559            DestReg != AMDGPU::EXEC && "exec should not be spilled");
   1560 
   1561     // FIXME: Maybe this should not include a memoperand because it will be
   1562     // lowered to non-memory instructions.
   1563     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
   1564     if (DestReg.isVirtual() && SpillSize == 4) {
   1565       MachineRegisterInfo &MRI = MF->getRegInfo();
   1566       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
   1567     }
   1568 
   1569     if (RI.spillSGPRToVGPR())
   1570       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
   1571     BuildMI(MBB, MI, DL, OpDesc, DestReg)
   1572       .addFrameIndex(FrameIndex) // addr
   1573       .addMemOperand(MMO)
   1574       .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
   1575 
   1576     return;
   1577   }
   1578 
   1579   unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
   1580                                     : getVGPRSpillRestoreOpcode(SpillSize);
   1581   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
   1582     .addFrameIndex(FrameIndex)        // vaddr
   1583     .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
   1584     .addImm(0)                           // offset
   1585     .addMemOperand(MMO);
   1586 }
   1587 
   1588 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
   1589                              MachineBasicBlock::iterator MI) const {
   1590   insertNoops(MBB, MI, 1);
   1591 }
   1592 
   1593 void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
   1594                               MachineBasicBlock::iterator MI,
   1595                               unsigned Quantity) const {
   1596   DebugLoc DL = MBB.findDebugLoc(MI);
   1597   while (Quantity > 0) {
   1598     unsigned Arg = std::min(Quantity, 8u);
   1599     Quantity -= Arg;
   1600     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
   1601   }
   1602 }
   1603 
   1604 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
   1605   auto MF = MBB.getParent();
   1606   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
   1607 
   1608   assert(Info->isEntryFunction());
   1609 
   1610   if (MBB.succ_empty()) {
   1611     bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
   1612     if (HasNoTerminator) {
   1613       if (Info->returnsVoid()) {
   1614         BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
   1615       } else {
   1616         BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
   1617       }
   1618     }
   1619   }
   1620 }
   1621 
   1622 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
   1623   switch (MI.getOpcode()) {
   1624   default: return 1; // FIXME: Do wait states equal cycles?
   1625 
   1626   case AMDGPU::S_NOP:
   1627     return MI.getOperand(0).getImm() + 1;
   1628   }
   1629 }
   1630 
   1631 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   1632   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   1633   MachineBasicBlock &MBB = *MI.getParent();
   1634   DebugLoc DL = MBB.findDebugLoc(MI);
   1635   switch (MI.getOpcode()) {
   1636   default: return TargetInstrInfo::expandPostRAPseudo(MI);
   1637   case AMDGPU::S_MOV_B64_term:
   1638     // This is only a terminator to get the correct spill code placement during
   1639     // register allocation.
   1640     MI.setDesc(get(AMDGPU::S_MOV_B64));
   1641     break;
   1642 
   1643   case AMDGPU::S_MOV_B32_term:
   1644     // This is only a terminator to get the correct spill code placement during
   1645     // register allocation.
   1646     MI.setDesc(get(AMDGPU::S_MOV_B32));
   1647     break;
   1648 
   1649   case AMDGPU::S_XOR_B64_term:
   1650     // This is only a terminator to get the correct spill code placement during
   1651     // register allocation.
   1652     MI.setDesc(get(AMDGPU::S_XOR_B64));
   1653     break;
   1654 
   1655   case AMDGPU::S_XOR_B32_term:
   1656     // This is only a terminator to get the correct spill code placement during
   1657     // register allocation.
   1658     MI.setDesc(get(AMDGPU::S_XOR_B32));
   1659     break;
   1660   case AMDGPU::S_OR_B64_term:
   1661     // This is only a terminator to get the correct spill code placement during
   1662     // register allocation.
   1663     MI.setDesc(get(AMDGPU::S_OR_B64));
   1664     break;
   1665   case AMDGPU::S_OR_B32_term:
   1666     // This is only a terminator to get the correct spill code placement during
   1667     // register allocation.
   1668     MI.setDesc(get(AMDGPU::S_OR_B32));
   1669     break;
   1670 
   1671   case AMDGPU::S_ANDN2_B64_term:
   1672     // This is only a terminator to get the correct spill code placement during
   1673     // register allocation.
   1674     MI.setDesc(get(AMDGPU::S_ANDN2_B64));
   1675     break;
   1676 
   1677   case AMDGPU::S_ANDN2_B32_term:
   1678     // This is only a terminator to get the correct spill code placement during
   1679     // register allocation.
   1680     MI.setDesc(get(AMDGPU::S_ANDN2_B32));
   1681     break;
   1682 
   1683   case AMDGPU::S_AND_B64_term:
   1684     // This is only a terminator to get the correct spill code placement during
   1685     // register allocation.
   1686     MI.setDesc(get(AMDGPU::S_AND_B64));
   1687     break;
   1688 
   1689   case AMDGPU::S_AND_B32_term:
   1690     // This is only a terminator to get the correct spill code placement during
   1691     // register allocation.
   1692     MI.setDesc(get(AMDGPU::S_AND_B32));
   1693     break;
   1694 
   1695   case AMDGPU::V_MOV_B64_PSEUDO: {
   1696     Register Dst = MI.getOperand(0).getReg();
   1697     Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
   1698     Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
   1699 
   1700     const MachineOperand &SrcOp = MI.getOperand(1);
   1701     // FIXME: Will this work for 64-bit floating point immediates?
   1702     assert(!SrcOp.isFPImm());
   1703     if (SrcOp.isImm()) {
   1704       APInt Imm(64, SrcOp.getImm());
   1705       APInt Lo(32, Imm.getLoBits(32).getZExtValue());
   1706       APInt Hi(32, Imm.getHiBits(32).getZExtValue());
   1707       if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) {
   1708         BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
   1709           .addImm(SISrcMods::OP_SEL_1)
   1710           .addImm(Lo.getSExtValue())
   1711           .addImm(SISrcMods::OP_SEL_1)
   1712           .addImm(Lo.getSExtValue())
   1713           .addImm(0)  // op_sel_lo
   1714           .addImm(0)  // op_sel_hi
   1715           .addImm(0)  // neg_lo
   1716           .addImm(0)  // neg_hi
   1717           .addImm(0); // clamp
   1718       } else {
   1719         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
   1720           .addImm(Lo.getZExtValue())
   1721           .addReg(Dst, RegState::Implicit | RegState::Define);
   1722         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
   1723           .addImm(Hi.getZExtValue())
   1724           .addReg(Dst, RegState::Implicit | RegState::Define);
   1725       }
   1726     } else {
   1727       assert(SrcOp.isReg());
   1728       if (ST.hasPackedFP32Ops() &&
   1729           !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
   1730         BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
   1731           .addImm(SISrcMods::OP_SEL_1) // src0_mod
   1732           .addReg(SrcOp.getReg())
   1733           .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
   1734           .addReg(SrcOp.getReg())
   1735           .addImm(0)  // op_sel_lo
   1736           .addImm(0)  // op_sel_hi
   1737           .addImm(0)  // neg_lo
   1738           .addImm(0)  // neg_hi
   1739           .addImm(0); // clamp
   1740       } else {
   1741         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
   1742           .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
   1743           .addReg(Dst, RegState::Implicit | RegState::Define);
   1744         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
   1745           .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
   1746           .addReg(Dst, RegState::Implicit | RegState::Define);
   1747       }
   1748     }
   1749     MI.eraseFromParent();
   1750     break;
   1751   }
   1752   case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
   1753     expandMovDPP64(MI);
   1754     break;
   1755   }
   1756   case AMDGPU::V_SET_INACTIVE_B32: {
   1757     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
   1758     unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   1759     auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
   1760     FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
   1761     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
   1762       .add(MI.getOperand(2));
   1763     BuildMI(MBB, MI, DL, get(NotOpc), Exec)
   1764       .addReg(Exec);
   1765     MI.eraseFromParent();
   1766     break;
   1767   }
   1768   case AMDGPU::V_SET_INACTIVE_B64: {
   1769     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
   1770     unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   1771     auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
   1772     FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
   1773     MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
   1774                                  MI.getOperand(0).getReg())
   1775       .add(MI.getOperand(2));
   1776     expandPostRAPseudo(*Copy);
   1777     BuildMI(MBB, MI, DL, get(NotOpc), Exec)
   1778       .addReg(Exec);
   1779     MI.eraseFromParent();
   1780     break;
   1781   }
   1782   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
   1783   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
   1784   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
   1785   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
   1786   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
   1787   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
   1788   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
   1789   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
   1790   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
   1791   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
   1792   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
   1793   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
   1794   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
   1795   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
   1796   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
   1797   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
   1798   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
   1799   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
   1800   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
   1801   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
   1802   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
   1803     const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
   1804 
   1805     unsigned Opc;
   1806     if (RI.hasVGPRs(EltRC)) {
   1807       Opc = AMDGPU::V_MOVRELD_B32_e32;
   1808     } else {
   1809       Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
   1810                                               : AMDGPU::S_MOVRELD_B32;
   1811     }
   1812 
   1813     const MCInstrDesc &OpDesc = get(Opc);
   1814     Register VecReg = MI.getOperand(0).getReg();
   1815     bool IsUndef = MI.getOperand(1).isUndef();
   1816     unsigned SubReg = MI.getOperand(3).getImm();
   1817     assert(VecReg == MI.getOperand(1).getReg());
   1818 
   1819     MachineInstrBuilder MIB =
   1820       BuildMI(MBB, MI, DL, OpDesc)
   1821         .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
   1822         .add(MI.getOperand(2))
   1823         .addReg(VecReg, RegState::ImplicitDefine)
   1824         .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
   1825 
   1826     const int ImpDefIdx =
   1827       OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
   1828     const int ImpUseIdx = ImpDefIdx + 1;
   1829     MIB->tieOperands(ImpDefIdx, ImpUseIdx);
   1830     MI.eraseFromParent();
   1831     break;
   1832   }
   1833   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
   1834   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
   1835   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
   1836   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
   1837   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
   1838   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
   1839   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
   1840   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
   1841     assert(ST.useVGPRIndexMode());
   1842     Register VecReg = MI.getOperand(0).getReg();
   1843     bool IsUndef = MI.getOperand(1).isUndef();
   1844     Register Idx = MI.getOperand(3).getReg();
   1845     Register SubReg = MI.getOperand(4).getImm();
   1846 
   1847     MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
   1848                               .addReg(Idx)
   1849                               .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
   1850     SetOn->getOperand(3).setIsUndef();
   1851 
   1852     const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect);
   1853     MachineInstrBuilder MIB =
   1854         BuildMI(MBB, MI, DL, OpDesc)
   1855             .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
   1856             .add(MI.getOperand(2))
   1857             .addReg(VecReg, RegState::ImplicitDefine)
   1858             .addReg(VecReg,
   1859                     RegState::Implicit | (IsUndef ? RegState::Undef : 0));
   1860 
   1861     const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
   1862     const int ImpUseIdx = ImpDefIdx + 1;
   1863     MIB->tieOperands(ImpDefIdx, ImpUseIdx);
   1864 
   1865     MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
   1866 
   1867     finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
   1868 
   1869     MI.eraseFromParent();
   1870     break;
   1871   }
   1872   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
   1873   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
   1874   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
   1875   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
   1876   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
   1877   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
   1878   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
   1879   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
   1880     assert(ST.useVGPRIndexMode());
   1881     Register Dst = MI.getOperand(0).getReg();
   1882     Register VecReg = MI.getOperand(1).getReg();
   1883     bool IsUndef = MI.getOperand(1).isUndef();
   1884     Register Idx = MI.getOperand(2).getReg();
   1885     Register SubReg = MI.getOperand(3).getImm();
   1886 
   1887     MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
   1888                               .addReg(Idx)
   1889                               .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
   1890     SetOn->getOperand(3).setIsUndef();
   1891 
   1892     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32))
   1893         .addDef(Dst)
   1894         .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
   1895         .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0))
   1896         .addReg(AMDGPU::M0, RegState::Implicit);
   1897 
   1898     MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
   1899 
   1900     finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
   1901 
   1902     MI.eraseFromParent();
   1903     break;
   1904   }
   1905   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
   1906     MachineFunction &MF = *MBB.getParent();
   1907     Register Reg = MI.getOperand(0).getReg();
   1908     Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
   1909     Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
   1910 
   1911     // Create a bundle so these instructions won't be re-ordered by the
   1912     // post-RA scheduler.
   1913     MIBundleBuilder Bundler(MBB, MI);
   1914     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
   1915 
   1916     // Add 32-bit offset from this instruction to the start of the
   1917     // constant data.
   1918     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
   1919                        .addReg(RegLo)
   1920                        .add(MI.getOperand(1)));
   1921 
   1922     MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
   1923                                   .addReg(RegHi);
   1924     MIB.add(MI.getOperand(2));
   1925 
   1926     Bundler.append(MIB);
   1927     finalizeBundle(MBB, Bundler.begin());
   1928 
   1929     MI.eraseFromParent();
   1930     break;
   1931   }
   1932   case AMDGPU::ENTER_STRICT_WWM: {
   1933     // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
   1934     // Whole Wave Mode is entered.
   1935     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
   1936                                  : AMDGPU::S_OR_SAVEEXEC_B64));
   1937     break;
   1938   }
   1939   case AMDGPU::ENTER_STRICT_WQM: {
   1940     // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
   1941     // STRICT_WQM is entered.
   1942     const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   1943     const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
   1944     const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
   1945     BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
   1946     BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
   1947 
   1948     MI.eraseFromParent();
   1949     break;
   1950   }
   1951   case AMDGPU::EXIT_STRICT_WWM:
   1952   case AMDGPU::EXIT_STRICT_WQM: {
   1953     // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
   1954     // WWM/STICT_WQM is exited.
   1955     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
   1956     break;
   1957   }
   1958   }
   1959   return true;
   1960 }
   1961 
   1962 std::pair<MachineInstr*, MachineInstr*>
   1963 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
   1964   assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
   1965 
   1966   MachineBasicBlock &MBB = *MI.getParent();
   1967   DebugLoc DL = MBB.findDebugLoc(MI);
   1968   MachineFunction *MF = MBB.getParent();
   1969   MachineRegisterInfo &MRI = MF->getRegInfo();
   1970   Register Dst = MI.getOperand(0).getReg();
   1971   unsigned Part = 0;
   1972   MachineInstr *Split[2];
   1973 
   1974   for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
   1975     auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
   1976     if (Dst.isPhysical()) {
   1977       MovDPP.addDef(RI.getSubReg(Dst, Sub));
   1978     } else {
   1979       assert(MRI.isSSA());
   1980       auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   1981       MovDPP.addDef(Tmp);
   1982     }
   1983 
   1984     for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
   1985       const MachineOperand &SrcOp = MI.getOperand(I);
   1986       assert(!SrcOp.isFPImm());
   1987       if (SrcOp.isImm()) {
   1988         APInt Imm(64, SrcOp.getImm());
   1989         Imm.ashrInPlace(Part * 32);
   1990         MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
   1991       } else {
   1992         assert(SrcOp.isReg());
   1993         Register Src = SrcOp.getReg();
   1994         if (Src.isPhysical())
   1995           MovDPP.addReg(RI.getSubReg(Src, Sub));
   1996         else
   1997           MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
   1998       }
   1999     }
   2000 
   2001     for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
   2002       MovDPP.addImm(MI.getOperand(I).getImm());
   2003 
   2004     Split[Part] = MovDPP;
   2005     ++Part;
   2006   }
   2007 
   2008   if (Dst.isVirtual())
   2009     BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
   2010       .addReg(Split[0]->getOperand(0).getReg())
   2011       .addImm(AMDGPU::sub0)
   2012       .addReg(Split[1]->getOperand(0).getReg())
   2013       .addImm(AMDGPU::sub1);
   2014 
   2015   MI.eraseFromParent();
   2016   return std::make_pair(Split[0], Split[1]);
   2017 }
   2018 
   2019 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
   2020                                       MachineOperand &Src0,
   2021                                       unsigned Src0OpName,
   2022                                       MachineOperand &Src1,
   2023                                       unsigned Src1OpName) const {
   2024   MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
   2025   if (!Src0Mods)
   2026     return false;
   2027 
   2028   MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
   2029   assert(Src1Mods &&
   2030          "All commutable instructions have both src0 and src1 modifiers");
   2031 
   2032   int Src0ModsVal = Src0Mods->getImm();
   2033   int Src1ModsVal = Src1Mods->getImm();
   2034 
   2035   Src1Mods->setImm(Src0ModsVal);
   2036   Src0Mods->setImm(Src1ModsVal);
   2037   return true;
   2038 }
   2039 
   2040 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
   2041                                              MachineOperand &RegOp,
   2042                                              MachineOperand &NonRegOp) {
   2043   Register Reg = RegOp.getReg();
   2044   unsigned SubReg = RegOp.getSubReg();
   2045   bool IsKill = RegOp.isKill();
   2046   bool IsDead = RegOp.isDead();
   2047   bool IsUndef = RegOp.isUndef();
   2048   bool IsDebug = RegOp.isDebug();
   2049 
   2050   if (NonRegOp.isImm())
   2051     RegOp.ChangeToImmediate(NonRegOp.getImm());
   2052   else if (NonRegOp.isFI())
   2053     RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
   2054   else if (NonRegOp.isGlobal()) {
   2055     RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
   2056                      NonRegOp.getTargetFlags());
   2057   } else
   2058     return nullptr;
   2059 
   2060   // Make sure we don't reinterpret a subreg index in the target flags.
   2061   RegOp.setTargetFlags(NonRegOp.getTargetFlags());
   2062 
   2063   NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
   2064   NonRegOp.setSubReg(SubReg);
   2065 
   2066   return &MI;
   2067 }
   2068 
   2069 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
   2070                                                   unsigned Src0Idx,
   2071                                                   unsigned Src1Idx) const {
   2072   assert(!NewMI && "this should never be used");
   2073 
   2074   unsigned Opc = MI.getOpcode();
   2075   int CommutedOpcode = commuteOpcode(Opc);
   2076   if (CommutedOpcode == -1)
   2077     return nullptr;
   2078 
   2079   assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
   2080            static_cast<int>(Src0Idx) &&
   2081          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
   2082            static_cast<int>(Src1Idx) &&
   2083          "inconsistency with findCommutedOpIndices");
   2084 
   2085   MachineOperand &Src0 = MI.getOperand(Src0Idx);
   2086   MachineOperand &Src1 = MI.getOperand(Src1Idx);
   2087 
   2088   MachineInstr *CommutedMI = nullptr;
   2089   if (Src0.isReg() && Src1.isReg()) {
   2090     if (isOperandLegal(MI, Src1Idx, &Src0)) {
   2091       // Be sure to copy the source modifiers to the right place.
   2092       CommutedMI
   2093         = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
   2094     }
   2095 
   2096   } else if (Src0.isReg() && !Src1.isReg()) {
   2097     // src0 should always be able to support any operand type, so no need to
   2098     // check operand legality.
   2099     CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
   2100   } else if (!Src0.isReg() && Src1.isReg()) {
   2101     if (isOperandLegal(MI, Src1Idx, &Src0))
   2102       CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
   2103   } else {
   2104     // FIXME: Found two non registers to commute. This does happen.
   2105     return nullptr;
   2106   }
   2107 
   2108   if (CommutedMI) {
   2109     swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
   2110                         Src1, AMDGPU::OpName::src1_modifiers);
   2111 
   2112     CommutedMI->setDesc(get(CommutedOpcode));
   2113   }
   2114 
   2115   return CommutedMI;
   2116 }
   2117 
   2118 // This needs to be implemented because the source modifiers may be inserted
   2119 // between the true commutable operands, and the base
   2120 // TargetInstrInfo::commuteInstruction uses it.
   2121 bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
   2122                                         unsigned &SrcOpIdx0,
   2123                                         unsigned &SrcOpIdx1) const {
   2124   return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
   2125 }
   2126 
   2127 bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
   2128                                         unsigned &SrcOpIdx1) const {
   2129   if (!Desc.isCommutable())
   2130     return false;
   2131 
   2132   unsigned Opc = Desc.getOpcode();
   2133   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
   2134   if (Src0Idx == -1)
   2135     return false;
   2136 
   2137   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
   2138   if (Src1Idx == -1)
   2139     return false;
   2140 
   2141   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
   2142 }
   2143 
   2144 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
   2145                                         int64_t BrOffset) const {
   2146   // BranchRelaxation should never have to check s_setpc_b64 because its dest
   2147   // block is unanalyzable.
   2148   assert(BranchOp != AMDGPU::S_SETPC_B64);
   2149 
   2150   // Convert to dwords.
   2151   BrOffset /= 4;
   2152 
   2153   // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
   2154   // from the next instruction.
   2155   BrOffset -= 1;
   2156 
   2157   return isIntN(BranchOffsetBits, BrOffset);
   2158 }
   2159 
   2160 MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
   2161   const MachineInstr &MI) const {
   2162   if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
   2163     // This would be a difficult analysis to perform, but can always be legal so
   2164     // there's no need to analyze it.
   2165     return nullptr;
   2166   }
   2167 
   2168   return MI.getOperand(0).getMBB();
   2169 }
   2170 
   2171 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
   2172                                            MachineBasicBlock &DestBB,
   2173                                            const DebugLoc &DL,
   2174                                            int64_t BrOffset,
   2175                                            RegScavenger *RS) const {
   2176   assert(RS && "RegScavenger required for long branching");
   2177   assert(MBB.empty() &&
   2178          "new block should be inserted for expanding unconditional branch");
   2179   assert(MBB.pred_size() == 1);
   2180 
   2181   MachineFunction *MF = MBB.getParent();
   2182   MachineRegisterInfo &MRI = MF->getRegInfo();
   2183 
   2184   // FIXME: Virtual register workaround for RegScavenger not working with empty
   2185   // blocks.
   2186   Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
   2187 
   2188   auto I = MBB.end();
   2189 
   2190   // We need to compute the offset relative to the instruction immediately after
   2191   // s_getpc_b64. Insert pc arithmetic code before last terminator.
   2192   MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
   2193 
   2194   // TODO: Handle > 32-bit block address.
   2195   if (BrOffset >= 0) {
   2196     BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
   2197       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
   2198       .addReg(PCReg, 0, AMDGPU::sub0)
   2199       .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD);
   2200     BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
   2201       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
   2202       .addReg(PCReg, 0, AMDGPU::sub1)
   2203       .addImm(0);
   2204   } else {
   2205     // Backwards branch.
   2206     BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
   2207       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
   2208       .addReg(PCReg, 0, AMDGPU::sub0)
   2209       .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD);
   2210     BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
   2211       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
   2212       .addReg(PCReg, 0, AMDGPU::sub1)
   2213       .addImm(0);
   2214   }
   2215 
   2216   // Insert the indirect branch after the other terminator.
   2217   BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
   2218     .addReg(PCReg);
   2219 
   2220   // FIXME: If spilling is necessary, this will fail because this scavenger has
   2221   // no emergency stack slots. It is non-trivial to spill in this situation,
   2222   // because the restore code needs to be specially placed after the
   2223   // jump. BranchRelaxation then needs to be made aware of the newly inserted
   2224   // block.
   2225   //
   2226   // If a spill is needed for the pc register pair, we need to insert a spill
   2227   // restore block right before the destination block, and insert a short branch
   2228   // into the old destination block's fallthrough predecessor.
   2229   // e.g.:
   2230   //
   2231   // s_cbranch_scc0 skip_long_branch:
   2232   //
   2233   // long_branch_bb:
   2234   //   spill s[8:9]
   2235   //   s_getpc_b64 s[8:9]
   2236   //   s_add_u32 s8, s8, restore_bb
   2237   //   s_addc_u32 s9, s9, 0
   2238   //   s_setpc_b64 s[8:9]
   2239   //
   2240   // skip_long_branch:
   2241   //   foo;
   2242   //
   2243   // .....
   2244   //
   2245   // dest_bb_fallthrough_predecessor:
   2246   // bar;
   2247   // s_branch dest_bb
   2248   //
   2249   // restore_bb:
   2250   //  restore s[8:9]
   2251   //  fallthrough dest_bb
   2252   ///
   2253   // dest_bb:
   2254   //   buzz;
   2255 
   2256   RS->enterBasicBlockEnd(MBB);
   2257   Register Scav = RS->scavengeRegisterBackwards(
   2258     AMDGPU::SReg_64RegClass,
   2259     MachineBasicBlock::iterator(GetPC), false, 0);
   2260   MRI.replaceRegWith(PCReg, Scav);
   2261   MRI.clearVirtRegs();
   2262   RS->setRegUsed(Scav);
   2263 
   2264   return 4 + 8 + 4 + 4;
   2265 }
   2266 
   2267 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
   2268   switch (Cond) {
   2269   case SIInstrInfo::SCC_TRUE:
   2270     return AMDGPU::S_CBRANCH_SCC1;
   2271   case SIInstrInfo::SCC_FALSE:
   2272     return AMDGPU::S_CBRANCH_SCC0;
   2273   case SIInstrInfo::VCCNZ:
   2274     return AMDGPU::S_CBRANCH_VCCNZ;
   2275   case SIInstrInfo::VCCZ:
   2276     return AMDGPU::S_CBRANCH_VCCZ;
   2277   case SIInstrInfo::EXECNZ:
   2278     return AMDGPU::S_CBRANCH_EXECNZ;
   2279   case SIInstrInfo::EXECZ:
   2280     return AMDGPU::S_CBRANCH_EXECZ;
   2281   default:
   2282     llvm_unreachable("invalid branch predicate");
   2283   }
   2284 }
   2285 
   2286 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
   2287   switch (Opcode) {
   2288   case AMDGPU::S_CBRANCH_SCC0:
   2289     return SCC_FALSE;
   2290   case AMDGPU::S_CBRANCH_SCC1:
   2291     return SCC_TRUE;
   2292   case AMDGPU::S_CBRANCH_VCCNZ:
   2293     return VCCNZ;
   2294   case AMDGPU::S_CBRANCH_VCCZ:
   2295     return VCCZ;
   2296   case AMDGPU::S_CBRANCH_EXECNZ:
   2297     return EXECNZ;
   2298   case AMDGPU::S_CBRANCH_EXECZ:
   2299     return EXECZ;
   2300   default:
   2301     return INVALID_BR;
   2302   }
   2303 }
   2304 
   2305 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
   2306                                     MachineBasicBlock::iterator I,
   2307                                     MachineBasicBlock *&TBB,
   2308                                     MachineBasicBlock *&FBB,
   2309                                     SmallVectorImpl<MachineOperand> &Cond,
   2310                                     bool AllowModify) const {
   2311   if (I->getOpcode() == AMDGPU::S_BRANCH) {
   2312     // Unconditional Branch
   2313     TBB = I->getOperand(0).getMBB();
   2314     return false;
   2315   }
   2316 
   2317   MachineBasicBlock *CondBB = nullptr;
   2318 
   2319   if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
   2320     CondBB = I->getOperand(1).getMBB();
   2321     Cond.push_back(I->getOperand(0));
   2322   } else {
   2323     BranchPredicate Pred = getBranchPredicate(I->getOpcode());
   2324     if (Pred == INVALID_BR)
   2325       return true;
   2326 
   2327     CondBB = I->getOperand(0).getMBB();
   2328     Cond.push_back(MachineOperand::CreateImm(Pred));
   2329     Cond.push_back(I->getOperand(1)); // Save the branch register.
   2330   }
   2331   ++I;
   2332 
   2333   if (I == MBB.end()) {
   2334     // Conditional branch followed by fall-through.
   2335     TBB = CondBB;
   2336     return false;
   2337   }
   2338 
   2339   if (I->getOpcode() == AMDGPU::S_BRANCH) {
   2340     TBB = CondBB;
   2341     FBB = I->getOperand(0).getMBB();
   2342     return false;
   2343   }
   2344 
   2345   return true;
   2346 }
   2347 
   2348 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
   2349                                 MachineBasicBlock *&FBB,
   2350                                 SmallVectorImpl<MachineOperand> &Cond,
   2351                                 bool AllowModify) const {
   2352   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
   2353   auto E = MBB.end();
   2354   if (I == E)
   2355     return false;
   2356 
   2357   // Skip over the instructions that are artificially terminators for special
   2358   // exec management.
   2359   while (I != E && !I->isBranch() && !I->isReturn()) {
   2360     switch (I->getOpcode()) {
   2361     case AMDGPU::S_MOV_B64_term:
   2362     case AMDGPU::S_XOR_B64_term:
   2363     case AMDGPU::S_OR_B64_term:
   2364     case AMDGPU::S_ANDN2_B64_term:
   2365     case AMDGPU::S_AND_B64_term:
   2366     case AMDGPU::S_MOV_B32_term:
   2367     case AMDGPU::S_XOR_B32_term:
   2368     case AMDGPU::S_OR_B32_term:
   2369     case AMDGPU::S_ANDN2_B32_term:
   2370     case AMDGPU::S_AND_B32_term:
   2371       break;
   2372     case AMDGPU::SI_IF:
   2373     case AMDGPU::SI_ELSE:
   2374     case AMDGPU::SI_KILL_I1_TERMINATOR:
   2375     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
   2376       // FIXME: It's messy that these need to be considered here at all.
   2377       return true;
   2378     default:
   2379       llvm_unreachable("unexpected non-branch terminator inst");
   2380     }
   2381 
   2382     ++I;
   2383   }
   2384 
   2385   if (I == E)
   2386     return false;
   2387 
   2388   return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
   2389 }
   2390 
   2391 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
   2392                                    int *BytesRemoved) const {
   2393   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
   2394 
   2395   unsigned Count = 0;
   2396   unsigned RemovedSize = 0;
   2397   while (I != MBB.end()) {
   2398     MachineBasicBlock::iterator Next = std::next(I);
   2399     RemovedSize += getInstSizeInBytes(*I);
   2400     I->eraseFromParent();
   2401     ++Count;
   2402     I = Next;
   2403   }
   2404 
   2405   if (BytesRemoved)
   2406     *BytesRemoved = RemovedSize;
   2407 
   2408   return Count;
   2409 }
   2410 
   2411 // Copy the flags onto the implicit condition register operand.
   2412 static void preserveCondRegFlags(MachineOperand &CondReg,
   2413                                  const MachineOperand &OrigCond) {
   2414   CondReg.setIsUndef(OrigCond.isUndef());
   2415   CondReg.setIsKill(OrigCond.isKill());
   2416 }
   2417 
   2418 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
   2419                                    MachineBasicBlock *TBB,
   2420                                    MachineBasicBlock *FBB,
   2421                                    ArrayRef<MachineOperand> Cond,
   2422                                    const DebugLoc &DL,
   2423                                    int *BytesAdded) const {
   2424   if (!FBB && Cond.empty()) {
   2425     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
   2426       .addMBB(TBB);
   2427     if (BytesAdded)
   2428       *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
   2429     return 1;
   2430   }
   2431 
   2432   if(Cond.size() == 1 && Cond[0].isReg()) {
   2433      BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
   2434        .add(Cond[0])
   2435        .addMBB(TBB);
   2436      return 1;
   2437   }
   2438 
   2439   assert(TBB && Cond[0].isImm());
   2440 
   2441   unsigned Opcode
   2442     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
   2443 
   2444   if (!FBB) {
   2445     Cond[1].isUndef();
   2446     MachineInstr *CondBr =
   2447       BuildMI(&MBB, DL, get(Opcode))
   2448       .addMBB(TBB);
   2449 
   2450     // Copy the flags onto the implicit condition register operand.
   2451     preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
   2452     fixImplicitOperands(*CondBr);
   2453 
   2454     if (BytesAdded)
   2455       *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
   2456     return 1;
   2457   }
   2458 
   2459   assert(TBB && FBB);
   2460 
   2461   MachineInstr *CondBr =
   2462     BuildMI(&MBB, DL, get(Opcode))
   2463     .addMBB(TBB);
   2464   fixImplicitOperands(*CondBr);
   2465   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
   2466     .addMBB(FBB);
   2467 
   2468   MachineOperand &CondReg = CondBr->getOperand(1);
   2469   CondReg.setIsUndef(Cond[1].isUndef());
   2470   CondReg.setIsKill(Cond[1].isKill());
   2471 
   2472   if (BytesAdded)
   2473     *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
   2474 
   2475   return 2;
   2476 }
   2477 
   2478 bool SIInstrInfo::reverseBranchCondition(
   2479   SmallVectorImpl<MachineOperand> &Cond) const {
   2480   if (Cond.size() != 2) {
   2481     return true;
   2482   }
   2483 
   2484   if (Cond[0].isImm()) {
   2485     Cond[0].setImm(-Cond[0].getImm());
   2486     return false;
   2487   }
   2488 
   2489   return true;
   2490 }
   2491 
   2492 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
   2493                                   ArrayRef<MachineOperand> Cond,
   2494                                   Register DstReg, Register TrueReg,
   2495                                   Register FalseReg, int &CondCycles,
   2496                                   int &TrueCycles, int &FalseCycles) const {
   2497   switch (Cond[0].getImm()) {
   2498   case VCCNZ:
   2499   case VCCZ: {
   2500     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   2501     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
   2502     if (MRI.getRegClass(FalseReg) != RC)
   2503       return false;
   2504 
   2505     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
   2506     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
   2507 
   2508     // Limit to equal cost for branch vs. N v_cndmask_b32s.
   2509     return RI.hasVGPRs(RC) && NumInsts <= 6;
   2510   }
   2511   case SCC_TRUE:
   2512   case SCC_FALSE: {
   2513     // FIXME: We could insert for VGPRs if we could replace the original compare
   2514     // with a vector one.
   2515     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   2516     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
   2517     if (MRI.getRegClass(FalseReg) != RC)
   2518       return false;
   2519 
   2520     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
   2521 
   2522     // Multiples of 8 can do s_cselect_b64
   2523     if (NumInsts % 2 == 0)
   2524       NumInsts /= 2;
   2525 
   2526     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
   2527     return RI.isSGPRClass(RC);
   2528   }
   2529   default:
   2530     return false;
   2531   }
   2532 }
   2533 
   2534 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
   2535                                MachineBasicBlock::iterator I, const DebugLoc &DL,
   2536                                Register DstReg, ArrayRef<MachineOperand> Cond,
   2537                                Register TrueReg, Register FalseReg) const {
   2538   BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
   2539   if (Pred == VCCZ || Pred == SCC_FALSE) {
   2540     Pred = static_cast<BranchPredicate>(-Pred);
   2541     std::swap(TrueReg, FalseReg);
   2542   }
   2543 
   2544   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   2545   const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
   2546   unsigned DstSize = RI.getRegSizeInBits(*DstRC);
   2547 
   2548   if (DstSize == 32) {
   2549     MachineInstr *Select;
   2550     if (Pred == SCC_TRUE) {
   2551       Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
   2552         .addReg(TrueReg)
   2553         .addReg(FalseReg);
   2554     } else {
   2555       // Instruction's operands are backwards from what is expected.
   2556       Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
   2557         .addReg(FalseReg)
   2558         .addReg(TrueReg);
   2559     }
   2560 
   2561     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
   2562     return;
   2563   }
   2564 
   2565   if (DstSize == 64 && Pred == SCC_TRUE) {
   2566     MachineInstr *Select =
   2567       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
   2568       .addReg(TrueReg)
   2569       .addReg(FalseReg);
   2570 
   2571     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
   2572     return;
   2573   }
   2574 
   2575   static const int16_t Sub0_15[] = {
   2576     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
   2577     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
   2578     AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
   2579     AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
   2580   };
   2581 
   2582   static const int16_t Sub0_15_64[] = {
   2583     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
   2584     AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
   2585     AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
   2586     AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
   2587   };
   2588 
   2589   unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
   2590   const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
   2591   const int16_t *SubIndices = Sub0_15;
   2592   int NElts = DstSize / 32;
   2593 
   2594   // 64-bit select is only available for SALU.
   2595   // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
   2596   if (Pred == SCC_TRUE) {
   2597     if (NElts % 2) {
   2598       SelOp = AMDGPU::S_CSELECT_B32;
   2599       EltRC = &AMDGPU::SGPR_32RegClass;
   2600     } else {
   2601       SelOp = AMDGPU::S_CSELECT_B64;
   2602       EltRC = &AMDGPU::SGPR_64RegClass;
   2603       SubIndices = Sub0_15_64;
   2604       NElts /= 2;
   2605     }
   2606   }
   2607 
   2608   MachineInstrBuilder MIB = BuildMI(
   2609     MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
   2610 
   2611   I = MIB->getIterator();
   2612 
   2613   SmallVector<Register, 8> Regs;
   2614   for (int Idx = 0; Idx != NElts; ++Idx) {
   2615     Register DstElt = MRI.createVirtualRegister(EltRC);
   2616     Regs.push_back(DstElt);
   2617 
   2618     unsigned SubIdx = SubIndices[Idx];
   2619 
   2620     MachineInstr *Select;
   2621     if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
   2622       Select =
   2623         BuildMI(MBB, I, DL, get(SelOp), DstElt)
   2624         .addReg(FalseReg, 0, SubIdx)
   2625         .addReg(TrueReg, 0, SubIdx);
   2626     } else {
   2627       Select =
   2628         BuildMI(MBB, I, DL, get(SelOp), DstElt)
   2629         .addReg(TrueReg, 0, SubIdx)
   2630         .addReg(FalseReg, 0, SubIdx);
   2631     }
   2632 
   2633     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
   2634     fixImplicitOperands(*Select);
   2635 
   2636     MIB.addReg(DstElt)
   2637        .addImm(SubIdx);
   2638   }
   2639 }
   2640 
   2641 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
   2642   switch (MI.getOpcode()) {
   2643   case AMDGPU::V_MOV_B32_e32:
   2644   case AMDGPU::V_MOV_B32_e64:
   2645   case AMDGPU::V_MOV_B64_PSEUDO: {
   2646     // If there are additional implicit register operands, this may be used for
   2647     // register indexing so the source register operand isn't simply copied.
   2648     unsigned NumOps = MI.getDesc().getNumOperands() +
   2649       MI.getDesc().getNumImplicitUses();
   2650 
   2651     return MI.getNumOperands() == NumOps;
   2652   }
   2653   case AMDGPU::S_MOV_B32:
   2654   case AMDGPU::S_MOV_B64:
   2655   case AMDGPU::COPY:
   2656   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
   2657   case AMDGPU::V_ACCVGPR_READ_B32_e64:
   2658   case AMDGPU::V_ACCVGPR_MOV_B32:
   2659     return true;
   2660   default:
   2661     return false;
   2662   }
   2663 }
   2664 
   2665 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
   2666     unsigned Kind) const {
   2667   switch(Kind) {
   2668   case PseudoSourceValue::Stack:
   2669   case PseudoSourceValue::FixedStack:
   2670     return AMDGPUAS::PRIVATE_ADDRESS;
   2671   case PseudoSourceValue::ConstantPool:
   2672   case PseudoSourceValue::GOT:
   2673   case PseudoSourceValue::JumpTable:
   2674   case PseudoSourceValue::GlobalValueCallEntry:
   2675   case PseudoSourceValue::ExternalSymbolCallEntry:
   2676   case PseudoSourceValue::TargetCustom:
   2677     return AMDGPUAS::CONSTANT_ADDRESS;
   2678   }
   2679   return AMDGPUAS::FLAT_ADDRESS;
   2680 }
   2681 
   2682 static void removeModOperands(MachineInstr &MI) {
   2683   unsigned Opc = MI.getOpcode();
   2684   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
   2685                                               AMDGPU::OpName::src0_modifiers);
   2686   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
   2687                                               AMDGPU::OpName::src1_modifiers);
   2688   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
   2689                                               AMDGPU::OpName::src2_modifiers);
   2690 
   2691   MI.RemoveOperand(Src2ModIdx);
   2692   MI.RemoveOperand(Src1ModIdx);
   2693   MI.RemoveOperand(Src0ModIdx);
   2694 }
   2695 
   2696 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
   2697                                 Register Reg, MachineRegisterInfo *MRI) const {
   2698   if (!MRI->hasOneNonDBGUse(Reg))
   2699     return false;
   2700 
   2701   switch (DefMI.getOpcode()) {
   2702   default:
   2703     return false;
   2704   case AMDGPU::S_MOV_B64:
   2705     // TODO: We could fold 64-bit immediates, but this get compilicated
   2706     // when there are sub-registers.
   2707     return false;
   2708 
   2709   case AMDGPU::V_MOV_B32_e32:
   2710   case AMDGPU::S_MOV_B32:
   2711   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
   2712     break;
   2713   }
   2714 
   2715   const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
   2716   assert(ImmOp);
   2717   // FIXME: We could handle FrameIndex values here.
   2718   if (!ImmOp->isImm())
   2719     return false;
   2720 
   2721   unsigned Opc = UseMI.getOpcode();
   2722   if (Opc == AMDGPU::COPY) {
   2723     Register DstReg = UseMI.getOperand(0).getReg();
   2724     bool Is16Bit = getOpSize(UseMI, 0) == 2;
   2725     bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
   2726     unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
   2727     APInt Imm(32, ImmOp->getImm());
   2728 
   2729     if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
   2730       Imm = Imm.ashr(16);
   2731 
   2732     if (RI.isAGPR(*MRI, DstReg)) {
   2733       if (!isInlineConstant(Imm))
   2734         return false;
   2735       NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
   2736     }
   2737 
   2738     if (Is16Bit) {
   2739        if (isVGPRCopy)
   2740          return false; // Do not clobber vgpr_hi16
   2741 
   2742        if (DstReg.isVirtual() &&
   2743            UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
   2744          return false;
   2745 
   2746       UseMI.getOperand(0).setSubReg(0);
   2747       if (DstReg.isPhysical()) {
   2748         DstReg = RI.get32BitRegister(DstReg);
   2749         UseMI.getOperand(0).setReg(DstReg);
   2750       }
   2751       assert(UseMI.getOperand(1).getReg().isVirtual());
   2752     }
   2753 
   2754     UseMI.setDesc(get(NewOpc));
   2755     UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
   2756     UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
   2757     return true;
   2758   }
   2759 
   2760   if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
   2761       Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
   2762       Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
   2763       Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) {
   2764     // Don't fold if we are using source or output modifiers. The new VOP2
   2765     // instructions don't have them.
   2766     if (hasAnyModifiersSet(UseMI))
   2767       return false;
   2768 
   2769     // If this is a free constant, there's no reason to do this.
   2770     // TODO: We could fold this here instead of letting SIFoldOperands do it
   2771     // later.
   2772     MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
   2773 
   2774     // Any src operand can be used for the legality check.
   2775     if (isInlineConstant(UseMI, *Src0, *ImmOp))
   2776       return false;
   2777 
   2778     bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
   2779                  Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
   2780     bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
   2781                  Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64;
   2782     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
   2783     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
   2784 
   2785     // Multiplied part is the constant: Use v_madmk_{f16, f32}.
   2786     // We should only expect these to be on src0 due to canonicalizations.
   2787     if (Src0->isReg() && Src0->getReg() == Reg) {
   2788       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
   2789         return false;
   2790 
   2791       if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
   2792         return false;
   2793 
   2794       unsigned NewOpc =
   2795         IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16)
   2796               : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
   2797       if (pseudoToMCOpcode(NewOpc) == -1)
   2798         return false;
   2799 
   2800       // We need to swap operands 0 and 1 since madmk constant is at operand 1.
   2801 
   2802       const int64_t Imm = ImmOp->getImm();
   2803 
   2804       // FIXME: This would be a lot easier if we could return a new instruction
   2805       // instead of having to modify in place.
   2806 
   2807       // Remove these first since they are at the end.
   2808       UseMI.RemoveOperand(
   2809           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
   2810       UseMI.RemoveOperand(
   2811           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
   2812 
   2813       Register Src1Reg = Src1->getReg();
   2814       unsigned Src1SubReg = Src1->getSubReg();
   2815       Src0->setReg(Src1Reg);
   2816       Src0->setSubReg(Src1SubReg);
   2817       Src0->setIsKill(Src1->isKill());
   2818 
   2819       if (Opc == AMDGPU::V_MAC_F32_e64 ||
   2820           Opc == AMDGPU::V_MAC_F16_e64 ||
   2821           Opc == AMDGPU::V_FMAC_F32_e64 ||
   2822           Opc == AMDGPU::V_FMAC_F16_e64)
   2823         UseMI.untieRegOperand(
   2824             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
   2825 
   2826       Src1->ChangeToImmediate(Imm);
   2827 
   2828       removeModOperands(UseMI);
   2829       UseMI.setDesc(get(NewOpc));
   2830 
   2831       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
   2832       if (DeleteDef)
   2833         DefMI.eraseFromParent();
   2834 
   2835       return true;
   2836     }
   2837 
   2838     // Added part is the constant: Use v_madak_{f16, f32}.
   2839     if (Src2->isReg() && Src2->getReg() == Reg) {
   2840       // Not allowed to use constant bus for another operand.
   2841       // We can however allow an inline immediate as src0.
   2842       bool Src0Inlined = false;
   2843       if (Src0->isReg()) {
   2844         // Try to inline constant if possible.
   2845         // If the Def moves immediate and the use is single
   2846         // We are saving VGPR here.
   2847         MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
   2848         if (Def && Def->isMoveImmediate() &&
   2849           isInlineConstant(Def->getOperand(1)) &&
   2850           MRI->hasOneUse(Src0->getReg())) {
   2851           Src0->ChangeToImmediate(Def->getOperand(1).getImm());
   2852           Src0Inlined = true;
   2853         } else if ((Src0->getReg().isPhysical() &&
   2854                     (ST.getConstantBusLimit(Opc) <= 1 &&
   2855                      RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
   2856                    (Src0->getReg().isVirtual() &&
   2857                     (ST.getConstantBusLimit(Opc) <= 1 &&
   2858                      RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
   2859           return false;
   2860           // VGPR is okay as Src0 - fallthrough
   2861       }
   2862 
   2863       if (Src1->isReg() && !Src0Inlined ) {
   2864         // We have one slot for inlinable constant so far - try to fill it
   2865         MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
   2866         if (Def && Def->isMoveImmediate() &&
   2867             isInlineConstant(Def->getOperand(1)) &&
   2868             MRI->hasOneUse(Src1->getReg()) &&
   2869             commuteInstruction(UseMI)) {
   2870             Src0->ChangeToImmediate(Def->getOperand(1).getImm());
   2871         } else if ((Src1->getReg().isPhysical() &&
   2872                     RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
   2873                    (Src1->getReg().isVirtual() &&
   2874                     RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
   2875           return false;
   2876           // VGPR is okay as Src1 - fallthrough
   2877       }
   2878 
   2879       unsigned NewOpc =
   2880         IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
   2881               : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
   2882       if (pseudoToMCOpcode(NewOpc) == -1)
   2883         return false;
   2884 
   2885       const int64_t Imm = ImmOp->getImm();
   2886 
   2887       // FIXME: This would be a lot easier if we could return a new instruction
   2888       // instead of having to modify in place.
   2889 
   2890       // Remove these first since they are at the end.
   2891       UseMI.RemoveOperand(
   2892           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
   2893       UseMI.RemoveOperand(
   2894           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
   2895 
   2896       if (Opc == AMDGPU::V_MAC_F32_e64 ||
   2897           Opc == AMDGPU::V_MAC_F16_e64 ||
   2898           Opc == AMDGPU::V_FMAC_F32_e64 ||
   2899           Opc == AMDGPU::V_FMAC_F16_e64)
   2900         UseMI.untieRegOperand(
   2901             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
   2902 
   2903       // ChangingToImmediate adds Src2 back to the instruction.
   2904       Src2->ChangeToImmediate(Imm);
   2905 
   2906       // These come before src2.
   2907       removeModOperands(UseMI);
   2908       UseMI.setDesc(get(NewOpc));
   2909       // It might happen that UseMI was commuted
   2910       // and we now have SGPR as SRC1. If so 2 inlined
   2911       // constant and SGPR are illegal.
   2912       legalizeOperands(UseMI);
   2913 
   2914       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
   2915       if (DeleteDef)
   2916         DefMI.eraseFromParent();
   2917 
   2918       return true;
   2919     }
   2920   }
   2921 
   2922   return false;
   2923 }
   2924 
   2925 static bool
   2926 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
   2927                            ArrayRef<const MachineOperand *> BaseOps2) {
   2928   if (BaseOps1.size() != BaseOps2.size())
   2929     return false;
   2930   for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
   2931     if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
   2932       return false;
   2933   }
   2934   return true;
   2935 }
   2936 
   2937 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
   2938                                 int WidthB, int OffsetB) {
   2939   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
   2940   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
   2941   int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
   2942   return LowOffset + LowWidth <= HighOffset;
   2943 }
   2944 
   2945 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
   2946                                                const MachineInstr &MIb) const {
   2947   SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
   2948   int64_t Offset0, Offset1;
   2949   unsigned Dummy0, Dummy1;
   2950   bool Offset0IsScalable, Offset1IsScalable;
   2951   if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
   2952                                      Dummy0, &RI) ||
   2953       !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
   2954                                      Dummy1, &RI))
   2955     return false;
   2956 
   2957   if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
   2958     return false;
   2959 
   2960   if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
   2961     // FIXME: Handle ds_read2 / ds_write2.
   2962     return false;
   2963   }
   2964   unsigned Width0 = MIa.memoperands().front()->getSize();
   2965   unsigned Width1 = MIb.memoperands().front()->getSize();
   2966   return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
   2967 }
   2968 
   2969 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
   2970                                                   const MachineInstr &MIb) const {
   2971   assert(MIa.mayLoadOrStore() &&
   2972          "MIa must load from or modify a memory location");
   2973   assert(MIb.mayLoadOrStore() &&
   2974          "MIb must load from or modify a memory location");
   2975 
   2976   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
   2977     return false;
   2978 
   2979   // XXX - Can we relax this between address spaces?
   2980   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
   2981     return false;
   2982 
   2983   // TODO: Should we check the address space from the MachineMemOperand? That
   2984   // would allow us to distinguish objects we know don't alias based on the
   2985   // underlying address space, even if it was lowered to a different one,
   2986   // e.g. private accesses lowered to use MUBUF instructions on a scratch
   2987   // buffer.
   2988   if (isDS(MIa)) {
   2989     if (isDS(MIb))
   2990       return checkInstOffsetsDoNotOverlap(MIa, MIb);
   2991 
   2992     return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
   2993   }
   2994 
   2995   if (isMUBUF(MIa) || isMTBUF(MIa)) {
   2996     if (isMUBUF(MIb) || isMTBUF(MIb))
   2997       return checkInstOffsetsDoNotOverlap(MIa, MIb);
   2998 
   2999     return !isFLAT(MIb) && !isSMRD(MIb);
   3000   }
   3001 
   3002   if (isSMRD(MIa)) {
   3003     if (isSMRD(MIb))
   3004       return checkInstOffsetsDoNotOverlap(MIa, MIb);
   3005 
   3006     return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
   3007   }
   3008 
   3009   if (isFLAT(MIa)) {
   3010     if (isFLAT(MIb))
   3011       return checkInstOffsetsDoNotOverlap(MIa, MIb);
   3012 
   3013     return false;
   3014   }
   3015 
   3016   return false;
   3017 }
   3018 
   3019 static int64_t getFoldableImm(const MachineOperand* MO) {
   3020   if (!MO->isReg())
   3021     return false;
   3022   const MachineFunction *MF = MO->getParent()->getParent()->getParent();
   3023   const MachineRegisterInfo &MRI = MF->getRegInfo();
   3024   auto Def = MRI.getUniqueVRegDef(MO->getReg());
   3025   if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
   3026       Def->getOperand(1).isImm())
   3027     return Def->getOperand(1).getImm();
   3028   return AMDGPU::NoRegister;
   3029 }
   3030 
   3031 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
   3032                                 MachineInstr &NewMI) {
   3033   if (LV) {
   3034     unsigned NumOps = MI.getNumOperands();
   3035     for (unsigned I = 1; I < NumOps; ++I) {
   3036       MachineOperand &Op = MI.getOperand(I);
   3037       if (Op.isReg() && Op.isKill())
   3038         LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
   3039     }
   3040   }
   3041 }
   3042 
/// Builds a three-address replacement for a two-address V_MAC / V_FMAC
/// instruction.
///
/// The candidates are tried in this order:
///   1. V_MADAK / V_FMAAK with src2 replaced by a folded immediate,
///   2. V_MADMK / V_FMAMK with src1 replaced by a folded immediate,
///   3. V_MADMK / V_FMAMK with src0 replaced by a folded immediate and the
///      original src1 moved into the src0 slot,
///   4. the plain V_MAD / V_FMA e64 form.
/// Candidates 1-3 are only considered when the instruction has no source
/// modifier operands, no clamp/omod operands, is not the F64 variant, and
/// passes the constant bus check on src0.
///
/// \param MBB block iterator used to place the new instruction at \p MI.
/// \param MI  the MAC/FMAC instruction to convert; it is not removed here.
/// \param LV  optional LiveVariables; when non-null, kill information is
///            moved from \p MI to the new instruction.
/// \returns the newly built instruction, or nullptr if \p MI is not a handled
///          opcode, its e32 src0 cannot be carried over, or the selected
///          opcode has no MC encoding (pseudoToMCOpcode returns -1).
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                                                 MachineInstr &MI,
                                                 LiveVariables *LV) const {
  unsigned Opc = MI.getOpcode();
  bool IsF16 = false;
  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
               Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
               Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
  bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;

  // Classify the opcode. The F16 cases set IsF16 and then share the handling
  // of the corresponding F32/F64 cases via fallthrough.
  switch (Opc) {
  default:
    return nullptr;
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
    IsF16 = true;
    LLVM_FALLTHROUGH;
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_FMAC_F32_e64:
  case AMDGPU::V_FMAC_F64_e64:
    break;
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAC_F16_e32:
    IsF16 = true;
    LLVM_FALLTHROUGH;
  case AMDGPU::V_MAC_F32_e32:
  case AMDGPU::V_FMAC_F32_e32:
  case AMDGPU::V_FMAC_F64_e32: {
    // For the e32 encodings, src0 must be a register or an inline-constant
    // immediate; anything else is rejected.
    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::src0);
    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
    if (!Src0->isReg() && !Src0->isImm())
      return nullptr;

    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
      return nullptr;

    break;
  }
  }

  // Named operands that the opcode does not have come back as null; the
  // modifier/clamp/omod pointers are null-checked before use below.
  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
  const MachineOperand *Src0Mods =
    getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mods =
    getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
  MachineInstrBuilder MIB;

  // Immediate-folding forms: only for instructions without modifier, clamp or
  // omod operands, and never for F64.
  if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 &&
      // If we have an SGPR input, we will violate the constant bus restriction.
      (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
       !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
    // getFoldableImm yields 0 when nothing can be folded, so a zero
    // immediate is never folded by any of the three attempts below.
    if (auto Imm = getFoldableImm(Src2)) {
      // src2 is an immediate: dst = src0 * src1 + Imm.
      unsigned NewOpc =
          IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
                : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
      if (pseudoToMCOpcode(NewOpc) != -1) {
        MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src0)
                  .add(*Src1)
                  .addImm(Imm);
        updateLiveVariables(LV, MI, *MIB);
        return MIB;
      }
    }
    unsigned NewOpc = IsFMA
                          ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
                          : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
    if (auto Imm = getFoldableImm(Src1)) {
      // src1 is an immediate: dst = src0 * Imm + src2.
      if (pseudoToMCOpcode(NewOpc) != -1) {
        MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src0)
                  .addImm(Imm)
                  .add(*Src2);
        updateLiveVariables(LV, MI, *MIB);
        return MIB;
      }
    }
    if (auto Imm = getFoldableImm(Src0)) {
      // src0 is an immediate: swap the multiplicands so the original src1
      // takes the src0 slot, provided it is legal there.
      // NOTE(review): legality is queried against MI using an operand index
      // computed for NewOpc — confirm this is intended.
      if (pseudoToMCOpcode(NewOpc) != -1 &&
          isOperandLegal(
              MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
              Src1)) {
        MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src1)
                  .addImm(Imm)
                  .add(*Src2);
        updateLiveVariables(LV, MI, *MIB);
        return MIB;
      }
    }
  }

  // General case: the full e64 MAD/FMA form of the matching type.
  unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64
                                   : IsF64 ? AMDGPU::V_FMA_F64_e64
                                           : AMDGPU::V_FMA_F32_e64)
                          : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64);
  if (pseudoToMCOpcode(NewOpc) == -1)
    return nullptr;

  // Modifier, clamp and omod values are copied over when the original
  // instruction has those operands, and default to 0 otherwise.
  MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
            .add(*Dst)
            .addImm(Src0Mods ? Src0Mods->getImm() : 0)
            .add(*Src0)
            .addImm(Src1Mods ? Src1Mods->getImm() : 0)
            .add(*Src1)
            .addImm(0) // Modifiers for src2; always emitted as 0.
            .add(*Src2)
            .addImm(Clamp ? Clamp->getImm() : 0)
            .addImm(Omod ? Omod->getImm() : 0);
  updateLiveVariables(LV, MI, *MIB);
  return MIB;
}
   3164 
   3165 // It's not generally safe to move VALU instructions across these since it will
   3166 // start using the register as a base index rather than directly.
   3167 // XXX - Why isn't hasSideEffects sufficient for these?
   3168 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
   3169   switch (MI.getOpcode()) {
   3170   case AMDGPU::S_SET_GPR_IDX_ON:
   3171   case AMDGPU::S_SET_GPR_IDX_MODE:
   3172   case AMDGPU::S_SET_GPR_IDX_OFF:
   3173     return true;
   3174   default:
   3175     return false;
   3176   }
   3177 }
   3178 
   3179 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
   3180                                        const MachineBasicBlock *MBB,
   3181                                        const MachineFunction &MF) const {
   3182   // Skipping the check for SP writes in the base implementation. The reason it
   3183   // was added was apparently due to compile time concerns.
   3184   //
   3185   // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
   3186   // but is probably avoidable.
   3187 
   3188   // Copied from base implementation.
   3189   // Terminators and labels can't be scheduled around.
   3190   if (MI.isTerminator() || MI.isPosition())
   3191     return true;
   3192 
   3193   // INLINEASM_BR can jump to another block
   3194   if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
   3195     return true;
   3196 
   3197   // Target-independent instructions do not have an implicit-use of EXEC, even
   3198   // when they operate on VGPRs. Treating EXEC modifications as scheduling
   3199   // boundaries prevents incorrect movements of such instructions.
   3200   return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
   3201          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
   3202          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
   3203          changesVGPRIndexingMode(MI);
   3204 }
   3205 
   3206 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
   3207   return Opcode == AMDGPU::DS_ORDERED_COUNT ||
   3208          Opcode == AMDGPU::DS_GWS_INIT ||
   3209          Opcode == AMDGPU::DS_GWS_SEMA_V ||
   3210          Opcode == AMDGPU::DS_GWS_SEMA_BR ||
   3211          Opcode == AMDGPU::DS_GWS_SEMA_P ||
   3212          Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
   3213          Opcode == AMDGPU::DS_GWS_BARRIER;
   3214 }
   3215 
   3216 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
   3217   // Skip the full operand and register alias search modifiesRegister
   3218   // does. There's only a handful of instructions that touch this, it's only an
   3219   // implicit def, and doesn't alias any other registers.
   3220   if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) {
   3221     for (; ImpDef && *ImpDef; ++ImpDef) {
   3222       if (*ImpDef == AMDGPU::MODE)
   3223         return true;
   3224     }
   3225   }
   3226 
   3227   return false;
   3228 }
   3229 
   3230 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
   3231   unsigned Opcode = MI.getOpcode();
   3232 
   3233   if (MI.mayStore() && isSMRD(MI))
   3234     return true; // scalar store or atomic
   3235 
   3236   // This will terminate the function when other lanes may need to continue.
   3237   if (MI.isReturn())
   3238     return true;
   3239 
   3240   // These instructions cause shader I/O that may cause hardware lockups
   3241   // when executed with an empty EXEC mask.
   3242   //
   3243   // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
   3244   //       EXEC = 0, but checking for that case here seems not worth it
   3245   //       given the typical code patterns.
   3246   if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
   3247       isEXP(Opcode) ||
   3248       Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
   3249       Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
   3250     return true;
   3251 
   3252   if (MI.isCall() || MI.isInlineAsm())
   3253     return true; // conservative assumption
   3254 
   3255   // A mode change is a scalar operation that influences vector instructions.
   3256   if (modifiesModeRegister(MI))
   3257     return true;
   3258 
   3259   // These are like SALU instructions in terms of effects, so it's questionable
   3260   // whether we should return true for those.
   3261   //
   3262   // However, executing them with EXEC = 0 causes them to operate on undefined
   3263   // data, which we avoid by returning true here.
   3264   if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
   3265       Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32)
   3266     return true;
   3267 
   3268   return false;
   3269 }
   3270 
   3271 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
   3272                               const MachineInstr &MI) const {
   3273   if (MI.isMetaInstruction())
   3274     return false;
   3275 
   3276   // This won't read exec if this is an SGPR->SGPR copy.
   3277   if (MI.isCopyLike()) {
   3278     if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
   3279       return true;
   3280 
   3281     // Make sure this isn't copying exec as a normal operand
   3282     return MI.readsRegister(AMDGPU::EXEC, &RI);
   3283   }
   3284 
   3285   // Make a conservative assumption about the callee.
   3286   if (MI.isCall())
   3287     return true;
   3288 
   3289   // Be conservative with any unhandled generic opcodes.
   3290   if (!isTargetSpecificOpcode(MI.getOpcode()))
   3291     return true;
   3292 
   3293   return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
   3294 }
   3295 
   3296 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
   3297   switch (Imm.getBitWidth()) {
   3298   case 1: // This likely will be a condition code mask.
   3299     return true;
   3300 
   3301   case 32:
   3302     return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
   3303                                         ST.hasInv2PiInlineImm());
   3304   case 64:
   3305     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
   3306                                         ST.hasInv2PiInlineImm());
   3307   case 16:
   3308     return ST.has16BitInsts() &&
   3309            AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
   3310                                         ST.hasInv2PiInlineImm());
   3311   default:
   3312     llvm_unreachable("invalid bitwidth");
   3313   }
   3314 }
   3315 
   3316 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   3317                                    uint8_t OperandType) const {
   3318   if (!MO.isImm() ||
   3319       OperandType < AMDGPU::OPERAND_SRC_FIRST ||
   3320       OperandType > AMDGPU::OPERAND_SRC_LAST)
   3321     return false;
   3322 
   3323   // MachineOperand provides no way to tell the true operand size, since it only
   3324   // records a 64-bit value. We need to know the size to determine if a 32-bit
   3325   // floating point immediate bit pattern is legal for an integer immediate. It
   3326   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
   3327 
   3328   int64_t Imm = MO.getImm();
   3329   switch (OperandType) {
   3330   case AMDGPU::OPERAND_REG_IMM_INT32:
   3331   case AMDGPU::OPERAND_REG_IMM_FP32:
   3332   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
   3333   case AMDGPU::OPERAND_REG_INLINE_C_FP32:
   3334   case AMDGPU::OPERAND_REG_IMM_V2FP32:
   3335   case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
   3336   case AMDGPU::OPERAND_REG_IMM_V2INT32:
   3337   case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
   3338   case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
   3339   case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
   3340     int32_t Trunc = static_cast<int32_t>(Imm);
   3341     return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
   3342   }
   3343   case AMDGPU::OPERAND_REG_IMM_INT64:
   3344   case AMDGPU::OPERAND_REG_IMM_FP64:
   3345   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
   3346   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
   3347   case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
   3348     return AMDGPU::isInlinableLiteral64(MO.getImm(),
   3349                                         ST.hasInv2PiInlineImm());
   3350   case AMDGPU::OPERAND_REG_IMM_INT16:
   3351   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
   3352   case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
   3353     // We would expect inline immediates to not be concerned with an integer/fp
   3354     // distinction. However, in the case of 16-bit integer operations, the
   3355     // "floating point" values appear to not work. It seems read the low 16-bits
   3356     // of 32-bit immediates, which happens to always work for the integer
   3357     // values.
   3358     //
   3359     // See llvm bugzilla 46302.
   3360     //
   3361     // TODO: Theoretically we could use op-sel to use the high bits of the
   3362     // 32-bit FP values.
   3363     return AMDGPU::isInlinableIntLiteral(Imm);
   3364   case AMDGPU::OPERAND_REG_IMM_V2INT16:
   3365   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
   3366   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
   3367     // This suffers the same problem as the scalar 16-bit cases.
   3368     return AMDGPU::isInlinableIntLiteralV216(Imm);
   3369   case AMDGPU::OPERAND_REG_IMM_FP16:
   3370   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
   3371   case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
   3372     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
   3373       // A few special case instructions have 16-bit operands on subtargets
   3374       // where 16-bit instructions are not legal.
   3375       // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
   3376       // constants in these cases
   3377       int16_t Trunc = static_cast<int16_t>(Imm);
   3378       return ST.has16BitInsts() &&
   3379              AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
   3380     }
   3381 
   3382     return false;
   3383   }
   3384   case AMDGPU::OPERAND_REG_IMM_V2FP16:
   3385   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
   3386   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
   3387     uint32_t Trunc = static_cast<uint32_t>(Imm);
   3388     return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
   3389   }
   3390   default:
   3391     llvm_unreachable("invalid bitwidth");
   3392   }
   3393 }
   3394 
   3395 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
   3396                                         const MCOperandInfo &OpInfo) const {
   3397   switch (MO.getType()) {
   3398   case MachineOperand::MO_Register:
   3399     return false;
   3400   case MachineOperand::MO_Immediate:
   3401     return !isInlineConstant(MO, OpInfo);
   3402   case MachineOperand::MO_FrameIndex:
   3403   case MachineOperand::MO_MachineBasicBlock:
   3404   case MachineOperand::MO_ExternalSymbol:
   3405   case MachineOperand::MO_GlobalAddress:
   3406   case MachineOperand::MO_MCSymbol:
   3407     return true;
   3408   default:
   3409     llvm_unreachable("unexpected operand type");
   3410   }
   3411 }
   3412 
   3413 static bool compareMachineOp(const MachineOperand &Op0,
   3414                              const MachineOperand &Op1) {
   3415   if (Op0.getType() != Op1.getType())
   3416     return false;
   3417 
   3418   switch (Op0.getType()) {
   3419   case MachineOperand::MO_Register:
   3420     return Op0.getReg() == Op1.getReg();
   3421   case MachineOperand::MO_Immediate:
   3422     return Op0.getImm() == Op1.getImm();
   3423   default:
   3424     llvm_unreachable("Didn't expect to be comparing these operand types");
   3425   }
   3426 }
   3427 
   3428 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
   3429                                     const MachineOperand &MO) const {
   3430   const MCInstrDesc &InstDesc = MI.getDesc();
   3431   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
   3432 
   3433   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
   3434 
   3435   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
   3436     return true;
   3437 
   3438   if (OpInfo.RegClass < 0)
   3439     return false;
   3440 
   3441   if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
   3442     if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
   3443         OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
   3444                                                     AMDGPU::OpName::src2))
   3445       return false;
   3446     return RI.opCanUseInlineConstant(OpInfo.OperandType);
   3447   }
   3448 
   3449   if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
   3450     return false;
   3451 
   3452   if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
   3453     return true;
   3454 
   3455   return ST.hasVOP3Literal();
   3456 }
   3457 
   3458 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
   3459   // GFX90A does not have V_MUL_LEGACY_F32_e32.
   3460   if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
   3461     return false;
   3462 
   3463   int Op32 = AMDGPU::getVOPe32(Opcode);
   3464   if (Op32 == -1)
   3465     return false;
   3466 
   3467   return pseudoToMCOpcode(Op32) != -1;
   3468 }
   3469 
   3470 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
   3471   // The src0_modifier operand is present on all instructions
   3472   // that have modifiers.
   3473 
   3474   return AMDGPU::getNamedOperandIdx(Opcode,
   3475                                     AMDGPU::OpName::src0_modifiers) != -1;
   3476 }
   3477 
   3478 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
   3479                                   unsigned OpName) const {
   3480   const MachineOperand *Mods = getNamedOperand(MI, OpName);
   3481   return Mods && Mods->getImm();
   3482 }
   3483 
   3484 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
   3485   return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
   3486          hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
   3487          hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
   3488          hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
   3489          hasModifiersSet(MI, AMDGPU::OpName::omod);
   3490 }
   3491 
   3492 bool SIInstrInfo::canShrink(const MachineInstr &MI,
   3493                             const MachineRegisterInfo &MRI) const {
   3494   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
   3495   // Can't shrink instruction with three operands.
   3496   // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
   3497   // a special case for it.  It can only be shrunk if the third operand
   3498   // is vcc, and src0_modifiers and src1_modifiers are not set.
   3499   // We should handle this the same way we handle vopc, by addding
   3500   // a register allocation hint pre-regalloc and then do the shrinking
   3501   // post-regalloc.
   3502   if (Src2) {
   3503     switch (MI.getOpcode()) {
   3504       default: return false;
   3505 
   3506       case AMDGPU::V_ADDC_U32_e64:
   3507       case AMDGPU::V_SUBB_U32_e64:
   3508       case AMDGPU::V_SUBBREV_U32_e64: {
   3509         const MachineOperand *Src1
   3510           = getNamedOperand(MI, AMDGPU::OpName::src1);
   3511         if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
   3512           return false;
   3513         // Additional verification is needed for sdst/src2.
   3514         return true;
   3515       }
   3516       case AMDGPU::V_MAC_F32_e64:
   3517       case AMDGPU::V_MAC_F16_e64:
   3518       case AMDGPU::V_FMAC_F32_e64:
   3519       case AMDGPU::V_FMAC_F16_e64:
   3520       case AMDGPU::V_FMAC_F64_e64:
   3521         if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
   3522             hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
   3523           return false;
   3524         break;
   3525 
   3526       case AMDGPU::V_CNDMASK_B32_e64:
   3527         break;
   3528     }
   3529   }
   3530 
   3531   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
   3532   if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
   3533                hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
   3534     return false;
   3535 
   3536   // We don't need to check src0, all input types are legal, so just make sure
   3537   // src0 isn't using any modifiers.
   3538   if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
   3539     return false;
   3540 
   3541   // Can it be shrunk to a valid 32 bit opcode?
   3542   if (!hasVALU32BitEncoding(MI.getOpcode()))
   3543     return false;
   3544 
   3545   // Check output modifiers
   3546   return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
   3547          !hasModifiersSet(MI, AMDGPU::OpName::clamp);
   3548 }
   3549 
   3550 // Set VCC operand with all flags from \p Orig, except for setting it as
   3551 // implicit.
   3552 static void copyFlagsToImplicitVCC(MachineInstr &MI,
   3553                                    const MachineOperand &Orig) {
   3554 
   3555   for (MachineOperand &Use : MI.implicit_operands()) {
   3556     if (Use.isUse() &&
   3557         (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
   3558       Use.setIsUndef(Orig.isUndef());
   3559       Use.setIsKill(Orig.isKill());
   3560       return;
   3561     }
   3562   }
   3563 }
   3564 
   3565 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
   3566                                            unsigned Op32) const {
   3567   MachineBasicBlock *MBB = MI.getParent();;
   3568   MachineInstrBuilder Inst32 =
   3569     BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
   3570     .setMIFlags(MI.getFlags());
   3571 
   3572   // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
   3573   // For VOPC instructions, this is replaced by an implicit def of vcc.
   3574   int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
   3575   if (Op32DstIdx != -1) {
   3576     // dst
   3577     Inst32.add(MI.getOperand(0));
   3578   } else {
   3579     assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
   3580             (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
   3581            "Unexpected case");
   3582   }
   3583 
   3584   Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
   3585 
   3586   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
   3587   if (Src1)
   3588     Inst32.add(*Src1);
   3589 
   3590   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
   3591 
   3592   if (Src2) {
   3593     int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
   3594     if (Op32Src2Idx != -1) {
   3595       Inst32.add(*Src2);
   3596     } else {
   3597       // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
   3598       // replaced with an implicit read of vcc or vcc_lo. The implicit read
   3599       // of vcc was already added during the initial BuildMI, but we
   3600       // 1) may need to change vcc to vcc_lo to preserve the original register
   3601       // 2) have to preserve the original flags.
   3602       fixImplicitOperands(*Inst32);
   3603       copyFlagsToImplicitVCC(*Inst32, *Src2);
   3604     }
   3605   }
   3606 
   3607   return Inst32;
   3608 }
   3609 
   3610 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
   3611                                   const MachineOperand &MO,
   3612                                   const MCOperandInfo &OpInfo) const {
   3613   // Literal constants use the constant bus.
   3614   //if (isLiteralConstantLike(MO, OpInfo))
   3615   // return true;
   3616   if (MO.isImm())
   3617     return !isInlineConstant(MO, OpInfo);
   3618 
   3619   if (!MO.isReg())
   3620     return true; // Misc other operands like FrameIndex
   3621 
   3622   if (!MO.isUse())
   3623     return false;
   3624 
   3625   if (MO.getReg().isVirtual())
   3626     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
   3627 
   3628   // Null is free
   3629   if (MO.getReg() == AMDGPU::SGPR_NULL)
   3630     return false;
   3631 
   3632   // SGPRs use the constant bus
   3633   if (MO.isImplicit()) {
   3634     return MO.getReg() == AMDGPU::M0 ||
   3635            MO.getReg() == AMDGPU::VCC ||
   3636            MO.getReg() == AMDGPU::VCC_LO;
   3637   } else {
   3638     return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
   3639            AMDGPU::SReg_64RegClass.contains(MO.getReg());
   3640   }
   3641 }
   3642 
   3643 static Register findImplicitSGPRRead(const MachineInstr &MI) {
   3644   for (const MachineOperand &MO : MI.implicit_operands()) {
   3645     // We only care about reads.
   3646     if (MO.isDef())
   3647       continue;
   3648 
   3649     switch (MO.getReg()) {
   3650     case AMDGPU::VCC:
   3651     case AMDGPU::VCC_LO:
   3652     case AMDGPU::VCC_HI:
   3653     case AMDGPU::M0:
   3654     case AMDGPU::FLAT_SCR:
   3655       return MO.getReg();
   3656 
   3657     default:
   3658       break;
   3659     }
   3660   }
   3661 
   3662   return AMDGPU::NoRegister;
   3663 }
   3664 
   3665 static bool shouldReadExec(const MachineInstr &MI) {
   3666   if (SIInstrInfo::isVALU(MI)) {
   3667     switch (MI.getOpcode()) {
   3668     case AMDGPU::V_READLANE_B32:
   3669     case AMDGPU::V_WRITELANE_B32:
   3670       return false;
   3671     }
   3672 
   3673     return true;
   3674   }
   3675 
   3676   if (MI.isPreISelOpcode() ||
   3677       SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
   3678       SIInstrInfo::isSALU(MI) ||
   3679       SIInstrInfo::isSMRD(MI))
   3680     return false;
   3681 
   3682   return true;
   3683 }
   3684 
   3685 static bool isSubRegOf(const SIRegisterInfo &TRI,
   3686                        const MachineOperand &SuperVec,
   3687                        const MachineOperand &SubReg) {
   3688   if (SubReg.getReg().isPhysical())
   3689     return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
   3690 
   3691   return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
   3692          SubReg.getReg() == SuperVec.getReg();
   3693 }
   3694 
   3695 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
   3696                                     StringRef &ErrInfo) const {
   3697   uint16_t Opcode = MI.getOpcode();
   3698   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
   3699     return true;
   3700 
   3701   const MachineFunction *MF = MI.getParent()->getParent();
   3702   const MachineRegisterInfo &MRI = MF->getRegInfo();
   3703 
   3704   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
   3705   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
   3706   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
   3707 
   3708   // Make sure the number of operands is correct.
   3709   const MCInstrDesc &Desc = get(Opcode);
   3710   if (!Desc.isVariadic() &&
   3711       Desc.getNumOperands() != MI.getNumExplicitOperands()) {
   3712     ErrInfo = "Instruction has wrong number of operands.";
   3713     return false;
   3714   }
   3715 
   3716   if (MI.isInlineAsm()) {
   3717     // Verify register classes for inlineasm constraints.
   3718     for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
   3719          I != E; ++I) {
   3720       const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
   3721       if (!RC)
   3722         continue;
   3723 
   3724       const MachineOperand &Op = MI.getOperand(I);
   3725       if (!Op.isReg())
   3726         continue;
   3727 
   3728       Register Reg = Op.getReg();
   3729       if (!Reg.isVirtual() && !RC->contains(Reg)) {
   3730         ErrInfo = "inlineasm operand has incorrect register class.";
   3731         return false;
   3732       }
   3733     }
   3734 
   3735     return true;
   3736   }
   3737 
   3738   if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
   3739     ErrInfo = "missing memory operand from MIMG instruction.";
   3740     return false;
   3741   }
   3742 
   3743   // Make sure the register classes are correct.
   3744   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
   3745     const MachineOperand &MO = MI.getOperand(i);
   3746     if (MO.isFPImm()) {
   3747       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
   3748                 "all fp values to integers.";
   3749       return false;
   3750     }
   3751 
   3752     int RegClass = Desc.OpInfo[i].RegClass;
   3753 
   3754     switch (Desc.OpInfo[i].OperandType) {
   3755     case MCOI::OPERAND_REGISTER:
   3756       if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
   3757         ErrInfo = "Illegal immediate value for operand.";
   3758         return false;
   3759       }
   3760       break;
   3761     case AMDGPU::OPERAND_REG_IMM_INT32:
   3762     case AMDGPU::OPERAND_REG_IMM_FP32:
   3763       break;
   3764     case AMDGPU::OPERAND_REG_INLINE_C_INT32:
   3765     case AMDGPU::OPERAND_REG_INLINE_C_FP32:
   3766     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
   3767     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
   3768     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
   3769     case AMDGPU::OPERAND_REG_INLINE_C_FP16:
   3770     case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
   3771     case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
   3772     case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
   3773     case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
   3774     case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
   3775       if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
   3776         ErrInfo = "Illegal immediate value for operand.";
   3777         return false;
   3778       }
   3779       break;
   3780     }
   3781     case MCOI::OPERAND_IMMEDIATE:
   3782     case AMDGPU::OPERAND_KIMM32:
   3783       // Check if this operand is an immediate.
   3784       // FrameIndex operands will be replaced by immediates, so they are
   3785       // allowed.
   3786       if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
   3787         ErrInfo = "Expected immediate, but got non-immediate";
   3788         return false;
   3789       }
   3790       LLVM_FALLTHROUGH;
   3791     default:
   3792       continue;
   3793     }
   3794 
   3795     if (!MO.isReg())
   3796       continue;
   3797     Register Reg = MO.getReg();
   3798     if (!Reg)
   3799       continue;
   3800 
   3801     // FIXME: Ideally we would have separate instruction definitions with the
   3802     // aligned register constraint.
   3803     // FIXME: We do not verify inline asm operands, but custom inline asm
   3804     // verification is broken anyway
   3805     if (ST.needsAlignedVGPRs()) {
   3806       const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
   3807       const bool IsVGPR = RI.hasVGPRs(RC);
   3808       const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC);
   3809       if ((IsVGPR || IsAGPR) && MO.getSubReg()) {
   3810         const TargetRegisterClass *SubRC =
   3811             RI.getSubRegClass(RC, MO.getSubReg());
   3812         RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
   3813         if (RC)
   3814           RC = SubRC;
   3815       }
   3816 
   3817       // Check that this is the aligned version of the class.
   3818       if (!RC || !RI.isProperlyAlignedRC(*RC)) {
   3819         ErrInfo = "Subtarget requires even aligned vector registers";
   3820         return false;
   3821       }
   3822     }
   3823 
   3824     if (RegClass != -1) {
   3825       if (Reg.isVirtual())
   3826         continue;
   3827 
   3828       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
   3829       if (!RC->contains(Reg)) {
   3830         ErrInfo = "Operand has incorrect register class.";
   3831         return false;
   3832       }
   3833     }
   3834   }
   3835 
   3836   // Verify SDWA
   3837   if (isSDWA(MI)) {
   3838     if (!ST.hasSDWA()) {
   3839       ErrInfo = "SDWA is not supported on this target";
   3840       return false;
   3841     }
   3842 
   3843     int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
   3844 
   3845     const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
   3846 
   3847     for (int OpIdx: OpIndicies) {
   3848       if (OpIdx == -1)
   3849         continue;
   3850       const MachineOperand &MO = MI.getOperand(OpIdx);
   3851 
   3852       if (!ST.hasSDWAScalar()) {
   3853         // Only VGPRS on VI
   3854         if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
   3855           ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
   3856           return false;
   3857         }
   3858       } else {
   3859         // No immediates on GFX9
   3860         if (!MO.isReg()) {
   3861           ErrInfo =
   3862             "Only reg allowed as operands in SDWA instructions on GFX9+";
   3863           return false;
   3864         }
   3865       }
   3866     }
   3867 
   3868     if (!ST.hasSDWAOmod()) {
   3869       // No omod allowed on VI
   3870       const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
   3871       if (OMod != nullptr &&
   3872         (!OMod->isImm() || OMod->getImm() != 0)) {
   3873         ErrInfo = "OMod not allowed in SDWA instructions on VI";
   3874         return false;
   3875       }
   3876     }
   3877 
   3878     uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
   3879     if (isVOPC(BasicOpcode)) {
   3880       if (!ST.hasSDWASdst() && DstIdx != -1) {
   3881         // Only vcc allowed as dst on VI for VOPC
   3882         const MachineOperand &Dst = MI.getOperand(DstIdx);
   3883         if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
   3884           ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
   3885           return false;
   3886         }
   3887       } else if (!ST.hasSDWAOutModsVOPC()) {
   3888         // No clamp allowed on GFX9 for VOPC
   3889         const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
   3890         if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
   3891           ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
   3892           return false;
   3893         }
   3894 
   3895         // No omod allowed on GFX9 for VOPC
   3896         const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
   3897         if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
   3898           ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
   3899           return false;
   3900         }
   3901       }
   3902     }
   3903 
   3904     const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
   3905     if (DstUnused && DstUnused->isImm() &&
   3906         DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
   3907       const MachineOperand &Dst = MI.getOperand(DstIdx);
   3908       if (!Dst.isReg() || !Dst.isTied()) {
   3909         ErrInfo = "Dst register should have tied register";
   3910         return false;
   3911       }
   3912 
   3913       const MachineOperand &TiedMO =
   3914           MI.getOperand(MI.findTiedOperandIdx(DstIdx));
   3915       if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
   3916         ErrInfo =
   3917             "Dst register should be tied to implicit use of preserved register";
   3918         return false;
   3919       } else if (TiedMO.getReg().isPhysical() &&
   3920                  Dst.getReg() != TiedMO.getReg()) {
   3921         ErrInfo = "Dst register should use same physical register as preserved";
   3922         return false;
   3923       }
   3924     }
   3925   }
   3926 
   3927   // Verify MIMG
   3928   if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
   3929     // Ensure that the return type used is large enough for all the options
   3930     // being used TFE/LWE require an extra result register.
   3931     const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
   3932     if (DMask) {
   3933       uint64_t DMaskImm = DMask->getImm();
   3934       uint32_t RegCount =
   3935           isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
   3936       const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
   3937       const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
   3938       const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
   3939 
   3940       // Adjust for packed 16 bit values
   3941       if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
   3942         RegCount >>= 1;
   3943 
   3944       // Adjust if using LWE or TFE
   3945       if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
   3946         RegCount += 1;
   3947 
   3948       const uint32_t DstIdx =
   3949           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
   3950       const MachineOperand &Dst = MI.getOperand(DstIdx);
   3951       if (Dst.isReg()) {
   3952         const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
   3953         uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
   3954         if (RegCount > DstSize) {
   3955           ErrInfo = "MIMG instruction returns too many registers for dst "
   3956                     "register class";
   3957           return false;
   3958         }
   3959       }
   3960     }
   3961   }
   3962 
   3963   // Verify VOP*. Ignore multiple sgpr operands on writelane.
   3964   if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
   3965       && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
   3966     // Only look at the true operands. Only a real operand can use the constant
   3967     // bus, and we don't want to check pseudo-operands like the source modifier
   3968     // flags.
   3969     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
   3970 
   3971     unsigned ConstantBusCount = 0;
   3972     bool UsesLiteral = false;
   3973     const MachineOperand *LiteralVal = nullptr;
   3974 
   3975     if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
   3976       ++ConstantBusCount;
   3977 
   3978     SmallVector<Register, 2> SGPRsUsed;
   3979     Register SGPRUsed;
   3980 
   3981     for (int OpIdx : OpIndices) {
   3982       if (OpIdx == -1)
   3983         break;
   3984       const MachineOperand &MO = MI.getOperand(OpIdx);
   3985       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
   3986         if (MO.isReg()) {
   3987           SGPRUsed = MO.getReg();
   3988           if (llvm::all_of(SGPRsUsed, [SGPRUsed](unsigned SGPR) {
   3989                 return SGPRUsed != SGPR;
   3990               })) {
   3991             ++ConstantBusCount;
   3992             SGPRsUsed.push_back(SGPRUsed);
   3993           }
   3994         } else {
   3995           if (!UsesLiteral) {
   3996             ++ConstantBusCount;
   3997             UsesLiteral = true;
   3998             LiteralVal = &MO;
   3999           } else if (!MO.isIdenticalTo(*LiteralVal)) {
   4000             assert(isVOP3(MI));
   4001             ErrInfo = "VOP3 instruction uses more than one literal";
   4002             return false;
   4003           }
   4004         }
   4005       }
   4006     }
   4007 
   4008     SGPRUsed = findImplicitSGPRRead(MI);
   4009     if (SGPRUsed != AMDGPU::NoRegister) {
   4010       // Implicit uses may safely overlap true overands
   4011       if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
   4012             return !RI.regsOverlap(SGPRUsed, SGPR);
   4013           })) {
   4014         ++ConstantBusCount;
   4015         SGPRsUsed.push_back(SGPRUsed);
   4016       }
   4017     }
   4018 
   4019     // v_writelane_b32 is an exception from constant bus restriction:
   4020     // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
   4021     if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
   4022         Opcode != AMDGPU::V_WRITELANE_B32) {
   4023       ErrInfo = "VOP* instruction violates constant bus restriction";
   4024       return false;
   4025     }
   4026 
   4027     if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
   4028       ErrInfo = "VOP3 instruction uses literal";
   4029       return false;
   4030     }
   4031   }
   4032 
   4033   // Special case for writelane - this can break the multiple constant bus rule,
   4034   // but still can't use more than one SGPR register
   4035   if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
   4036     unsigned SGPRCount = 0;
   4037     Register SGPRUsed = AMDGPU::NoRegister;
   4038 
   4039     for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
   4040       if (OpIdx == -1)
   4041         break;
   4042 
   4043       const MachineOperand &MO = MI.getOperand(OpIdx);
   4044 
   4045       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
   4046         if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
   4047           if (MO.getReg() != SGPRUsed)
   4048             ++SGPRCount;
   4049           SGPRUsed = MO.getReg();
   4050         }
   4051       }
   4052       if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
   4053         ErrInfo = "WRITELANE instruction violates constant bus restriction";
   4054         return false;
   4055       }
   4056     }
   4057   }
   4058 
   4059   // Verify misc. restrictions on specific instructions.
   4060   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
   4061       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
   4062     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
   4063     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
   4064     const MachineOperand &Src2 = MI.getOperand(Src2Idx);
   4065     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
   4066       if (!compareMachineOp(Src0, Src1) &&
   4067           !compareMachineOp(Src0, Src2)) {
   4068         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
   4069         return false;
   4070       }
   4071     }
   4072     if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
   4073          SISrcMods::ABS) ||
   4074         (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
   4075          SISrcMods::ABS) ||
   4076         (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
   4077          SISrcMods::ABS)) {
   4078       ErrInfo = "ABS not allowed in VOP3B instructions";
   4079       return false;
   4080     }
   4081   }
   4082 
   4083   if (isSOP2(MI) || isSOPC(MI)) {
   4084     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
   4085     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
   4086     unsigned Immediates = 0;
   4087 
   4088     if (!Src0.isReg() &&
   4089         !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType))
   4090       Immediates++;
   4091     if (!Src1.isReg() &&
   4092         !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType))
   4093       Immediates++;
   4094 
   4095     if (Immediates > 1) {
   4096       ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
   4097       return false;
   4098     }
   4099   }
   4100 
   4101   if (isSOPK(MI)) {
   4102     auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
   4103     if (Desc.isBranch()) {
   4104       if (!Op->isMBB()) {
   4105         ErrInfo = "invalid branch target for SOPK instruction";
   4106         return false;
   4107       }
   4108     } else {
   4109       uint64_t Imm = Op->getImm();
   4110       if (sopkIsZext(MI)) {
   4111         if (!isUInt<16>(Imm)) {
   4112           ErrInfo = "invalid immediate for SOPK instruction";
   4113           return false;
   4114         }
   4115       } else {
   4116         if (!isInt<16>(Imm)) {
   4117           ErrInfo = "invalid immediate for SOPK instruction";
   4118           return false;
   4119         }
   4120       }
   4121     }
   4122   }
   4123 
   4124   if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
   4125       Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
   4126       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
   4127       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
   4128     const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
   4129                        Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
   4130 
   4131     const unsigned StaticNumOps = Desc.getNumOperands() +
   4132       Desc.getNumImplicitUses();
   4133     const unsigned NumImplicitOps = IsDst ? 2 : 1;
   4134 
   4135     // Allow additional implicit operands. This allows a fixup done by the post
   4136     // RA scheduler where the main implicit operand is killed and implicit-defs
   4137     // are added for sub-registers that remain live after this instruction.
   4138     if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
   4139       ErrInfo = "missing implicit register operands";
   4140       return false;
   4141     }
   4142 
   4143     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
   4144     if (IsDst) {
   4145       if (!Dst->isUse()) {
   4146         ErrInfo = "v_movreld_b32 vdst should be a use operand";
   4147         return false;
   4148       }
   4149 
   4150       unsigned UseOpIdx;
   4151       if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
   4152           UseOpIdx != StaticNumOps + 1) {
   4153         ErrInfo = "movrel implicit operands should be tied";
   4154         return false;
   4155       }
   4156     }
   4157 
   4158     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
   4159     const MachineOperand &ImpUse
   4160       = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
   4161     if (!ImpUse.isReg() || !ImpUse.isUse() ||
   4162         !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
   4163       ErrInfo = "src0 should be subreg of implicit vector use";
   4164       return false;
   4165     }
   4166   }
   4167 
   4168   // Make sure we aren't losing exec uses in the td files. This mostly requires
   4169   // being careful when using let Uses to try to add other use registers.
   4170   if (shouldReadExec(MI)) {
   4171     if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
   4172       ErrInfo = "VALU instruction does not implicitly read exec mask";
   4173       return false;
   4174     }
   4175   }
   4176 
   4177   if (isSMRD(MI)) {
   4178     if (MI.mayStore()) {
   4179       // The register offset form of scalar stores may only use m0 as the
   4180       // soffset register.
   4181       const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
   4182       if (Soff && Soff->getReg() != AMDGPU::M0) {
   4183         ErrInfo = "scalar stores must use m0 as offset register";
   4184         return false;
   4185       }
   4186     }
   4187   }
   4188 
   4189   if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
   4190     const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
   4191     if (Offset->getImm() != 0) {
   4192       ErrInfo = "subtarget does not support offsets in flat instructions";
   4193       return false;
   4194     }
   4195   }
   4196 
   4197   if (isMIMG(MI)) {
   4198     const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
   4199     if (DimOp) {
   4200       int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
   4201                                                  AMDGPU::OpName::vaddr0);
   4202       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
   4203       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
   4204       const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
   4205           AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
   4206       const AMDGPU::MIMGDimInfo *Dim =
   4207           AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
   4208 
   4209       if (!Dim) {
   4210         ErrInfo = "dim is out of range";
   4211         return false;
   4212       }
   4213 
   4214       bool IsA16 = false;
   4215       if (ST.hasR128A16()) {
   4216         const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
   4217         IsA16 = R128A16->getImm() != 0;
   4218       } else if (ST.hasGFX10A16()) {
   4219         const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
   4220         IsA16 = A16->getImm() != 0;
   4221       }
   4222 
   4223       bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
   4224 
   4225       unsigned AddrWords =
   4226           AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
   4227 
   4228       unsigned VAddrWords;
   4229       if (IsNSA) {
   4230         VAddrWords = SRsrcIdx - VAddr0Idx;
   4231       } else {
   4232         const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
   4233         VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
   4234         if (AddrWords > 8)
   4235           AddrWords = 16;
   4236         else if (AddrWords > 4)
   4237           AddrWords = 8;
   4238         else if (AddrWords == 4)
   4239           AddrWords = 4;
   4240         else if (AddrWords == 3)
   4241           AddrWords = 3;
   4242       }
   4243 
   4244       if (VAddrWords != AddrWords) {
   4245         LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
   4246                           << " but got " << VAddrWords << "\n");
   4247         ErrInfo = "bad vaddr size";
   4248         return false;
   4249       }
   4250     }
   4251   }
   4252 
   4253   const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
   4254   if (DppCt) {
   4255     using namespace AMDGPU::DPP;
   4256 
   4257     unsigned DC = DppCt->getImm();
   4258     if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
   4259         DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
   4260         (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
   4261         (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
   4262         (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
   4263         (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
   4264         (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
   4265       ErrInfo = "Invalid dpp_ctrl value";
   4266       return false;
   4267     }
   4268     if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
   4269         ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
   4270       ErrInfo = "Invalid dpp_ctrl value: "
   4271                 "wavefront shifts are not supported on GFX10+";
   4272       return false;
   4273     }
   4274     if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
   4275         ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
   4276       ErrInfo = "Invalid dpp_ctrl value: "
   4277                 "broadcasts are not supported on GFX10+";
   4278       return false;
   4279     }
   4280     if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
   4281         ST.getGeneration() < AMDGPUSubtarget::GFX10) {
   4282       if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
   4283           DC <= DppCtrl::ROW_NEWBCAST_LAST &&
   4284           !ST.hasGFX90AInsts()) {
   4285         ErrInfo = "Invalid dpp_ctrl value: "
   4286                   "row_newbroadcast/row_share is not supported before "
   4287                   "GFX90A/GFX10";
   4288         return false;
   4289       } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
   4290         ErrInfo = "Invalid dpp_ctrl value: "
   4291                   "row_share and row_xmask are not supported before GFX10";
   4292         return false;
   4293       }
   4294     }
   4295 
   4296     int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
   4297     int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
   4298 
   4299     if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
   4300         ((DstIdx >= 0 &&
   4301           (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID ||
   4302            Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) ||
   4303          ((Src0Idx >= 0 &&
   4304            (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID ||
   4305             Desc.OpInfo[Src0Idx].RegClass ==
   4306                 AMDGPU::VReg_64_Align2RegClassID)))) &&
   4307         !AMDGPU::isLegal64BitDPPControl(DC)) {
   4308       ErrInfo = "Invalid dpp_ctrl value: "
   4309                 "64 bit dpp only support row_newbcast";
   4310       return false;
   4311     }
   4312   }
   4313 
   4314   if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
   4315     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
   4316     uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
   4317                                         : AMDGPU::OpName::vdata;
   4318     const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
   4319     const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
   4320     if (Data && !Data->isReg())
   4321       Data = nullptr;
   4322 
   4323     if (ST.hasGFX90AInsts()) {
   4324       if (Dst && Data &&
   4325           (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
   4326         ErrInfo = "Invalid register class: "
   4327                   "vdata and vdst should be both VGPR or AGPR";
   4328         return false;
   4329       }
   4330       if (Data && Data2 &&
   4331           (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
   4332         ErrInfo = "Invalid register class: "
   4333                   "both data operands should be VGPR or AGPR";
   4334         return false;
   4335       }
   4336     } else {
   4337       if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
   4338           (Data && RI.isAGPR(MRI, Data->getReg())) ||
   4339           (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
   4340         ErrInfo = "Invalid register class: "
   4341                   "agpr loads and stores not supported on this GPU";
   4342         return false;
   4343       }
   4344     }
   4345   }
   4346 
   4347   return true;
   4348 }
   4349 
   4350 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   4351   switch (MI.getOpcode()) {
   4352   default: return AMDGPU::INSTRUCTION_LIST_END;
   4353   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
   4354   case AMDGPU::COPY: return AMDGPU::COPY;
   4355   case AMDGPU::PHI: return AMDGPU::PHI;
   4356   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
   4357   case AMDGPU::WQM: return AMDGPU::WQM;
   4358   case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
   4359   case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
   4360   case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
   4361   case AMDGPU::S_MOV_B32: {
   4362     const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   4363     return MI.getOperand(1).isReg() ||
   4364            RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
   4365            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
   4366   }
   4367   case AMDGPU::S_ADD_I32:
   4368     return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
   4369   case AMDGPU::S_ADDC_U32:
   4370     return AMDGPU::V_ADDC_U32_e32;
   4371   case AMDGPU::S_SUB_I32:
   4372     return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
   4373     // FIXME: These are not consistently handled, and selected when the carry is
   4374     // used.
   4375   case AMDGPU::S_ADD_U32:
   4376     return AMDGPU::V_ADD_CO_U32_e32;
   4377   case AMDGPU::S_SUB_U32:
   4378     return AMDGPU::V_SUB_CO_U32_e32;
   4379   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
   4380   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
   4381   case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
   4382   case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
   4383   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
   4384   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
   4385   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
   4386   case AMDGPU::S_XNOR_B32:
   4387     return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
   4388   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
   4389   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
   4390   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
   4391   case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
   4392   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
   4393   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
   4394   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
   4395   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
   4396   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
   4397   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
   4398   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
   4399   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
   4400   case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
   4401   case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
   4402   case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
   4403   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
   4404   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
   4405   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
   4406   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
   4407   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
   4408   case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
   4409   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
   4410   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
   4411   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
   4412   case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
   4413   case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
   4414   case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
   4415   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
   4416   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
   4417   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
   4418   case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
   4419   case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
   4420   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
   4421   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
   4422   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
   4423   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
   4424   case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
   4425   case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
   4426   }
   4427   llvm_unreachable(
   4428       "Unexpected scalar opcode without corresponding vector one!");
   4429 }
   4430 
   4431 static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST,
   4432                                           const MachineRegisterInfo &MRI,
   4433                                           const MCInstrDesc &TID,
   4434                                           unsigned RCID,
   4435                                           bool IsAllocatable) {
   4436   if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
   4437       (TID.mayLoad() || TID.mayStore() ||
   4438       (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
   4439     switch (RCID) {
   4440     case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID;
   4441     case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID;
   4442     case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID;
   4443     case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID;
   4444     case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID;
   4445     default:
   4446       break;
   4447     }
   4448   }
   4449   return RCID;
   4450 }
   4451 
   4452 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
   4453     unsigned OpNum, const TargetRegisterInfo *TRI,
   4454     const MachineFunction &MF)
   4455   const {
   4456   if (OpNum >= TID.getNumOperands())
   4457     return nullptr;
   4458   auto RegClass = TID.OpInfo[OpNum].RegClass;
   4459   bool IsAllocatable = false;
   4460   if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
   4461     // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
   4462     // with two data operands. Request register class constainted to VGPR only
   4463     // of both operands present as Machine Copy Propagation can not check this
   4464     // constraint and possibly other passes too.
   4465     //
   4466     // The check is limited to FLAT and DS because atomics in non-flat encoding
   4467     // have their vdst and vdata tied to be the same register.
   4468     const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
   4469                                                    AMDGPU::OpName::vdst);
   4470     const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
   4471         (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
   4472                                          : AMDGPU::OpName::vdata);
   4473     if (DataIdx != -1) {
   4474       IsAllocatable = VDstIdx != -1 ||
   4475                       AMDGPU::getNamedOperandIdx(TID.Opcode,
   4476                                                  AMDGPU::OpName::data1) != -1;
   4477     }
   4478   }
   4479   RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass,
   4480                                        IsAllocatable);
   4481   return RI.getRegClass(RegClass);
   4482 }
   4483 
   4484 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
   4485                                                       unsigned OpNo) const {
   4486   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   4487   const MCInstrDesc &Desc = get(MI.getOpcode());
   4488   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
   4489       Desc.OpInfo[OpNo].RegClass == -1) {
   4490     Register Reg = MI.getOperand(OpNo).getReg();
   4491 
   4492     if (Reg.isVirtual())
   4493       return MRI.getRegClass(Reg);
   4494     return RI.getPhysRegClass(Reg);
   4495   }
   4496 
   4497   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
   4498   RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true);
   4499   return RI.getRegClass(RCID);
   4500 }
   4501 
   4502 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
   4503   MachineBasicBlock::iterator I = MI;
   4504   MachineBasicBlock *MBB = MI.getParent();
   4505   MachineOperand &MO = MI.getOperand(OpIdx);
   4506   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   4507   unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
   4508   const TargetRegisterClass *RC = RI.getRegClass(RCID);
   4509   unsigned Size = RI.getRegSizeInBits(*RC);
   4510   unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
   4511   if (MO.isReg())
   4512     Opcode = AMDGPU::COPY;
   4513   else if (RI.isSGPRClass(RC))
   4514     Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
   4515 
   4516   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
   4517   const TargetRegisterClass *VRC64 = RI.getVGPR64Class();
   4518   if (RI.getCommonSubClass(VRC64, VRC))
   4519     VRC = VRC64;
   4520   else
   4521     VRC = &AMDGPU::VGPR_32RegClass;
   4522 
   4523   Register Reg = MRI.createVirtualRegister(VRC);
   4524   DebugLoc DL = MBB->findDebugLoc(I);
   4525   BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
   4526   MO.ChangeToRegister(Reg, false);
   4527 }
   4528 
   4529 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
   4530                                          MachineRegisterInfo &MRI,
   4531                                          MachineOperand &SuperReg,
   4532                                          const TargetRegisterClass *SuperRC,
   4533                                          unsigned SubIdx,
   4534                                          const TargetRegisterClass *SubRC)
   4535                                          const {
   4536   MachineBasicBlock *MBB = MI->getParent();
   4537   DebugLoc DL = MI->getDebugLoc();
   4538   Register SubReg = MRI.createVirtualRegister(SubRC);
   4539 
   4540   if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
   4541     BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
   4542       .addReg(SuperReg.getReg(), 0, SubIdx);
   4543     return SubReg;
   4544   }
   4545 
   4546   // Just in case the super register is itself a sub-register, copy it to a new
   4547   // value so we don't need to worry about merging its subreg index with the
   4548   // SubIdx passed to this function. The register coalescer should be able to
   4549   // eliminate this extra copy.
   4550   Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
   4551 
   4552   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
   4553     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
   4554 
   4555   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
   4556     .addReg(NewSuperReg, 0, SubIdx);
   4557 
   4558   return SubReg;
   4559 }
   4560 
   4561 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
   4562   MachineBasicBlock::iterator MII,
   4563   MachineRegisterInfo &MRI,
   4564   MachineOperand &Op,
   4565   const TargetRegisterClass *SuperRC,
   4566   unsigned SubIdx,
   4567   const TargetRegisterClass *SubRC) const {
   4568   if (Op.isImm()) {
   4569     if (SubIdx == AMDGPU::sub0)
   4570       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
   4571     if (SubIdx == AMDGPU::sub1)
   4572       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
   4573 
   4574     llvm_unreachable("Unhandled register index for immediate");
   4575   }
   4576 
   4577   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
   4578                                        SubIdx, SubRC);
   4579   return MachineOperand::CreateReg(SubReg, false);
   4580 }
   4581 
   4582 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
   4583 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
   4584   assert(Inst.getNumExplicitOperands() == 3);
   4585   MachineOperand Op1 = Inst.getOperand(1);
   4586   Inst.RemoveOperand(1);
   4587   Inst.addOperand(Op1);
   4588 }
   4589 
   4590 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
   4591                                     const MCOperandInfo &OpInfo,
   4592                                     const MachineOperand &MO) const {
   4593   if (!MO.isReg())
   4594     return false;
   4595 
   4596   Register Reg = MO.getReg();
   4597 
   4598   const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
   4599   if (Reg.isPhysical())
   4600     return DRC->contains(Reg);
   4601 
   4602   const TargetRegisterClass *RC = MRI.getRegClass(Reg);
   4603 
   4604   if (MO.getSubReg()) {
   4605     const MachineFunction *MF = MO.getParent()->getParent()->getParent();
   4606     const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
   4607     if (!SuperRC)
   4608       return false;
   4609 
   4610     DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
   4611     if (!DRC)
   4612       return false;
   4613   }
   4614   return RC->hasSuperClassEq(DRC);
   4615 }
   4616 
   4617 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
   4618                                      const MCOperandInfo &OpInfo,
   4619                                      const MachineOperand &MO) const {
   4620   if (MO.isReg())
   4621     return isLegalRegOperand(MRI, OpInfo, MO);
   4622 
   4623   // Handle non-register types that are treated like immediates.
   4624   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
   4625   return true;
   4626 }
   4627 
   4628 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
   4629                                  const MachineOperand *MO) const {
   4630   const MachineFunction &MF = *MI.getParent()->getParent();
   4631   const MachineRegisterInfo &MRI = MF.getRegInfo();
   4632   const MCInstrDesc &InstDesc = MI.getDesc();
   4633   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
   4634   const TargetRegisterClass *DefinedRC =
   4635       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
   4636   if (!MO)
   4637     MO = &MI.getOperand(OpIdx);
   4638 
   4639   int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
   4640   int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
   4641   if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
   4642     if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--)
   4643       return false;
   4644 
   4645     SmallDenseSet<RegSubRegPair> SGPRsUsed;
   4646     if (MO->isReg())
   4647       SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
   4648 
   4649     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
   4650       if (i == OpIdx)
   4651         continue;
   4652       const MachineOperand &Op = MI.getOperand(i);
   4653       if (Op.isReg()) {
   4654         RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
   4655         if (!SGPRsUsed.count(SGPR) &&
   4656             usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
   4657           if (--ConstantBusLimit <= 0)
   4658             return false;
   4659           SGPRsUsed.insert(SGPR);
   4660         }
   4661       } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
   4662         if (--ConstantBusLimit <= 0)
   4663           return false;
   4664       } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) &&
   4665                  isLiteralConstantLike(Op, InstDesc.OpInfo[i])) {
   4666         if (!VOP3LiteralLimit--)
   4667           return false;
   4668         if (--ConstantBusLimit <= 0)
   4669           return false;
   4670       }
   4671     }
   4672   }
   4673 
   4674   if (MO->isReg()) {
   4675     assert(DefinedRC);
   4676     if (!isLegalRegOperand(MRI, OpInfo, *MO))
   4677       return false;
   4678     bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
   4679     if (IsAGPR && !ST.hasMAIInsts())
   4680       return false;
   4681     unsigned Opc = MI.getOpcode();
   4682     if (IsAGPR &&
   4683         (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
   4684         (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
   4685       return false;
   4686     // Atomics should have both vdst and vdata either vgpr or agpr.
   4687     const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
   4688     const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
   4689         isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
   4690     if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
   4691         MI.getOperand(DataIdx).isReg() &&
   4692         RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
   4693       return false;
   4694     if ((int)OpIdx == DataIdx) {
   4695       if (VDstIdx != -1 &&
   4696           RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
   4697         return false;
   4698       // DS instructions with 2 src operands also must have tied RC.
   4699       const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
   4700                                                       AMDGPU::OpName::data1);
   4701       if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
   4702           RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
   4703         return false;
   4704     }
   4705     if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
   4706         (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
   4707         RI.isSGPRReg(MRI, MO->getReg()))
   4708       return false;
   4709     return true;
   4710   }
   4711 
   4712   // Handle non-register types that are treated like immediates.
   4713   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
   4714 
   4715   if (!DefinedRC) {
   4716     // This operand expects an immediate.
   4717     return true;
   4718   }
   4719 
   4720   return isImmOperandLegal(MI, OpIdx, *MO);
   4721 }
   4722 
   4723 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
   4724                                        MachineInstr &MI) const {
   4725   unsigned Opc = MI.getOpcode();
   4726   const MCInstrDesc &InstrDesc = get(Opc);
   4727 
   4728   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
   4729   MachineOperand &Src0 = MI.getOperand(Src0Idx);
   4730 
   4731   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
   4732   MachineOperand &Src1 = MI.getOperand(Src1Idx);
   4733 
   4734   // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
   4735   // we need to only have one constant bus use before GFX10.
   4736   bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
   4737   if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 &&
   4738       Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) ||
   4739        isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx])))
   4740     legalizeOpWithMove(MI, Src0Idx);
   4741 
   4742   // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
   4743   // both the value to write (src0) and lane select (src1).  Fix up non-SGPR
   4744   // src0/src1 with V_READFIRSTLANE.
   4745   if (Opc == AMDGPU::V_WRITELANE_B32) {
   4746     const DebugLoc &DL = MI.getDebugLoc();
   4747     if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
   4748       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
   4749       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
   4750           .add(Src0);
   4751       Src0.ChangeToRegister(Reg, false);
   4752     }
   4753     if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
   4754       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
   4755       const DebugLoc &DL = MI.getDebugLoc();
   4756       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
   4757           .add(Src1);
   4758       Src1.ChangeToRegister(Reg, false);
   4759     }
   4760     return;
   4761   }
   4762 
   4763   // No VOP2 instructions support AGPRs.
   4764   if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
   4765     legalizeOpWithMove(MI, Src0Idx);
   4766 
   4767   if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
   4768     legalizeOpWithMove(MI, Src1Idx);
   4769 
   4770   // VOP2 src0 instructions support all operand types, so we don't need to check
   4771   // their legality. If src1 is already legal, we don't need to do anything.
   4772   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
   4773     return;
   4774 
   4775   // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
   4776   // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
   4777   // select is uniform.
   4778   if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
   4779       RI.isVGPR(MRI, Src1.getReg())) {
   4780     Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
   4781     const DebugLoc &DL = MI.getDebugLoc();
   4782     BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
   4783         .add(Src1);
   4784     Src1.ChangeToRegister(Reg, false);
   4785     return;
   4786   }
   4787 
   4788   // We do not use commuteInstruction here because it is too aggressive and will
   4789   // commute if it is possible. We only want to commute here if it improves
   4790   // legality. This can be called a fairly large number of times so don't waste
   4791   // compile time pointlessly swapping and checking legality again.
   4792   if (HasImplicitSGPR || !MI.isCommutable()) {
   4793     legalizeOpWithMove(MI, Src1Idx);
   4794     return;
   4795   }
   4796 
   4797   // If src0 can be used as src1, commuting will make the operands legal.
   4798   // Otherwise we have to give up and insert a move.
   4799   //
   4800   // TODO: Other immediate-like operand kinds could be commuted if there was a
   4801   // MachineOperand::ChangeTo* for them.
   4802   if ((!Src1.isImm() && !Src1.isReg()) ||
   4803       !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
   4804     legalizeOpWithMove(MI, Src1Idx);
   4805     return;
   4806   }
   4807 
   4808   int CommutedOpc = commuteOpcode(MI);
   4809   if (CommutedOpc == -1) {
   4810     legalizeOpWithMove(MI, Src1Idx);
   4811     return;
   4812   }
   4813 
   4814   MI.setDesc(get(CommutedOpc));
   4815 
   4816   Register Src0Reg = Src0.getReg();
   4817   unsigned Src0SubReg = Src0.getSubReg();
   4818   bool Src0Kill = Src0.isKill();
   4819 
   4820   if (Src1.isImm())
   4821     Src0.ChangeToImmediate(Src1.getImm());
   4822   else if (Src1.isReg()) {
   4823     Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
   4824     Src0.setSubReg(Src1.getSubReg());
   4825   } else
   4826     llvm_unreachable("Should only have register or immediate operands");
   4827 
   4828   Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
   4829   Src1.setSubReg(Src0SubReg);
   4830   fixImplicitOperands(MI);
   4831 }
   4832 
   4833 // Legalize VOP3 operands. All operand types are supported for any operand
   4834 // but only one literal constant and only starting from GFX10.
   4835 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
   4836                                        MachineInstr &MI) const {
   4837   unsigned Opc = MI.getOpcode();
   4838 
   4839   int VOP3Idx[3] = {
   4840     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
   4841     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
   4842     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
   4843   };
   4844 
   4845   if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
   4846       Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
   4847     // src1 and src2 must be scalar
   4848     MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
   4849     MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
   4850     const DebugLoc &DL = MI.getDebugLoc();
   4851     if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
   4852       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
   4853       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
   4854         .add(Src1);
   4855       Src1.ChangeToRegister(Reg, false);
   4856     }
   4857     if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
   4858       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
   4859       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
   4860         .add(Src2);
   4861       Src2.ChangeToRegister(Reg, false);
   4862     }
   4863   }
   4864 
   4865   // Find the one SGPR operand we are allowed to use.
   4866   int ConstantBusLimit = ST.getConstantBusLimit(Opc);
   4867   int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
   4868   SmallDenseSet<unsigned> SGPRsUsed;
   4869   Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
   4870   if (SGPRReg != AMDGPU::NoRegister) {
   4871     SGPRsUsed.insert(SGPRReg);
   4872     --ConstantBusLimit;
   4873   }
   4874 
   4875   for (unsigned i = 0; i < 3; ++i) {
   4876     int Idx = VOP3Idx[i];
   4877     if (Idx == -1)
   4878       break;
   4879     MachineOperand &MO = MI.getOperand(Idx);
   4880 
   4881     if (!MO.isReg()) {
   4882       if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx]))
   4883         continue;
   4884 
   4885       if (LiteralLimit > 0 && ConstantBusLimit > 0) {
   4886         --LiteralLimit;
   4887         --ConstantBusLimit;
   4888         continue;
   4889       }
   4890 
   4891       --LiteralLimit;
   4892       --ConstantBusLimit;
   4893       legalizeOpWithMove(MI, Idx);
   4894       continue;
   4895     }
   4896 
   4897     if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
   4898         !isOperandLegal(MI, Idx, &MO)) {
   4899       legalizeOpWithMove(MI, Idx);
   4900       continue;
   4901     }
   4902 
   4903     if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
   4904       continue; // VGPRs are legal
   4905 
   4906     // We can use one SGPR in each VOP3 instruction prior to GFX10
   4907     // and two starting from GFX10.
   4908     if (SGPRsUsed.count(MO.getReg()))
   4909       continue;
   4910     if (ConstantBusLimit > 0) {
   4911       SGPRsUsed.insert(MO.getReg());
   4912       --ConstantBusLimit;
   4913       continue;
   4914     }
   4915 
   4916     // If we make it this far, then the operand is not legal and we must
   4917     // legalize it.
   4918     legalizeOpWithMove(MI, Idx);
   4919   }
   4920 }
   4921 
   4922 Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
   4923                                          MachineRegisterInfo &MRI) const {
   4924   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
   4925   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
   4926   Register DstReg = MRI.createVirtualRegister(SRC);
   4927   unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
   4928 
   4929   if (RI.hasAGPRs(VRC)) {
   4930     VRC = RI.getEquivalentVGPRClass(VRC);
   4931     Register NewSrcReg = MRI.createVirtualRegister(VRC);
   4932     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
   4933             get(TargetOpcode::COPY), NewSrcReg)
   4934         .addReg(SrcReg);
   4935     SrcReg = NewSrcReg;
   4936   }
   4937 
   4938   if (SubRegs == 1) {
   4939     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
   4940             get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
   4941         .addReg(SrcReg);
   4942     return DstReg;
   4943   }
   4944 
   4945   SmallVector<unsigned, 8> SRegs;
   4946   for (unsigned i = 0; i < SubRegs; ++i) {
   4947     Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
   4948     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
   4949             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
   4950         .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
   4951     SRegs.push_back(SGPR);
   4952   }
   4953 
   4954   MachineInstrBuilder MIB =
   4955       BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
   4956               get(AMDGPU::REG_SEQUENCE), DstReg);
   4957   for (unsigned i = 0; i < SubRegs; ++i) {
   4958     MIB.addReg(SRegs[i]);
   4959     MIB.addImm(RI.getSubRegFromChannel(i));
   4960   }
   4961   return DstReg;
   4962 }
   4963 
   4964 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
   4965                                        MachineInstr &MI) const {
   4966 
   4967   // If the pointer is store in VGPRs, then we need to move them to
   4968   // SGPRs using v_readfirstlane.  This is safe because we only select
   4969   // loads with uniform pointers to SMRD instruction so we know the
   4970   // pointer value is uniform.
   4971   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
   4972   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
   4973     Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
   4974     SBase->setReg(SGPR);
   4975   }
   4976   MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
   4977   if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
   4978     Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
   4979     SOff->setReg(SGPR);
   4980   }
   4981 }
   4982 
   4983 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
   4984   unsigned Opc = Inst.getOpcode();
   4985   int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
   4986   if (OldSAddrIdx < 0)
   4987     return false;
   4988 
   4989   assert(isSegmentSpecificFLAT(Inst));
   4990 
   4991   int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
   4992   if (NewOpc < 0)
   4993     NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
   4994   if (NewOpc < 0)
   4995     return false;
   4996 
   4997   MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
   4998   MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
   4999   if (RI.isSGPRReg(MRI, SAddr.getReg()))
   5000     return false;
   5001 
   5002   int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
   5003   if (NewVAddrIdx < 0)
   5004     return false;
   5005 
   5006   int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
   5007 
   5008   // Check vaddr, it shall be zero or absent.
   5009   MachineInstr *VAddrDef = nullptr;
   5010   if (OldVAddrIdx >= 0) {
   5011     MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
   5012     VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
   5013     if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
   5014         !VAddrDef->getOperand(1).isImm() ||
   5015         VAddrDef->getOperand(1).getImm() != 0)
   5016       return false;
   5017   }
   5018 
   5019   const MCInstrDesc &NewDesc = get(NewOpc);
   5020   Inst.setDesc(NewDesc);
   5021 
   5022   // Callers expect interator to be valid after this call, so modify the
   5023   // instruction in place.
   5024   if (OldVAddrIdx == NewVAddrIdx) {
   5025     MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
   5026     // Clear use list from the old vaddr holding a zero register.
   5027     MRI.removeRegOperandFromUseList(&NewVAddr);
   5028     MRI.moveOperands(&NewVAddr, &SAddr, 1);
   5029     Inst.RemoveOperand(OldSAddrIdx);
   5030     // Update the use list with the pointer we have just moved from vaddr to
   5031     // saddr poisition. Otherwise new vaddr will be missing from the use list.
   5032     MRI.removeRegOperandFromUseList(&NewVAddr);
   5033     MRI.addRegOperandToUseList(&NewVAddr);
   5034   } else {
   5035     assert(OldSAddrIdx == NewVAddrIdx);
   5036 
   5037     if (OldVAddrIdx >= 0) {
   5038       int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
   5039                                                  AMDGPU::OpName::vdst_in);
   5040 
   5041       // RemoveOperand doesn't try to fixup tied operand indexes at it goes, so
   5042       // it asserts. Untie the operands for now and retie them afterwards.
   5043       if (NewVDstIn != -1) {
   5044         int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
   5045         Inst.untieRegOperand(OldVDstIn);
   5046       }
   5047 
   5048       Inst.RemoveOperand(OldVAddrIdx);
   5049 
   5050       if (NewVDstIn != -1) {
   5051         int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
   5052         Inst.tieOperands(NewVDst, NewVDstIn);
   5053       }
   5054     }
   5055   }
   5056 
   5057   if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
   5058     VAddrDef->eraseFromParent();
   5059 
   5060   return true;
   5061 }
   5062 
   5063 // FIXME: Remove this when SelectionDAG is obsoleted.
   5064 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
   5065                                        MachineInstr &MI) const {
   5066   if (!isSegmentSpecificFLAT(MI))
   5067     return;
   5068 
   5069   // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
   5070   // thinks they are uniform, so a readfirstlane should be valid.
   5071   MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
   5072   if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
   5073     return;
   5074 
   5075   if (moveFlatAddrToVGPR(MI))
   5076     return;
   5077 
   5078   Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
   5079   SAddr->setReg(ToSGPR);
   5080 }
   5081 
   5082 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
   5083                                          MachineBasicBlock::iterator I,
   5084                                          const TargetRegisterClass *DstRC,
   5085                                          MachineOperand &Op,
   5086                                          MachineRegisterInfo &MRI,
   5087                                          const DebugLoc &DL) const {
   5088   Register OpReg = Op.getReg();
   5089   unsigned OpSubReg = Op.getSubReg();
   5090 
   5091   const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
   5092       RI.getRegClassForReg(MRI, OpReg), OpSubReg);
   5093 
   5094   // Check if operand is already the correct register class.
   5095   if (DstRC == OpRC)
   5096     return;
   5097 
   5098   Register DstReg = MRI.createVirtualRegister(DstRC);
   5099   MachineInstr *Copy =
   5100       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
   5101 
   5102   Op.setReg(DstReg);
   5103   Op.setSubReg(0);
   5104 
   5105   MachineInstr *Def = MRI.getVRegDef(OpReg);
   5106   if (!Def)
   5107     return;
   5108 
   5109   // Try to eliminate the copy if it is copying an immediate value.
   5110   if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
   5111     FoldImmediate(*Copy, *Def, OpReg, &MRI);
   5112 
   5113   bool ImpDef = Def->isImplicitDef();
   5114   while (!ImpDef && Def && Def->isCopy()) {
   5115     if (Def->getOperand(1).getReg().isPhysical())
   5116       break;
   5117     Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
   5118     ImpDef = Def && Def->isImplicitDef();
   5119   }
   5120   if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
   5121       !ImpDef)
   5122     Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
   5123 }
   5124 
   5125 // Emit the actual waterfall loop into \p LoopBB, executing the wrapped
   5126 // instruction for each unique value of \p Rsrc across all lanes (best case 1
   5127 // iteration, worst case 64, once per lane). \p Rsrc is rewritten to the SGPR copy.
   5128 static void
   5129 emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
   5130                           MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
   5131                           const DebugLoc &DL, MachineOperand &Rsrc) {
   5132   MachineFunction &MF = *OrigBB.getParent();
   5133   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   5134   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   5135   unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   5136   unsigned SaveExecOpc =
   5137       ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
   5138   unsigned XorTermOpc =
   5139       ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
   5140   unsigned AndOpc =
   5141       ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
   5142   const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
   5143 
   5144   MachineBasicBlock::iterator I = LoopBB.begin(); // Emit ahead of whatever LoopBB already holds.
   5145 
   5146   SmallVector<Register, 8> ReadlanePieces; // Uniform 32-bit pieces, in channel order.
   5147   Register CondReg = AMDGPU::NoRegister; // Running AND of the per-piece compare results.
   5148 
   5149   Register VRsrc = Rsrc.getReg();
   5150   unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
   5151 
   5152   unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI);
   5153   unsigned NumSubRegs =  RegSize / 32; // Number of 32-bit channels in Rsrc.
   5154   assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size");
   5155 
   5156   for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) { // Two channels (one 64-bit compare) per step.
   5157 
   5158     Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
   5159     Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
   5160 
   5161     // Read the next variant <- also loop target.
   5162     BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
   5163             .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx));
   5164 
   5165     // Read the high 32 bits (channel Idx + 1) of the same 64-bit piece.
   5166     BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
   5167             .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1));
   5168 
   5169     ReadlanePieces.push_back(CurRegLo);
   5170     ReadlanePieces.push_back(CurRegHi);
   5171 
   5172     // Comparison is to be done as 64-bit.
   5173     Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
   5174     BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
   5175             .addReg(CurRegLo)
   5176             .addImm(AMDGPU::sub0)
   5177             .addReg(CurRegHi)
   5178             .addImm(AMDGPU::sub1);
   5179 
   5180     Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
   5181     auto Cmp =
   5182         BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg)
   5183             .addReg(CurReg);
   5184     if (NumSubRegs <= 2)
   5185       Cmp.addReg(VRsrc); // Rsrc is a single 64-bit piece: compare the whole register.
   5186     else
   5187       Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2));
   5188 
   5189     // Combine the comparison results with AND.
   5190     if (CondReg == AMDGPU::NoRegister) // First.
   5191       CondReg = NewCondReg;
   5192     else { // If not the first, we create an AND.
   5193       Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
   5194       BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
   5195               .addReg(CondReg)
   5196               .addReg(NewCondReg);
   5197       CondReg = AndReg;
   5198     }
   5199   } // End for loop.
   5200 
   5201   auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc));
   5202   Register SRsrc = MRI.createVirtualRegister(SRsrcRC);
   5203 
   5204   // Build scalar Rsrc from the readfirstlane pieces, one 32-bit channel each.
   5205   auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc);
   5206   unsigned Channel = 0;
   5207   for (Register Piece : ReadlanePieces) {
   5208     Merge.addReg(Piece)
   5209          .addImm(TRI->getSubRegFromChannel(Channel++));
   5210   }
   5211 
   5212   // Update Rsrc operand to use the SGPR Rsrc.
   5213   Rsrc.setReg(SRsrc);
   5214   Rsrc.setIsKill(true);
   5215 
   5216   Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
   5217   MRI.setSimpleHint(SaveExec, CondReg);
   5218 
   5219   // Update EXEC to matching lanes, saving original to SaveExec.
   5220   BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
   5221       .addReg(CondReg, RegState::Kill);
   5222 
   5223   // The original instruction is here; we insert the terminators after it.
   5224   I = LoopBB.end();
   5225 
   5226   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
   5227   BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
   5228       .addReg(Exec)
   5229       .addReg(SaveExec);
   5230 
   5231   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB); // Loop while EXEC is non-zero.
   5232 }
   5233 
   5234 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
   5235 // with SGPRs by iterating over all unique values across all lanes. The range
   5236 // [\p Begin, \p End) is moved into the loop (default: just \p MI). Returns the loop basic block that now contains \p MI.
   5237 static MachineBasicBlock *
   5238 loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
   5239                   MachineOperand &Rsrc, MachineDominatorTree *MDT,
   5240                   MachineBasicBlock::iterator Begin = nullptr,
   5241                   MachineBasicBlock::iterator End = nullptr) {
   5242   MachineBasicBlock &MBB = *MI.getParent();
   5243   MachineFunction &MF = *MBB.getParent();
   5244   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   5245   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   5246   MachineRegisterInfo &MRI = MF.getRegInfo();
   5247   if (!Begin.isValid())
   5248     Begin = &MI; // Default: the moved range starts at MI.
   5249   if (!End.isValid()) {
   5250     End = &MI; // Default: the moved range ends just past MI.
   5251     ++End;
   5252   }
   5253   const DebugLoc &DL = MI.getDebugLoc();
   5254   unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   5255   unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
   5256   const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
   5257 
   5258   Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
   5259 
   5260   // Save the EXEC mask ahead of the loop; it is restored at the top of RemainderBB.
   5261   BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
   5262 
   5263   // Killed uses in the instruction we are waterfalling around will be
   5264   // incorrect due to the added control-flow.
   5265   MachineBasicBlock::iterator AfterMI = MI;
   5266   ++AfterMI;
   5267   for (auto I = Begin; I != AfterMI; I++) { // Covers Begin up to and including MI.
   5268     for (auto &MO : I->uses()) {
   5269       if (MO.isReg() && MO.isUse()) {
   5270         MRI.clearKillFlags(MO.getReg());
   5271       }
   5272     }
   5273   }
   5274 
   5275   // To insert the loop we need to split the block. Move everything after this
   5276   // point to a new block, and insert a new empty block between the two.
   5277   MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
   5278   MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
   5279   MachineFunction::iterator MBBI(MBB);
   5280   ++MBBI;
   5281 
   5282   MF.insert(MBBI, LoopBB); // Layout becomes: MBB, LoopBB, RemainderBB.
   5283   MF.insert(MBBI, RemainderBB);
   5284 
   5285   LoopBB->addSuccessor(LoopBB); // Back edge taken by S_CBRANCH_EXECNZ.
   5286   LoopBB->addSuccessor(RemainderBB);
   5287 
   5288   // Move Begin to MI to the LoopBB, and the remainder of the block to
   5289   // RemainderBB.
   5290   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
   5291   RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
   5292   LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end()); // [End, end) already left MBB, so this moves exactly [Begin, End).
   5293 
   5294   MBB.addSuccessor(LoopBB);
   5295 
   5296   // Update dominators. We know that MBB immediately dominates LoopBB, that
   5297   // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
   5298   // dominates all of the successors transferred to it from MBB that MBB used
   5299   // to properly dominate.
   5300   if (MDT) { // The dominator tree is optional; skip the update when none was supplied.
   5301     MDT->addNewBlock(LoopBB, &MBB);
   5302     MDT->addNewBlock(RemainderBB, LoopBB);
   5303     for (auto &Succ : RemainderBB->successors()) {
   5304       if (MDT->properlyDominates(&MBB, Succ)) {
   5305         MDT->changeImmediateDominator(Succ, RemainderBB);
   5306       }
   5307     }
   5308   }
   5309 
   5310   emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
   5311 
   5312   // Restore the EXEC mask saved before the loop.
   5313   MachineBasicBlock::iterator First = RemainderBB->begin();
   5314   BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
   5315   return LoopBB;
   5316 }
   5317 
   5318 // Extract the pointer (sub0_sub1) from \p Rsrc and return {RsrcPtr, NewSRsrc}, where NewSRsrc is a zero-base Rsrc replacement; all code is emitted before \p MI.
   5319 static std::tuple<unsigned, unsigned>
   5320 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
   5321   MachineBasicBlock &MBB = *MI.getParent();
   5322   MachineFunction &MF = *MBB.getParent();
   5323   MachineRegisterInfo &MRI = MF.getRegInfo();
   5324 
   5325   // Extract the ptr from the resource descriptor.
   5326   unsigned RsrcPtr =
   5327       TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
   5328                              AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
   5329 
   5330   // Create an empty resource descriptor
   5331   Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
   5332   Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
   5333   Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
   5334   Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
   5335   uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
   5336 
   5337   // Zero64 = 0
   5338   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
   5339       .addImm(0);
   5340 
   5341   // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
   5342   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
   5343       .addImm(RsrcDataFormat & 0xFFFFFFFF);
   5344 
   5345   // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
   5346   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
   5347       .addImm(RsrcDataFormat >> 32);
   5348 
   5349   // NewSRsrc = {Zero64, SRsrcFormat}: zero in sub0_sub1, format words in sub2/sub3.
   5350   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
   5351       .addReg(Zero64)
   5352       .addImm(AMDGPU::sub0_sub1)
   5353       .addReg(SRsrcFormatLo)
   5354       .addImm(AMDGPU::sub2)
   5355       .addReg(SRsrcFormatHi)
   5356       .addImm(AMDGPU::sub3);
   5357 
   5358   return std::make_tuple(RsrcPtr, NewSRsrc);
   5359 }
   5360 // Legalize the operands of \p MI by instruction kind; returns the loop block if a waterfall loop was created, otherwise nullptr.
   5361 MachineBasicBlock *
   5362 SIInstrInfo::legalizeOperands(MachineInstr &MI,
   5363                               MachineDominatorTree *MDT) const {
   5364   MachineFunction &MF = *MI.getParent()->getParent();
   5365   MachineRegisterInfo &MRI = MF.getRegInfo();
   5366   MachineBasicBlock *CreatedBB = nullptr; // Set only when a waterfall loop is built.
   5367 
   5368   // Legalize VOP2
   5369   if (isVOP2(MI) || isVOPC(MI)) {
   5370     legalizeOperandsVOP2(MRI, MI);
   5371     return CreatedBB;
   5372   }
   5373 
   5374   // Legalize VOP3
   5375   if (isVOP3(MI)) {
   5376     legalizeOperandsVOP3(MRI, MI);
   5377     return CreatedBB;
   5378   }
   5379 
   5380   // Legalize SMRD
   5381   if (isSMRD(MI)) {
   5382     legalizeOperandsSMRD(MRI, MI);
   5383     return CreatedBB;
   5384   }
   5385 
   5386   // Legalize FLAT
   5387   if (isFLAT(MI)) {
   5388     legalizeOperandsFLAT(MRI, MI);
   5389     return CreatedBB;
   5390   }
   5391 
   5392   // Legalize REG_SEQUENCE and PHI
   5393   // The register class of the operands must be the same type as the register
   5394   // class of the output.
   5395   if (MI.getOpcode() == AMDGPU::PHI) {
   5396     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
   5397     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { // Operands come in (value, block) pairs.
   5398       if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
   5399         continue;
   5400       const TargetRegisterClass *OpRC =
   5401           MRI.getRegClass(MI.getOperand(i).getReg());
   5402       if (RI.hasVectorRegisters(OpRC)) {
   5403         VRC = OpRC;
   5404       } else {
   5405         SRC = OpRC;
   5406       }
   5407     }
   5408 
   5409     // If any of the operands are VGPR registers, then they all must be
   5410     // otherwise we will create illegal VGPR->SGPR copies when legalizing
   5411     // them.
   5412     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
   5413       if (!VRC) {
   5414         assert(SRC);
   5415         if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
   5416           VRC = &AMDGPU::VReg_1RegClass;
   5417         } else
   5418           VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
   5419                     ? RI.getEquivalentAGPRClass(SRC)
   5420                     : RI.getEquivalentVGPRClass(SRC);
   5421       } else {
   5422           VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
   5423                     ? RI.getEquivalentAGPRClass(VRC)
   5424                     : RI.getEquivalentVGPRClass(VRC);
   5425       }
   5426       RC = VRC;
   5427     } else {
   5428       RC = SRC;
   5429     }
   5430 
   5431     // Update all the operands so they have the same type.
   5432     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
   5433       MachineOperand &Op = MI.getOperand(I);
   5434       if (!Op.isReg() || !Op.getReg().isVirtual())
   5435         continue;
   5436 
   5437       // MI is a PHI instruction, so the copy goes into the incoming block, before its terminators.
   5438       MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
   5439       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
   5440 
   5441       // Avoid creating no-op copies with the same src and dst reg class.  These
   5442       // confuse some of the machine passes.
   5443       legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
   5444     }
   5445   } // No early return: a PHI falls through to the checks below.
   5446 
   5447   // REG_SEQUENCE doesn't really require operand legalization, but if one has a
   5448   // VGPR dest type and SGPR sources, insert copies so all operands are
   5449   // VGPRs. This seems to help operand folding / the register coalescer.
   5450   if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
   5451     MachineBasicBlock *MBB = MI.getParent();
   5452     const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
   5453     if (RI.hasVGPRs(DstRC)) {
   5454       // Update all the operands so they are VGPR register classes. These may
   5455       // not be the same register class because REG_SEQUENCE supports mixing
   5456       // subregister index types e.g. sub0_sub1 + sub2 + sub3
   5457       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
   5458         MachineOperand &Op = MI.getOperand(I);
   5459         if (!Op.isReg() || !Op.getReg().isVirtual())
   5460           continue;
   5461 
   5462         const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
   5463         const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
   5464         if (VRC == OpRC)
   5465           continue;
   5466 
   5467         legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
   5468         Op.setIsKill();
   5469       }
   5470     }
   5471 
   5472     return CreatedBB;
   5473   }
   5474 
   5475   // Legalize INSERT_SUBREG
   5476   // src0 must have the same register class as dst
   5477   if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
   5478     Register Dst = MI.getOperand(0).getReg();
   5479     Register Src0 = MI.getOperand(1).getReg();
   5480     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
   5481     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
   5482     if (DstRC != Src0RC) {
   5483       MachineBasicBlock *MBB = MI.getParent();
   5484       MachineOperand &Op = MI.getOperand(1);
   5485       legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
   5486     }
   5487     return CreatedBB;
   5488   }
   5489 
   5490   // Legalize SI_INIT_M0: a vector-register source is replaced by the result of readlaneVGPRToSGPR.
   5491   if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
   5492     MachineOperand &Src = MI.getOperand(0);
   5493     if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
   5494       Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
   5495     return CreatedBB;
   5496   }
   5497 
   5498   // Legalize MIMG and MUBUF/MTBUF for shaders.
   5499   //
   5500   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
   5501   // scratch memory access. In both cases, the legalization never involves
   5502   // conversion to the addr64 form.
   5503   if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
   5504                      (isMUBUF(MI) || isMTBUF(MI)))) {
   5505     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
   5506     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
   5507       CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);
   5508 
   5509     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
   5510     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
   5511       CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT); // If both operands need it, this overwrites the srsrc loop block.
   5512 
   5513     return CreatedBB;
   5514   }
   5515 
   5516   // Legalize SI_CALL_ISEL
   5517   if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
   5518     MachineOperand *Dest = &MI.getOperand(0);
   5519     if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
   5520       // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
   5521       // following copies, we also need to move copies from and to physical
   5522       // registers into the loop block.
   5523       unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
   5524       unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
   5525 
   5526       // Also move the copies to physical registers into the loop block
   5527       MachineBasicBlock &MBB = *MI.getParent();
   5528       MachineBasicBlock::iterator Start(&MI);
   5529       while (Start->getOpcode() != FrameSetupOpcode) // Walk back to the call frame setup.
   5530         --Start;
   5531       MachineBasicBlock::iterator End(&MI);
   5532       while (End->getOpcode() != FrameDestroyOpcode) // Walk forward to the call frame destroy.
   5533         ++End;
   5534       // Also include following copies of the return value
   5535       ++End;
   5536       while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
   5537              MI.definesRegister(End->getOperand(1).getReg()))
   5538         ++End;
   5539       CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End);
   5540     }
   5541   } // No early return: falls through to the srsrc lookup below.
   5542 
   5543   // Legalize MUBUF* instructions.
   5544   int RsrcIdx =
   5545       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
   5546   if (RsrcIdx != -1) {
   5547     // We have an MUBUF instruction
   5548     MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
   5549     unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
   5550     if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
   5551                              RI.getRegClass(RsrcRC))) {
   5552       // The operands are legal.
   5553       // FIXME: We may need to legalize operands besides srsrc.
   5554       return CreatedBB;
   5555     }
   5556 
   5557     // Legalize a VGPR Rsrc.
   5558     //
   5559     // If the instruction is _ADDR64, we can avoid a waterfall by extracting
   5560     // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
   5561     // a zero-value SRsrc.
   5562     //
   5563     // If the instruction is _OFFSET (both idxen and offen disabled), and we
   5564     // support ADDR64 instructions, we can convert to ADDR64 and do the same as
   5565     // above.
   5566     //
   5567     // Otherwise we are on non-ADDR64 hardware, and/or we have
   5568     // idxen/offen/bothen and we fall back to a waterfall loop.
   5569 
   5570     MachineBasicBlock &MBB = *MI.getParent();
   5571 
   5572     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
   5573     if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
   5574       // This is already an ADDR64 instruction so we need to add the pointer
   5575       // extracted from the resource descriptor to the current value of VAddr.
   5576       Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   5577       Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   5578       Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
   5579 
   5580       const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
   5581       Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
   5582       Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
   5583 
   5584       unsigned RsrcPtr, NewSRsrc;
   5585       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
   5586 
   5587       // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
   5588       const DebugLoc &DL = MI.getDebugLoc();
   5589       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
   5590         .addDef(CondReg0)
   5591         .addReg(RsrcPtr, 0, AMDGPU::sub0)
   5592         .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
   5593         .addImm(0);
   5594 
   5595       // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 + carry (CondReg0)
   5596       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
   5597         .addDef(CondReg1, RegState::Dead)
   5598         .addReg(RsrcPtr, 0, AMDGPU::sub1)
   5599         .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
   5600         .addReg(CondReg0, RegState::Kill)
   5601         .addImm(0);
   5602 
   5603       // NewVaddr = {NewVaddrHi, NewVaddrLo}
   5604       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
   5605           .addReg(NewVAddrLo)
   5606           .addImm(AMDGPU::sub0)
   5607           .addReg(NewVAddrHi)
   5608           .addImm(AMDGPU::sub1);
   5609 
   5610       VAddr->setReg(NewVAddr);
   5611       Rsrc->setReg(NewSRsrc);
   5612     } else if (!VAddr && ST.hasAddr64()) {
   5613       // This instruction is the _OFFSET variant, so we need to convert it to
   5614       // ADDR64.
   5615       assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
   5616              "FIXME: Need to emit flat atomics here");
   5617 
   5618       unsigned RsrcPtr, NewSRsrc;
   5619       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
   5620 
   5621       Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
   5622       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
   5623       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
   5624       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
   5625       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
   5626 
   5627       // Atomics with return have an additional tied operand and are
   5628       // missing some of the special bits.
   5629       MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
   5630       MachineInstr *Addr64;
   5631 
   5632       if (!VDataIn) {
   5633         // Regular buffer load / store.
   5634         MachineInstrBuilder MIB =
   5635             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
   5636                 .add(*VData)
   5637                 .addReg(NewVAddr)
   5638                 .addReg(NewSRsrc)
   5639                 .add(*SOffset)
   5640                 .add(*Offset);
   5641 
   5642         if (const MachineOperand *CPol =
   5643                 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
   5644           MIB.addImm(CPol->getImm());
   5645         }
   5646 
   5647         if (const MachineOperand *TFE =
   5648                 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
   5649           MIB.addImm(TFE->getImm());
   5650         }
   5651 
   5652         MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
   5653 
   5654         MIB.cloneMemRefs(MI);
   5655         Addr64 = MIB;
   5656       } else {
   5657         // Atomics with return.
   5658         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
   5659                      .add(*VData)
   5660                      .add(*VDataIn)
   5661                      .addReg(NewVAddr)
   5662                      .addReg(NewSRsrc)
   5663                      .add(*SOffset)
   5664                      .add(*Offset)
   5665                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
   5666                      .cloneMemRefs(MI);
   5667       }
   5668 
   5669       MI.removeFromParent(); // The new Addr64 instruction takes the original's place.
   5670 
   5671       // NewVaddr = {RsrcPtr:sub1, RsrcPtr:sub0}, inserted before Addr64, which uses it.
   5672       BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
   5673               NewVAddr)
   5674           .addReg(RsrcPtr, 0, AMDGPU::sub0)
   5675           .addImm(AMDGPU::sub0)
   5676           .addReg(RsrcPtr, 0, AMDGPU::sub1)
   5677           .addImm(AMDGPU::sub1);
   5678     } else {
   5679       // This is another variant; legalize Rsrc with waterfall loop from VGPRs
   5680       // to SGPRs.
   5681       CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
   5682       return CreatedBB;
   5683     }
   5684   }
   5685   return CreatedBB;
   5686 }
   5687 
   5688 MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
   5689                                            MachineDominatorTree *MDT) const {
   5690   SetVectorType Worklist;
   5691   Worklist.insert(&TopInst);
   5692   MachineBasicBlock *CreatedBB = nullptr;
   5693   MachineBasicBlock *CreatedBBTmp = nullptr;
   5694 
   5695   while (!Worklist.empty()) {
   5696     MachineInstr &Inst = *Worklist.pop_back_val();
   5697     MachineBasicBlock *MBB = Inst.getParent();
   5698     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   5699 
   5700     unsigned Opcode = Inst.getOpcode();
   5701     unsigned NewOpcode = getVALUOp(Inst);
   5702 
   5703     // Handle some special cases
   5704     switch (Opcode) {
   5705     default:
   5706       break;
   5707     case AMDGPU::S_ADD_U64_PSEUDO:
   5708     case AMDGPU::S_SUB_U64_PSEUDO:
   5709       splitScalar64BitAddSub(Worklist, Inst, MDT);
   5710       Inst.eraseFromParent();
   5711       continue;
   5712     case AMDGPU::S_ADD_I32:
   5713     case AMDGPU::S_SUB_I32: {
   5714       // FIXME: The u32 versions currently selected use the carry.
   5715       bool Changed;
   5716       std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
   5717       if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
   5718         CreatedBB = CreatedBBTmp;
   5719       if (Changed)
   5720         continue;
   5721 
   5722       // Default handling
   5723       break;
   5724     }
   5725     case AMDGPU::S_AND_B64:
   5726       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
   5727       Inst.eraseFromParent();
   5728       continue;
   5729 
   5730     case AMDGPU::S_OR_B64:
   5731       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
   5732       Inst.eraseFromParent();
   5733       continue;
   5734 
   5735     case AMDGPU::S_XOR_B64:
   5736       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
   5737       Inst.eraseFromParent();
   5738       continue;
   5739 
   5740     case AMDGPU::S_NAND_B64:
   5741       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
   5742       Inst.eraseFromParent();
   5743       continue;
   5744 
   5745     case AMDGPU::S_NOR_B64:
   5746       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
   5747       Inst.eraseFromParent();
   5748       continue;
   5749 
   5750     case AMDGPU::S_XNOR_B64:
   5751       if (ST.hasDLInsts())
   5752         splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
   5753       else
   5754         splitScalar64BitXnor(Worklist, Inst, MDT);
   5755       Inst.eraseFromParent();
   5756       continue;
   5757 
   5758     case AMDGPU::S_ANDN2_B64:
   5759       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
   5760       Inst.eraseFromParent();
   5761       continue;
   5762 
   5763     case AMDGPU::S_ORN2_B64:
   5764       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
   5765       Inst.eraseFromParent();
   5766       continue;
   5767 
   5768     case AMDGPU::S_BREV_B64:
   5769       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
   5770       Inst.eraseFromParent();
   5771       continue;
   5772 
   5773     case AMDGPU::S_NOT_B64:
   5774       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
   5775       Inst.eraseFromParent();
   5776       continue;
   5777 
   5778     case AMDGPU::S_BCNT1_I32_B64:
   5779       splitScalar64BitBCNT(Worklist, Inst);
   5780       Inst.eraseFromParent();
   5781       continue;
   5782 
   5783     case AMDGPU::S_BFE_I64:
   5784       splitScalar64BitBFE(Worklist, Inst);
   5785       Inst.eraseFromParent();
   5786       continue;
   5787 
   5788     case AMDGPU::S_LSHL_B32:
   5789       if (ST.hasOnlyRevVALUShifts()) {
   5790         NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
   5791         swapOperands(Inst);
   5792       }
   5793       break;
   5794     case AMDGPU::S_ASHR_I32:
   5795       if (ST.hasOnlyRevVALUShifts()) {
   5796         NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
   5797         swapOperands(Inst);
   5798       }
   5799       break;
   5800     case AMDGPU::S_LSHR_B32:
   5801       if (ST.hasOnlyRevVALUShifts()) {
   5802         NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
   5803         swapOperands(Inst);
   5804       }
   5805       break;
   5806     case AMDGPU::S_LSHL_B64:
   5807       if (ST.hasOnlyRevVALUShifts()) {
   5808         NewOpcode = AMDGPU::V_LSHLREV_B64_e64;
   5809         swapOperands(Inst);
   5810       }
   5811       break;
   5812     case AMDGPU::S_ASHR_I64:
   5813       if (ST.hasOnlyRevVALUShifts()) {
   5814         NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
   5815         swapOperands(Inst);
   5816       }
   5817       break;
   5818     case AMDGPU::S_LSHR_B64:
   5819       if (ST.hasOnlyRevVALUShifts()) {
   5820         NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
   5821         swapOperands(Inst);
   5822       }
   5823       break;
   5824 
   5825     case AMDGPU::S_ABS_I32:
   5826       lowerScalarAbs(Worklist, Inst);
   5827       Inst.eraseFromParent();
   5828       continue;
   5829 
   5830     case AMDGPU::S_CBRANCH_SCC0:
   5831     case AMDGPU::S_CBRANCH_SCC1:
   5832       // Clear unused bits of vcc
   5833       if (ST.isWave32())
   5834         BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32),
   5835                 AMDGPU::VCC_LO)
   5836             .addReg(AMDGPU::EXEC_LO)
   5837             .addReg(AMDGPU::VCC_LO);
   5838       else
   5839         BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
   5840                 AMDGPU::VCC)
   5841             .addReg(AMDGPU::EXEC)
   5842             .addReg(AMDGPU::VCC);
   5843       break;
   5844 
   5845     case AMDGPU::S_BFE_U64:
   5846     case AMDGPU::S_BFM_B64:
   5847       llvm_unreachable("Moving this op to VALU not implemented");
   5848 
   5849     case AMDGPU::S_PACK_LL_B32_B16:
   5850     case AMDGPU::S_PACK_LH_B32_B16:
   5851     case AMDGPU::S_PACK_HH_B32_B16:
   5852       movePackToVALU(Worklist, MRI, Inst);
   5853       Inst.eraseFromParent();
   5854       continue;
   5855 
   5856     case AMDGPU::S_XNOR_B32:
   5857       lowerScalarXnor(Worklist, Inst);
   5858       Inst.eraseFromParent();
   5859       continue;
   5860 
   5861     case AMDGPU::S_NAND_B32:
   5862       splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
   5863       Inst.eraseFromParent();
   5864       continue;
   5865 
   5866     case AMDGPU::S_NOR_B32:
   5867       splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
   5868       Inst.eraseFromParent();
   5869       continue;
   5870 
   5871     case AMDGPU::S_ANDN2_B32:
   5872       splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
   5873       Inst.eraseFromParent();
   5874       continue;
   5875 
   5876     case AMDGPU::S_ORN2_B32:
   5877       splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
   5878       Inst.eraseFromParent();
   5879       continue;
   5880 
   5881     // TODO: remove as soon as everything is ready
   5882     // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
   5883     // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
   5884     // can only be selected from the uniform SDNode.
   5885     case AMDGPU::S_ADD_CO_PSEUDO:
   5886     case AMDGPU::S_SUB_CO_PSEUDO: {
   5887       unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
   5888                          ? AMDGPU::V_ADDC_U32_e64
   5889                          : AMDGPU::V_SUBB_U32_e64;
   5890       const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
   5891 
   5892       Register CarryInReg = Inst.getOperand(4).getReg();
   5893       if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
   5894         Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
   5895         BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
   5896             .addReg(CarryInReg);
   5897       }
   5898 
   5899       Register CarryOutReg = Inst.getOperand(1).getReg();
   5900 
   5901       Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
   5902           MRI.getRegClass(Inst.getOperand(0).getReg())));
   5903       MachineInstr *CarryOp =
   5904           BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
   5905               .addReg(CarryOutReg, RegState::Define)
   5906               .add(Inst.getOperand(2))
   5907               .add(Inst.getOperand(3))
   5908               .addReg(CarryInReg)
   5909               .addImm(0);
   5910       CreatedBBTmp = legalizeOperands(*CarryOp);
   5911       if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
   5912         CreatedBB = CreatedBBTmp;
   5913       MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
   5914       addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
   5915       Inst.eraseFromParent();
   5916     }
   5917       continue;
   5918     case AMDGPU::S_UADDO_PSEUDO:
   5919     case AMDGPU::S_USUBO_PSEUDO: {
   5920       const DebugLoc &DL = Inst.getDebugLoc();
   5921       MachineOperand &Dest0 = Inst.getOperand(0);
   5922       MachineOperand &Dest1 = Inst.getOperand(1);
   5923       MachineOperand &Src0 = Inst.getOperand(2);
   5924       MachineOperand &Src1 = Inst.getOperand(3);
   5925 
   5926       unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
   5927                          ? AMDGPU::V_ADD_CO_U32_e64
   5928                          : AMDGPU::V_SUB_CO_U32_e64;
   5929       const TargetRegisterClass *NewRC =
   5930           RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
   5931       Register DestReg = MRI.createVirtualRegister(NewRC);
   5932       MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
   5933                                    .addReg(Dest1.getReg(), RegState::Define)
   5934                                    .add(Src0)
   5935                                    .add(Src1)
   5936                                    .addImm(0); // clamp bit
   5937 
   5938       CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
   5939       if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
   5940         CreatedBB = CreatedBBTmp;
   5941 
   5942       MRI.replaceRegWith(Dest0.getReg(), DestReg);
   5943       addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
   5944                                    Worklist);
   5945       Inst.eraseFromParent();
   5946     }
   5947       continue;
   5948 
   5949     case AMDGPU::S_CSELECT_B32:
   5950     case AMDGPU::S_CSELECT_B64:
   5951       lowerSelect(Worklist, Inst, MDT);
   5952       Inst.eraseFromParent();
   5953       continue;
   5954     }
   5955 
   5956     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
   5957       // We cannot move this instruction to the VALU, so we should try to
   5958       // legalize its operands instead.
   5959       CreatedBBTmp = legalizeOperands(Inst, MDT);
   5960       if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
   5961         CreatedBB = CreatedBBTmp;
   5962       continue;
   5963     }
   5964 
   5965     // Use the new VALU Opcode.
   5966     const MCInstrDesc &NewDesc = get(NewOpcode);
   5967     Inst.setDesc(NewDesc);
   5968 
   5969     // Remove any references to SCC. Vector instructions can't read from it, and
   5970     // We're just about to add the implicit use / defs of VCC, and we don't want
   5971     // both.
   5972     for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
   5973       MachineOperand &Op = Inst.getOperand(i);
   5974       if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
   5975         // Only propagate through live-def of SCC.
   5976         if (Op.isDef() && !Op.isDead())
   5977           addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
   5978         if (Op.isUse())
   5979           addSCCDefsToVALUWorklist(Op, Worklist);
   5980         Inst.RemoveOperand(i);
   5981       }
   5982     }
   5983 
   5984     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
   5985       // We are converting these to a BFE, so we need to add the missing
   5986       // operands for the size and offset.
   5987       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
   5988       Inst.addOperand(MachineOperand::CreateImm(0));
   5989       Inst.addOperand(MachineOperand::CreateImm(Size));
   5990 
   5991     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
   5992       // The VALU version adds the second operand to the result, so insert an
   5993       // extra 0 operand.
   5994       Inst.addOperand(MachineOperand::CreateImm(0));
   5995     }
   5996 
   5997     Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
   5998     fixImplicitOperands(Inst);
   5999 
   6000     if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
   6001       const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
   6002       // If we need to move this to VGPRs, we need to unpack the second operand
   6003       // back into the 2 separate ones for bit offset and width.
   6004       assert(OffsetWidthOp.isImm() &&
   6005              "Scalar BFE is only implemented for constant width and offset");
   6006       uint32_t Imm = OffsetWidthOp.getImm();
   6007 
   6008       uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
   6009       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
   6010       Inst.RemoveOperand(2);                     // Remove old immediate.
   6011       Inst.addOperand(MachineOperand::CreateImm(Offset));
   6012       Inst.addOperand(MachineOperand::CreateImm(BitWidth));
   6013     }
   6014 
   6015     bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
   6016     unsigned NewDstReg = AMDGPU::NoRegister;
   6017     if (HasDst) {
   6018       Register DstReg = Inst.getOperand(0).getReg();
   6019       if (DstReg.isPhysical())
   6020         continue;
   6021 
   6022       // Update the destination register class.
   6023       const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
   6024       if (!NewDstRC)
   6025         continue;
   6026 
   6027       if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
   6028           NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
   6029         // Instead of creating a copy where src and dst are the same register
   6030         // class, we just replace all uses of dst with src.  These kinds of
   6031         // copies interfere with the heuristics MachineSink uses to decide
   6032         // whether or not to split a critical edge.  Since the pass assumes
   6033         // that copies will end up as machine instructions and not be
   6034         // eliminated.
   6035         addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
   6036         MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
   6037         MRI.clearKillFlags(Inst.getOperand(1).getReg());
   6038         Inst.getOperand(0).setReg(DstReg);
   6039 
   6040         // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
   6041         // these are deleted later, but at -O0 it would leave a suspicious
   6042         // looking illegal copy of an undef register.
   6043         for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
   6044           Inst.RemoveOperand(I);
   6045         Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
   6046         continue;
   6047       }
   6048 
   6049       NewDstReg = MRI.createVirtualRegister(NewDstRC);
   6050       MRI.replaceRegWith(DstReg, NewDstReg);
   6051     }
   6052 
   6053     // Legalize the operands
   6054     CreatedBBTmp = legalizeOperands(Inst, MDT);
   6055     if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
   6056       CreatedBB = CreatedBBTmp;
   6057 
   6058     if (HasDst)
   6059      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
   6060   }
   6061   return CreatedBB;
   6062 }
   6063 
   6064 // Add/sub require special handling to deal with carry outs.
   6065 std::pair<bool, MachineBasicBlock *>
   6066 SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
   6067                               MachineDominatorTree *MDT) const {
   6068   if (ST.hasAddNoCarry()) {
   6069     // Assume there is no user of scc since we don't select this in that case.
   6070     // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
   6071     // is used.
   6072 
   6073     MachineBasicBlock &MBB = *Inst.getParent();
   6074     MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   6075 
   6076     Register OldDstReg = Inst.getOperand(0).getReg();
   6077     Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6078 
   6079     unsigned Opc = Inst.getOpcode();
   6080     assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
   6081 
   6082     unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
   6083       AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
   6084 
   6085     assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
   6086     Inst.RemoveOperand(3);
   6087 
   6088     Inst.setDesc(get(NewOpc));
   6089     Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
   6090     Inst.addImplicitDefUseOperands(*MBB.getParent());
   6091     MRI.replaceRegWith(OldDstReg, ResultReg);
   6092     MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
   6093 
   6094     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
   6095     return std::make_pair(true, NewBB);
   6096   }
   6097 
   6098   return std::make_pair(false, nullptr);
   6099 }
   6100 
   6101 void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
   6102                               MachineDominatorTree *MDT) const {
   6103 
   6104   MachineBasicBlock &MBB = *Inst.getParent();
   6105   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   6106   MachineBasicBlock::iterator MII = Inst;
   6107   DebugLoc DL = Inst.getDebugLoc();
   6108 
   6109   MachineOperand &Dest = Inst.getOperand(0);
   6110   MachineOperand &Src0 = Inst.getOperand(1);
   6111   MachineOperand &Src1 = Inst.getOperand(2);
   6112   MachineOperand &Cond = Inst.getOperand(3);
   6113 
   6114   Register SCCSource = Cond.getReg();
   6115   // Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead.
   6116   if (!Cond.isUndef()) {
   6117     for (MachineInstr &CandI :
   6118          make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
   6119                     Inst.getParent()->rend())) {
   6120       if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
   6121           -1) {
   6122         if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
   6123           SCCSource = CandI.getOperand(1).getReg();
   6124         }
   6125         break;
   6126       }
   6127     }
   6128   }
   6129 
   6130   // If this is a trivial select where the condition is effectively not SCC
   6131   // (SCCSource is a source of copy to SCC), then the select is semantically
   6132   // equivalent to copying SCCSource. Hence, there is no need to create
   6133   // V_CNDMASK, we can just use that and bail out.
   6134   if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) &&
   6135       Src1.isImm() && (Src1.getImm() == 0)) {
   6136     MRI.replaceRegWith(Dest.getReg(), SCCSource);
   6137     return;
   6138   }
   6139 
   6140   const TargetRegisterClass *TC = ST.getWavefrontSize() == 64
   6141                                       ? &AMDGPU::SReg_64_XEXECRegClass
   6142                                       : &AMDGPU::SReg_32_XM0_XEXECRegClass;
   6143   Register CopySCC = MRI.createVirtualRegister(TC);
   6144 
   6145   if (SCCSource == AMDGPU::SCC) {
   6146     // Insert a trivial select instead of creating a copy, because a copy from
   6147     // SCC would semantically mean just copying a single bit, but we may need
   6148     // the result to be a vector condition mask that needs preserving.
   6149     unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
   6150                                                     : AMDGPU::S_CSELECT_B32;
   6151     auto NewSelect =
   6152         BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
   6153     NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
   6154   } else {
   6155     BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource);
   6156   }
   6157 
   6158   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6159 
   6160   auto UpdatedInst =
   6161       BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
   6162           .addImm(0)
   6163           .add(Src1) // False
   6164           .addImm(0)
   6165           .add(Src0) // True
   6166           .addReg(CopySCC);
   6167 
   6168   MRI.replaceRegWith(Dest.getReg(), ResultReg);
   6169   legalizeOperands(*UpdatedInst, MDT);
   6170   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
   6171 }
   6172 
   6173 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
   6174                                  MachineInstr &Inst) const {
   6175   MachineBasicBlock &MBB = *Inst.getParent();
   6176   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   6177   MachineBasicBlock::iterator MII = Inst;
   6178   DebugLoc DL = Inst.getDebugLoc();
   6179 
   6180   MachineOperand &Dest = Inst.getOperand(0);
   6181   MachineOperand &Src = Inst.getOperand(1);
   6182   Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6183   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6184 
   6185   unsigned SubOp = ST.hasAddNoCarry() ?
   6186     AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
   6187 
   6188   BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
   6189     .addImm(0)
   6190     .addReg(Src.getReg());
   6191 
   6192   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
   6193     .addReg(Src.getReg())
   6194     .addReg(TmpReg);
   6195 
   6196   MRI.replaceRegWith(Dest.getReg(), ResultReg);
   6197   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
   6198 }
   6199 
   6200 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
   6201                                   MachineInstr &Inst) const {
   6202   MachineBasicBlock &MBB = *Inst.getParent();
   6203   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   6204   MachineBasicBlock::iterator MII = Inst;
   6205   const DebugLoc &DL = Inst.getDebugLoc();
   6206 
   6207   MachineOperand &Dest = Inst.getOperand(0);
   6208   MachineOperand &Src0 = Inst.getOperand(1);
   6209   MachineOperand &Src1 = Inst.getOperand(2);
   6210 
   6211   if (ST.hasDLInsts()) {
   6212     Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6213     legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
   6214     legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
   6215 
   6216     BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
   6217       .add(Src0)
   6218       .add(Src1);
   6219 
   6220     MRI.replaceRegWith(Dest.getReg(), NewDest);
   6221     addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
   6222   } else {
   6223     // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
   6224     // invert either source and then perform the XOR. If either source is a
   6225     // scalar register, then we can leave the inversion on the scalar unit to
   6226     // acheive a better distrubution of scalar and vector instructions.
   6227     bool Src0IsSGPR = Src0.isReg() &&
   6228                       RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
   6229     bool Src1IsSGPR = Src1.isReg() &&
   6230                       RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
   6231     MachineInstr *Xor;
   6232     Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
   6233     Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
   6234 
   6235     // Build a pair of scalar instructions and add them to the work list.
   6236     // The next iteration over the work list will lower these to the vector
   6237     // unit as necessary.
   6238     if (Src0IsSGPR) {
   6239       BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
   6240       Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
   6241       .addReg(Temp)
   6242       .add(Src1);
   6243     } else if (Src1IsSGPR) {
   6244       BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
   6245       Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
   6246       .add(Src0)
   6247       .addReg(Temp);
   6248     } else {
   6249       Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
   6250         .add(Src0)
   6251         .add(Src1);
   6252       MachineInstr *Not =
   6253           BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
   6254       Worklist.insert(Not);
   6255     }
   6256 
   6257     MRI.replaceRegWith(Dest.getReg(), NewDest);
   6258 
   6259     Worklist.insert(Xor);
   6260 
   6261     addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
   6262   }
   6263 }
   6264 
   6265 void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
   6266                                       MachineInstr &Inst,
   6267                                       unsigned Opcode) const {
   6268   MachineBasicBlock &MBB = *Inst.getParent();
   6269   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   6270   MachineBasicBlock::iterator MII = Inst;
   6271   const DebugLoc &DL = Inst.getDebugLoc();
   6272 
   6273   MachineOperand &Dest = Inst.getOperand(0);
   6274   MachineOperand &Src0 = Inst.getOperand(1);
   6275   MachineOperand &Src1 = Inst.getOperand(2);
   6276 
   6277   Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
   6278   Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
   6279 
   6280   MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
   6281     .add(Src0)
   6282     .add(Src1);
   6283 
   6284   MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
   6285     .addReg(Interm);
   6286 
   6287   Worklist.insert(&Op);
   6288   Worklist.insert(&Not);
   6289 
   6290   MRI.replaceRegWith(Dest.getReg(), NewDest);
   6291   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
   6292 }
   6293 
   6294 void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
   6295                                      MachineInstr &Inst,
   6296                                      unsigned Opcode) const {
   6297   MachineBasicBlock &MBB = *Inst.getParent();
   6298   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   6299   MachineBasicBlock::iterator MII = Inst;
   6300   const DebugLoc &DL = Inst.getDebugLoc();
   6301 
   6302   MachineOperand &Dest = Inst.getOperand(0);
   6303   MachineOperand &Src0 = Inst.getOperand(1);
   6304   MachineOperand &Src1 = Inst.getOperand(2);
   6305 
   6306   Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
   6307   Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
   6308 
   6309   MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
   6310     .add(Src1);
   6311 
   6312   MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
   6313     .add(Src0)
   6314     .addReg(Interm);
   6315 
   6316   Worklist.insert(&Not);
   6317   Worklist.insert(&Op);
   6318 
   6319   MRI.replaceRegWith(Dest.getReg(), NewDest);
   6320   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
   6321 }
   6322 
   6323 void SIInstrInfo::splitScalar64BitUnaryOp(
   6324     SetVectorType &Worklist, MachineInstr &Inst,
   6325     unsigned Opcode, bool Swap) const {
   6326   MachineBasicBlock &MBB = *Inst.getParent();
   6327   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   6328 
   6329   MachineOperand &Dest = Inst.getOperand(0);
   6330   MachineOperand &Src0 = Inst.getOperand(1);
   6331   DebugLoc DL = Inst.getDebugLoc();
   6332 
   6333   MachineBasicBlock::iterator MII = Inst;
   6334 
   6335   const MCInstrDesc &InstDesc = get(Opcode);
   6336   const TargetRegisterClass *Src0RC = Src0.isReg() ?
   6337     MRI.getRegClass(Src0.getReg()) :
   6338     &AMDGPU::SGPR_32RegClass;
   6339 
   6340   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
   6341 
   6342   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
   6343                                                        AMDGPU::sub0, Src0SubRC);
   6344 
   6345   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
   6346   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
   6347   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
   6348 
   6349   Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
   6350   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
   6351 
   6352   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
   6353                                                        AMDGPU::sub1, Src0SubRC);
   6354 
   6355   Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
   6356   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
   6357 
   6358   if (Swap)
   6359     std::swap(DestSub0, DestSub1);
   6360 
   6361   Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
   6362   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
   6363     .addReg(DestSub0)
   6364     .addImm(AMDGPU::sub0)
   6365     .addReg(DestSub1)
   6366     .addImm(AMDGPU::sub1);
   6367 
   6368   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
   6369 
   6370   Worklist.insert(&LoHalf);
   6371   Worklist.insert(&HiHalf);
   6372 
   6373   // We don't need to legalizeOperands here because for a single operand, src0
   6374   // will support any kind of input.
   6375 
   6376   // Move all users of this moved value.
   6377   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
   6378 }
   6379 
   6380 void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
   6381                                          MachineInstr &Inst,
   6382                                          MachineDominatorTree *MDT) const {
   6383   bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
   6384 
   6385   MachineBasicBlock &MBB = *Inst.getParent();
   6386   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   6387   const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
   6388 
   6389   Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
   6390   Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6391   Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6392 
   6393   Register CarryReg = MRI.createVirtualRegister(CarryRC);
   6394   Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
   6395 
   6396   MachineOperand &Dest = Inst.getOperand(0);
   6397   MachineOperand &Src0 = Inst.getOperand(1);
   6398   MachineOperand &Src1 = Inst.getOperand(2);
   6399   const DebugLoc &DL = Inst.getDebugLoc();
   6400   MachineBasicBlock::iterator MII = Inst;
   6401 
   6402   const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
   6403   const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
   6404   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
   6405   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
   6406 
   6407   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
   6408                                                        AMDGPU::sub0, Src0SubRC);
   6409   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
   6410                                                        AMDGPU::sub0, Src1SubRC);
   6411 
   6412 
   6413   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
   6414                                                        AMDGPU::sub1, Src0SubRC);
   6415   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
   6416                                                        AMDGPU::sub1, Src1SubRC);
   6417 
   6418   unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
   6419   MachineInstr *LoHalf =
   6420     BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
   6421     .addReg(CarryReg, RegState::Define)
   6422     .add(SrcReg0Sub0)
   6423     .add(SrcReg1Sub0)
   6424     .addImm(0); // clamp bit
   6425 
   6426   unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
   6427   MachineInstr *HiHalf =
   6428     BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
   6429     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
   6430     .add(SrcReg0Sub1)
   6431     .add(SrcReg1Sub1)
   6432     .addReg(CarryReg, RegState::Kill)
   6433     .addImm(0); // clamp bit
   6434 
   6435   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
   6436     .addReg(DestSub0)
   6437     .addImm(AMDGPU::sub0)
   6438     .addReg(DestSub1)
   6439     .addImm(AMDGPU::sub1);
   6440 
   6441   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
   6442 
   6443   // Try to legalize the operands in case we need to swap the order to keep it
   6444   // valid.
   6445   legalizeOperands(*LoHalf, MDT);
   6446   legalizeOperands(*HiHalf, MDT);
   6447 
   6448   // Move all users of this moved vlaue.
   6449   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
   6450 }
   6451 
   6452 void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
   6453                                            MachineInstr &Inst, unsigned Opcode,
   6454                                            MachineDominatorTree *MDT) const {
   6455   MachineBasicBlock &MBB = *Inst.getParent();
   6456   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   6457 
   6458   MachineOperand &Dest = Inst.getOperand(0);
   6459   MachineOperand &Src0 = Inst.getOperand(1);
   6460   MachineOperand &Src1 = Inst.getOperand(2);
   6461   DebugLoc DL = Inst.getDebugLoc();
   6462 
   6463   MachineBasicBlock::iterator MII = Inst;
   6464 
   6465   const MCInstrDesc &InstDesc = get(Opcode);
   6466   const TargetRegisterClass *Src0RC = Src0.isReg() ?
   6467     MRI.getRegClass(Src0.getReg()) :
   6468     &AMDGPU::SGPR_32RegClass;
   6469 
   6470   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
   6471   const TargetRegisterClass *Src1RC = Src1.isReg() ?
   6472     MRI.getRegClass(Src1.getReg()) :
   6473     &AMDGPU::SGPR_32RegClass;
   6474 
   6475   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
   6476 
   6477   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
   6478                                                        AMDGPU::sub0, Src0SubRC);
   6479   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
   6480                                                        AMDGPU::sub0, Src1SubRC);
   6481   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
   6482                                                        AMDGPU::sub1, Src0SubRC);
   6483   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
   6484                                                        AMDGPU::sub1, Src1SubRC);
   6485 
   6486   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
   6487   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
   6488   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
   6489 
   6490   Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
   6491   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
   6492                               .add(SrcReg0Sub0)
   6493                               .add(SrcReg1Sub0);
   6494 
   6495   Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
   6496   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
   6497                               .add(SrcReg0Sub1)
   6498                               .add(SrcReg1Sub1);
   6499 
   6500   Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
   6501   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
   6502     .addReg(DestSub0)
   6503     .addImm(AMDGPU::sub0)
   6504     .addReg(DestSub1)
   6505     .addImm(AMDGPU::sub1);
   6506 
   6507   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
   6508 
   6509   Worklist.insert(&LoHalf);
   6510   Worklist.insert(&HiHalf);
   6511 
   6512   // Move all users of this moved vlaue.
   6513   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
   6514 }
   6515 
   6516 void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
   6517                                        MachineInstr &Inst,
   6518                                        MachineDominatorTree *MDT) const {
   6519   MachineBasicBlock &MBB = *Inst.getParent();
   6520   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   6521 
   6522   MachineOperand &Dest = Inst.getOperand(0);
   6523   MachineOperand &Src0 = Inst.getOperand(1);
   6524   MachineOperand &Src1 = Inst.getOperand(2);
   6525   const DebugLoc &DL = Inst.getDebugLoc();
   6526 
   6527   MachineBasicBlock::iterator MII = Inst;
   6528 
   6529   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
   6530 
   6531   Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
   6532 
   6533   MachineOperand* Op0;
   6534   MachineOperand* Op1;
   6535 
   6536   if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
   6537     Op0 = &Src0;
   6538     Op1 = &Src1;
   6539   } else {
   6540     Op0 = &Src1;
   6541     Op1 = &Src0;
   6542   }
   6543 
   6544   BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
   6545     .add(*Op0);
   6546 
   6547   Register NewDest = MRI.createVirtualRegister(DestRC);
   6548 
   6549   MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
   6550     .addReg(Interm)
   6551     .add(*Op1);
   6552 
   6553   MRI.replaceRegWith(Dest.getReg(), NewDest);
   6554 
   6555   Worklist.insert(&Xor);
   6556 }
   6557 
   6558 void SIInstrInfo::splitScalar64BitBCNT(
   6559     SetVectorType &Worklist, MachineInstr &Inst) const {
   6560   MachineBasicBlock &MBB = *Inst.getParent();
   6561   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   6562 
   6563   MachineBasicBlock::iterator MII = Inst;
   6564   const DebugLoc &DL = Inst.getDebugLoc();
   6565 
   6566   MachineOperand &Dest = Inst.getOperand(0);
   6567   MachineOperand &Src = Inst.getOperand(1);
   6568 
   6569   const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
   6570   const TargetRegisterClass *SrcRC = Src.isReg() ?
   6571     MRI.getRegClass(Src.getReg()) :
   6572     &AMDGPU::SGPR_32RegClass;
   6573 
   6574   Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6575   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6576 
   6577   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
   6578 
   6579   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
   6580                                                       AMDGPU::sub0, SrcSubRC);
   6581   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
   6582                                                       AMDGPU::sub1, SrcSubRC);
   6583 
   6584   BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
   6585 
   6586   BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
   6587 
   6588   MRI.replaceRegWith(Dest.getReg(), ResultReg);
   6589 
   6590   // We don't need to legalize operands here. src0 for etiher instruction can be
   6591   // an SGPR, and the second input is unused or determined here.
   6592   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
   6593 }
   6594 
   6595 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
   6596                                       MachineInstr &Inst) const {
   6597   MachineBasicBlock &MBB = *Inst.getParent();
   6598   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   6599   MachineBasicBlock::iterator MII = Inst;
   6600   const DebugLoc &DL = Inst.getDebugLoc();
   6601 
   6602   MachineOperand &Dest = Inst.getOperand(0);
   6603   uint32_t Imm = Inst.getOperand(2).getImm();
   6604   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
   6605   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
   6606 
   6607   (void) Offset;
   6608 
   6609   // Only sext_inreg cases handled.
   6610   assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
   6611          Offset == 0 && "Not implemented");
   6612 
   6613   if (BitWidth < 32) {
   6614     Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6615     Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6616     Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
   6617 
   6618     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
   6619         .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
   6620         .addImm(0)
   6621         .addImm(BitWidth);
   6622 
   6623     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
   6624       .addImm(31)
   6625       .addReg(MidRegLo);
   6626 
   6627     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
   6628       .addReg(MidRegLo)
   6629       .addImm(AMDGPU::sub0)
   6630       .addReg(MidRegHi)
   6631       .addImm(AMDGPU::sub1);
   6632 
   6633     MRI.replaceRegWith(Dest.getReg(), ResultReg);
   6634     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
   6635     return;
   6636   }
   6637 
   6638   MachineOperand &Src = Inst.getOperand(1);
   6639   Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6640   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
   6641 
   6642   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
   6643     .addImm(31)
   6644     .addReg(Src.getReg(), 0, AMDGPU::sub0);
   6645 
   6646   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
   6647     .addReg(Src.getReg(), 0, AMDGPU::sub0)
   6648     .addImm(AMDGPU::sub0)
   6649     .addReg(TmpReg)
   6650     .addImm(AMDGPU::sub1);
   6651 
   6652   MRI.replaceRegWith(Dest.getReg(), ResultReg);
   6653   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
   6654 }
   6655 
   6656 void SIInstrInfo::addUsersToMoveToVALUWorklist(
   6657   Register DstReg,
   6658   MachineRegisterInfo &MRI,
   6659   SetVectorType &Worklist) const {
   6660   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
   6661          E = MRI.use_end(); I != E;) {
   6662     MachineInstr &UseMI = *I->getParent();
   6663 
   6664     unsigned OpNo = 0;
   6665 
   6666     switch (UseMI.getOpcode()) {
   6667     case AMDGPU::COPY:
   6668     case AMDGPU::WQM:
   6669     case AMDGPU::SOFT_WQM:
   6670     case AMDGPU::STRICT_WWM:
   6671     case AMDGPU::STRICT_WQM:
   6672     case AMDGPU::REG_SEQUENCE:
   6673     case AMDGPU::PHI:
   6674     case AMDGPU::INSERT_SUBREG:
   6675       break;
   6676     default:
   6677       OpNo = I.getOperandNo();
   6678       break;
   6679     }
   6680 
   6681     if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
   6682       Worklist.insert(&UseMI);
   6683 
   6684       do {
   6685         ++I;
   6686       } while (I != E && I->getParent() == &UseMI);
   6687     } else {
   6688       ++I;
   6689     }
   6690   }
   6691 }
   6692 
   6693 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
   6694                                  MachineRegisterInfo &MRI,
   6695                                  MachineInstr &Inst) const {
   6696   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6697   MachineBasicBlock *MBB = Inst.getParent();
   6698   MachineOperand &Src0 = Inst.getOperand(1);
   6699   MachineOperand &Src1 = Inst.getOperand(2);
   6700   const DebugLoc &DL = Inst.getDebugLoc();
   6701 
   6702   switch (Inst.getOpcode()) {
   6703   case AMDGPU::S_PACK_LL_B32_B16: {
   6704     Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6705     Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6706 
   6707     // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
   6708     // 0.
   6709     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
   6710       .addImm(0xffff);
   6711 
   6712     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
   6713       .addReg(ImmReg, RegState::Kill)
   6714       .add(Src0);
   6715 
   6716     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
   6717       .add(Src1)
   6718       .addImm(16)
   6719       .addReg(TmpReg, RegState::Kill);
   6720     break;
   6721   }
   6722   case AMDGPU::S_PACK_LH_B32_B16: {
   6723     Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6724     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
   6725       .addImm(0xffff);
   6726     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
   6727       .addReg(ImmReg, RegState::Kill)
   6728       .add(Src0)
   6729       .add(Src1);
   6730     break;
   6731   }
   6732   case AMDGPU::S_PACK_HH_B32_B16: {
   6733     Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6734     Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   6735     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
   6736       .addImm(16)
   6737       .add(Src0);
   6738     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
   6739       .addImm(0xffff0000);
   6740     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
   6741       .add(Src1)
   6742       .addReg(ImmReg, RegState::Kill)
   6743       .addReg(TmpReg, RegState::Kill);
   6744     break;
   6745   }
   6746   default:
   6747     llvm_unreachable("unhandled s_pack_* instruction");
   6748   }
   6749 
   6750   MachineOperand &Dest = Inst.getOperand(0);
   6751   MRI.replaceRegWith(Dest.getReg(), ResultReg);
   6752   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
   6753 }
   6754 
   6755 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
   6756                                                MachineInstr &SCCDefInst,
   6757                                                SetVectorType &Worklist) const {
   6758   bool SCCUsedImplicitly = false;
   6759 
   6760   // Ensure that def inst defines SCC, which is still live.
   6761   assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
   6762          !Op.isDead() && Op.getParent() == &SCCDefInst);
   6763   SmallVector<MachineInstr *, 4> CopyToDelete;
   6764   // This assumes that all the users of SCC are in the same block
   6765   // as the SCC def.
   6766   for (MachineInstr &MI : // Skip the def inst itself.
   6767        make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
   6768                   SCCDefInst.getParent()->end())) {
   6769     // Check if SCC is used first.
   6770     if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
   6771       if (MI.isCopy()) {
   6772         MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   6773         Register DestReg = MI.getOperand(0).getReg();
   6774 
   6775         for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
   6776           if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
   6777               (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
   6778             User.getOperand(4).setReg(RI.getVCC());
   6779             Worklist.insert(&User);
   6780           } else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) {
   6781             User.getOperand(5).setReg(RI.getVCC());
   6782             // No need to add to Worklist.
   6783           }
   6784         }
   6785         CopyToDelete.push_back(&MI);
   6786       } else {
   6787         if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
   6788             MI.getOpcode() == AMDGPU::S_CSELECT_B64) {
   6789           // This is an implicit use of SCC and it is really expected by
   6790           // the SCC users to handle.
   6791           // We cannot preserve the edge to the user so add the explicit
   6792           // copy: SCC = COPY VCC.
   6793           // The copy will be cleaned up during the processing of the user
   6794           // in lowerSelect.
   6795           SCCUsedImplicitly = true;
   6796         }
   6797 
   6798         Worklist.insert(&MI);
   6799       }
   6800     }
   6801     // Exit if we find another SCC def.
   6802     if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
   6803       break;
   6804   }
   6805   for (auto &Copy : CopyToDelete)
   6806     Copy->eraseFromParent();
   6807 
   6808   if (SCCUsedImplicitly) {
   6809     BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()),
   6810             SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC)
   6811         .addReg(RI.getVCC());
   6812   }
   6813 }
   6814 
   6815 // Instructions that use SCC may be converted to VALU instructions. When that
   6816 // happens, the SCC register is changed to VCC_LO. The instruction that defines
   6817 // SCC must be changed to an instruction that defines VCC. This function makes
   6818 // sure that the instruction that defines SCC is added to the moveToVALU
   6819 // worklist.
   6820 void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op,
   6821                                            SetVectorType &Worklist) const {
   6822   assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse());
   6823 
   6824   MachineInstr *SCCUseInst = Op.getParent();
   6825   // Look for a preceeding instruction that either defines VCC or SCC. If VCC
   6826   // then there is nothing to do because the defining instruction has been
   6827   // converted to a VALU already. If SCC then that instruction needs to be
   6828   // converted to a VALU.
   6829   for (MachineInstr &MI :
   6830        make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
   6831                   SCCUseInst->getParent()->rend())) {
   6832     if (MI.modifiesRegister(AMDGPU::VCC, &RI))
   6833       break;
   6834     if (MI.definesRegister(AMDGPU::SCC, &RI)) {
   6835       Worklist.insert(&MI);
   6836       break;
   6837     }
   6838   }
   6839 }
   6840 
   6841 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
   6842   const MachineInstr &Inst) const {
   6843   const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
   6844 
   6845   switch (Inst.getOpcode()) {
   6846   // For target instructions, getOpRegClass just returns the virtual register
   6847   // class associated with the operand, so we need to find an equivalent VGPR
   6848   // register class in order to move the instruction to the VALU.
   6849   case AMDGPU::COPY:
   6850   case AMDGPU::PHI:
   6851   case AMDGPU::REG_SEQUENCE:
   6852   case AMDGPU::INSERT_SUBREG:
   6853   case AMDGPU::WQM:
   6854   case AMDGPU::SOFT_WQM:
   6855   case AMDGPU::STRICT_WWM:
   6856   case AMDGPU::STRICT_WQM: {
   6857     const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
   6858     if (RI.hasAGPRs(SrcRC)) {
   6859       if (RI.hasAGPRs(NewDstRC))
   6860         return nullptr;
   6861 
   6862       switch (Inst.getOpcode()) {
   6863       case AMDGPU::PHI:
   6864       case AMDGPU::REG_SEQUENCE:
   6865       case AMDGPU::INSERT_SUBREG:
   6866         NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
   6867         break;
   6868       default:
   6869         NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
   6870       }
   6871 
   6872       if (!NewDstRC)
   6873         return nullptr;
   6874     } else {
   6875       if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
   6876         return nullptr;
   6877 
   6878       NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
   6879       if (!NewDstRC)
   6880         return nullptr;
   6881     }
   6882 
   6883     return NewDstRC;
   6884   }
   6885   default:
   6886     return NewDstRC;
   6887   }
   6888 }
   6889 
   6890 // Find the one SGPR operand we are allowed to use.
   6891 Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
   6892                                    int OpIndices[3]) const {
   6893   const MCInstrDesc &Desc = MI.getDesc();
   6894 
   6895   // Find the one SGPR operand we are allowed to use.
   6896   //
   6897   // First we need to consider the instruction's operand requirements before
   6898   // legalizing. Some operands are required to be SGPRs, such as implicit uses
   6899   // of VCC, but we are still bound by the constant bus requirement to only use
   6900   // one.
   6901   //
   6902   // If the operand's class is an SGPR, we can never move it.
   6903 
   6904   Register SGPRReg = findImplicitSGPRRead(MI);
   6905   if (SGPRReg != AMDGPU::NoRegister)
   6906     return SGPRReg;
   6907 
   6908   Register UsedSGPRs[3] = { AMDGPU::NoRegister };
   6909   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   6910 
   6911   for (unsigned i = 0; i < 3; ++i) {
   6912     int Idx = OpIndices[i];
   6913     if (Idx == -1)
   6914       break;
   6915 
   6916     const MachineOperand &MO = MI.getOperand(Idx);
   6917     if (!MO.isReg())
   6918       continue;
   6919 
   6920     // Is this operand statically required to be an SGPR based on the operand
   6921     // constraints?
   6922     const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
   6923     bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
   6924     if (IsRequiredSGPR)
   6925       return MO.getReg();
   6926 
   6927     // If this could be a VGPR or an SGPR, Check the dynamic register class.
   6928     Register Reg = MO.getReg();
   6929     const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
   6930     if (RI.isSGPRClass(RegRC))
   6931       UsedSGPRs[i] = Reg;
   6932   }
   6933 
   6934   // We don't have a required SGPR operand, so we have a bit more freedom in
   6935   // selecting operands to move.
   6936 
   6937   // Try to select the most used SGPR. If an SGPR is equal to one of the
   6938   // others, we choose that.
   6939   //
   6940   // e.g.
   6941   // V_FMA_F32 v0, s0, s0, s0 -> No moves
   6942   // V_FMA_F32 v0, s0, s1, s0 -> Move s1
   6943 
   6944   // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
   6945   // prefer those.
   6946 
   6947   if (UsedSGPRs[0] != AMDGPU::NoRegister) {
   6948     if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
   6949       SGPRReg = UsedSGPRs[0];
   6950   }
   6951 
   6952   if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
   6953     if (UsedSGPRs[1] == UsedSGPRs[2])
   6954       SGPRReg = UsedSGPRs[1];
   6955   }
   6956 
   6957   return SGPRReg;
   6958 }
   6959 
   6960 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
   6961                                              unsigned OperandName) const {
   6962   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
   6963   if (Idx == -1)
   6964     return nullptr;
   6965 
   6966   return &MI.getOperand(Idx);
   6967 }
   6968 
   6969 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
   6970   if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
   6971     return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) |
   6972            (1ULL << 56) | // RESOURCE_LEVEL = 1
   6973            (3ULL << 60); // OOB_SELECT = 3
   6974   }
   6975 
   6976   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
   6977   if (ST.isAmdHsaOS()) {
   6978     // Set ATC = 1. GFX9 doesn't have this bit.
   6979     if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
   6980       RsrcDataFormat |= (1ULL << 56);
   6981 
   6982     // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
   6983     // BTW, it disables TC L2 and therefore decreases performance.
   6984     if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
   6985       RsrcDataFormat |= (2ULL << 59);
   6986   }
   6987 
   6988   return RsrcDataFormat;
   6989 }
   6990 
   6991 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
   6992   uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
   6993                     AMDGPU::RSRC_TID_ENABLE |
   6994                     0xffffffff; // Size;
   6995 
   6996   // GFX9 doesn't have ELEMENT_SIZE.
   6997   if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
   6998     uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
   6999     Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
   7000   }
   7001 
   7002   // IndexStride = 64 / 32.
   7003   uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
   7004   Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
   7005 
   7006   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
   7007   // Clear them unless we want a huge stride.
   7008   if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
   7009       ST.getGeneration() <= AMDGPUSubtarget::GFX9)
   7010     Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
   7011 
   7012   return Rsrc23;
   7013 }
   7014 
   7015 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
   7016   unsigned Opc = MI.getOpcode();
   7017 
   7018   return isSMRD(Opc);
   7019 }
   7020 
   7021 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
   7022   return get(Opc).mayLoad() &&
   7023          (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
   7024 }
   7025 
   7026 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
   7027                                     int &FrameIndex) const {
   7028   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
   7029   if (!Addr || !Addr->isFI())
   7030     return AMDGPU::NoRegister;
   7031 
   7032   assert(!MI.memoperands_empty() &&
   7033          (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
   7034 
   7035   FrameIndex = Addr->getIndex();
   7036   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
   7037 }
   7038 
   7039 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
   7040                                         int &FrameIndex) const {
   7041   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
   7042   assert(Addr && Addr->isFI());
   7043   FrameIndex = Addr->getIndex();
   7044   return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
   7045 }
   7046 
   7047 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
   7048                                           int &FrameIndex) const {
   7049   if (!MI.mayLoad())
   7050     return AMDGPU::NoRegister;
   7051 
   7052   if (isMUBUF(MI) || isVGPRSpill(MI))
   7053     return isStackAccess(MI, FrameIndex);
   7054 
   7055   if (isSGPRSpill(MI))
   7056     return isSGPRStackAccess(MI, FrameIndex);
   7057 
   7058   return AMDGPU::NoRegister;
   7059 }
   7060 
   7061 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
   7062                                          int &FrameIndex) const {
   7063   if (!MI.mayStore())
   7064     return AMDGPU::NoRegister;
   7065 
   7066   if (isMUBUF(MI) || isVGPRSpill(MI))
   7067     return isStackAccess(MI, FrameIndex);
   7068 
   7069   if (isSGPRSpill(MI))
   7070     return isSGPRStackAccess(MI, FrameIndex);
   7071 
   7072   return AMDGPU::NoRegister;
   7073 }
   7074 
   7075 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
   7076   unsigned Size = 0;
   7077   MachineBasicBlock::const_instr_iterator I = MI.getIterator();
   7078   MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
   7079   while (++I != E && I->isInsideBundle()) {
   7080     assert(!I->isBundle() && "No nested bundle!");
   7081     Size += getInstSizeInBytes(*I);
   7082   }
   7083 
   7084   return Size;
   7085 }
   7086 
   7087 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
   7088   unsigned Opc = MI.getOpcode();
   7089   const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
   7090   unsigned DescSize = Desc.getSize();
   7091 
   7092   // If we have a definitive size, we can use it. Otherwise we need to inspect
   7093   // the operands to know the size.
   7094   if (isFixedSize(MI)) {
   7095     unsigned Size = DescSize;
   7096 
   7097     // If we hit the buggy offset, an extra nop will be inserted in MC so
   7098     // estimate the worst case.
   7099     if (MI.isBranch() && ST.hasOffset3fBug())
   7100       Size += 4;
   7101 
   7102     return Size;
   7103   }
   7104 
   7105   // 4-byte instructions may have a 32-bit literal encoded after them. Check
   7106   // operands that coud ever be literals.
   7107   if (isVALU(MI) || isSALU(MI)) {
   7108     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
   7109     if (Src0Idx == -1)
   7110       return DescSize; // No operands.
   7111 
   7112     if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
   7113       return isVOP3(MI) ? 12 : (DescSize + 4);
   7114 
   7115     int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
   7116     if (Src1Idx == -1)
   7117       return DescSize;
   7118 
   7119     if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
   7120       return isVOP3(MI) ? 12 : (DescSize + 4);
   7121 
   7122     int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
   7123     if (Src2Idx == -1)
   7124       return DescSize;
   7125 
   7126     if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
   7127       return isVOP3(MI) ? 12 : (DescSize + 4);
   7128 
   7129     return DescSize;
   7130   }
   7131 
   7132   // Check whether we have extra NSA words.
   7133   if (isMIMG(MI)) {
   7134     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
   7135     if (VAddr0Idx < 0)
   7136       return 8;
   7137 
   7138     int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
   7139     return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
   7140   }
   7141 
   7142   switch (Opc) {
   7143   case TargetOpcode::IMPLICIT_DEF:
   7144   case TargetOpcode::KILL:
   7145   case TargetOpcode::DBG_VALUE:
   7146   case TargetOpcode::EH_LABEL:
   7147     return 0;
   7148   case TargetOpcode::BUNDLE:
   7149     return getInstBundleSize(MI);
   7150   case TargetOpcode::INLINEASM:
   7151   case TargetOpcode::INLINEASM_BR: {
   7152     const MachineFunction *MF = MI.getParent()->getParent();
   7153     const char *AsmStr = MI.getOperand(0).getSymbolName();
   7154     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
   7155   }
   7156   default:
   7157     return DescSize;
   7158   }
   7159 }
   7160 
   7161 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
   7162   if (!isFLAT(MI))
   7163     return false;
   7164 
   7165   if (MI.memoperands_empty())
   7166     return true;
   7167 
   7168   for (const MachineMemOperand *MMO : MI.memoperands()) {
   7169     if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
   7170       return true;
   7171   }
   7172   return false;
   7173 }
   7174 
   7175 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
   7176   return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
   7177 }
   7178 
   7179 void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
   7180                                             MachineBasicBlock *IfEnd) const {
   7181   MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
   7182   assert(TI != IfEntry->end());
   7183 
   7184   MachineInstr *Branch = &(*TI);
   7185   MachineFunction *MF = IfEntry->getParent();
   7186   MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
   7187 
   7188   if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
   7189     Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
   7190     MachineInstr *SIIF =
   7191         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
   7192             .add(Branch->getOperand(0))
   7193             .add(Branch->getOperand(1));
   7194     MachineInstr *SIEND =
   7195         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
   7196             .addReg(DstReg);
   7197 
   7198     IfEntry->erase(TI);
   7199     IfEntry->insert(IfEntry->end(), SIIF);
   7200     IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
   7201   }
   7202 }
   7203 
   7204 void SIInstrInfo::convertNonUniformLoopRegion(
   7205     MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
   7206   MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
   7207   // We expect 2 terminators, one conditional and one unconditional.
   7208   assert(TI != LoopEnd->end());
   7209 
   7210   MachineInstr *Branch = &(*TI);
   7211   MachineFunction *MF = LoopEnd->getParent();
   7212   MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
   7213 
   7214   if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
   7215 
   7216     Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
   7217     Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
   7218     MachineInstrBuilder HeaderPHIBuilder =
   7219         BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
   7220     for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
   7221                                           E = LoopEntry->pred_end();
   7222          PI != E; ++PI) {
   7223       if (*PI == LoopEnd) {
   7224         HeaderPHIBuilder.addReg(BackEdgeReg);
   7225       } else {
   7226         MachineBasicBlock *PMBB = *PI;
   7227         Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
   7228         materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
   7229                              ZeroReg, 0);
   7230         HeaderPHIBuilder.addReg(ZeroReg);
   7231       }
   7232       HeaderPHIBuilder.addMBB(*PI);
   7233     }
   7234     MachineInstr *HeaderPhi = HeaderPHIBuilder;
   7235     MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
   7236                                       get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
   7237                                   .addReg(DstReg)
   7238                                   .add(Branch->getOperand(0));
   7239     MachineInstr *SILOOP =
   7240         BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
   7241             .addReg(BackEdgeReg)
   7242             .addMBB(LoopEntry);
   7243 
   7244     LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
   7245     LoopEnd->erase(TI);
   7246     LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
   7247     LoopEnd->insert(LoopEnd->end(), SILOOP);
   7248   }
   7249 }
   7250 
   7251 ArrayRef<std::pair<int, const char *>>
   7252 SIInstrInfo::getSerializableTargetIndices() const {
   7253   static const std::pair<int, const char *> TargetIndices[] = {
   7254       {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
   7255       {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
   7256       {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
   7257       {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
   7258       {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
   7259   return makeArrayRef(TargetIndices);
   7260 }
   7261 
   7262 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp).  The
   7263 /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
   7264 ScheduleHazardRecognizer *
   7265 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
   7266                                             const ScheduleDAG *DAG) const {
   7267   return new GCNHazardRecognizer(DAG->MF);
   7268 }
   7269 
   7270 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
   7271 /// pass.
   7272 ScheduleHazardRecognizer *
   7273 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
   7274   return new GCNHazardRecognizer(MF);
   7275 }
   7276 
   7277 std::pair<unsigned, unsigned>
   7278 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
   7279   return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
   7280 }
   7281 
   7282 ArrayRef<std::pair<unsigned, const char *>>
   7283 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
   7284   static const std::pair<unsigned, const char *> TargetFlags[] = {
   7285     { MO_GOTPCREL, "amdgpu-gotprel" },
   7286     { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
   7287     { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
   7288     { MO_REL32_LO, "amdgpu-rel32-lo" },
   7289     { MO_REL32_HI, "amdgpu-rel32-hi" },
   7290     { MO_ABS32_LO, "amdgpu-abs32-lo" },
   7291     { MO_ABS32_HI, "amdgpu-abs32-hi" },
   7292   };
   7293 
   7294   return makeArrayRef(TargetFlags);
   7295 }
   7296 
   7297 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
   7298   return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
   7299          MI.modifiesRegister(AMDGPU::EXEC, &RI);
   7300 }
   7301 
   7302 MachineInstrBuilder
   7303 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
   7304                            MachineBasicBlock::iterator I,
   7305                            const DebugLoc &DL,
   7306                            Register DestReg) const {
   7307   if (ST.hasAddNoCarry())
   7308     return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
   7309 
   7310   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   7311   Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
   7312   MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
   7313 
   7314   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
   7315            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
   7316 }
   7317 
   7318 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
   7319                                                MachineBasicBlock::iterator I,
   7320                                                const DebugLoc &DL,
   7321                                                Register DestReg,
   7322                                                RegScavenger &RS) const {
   7323   if (ST.hasAddNoCarry())
   7324     return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
   7325 
   7326   // If available, prefer to use vcc.
   7327   Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
   7328                              ? Register(RI.getVCC())
   7329                              : RS.scavengeRegister(RI.getBoolRC(), I, 0, false);
   7330 
   7331   // TODO: Users need to deal with this.
   7332   if (!UnusedCarry.isValid())
   7333     return MachineInstrBuilder();
   7334 
   7335   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
   7336            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
   7337 }
   7338 
   7339 bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
   7340   switch (Opcode) {
   7341   case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
   7342   case AMDGPU::SI_KILL_I1_TERMINATOR:
   7343     return true;
   7344   default:
   7345     return false;
   7346   }
   7347 }
   7348 
   7349 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
   7350   switch (Opcode) {
   7351   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
   7352     return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
   7353   case AMDGPU::SI_KILL_I1_PSEUDO:
   7354     return get(AMDGPU::SI_KILL_I1_TERMINATOR);
   7355   default:
   7356     llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
   7357   }
   7358 }
   7359 
   7360 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
   7361   if (!ST.isWave32())
   7362     return;
   7363 
   7364   for (auto &Op : MI.implicit_operands()) {
   7365     if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
   7366       Op.setReg(AMDGPU::VCC_LO);
   7367   }
   7368 }
   7369 
   7370 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
   7371   if (!isSMRD(MI))
   7372     return false;
   7373 
   7374   // Check that it is using a buffer resource.
   7375   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
   7376   if (Idx == -1) // e.g. s_memtime
   7377     return false;
   7378 
   7379   const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
   7380   return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
   7381 }
   7382 
   7383 // Depending on the used address space and instructions, some immediate offsets
   7384 // are allowed and some are not.
   7385 // In general, flat instruction offsets can only be non-negative, global and
   7386 // scratch instruction offsets can also be negative.
   7387 //
   7388 // There are several bugs related to these offsets:
   7389 // On gfx10.1, flat instructions that go into the global address space cannot
   7390 // use an offset.
   7391 //
   7392 // For scratch instructions, the address can be either an SGPR or a VGPR.
   7393 // The following offsets can be used, depending on the architecture (x means
   7394 // cannot be used):
   7395 // +----------------------------+------+------+
   7396 // | Address-Mode               | SGPR | VGPR |
   7397 // +----------------------------+------+------+
   7398 // | gfx9                       |      |      |
   7399 // | negative, 4-aligned offset | x    | ok   |
   7400 // | negative, unaligned offset | x    | ok   |
   7401 // +----------------------------+------+------+
   7402 // | gfx10                      |      |      |
   7403 // | negative, 4-aligned offset | ok   | ok   |
   7404 // | negative, unaligned offset | ok   | x    |
   7405 // +----------------------------+------+------+
   7406 // | gfx10.3                    |      |      |
   7407 // | negative, 4-aligned offset | ok   | ok   |
   7408 // | negative, unaligned offset | ok   | ok   |
   7409 // +----------------------------+------+------+
   7410 //
   7411 // This function ignores the addressing mode, so if an offset cannot be used in
   7412 // one addressing mode, it is considered illegal.
   7413 bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
   7414                                     uint64_t FlatVariant) const {
   7415   // TODO: Should 0 be special cased?
   7416   if (!ST.hasFlatInstOffsets())
   7417     return false;
   7418 
   7419   if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
   7420       (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
   7421        AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
   7422     return false;
   7423 
   7424   bool Signed = FlatVariant != SIInstrFlags::FLAT;
   7425   if (ST.hasNegativeScratchOffsetBug() &&
   7426       FlatVariant == SIInstrFlags::FlatScratch)
   7427     Signed = false;
   7428   if (ST.hasNegativeUnalignedScratchOffsetBug() &&
   7429       FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
   7430       (Offset % 4) != 0) {
   7431     return false;
   7432   }
   7433 
   7434   unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed);
   7435   return Signed ? isIntN(N, Offset) : isUIntN(N, Offset);
   7436 }
   7437 
   7438 // See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
   7439 std::pair<int64_t, int64_t>
   7440 SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
   7441                              uint64_t FlatVariant) const {
   7442   int64_t RemainderOffset = COffsetVal;
   7443   int64_t ImmField = 0;
   7444   bool Signed = FlatVariant != SIInstrFlags::FLAT;
   7445   if (ST.hasNegativeScratchOffsetBug() &&
   7446       FlatVariant == SIInstrFlags::FlatScratch)
   7447     Signed = false;
   7448 
   7449   const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed);
   7450   if (Signed) {
   7451     // Use signed division by a power of two to truncate towards 0.
   7452     int64_t D = 1LL << (NumBits - 1);
   7453     RemainderOffset = (COffsetVal / D) * D;
   7454     ImmField = COffsetVal - RemainderOffset;
   7455 
   7456     if (ST.hasNegativeUnalignedScratchOffsetBug() &&
   7457         FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
   7458         (ImmField % 4) != 0) {
   7459       // Make ImmField a multiple of 4
   7460       RemainderOffset += ImmField % 4;
   7461       ImmField -= ImmField % 4;
   7462     }
   7463   } else if (COffsetVal >= 0) {
   7464     ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
   7465     RemainderOffset = COffsetVal - ImmField;
   7466   }
   7467 
   7468   assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
   7469   assert(RemainderOffset + ImmField == COffsetVal);
   7470   return {ImmField, RemainderOffset};
   7471 }
   7472 
// Encoding families passed to AMDGPU::getMCOpcode in this file.
//
// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td,
// so every enumerator spells out its numeric value explicitly.
enum SIEncodingFamily {
  SI     = 0,
  VI     = 1,
  SDWA   = 2,
  SDWA9  = 3,
  GFX80  = 4,
  GFX9   = 5,
  GFX10  = 6,
  SDWA10 = 7,
  GFX90A = 8
};
   7485 
   7486 static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
   7487   switch (ST.getGeneration()) {
   7488   default:
   7489     break;
   7490   case AMDGPUSubtarget::SOUTHERN_ISLANDS:
   7491   case AMDGPUSubtarget::SEA_ISLANDS:
   7492     return SIEncodingFamily::SI;
   7493   case AMDGPUSubtarget::VOLCANIC_ISLANDS:
   7494   case AMDGPUSubtarget::GFX9:
   7495     return SIEncodingFamily::VI;
   7496   case AMDGPUSubtarget::GFX10:
   7497     return SIEncodingFamily::GFX10;
   7498   }
   7499   llvm_unreachable("Unknown subtarget generation!");
   7500 }
   7501 
   7502 bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
   7503   switch(MCOp) {
   7504   // These opcodes use indirect register addressing so
   7505   // they need special handling by codegen (currently missing).
   7506   // Therefore it is too risky to allow these opcodes
   7507   // to be selected by dpp combiner or sdwa peepholer.
   7508   case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
   7509   case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
   7510   case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
   7511   case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
   7512   case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
   7513   case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
   7514   case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
   7515   case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
   7516     return true;
   7517   default:
   7518     return false;
   7519   }
   7520 }
   7521 
   7522 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
   7523   SIEncodingFamily Gen = subtargetEncodingFamily(ST);
   7524 
   7525   if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
   7526     ST.getGeneration() == AMDGPUSubtarget::GFX9)
   7527     Gen = SIEncodingFamily::GFX9;
   7528 
   7529   // Adjust the encoding family to GFX80 for D16 buffer instructions when the
   7530   // subtarget has UnpackedD16VMem feature.
   7531   // TODO: remove this when we discard GFX80 encoding.
   7532   if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
   7533     Gen = SIEncodingFamily::GFX80;
   7534 
   7535   if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
   7536     switch (ST.getGeneration()) {
   7537     default:
   7538       Gen = SIEncodingFamily::SDWA;
   7539       break;
   7540     case AMDGPUSubtarget::GFX9:
   7541       Gen = SIEncodingFamily::SDWA9;
   7542       break;
   7543     case AMDGPUSubtarget::GFX10:
   7544       Gen = SIEncodingFamily::SDWA10;
   7545       break;
   7546     }
   7547   }
   7548 
   7549   int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
   7550 
   7551   // -1 means that Opcode is already a native instruction.
   7552   if (MCOp == -1)
   7553     return Opcode;
   7554 
   7555   if (ST.hasGFX90AInsts()) {
   7556     uint16_t NMCOp = (uint16_t)-1;
   7557       NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
   7558     if (NMCOp == (uint16_t)-1)
   7559       NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
   7560     if (NMCOp != (uint16_t)-1)
   7561       MCOp = NMCOp;
   7562   }
   7563 
   7564   // (uint16_t)-1 means that Opcode is a pseudo instruction that has
   7565   // no encoding in the given subtarget generation.
   7566   if (MCOp == (uint16_t)-1)
   7567     return -1;
   7568 
   7569   if (isAsmOnlyOpcode(MCOp))
   7570     return -1;
   7571 
   7572   return MCOp;
   7573 }
   7574 
   7575 static
   7576 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
   7577   assert(RegOpnd.isReg());
   7578   return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
   7579                              getRegSubRegPair(RegOpnd);
   7580 }
   7581 
   7582 TargetInstrInfo::RegSubRegPair
   7583 llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
   7584   assert(MI.isRegSequence());
   7585   for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
   7586     if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
   7587       auto &RegOp = MI.getOperand(1 + 2 * I);
   7588       return getRegOrUndef(RegOp);
   7589     }
   7590   return TargetInstrInfo::RegSubRegPair();
   7591 }
   7592 
   7593 // Try to find the definition of reg:subreg in subreg-manipulation pseudos
   7594 // Following a subreg of reg:subreg isn't supported
   7595 static bool followSubRegDef(MachineInstr &MI,
   7596                             TargetInstrInfo::RegSubRegPair &RSR) {
   7597   if (!RSR.SubReg)
   7598     return false;
   7599   switch (MI.getOpcode()) {
   7600   default: break;
   7601   case AMDGPU::REG_SEQUENCE:
   7602     RSR = getRegSequenceSubReg(MI, RSR.SubReg);
   7603     return true;
   7604   // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg
   7605   case AMDGPU::INSERT_SUBREG:
   7606     if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
   7607       // inserted the subreg we're looking for
   7608       RSR = getRegOrUndef(MI.getOperand(2));
   7609     else { // the subreg in the rest of the reg
   7610       auto R1 = getRegOrUndef(MI.getOperand(1));
   7611       if (R1.SubReg) // subreg of subreg isn't supported
   7612         return false;
   7613       RSR.Reg = R1.Reg;
   7614     }
   7615     return true;
   7616   }
   7617   return false;
   7618 }
   7619 
   7620 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
   7621                                      MachineRegisterInfo &MRI) {
   7622   assert(MRI.isSSA());
   7623   if (!P.Reg.isVirtual())
   7624     return nullptr;
   7625 
   7626   auto RSR = P;
   7627   auto *DefInst = MRI.getVRegDef(RSR.Reg);
   7628   while (auto *MI = DefInst) {
   7629     DefInst = nullptr;
   7630     switch (MI->getOpcode()) {
   7631     case AMDGPU::COPY:
   7632     case AMDGPU::V_MOV_B32_e32: {
   7633       auto &Op1 = MI->getOperand(1);
   7634       if (Op1.isReg() && Op1.getReg().isVirtual()) {
   7635         if (Op1.isUndef())
   7636           return nullptr;
   7637         RSR = getRegSubRegPair(Op1);
   7638         DefInst = MRI.getVRegDef(RSR.Reg);
   7639       }
   7640       break;
   7641     }
   7642     default:
   7643       if (followSubRegDef(*MI, RSR)) {
   7644         if (!RSR.Reg)
   7645           return nullptr;
   7646         DefInst = MRI.getVRegDef(RSR.Reg);
   7647       }
   7648     }
   7649     if (!DefInst)
   7650       return MI;
   7651   }
   7652   return nullptr;
   7653 }
   7654 
   7655 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
   7656                                       Register VReg,
   7657                                       const MachineInstr &DefMI,
   7658                                       const MachineInstr &UseMI) {
   7659   assert(MRI.isSSA() && "Must be run on SSA");
   7660 
   7661   auto *TRI = MRI.getTargetRegisterInfo();
   7662   auto *DefBB = DefMI.getParent();
   7663 
   7664   // Don't bother searching between blocks, although it is possible this block
   7665   // doesn't modify exec.
   7666   if (UseMI.getParent() != DefBB)
   7667     return true;
   7668 
   7669   const int MaxInstScan = 20;
   7670   int NumInst = 0;
   7671 
   7672   // Stop scan at the use.
   7673   auto E = UseMI.getIterator();
   7674   for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
   7675     if (I->isDebugInstr())
   7676       continue;
   7677 
   7678     if (++NumInst > MaxInstScan)
   7679       return true;
   7680 
   7681     if (I->modifiesRegister(AMDGPU::EXEC, TRI))
   7682       return true;
   7683   }
   7684 
   7685   return false;
   7686 }
   7687 
   7688 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
   7689                                          Register VReg,
   7690                                          const MachineInstr &DefMI) {
   7691   assert(MRI.isSSA() && "Must be run on SSA");
   7692 
   7693   auto *TRI = MRI.getTargetRegisterInfo();
   7694   auto *DefBB = DefMI.getParent();
   7695 
   7696   const int MaxUseScan = 10;
   7697   int NumUse = 0;
   7698 
   7699   for (auto &Use : MRI.use_nodbg_operands(VReg)) {
   7700     auto &UseInst = *Use.getParent();
   7701     // Don't bother searching between blocks, although it is possible this block
   7702     // doesn't modify exec.
   7703     if (UseInst.getParent() != DefBB)
   7704       return true;
   7705 
   7706     if (++NumUse > MaxUseScan)
   7707       return true;
   7708   }
   7709 
   7710   if (NumUse == 0)
   7711     return false;
   7712 
   7713   const int MaxInstScan = 20;
   7714   int NumInst = 0;
   7715 
   7716   // Stop scan when we have seen all the uses.
   7717   for (auto I = std::next(DefMI.getIterator()); ; ++I) {
   7718     assert(I != DefBB->end());
   7719 
   7720     if (I->isDebugInstr())
   7721       continue;
   7722 
   7723     if (++NumInst > MaxInstScan)
   7724       return true;
   7725 
   7726     for (const MachineOperand &Op : I->operands()) {
   7727       // We don't check reg masks here as they're used only on calls:
   7728       // 1. EXEC is only considered const within one BB
   7729       // 2. Call should be a terminator instruction if present in a BB
   7730 
   7731       if (!Op.isReg())
   7732         continue;
   7733 
   7734       Register Reg = Op.getReg();
   7735       if (Op.isUse()) {
   7736         if (Reg == VReg && --NumUse == 0)
   7737           return false;
   7738       } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
   7739         return true;
   7740     }
   7741   }
   7742 }
   7743 
   7744 MachineInstr *SIInstrInfo::createPHIDestinationCopy(
   7745     MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
   7746     const DebugLoc &DL, Register Src, Register Dst) const {
   7747   auto Cur = MBB.begin();
   7748   if (Cur != MBB.end())
   7749     do {
   7750       if (!Cur->isPHI() && Cur->readsRegister(Dst))
   7751         return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
   7752       ++Cur;
   7753     } while (Cur != MBB.end() && Cur != LastPHIIt);
   7754 
   7755   return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
   7756                                                    Dst);
   7757 }
   7758 
   7759 MachineInstr *SIInstrInfo::createPHISourceCopy(
   7760     MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
   7761     const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
   7762   if (InsPt != MBB.end() &&
   7763       (InsPt->getOpcode() == AMDGPU::SI_IF ||
   7764        InsPt->getOpcode() == AMDGPU::SI_ELSE ||
   7765        InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
   7766       InsPt->definesRegister(Src)) {
   7767     InsPt++;
   7768     return BuildMI(MBB, InsPt, DL,
   7769                    get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
   7770                                      : AMDGPU::S_MOV_B64_term),
   7771                    Dst)
   7772         .addReg(Src, 0, SrcSubReg)
   7773         .addReg(AMDGPU::EXEC, RegState::Implicit);
   7774   }
   7775   return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
   7776                                               Dst);
   7777 }
   7778 
   7779 bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
   7780 
   7781 MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
   7782     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
   7783     MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
   7784     VirtRegMap *VRM) const {
   7785   // This is a bit of a hack (copied from AArch64). Consider this instruction:
   7786   //
   7787   //   %0:sreg_32 = COPY $m0
   7788   //
   7789   // We explicitly chose SReg_32 for the virtual register so such a copy might
   7790   // be eliminated by RegisterCoalescer. However, that may not be possible, and
   7791   // %0 may even spill. We can't spill $m0 normally (it would require copying to
   7792   // a numbered SGPR anyway), and since it is in the SReg_32 register class,
   7793   // TargetInstrInfo::foldMemoryOperand() is going to try.
   7794   // A similar issue also exists with spilling and reloading $exec registers.
   7795   //
   7796   // To prevent that, constrain the %0 register class here.
   7797   if (MI.isFullCopy()) {
   7798     Register DstReg = MI.getOperand(0).getReg();
   7799     Register SrcReg = MI.getOperand(1).getReg();
   7800     if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
   7801         (DstReg.isVirtual() != SrcReg.isVirtual())) {
   7802       MachineRegisterInfo &MRI = MF.getRegInfo();
   7803       Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
   7804       const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
   7805       if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
   7806         MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
   7807         return nullptr;
   7808       } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
   7809         MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
   7810         return nullptr;
   7811       }
   7812     }
   7813   }
   7814 
   7815   return nullptr;
   7816 }
   7817 
   7818 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
   7819                                       const MachineInstr &MI,
   7820                                       unsigned *PredCost) const {
   7821   if (MI.isBundle()) {
   7822     MachineBasicBlock::const_instr_iterator I(MI.getIterator());
   7823     MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
   7824     unsigned Lat = 0, Count = 0;
   7825     for (++I; I != E && I->isBundledWithPred(); ++I) {
   7826       ++Count;
   7827       Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
   7828     }
   7829     return Lat + Count - 1;
   7830   }
   7831 
   7832   return SchedModel.computeInstrLatency(&MI);
   7833 }
   7834 
   7835 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
   7836   switch (MF.getFunction().getCallingConv()) {
   7837   case CallingConv::AMDGPU_PS:
   7838     return 1;
   7839   case CallingConv::AMDGPU_VS:
   7840     return 2;
   7841   case CallingConv::AMDGPU_GS:
   7842     return 3;
   7843   case CallingConv::AMDGPU_HS:
   7844   case CallingConv::AMDGPU_LS:
   7845   case CallingConv::AMDGPU_ES:
   7846     report_fatal_error("ds_ordered_count unsupported for this calling conv");
   7847   case CallingConv::AMDGPU_CS:
   7848   case CallingConv::AMDGPU_KERNEL:
   7849   case CallingConv::C:
   7850   case CallingConv::Fast:
   7851   default:
   7852     // Assume other calling conventions are various compute callable functions
   7853     return 0;
   7854   }
   7855 }
   7856