Home | History | Annotate | Line # | Download | only in AMDGPU
      1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This pass tries to fuse DS instructions with close by immediate offsets.
     10 // This will fuse operations such as
     11 //  ds_read_b32 v0, v2 offset:16
     12 //  ds_read_b32 v1, v2 offset:32
     13 // ==>
     14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
     15 //
     16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
     17 //  s_buffer_load_dword s4, s[0:3], 4
     18 //  s_buffer_load_dword s5, s[0:3], 8
     19 // ==>
     20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
     21 //
     22 // This pass also tries to promote constant offset to the immediate by
     23 // adjusting the base. It tries to use a base from the nearby instructions that
     24 // allows it to have a 13bit constant offset and then promotes the 13bit offset
     25 // to the immediate.
     26 // E.g.
     27 //  s_movk_i32 s0, 0x1800
     28 //  v_add_co_u32_e32 v0, vcc, s0, v2
     29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
     30 //
     31 //  s_movk_i32 s0, 0x1000
     32 //  v_add_co_u32_e32 v5, vcc, s0, v2
     33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
     34 //  global_load_dwordx2 v[5:6], v[5:6], off
     35 //  global_load_dwordx2 v[0:1], v[0:1], off
     36 // =>
     37 //  s_movk_i32 s0, 0x1000
     38 //  v_add_co_u32_e32 v5, vcc, s0, v2
     39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
     40 //  global_load_dwordx2 v[5:6], v[5:6], off
     41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
     42 //
     43 // Future improvements:
     44 //
     45 // - This is currently missing stores of constants because loading
     46 //   the constant into the data register is placed between the stores, although
     47 //   this is arguably a scheduling problem.
     48 //
     49 // - Live interval recomputing seems inefficient. This currently only matches
     50 //   one pair, and recomputes live intervals and moves on to the next pair. It
     51 //   would be better to compute a list of all merges that need to occur.
     52 //
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but that are close enough to each other for their
//   differences to fit in 8 bits, we can add to the base pointer and use the
//   new, reduced offsets.
     57 //
     58 //===----------------------------------------------------------------------===//
     59 
     60 #include "AMDGPU.h"
     61 #include "GCNSubtarget.h"
     62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
     63 #include "llvm/Analysis/AliasAnalysis.h"
     64 #include "llvm/CodeGen/MachineFunctionPass.h"
     65 #include "llvm/InitializePasses.h"
     66 
     67 using namespace llvm;
     68 
     69 #define DEBUG_TYPE "si-load-store-opt"
     70 
     71 namespace {
// Coarse classification of the memory instructions handled by this pass.
// getInstClass() maps an opcode to one of these values.
enum InstClassEnum {
  // Opcode is not a merge candidate.
  UNKNOWN,
  // DS_READ_B32 / DS_READ_B64 and their _gfx9 variants.
  DS_READ,
  // DS_WRITE_B32 / DS_WRITE_B64 and their _gfx9 variants.
  DS_WRITE,
  // S_BUFFER_LOAD_DWORD{,X2,X4}_IMM.
  S_BUFFER_LOAD_IMM,
  // MUBUF opcodes whose base opcode is BUFFER_LOAD_DWORD_{OFFEN,OFFSET}[_exact].
  BUFFER_LOAD,
  // MUBUF opcodes whose base opcode is BUFFER_STORE_DWORD_{OFFEN,OFFSET}[_exact].
  BUFFER_STORE,
  // MIMG opcodes that have a vaddr/vaddr0 operand, may load, do not store and
  // are not gather4.
  MIMG,
  // MTBUF opcodes whose base opcode is
  // TBUFFER_LOAD_FORMAT_X_{OFFEN,OFFSET}[_exact].
  TBUFFER_LOAD,
  // MTBUF opcodes whose base opcode is
  // TBUFFER_STORE_FORMAT_X_{OFFEN,OFFSET}[_exact].
  TBUFFER_STORE,
};
     83 
// Describes which address operands an opcode carries. Produced by getRegs()
// and consumed by CombineInfo::setMI(), which turns each entry into the
// operand index of the correspondingly named operand.
struct AddressRegs {
  // Number of consecutive address operands starting at vaddr0; getRegs() only
  // sets this for MIMG opcodes that have a vaddr0 operand.
  unsigned char NumVAddrs = 0;
  // Each flag is true when the opcode uses the operand with the matching
  // lower-case name (sbase, srsrc, soffset, vaddr, addr, ssamp).
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};
     93 
// Upper bound on the number of address operands recorded per instruction; it
// sizes the CombineInfo::AddrIdx and CombineInfo::AddrReg arrays.
// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;
     96 
     97 class SILoadStoreOptimizer : public MachineFunctionPass {
     98   struct CombineInfo {
     99     MachineBasicBlock::iterator I;
    100     unsigned EltSize;
    101     unsigned Offset;
    102     unsigned Width;
    103     unsigned Format;
    104     unsigned BaseOff;
    105     unsigned DMask;
    106     InstClassEnum InstClass;
    107     unsigned CPol = 0;
    108     bool UseST64;
    109     int AddrIdx[MaxAddressRegs];
    110     const MachineOperand *AddrReg[MaxAddressRegs];
    111     unsigned NumAddresses;
    112     unsigned Order;
    113 
    114     bool hasSameBaseAddress(const MachineInstr &MI) {
    115       for (unsigned i = 0; i < NumAddresses; i++) {
    116         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
    117 
    118         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
    119           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
    120               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
    121             return false;
    122           }
    123           continue;
    124         }
    125 
    126         // Check same base pointer. Be careful of subregisters, which can occur
    127         // with vectors of pointers.
    128         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
    129             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
    130          return false;
    131         }
    132       }
    133       return true;
    134     }
    135 
    136     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
    137       for (unsigned i = 0; i < NumAddresses; ++i) {
    138         const MachineOperand *AddrOp = AddrReg[i];
    139         // Immediates are always OK.
    140         if (AddrOp->isImm())
    141           continue;
    142 
    143         // Don't try to merge addresses that aren't either immediates or registers.
    144         // TODO: Should be possible to merge FrameIndexes and maybe some other
    145         // non-register
    146         if (!AddrOp->isReg())
    147           return false;
    148 
    149         // TODO: We should be able to merge physical reg addreses.
    150         if (AddrOp->getReg().isPhysical())
    151           return false;
    152 
    153         // If an address has only one use then there will be on other
    154         // instructions with the same address, so we can't merge this one.
    155         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
    156           return false;
    157       }
    158       return true;
    159     }
    160 
    161     void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
    162                const GCNSubtarget &STM);
    163   };
    164 
    165   struct BaseRegisters {
    166     Register LoReg;
    167     Register HiReg;
    168 
    169     unsigned LoSubReg = 0;
    170     unsigned HiSubReg = 0;
    171   };
    172 
    173   struct MemAddress {
    174     BaseRegisters Base;
    175     int64_t Offset = 0;
    176   };
    177 
    178   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
    179 
    180 private:
    181   const GCNSubtarget *STM = nullptr;
    182   const SIInstrInfo *TII = nullptr;
    183   const SIRegisterInfo *TRI = nullptr;
    184   MachineRegisterInfo *MRI = nullptr;
    185   AliasAnalysis *AA = nullptr;
    186   bool OptimizeAgain;
    187 
    188   static bool dmasksCanBeCombined(const CombineInfo &CI,
    189                                   const SIInstrInfo &TII,
    190                                   const CombineInfo &Paired);
    191   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
    192                                    CombineInfo &Paired, bool Modify = false);
    193   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
    194                         const CombineInfo &Paired);
    195   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
    196   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
    197                                                      const CombineInfo &Paired);
    198   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
    199                                                     const CombineInfo &Paired);
    200   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
    201 
    202   bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
    203                             SmallVectorImpl<MachineInstr *> &InstsToMove);
    204 
    205   unsigned read2Opcode(unsigned EltSize) const;
    206   unsigned read2ST64Opcode(unsigned EltSize) const;
    207   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
    208                                              CombineInfo &Paired,
    209                   const SmallVectorImpl<MachineInstr *> &InstsToMove);
    210 
    211   unsigned write2Opcode(unsigned EltSize) const;
    212   unsigned write2ST64Opcode(unsigned EltSize) const;
    213   MachineBasicBlock::iterator
    214   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
    215                   const SmallVectorImpl<MachineInstr *> &InstsToMove);
    216   MachineBasicBlock::iterator
    217   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
    218                  const SmallVectorImpl<MachineInstr *> &InstsToMove);
    219   MachineBasicBlock::iterator
    220   mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
    221                           const SmallVectorImpl<MachineInstr *> &InstsToMove);
    222   MachineBasicBlock::iterator
    223   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
    224                       const SmallVectorImpl<MachineInstr *> &InstsToMove);
    225   MachineBasicBlock::iterator
    226   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
    227                        const SmallVectorImpl<MachineInstr *> &InstsToMove);
    228   MachineBasicBlock::iterator
    229   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
    230                        const SmallVectorImpl<MachineInstr *> &InstsToMove);
    231   MachineBasicBlock::iterator
    232   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
    233                         const SmallVectorImpl<MachineInstr *> &InstsToMove);
    234 
    235   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
    236                            int32_t NewOffset) const;
    237   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
    238   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
    239   Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
    240   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
    241   /// Promotes constant offset to the immediate by adjusting the base. It
    242   /// tries to use a base from the nearby instructions that allows it to have
    243   /// a 13bit constant offset which gets promoted to the immediate.
    244   bool promoteConstantOffsetToImm(MachineInstr &CI,
    245                                   MemInfoMap &Visited,
    246                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
    247   void addInstToMergeableList(const CombineInfo &CI,
    248                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
    249 
    250   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
    251       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    252       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    253       std::list<std::list<CombineInfo>> &MergeableInsts) const;
    254 
    255 public:
    256   static char ID;
    257 
    258   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    259     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
    260   }
    261 
    262   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
    263                                      bool &OptimizeListAgain);
    264   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
    265 
    266   bool runOnMachineFunction(MachineFunction &MF) override;
    267 
    268   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
    269 
    270   void getAnalysisUsage(AnalysisUsage &AU) const override {
    271     AU.setPreservesCFG();
    272     AU.addRequired<AAResultsWrapperPass>();
    273 
    274     MachineFunctionPass::getAnalysisUsage(AU);
    275   }
    276 
    277   MachineFunctionProperties getRequiredProperties() const override {
    278     return MachineFunctionProperties()
    279       .set(MachineFunctionProperties::Property::IsSSA);
    280   }
    281 };
    282 
    283 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
    284   const unsigned Opc = MI.getOpcode();
    285 
    286   if (TII.isMUBUF(Opc)) {
    287     // FIXME: Handle d16 correctly
    288     return AMDGPU::getMUBUFElements(Opc);
    289   }
    290   if (TII.isMIMG(MI)) {
    291     uint64_t DMaskImm =
    292         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    293     return countPopulation(DMaskImm);
    294   }
    295   if (TII.isMTBUF(Opc)) {
    296     return AMDGPU::getMTBUFElements(Opc);
    297   }
    298 
    299   switch (Opc) {
    300   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    301     return 1;
    302   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    303     return 2;
    304   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    305     return 4;
    306   case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
    307   case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
    308   case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
    309   case AMDGPU::DS_WRITE_B32_gfx9:
    310     return 1;
    311   case AMDGPU::DS_READ_B64:      LLVM_FALLTHROUGH;
    312   case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
    313   case AMDGPU::DS_WRITE_B64:     LLVM_FALLTHROUGH;
    314   case AMDGPU::DS_WRITE_B64_gfx9:
    315     return 2;
    316   default:
    317     return 0;
    318   }
    319 }
    320 
    321 /// Maps instruction opcode to enum InstClassEnum.
    322 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
    323   switch (Opc) {
    324   default:
    325     if (TII.isMUBUF(Opc)) {
    326       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
    327       default:
    328         return UNKNOWN;
    329       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    330       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
    331       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    332       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
    333         return BUFFER_LOAD;
    334       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    335       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    336       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    337       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    338         return BUFFER_STORE;
    339       }
    340     }
    341     if (TII.isMIMG(Opc)) {
    342       // Ignore instructions encoded without vaddr.
    343       if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
    344           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
    345         return UNKNOWN;
    346       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
    347       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
    348           TII.isGather4(Opc))
    349         return UNKNOWN;
    350       return MIMG;
    351     }
    352     if (TII.isMTBUF(Opc)) {
    353       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
    354       default:
    355         return UNKNOWN;
    356       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
    357       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
    358       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
    359       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
    360         return TBUFFER_LOAD;
    361       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
    362       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
    363       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
    364       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
    365         return TBUFFER_STORE;
    366       }
    367     }
    368     return UNKNOWN;
    369   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    370   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    371   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    372     return S_BUFFER_LOAD_IMM;
    373   case AMDGPU::DS_READ_B32:
    374   case AMDGPU::DS_READ_B32_gfx9:
    375   case AMDGPU::DS_READ_B64:
    376   case AMDGPU::DS_READ_B64_gfx9:
    377     return DS_READ;
    378   case AMDGPU::DS_WRITE_B32:
    379   case AMDGPU::DS_WRITE_B32_gfx9:
    380   case AMDGPU::DS_WRITE_B64:
    381   case AMDGPU::DS_WRITE_B64_gfx9:
    382     return DS_WRITE;
    383   case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa:
    384   case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa:
    385   case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa:
    386   case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa:
    387   case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa:
    388   case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa:
    389   case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa:
    390   case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa:
    391     return UNKNOWN;
    392   }
    393 }
    394 
    395 /// Determines instruction subclass from opcode. Only instructions
    396 /// of the same subclass can be merged together.
    397 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
    398   switch (Opc) {
    399   default:
    400     if (TII.isMUBUF(Opc))
    401       return AMDGPU::getMUBUFBaseOpcode(Opc);
    402     if (TII.isMIMG(Opc)) {
    403       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    404       assert(Info);
    405       return Info->BaseOpcode;
    406     }
    407     if (TII.isMTBUF(Opc))
    408       return AMDGPU::getMTBUFBaseOpcode(Opc);
    409     return -1;
    410   case AMDGPU::DS_READ_B32:
    411   case AMDGPU::DS_READ_B32_gfx9:
    412   case AMDGPU::DS_READ_B64:
    413   case AMDGPU::DS_READ_B64_gfx9:
    414   case AMDGPU::DS_WRITE_B32:
    415   case AMDGPU::DS_WRITE_B32_gfx9:
    416   case AMDGPU::DS_WRITE_B64:
    417   case AMDGPU::DS_WRITE_B64_gfx9:
    418     return Opc;
    419   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    420   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    421   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    422     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
    423   }
    424 }
    425 
    426 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
    427   AddressRegs Result;
    428 
    429   if (TII.isMUBUF(Opc)) {
    430     if (AMDGPU::getMUBUFHasVAddr(Opc))
    431       Result.VAddr = true;
    432     if (AMDGPU::getMUBUFHasSrsrc(Opc))
    433       Result.SRsrc = true;
    434     if (AMDGPU::getMUBUFHasSoffset(Opc))
    435       Result.SOffset = true;
    436 
    437     return Result;
    438   }
    439 
    440   if (TII.isMIMG(Opc)) {
    441     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    442     if (VAddr0Idx >= 0) {
    443       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
    444       Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
    445     } else {
    446       Result.VAddr = true;
    447     }
    448     Result.SRsrc = true;
    449     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    450     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
    451       Result.SSamp = true;
    452 
    453     return Result;
    454   }
    455   if (TII.isMTBUF(Opc)) {
    456     if (AMDGPU::getMTBUFHasVAddr(Opc))
    457       Result.VAddr = true;
    458     if (AMDGPU::getMTBUFHasSrsrc(Opc))
    459       Result.SRsrc = true;
    460     if (AMDGPU::getMTBUFHasSoffset(Opc))
    461       Result.SOffset = true;
    462 
    463     return Result;
    464   }
    465 
    466   switch (Opc) {
    467   default:
    468     return Result;
    469   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    470   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    471   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    472     Result.SBase = true;
    473     return Result;
    474   case AMDGPU::DS_READ_B32:
    475   case AMDGPU::DS_READ_B64:
    476   case AMDGPU::DS_READ_B32_gfx9:
    477   case AMDGPU::DS_READ_B64_gfx9:
    478   case AMDGPU::DS_WRITE_B32:
    479   case AMDGPU::DS_WRITE_B64:
    480   case AMDGPU::DS_WRITE_B32_gfx9:
    481   case AMDGPU::DS_WRITE_B64_gfx9:
    482     Result.Addr = true;
    483     return Result;
    484   }
    485 }
    486 
    487 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
    488                                               const SIInstrInfo &TII,
    489                                               const GCNSubtarget &STM) {
    490   I = MI;
    491   unsigned Opc = MI->getOpcode();
    492   InstClass = getInstClass(Opc, TII);
    493 
    494   if (InstClass == UNKNOWN)
    495     return;
    496 
    497   switch (InstClass) {
    498   case DS_READ:
    499    EltSize =
    500           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
    501                                                                           : 4;
    502    break;
    503   case DS_WRITE:
    504     EltSize =
    505           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
    506                                                                             : 4;
    507     break;
    508   case S_BUFFER_LOAD_IMM:
    509     EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4);
    510     break;
    511   default:
    512     EltSize = 4;
    513     break;
    514   }
    515 
    516   if (InstClass == MIMG) {
    517     DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    518     // Offset is not considered for MIMG instructions.
    519     Offset = 0;
    520   } else {
    521     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    522     Offset = I->getOperand(OffsetIdx).getImm();
    523   }
    524 
    525   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    526     Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
    527 
    528   Width = getOpcodeWidth(*I, TII);
    529 
    530   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    531     Offset &= 0xffff;
    532   } else if (InstClass != MIMG) {
    533     CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
    534   }
    535 
    536   AddressRegs Regs = getRegs(Opc, TII);
    537 
    538   NumAddresses = 0;
    539   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    540     AddrIdx[NumAddresses++] =
    541         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
    542   if (Regs.Addr)
    543     AddrIdx[NumAddresses++] =
    544         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
    545   if (Regs.SBase)
    546     AddrIdx[NumAddresses++] =
    547         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
    548   if (Regs.SRsrc)
    549     AddrIdx[NumAddresses++] =
    550         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
    551   if (Regs.SOffset)
    552     AddrIdx[NumAddresses++] =
    553         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
    554   if (Regs.VAddr)
    555     AddrIdx[NumAddresses++] =
    556         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
    557   if (Regs.SSamp)
    558     AddrIdx[NumAddresses++] =
    559         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
    560   assert(NumAddresses <= MaxAddressRegs);
    561 
    562   for (unsigned J = 0; J < NumAddresses; J++)
    563     AddrReg[J] = &I->getOperand(AddrIdx[J]);
    564 }
    565 
    566 } // end anonymous namespace.
    567 
// Pass registration: names the pass after DEBUG_TYPE and declares its
// dependency on AAResultsWrapperPass, matching getAnalysisUsage().
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

// Storage for the pass identifier handed to the MachineFunctionPass base
// class by the constructor.
char SILoadStoreOptimizer::ID = 0;

// Makes the pass identifier reachable from outside this file.
char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

// Factory function: returns a newly allocated instance of the pass.
FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}
    581 
    582 static void moveInstsAfter(MachineBasicBlock::iterator I,
    583                            ArrayRef<MachineInstr *> InstsToMove) {
    584   MachineBasicBlock *MBB = I->getParent();
    585   ++I;
    586   for (MachineInstr *MI : InstsToMove) {
    587     MI->removeFromParent();
    588     MBB->insert(I, MI);
    589   }
    590 }
    591 
    592 static void addDefsUsesToList(const MachineInstr &MI,
    593                               DenseSet<Register> &RegDefs,
    594                               DenseSet<Register> &PhysRegUses) {
    595   for (const MachineOperand &Op : MI.operands()) {
    596     if (Op.isReg()) {
    597       if (Op.isDef())
    598         RegDefs.insert(Op.getReg());
    599       else if (Op.readsReg() && Op.getReg().isPhysical())
    600         PhysRegUses.insert(Op.getReg());
    601     }
    602   }
    603 }
    604 
    605 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
    606                                       MachineBasicBlock::iterator B,
    607                                       AliasAnalysis *AA) {
    608   // RAW or WAR - cannot reorder
    609   // WAW - cannot reorder
    610   // RAR - safe to reorder
    611   return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
    612 }
    613 
    614 // Add MI and its defs to the lists if MI reads one of the defs that are
    615 // already in the list. Returns true in that case.
    616 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
    617                                   DenseSet<Register> &PhysRegUses,
    618                                   SmallVectorImpl<MachineInstr *> &Insts) {
    619   for (MachineOperand &Use : MI.operands()) {
    620     // If one of the defs is read, then there is a use of Def between I and the
    621     // instruction that I will potentially be merged with. We will need to move
    622     // this instruction after the merged instructions.
    623     //
    624     // Similarly, if there is a def which is read by an instruction that is to
    625     // be moved for merging, then we need to move the def-instruction as well.
    626     // This can only happen for physical registers such as M0; virtual
    627     // registers are in SSA form.
    628     if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
    629                         (Use.isDef() && RegDefs.count(Use.getReg())) ||
    630                         (Use.isDef() && Use.getReg().isPhysical() &&
    631                          PhysRegUses.count(Use.getReg())))) {
    632       Insts.push_back(&MI);
    633       addDefsUsesToList(MI, RegDefs, PhysRegUses);
    634       return true;
    635     }
    636   }
    637 
    638   return false;
    639 }
    640 
    641 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
    642                                     ArrayRef<MachineInstr *> InstsToMove,
    643                                     AliasAnalysis *AA) {
    644   assert(MemOp.mayLoadOrStore());
    645 
    646   for (MachineInstr *InstToMove : InstsToMove) {
    647     if (!InstToMove->mayLoadOrStore())
    648       continue;
    649     if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
    650       return false;
    651   }
    652   return true;
    653 }
    654 
    655 // This function assumes that \p A and \p B have are identical except for
    656 // size and offset, and they referecne adjacent memory.
    657 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
    658                                                    const MachineMemOperand *A,
    659                                                    const MachineMemOperand *B) {
    660   unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
    661   unsigned Size = A->getSize() + B->getSize();
    662   // This function adds the offset parameter to the existing offset for A,
    663   // so we pass 0 here as the offset and then manually set it to the correct
    664   // value after the call.
    665   MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
    666   MMO->setOffset(MinOffset);
    667   return MMO;
    668 }
    669 
    670 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
    671                                                const SIInstrInfo &TII,
    672                                                const CombineInfo &Paired) {
    673   assert(CI.InstClass == MIMG);
    674 
    675   // Ignore instructions with tfe/lwe set.
    676   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
    677   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
    678 
    679   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    680     return false;
    681 
    682   // Check other optional immediate operands for equality.
    683   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
    684                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
    685                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
    686 
    687   for (auto op : OperandsToMatch) {
    688     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    689     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
    690       return false;
    691     if (Idx != -1 &&
    692         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
    693       return false;
    694   }
    695 
    696   // Check DMask for overlaps.
    697   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
    698   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
    699 
    700   unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
    701   if ((1u << AllowedBitsForMin) <= MinMask)
    702     return false;
    703 
    704   return true;
    705 }
    706 
    707 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
    708                                        unsigned ComponentCount,
    709                                        const GCNSubtarget &STI) {
    710   if (ComponentCount > 4)
    711     return 0;
    712 
    713   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
    714       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
    715   if (!OldFormatInfo)
    716     return 0;
    717 
    718   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
    719       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
    720                                            ComponentCount,
    721                                            OldFormatInfo->NumFormat, STI);
    722 
    723   if (!NewFormatInfo)
    724     return 0;
    725 
    726   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
    727          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
    728 
    729   return NewFormatInfo->Format;
    730 }
    731 
    732 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
    733 // highest power of two. Note that the result is well defined for all inputs
    734 // including corner cases like:
    735 // - if Lo == Hi, return that value
    736 // - if Lo == 0, return 0 (even though the "- 1" below underflows
    737 // - if Lo > Hi, return 0 (as if the range wrapped around)
    738 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
    739   return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
    740 }
    741 
    742 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
    743                                                 const GCNSubtarget &STI,
    744                                                 CombineInfo &Paired,
    745                                                 bool Modify) {
    746   assert(CI.InstClass != MIMG);
    747 
    748   // XXX - Would the same offset be OK? Is there any reason this would happen or
    749   // be useful?
    750   if (CI.Offset == Paired.Offset)
    751     return false;
    752 
    753   // This won't be valid if the offset isn't aligned.
    754   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    755     return false;
    756 
    757   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
    758 
    759     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
    760         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    761     if (!Info0)
    762       return false;
    763     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
    764         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    765     if (!Info1)
    766       return false;
    767 
    768     if (Info0->BitsPerComp != Info1->BitsPerComp ||
    769         Info0->NumFormat != Info1->NumFormat)
    770       return false;
    771 
    772     // TODO: Should be possible to support more formats, but if format loads
    773     // are not dword-aligned, the merged load might not be valid.
    774     if (Info0->BitsPerComp != 32)
    775       return false;
    776 
    777     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
    778       return false;
    779   }
    780 
    781   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
    782   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
    783   CI.UseST64 = false;
    784   CI.BaseOff = 0;
    785 
    786   // Handle all non-DS instructions.
    787   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    788     return (EltOffset0 + CI.Width == EltOffset1 ||
    789             EltOffset1 + Paired.Width == EltOffset0) &&
    790            CI.CPol == Paired.CPol &&
    791            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol);
    792   }
    793 
    794   // If the offset in elements doesn't fit in 8-bits, we might be able to use
    795   // the stride 64 versions.
    796   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
    797       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    798     if (Modify) {
    799       CI.Offset = EltOffset0 / 64;
    800       Paired.Offset = EltOffset1 / 64;
    801       CI.UseST64 = true;
    802     }
    803     return true;
    804   }
    805 
    806   // Check if the new offsets fit in the reduced 8-bit range.
    807   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    808     if (Modify) {
    809       CI.Offset = EltOffset0;
    810       Paired.Offset = EltOffset1;
    811     }
    812     return true;
    813   }
    814 
    815   // Try to shift base address to decrease offsets.
    816   uint32_t Min = std::min(EltOffset0, EltOffset1);
    817   uint32_t Max = std::max(EltOffset0, EltOffset1);
    818 
    819   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
    820   if (((Max - Min) & ~Mask) == 0) {
    821     if (Modify) {
    822       // From the range of values we could use for BaseOff, choose the one that
    823       // is aligned to the highest power of two, to maximise the chance that
    824       // the same offset can be reused for other load/store pairs.
    825       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
    826       // Copy the low bits of the offsets, so that when we adjust them by
    827       // subtracting BaseOff they will be multiples of 64.
    828       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
    829       CI.BaseOff = BaseOff * CI.EltSize;
    830       CI.Offset = (EltOffset0 - BaseOff) / 64;
    831       Paired.Offset = (EltOffset1 - BaseOff) / 64;
    832       CI.UseST64 = true;
    833     }
    834     return true;
    835   }
    836 
    837   if (isUInt<8>(Max - Min)) {
    838     if (Modify) {
    839       // From the range of values we could use for BaseOff, choose the one that
    840       // is aligned to the highest power of two, to maximise the chance that
    841       // the same offset can be reused for other load/store pairs.
    842       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
    843       CI.BaseOff = BaseOff * CI.EltSize;
    844       CI.Offset = EltOffset0 - BaseOff;
    845       Paired.Offset = EltOffset1 - BaseOff;
    846     }
    847     return true;
    848   }
    849 
    850   return false;
    851 }
    852 
    853 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
    854                                      const CombineInfo &CI,
    855                                      const CombineInfo &Paired) {
    856   const unsigned Width = (CI.Width + Paired.Width);
    857   switch (CI.InstClass) {
    858   default:
    859     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
    860   case S_BUFFER_LOAD_IMM:
    861     switch (Width) {
    862     default:
    863       return false;
    864     case 2:
    865     case 4:
    866       return true;
    867     }
    868   }
    869 }
    870 
    871 const TargetRegisterClass *
    872 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
    873   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    874     return TRI->getRegClassForReg(*MRI, Dst->getReg());
    875   }
    876   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    877     return TRI->getRegClassForReg(*MRI, Src->getReg());
    878   }
    879   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    880     return TRI->getRegClassForReg(*MRI, Src->getReg());
    881   }
    882   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    883     return TRI->getRegClassForReg(*MRI, Dst->getReg());
    884   }
    885   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    886     return TRI->getRegClassForReg(*MRI, Src->getReg());
    887   }
    888   return nullptr;
    889 }
    890 
    891 /// This function assumes that CI comes before Paired in a basic block.
    892 bool SILoadStoreOptimizer::checkAndPrepareMerge(
    893     CombineInfo &CI, CombineInfo &Paired,
    894     SmallVectorImpl<MachineInstr *> &InstsToMove) {
    895 
    896   // Check both offsets (or masks for MIMG) can be combined and fit in the
    897   // reduced range.
    898   if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
    899     return false;
    900 
    901   if (CI.InstClass != MIMG &&
    902       (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
    903     return false;
    904 
    905   const unsigned Opc = CI.I->getOpcode();
    906   const InstClassEnum InstClass = getInstClass(Opc, *TII);
    907 
    908   if (InstClass == UNKNOWN) {
    909     return false;
    910   }
    911   const unsigned InstSubclass = getInstSubclass(Opc, *TII);
    912 
    913   // Do not merge VMEM buffer instructions with "swizzled" bit set.
    914   int Swizzled =
    915       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
    916   if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
    917     return false;
    918 
    919   DenseSet<Register> RegDefsToMove;
    920   DenseSet<Register> PhysRegUsesToMove;
    921   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
    922 
    923   const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
    924   bool IsAGPR = TRI->hasAGPRs(DataRC);
    925 
    926   MachineBasicBlock::iterator E = std::next(Paired.I);
    927   MachineBasicBlock::iterator MBBI = std::next(CI.I);
    928   MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
    929   for (; MBBI != E; ++MBBI) {
    930 
    931     if (MBBI == MBBE) {
    932       // CombineInfo::Order is a hint on the instruction ordering within the
    933       // basic block. This hint suggests that CI precedes Paired, which is
    934       // true most of the time. However, moveInstsAfter() processing a
    935       // previous list may have changed this order in a situation when it
    936       // moves an instruction which exists in some other merge list.
    937       // In this case it must be dependent.
    938       return false;
    939     }
    940 
    941     if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
    942         (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
    943       // This is not a matching instruction, but we can keep looking as
    944       // long as one of these conditions are met:
    945       // 1. It is safe to move I down past MBBI.
    946       // 2. It is safe to move MBBI down past the instruction that I will
    947       //    be merged into.
    948 
    949       if (MBBI->hasUnmodeledSideEffects()) {
    950         // We can't re-order this instruction with respect to other memory
    951         // operations, so we fail both conditions mentioned above.
    952         return false;
    953       }
    954 
    955       if (MBBI->mayLoadOrStore() &&
    956           (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
    957            !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
    958         // We fail condition #1, but we may still be able to satisfy condition
    959         // #2.  Add this instruction to the move list and then we will check
    960         // if condition #2 holds once we have selected the matching instruction.
    961         InstsToMove.push_back(&*MBBI);
    962         addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
    963         continue;
    964       }
    965 
    966       // When we match I with another DS instruction we will be moving I down
    967       // to the location of the matched instruction any uses of I will need to
    968       // be moved down as well.
    969       addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
    970                             InstsToMove);
    971       continue;
    972     }
    973 
    974     // Don't merge volatiles.
    975     if (MBBI->hasOrderedMemoryRef())
    976       return false;
    977 
    978     int Swizzled =
    979         AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
    980     if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
    981       return false;
    982 
    983     // Handle a case like
    984     //   DS_WRITE_B32 addr, v, idx0
    985     //   w = DS_READ_B32 addr, idx0
    986     //   DS_WRITE_B32 addr, f(w), idx1
    987     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    988     // merging of the two writes.
    989     if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
    990                               InstsToMove))
    991       continue;
    992 
    993     if (&*MBBI == &*Paired.I) {
    994       if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
    995         return false;
    996       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
    997       //        operands. However we are reporting that ds_write2 shall have
    998       //        only VGPR data so that machine copy propagation does not
    999       //        create an illegal instruction with a VGPR and AGPR sources.
   1000       //        Consequenctially if we create such instruction the verifier
   1001       //        will complain.
   1002       if (IsAGPR && CI.InstClass == DS_WRITE)
   1003         return false;
   1004 
   1005       // We need to go through the list of instructions that we plan to
   1006       // move and make sure they are all safe to move down past the merged
   1007       // instruction.
   1008       if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
   1009 
   1010         // Call offsetsCanBeCombined with modify = true so that the offsets are
   1011         // correct for the new instruction.  This should return true, because
   1012         // this function should only be called on CombineInfo objects that
   1013         // have already been confirmed to be mergeable.
   1014         if (CI.InstClass != MIMG)
   1015           offsetsCanBeCombined(CI, *STM, Paired, true);
   1016         return true;
   1017       }
   1018       return false;
   1019     }
   1020 
   1021     // We've found a load/store that we couldn't merge for some reason.
   1022     // We could potentially keep looking, but we'd need to make sure that
   1023     // it was safe to move I and also all the instruction in InstsToMove
   1024     // down past this instruction.
   1025     // check if we can move I across MBBI and if we can move all I's users
   1026     if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
   1027         !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
   1028       break;
   1029   }
   1030   return false;
   1031 }
   1032 
   1033 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
   1034   if (STM->ldsRequiresM0Init())
   1035     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
   1036   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
   1037 }
   1038 
   1039 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
   1040   if (STM->ldsRequiresM0Init())
   1041     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
   1042 
   1043   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
   1044                         : AMDGPU::DS_READ2ST64_B64_gfx9;
   1045 }
   1046 
   1047 MachineBasicBlock::iterator
   1048 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
   1049     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   1050   MachineBasicBlock *MBB = CI.I->getParent();
   1051 
   1052   // Be careful, since the addresses could be subregisters themselves in weird
   1053   // cases, like vectors of pointers.
   1054   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
   1055 
   1056   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
   1057   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
   1058 
   1059   unsigned NewOffset0 = CI.Offset;
   1060   unsigned NewOffset1 = Paired.Offset;
   1061   unsigned Opc =
   1062       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
   1063 
   1064   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
   1065   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
   1066 
   1067   if (NewOffset0 > NewOffset1) {
   1068     // Canonicalize the merged instruction so the smaller offset comes first.
   1069     std::swap(NewOffset0, NewOffset1);
   1070     std::swap(SubRegIdx0, SubRegIdx1);
   1071   }
   1072 
   1073   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
   1074          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
   1075 
   1076   const MCInstrDesc &Read2Desc = TII->get(Opc);
   1077 
   1078   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
   1079   Register DestReg = MRI->createVirtualRegister(SuperRC);
   1080 
   1081   DebugLoc DL = CI.I->getDebugLoc();
   1082 
   1083   Register BaseReg = AddrReg->getReg();
   1084   unsigned BaseSubReg = AddrReg->getSubReg();
   1085   unsigned BaseRegFlags = 0;
   1086   if (CI.BaseOff) {
   1087     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
   1088     BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
   1089         .addImm(CI.BaseOff);
   1090 
   1091     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   1092     BaseRegFlags = RegState::Kill;
   1093 
   1094     TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
   1095         .addReg(ImmReg)
   1096         .addReg(AddrReg->getReg(), 0, BaseSubReg)
   1097         .addImm(0); // clamp bit
   1098     BaseSubReg = 0;
   1099   }
   1100 
   1101   MachineInstrBuilder Read2 =
   1102       BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
   1103           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
   1104           .addImm(NewOffset0)                        // offset0
   1105           .addImm(NewOffset1)                        // offset1
   1106           .addImm(0)                                 // gds
   1107           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
   1108 
   1109   (void)Read2;
   1110 
   1111   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
   1112 
   1113   // Copy to the old destination registers.
   1114   BuildMI(*MBB, Paired.I, DL, CopyDesc)
   1115       .add(*Dest0) // Copy to same destination including flags and sub reg.
   1116       .addReg(DestReg, 0, SubRegIdx0);
   1117   MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
   1118                             .add(*Dest1)
   1119                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
   1120 
   1121   moveInstsAfter(Copy1, InstsToMove);
   1122 
   1123   CI.I->eraseFromParent();
   1124   Paired.I->eraseFromParent();
   1125 
   1126   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
   1127   return Read2;
   1128 }
   1129 
   1130 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
   1131   if (STM->ldsRequiresM0Init())
   1132     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
   1133   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
   1134                         : AMDGPU::DS_WRITE2_B64_gfx9;
   1135 }
   1136 
   1137 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
   1138   if (STM->ldsRequiresM0Init())
   1139     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
   1140                           : AMDGPU::DS_WRITE2ST64_B64;
   1141 
   1142   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
   1143                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
   1144 }
   1145 
   1146 MachineBasicBlock::iterator
   1147 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
   1148                                       const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   1149   MachineBasicBlock *MBB = CI.I->getParent();
   1150 
   1151   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
   1152   // sure we preserve the subregister index and any register flags set on them.
   1153   const MachineOperand *AddrReg =
   1154       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
   1155   const MachineOperand *Data0 =
   1156       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
   1157   const MachineOperand *Data1 =
   1158       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
   1159 
   1160   unsigned NewOffset0 = CI.Offset;
   1161   unsigned NewOffset1 = Paired.Offset;
   1162   unsigned Opc =
   1163       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
   1164 
   1165   if (NewOffset0 > NewOffset1) {
   1166     // Canonicalize the merged instruction so the smaller offset comes first.
   1167     std::swap(NewOffset0, NewOffset1);
   1168     std::swap(Data0, Data1);
   1169   }
   1170 
   1171   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
   1172          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
   1173 
   1174   const MCInstrDesc &Write2Desc = TII->get(Opc);
   1175   DebugLoc DL = CI.I->getDebugLoc();
   1176 
   1177   Register BaseReg = AddrReg->getReg();
   1178   unsigned BaseSubReg = AddrReg->getSubReg();
   1179   unsigned BaseRegFlags = 0;
   1180   if (CI.BaseOff) {
   1181     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
   1182     BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
   1183         .addImm(CI.BaseOff);
   1184 
   1185     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   1186     BaseRegFlags = RegState::Kill;
   1187 
   1188     TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
   1189         .addReg(ImmReg)
   1190         .addReg(AddrReg->getReg(), 0, BaseSubReg)
   1191         .addImm(0); // clamp bit
   1192     BaseSubReg = 0;
   1193   }
   1194 
   1195   MachineInstrBuilder Write2 =
   1196       BuildMI(*MBB, Paired.I, DL, Write2Desc)
   1197           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
   1198           .add(*Data0)                               // data0
   1199           .add(*Data1)                               // data1
   1200           .addImm(NewOffset0)                        // offset0
   1201           .addImm(NewOffset1)                        // offset1
   1202           .addImm(0)                                 // gds
   1203           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
   1204 
   1205   moveInstsAfter(Write2, InstsToMove);
   1206 
   1207   CI.I->eraseFromParent();
   1208   Paired.I->eraseFromParent();
   1209 
   1210   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
   1211   return Write2;
   1212 }
   1213 
   1214 MachineBasicBlock::iterator
   1215 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
   1216                            const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   1217   MachineBasicBlock *MBB = CI.I->getParent();
   1218   DebugLoc DL = CI.I->getDebugLoc();
   1219   const unsigned Opcode = getNewOpcode(CI, Paired);
   1220 
   1221   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
   1222 
   1223   Register DestReg = MRI->createVirtualRegister(SuperRC);
   1224   unsigned MergedDMask = CI.DMask | Paired.DMask;
   1225   unsigned DMaskIdx =
   1226       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
   1227 
   1228   auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
   1229   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
   1230     if (I == DMaskIdx)
   1231       MIB.addImm(MergedDMask);
   1232     else
   1233       MIB.add((*CI.I).getOperand(I));
   1234   }
   1235 
   1236   // It shouldn't be possible to get this far if the two instructions
   1237   // don't have a single memoperand, because MachineInstr::mayAlias()
   1238   // will return true if this is the case.
   1239   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
   1240 
   1241   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   1242   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
   1243 
   1244   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
   1245 
   1246   unsigned SubRegIdx0, SubRegIdx1;
   1247   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
   1248 
   1249   // Copy to the old destination registers.
   1250   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
   1251   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
   1252   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
   1253 
   1254   BuildMI(*MBB, Paired.I, DL, CopyDesc)
   1255       .add(*Dest0) // Copy to same destination including flags and sub reg.
   1256       .addReg(DestReg, 0, SubRegIdx0);
   1257   MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
   1258                             .add(*Dest1)
   1259                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
   1260 
   1261   moveInstsAfter(Copy1, InstsToMove);
   1262 
   1263   CI.I->eraseFromParent();
   1264   Paired.I->eraseFromParent();
   1265   return New;
   1266 }
   1267 
   1268 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
   1269     CombineInfo &CI, CombineInfo &Paired,
   1270     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   1271   MachineBasicBlock *MBB = CI.I->getParent();
   1272   DebugLoc DL = CI.I->getDebugLoc();
   1273   const unsigned Opcode = getNewOpcode(CI, Paired);
   1274 
   1275   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
   1276 
   1277   Register DestReg = MRI->createVirtualRegister(SuperRC);
   1278   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
   1279 
   1280   // It shouldn't be possible to get this far if the two instructions
   1281   // don't have a single memoperand, because MachineInstr::mayAlias()
   1282   // will return true if this is the case.
   1283   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
   1284 
   1285   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   1286   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
   1287 
   1288   MachineInstr *New =
   1289     BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
   1290         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
   1291         .addImm(MergedOffset) // offset
   1292         .addImm(CI.CPol)      // cpol
   1293         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
   1294 
   1295   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
   1296   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
   1297   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
   1298 
   1299   // Copy to the old destination registers.
   1300   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
   1301   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
   1302   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
   1303 
   1304   BuildMI(*MBB, Paired.I, DL, CopyDesc)
   1305       .add(*Dest0) // Copy to same destination including flags and sub reg.
   1306       .addReg(DestReg, 0, SubRegIdx0);
   1307   MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
   1308                             .add(*Dest1)
   1309                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
   1310 
   1311   moveInstsAfter(Copy1, InstsToMove);
   1312 
   1313   CI.I->eraseFromParent();
   1314   Paired.I->eraseFromParent();
   1315   return New;
   1316 }
   1317 
   1318 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
   1319     CombineInfo &CI, CombineInfo &Paired,
   1320     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   1321   MachineBasicBlock *MBB = CI.I->getParent();
   1322   DebugLoc DL = CI.I->getDebugLoc();
   1323 
   1324   const unsigned Opcode = getNewOpcode(CI, Paired);
   1325 
   1326   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
   1327 
   1328   // Copy to the new source register.
   1329   Register DestReg = MRI->createVirtualRegister(SuperRC);
   1330   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
   1331 
   1332   auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
   1333 
   1334   AddressRegs Regs = getRegs(Opcode, *TII);
   1335 
   1336   if (Regs.VAddr)
   1337     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
   1338 
   1339   // It shouldn't be possible to get this far if the two instructions
   1340   // don't have a single memoperand, because MachineInstr::mayAlias()
   1341   // will return true if this is the case.
   1342   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
   1343 
   1344   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   1345   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
   1346 
   1347   MachineInstr *New =
   1348     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
   1349         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
   1350         .addImm(MergedOffset) // offset
   1351         .addImm(CI.CPol)      // cpol
   1352         .addImm(0)            // tfe
   1353         .addImm(0)            // swz
   1354         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
   1355 
   1356   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
   1357   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
   1358   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
   1359 
   1360   // Copy to the old destination registers.
   1361   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
   1362   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
   1363   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
   1364 
   1365   BuildMI(*MBB, Paired.I, DL, CopyDesc)
   1366       .add(*Dest0) // Copy to same destination including flags and sub reg.
   1367       .addReg(DestReg, 0, SubRegIdx0);
   1368   MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
   1369                             .add(*Dest1)
   1370                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
   1371 
   1372   moveInstsAfter(Copy1, InstsToMove);
   1373 
   1374   CI.I->eraseFromParent();
   1375   Paired.I->eraseFromParent();
   1376   return New;
   1377 }
   1378 
   1379 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
   1380     CombineInfo &CI, CombineInfo &Paired,
   1381     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   1382   MachineBasicBlock *MBB = CI.I->getParent();
   1383   DebugLoc DL = CI.I->getDebugLoc();
   1384 
   1385   const unsigned Opcode = getNewOpcode(CI, Paired);
   1386 
   1387   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
   1388 
   1389   // Copy to the new source register.
   1390   Register DestReg = MRI->createVirtualRegister(SuperRC);
   1391   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
   1392 
   1393   auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
   1394 
   1395   AddressRegs Regs = getRegs(Opcode, *TII);
   1396 
   1397   if (Regs.VAddr)
   1398     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
   1399 
   1400   unsigned JoinedFormat =
   1401       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
   1402 
   1403   // It shouldn't be possible to get this far if the two instructions
   1404   // don't have a single memoperand, because MachineInstr::mayAlias()
   1405   // will return true if this is the case.
   1406   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
   1407 
   1408   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   1409   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
   1410 
   1411   MachineInstr *New =
   1412       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
   1413           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
   1414           .addImm(MergedOffset) // offset
   1415           .addImm(JoinedFormat) // format
   1416           .addImm(CI.CPol)      // cpol
   1417           .addImm(0)            // tfe
   1418           .addImm(0)            // swz
   1419           .addMemOperand(
   1420               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
   1421 
   1422   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
   1423   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
   1424   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
   1425 
   1426   // Copy to the old destination registers.
   1427   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
   1428   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
   1429   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
   1430 
   1431   BuildMI(*MBB, Paired.I, DL, CopyDesc)
   1432       .add(*Dest0) // Copy to same destination including flags and sub reg.
   1433       .addReg(DestReg, 0, SubRegIdx0);
   1434   MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
   1435                             .add(*Dest1)
   1436                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
   1437 
   1438   moveInstsAfter(Copy1, InstsToMove);
   1439 
   1440   CI.I->eraseFromParent();
   1441   Paired.I->eraseFromParent();
   1442   return New;
   1443 }
   1444 
   1445 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
   1446     CombineInfo &CI, CombineInfo &Paired,
   1447     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   1448   MachineBasicBlock *MBB = CI.I->getParent();
   1449   DebugLoc DL = CI.I->getDebugLoc();
   1450 
   1451   const unsigned Opcode = getNewOpcode(CI, Paired);
   1452 
   1453   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
   1454   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
   1455   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
   1456 
   1457   // Copy to the new source register.
   1458   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
   1459   Register SrcReg = MRI->createVirtualRegister(SuperRC);
   1460 
   1461   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
   1462   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
   1463 
   1464   BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
   1465       .add(*Src0)
   1466       .addImm(SubRegIdx0)
   1467       .add(*Src1)
   1468       .addImm(SubRegIdx1);
   1469 
   1470   auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
   1471                  .addReg(SrcReg, RegState::Kill);
   1472 
   1473   AddressRegs Regs = getRegs(Opcode, *TII);
   1474 
   1475   if (Regs.VAddr)
   1476     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
   1477 
   1478   unsigned JoinedFormat =
   1479       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
   1480 
   1481   // It shouldn't be possible to get this far if the two instructions
   1482   // don't have a single memoperand, because MachineInstr::mayAlias()
   1483   // will return true if this is the case.
   1484   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
   1485 
   1486   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   1487   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
   1488 
   1489   MachineInstr *New =
   1490       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
   1491           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
   1492           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
   1493           .addImm(JoinedFormat)                     // format
   1494           .addImm(CI.CPol)                          // cpol
   1495           .addImm(0)                                // tfe
   1496           .addImm(0)                                // swz
   1497           .addMemOperand(
   1498               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
   1499 
   1500   moveInstsAfter(MIB, InstsToMove);
   1501 
   1502   CI.I->eraseFromParent();
   1503   Paired.I->eraseFromParent();
   1504   return New;
   1505 }
   1506 
   1507 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
   1508                                             const CombineInfo &Paired) {
   1509   const unsigned Width = CI.Width + Paired.Width;
   1510 
   1511   switch (CI.InstClass) {
   1512   default:
   1513     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
   1514     // FIXME: Handle d16 correctly
   1515     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
   1516                                   Width);
   1517   case TBUFFER_LOAD:
   1518   case TBUFFER_STORE:
   1519     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
   1520                                   Width);
   1521 
   1522   case UNKNOWN:
   1523     llvm_unreachable("Unknown instruction class");
   1524   case S_BUFFER_LOAD_IMM:
   1525     switch (Width) {
   1526     default:
   1527       return 0;
   1528     case 2:
   1529       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
   1530     case 4:
   1531       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
   1532     }
   1533   case MIMG:
   1534     assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width));
   1535     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
   1536   }
   1537 }
   1538 
   1539 std::pair<unsigned, unsigned>
   1540 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) {
   1541 
   1542   if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
   1543     return std::make_pair(0, 0);
   1544 
   1545   bool ReverseOrder;
   1546   if (CI.InstClass == MIMG) {
   1547     assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
   1548            "No overlaps");
   1549     ReverseOrder = CI.DMask > Paired.DMask;
   1550   } else
   1551     ReverseOrder = CI.Offset > Paired.Offset;
   1552 
   1553   static const unsigned Idxs[4][4] = {
   1554       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
   1555       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
   1556       {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
   1557       {AMDGPU::sub3, 0, 0, 0},
   1558   };
   1559   unsigned Idx0;
   1560   unsigned Idx1;
   1561 
   1562   assert(CI.Width >= 1 && CI.Width <= 3);
   1563   assert(Paired.Width >= 1 && Paired.Width <= 3);
   1564 
   1565   if (ReverseOrder) {
   1566     Idx1 = Idxs[0][Paired.Width - 1];
   1567     Idx0 = Idxs[Paired.Width][CI.Width - 1];
   1568   } else {
   1569     Idx0 = Idxs[0][CI.Width - 1];
   1570     Idx1 = Idxs[CI.Width][Paired.Width - 1];
   1571   }
   1572 
   1573   return std::make_pair(Idx0, Idx1);
   1574 }
   1575 
   1576 const TargetRegisterClass *
   1577 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
   1578                                              const CombineInfo &Paired) {
   1579   if (CI.InstClass == S_BUFFER_LOAD_IMM) {
   1580     switch (CI.Width + Paired.Width) {
   1581     default:
   1582       return nullptr;
   1583     case 2:
   1584       return &AMDGPU::SReg_64_XEXECRegClass;
   1585     case 4:
   1586       return &AMDGPU::SGPR_128RegClass;
   1587     case 8:
   1588       return &AMDGPU::SGPR_256RegClass;
   1589     case 16:
   1590       return &AMDGPU::SGPR_512RegClass;
   1591     }
   1592   }
   1593 
   1594   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
   1595   return TRI->hasAGPRs(getDataRegClass(*CI.I))
   1596              ? TRI->getAGPRClassForBitWidth(BitWidth)
   1597              : TRI->getVGPRClassForBitWidth(BitWidth);
   1598 }
   1599 
   1600 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
   1601     CombineInfo &CI, CombineInfo &Paired,
   1602     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   1603   MachineBasicBlock *MBB = CI.I->getParent();
   1604   DebugLoc DL = CI.I->getDebugLoc();
   1605 
   1606   const unsigned Opcode = getNewOpcode(CI, Paired);
   1607 
   1608   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
   1609   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
   1610   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
   1611 
   1612   // Copy to the new source register.
   1613   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
   1614   Register SrcReg = MRI->createVirtualRegister(SuperRC);
   1615 
   1616   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
   1617   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
   1618 
   1619   BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
   1620       .add(*Src0)
   1621       .addImm(SubRegIdx0)
   1622       .add(*Src1)
   1623       .addImm(SubRegIdx1);
   1624 
   1625   auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
   1626                  .addReg(SrcReg, RegState::Kill);
   1627 
   1628   AddressRegs Regs = getRegs(Opcode, *TII);
   1629 
   1630   if (Regs.VAddr)
   1631     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
   1632 
   1633 
   1634   // It shouldn't be possible to get this far if the two instructions
   1635   // don't have a single memoperand, because MachineInstr::mayAlias()
   1636   // will return true if this is the case.
   1637   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
   1638 
   1639   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   1640   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
   1641 
   1642   MachineInstr *New =
   1643     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
   1644         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
   1645         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
   1646         .addImm(CI.CPol)      // cpol
   1647         .addImm(0)            // tfe
   1648         .addImm(0)            // swz
   1649         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
   1650 
   1651   moveInstsAfter(MIB, InstsToMove);
   1652 
   1653   CI.I->eraseFromParent();
   1654   Paired.I->eraseFromParent();
   1655   return New;
   1656 }
   1657 
   1658 MachineOperand
   1659 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
   1660   APInt V(32, Val, true);
   1661   if (TII->isInlineConstant(V))
   1662     return MachineOperand::CreateImm(Val);
   1663 
   1664   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
   1665   MachineInstr *Mov =
   1666   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
   1667           TII->get(AMDGPU::S_MOV_B32), Reg)
   1668     .addImm(Val);
   1669   (void)Mov;
   1670   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
   1671   return MachineOperand::CreateReg(Reg, false);
   1672 }
   1673 
   1674 // Compute base address using Addr and return the final register.
   1675 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
   1676                                            const MemAddress &Addr) const {
   1677   MachineBasicBlock *MBB = MI.getParent();
   1678   MachineBasicBlock::iterator MBBI = MI.getIterator();
   1679   DebugLoc DL = MI.getDebugLoc();
   1680 
   1681   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
   1682           Addr.Base.LoSubReg) &&
   1683          "Expected 32-bit Base-Register-Low!!");
   1684 
   1685   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
   1686           Addr.Base.HiSubReg) &&
   1687          "Expected 32-bit Base-Register-Hi!!");
   1688 
   1689   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
   1690   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
   1691   MachineOperand OffsetHi =
   1692     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
   1693 
   1694   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
   1695   Register CarryReg = MRI->createVirtualRegister(CarryRC);
   1696   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
   1697 
   1698   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   1699   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   1700   MachineInstr *LoHalf =
   1701     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
   1702       .addReg(CarryReg, RegState::Define)
   1703       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
   1704       .add(OffsetLo)
   1705       .addImm(0); // clamp bit
   1706   (void)LoHalf;
   1707   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
   1708 
   1709   MachineInstr *HiHalf =
   1710   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
   1711     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
   1712     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
   1713     .add(OffsetHi)
   1714     .addReg(CarryReg, RegState::Kill)
   1715     .addImm(0); // clamp bit
   1716   (void)HiHalf;
   1717   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
   1718 
   1719   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
   1720   MachineInstr *FullBase =
   1721     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
   1722       .addReg(DestSub0)
   1723       .addImm(AMDGPU::sub0)
   1724       .addReg(DestSub1)
   1725       .addImm(AMDGPU::sub1);
   1726   (void)FullBase;
   1727   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
   1728 
   1729   return FullDestReg;
   1730 }
   1731 
   1732 // Update base and offset with the NewBase and NewOffset in MI.
   1733 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
   1734                                                Register NewBase,
   1735                                                int32_t NewOffset) const {
   1736   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
   1737   Base->setReg(NewBase);
   1738   Base->setIsKill(false);
   1739   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
   1740 }
   1741 
   1742 Optional<int32_t>
   1743 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
   1744   if (Op.isImm())
   1745     return Op.getImm();
   1746 
   1747   if (!Op.isReg())
   1748     return None;
   1749 
   1750   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
   1751   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
   1752       !Def->getOperand(1).isImm())
   1753     return None;
   1754 
   1755   return Def->getOperand(1).getImm();
   1756 }
   1757 
   1758 // Analyze Base and extracts:
   1759 //  - 32bit base registers, subregisters
   1760 //  - 64bit constant offset
   1761 // Expecting base computation as:
   1762 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
   1763 //   %LO:vgpr_32, %c:sreg_64_xexec =
   1764 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
   1765 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
   1766 //   %Base:vreg_64 =
   1767 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
   1768 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
   1769                                                       MemAddress &Addr) const {
   1770   if (!Base.isReg())
   1771     return;
   1772 
   1773   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
   1774   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
   1775       || Def->getNumOperands() != 5)
   1776     return;
   1777 
   1778   MachineOperand BaseLo = Def->getOperand(1);
   1779   MachineOperand BaseHi = Def->getOperand(3);
   1780   if (!BaseLo.isReg() || !BaseHi.isReg())
   1781     return;
   1782 
   1783   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
   1784   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
   1785 
   1786   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
   1787       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
   1788     return;
   1789 
   1790   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
   1791   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
   1792 
   1793   auto Offset0P = extractConstOffset(*Src0);
   1794   if (Offset0P)
   1795     BaseLo = *Src1;
   1796   else {
   1797     if (!(Offset0P = extractConstOffset(*Src1)))
   1798       return;
   1799     BaseLo = *Src0;
   1800   }
   1801 
   1802   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
   1803   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
   1804 
   1805   if (Src0->isImm())
   1806     std::swap(Src0, Src1);
   1807 
   1808   if (!Src1->isImm())
   1809     return;
   1810 
   1811   uint64_t Offset1 = Src1->getImm();
   1812   BaseHi = *Src0;
   1813 
   1814   Addr.Base.LoReg = BaseLo.getReg();
   1815   Addr.Base.HiReg = BaseHi.getReg();
   1816   Addr.Base.LoSubReg = BaseLo.getSubReg();
   1817   Addr.Base.HiSubReg = BaseHi.getSubReg();
   1818   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
   1819 }
   1820 
   1821 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
   1822     MachineInstr &MI,
   1823     MemInfoMap &Visited,
   1824     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
   1825 
   1826   if (!(MI.mayLoad() ^ MI.mayStore()))
   1827     return false;
   1828 
   1829   // TODO: Support flat and scratch.
   1830   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
   1831     return false;
   1832 
   1833   if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
   1834     return false;
   1835 
   1836   if (AnchorList.count(&MI))
   1837     return false;
   1838 
   1839   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
   1840 
   1841   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
   1842     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
   1843     return false;
   1844   }
   1845 
   1846   // Step1: Find the base-registers and a 64bit constant offset.
   1847   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
   1848   MemAddress MAddr;
   1849   if (Visited.find(&MI) == Visited.end()) {
   1850     processBaseWithConstOffset(Base, MAddr);
   1851     Visited[&MI] = MAddr;
   1852   } else
   1853     MAddr = Visited[&MI];
   1854 
   1855   if (MAddr.Offset == 0) {
   1856     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
   1857                          " constant offsets that can be promoted.\n";);
   1858     return false;
   1859   }
   1860 
   1861   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
   1862              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
   1863 
   1864   // Step2: Traverse through MI's basic block and find an anchor(that has the
   1865   // same base-registers) with the highest 13bit distance from MI's offset.
   1866   // E.g. (64bit loads)
   1867   // bb:
   1868   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
   1869   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
   1870   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
   1871   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
   1872   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
   1873   //
   1874   // Starting from the first load, the optimization will try to find a new base
   1875   // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
   1876   // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
   1877   // as the new-base(anchor) because of the maximum distance which can
   1878   // accomodate more intermediate bases presumeably.
   1879   //
   1880   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
   1881   // (&a + 8192) for load1, load2, load4.
   1882   //   addr = &a + 8192
   1883   //   load1 = load(addr,       -4096)
   1884   //   load2 = load(addr,       -2048)
   1885   //   load3 = load(addr,       0)
   1886   //   load4 = load(addr,       2048)
   1887   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
   1888   //
   1889   MachineInstr *AnchorInst = nullptr;
   1890   MemAddress AnchorAddr;
   1891   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
   1892   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
   1893 
   1894   MachineBasicBlock *MBB = MI.getParent();
   1895   MachineBasicBlock::iterator E = MBB->end();
   1896   MachineBasicBlock::iterator MBBI = MI.getIterator();
   1897   ++MBBI;
   1898   const SITargetLowering *TLI =
   1899     static_cast<const SITargetLowering *>(STM->getTargetLowering());
   1900 
   1901   for ( ; MBBI != E; ++MBBI) {
   1902     MachineInstr &MINext = *MBBI;
   1903     // TODO: Support finding an anchor(with same base) from store addresses or
   1904     // any other load addresses where the opcodes are different.
   1905     if (MINext.getOpcode() != MI.getOpcode() ||
   1906         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
   1907       continue;
   1908 
   1909     const MachineOperand &BaseNext =
   1910       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
   1911     MemAddress MAddrNext;
   1912     if (Visited.find(&MINext) == Visited.end()) {
   1913       processBaseWithConstOffset(BaseNext, MAddrNext);
   1914       Visited[&MINext] = MAddrNext;
   1915     } else
   1916       MAddrNext = Visited[&MINext];
   1917 
   1918     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
   1919         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
   1920         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
   1921         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
   1922       continue;
   1923 
   1924     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
   1925 
   1926     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
   1927     TargetLoweringBase::AddrMode AM;
   1928     AM.HasBaseReg = true;
   1929     AM.BaseOffs = Dist;
   1930     if (TLI->isLegalGlobalAddressingMode(AM) &&
   1931         (uint32_t)std::abs(Dist) > MaxDist) {
   1932       MaxDist = std::abs(Dist);
   1933 
   1934       AnchorAddr = MAddrNext;
   1935       AnchorInst = &MINext;
   1936     }
   1937   }
   1938 
   1939   if (AnchorInst) {
   1940     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
   1941                AnchorInst->dump());
   1942     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
   1943                <<  AnchorAddr.Offset << "\n\n");
   1944 
   1945     // Instead of moving up, just re-compute anchor-instruction's base address.
   1946     Register Base = computeBase(MI, AnchorAddr);
   1947 
   1948     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
   1949     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
   1950 
   1951     for (auto P : InstsWCommonBase) {
   1952       TargetLoweringBase::AddrMode AM;
   1953       AM.HasBaseReg = true;
   1954       AM.BaseOffs = P.second - AnchorAddr.Offset;
   1955 
   1956       if (TLI->isLegalGlobalAddressingMode(AM)) {
   1957         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
   1958                    dbgs() << ")"; P.first->dump());
   1959         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
   1960         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
   1961       }
   1962     }
   1963     AnchorList.insert(AnchorInst);
   1964     return true;
   1965   }
   1966 
   1967   return false;
   1968 }
   1969 
   1970 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
   1971                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
   1972   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
   1973     if (AddrList.front().InstClass == CI.InstClass &&
   1974         AddrList.front().hasSameBaseAddress(*CI.I)) {
   1975       AddrList.emplace_back(CI);
   1976       return;
   1977     }
   1978   }
   1979 
   1980   // Base address not found, so add a new list.
   1981   MergeableInsts.emplace_back(1, CI);
   1982 }
   1983 
   1984 std::pair<MachineBasicBlock::iterator, bool>
   1985 SILoadStoreOptimizer::collectMergeableInsts(
   1986     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
   1987     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
   1988     std::list<std::list<CombineInfo>> &MergeableInsts) const {
   1989   bool Modified = false;
   1990 
   1991   // Sort potential mergeable instructions into lists.  One list per base address.
   1992   unsigned Order = 0;
   1993   MachineBasicBlock::iterator BlockI = Begin;
   1994   for (; BlockI != End; ++BlockI) {
   1995     MachineInstr &MI = *BlockI;
   1996 
   1997     // We run this before checking if an address is mergeable, because it can produce
   1998     // better code even if the instructions aren't mergeable.
   1999     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
   2000       Modified = true;
   2001 
   2002     // Don't combine if volatile. We also won't be able to merge across this, so
   2003     // break the search. We can look after this barrier for separate merges.
   2004     if (MI.hasOrderedMemoryRef()) {
   2005       LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);
   2006 
   2007       // Search will resume after this instruction in a separate merge list.
   2008       ++BlockI;
   2009       break;
   2010     }
   2011 
   2012     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
   2013     if (InstClass == UNKNOWN)
   2014       continue;
   2015 
   2016     CombineInfo CI;
   2017     CI.setMI(MI, *TII, *STM);
   2018     CI.Order = Order++;
   2019 
   2020     if (!CI.hasMergeableAddress(*MRI))
   2021       continue;
   2022 
   2023     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
   2024 
   2025     addInstToMergeableList(CI, MergeableInsts);
   2026   }
   2027 
   2028   // At this point we have lists of Mergeable instructions.
   2029   //
   2030   // Part 2: Sort lists by offset and then for each CombineInfo object in the
   2031   // list try to find an instruction that can be merged with I.  If an instruction
   2032   // is found, it is stored in the Paired field.  If no instructions are found, then
   2033   // the CombineInfo object is deleted from the list.
   2034 
   2035   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
   2036                                                    E = MergeableInsts.end(); I != E;) {
   2037 
   2038     std::list<CombineInfo> &MergeList = *I;
   2039     if (MergeList.size() <= 1) {
   2040       // This means we have found only one instruction with a given address
   2041       // that can be merged, and we need at least 2 instructions to do a merge,
   2042       // so this list can be discarded.
   2043       I = MergeableInsts.erase(I);
   2044       continue;
   2045     }
   2046 
   2047     // Sort the lists by offsets, this way mergeable instructions will be
   2048     // adjacent to each other in the list, which will make it easier to find
   2049     // matches.
   2050     MergeList.sort(
   2051         [] (const CombineInfo &A, CombineInfo &B) {
   2052           return A.Offset < B.Offset;
   2053         });
   2054     ++I;
   2055   }
   2056 
   2057   return std::make_pair(BlockI, Modified);
   2058 }
   2059 
   2060 // Scan through looking for adjacent LDS operations with constant offsets from
   2061 // the same base register. We rely on the scheduler to do the hard work of
   2062 // clustering nearby loads, and assume these are all adjacent.
   2063 bool SILoadStoreOptimizer::optimizeBlock(
   2064                        std::list<std::list<CombineInfo> > &MergeableInsts) {
   2065   bool Modified = false;
   2066 
   2067   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
   2068                                                    E = MergeableInsts.end(); I != E;) {
   2069     std::list<CombineInfo> &MergeList = *I;
   2070 
   2071     bool OptimizeListAgain = false;
   2072     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
   2073       // We weren't able to make any changes, so delete the list so we don't
   2074       // process the same instructions the next time we try to optimize this
   2075       // block.
   2076       I = MergeableInsts.erase(I);
   2077       continue;
   2078     }
   2079 
   2080     Modified = true;
   2081 
   2082     // We made changes, but also determined that there were no more optimization
   2083     // opportunities, so we don't need to reprocess the list
   2084     if (!OptimizeListAgain) {
   2085       I = MergeableInsts.erase(I);
   2086       continue;
   2087     }
   2088     OptimizeAgain = true;
   2089   }
   2090   return Modified;
   2091 }
   2092 
   2093 bool
   2094 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
   2095                                           std::list<CombineInfo> &MergeList,
   2096                                           bool &OptimizeListAgain) {
   2097   if (MergeList.empty())
   2098     return false;
   2099 
   2100   bool Modified = false;
   2101 
   2102   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
   2103        Next = std::next(I)) {
   2104 
   2105     auto First = I;
   2106     auto Second = Next;
   2107 
   2108     if ((*First).Order > (*Second).Order)
   2109       std::swap(First, Second);
   2110     CombineInfo &CI = *First;
   2111     CombineInfo &Paired = *Second;
   2112 
   2113     SmallVector<MachineInstr *, 8> InstsToMove;
   2114     if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
   2115       ++I;
   2116       continue;
   2117     }
   2118 
   2119     Modified = true;
   2120 
   2121     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
   2122 
   2123     switch (CI.InstClass) {
   2124     default:
   2125       llvm_unreachable("unknown InstClass");
   2126       break;
   2127     case DS_READ: {
   2128       MachineBasicBlock::iterator NewMI =
   2129           mergeRead2Pair(CI, Paired, InstsToMove);
   2130       CI.setMI(NewMI, *TII, *STM);
   2131       break;
   2132     }
   2133     case DS_WRITE: {
   2134       MachineBasicBlock::iterator NewMI =
   2135           mergeWrite2Pair(CI, Paired, InstsToMove);
   2136       CI.setMI(NewMI, *TII, *STM);
   2137       break;
   2138     }
   2139     case S_BUFFER_LOAD_IMM: {
   2140       MachineBasicBlock::iterator NewMI =
   2141           mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
   2142       CI.setMI(NewMI, *TII, *STM);
   2143       OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
   2144       break;
   2145     }
   2146     case BUFFER_LOAD: {
   2147       MachineBasicBlock::iterator NewMI =
   2148           mergeBufferLoadPair(CI, Paired, InstsToMove);
   2149       CI.setMI(NewMI, *TII, *STM);
   2150       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
   2151       break;
   2152     }
   2153     case BUFFER_STORE: {
   2154       MachineBasicBlock::iterator NewMI =
   2155           mergeBufferStorePair(CI, Paired, InstsToMove);
   2156       CI.setMI(NewMI, *TII, *STM);
   2157       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
   2158       break;
   2159     }
   2160     case MIMG: {
   2161       MachineBasicBlock::iterator NewMI =
   2162           mergeImagePair(CI, Paired, InstsToMove);
   2163       CI.setMI(NewMI, *TII, *STM);
   2164       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
   2165       break;
   2166     }
   2167     case TBUFFER_LOAD: {
   2168       MachineBasicBlock::iterator NewMI =
   2169           mergeTBufferLoadPair(CI, Paired, InstsToMove);
   2170       CI.setMI(NewMI, *TII, *STM);
   2171       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
   2172       break;
   2173     }
   2174     case TBUFFER_STORE: {
   2175       MachineBasicBlock::iterator NewMI =
   2176           mergeTBufferStorePair(CI, Paired, InstsToMove);
   2177       CI.setMI(NewMI, *TII, *STM);
   2178       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
   2179       break;
   2180     }
   2181     }
   2182     CI.Order = Paired.Order;
   2183     if (I == Second)
   2184       I = Next;
   2185 
   2186     MergeList.erase(Second);
   2187   }
   2188 
   2189   return Modified;
   2190 }
   2191 
   2192 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
   2193   if (skipFunction(MF.getFunction()))
   2194     return false;
   2195 
   2196   STM = &MF.getSubtarget<GCNSubtarget>();
   2197   if (!STM->loadStoreOptEnabled())
   2198     return false;
   2199 
   2200   TII = STM->getInstrInfo();
   2201   TRI = &TII->getRegisterInfo();
   2202 
   2203   MRI = &MF.getRegInfo();
   2204   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   2205 
   2206   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
   2207 
   2208   bool Modified = false;
   2209 
   2210   // Contains the list of instructions for which constant offsets are being
   2211   // promoted to the IMM. This is tracked for an entire block at time.
   2212   SmallPtrSet<MachineInstr *, 4> AnchorList;
   2213   MemInfoMap Visited;
   2214 
   2215   for (MachineBasicBlock &MBB : MF) {
   2216     MachineBasicBlock::iterator SectionEnd;
   2217     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
   2218          I = SectionEnd) {
   2219       bool CollectModified;
   2220       std::list<std::list<CombineInfo>> MergeableInsts;
   2221 
   2222       // First pass: Collect list of all instructions we know how to merge in a
   2223       // subset of the block.
   2224       std::tie(SectionEnd, CollectModified) =
   2225           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
   2226 
   2227       Modified |= CollectModified;
   2228 
   2229       do {
   2230         OptimizeAgain = false;
   2231         Modified |= optimizeBlock(MergeableInsts);
   2232       } while (OptimizeAgain);
   2233     }
   2234 
   2235     Visited.clear();
   2236     AnchorList.clear();
   2237   }
   2238 
   2239   return Modified;
   2240 }
   2241