Home | History | Annotate | Line # | Download | only in AMDGPU
      1 //===-- SIPreEmitPeephole.cpp ------------------------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 /// \file
     10 /// This pass performs the peephole optimizations before code emission.
     11 ///
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "AMDGPU.h"
     15 #include "GCNSubtarget.h"
     16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
     17 #include "llvm/CodeGen/MachineFunctionPass.h"
     18 
     19 using namespace llvm;
     20 
     21 #define DEBUG_TYPE "si-pre-emit-peephole"
     22 
     23 static unsigned SkipThreshold;
     24 
     25 static cl::opt<unsigned, true> SkipThresholdFlag(
     26     "amdgpu-skip-threshold", cl::Hidden,
     27     cl::desc(
     28         "Number of instructions before jumping over divergent control flow"),
     29     cl::location(SkipThreshold), cl::init(12));
     30 
     31 namespace {
     32 
     33 class SIPreEmitPeephole : public MachineFunctionPass {
     34 private:
     35   const SIInstrInfo *TII = nullptr;
     36   const SIRegisterInfo *TRI = nullptr;
     37 
     38   bool optimizeVccBranch(MachineInstr &MI) const;
     39   bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
     40   bool getBlockDestinations(MachineBasicBlock &SrcMBB,
     41                             MachineBasicBlock *&TrueMBB,
     42                             MachineBasicBlock *&FalseMBB,
     43                             SmallVectorImpl<MachineOperand> &Cond);
     44   bool mustRetainExeczBranch(const MachineBasicBlock &From,
     45                              const MachineBasicBlock &To) const;
     46   bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
     47 
     48 public:
     49   static char ID;
     50 
     51   SIPreEmitPeephole() : MachineFunctionPass(ID) {
     52     initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
     53   }
     54 
     55   bool runOnMachineFunction(MachineFunction &MF) override;
     56 };
     57 
     58 } // End anonymous namespace.
     59 
     60 INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
     61                 "SI peephole optimizations", false, false)
     62 
     63 char SIPreEmitPeephole::ID = 0;
     64 
     65 char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
     66 
     67 bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
     68   // Match:
     69   // sreg = -1 or 0
     70   // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
     71   // S_CBRANCH_VCC[N]Z
     72   // =>
     73   // S_CBRANCH_EXEC[N]Z
     74   // We end up with this pattern sometimes after basic block placement.
     75   // It happens while combining a block which assigns -1 or 0 to a saved mask
     76   // and another block which consumes that saved mask and then a branch.
     77   bool Changed = false;
     78   MachineBasicBlock &MBB = *MI.getParent();
     79   const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
     80   const bool IsWave32 = ST.isWave32();
     81   const unsigned CondReg = TRI->getVCC();
     82   const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     83   const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
     84   const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
     85   const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     86 
     87   MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
     88                                       E = MBB.rend();
     89   bool ReadsCond = false;
     90   unsigned Threshold = 5;
     91   for (++A; A != E; ++A) {
     92     if (!--Threshold)
     93       return false;
     94     if (A->modifiesRegister(ExecReg, TRI))
     95       return false;
     96     if (A->modifiesRegister(CondReg, TRI)) {
     97       if (!A->definesRegister(CondReg, TRI) ||
     98           (A->getOpcode() != And && A->getOpcode() != AndN2))
     99         return false;
    100       break;
    101     }
    102     ReadsCond |= A->readsRegister(CondReg, TRI);
    103   }
    104   if (A == E)
    105     return false;
    106 
    107   MachineOperand &Op1 = A->getOperand(1);
    108   MachineOperand &Op2 = A->getOperand(2);
    109   if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    110     TII->commuteInstruction(*A);
    111     Changed = true;
    112   }
    113   if (Op1.getReg() != ExecReg)
    114     return Changed;
    115   if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    116     return Changed;
    117 
    118   int64_t MaskValue = 0;
    119   Register SReg;
    120   if (Op2.isReg()) {
    121     SReg = Op2.getReg();
    122     auto M = std::next(A);
    123     bool ReadsSreg = false;
    124     for (; M != E; ++M) {
    125       if (M->definesRegister(SReg, TRI))
    126         break;
    127       if (M->modifiesRegister(SReg, TRI))
    128         return Changed;
    129       ReadsSreg |= M->readsRegister(SReg, TRI);
    130     }
    131     if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
    132         (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
    133       return Changed;
    134     MaskValue = M->getOperand(1).getImm();
    135     // First if sreg is only used in the AND instruction fold the immediate
    136     // into into the AND.
    137     if (!ReadsSreg && Op2.isKill()) {
    138       A->getOperand(2).ChangeToImmediate(MaskValue);
    139       M->eraseFromParent();
    140     }
    141   } else if (Op2.isImm()) {
    142     MaskValue = Op2.getImm();
    143   } else {
    144     llvm_unreachable("Op2 must be register or immediate");
    145   }
    146 
    147   // Invert mask for s_andn2
    148   assert(MaskValue == 0 || MaskValue == -1);
    149   if (A->getOpcode() == AndN2)
    150     MaskValue = ~MaskValue;
    151 
    152   if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
    153     if (!MI.killsRegister(CondReg, TRI)) {
    154       // Replace AND with MOV
    155       if (MaskValue == 0) {
    156         BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
    157             .addImm(0);
    158       } else {
    159         BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
    160             .addReg(ExecReg);
    161       }
    162     }
    163     // Remove AND instruction
    164     A->eraseFromParent();
    165   }
    166 
    167   bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
    168   if (SReg == ExecReg) {
    169     // EXEC is updated directly
    170     if (IsVCCZ) {
    171       MI.eraseFromParent();
    172       return true;
    173     }
    174     MI.setDesc(TII->get(AMDGPU::S_BRANCH));
    175   } else if (IsVCCZ && MaskValue == 0) {
    176     // Will always branch
    177     // Remove all succesors shadowed by new unconditional branch
    178     MachineBasicBlock *Parent = MI.getParent();
    179     SmallVector<MachineInstr *, 4> ToRemove;
    180     bool Found = false;
    181     for (MachineInstr &Term : Parent->terminators()) {
    182       if (Found) {
    183         if (Term.isBranch())
    184           ToRemove.push_back(&Term);
    185       } else {
    186         Found = Term.isIdenticalTo(MI);
    187       }
    188     }
    189     assert(Found && "conditional branch is not terminator");
    190     for (auto BranchMI : ToRemove) {
    191       MachineOperand &Dst = BranchMI->getOperand(0);
    192       assert(Dst.isMBB() && "destination is not basic block");
    193       Parent->removeSuccessor(Dst.getMBB());
    194       BranchMI->eraseFromParent();
    195     }
    196 
    197     if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
    198       Parent->removeSuccessor(Succ);
    199     }
    200 
    201     // Rewrite to unconditional branch
    202     MI.setDesc(TII->get(AMDGPU::S_BRANCH));
    203   } else if (!IsVCCZ && MaskValue == 0) {
    204     // Will never branch
    205     MachineOperand &Dst = MI.getOperand(0);
    206     assert(Dst.isMBB() && "destination is not basic block");
    207     MI.getParent()->removeSuccessor(Dst.getMBB());
    208     MI.eraseFromParent();
    209     return true;
    210   } else if (MaskValue == -1) {
    211     // Depends only on EXEC
    212     MI.setDesc(
    213         TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
    214   }
    215 
    216   MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
    217   MI.addImplicitDefUseOperands(*MBB.getParent());
    218 
    219   return true;
    220 }
    221 
    222 bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
    223                                        MachineInstr &MI) const {
    224   MachineBasicBlock &MBB = *MI.getParent();
    225   const MachineFunction &MF = *MBB.getParent();
    226   const MachineRegisterInfo &MRI = MF.getRegInfo();
    227   MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    228   Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
    229   SmallVector<MachineInstr *, 4> ToRemove;
    230   bool IdxOn = true;
    231 
    232   if (!MI.isIdenticalTo(First))
    233     return false;
    234 
    235   // Scan back to find an identical S_SET_GPR_IDX_ON
    236   for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
    237                                          E = MI.getIterator();
    238        I != E; ++I) {
    239     if (I->isBundle())
    240       continue;
    241     switch (I->getOpcode()) {
    242     case AMDGPU::S_SET_GPR_IDX_MODE:
    243       return false;
    244     case AMDGPU::S_SET_GPR_IDX_OFF:
    245       IdxOn = false;
    246       ToRemove.push_back(&*I);
    247       break;
    248     default:
    249       if (I->modifiesRegister(AMDGPU::M0, TRI))
    250         return false;
    251       if (IdxReg && I->modifiesRegister(IdxReg, TRI))
    252         return false;
    253       if (llvm::any_of(I->operands(),
    254                        [&MRI, this](const MachineOperand &MO) {
    255                          return MO.isReg() &&
    256                                 TRI->isVectorRegister(MRI, MO.getReg());
    257                        })) {
    258         // The only exception allowed here is another indirect vector move
    259         // with the same mode.
    260         if (!IdxOn ||
    261             !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
    262                I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
    263               I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
    264           return false;
    265       }
    266     }
    267   }
    268 
    269   MI.eraseFromBundle();
    270   for (MachineInstr *RI : ToRemove)
    271     RI->eraseFromBundle();
    272   return true;
    273 }
    274 
    275 bool SIPreEmitPeephole::getBlockDestinations(
    276     MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    277     MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
    278   if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    279     return false;
    280 
    281   if (!FalseMBB)
    282     FalseMBB = SrcMBB.getNextNode();
    283 
    284   return true;
    285 }
    286 
    287 bool SIPreEmitPeephole::mustRetainExeczBranch(
    288     const MachineBasicBlock &From, const MachineBasicBlock &To) const {
    289   unsigned NumInstr = 0;
    290   const MachineFunction *MF = From.getParent();
    291 
    292   for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
    293        MBBI != End && MBBI != ToI; ++MBBI) {
    294     const MachineBasicBlock &MBB = *MBBI;
    295 
    296     for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
    297          I != E; ++I) {
    298       // When a uniform loop is inside non-uniform control flow, the branch
    299       // leaving the loop might never be taken when EXEC = 0.
    300       // Hence we should retain cbranch out of the loop lest it become infinite.
    301       if (I->isConditionalBranch())
    302         return true;
    303 
    304       if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
    305         return true;
    306 
    307       // These instructions are potentially expensive even if EXEC = 0.
    308       if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
    309           TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
    310         return true;
    311 
    312       ++NumInstr;
    313       if (NumInstr >= SkipThreshold)
    314         return true;
    315     }
    316   }
    317 
    318   return false;
    319 }
    320 
    321 // Returns true if the skip branch instruction is removed.
    322 bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
    323                                           MachineBasicBlock &SrcMBB) {
    324   MachineBasicBlock *TrueMBB = nullptr;
    325   MachineBasicBlock *FalseMBB = nullptr;
    326   SmallVector<MachineOperand, 1> Cond;
    327 
    328   if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    329     return false;
    330 
    331   // Consider only the forward branches.
    332   if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
    333       mustRetainExeczBranch(*FalseMBB, *TrueMBB))
    334     return false;
    335 
    336   LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
    337   MI.eraseFromParent();
    338   SrcMBB.removeSuccessor(TrueMBB);
    339 
    340   return true;
    341 }
    342 
    343 bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
    344   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    345   TII = ST.getInstrInfo();
    346   TRI = &TII->getRegisterInfo();
    347   bool Changed = false;
    348 
    349   MF.RenumberBlocks();
    350 
    351   for (MachineBasicBlock &MBB : MF) {
    352     MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    353     // Check first terminator for branches to optimize
    354     if (TermI != MBB.end()) {
    355       MachineInstr &MI = *TermI;
    356       switch (MI.getOpcode()) {
    357       case AMDGPU::S_CBRANCH_VCCZ:
    358       case AMDGPU::S_CBRANCH_VCCNZ:
    359         Changed |= optimizeVccBranch(MI);
    360         break;
    361       case AMDGPU::S_CBRANCH_EXECZ:
    362         Changed |= removeExeczBranch(MI, MBB);
    363         break;
    364       }
    365     }
    366 
    367     if (!ST.hasVGPRIndexMode())
    368       continue;
    369 
    370     MachineInstr *SetGPRMI = nullptr;
    371     const unsigned Threshold = 20;
    372     unsigned Count = 0;
    373     // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    374     // second is not needed. Do expensive checks in the optimizeSetGPR()
    375     // and limit the distance to 20 instructions for compile time purposes.
    376     // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
    377     // may be bundled with the instructions they modify.
    378     for (auto &MI :
    379          make_early_inc_range(make_range(MBB.instr_begin(), MBB.instr_end()))) {
    380       if (Count == Threshold)
    381         SetGPRMI = nullptr;
    382       else
    383         ++Count;
    384 
    385       if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
    386         continue;
    387 
    388       Count = 0;
    389       if (!SetGPRMI) {
    390         SetGPRMI = &MI;
    391         continue;
    392       }
    393 
    394       if (optimizeSetGPR(*SetGPRMI, MI))
    395         Changed = true;
    396       else
    397         SetGPRMI = &MI;
    398     }
    399   }
    400 
    401   return Changed;
    402 }
    403