Home | History | Annotate | Line # | Download | only in CodeGen
      1 //===- ModuloSchedule.h - Software pipeline schedule expansion ------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // Software pipelining (SWP) is an instruction scheduling technique for loops
     10 // that overlaps loop iterations and exploits ILP via compiler transformations.
     11 //
     12 // There are multiple methods for analyzing a loop and creating a schedule.
     13 // An example algorithm is Swing Modulo Scheduling (implemented by the
     14 // MachinePipeliner). The details of how a schedule is arrived at are irrelevant
     15 // for the task of actually rewriting a loop to adhere to the schedule, which
     16 // is what this file does.
     17 //
     18 // A schedule is, for every instruction in a block, a Cycle and a Stage. Note
     19 // that we only support single-block loops, so "block" and "loop" can be used
     20 // interchangably.
     21 //
     22 // The Cycle of an instruction defines a partial order of the instructions in
     23 // the remapped loop. Instructions within a cycle must not consume the output
     24 // of any instruction in the same cycle. Cycle information is assumed to have
     25 // been calculated such that the processor will execute instructions in
     26 // lock-step (for example in a VLIW ISA).
     27 //
     28 // The Stage of an instruction defines the mapping between logical loop
     29 // iterations and pipelined loop iterations. An example (unrolled) pipeline
     30 // may look something like:
     31 //
     32 //  I0[0]                      Execute instruction I0 of iteration 0
     33 //  I1[0], I0[1]               Execute I0 of iteration 1 and I1 of iteration 1
     34 //         I1[1], I0[2]
     35 //                I1[2], I0[3]
     36 //
     37 // In the schedule for this unrolled sequence we would say that I0 was scheduled
     38 // in stage 0 and I1 in stage 1:
     39 //
     40 //  loop:
     41 //    [stage 0] x = I0
     42 //    [stage 1] I1 x (from stage 0)
     43 //
     44 // And to actually generate valid code we must insert a phi:
     45 //
     46 //  loop:
     47 //    x' = phi(x)
     48 //    x = I0
     49 //    I1 x'
     50 //
     51 // This is a simple example; the rules for how to generate correct code given
     52 // an arbitrary schedule containing loop-carried values are complex.
     53 //
     54 // Note that these examples only mention the steady-state kernel of the
     55 // generated loop; prologs and epilogs must be generated also that prime and
     56 // flush the pipeline. Doing so is nontrivial.
     57 //
     58 //===----------------------------------------------------------------------===//
     59 
     60 #ifndef LLVM_CODEGEN_MODULOSCHEDULE_H
     61 #define LLVM_CODEGEN_MODULOSCHEDULE_H
     62 
     63 #include "llvm/CodeGen/MachineFunction.h"
     64 #include "llvm/CodeGen/MachineLoopInfo.h"
     65 #include "llvm/CodeGen/MachineLoopUtils.h"
     66 #include "llvm/CodeGen/TargetInstrInfo.h"
     67 #include "llvm/CodeGen/TargetSubtargetInfo.h"
     68 #include <deque>
     69 #include <vector>
     70 
     71 namespace llvm {
     72 class MachineBasicBlock;
     73 class MachineInstr;
     74 class LiveIntervals;
     75 
     76 /// Represents a schedule for a single-block loop. For every instruction we
     77 /// maintain a Cycle and Stage.
     78 class ModuloSchedule {
     79 private:
     80   /// The block containing the loop instructions.
     81   MachineLoop *Loop;
     82 
     83   /// The instructions to be generated, in total order. Cycle provides a partial
     84   /// order; the total order within cycles has been decided by the schedule
     85   /// producer.
     86   std::vector<MachineInstr *> ScheduledInstrs;
     87 
     88   /// The cycle for each instruction.
     89   DenseMap<MachineInstr *, int> Cycle;
     90 
     91   /// The stage for each instruction.
     92   DenseMap<MachineInstr *, int> Stage;
     93 
     94   /// The number of stages in this schedule (Max(Stage) + 1).
     95   int NumStages;
     96 
     97 public:
     98   /// Create a new ModuloSchedule.
     99   /// \arg ScheduledInstrs The new loop instructions, in total resequenced
    100   ///    order.
    101   /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does
    102   ///    not need to start at zero. ScheduledInstrs must be partially ordered by
    103   ///    Cycle.
    104   /// \arg Stage Stage index for all instructions in ScheduleInstrs.
    105   ModuloSchedule(MachineFunction &MF, MachineLoop *Loop,
    106                  std::vector<MachineInstr *> ScheduledInstrs,
    107                  DenseMap<MachineInstr *, int> Cycle,
    108                  DenseMap<MachineInstr *, int> Stage)
    109       : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)),
    110         Stage(std::move(Stage)) {
    111     NumStages = 0;
    112     for (auto &KV : this->Stage)
    113       NumStages = std::max(NumStages, KV.second);
    114     ++NumStages;
    115   }
    116 
    117   /// Return the single-block loop being scheduled.
    118   MachineLoop *getLoop() const { return Loop; }
    119 
    120   /// Return the number of stages contained in this schedule, which is the
    121   /// largest stage index + 1.
    122   int getNumStages() const { return NumStages; }
    123 
    124   /// Return the first cycle in the schedule, which is the cycle index of the
    125   /// first instruction.
    126   int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; }
    127 
    128   /// Return the final cycle in the schedule, which is the cycle index of the
    129   /// last instruction.
    130   int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; }
    131 
    132   /// Return the stage that MI is scheduled in, or -1.
    133   int getStage(MachineInstr *MI) {
    134     auto I = Stage.find(MI);
    135     return I == Stage.end() ? -1 : I->second;
    136   }
    137 
    138   /// Return the cycle that MI is scheduled at, or -1.
    139   int getCycle(MachineInstr *MI) {
    140     auto I = Cycle.find(MI);
    141     return I == Cycle.end() ? -1 : I->second;
    142   }
    143 
    144   /// Set the stage of a newly created instruction.
    145   void setStage(MachineInstr *MI, int MIStage) {
    146     assert(Stage.count(MI) == 0);
    147     Stage[MI] = MIStage;
    148   }
    149 
    150   /// Return the rescheduled instructions in order.
    151   ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; }
    152 
    153   void dump() { print(dbgs()); }
    154   void print(raw_ostream &OS);
    155 };
    156 
    157 /// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place,
    158 /// rewriting the old loop and inserting prologs and epilogs as required.
    159 class ModuloScheduleExpander {
    160 public:
    161   using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>;
    162 
    163 private:
    164   using ValueMapTy = DenseMap<unsigned, unsigned>;
    165   using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
    166   using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
    167 
    168   ModuloSchedule &Schedule;
    169   MachineFunction &MF;
    170   const TargetSubtargetInfo &ST;
    171   MachineRegisterInfo &MRI;
    172   const TargetInstrInfo *TII;
    173   LiveIntervals &LIS;
    174 
    175   MachineBasicBlock *BB;
    176   MachineBasicBlock *Preheader;
    177   MachineBasicBlock *NewKernel = nullptr;
    178   std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
    179 
    180   /// Map for each register and the max difference between its uses and def.
    181   /// The first element in the pair is the max difference in stages. The
    182   /// second is true if the register defines a Phi value and loop value is
    183   /// scheduled before the Phi.
    184   std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
    185 
    186   /// Instructions to change when emitting the final schedule.
    187   InstrChangesTy InstrChanges;
    188 
    189   void generatePipelinedLoop();
    190   void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
    191                       ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
    192   void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
    193                       ValueMapTy *VRMap, MBBVectorTy &EpilogBBs,
    194                       MBBVectorTy &PrologBBs);
    195   void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
    196                             MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
    197                             ValueMapTy *VRMap, InstrMapTy &InstrMap,
    198                             unsigned LastStageNum, unsigned CurStageNum,
    199                             bool IsLast);
    200   void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
    201                     MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
    202                     ValueMapTy *VRMap, InstrMapTy &InstrMap,
    203                     unsigned LastStageNum, unsigned CurStageNum, bool IsLast);
    204   void removeDeadInstructions(MachineBasicBlock *KernelBB,
    205                               MBBVectorTy &EpilogBBs);
    206   void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs);
    207   void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
    208                    MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
    209                    ValueMapTy *VRMap);
    210   bool computeDelta(MachineInstr &MI, unsigned &Delta);
    211   void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
    212                          unsigned Num);
    213   MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
    214                            unsigned InstStageNum);
    215   MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
    216                                     unsigned InstStageNum);
    217   void updateInstruction(MachineInstr *NewMI, bool LastDef,
    218                          unsigned CurStageNum, unsigned InstrStageNum,
    219                          ValueMapTy *VRMap);
    220   MachineInstr *findDefInLoop(unsigned Reg);
    221   unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
    222                          unsigned LoopStage, ValueMapTy *VRMap,
    223                          MachineBasicBlock *BB);
    224   void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
    225                         ValueMapTy *VRMap, InstrMapTy &InstrMap);
    226   void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap,
    227                              unsigned CurStageNum, unsigned PhiNum,
    228                              MachineInstr *Phi, unsigned OldReg,
    229                              unsigned NewReg, unsigned PrevReg = 0);
    230   bool isLoopCarried(MachineInstr &Phi);
    231 
    232   /// Return the max. number of stages/iterations that can occur between a
    233   /// register definition and its uses.
    234   unsigned getStagesForReg(int Reg, unsigned CurStage) {
    235     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
    236     if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 &&
    237         Stages.second)
    238       return 1;
    239     return Stages.first;
    240   }
    241 
    242   /// The number of stages for a Phi is a little different than other
    243   /// instructions. The minimum value computed in RegToStageDiff is 1
    244   /// because we assume the Phi is needed for at least 1 iteration.
    245   /// This is not the case if the loop value is scheduled prior to the
    246   /// Phi in the same stage.  This function returns the number of stages
    247   /// or iterations needed between the Phi definition and any uses.
    248   unsigned getStagesForPhi(int Reg) {
    249     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
    250     if (Stages.second)
    251       return Stages.first;
    252     return Stages.first - 1;
    253   }
    254 
    255 public:
    256   /// Create a new ModuloScheduleExpander.
    257   /// \arg InstrChanges Modifications to make to instructions with memory
    258   ///   operands.
    259   /// FIXME: InstrChanges is opaque and is an implementation detail of an
    260   ///   optimization in MachinePipeliner that crosses abstraction boundaries.
    261   ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
    262                          LiveIntervals &LIS, InstrChangesTy InstrChanges)
    263       : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
    264         TII(ST.getInstrInfo()), LIS(LIS),
    265         InstrChanges(std::move(InstrChanges)) {}
    266 
    267   /// Performs the actual expansion.
    268   void expand();
    269   /// Performs final cleanup after expansion.
    270   void cleanup();
    271 
    272   /// Returns the newly rewritten kernel block, or nullptr if this was
    273   /// optimized away.
    274   MachineBasicBlock *getRewrittenKernel() { return NewKernel; }
    275 };
    276 
    277 /// A reimplementation of ModuloScheduleExpander. It works by generating a
    278 /// standalone kernel loop and peeling out the prologs and epilogs.
    279 class PeelingModuloScheduleExpander {
    280 public:
    281   PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
    282                                 LiveIntervals *LIS)
    283       : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
    284         TII(ST.getInstrInfo()), LIS(LIS) {}
    285 
    286   void expand();
    287 
    288   /// Runs ModuloScheduleExpander and treats it as a golden input to validate
    289   /// aspects of the code generated by PeelingModuloScheduleExpander.
    290   void validateAgainstModuloScheduleExpander();
    291 
    292 protected:
    293   ModuloSchedule &Schedule;
    294   MachineFunction &MF;
    295   const TargetSubtargetInfo &ST;
    296   MachineRegisterInfo &MRI;
    297   const TargetInstrInfo *TII;
    298   LiveIntervals *LIS;
    299 
    300   /// The original loop block that gets rewritten in-place.
    301   MachineBasicBlock *BB;
    302   /// The original loop preheader.
    303   MachineBasicBlock *Preheader;
    304   /// All prolog and epilog blocks.
    305   SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs;
    306   /// For every block, the stages that are produced.
    307   DenseMap<MachineBasicBlock *, BitVector> LiveStages;
    308   /// For every block, the stages that are available. A stage can be available
    309   /// but not produced (in the epilog) or produced but not available (in the
    310   /// prolog).
    311   DenseMap<MachineBasicBlock *, BitVector> AvailableStages;
    312   /// When peeling the epilogue keep track of the distance between the phi
    313   /// nodes and the kernel.
    314   DenseMap<MachineInstr *, unsigned> PhiNodeLoopIteration;
    315 
    316   /// CanonicalMIs and BlockMIs form a bidirectional map between any of the
    317   /// loop kernel clones.
    318   DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs;
    319   DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *>
    320       BlockMIs;
    321 
    322   /// State passed from peelKernel to peelPrologAndEpilogs().
    323   std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
    324   /// Illegal phis that need to be deleted once we re-link stages.
    325   SmallVector<MachineInstr *, 4> IllegalPhisToDelete;
    326 
    327   /// Converts BB from the original loop body to the rewritten, pipelined
    328   /// steady-state.
    329   void rewriteKernel();
    330 
    331   /// Peels one iteration of the rewritten kernel (BB) in the specified
    332   /// direction.
    333   MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
    334   // Delete instructions whose stage is less than MinStage in the given basic
    335   // block.
    336   void filterInstructions(MachineBasicBlock *MB, int MinStage);
    337   // Move instructions of the given stage from sourceBB to DestBB. Remap the phi
    338   // instructions to keep a valid IR.
    339   void moveStageBetweenBlocks(MachineBasicBlock *DestBB,
    340                               MachineBasicBlock *SourceBB, unsigned Stage);
    341   /// Peel the kernel forwards and backwards to produce prologs and epilogs,
    342   /// and stitch them together.
    343   void peelPrologAndEpilogs();
    344   /// All prolog and epilog blocks are clones of the kernel, so any produced
    345   /// register in one block has an corollary in all other blocks.
    346   Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB);
    347   /// Change all users of MI, if MI is predicated out
    348   /// (LiveStages[MI->getParent()] == false).
    349   void rewriteUsesOf(MachineInstr *MI);
    350   /// Insert branches between prologs, kernel and epilogs.
    351   void fixupBranches();
    352   /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block
    353   /// to a block dominated by all prologs and epilogs. This allows us to treat
    354   /// the loop exiting block as any other kernel clone.
    355   MachineBasicBlock *CreateLCSSAExitingBlock();
    356   /// Helper to get the stage of an instruction in the schedule.
    357   unsigned getStage(MachineInstr *MI) {
    358     if (CanonicalMIs.count(MI))
    359       MI = CanonicalMIs[MI];
    360     return Schedule.getStage(MI);
    361   }
    362   /// Helper function to find the right canonical register for a phi instruction
    363   /// coming from a peeled out prologue.
    364   Register getPhiCanonicalReg(MachineInstr* CanonicalPhi, MachineInstr* Phi);
    365   /// Target loop info before kernel peeling.
    366   std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
    367 };
    368 
    369 /// Expander that simply annotates each scheduled instruction with a post-instr
    370 /// symbol that can be consumed by the ModuloScheduleTest pass.
    371 ///
    372 /// The post-instr symbol is a way of annotating an instruction that can be
    373 /// roundtripped in MIR. The syntax is:
    374 ///   MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5>
    375 class ModuloScheduleTestAnnotater {
    376   MachineFunction &MF;
    377   ModuloSchedule &S;
    378 
    379 public:
    380   ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S)
    381       : MF(MF), S(S) {}
    382 
    383   /// Performs the annotation.
    384   void annotate();
    385 };
    386 
    387 } // end namespace llvm
    388 
    389 #endif // LLVM_CODEGEN_MODULOSCHEDULE_H
    390