Home | History | Annotate | Line # | Download | only in AMDGPU
      1 //===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 /// \file
     10 /// SI Machine Scheduler interface
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
     15 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
     16 
     17 #include "llvm/CodeGen/MachineScheduler.h"
     18 #include "llvm/CodeGen/RegisterPressure.h"
     19 #include "llvm/CodeGen/ScheduleDAG.h"
     20 #include <cstdint>
     21 #include <set>
     22 #include <vector>
     23 
     24 namespace llvm {
     25 
     26 class SIInstrInfo;
     27 class SIRegisterInfo;
     28 
     29 enum SIScheduleCandReason {
     30   NoCand,
     31   RegUsage,
     32   Latency,
     33   Successor,
     34   Depth,
     35   NodeOrder
     36 };
     37 
     38 struct SISchedulerCandidate {
     39   // The reason for this candidate.
     40   SIScheduleCandReason Reason = NoCand;
     41 
     42   // Set of reasons that apply to multiple candidates.
     43   uint32_t RepeatReasonSet = 0;
     44 
     45   SISchedulerCandidate() = default;
     46 
     47   bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
     48   void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
     49 };
     50 
     51 class SIScheduleDAGMI;
     52 class SIScheduleBlockCreator;
     53 
     54 enum SIScheduleBlockLinkKind {
     55   NoData,
     56   Data
     57 };
     58 
     59 class SIScheduleBlock {
     60   SIScheduleDAGMI *DAG;
     61   SIScheduleBlockCreator *BC;
     62 
     63   std::vector<SUnit*> SUnits;
     64   std::map<unsigned, unsigned> NodeNum2Index;
     65   std::vector<SUnit*> TopReadySUs;
     66   std::vector<SUnit*> ScheduledSUnits;
     67 
     68   /// The top of the unscheduled zone.
     69   IntervalPressure TopPressure;
     70   RegPressureTracker TopRPTracker;
     71 
     72   // Pressure: number of said class of registers needed to
     73   // store the live virtual and real registers.
     74   // We do care only of SGPR32 and VGPR32 and do track only virtual registers.
     75   // Pressure of additional registers required inside the block.
     76   std::vector<unsigned> InternalAdditionnalPressure;
     77   // Pressure of input and output registers
     78   std::vector<unsigned> LiveInPressure;
     79   std::vector<unsigned> LiveOutPressure;
     80   // Registers required by the block, and outputs.
     81   // We do track only virtual registers.
     82   // Note that some registers are not 32 bits,
     83   // and thus the pressure is not equal
     84   // to the number of live registers.
     85   std::set<unsigned> LiveInRegs;
     86   std::set<unsigned> LiveOutRegs;
     87 
     88   bool Scheduled = false;
     89   bool HighLatencyBlock = false;
     90 
     91   std::vector<unsigned> HasLowLatencyNonWaitedParent;
     92 
     93   // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
     94   unsigned ID;
     95 
     96   std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
     97   // All blocks successors, and the kind of link
     98   std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
     99   unsigned NumHighLatencySuccessors = 0;
    100 
    101 public:
    102   SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
    103                   unsigned ID):
    104     DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {}
    105 
    106   ~SIScheduleBlock() = default;
    107 
    108   unsigned getID() const { return ID; }
    109 
    110   /// Functions for Block construction.
    111   void addUnit(SUnit *SU);
    112 
    113   // When all SUs have been added.
    114   void finalizeUnits();
    115 
    116   // Add block pred, which has instruction predecessor of SU.
    117   void addPred(SIScheduleBlock *Pred);
    118   void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);
    119 
    120   const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
    121   ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
    122     getSuccs() const { return Succs; }
    123 
    124   unsigned Height;  // Maximum topdown path length to block without outputs
    125   unsigned Depth;   // Maximum bottomup path length to block without inputs
    126 
    127   unsigned getNumHighLatencySuccessors() const {
    128     return NumHighLatencySuccessors;
    129   }
    130 
    131   bool isHighLatencyBlock() { return HighLatencyBlock; }
    132 
    133   // This is approximative.
    134   // Ideally should take into accounts some instructions (rcp, etc)
    135   // are 4 times slower.
    136   int getCost() { return SUnits.size(); }
    137 
    138   // The block Predecessors and Successors must be all registered
    139   // before fastSchedule().
    140   // Fast schedule with no particular requirement.
    141   void fastSchedule();
    142 
    143   std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
    144 
    145   // Complete schedule that will try to minimize reg pressure and
    146   // low latencies, and will fill liveins and liveouts.
    147   // Needs all MIs to be grouped between BeginBlock and EndBlock.
    148   // The MIs can be moved after the scheduling,
    149   // it is just used to allow correct track of live registers.
    150   void schedule(MachineBasicBlock::iterator BeginBlock,
    151                 MachineBasicBlock::iterator EndBlock);
    152 
    153   bool isScheduled() { return Scheduled; }
    154 
    155   // Needs the block to be scheduled inside
    156   // TODO: find a way to compute it.
    157   std::vector<unsigned> &getInternalAdditionnalRegUsage() {
    158     return InternalAdditionnalPressure;
    159   }
    160 
    161   std::set<unsigned> &getInRegs() { return LiveInRegs; }
    162   std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
    163 
    164   void printDebug(bool Full);
    165 
    166 private:
    167   struct SISchedCandidate : SISchedulerCandidate {
    168     // The best SUnit candidate.
    169     SUnit *SU = nullptr;
    170 
    171     unsigned SGPRUsage;
    172     unsigned VGPRUsage;
    173     bool IsLowLatency;
    174     unsigned LowLatencyOffset;
    175     bool HasLowLatencyNonWaitedParent;
    176 
    177     SISchedCandidate() = default;
    178 
    179     bool isValid() const { return SU; }
    180 
    181     // Copy the status of another candidate without changing policy.
    182     void setBest(SISchedCandidate &Best) {
    183       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
    184       SU = Best.SU;
    185       Reason = Best.Reason;
    186       SGPRUsage = Best.SGPRUsage;
    187       VGPRUsage = Best.VGPRUsage;
    188       IsLowLatency = Best.IsLowLatency;
    189       LowLatencyOffset = Best.LowLatencyOffset;
    190       HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
    191     }
    192   };
    193 
    194   void undoSchedule();
    195 
    196   void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
    197   void releaseSucc(SUnit *SU, SDep *SuccEdge);
    198   // InOrOutBlock: restrict to links pointing inside the block (true),
    199   // or restrict to links pointing outside the block (false).
    200   void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
    201 
    202   void nodeScheduled(SUnit *SU);
    203   void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
    204   void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
    205   SUnit* pickNode();
    206   void traceCandidate(const SISchedCandidate &Cand);
    207   void initRegPressure(MachineBasicBlock::iterator BeginBlock,
    208                        MachineBasicBlock::iterator EndBlock);
    209 };
    210 
    211 struct SIScheduleBlocks {
    212   std::vector<SIScheduleBlock*> Blocks;
    213   std::vector<int> TopDownIndex2Block;
    214   std::vector<int> TopDownBlock2Index;
    215 };
    216 
    217 enum SISchedulerBlockCreatorVariant {
    218   LatenciesAlone,
    219   LatenciesGrouped,
    220   LatenciesAlonePlusConsecutive
    221 };
    222 
    223 class SIScheduleBlockCreator {
    224   SIScheduleDAGMI *DAG;
    225   // unique_ptr handles freeing memory for us.
    226   std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
    227   std::map<SISchedulerBlockCreatorVariant,
    228            SIScheduleBlocks> Blocks;
    229   std::vector<SIScheduleBlock*> CurrentBlocks;
    230   std::vector<int> Node2CurrentBlock;
    231 
    232   // Topological sort
    233   // Maps topological index to the node number.
    234   std::vector<int> TopDownIndex2Block;
    235   std::vector<int> TopDownBlock2Index;
    236   std::vector<int> BottomUpIndex2Block;
    237 
    238   // 0 -> Color not given.
    239   // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
    240   // Above -> Other groups.
    241   int NextReservedID;
    242   int NextNonReservedID;
    243   std::vector<int> CurrentColoring;
    244   std::vector<int> CurrentTopDownReservedDependencyColoring;
    245   std::vector<int> CurrentBottomUpReservedDependencyColoring;
    246 
    247 public:
    248   SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
    249 
    250   SIScheduleBlocks
    251   getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
    252 
    253   bool isSUInBlock(SUnit *SU, unsigned ID);
    254 
    255 private:
    256   // Give a Reserved color to every high latency.
    257   void colorHighLatenciesAlone();
    258 
    259   // Create groups of high latencies with a Reserved color.
    260   void colorHighLatenciesGroups();
    261 
    262   // Compute coloring for topdown and bottom traversals with
    263   // different colors depending on dependencies on Reserved colors.
    264   void colorComputeReservedDependencies();
    265 
    266   // Give color to all non-colored SUs according to Reserved groups dependencies.
    267   void colorAccordingToReservedDependencies();
    268 
    269   // Divides Blocks having no bottom up or top down dependencies on Reserved groups.
    270   // The new colors are computed according to the dependencies on the other blocks
    271   // formed with colorAccordingToReservedDependencies.
    272   void colorEndsAccordingToDependencies();
    273 
    274   // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
    275   void colorForceConsecutiveOrderInGroup();
    276 
    277   // Merge Constant loads that have all their users into another group to the group.
    278   // (TODO: else if all their users depend on the same group, put them there)
    279   void colorMergeConstantLoadsNextGroup();
    280 
    281   // Merge SUs that have all their users into another group to the group
    282   void colorMergeIfPossibleNextGroup();
    283 
    284   // Merge SUs that have all their users into another group to the group,
    285   // but only for Reserved groups.
    286   void colorMergeIfPossibleNextGroupOnlyForReserved();
    287 
    288   // Merge SUs that have all their users into another group to the group,
    289   // but only if the group is no more than a few SUs.
    290   void colorMergeIfPossibleSmallGroupsToNextGroup();
    291 
    292   // Divides Blocks with important size.
    293   // Idea of implementation: attribute new colors depending on topdown and
    294   // bottom up links to other blocks.
    295   void cutHugeBlocks();
    296 
    297   // Put in one group all instructions with no users in this scheduling region
    298   // (we'd want these groups be at the end).
    299   void regroupNoUserInstructions();
    300 
    301   // Give Reserved color to export instructions
    302   void colorExports();
    303 
    304   void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
    305 
    306   void topologicalSort();
    307 
    308   void scheduleInsideBlocks();
    309 
    310   void fillStats();
    311 };
    312 
    313 enum SISchedulerBlockSchedulerVariant {
    314   BlockLatencyRegUsage,
    315   BlockRegUsageLatency,
    316   BlockRegUsage
    317 };
    318 
    319 class SIScheduleBlockScheduler {
    320   SIScheduleDAGMI *DAG;
    321   SISchedulerBlockSchedulerVariant Variant;
    322   std::vector<SIScheduleBlock*> Blocks;
    323 
    324   std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
    325   std::set<unsigned> LiveRegs;
    326   // Num of schedulable unscheduled blocks reading the register.
    327   std::map<unsigned, unsigned> LiveRegsConsumers;
    328 
    329   std::vector<unsigned> LastPosHighLatencyParentScheduled;
    330   int LastPosWaitedHighLatency;
    331 
    332   std::vector<SIScheduleBlock*> BlocksScheduled;
    333   unsigned NumBlockScheduled;
    334   std::vector<SIScheduleBlock*> ReadyBlocks;
    335 
    336   unsigned VregCurrentUsage;
    337   unsigned SregCurrentUsage;
    338 
    339   // Currently is only approximation.
    340   unsigned maxVregUsage;
    341   unsigned maxSregUsage;
    342 
    343   std::vector<unsigned> BlockNumPredsLeft;
    344   std::vector<unsigned> BlockNumSuccsLeft;
    345 
    346 public:
    347   SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
    348                            SISchedulerBlockSchedulerVariant Variant,
    349                            SIScheduleBlocks BlocksStruct);
    350   ~SIScheduleBlockScheduler() = default;
    351 
    352   std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
    353 
    354   unsigned getVGPRUsage() { return maxVregUsage; }
    355   unsigned getSGPRUsage() { return maxSregUsage; }
    356 
    357 private:
    358   struct SIBlockSchedCandidate : SISchedulerCandidate {
    359     // The best Block candidate.
    360     SIScheduleBlock *Block = nullptr;
    361 
    362     bool IsHighLatency;
    363     int VGPRUsageDiff;
    364     unsigned NumSuccessors;
    365     unsigned NumHighLatencySuccessors;
    366     unsigned LastPosHighLatParentScheduled;
    367     unsigned Height;
    368 
    369     SIBlockSchedCandidate() = default;
    370 
    371     bool isValid() const { return Block; }
    372 
    373     // Copy the status of another candidate without changing policy.
    374     void setBest(SIBlockSchedCandidate &Best) {
    375       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
    376       Block = Best.Block;
    377       Reason = Best.Reason;
    378       IsHighLatency = Best.IsHighLatency;
    379       VGPRUsageDiff = Best.VGPRUsageDiff;
    380       NumSuccessors = Best.NumSuccessors;
    381       NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
    382       LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
    383       Height = Best.Height;
    384     }
    385   };
    386 
    387   bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
    388                            SIBlockSchedCandidate &TryCand);
    389   bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
    390                             SIBlockSchedCandidate &TryCand);
    391   SIScheduleBlock *pickBlock();
    392 
    393   void addLiveRegs(std::set<unsigned> &Regs);
    394   void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
    395   void releaseBlockSuccs(SIScheduleBlock *Parent);
    396   void blockScheduled(SIScheduleBlock *Block);
    397 
    398   // Check register pressure change
    399   // by scheduling a block with these LiveIn and LiveOut.
    400   std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
    401                                        std::set<unsigned> &OutRegs);
    402 
    403   void schedule();
    404 };
    405 
    406 struct SIScheduleBlockResult {
    407   std::vector<unsigned> SUs;
    408   unsigned MaxSGPRUsage;
    409   unsigned MaxVGPRUsage;
    410 };
    411 
    412 class SIScheduler {
    413   SIScheduleDAGMI *DAG;
    414   SIScheduleBlockCreator BlockCreator;
    415 
    416 public:
    417   SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
    418 
    419   ~SIScheduler() = default;
    420 
    421   struct SIScheduleBlockResult
    422   scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
    423                   SISchedulerBlockSchedulerVariant ScheduleVariant);
    424 };
    425 
    426 class SIScheduleDAGMI final : public ScheduleDAGMILive {
    427   const SIInstrInfo *SITII;
    428   const SIRegisterInfo *SITRI;
    429 
    430   std::vector<SUnit> SUnitsLinksBackup;
    431 
    432   // For moveLowLatencies. After all Scheduling variants are tested.
    433   std::vector<unsigned> ScheduledSUnits;
    434   std::vector<unsigned> ScheduledSUnitsInv;
    435 
    436 public:
    437   SIScheduleDAGMI(MachineSchedContext *C);
    438 
    439   ~SIScheduleDAGMI() override;
    440 
    441   // Entry point for the schedule.
    442   void schedule() override;
    443 
    444   // To init Block's RPTracker.
    445   void initRPTracker(RegPressureTracker &RPTracker) {
    446     RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
    447   }
    448 
    449   MachineBasicBlock *getBB() { return BB; }
    450   MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
    451   MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
    452   LiveIntervals *getLIS() { return LIS; }
    453   MachineRegisterInfo *getMRI() { return &MRI; }
    454   const TargetRegisterInfo *getTRI() { return TRI; }
    455   ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
    456   SUnit &getEntrySU() { return EntrySU; }
    457   SUnit& getExitSU() { return ExitSU; }
    458 
    459   void restoreSULinksLeft();
    460 
    461   template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
    462                                                      _Iterator End,
    463                                                      unsigned &VgprUsage,
    464                                                      unsigned &SgprUsage);
    465 
    466   std::set<unsigned> getInRegs() {
    467     std::set<unsigned> InRegs;
    468     for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    469       InRegs.insert(RegMaskPair.RegUnit);
    470     }
    471     return InRegs;
    472   }
    473 
    474   std::set<unsigned> getOutRegs() {
    475     std::set<unsigned> OutRegs;
    476     for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    477       OutRegs.insert(RegMaskPair.RegUnit);
    478     }
    479     return OutRegs;
    480   };
    481 
    482 private:
    483   void topologicalSort();
    484   // After scheduling is done, improve low latency placements.
    485   void moveLowLatencies();
    486 
    487 public:
    488   // Some stats for scheduling inside blocks.
    489   std::vector<unsigned> IsLowLatencySU;
    490   std::vector<unsigned> LowLatencyOffset;
    491   std::vector<unsigned> IsHighLatencySU;
    492   // Topological sort
    493   // Maps topological index to the node number.
    494   std::vector<int> TopDownIndex2SU;
    495   std::vector<int> BottomUpIndex2SU;
    496 };
    497 
    498 } // end namespace llvm
    499 
    500 #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
    501