Home | History | Annotate | Line # | Download | only in AMDGPU
      1 //===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 /// \file
     10 /// SI Machine Scheduler interface
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "SIMachineScheduler.h"
     15 #include "SIInstrInfo.h"
     16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
     17 #include "llvm/CodeGen/LiveIntervals.h"
     18 #include "llvm/CodeGen/MachineRegisterInfo.h"
     19 
     20 using namespace llvm;
     21 
     22 #define DEBUG_TYPE "machine-scheduler"
     23 
     24 // This scheduler implements a different scheduling algorithm than
     25 // GenericScheduler.
     26 //
     27 // There are several specific architecture behaviours that can't be modelled
     28 // for GenericScheduler:
     29 // . When accessing the result of an SGPR load instruction, you have to wait
     30 // for all the SGPR load instructions before your current instruction to
     31 // have finished.
     32 // . When accessing the result of an VGPR load instruction, you have to wait
     33 // for all the VGPR load instructions previous to the VGPR load instruction
     34 // you are interested in to finish.
     35 // . The less the register pressure, the best load latencies are hidden
     36 //
     37 // Moreover some specifities (like the fact a lot of instructions in the shader
     38 // have few dependencies) makes the generic scheduler have some unpredictable
     39 // behaviours. For example when register pressure becomes high, it can either
     40 // manage to prevent register pressure from going too high, or it can
     41 // increase register pressure even more than if it hadn't taken register
     42 // pressure into account.
     43 //
     44 // Also some other bad behaviours are generated, like loading at the beginning
     45 // of the shader a constant in VGPR you won't need until the end of the shader.
     46 //
     47 // The scheduling problem for SI can distinguish three main parts:
     48 // . Hiding high latencies (texture sampling, etc)
     49 // . Hiding low latencies (SGPR constant loading, etc)
     50 // . Keeping register usage low for better latency hiding and general
     51 //   performance
     52 //
     53 // Some other things can also affect performance, but are hard to predict
     54 // (cache usage, the fact the HW can issue several instructions from different
     55 // wavefronts if different types, etc)
     56 //
     57 // This scheduler tries to solve the scheduling problem by dividing it into
     58 // simpler sub-problems. It divides the instructions into blocks, schedules
     59 // locally inside the blocks where it takes care of low latencies, and then
     60 // chooses the order of the blocks by taking care of high latencies.
     61 // Dividing the instructions into blocks helps control keeping register
     62 // usage low.
     63 //
     64 // First the instructions are put into blocks.
     65 //   We want the blocks help control register usage and hide high latencies
     66 //   later. To help control register usage, we typically want all local
     67 //   computations, when for example you create a result that can be comsummed
     68 //   right away, to be contained in a block. Block inputs and outputs would
     69 //   typically be important results that are needed in several locations of
     70 //   the shader. Since we do want blocks to help hide high latencies, we want
     71 //   the instructions inside the block to have a minimal set of dependencies
     72 //   on high latencies. It will make it easy to pick blocks to hide specific
     73 //   high latencies.
     74 //   The block creation algorithm is divided into several steps, and several
     75 //   variants can be tried during the scheduling process.
     76 //
     77 // Second the order of the instructions inside the blocks is chosen.
     78 //   At that step we do take into account only register usage and hiding
     79 //   low latency instructions
     80 //
     81 // Third the block order is chosen, there we try to hide high latencies
     82 // and keep register usage low.
     83 //
     84 // After the third step, a pass is done to improve the hiding of low
     85 // latencies.
     86 //
     87 // Actually when talking about 'low latency' or 'high latency' it includes
     88 // both the latency to get the cache (or global mem) data go to the register,
     89 // and the bandwidth limitations.
     90 // Increasing the number of active wavefronts helps hide the former, but it
     91 // doesn't solve the latter, thus why even if wavefront count is high, we have
     92 // to try have as many instructions hiding high latencies as possible.
     93 // The OpenCL doc says for example latency of 400 cycles for a global mem access,
     94 // which is hidden by 10 instructions if the wavefront count is 10.
     95 
     96 // Some figures taken from AMD docs:
     97 // Both texture and constant L1 caches are 4-way associative with 64 bytes
     98 // lines.
     99 // Constant cache is shared with 4 CUs.
    100 // For texture sampling, the address generation unit receives 4 texture
    101 // addresses per cycle, thus we could expect texture sampling latency to be
    102 // equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
    103 // instructions in a wavefront group are executed every 4 cycles),
    104 // or 16 instructions if the other wavefronts associated to the 3 other VALUs
    105 // of the CU do texture sampling too. (Don't take these figures too seriously,
    106 // as I'm not 100% sure of the computation)
    107 // Data exports should get similar latency.
    108 // For constant loading, the cache is shader with 4 CUs.
    109 // The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
    110 // I guess if the other CU don't read the cache, it can go up to 64B/cycle.
    111 // It means a simple s_buffer_load should take one instruction to hide, as
    112 // well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
    113 // cache line.
    114 //
    115 // As of today the driver doesn't preload the constants in cache, thus the
    116 // first loads get extra latency. The doc says global memory access can be
    117 // 300-600 cycles. We do not specially take that into account when scheduling
    118 // As we expect the driver to be able to preload the constants soon.
    119 
    120 // common code //
    121 
    122 #ifndef NDEBUG
    123 
    124 static const char *getReasonStr(SIScheduleCandReason Reason) {
    125   switch (Reason) {
    126   case NoCand:         return "NOCAND";
    127   case RegUsage:       return "REGUSAGE";
    128   case Latency:        return "LATENCY";
    129   case Successor:      return "SUCCESSOR";
    130   case Depth:          return "DEPTH";
    131   case NodeOrder:      return "ORDER";
    132   }
    133   llvm_unreachable("Unknown reason!");
    134 }
    135 
    136 #endif
    137 
    138 namespace llvm {
    139 namespace SISched {
    140 static bool tryLess(int TryVal, int CandVal,
    141                     SISchedulerCandidate &TryCand,
    142                     SISchedulerCandidate &Cand,
    143                     SIScheduleCandReason Reason) {
    144   if (TryVal < CandVal) {
    145     TryCand.Reason = Reason;
    146     return true;
    147   }
    148   if (TryVal > CandVal) {
    149     if (Cand.Reason > Reason)
    150       Cand.Reason = Reason;
    151     return true;
    152   }
    153   Cand.setRepeat(Reason);
    154   return false;
    155 }
    156 
    157 static bool tryGreater(int TryVal, int CandVal,
    158                        SISchedulerCandidate &TryCand,
    159                        SISchedulerCandidate &Cand,
    160                        SIScheduleCandReason Reason) {
    161   if (TryVal > CandVal) {
    162     TryCand.Reason = Reason;
    163     return true;
    164   }
    165   if (TryVal < CandVal) {
    166     if (Cand.Reason > Reason)
    167       Cand.Reason = Reason;
    168     return true;
    169   }
    170   Cand.setRepeat(Reason);
    171   return false;
    172 }
    173 } // end namespace SISched
    174 } // end namespace llvm
    175 
    176 // SIScheduleBlock //
    177 
    178 void SIScheduleBlock::addUnit(SUnit *SU) {
    179   NodeNum2Index[SU->NodeNum] = SUnits.size();
    180   SUnits.push_back(SU);
    181 }
    182 
    183 #ifndef NDEBUG
    184 void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
    185 
    186   dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
    187   dbgs() << '\n';
    188 }
    189 #endif
    190 
    191 void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
    192                                           SISchedCandidate &TryCand) {
    193   // Initialize the candidate if needed.
    194   if (!Cand.isValid()) {
    195     TryCand.Reason = NodeOrder;
    196     return;
    197   }
    198 
    199   if (Cand.SGPRUsage > 60 &&
    200       SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
    201                        TryCand, Cand, RegUsage))
    202     return;
    203 
    204   // Schedule low latency instructions as top as possible.
    205   // Order of priority is:
    206   // . Low latency instructions which do not depend on other low latency
    207   //   instructions we haven't waited for
    208   // . Other instructions which do not depend on low latency instructions
    209   //   we haven't waited for
    210   // . Low latencies
    211   // . All other instructions
    212   // Goal is to get: low latency instructions - independent instructions
    213   //     - (eventually some more low latency instructions)
    214   //     - instructions that depend on the first low latency instructions.
    215   // If in the block there is a lot of constant loads, the SGPR usage
    216   // could go quite high, thus above the arbitrary limit of 60 will encourage
    217   // use the already loaded constants (in order to release some SGPRs) before
    218   // loading more.
    219   if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
    220                        Cand.HasLowLatencyNonWaitedParent,
    221                        TryCand, Cand, SIScheduleCandReason::Depth))
    222     return;
    223 
    224   if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
    225                           TryCand, Cand, SIScheduleCandReason::Depth))
    226     return;
    227 
    228   if (TryCand.IsLowLatency &&
    229       SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
    230                        TryCand, Cand, SIScheduleCandReason::Depth))
    231     return;
    232 
    233   if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
    234                        TryCand, Cand, RegUsage))
    235     return;
    236 
    237   // Fall through to original instruction order.
    238   if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    239     TryCand.Reason = NodeOrder;
    240   }
    241 }
    242 
    243 SUnit* SIScheduleBlock::pickNode() {
    244   SISchedCandidate TopCand;
    245 
    246   for (SUnit* SU : TopReadySUs) {
    247     SISchedCandidate TryCand;
    248     std::vector<unsigned> pressure;
    249     std::vector<unsigned> MaxPressure;
    250     // Predict register usage after this instruction.
    251     TryCand.SU = SU;
    252     TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    253     TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
    254     TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    255     TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    256     TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    257     TryCand.HasLowLatencyNonWaitedParent =
    258       HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    259     tryCandidateTopDown(TopCand, TryCand);
    260     if (TryCand.Reason != NoCand)
    261       TopCand.setBest(TryCand);
    262   }
    263 
    264   return TopCand.SU;
    265 }
    266 
    267 
    268 // Schedule something valid.
    269 void SIScheduleBlock::fastSchedule() {
    270   TopReadySUs.clear();
    271   if (Scheduled)
    272     undoSchedule();
    273 
    274   for (SUnit* SU : SUnits) {
    275     if (!SU->NumPredsLeft)
    276       TopReadySUs.push_back(SU);
    277   }
    278 
    279   while (!TopReadySUs.empty()) {
    280     SUnit *SU = TopReadySUs[0];
    281     ScheduledSUnits.push_back(SU);
    282     nodeScheduled(SU);
    283   }
    284 
    285   Scheduled = true;
    286 }
    287 
    288 // Returns if the register was set between first and last.
    289 static bool isDefBetween(unsigned Reg,
    290                            SlotIndex First, SlotIndex Last,
    291                            const MachineRegisterInfo *MRI,
    292                            const LiveIntervals *LIS) {
    293   for (MachineRegisterInfo::def_instr_iterator
    294        UI = MRI->def_instr_begin(Reg),
    295        UE = MRI->def_instr_end(); UI != UE; ++UI) {
    296     const MachineInstr* MI = &*UI;
    297     if (MI->isDebugValue())
    298       continue;
    299     SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    300     if (InstSlot >= First && InstSlot <= Last)
    301       return true;
    302   }
    303   return false;
    304 }
    305 
    306 void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
    307                                       MachineBasicBlock::iterator EndBlock) {
    308   IntervalPressure Pressure, BotPressure;
    309   RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
    310   LiveIntervals *LIS = DAG->getLIS();
    311   MachineRegisterInfo *MRI = DAG->getMRI();
    312   DAG->initRPTracker(TopRPTracker);
    313   DAG->initRPTracker(BotRPTracker);
    314   DAG->initRPTracker(RPTracker);
    315 
    316   // Goes though all SU. RPTracker captures what had to be alive for the SUs
    317   // to execute, and what is still alive at the end.
    318   for (SUnit* SU : ScheduledSUnits) {
    319     RPTracker.setPos(SU->getInstr());
    320     RPTracker.advance();
    321   }
    322 
    323   // Close the RPTracker to finalize live ins/outs.
    324   RPTracker.closeRegion();
    325 
    326   // Initialize the live ins and live outs.
    327   TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
    328   BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
    329 
    330   // Do not Track Physical Registers, because it messes up.
    331   for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    332     if (Register::isVirtualRegister(RegMaskPair.RegUnit))
    333       LiveInRegs.insert(RegMaskPair.RegUnit);
    334   }
    335   LiveOutRegs.clear();
    336   // There is several possibilities to distinguish:
    337   // 1) Reg is not input to any instruction in the block, but is output of one
    338   // 2) 1) + read in the block and not needed after it
    339   // 3) 1) + read in the block but needed in another block
    340   // 4) Reg is input of an instruction but another block will read it too
    341   // 5) Reg is input of an instruction and then rewritten in the block.
    342   //    result is not read in the block (implies used in another block)
    343   // 6) Reg is input of an instruction and then rewritten in the block.
    344   //    result is read in the block and not needed in another block
    345   // 7) Reg is input of an instruction and then rewritten in the block.
    346   //    result is read in the block but also needed in another block
    347   // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
    348   // We want LiveOutRegs to contain only Regs whose content will be read after
    349   // in another block, and whose content was written in the current block,
    350   // that is we want it to get 1, 3, 5, 7
    351   // Since we made the MIs of a block to be packed all together before
    352   // scheduling, then the LiveIntervals were correct, and the RPTracker was
    353   // able to correctly handle 5 vs 6, 2 vs 3.
    354   // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
    355   // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
    356   // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
    357   // The use of findDefBetween removes the case 4.
    358   for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    359     Register Reg = RegMaskPair.RegUnit;
    360     if (Reg.isVirtual() &&
    361         isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
    362                      LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
    363                      LIS)) {
    364       LiveOutRegs.insert(Reg);
    365     }
    366   }
    367 
    368   // Pressure = sum_alive_registers register size
    369   // Internally llvm will represent some registers as big 128 bits registers
    370   // for example, but they actually correspond to 4 actual 32 bits registers.
    371   // Thus Pressure is not equal to num_alive_registers * constant.
    372   LiveInPressure = TopPressure.MaxSetPressure;
    373   LiveOutPressure = BotPressure.MaxSetPressure;
    374 
    375   // Prepares TopRPTracker for top down scheduling.
    376   TopRPTracker.closeTop();
    377 }
    378 
    379 void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
    380                                MachineBasicBlock::iterator EndBlock) {
    381   if (!Scheduled)
    382     fastSchedule();
    383 
    384   // PreScheduling phase to set LiveIn and LiveOut.
    385   initRegPressure(BeginBlock, EndBlock);
    386   undoSchedule();
    387 
    388   // Schedule for real now.
    389 
    390   TopReadySUs.clear();
    391 
    392   for (SUnit* SU : SUnits) {
    393     if (!SU->NumPredsLeft)
    394       TopReadySUs.push_back(SU);
    395   }
    396 
    397   while (!TopReadySUs.empty()) {
    398     SUnit *SU = pickNode();
    399     ScheduledSUnits.push_back(SU);
    400     TopRPTracker.setPos(SU->getInstr());
    401     TopRPTracker.advance();
    402     nodeScheduled(SU);
    403   }
    404 
    405   // TODO: compute InternalAdditionnalPressure.
    406   InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
    407 
    408   // Check everything is right.
    409 #ifndef NDEBUG
    410   assert(SUnits.size() == ScheduledSUnits.size() &&
    411             TopReadySUs.empty());
    412   for (SUnit* SU : SUnits) {
    413     assert(SU->isScheduled &&
    414               SU->NumPredsLeft == 0);
    415   }
    416 #endif
    417 
    418   Scheduled = true;
    419 }
    420 
    421 void SIScheduleBlock::undoSchedule() {
    422   for (SUnit* SU : SUnits) {
    423     SU->isScheduled = false;
    424     for (SDep& Succ : SU->Succs) {
    425       if (BC->isSUInBlock(Succ.getSUnit(), ID))
    426         undoReleaseSucc(SU, &Succ);
    427     }
    428   }
    429   HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
    430   ScheduledSUnits.clear();
    431   Scheduled = false;
    432 }
    433 
    434 void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
    435   SUnit *SuccSU = SuccEdge->getSUnit();
    436 
    437   if (SuccEdge->isWeak()) {
    438     ++SuccSU->WeakPredsLeft;
    439     return;
    440   }
    441   ++SuccSU->NumPredsLeft;
    442 }
    443 
    444 void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
    445   SUnit *SuccSU = SuccEdge->getSUnit();
    446 
    447   if (SuccEdge->isWeak()) {
    448     --SuccSU->WeakPredsLeft;
    449     return;
    450   }
    451 #ifndef NDEBUG
    452   if (SuccSU->NumPredsLeft == 0) {
    453     dbgs() << "*** Scheduling failed! ***\n";
    454     DAG->dumpNode(*SuccSU);
    455     dbgs() << " has been released too many times!\n";
    456     llvm_unreachable(nullptr);
    457   }
    458 #endif
    459 
    460   --SuccSU->NumPredsLeft;
    461 }
    462 
    463 /// Release Successors of the SU that are in the block or not.
    464 void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
    465   for (SDep& Succ : SU->Succs) {
    466     SUnit *SuccSU = Succ.getSUnit();
    467 
    468     if (SuccSU->NodeNum >= DAG->SUnits.size())
    469         continue;
    470 
    471     if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
    472       continue;
    473 
    474     releaseSucc(SU, &Succ);
    475     if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
    476       TopReadySUs.push_back(SuccSU);
    477   }
    478 }
    479 
    480 void SIScheduleBlock::nodeScheduled(SUnit *SU) {
    481   // Is in TopReadySUs
    482   assert (!SU->NumPredsLeft);
    483   std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
    484   if (I == TopReadySUs.end()) {
    485     dbgs() << "Data Structure Bug in SI Scheduler\n";
    486     llvm_unreachable(nullptr);
    487   }
    488   TopReadySUs.erase(I);
    489 
    490   releaseSuccessors(SU, true);
    491   // Scheduling this node will trigger a wait,
    492   // thus propagate to other instructions that they do not need to wait either.
    493   if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    494     HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
    495 
    496   if (DAG->IsLowLatencySU[SU->NodeNum]) {
    497      for (SDep& Succ : SU->Succs) {
    498       std::map<unsigned, unsigned>::iterator I =
    499         NodeNum2Index.find(Succ.getSUnit()->NodeNum);
    500       if (I != NodeNum2Index.end())
    501         HasLowLatencyNonWaitedParent[I->second] = 1;
    502     }
    503   }
    504   SU->isScheduled = true;
    505 }
    506 
    507 void SIScheduleBlock::finalizeUnits() {
    508   // We remove links from outside blocks to enable scheduling inside the block.
    509   for (SUnit* SU : SUnits) {
    510     releaseSuccessors(SU, false);
    511     if (DAG->IsHighLatencySU[SU->NodeNum])
    512       HighLatencyBlock = true;
    513   }
    514   HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
    515 }
    516 
    517 // we maintain ascending order of IDs
    518 void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
    519   unsigned PredID = Pred->getID();
    520 
    521   // Check if not already predecessor.
    522   for (SIScheduleBlock* P : Preds) {
    523     if (PredID == P->getID())
    524       return;
    525   }
    526   Preds.push_back(Pred);
    527 
    528   assert(none_of(Succs,
    529                  [=](std::pair<SIScheduleBlock*,
    530                      SIScheduleBlockLinkKind> S) {
    531                    return PredID == S.first->getID();
    532                     }) &&
    533          "Loop in the Block Graph!");
    534 }
    535 
    536 void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
    537                               SIScheduleBlockLinkKind Kind) {
    538   unsigned SuccID = Succ->getID();
    539 
    540   // Check if not already predecessor.
    541   for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    542     if (SuccID == S.first->getID()) {
    543       if (S.second == SIScheduleBlockLinkKind::NoData &&
    544           Kind == SIScheduleBlockLinkKind::Data)
    545         S.second = Kind;
    546       return;
    547     }
    548   }
    549   if (Succ->isHighLatencyBlock())
    550     ++NumHighLatencySuccessors;
    551   Succs.push_back(std::make_pair(Succ, Kind));
    552 
    553   assert(none_of(Preds,
    554                  [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
    555          "Loop in the Block Graph!");
    556 }
    557 
    558 #ifndef NDEBUG
    559 void SIScheduleBlock::printDebug(bool full) {
    560   dbgs() << "Block (" << ID << ")\n";
    561   if (!full)
    562     return;
    563 
    564   dbgs() << "\nContains High Latency Instruction: "
    565          << HighLatencyBlock << '\n';
    566   dbgs() << "\nDepends On:\n";
    567   for (SIScheduleBlock* P : Preds) {
    568     P->printDebug(false);
    569   }
    570 
    571   dbgs() << "\nSuccessors:\n";
    572   for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    573     if (S.second == SIScheduleBlockLinkKind::Data)
    574       dbgs() << "(Data Dep) ";
    575     S.first->printDebug(false);
    576   }
    577 
    578   if (Scheduled) {
    579     dbgs() << "LiveInPressure "
    580            << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
    581            << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
    582     dbgs() << "LiveOutPressure "
    583            << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
    584            << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
    585     dbgs() << "LiveIns:\n";
    586     for (unsigned Reg : LiveInRegs)
    587       dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    588 
    589     dbgs() << "\nLiveOuts:\n";
    590     for (unsigned Reg : LiveOutRegs)
    591       dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    592   }
    593 
    594   dbgs() << "\nInstructions:\n";
    595   for (const SUnit* SU : SUnits)
    596       DAG->dumpNode(*SU);
    597 
    598   dbgs() << "///////////////////////\n";
    599 }
    600 #endif
    601 
    602 // SIScheduleBlockCreator //
    603 
    604 SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    605     : DAG(DAG) {}
    606 
    607 SIScheduleBlocks
    608 SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
    609   std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    610     Blocks.find(BlockVariant);
    611   if (B == Blocks.end()) {
    612     SIScheduleBlocks Res;
    613     createBlocksForVariant(BlockVariant);
    614     topologicalSort();
    615     scheduleInsideBlocks();
    616     fillStats();
    617     Res.Blocks = CurrentBlocks;
    618     Res.TopDownIndex2Block = TopDownIndex2Block;
    619     Res.TopDownBlock2Index = TopDownBlock2Index;
    620     Blocks[BlockVariant] = Res;
    621     return Res;
    622   } else {
    623     return B->second;
    624   }
    625 }
    626 
    627 bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
    628   if (SU->NodeNum >= DAG->SUnits.size())
    629     return false;
    630   return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
    631 }
    632 
    633 void SIScheduleBlockCreator::colorHighLatenciesAlone() {
    634   unsigned DAGSize = DAG->SUnits.size();
    635 
    636   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    637     SUnit *SU = &DAG->SUnits[i];
    638     if (DAG->IsHighLatencySU[SU->NodeNum]) {
    639       CurrentColoring[SU->NodeNum] = NextReservedID++;
    640     }
    641   }
    642 }
    643 
    644 static bool
    645 hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
    646   for (const auto &PredDep : SU.Preds) {
    647     if (PredDep.getSUnit() == &FromSU &&
    648         PredDep.getKind() == llvm::SDep::Data)
    649       return true;
    650   }
    651   return false;
    652 }
    653 
    654 void SIScheduleBlockCreator::colorHighLatenciesGroups() {
    655   unsigned DAGSize = DAG->SUnits.size();
    656   unsigned NumHighLatencies = 0;
    657   unsigned GroupSize;
    658   int Color = NextReservedID;
    659   unsigned Count = 0;
    660   std::set<unsigned> FormingGroup;
    661 
    662   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    663     SUnit *SU = &DAG->SUnits[i];
    664     if (DAG->IsHighLatencySU[SU->NodeNum])
    665       ++NumHighLatencies;
    666   }
    667 
    668   if (NumHighLatencies == 0)
    669     return;
    670 
    671   if (NumHighLatencies <= 6)
    672     GroupSize = 2;
    673   else if (NumHighLatencies <= 12)
    674     GroupSize = 3;
    675   else
    676     GroupSize = 4;
    677 
    678   for (unsigned SUNum : DAG->TopDownIndex2SU) {
    679     const SUnit &SU = DAG->SUnits[SUNum];
    680     if (DAG->IsHighLatencySU[SU.NodeNum]) {
    681       unsigned CompatibleGroup = true;
    682       int ProposedColor = Color;
    683       std::vector<int> AdditionalElements;
    684 
    685       // We don't want to put in the same block
    686       // two high latency instructions that depend
    687       // on each other.
    688       // One way would be to check canAddEdge
    689       // in both directions, but that currently is not
    690       // enough because there the high latency order is
    691       // enforced (via links).
    692       // Instead, look at the dependencies between the
    693       // high latency instructions and deduce if it is
    694       // a data dependency or not.
    695       for (unsigned j : FormingGroup) {
    696         bool HasSubGraph;
    697         std::vector<int> SubGraph;
    698         // By construction (topological order), if SU and
    699         // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
    700         // in the parent graph of SU.
    701 #ifndef NDEBUG
    702         SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
    703                                                HasSubGraph);
    704         assert(!HasSubGraph);
    705 #endif
    706         SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
    707                                                HasSubGraph);
    708         if (!HasSubGraph)
    709           continue; // No dependencies between each other
    710         else if (SubGraph.size() > 5) {
    711           // Too many elements would be required to be added to the block.
    712           CompatibleGroup = false;
    713           break;
    714         }
    715         else {
    716           // Check the type of dependency
    717           for (unsigned k : SubGraph) {
    718             // If in the path to join the two instructions,
    719             // there is another high latency instruction,
    720             // or instructions colored for another block
    721             // abort the merge.
    722             if (DAG->IsHighLatencySU[k] ||
    723                 (CurrentColoring[k] != ProposedColor &&
    724                  CurrentColoring[k] != 0)) {
    725               CompatibleGroup = false;
    726               break;
    727             }
    728             // If one of the SU in the subgraph depends on the result of SU j,
    729             // there'll be a data dependency.
    730             if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
    731               CompatibleGroup = false;
    732               break;
    733             }
    734           }
    735           if (!CompatibleGroup)
    736             break;
    737           // Same check for the SU
    738           if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
    739             CompatibleGroup = false;
    740             break;
    741           }
    742           // Add all the required instructions to the block
    743           // These cannot live in another block (because they
    744           // depend (order dependency) on one of the
    745           // instruction in the block, and are required for the
    746           // high latency instruction we add.
    747           llvm::append_range(AdditionalElements, SubGraph);
    748         }
    749       }
    750       if (CompatibleGroup) {
    751         FormingGroup.insert(SU.NodeNum);
    752         for (unsigned j : AdditionalElements)
    753           CurrentColoring[j] = ProposedColor;
    754         CurrentColoring[SU.NodeNum] = ProposedColor;
    755         ++Count;
    756       }
    757       // Found one incompatible instruction,
    758       // or has filled a big enough group.
    759       // -> start a new one.
    760       if (!CompatibleGroup) {
    761         FormingGroup.clear();
    762         Color = ++NextReservedID;
    763         ProposedColor = Color;
    764         FormingGroup.insert(SU.NodeNum);
    765         CurrentColoring[SU.NodeNum] = ProposedColor;
    766         Count = 0;
    767       } else if (Count == GroupSize) {
    768         FormingGroup.clear();
    769         Color = ++NextReservedID;
    770         ProposedColor = Color;
    771         Count = 0;
    772       }
    773     }
    774   }
    775 }
    776 
    777 void SIScheduleBlockCreator::colorComputeReservedDependencies() {
    778   unsigned DAGSize = DAG->SUnits.size();
    779   std::map<std::set<unsigned>, unsigned> ColorCombinations;
    780 
    781   CurrentTopDownReservedDependencyColoring.clear();
    782   CurrentBottomUpReservedDependencyColoring.clear();
    783 
    784   CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
    785   CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
    786 
    787   // Traverse TopDown, and give different colors to SUs depending
    788   // on which combination of High Latencies they depend on.
    789 
    790   for (unsigned SUNum : DAG->TopDownIndex2SU) {
    791     SUnit *SU = &DAG->SUnits[SUNum];
    792     std::set<unsigned> SUColors;
    793 
    794     // Already given.
    795     if (CurrentColoring[SU->NodeNum]) {
    796       CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
    797         CurrentColoring[SU->NodeNum];
    798       continue;
    799     }
    800 
    801    for (SDep& PredDep : SU->Preds) {
    802       SUnit *Pred = PredDep.getSUnit();
    803       if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
    804         continue;
    805       if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
    806         SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    807     }
    808     // Color 0 by default.
    809     if (SUColors.empty())
    810       continue;
    811     // Same color than parents.
    812     if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
    813       CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
    814         *SUColors.begin();
    815     else {
    816       std::map<std::set<unsigned>, unsigned>::iterator Pos =
    817         ColorCombinations.find(SUColors);
    818       if (Pos != ColorCombinations.end()) {
    819           CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
    820       } else {
    821         CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
    822           NextNonReservedID;
    823         ColorCombinations[SUColors] = NextNonReservedID++;
    824       }
    825     }
    826   }
    827 
    828   ColorCombinations.clear();
    829 
    830   // Same as before, but BottomUp.
    831 
    832   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    833     SUnit *SU = &DAG->SUnits[SUNum];
    834     std::set<unsigned> SUColors;
    835 
    836     // Already given.
    837     if (CurrentColoring[SU->NodeNum]) {
    838       CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
    839         CurrentColoring[SU->NodeNum];
    840       continue;
    841     }
    842 
    843     for (SDep& SuccDep : SU->Succs) {
    844       SUnit *Succ = SuccDep.getSUnit();
    845       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    846         continue;
    847       if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
    848         SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    849     }
    850     // Keep color 0.
    851     if (SUColors.empty())
    852       continue;
    853     // Same color than parents.
    854     if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
    855       CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
    856         *SUColors.begin();
    857     else {
    858       std::map<std::set<unsigned>, unsigned>::iterator Pos =
    859         ColorCombinations.find(SUColors);
    860       if (Pos != ColorCombinations.end()) {
    861         CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
    862       } else {
    863         CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
    864           NextNonReservedID;
    865         ColorCombinations[SUColors] = NextNonReservedID++;
    866       }
    867     }
    868   }
    869 }
    870 
    871 void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
    872   unsigned DAGSize = DAG->SUnits.size();
    873   std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
    874 
    875   // Every combination of colors given by the top down
    876   // and bottom up Reserved node dependency
    877 
    878   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    879     SUnit *SU = &DAG->SUnits[i];
    880     std::pair<unsigned, unsigned> SUColors;
    881 
    882     // High latency instructions: already given.
    883     if (CurrentColoring[SU->NodeNum])
    884       continue;
    885 
    886     SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    887     SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
    888 
    889     std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
    890       ColorCombinations.find(SUColors);
    891     if (Pos != ColorCombinations.end()) {
    892       CurrentColoring[SU->NodeNum] = Pos->second;
    893     } else {
    894       CurrentColoring[SU->NodeNum] = NextNonReservedID;
    895       ColorCombinations[SUColors] = NextNonReservedID++;
    896     }
    897   }
    898 }
    899 
    900 void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
    901   unsigned DAGSize = DAG->SUnits.size();
    902   std::vector<int> PendingColoring = CurrentColoring;
    903 
    904   assert(DAGSize >= 1 &&
    905          CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
    906          CurrentTopDownReservedDependencyColoring.size() == DAGSize);
    907   // If there is no reserved block at all, do nothing. We don't want
    908   // everything in one block.
    909   if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
    910                         CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
    911       *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
    912                         CurrentTopDownReservedDependencyColoring.end()) == 0)
    913     return;
    914 
    915   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    916     SUnit *SU = &DAG->SUnits[SUNum];
    917     std::set<unsigned> SUColors;
    918     std::set<unsigned> SUColorsPending;
    919 
    920     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    921       continue;
    922 
    923     if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
    924         CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
    925       continue;
    926 
    927     for (SDep& SuccDep : SU->Succs) {
    928       SUnit *Succ = SuccDep.getSUnit();
    929       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    930         continue;
    931       if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
    932           CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
    933         SUColors.insert(CurrentColoring[Succ->NodeNum]);
    934       SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    935     }
    936     // If there is only one child/parent block, and that block
    937     // is not among the ones we are removing in this path, then
    938     // merge the instruction to that block
    939     if (SUColors.size() == 1 && SUColorsPending.size() == 1)
    940       PendingColoring[SU->NodeNum] = *SUColors.begin();
    941     else // TODO: Attribute new colors depending on color
    942          // combination of children.
    943       PendingColoring[SU->NodeNum] = NextNonReservedID++;
    944   }
    945   CurrentColoring = PendingColoring;
    946 }
    947 
    948 
    949 void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
    950   unsigned DAGSize = DAG->SUnits.size();
    951   unsigned PreviousColor;
    952   std::set<unsigned> SeenColors;
    953 
    954   if (DAGSize <= 1)
    955     return;
    956 
    957   PreviousColor = CurrentColoring[0];
    958 
    959   for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    960     SUnit *SU = &DAG->SUnits[i];
    961     unsigned CurrentColor = CurrentColoring[i];
    962     unsigned PreviousColorSave = PreviousColor;
    963     assert(i == SU->NodeNum);
    964 
    965     if (CurrentColor != PreviousColor)
    966       SeenColors.insert(PreviousColor);
    967     PreviousColor = CurrentColor;
    968 
    969     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    970       continue;
    971 
    972     if (SeenColors.find(CurrentColor) == SeenColors.end())
    973       continue;
    974 
    975     if (PreviousColorSave != CurrentColor)
    976       CurrentColoring[i] = NextNonReservedID++;
    977     else
    978       CurrentColoring[i] = CurrentColoring[i-1];
    979   }
    980 }
    981 
    982 void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
    983   unsigned DAGSize = DAG->SUnits.size();
    984 
    985   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    986     SUnit *SU = &DAG->SUnits[SUNum];
    987     std::set<unsigned> SUColors;
    988 
    989     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    990       continue;
    991 
    992     // No predecessor: Vgpr constant loading.
    993     // Low latency instructions usually have a predecessor (the address)
    994     if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
    995       continue;
    996 
    997     for (SDep& SuccDep : SU->Succs) {
    998       SUnit *Succ = SuccDep.getSUnit();
    999       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
   1000         continue;
   1001       SUColors.insert(CurrentColoring[Succ->NodeNum]);
   1002     }
   1003     if (SUColors.size() == 1)
   1004       CurrentColoring[SU->NodeNum] = *SUColors.begin();
   1005   }
   1006 }
   1007 
   1008 void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
   1009   unsigned DAGSize = DAG->SUnits.size();
   1010 
   1011   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
   1012     SUnit *SU = &DAG->SUnits[SUNum];
   1013     std::set<unsigned> SUColors;
   1014 
   1015     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
   1016       continue;
   1017 
   1018     for (SDep& SuccDep : SU->Succs) {
   1019        SUnit *Succ = SuccDep.getSUnit();
   1020       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
   1021         continue;
   1022       SUColors.insert(CurrentColoring[Succ->NodeNum]);
   1023     }
   1024     if (SUColors.size() == 1)
   1025       CurrentColoring[SU->NodeNum] = *SUColors.begin();
   1026   }
   1027 }
   1028 
   1029 void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
   1030   unsigned DAGSize = DAG->SUnits.size();
   1031 
   1032   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
   1033     SUnit *SU = &DAG->SUnits[SUNum];
   1034     std::set<unsigned> SUColors;
   1035 
   1036     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
   1037       continue;
   1038 
   1039     for (SDep& SuccDep : SU->Succs) {
   1040        SUnit *Succ = SuccDep.getSUnit();
   1041       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
   1042         continue;
   1043       SUColors.insert(CurrentColoring[Succ->NodeNum]);
   1044     }
   1045     if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
   1046       CurrentColoring[SU->NodeNum] = *SUColors.begin();
   1047   }
   1048 }
   1049 
   1050 void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
   1051   unsigned DAGSize = DAG->SUnits.size();
   1052   std::map<unsigned, unsigned> ColorCount;
   1053 
   1054   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
   1055     SUnit *SU = &DAG->SUnits[SUNum];
   1056     unsigned color = CurrentColoring[SU->NodeNum];
   1057      ++ColorCount[color];
   1058   }
   1059 
   1060   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
   1061     SUnit *SU = &DAG->SUnits[SUNum];
   1062     unsigned color = CurrentColoring[SU->NodeNum];
   1063     std::set<unsigned> SUColors;
   1064 
   1065     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
   1066       continue;
   1067 
   1068     if (ColorCount[color] > 1)
   1069       continue;
   1070 
   1071     for (SDep& SuccDep : SU->Succs) {
   1072        SUnit *Succ = SuccDep.getSUnit();
   1073       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
   1074         continue;
   1075       SUColors.insert(CurrentColoring[Succ->NodeNum]);
   1076     }
   1077     if (SUColors.size() == 1 && *SUColors.begin() != color) {
   1078       --ColorCount[color];
   1079       CurrentColoring[SU->NodeNum] = *SUColors.begin();
   1080       ++ColorCount[*SUColors.begin()];
   1081     }
   1082   }
   1083 }
   1084 
   1085 void SIScheduleBlockCreator::cutHugeBlocks() {
   1086   // TODO
   1087 }
   1088 
   1089 void SIScheduleBlockCreator::regroupNoUserInstructions() {
   1090   unsigned DAGSize = DAG->SUnits.size();
   1091   int GroupID = NextNonReservedID++;
   1092 
   1093   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
   1094     SUnit *SU = &DAG->SUnits[SUNum];
   1095     bool hasSuccessor = false;
   1096 
   1097     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
   1098       continue;
   1099 
   1100     for (SDep& SuccDep : SU->Succs) {
   1101        SUnit *Succ = SuccDep.getSUnit();
   1102       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
   1103         continue;
   1104       hasSuccessor = true;
   1105     }
   1106     if (!hasSuccessor)
   1107       CurrentColoring[SU->NodeNum] = GroupID;
   1108   }
   1109 }
   1110 
   1111 void SIScheduleBlockCreator::colorExports() {
   1112   unsigned ExportColor = NextNonReservedID++;
   1113   SmallVector<unsigned, 8> ExpGroup;
   1114 
   1115   // Put all exports together in a block.
   1116   // The block will naturally end up being scheduled last,
   1117   // thus putting exports at the end of the schedule, which
   1118   // is better for performance.
   1119   // However we must ensure, for safety, the exports can be put
   1120   // together in the same block without any other instruction.
   1121   // This could happen, for example, when scheduling after regalloc
   1122   // if reloading a spilled register from memory using the same
   1123   // register than used in a previous export.
   1124   // If that happens, do not regroup the exports.
   1125   for (unsigned SUNum : DAG->TopDownIndex2SU) {
   1126     const SUnit &SU = DAG->SUnits[SUNum];
   1127     if (SIInstrInfo::isEXP(*SU.getInstr())) {
   1128       // Check the EXP can be added to the group safely,
   1129       // ie without needing any other instruction.
   1130       // The EXP is allowed to depend on other EXP
   1131       // (they will be in the same group).
   1132       for (unsigned j : ExpGroup) {
   1133         bool HasSubGraph;
   1134         std::vector<int> SubGraph;
   1135         // By construction (topological order), if SU and
   1136         // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
   1137         // in the parent graph of SU.
   1138 #ifndef NDEBUG
   1139         SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
   1140                                                HasSubGraph);
   1141         assert(!HasSubGraph);
   1142 #endif
   1143         SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
   1144                                                HasSubGraph);
   1145         if (!HasSubGraph)
   1146           continue; // No dependencies between each other
   1147 
   1148         // SubGraph contains all the instructions required
   1149         // between EXP SUnits[j] and EXP SU.
   1150         for (unsigned k : SubGraph) {
   1151           if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
   1152             // Other instructions than EXP would be required in the group.
   1153             // Abort the groupping.
   1154             return;
   1155         }
   1156       }
   1157 
   1158       ExpGroup.push_back(SUNum);
   1159     }
   1160   }
   1161 
   1162   // The group can be formed. Give the color.
   1163   for (unsigned j : ExpGroup)
   1164     CurrentColoring[j] = ExportColor;
   1165 }
   1166 
   1167 void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
   1168   unsigned DAGSize = DAG->SUnits.size();
   1169   std::map<unsigned,unsigned> RealID;
   1170 
   1171   CurrentBlocks.clear();
   1172   CurrentColoring.clear();
   1173   CurrentColoring.resize(DAGSize, 0);
   1174   Node2CurrentBlock.clear();
   1175 
   1176   // Restore links previous scheduling variant has overridden.
   1177   DAG->restoreSULinksLeft();
   1178 
   1179   NextReservedID = 1;
   1180   NextNonReservedID = DAGSize + 1;
   1181 
   1182   LLVM_DEBUG(dbgs() << "Coloring the graph\n");
   1183 
   1184   if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
   1185     colorHighLatenciesGroups();
   1186   else
   1187     colorHighLatenciesAlone();
   1188   colorComputeReservedDependencies();
   1189   colorAccordingToReservedDependencies();
   1190   colorEndsAccordingToDependencies();
   1191   if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
   1192     colorForceConsecutiveOrderInGroup();
   1193   regroupNoUserInstructions();
   1194   colorMergeConstantLoadsNextGroup();
   1195   colorMergeIfPossibleNextGroupOnlyForReserved();
   1196   colorExports();
   1197 
   1198   // Put SUs of same color into same block
   1199   Node2CurrentBlock.resize(DAGSize, -1);
   1200   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
   1201     SUnit *SU = &DAG->SUnits[i];
   1202     unsigned Color = CurrentColoring[SU->NodeNum];
   1203     if (RealID.find(Color) == RealID.end()) {
   1204       int ID = CurrentBlocks.size();
   1205       BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
   1206       CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
   1207       RealID[Color] = ID;
   1208     }
   1209     CurrentBlocks[RealID[Color]]->addUnit(SU);
   1210     Node2CurrentBlock[SU->NodeNum] = RealID[Color];
   1211   }
   1212 
   1213   // Build dependencies between blocks.
   1214   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
   1215     SUnit *SU = &DAG->SUnits[i];
   1216     int SUID = Node2CurrentBlock[i];
   1217      for (SDep& SuccDep : SU->Succs) {
   1218        SUnit *Succ = SuccDep.getSUnit();
   1219       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
   1220         continue;
   1221       if (Node2CurrentBlock[Succ->NodeNum] != SUID)
   1222         CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
   1223                                      SuccDep.isCtrl() ? NoData : Data);
   1224     }
   1225     for (SDep& PredDep : SU->Preds) {
   1226       SUnit *Pred = PredDep.getSUnit();
   1227       if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
   1228         continue;
   1229       if (Node2CurrentBlock[Pred->NodeNum] != SUID)
   1230         CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
   1231     }
   1232   }
   1233 
   1234   // Free root and leafs of all blocks to enable scheduling inside them.
   1235   for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
   1236     SIScheduleBlock *Block = CurrentBlocks[i];
   1237     Block->finalizeUnits();
   1238   }
   1239   LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
   1240              for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
   1241                SIScheduleBlock *Block = CurrentBlocks[i];
   1242                Block->printDebug(true);
   1243              });
   1244 }
   1245 
   1246 // Two functions taken from Codegen/MachineScheduler.cpp
   1247 
   1248 /// Non-const version.
   1249 static MachineBasicBlock::iterator
   1250 nextIfDebug(MachineBasicBlock::iterator I,
   1251             MachineBasicBlock::const_iterator End) {
   1252   for (; I != End; ++I) {
   1253     if (!I->isDebugInstr())
   1254       break;
   1255   }
   1256   return I;
   1257 }
   1258 
   1259 void SIScheduleBlockCreator::topologicalSort() {
   1260   unsigned DAGSize = CurrentBlocks.size();
   1261   std::vector<int> WorkList;
   1262 
   1263   LLVM_DEBUG(dbgs() << "Topological Sort\n");
   1264 
   1265   WorkList.reserve(DAGSize);
   1266   TopDownIndex2Block.resize(DAGSize);
   1267   TopDownBlock2Index.resize(DAGSize);
   1268   BottomUpIndex2Block.resize(DAGSize);
   1269 
   1270   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
   1271     SIScheduleBlock *Block = CurrentBlocks[i];
   1272     unsigned Degree = Block->getSuccs().size();
   1273     TopDownBlock2Index[i] = Degree;
   1274     if (Degree == 0) {
   1275       WorkList.push_back(i);
   1276     }
   1277   }
   1278 
   1279   int Id = DAGSize;
   1280   while (!WorkList.empty()) {
   1281     int i = WorkList.back();
   1282     SIScheduleBlock *Block = CurrentBlocks[i];
   1283     WorkList.pop_back();
   1284     TopDownBlock2Index[i] = --Id;
   1285     TopDownIndex2Block[Id] = i;
   1286     for (SIScheduleBlock* Pred : Block->getPreds()) {
   1287       if (!--TopDownBlock2Index[Pred->getID()])
   1288         WorkList.push_back(Pred->getID());
   1289     }
   1290   }
   1291 
   1292 #ifndef NDEBUG
   1293   // Check correctness of the ordering.
   1294   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
   1295     SIScheduleBlock *Block = CurrentBlocks[i];
   1296     for (SIScheduleBlock* Pred : Block->getPreds()) {
   1297       assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
   1298       "Wrong Top Down topological sorting");
   1299     }
   1300   }
   1301 #endif
   1302 
   1303   BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
   1304                                          TopDownIndex2Block.rend());
   1305 }
   1306 
   1307 void SIScheduleBlockCreator::scheduleInsideBlocks() {
   1308   unsigned DAGSize = CurrentBlocks.size();
   1309 
   1310   LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");
   1311 
   1312   // We do schedule a valid scheduling such that a Block corresponds
   1313   // to a range of instructions.
   1314   LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
   1315   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
   1316     SIScheduleBlock *Block = CurrentBlocks[i];
   1317     Block->fastSchedule();
   1318   }
   1319 
   1320   // Note: the following code, and the part restoring previous position
   1321   // is by far the most expensive operation of the Scheduler.
   1322 
   1323   // Do not update CurrentTop.
   1324   MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
   1325   std::vector<MachineBasicBlock::iterator> PosOld;
   1326   std::vector<MachineBasicBlock::iterator> PosNew;
   1327   PosOld.reserve(DAG->SUnits.size());
   1328   PosNew.reserve(DAG->SUnits.size());
   1329 
   1330   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
   1331     int BlockIndice = TopDownIndex2Block[i];
   1332     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
   1333     std::vector<SUnit*> SUs = Block->getScheduledUnits();
   1334 
   1335     for (SUnit* SU : SUs) {
   1336       MachineInstr *MI = SU->getInstr();
   1337       MachineBasicBlock::iterator Pos = MI;
   1338       PosOld.push_back(Pos);
   1339       if (&*CurrentTopFastSched == MI) {
   1340         PosNew.push_back(Pos);
   1341         CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
   1342                                           DAG->getCurrentBottom());
   1343       } else {
   1344         // Update the instruction stream.
   1345         DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
   1346 
   1347         // Update LiveIntervals.
   1348         // Note: Moving all instructions and calling handleMove every time
   1349         // is the most cpu intensive operation of the scheduler.
   1350         // It would gain a lot if there was a way to recompute the
   1351         // LiveIntervals for the entire scheduling region.
   1352         DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
   1353         PosNew.push_back(CurrentTopFastSched);
   1354       }
   1355     }
   1356   }
   1357 
   1358   // Now we have Block of SUs == Block of MI.
   1359   // We do the final schedule for the instructions inside the block.
   1360   // The property that all the SUs of the Block are grouped together as MI
   1361   // is used for correct reg usage tracking.
   1362   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
   1363     SIScheduleBlock *Block = CurrentBlocks[i];
   1364     std::vector<SUnit*> SUs = Block->getScheduledUnits();
   1365     Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
   1366   }
   1367 
   1368   LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
   1369   // Restore old ordering (which prevents a LIS->handleMove bug).
   1370   for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
   1371     MachineBasicBlock::iterator POld = PosOld[i-1];
   1372     MachineBasicBlock::iterator PNew = PosNew[i-1];
   1373     if (PNew != POld) {
   1374       // Update the instruction stream.
   1375       DAG->getBB()->splice(POld, DAG->getBB(), PNew);
   1376 
   1377       // Update LiveIntervals.
   1378       DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
   1379     }
   1380   }
   1381 
   1382   LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
   1383     SIScheduleBlock *Block = CurrentBlocks[i];
   1384     Block->printDebug(true);
   1385   });
   1386 }
   1387 
   1388 void SIScheduleBlockCreator::fillStats() {
   1389   unsigned DAGSize = CurrentBlocks.size();
   1390 
   1391   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
   1392     int BlockIndice = TopDownIndex2Block[i];
   1393     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
   1394     if (Block->getPreds().empty())
   1395       Block->Depth = 0;
   1396     else {
   1397       unsigned Depth = 0;
   1398       for (SIScheduleBlock *Pred : Block->getPreds()) {
   1399         if (Depth < Pred->Depth + Pred->getCost())
   1400           Depth = Pred->Depth + Pred->getCost();
   1401       }
   1402       Block->Depth = Depth;
   1403     }
   1404   }
   1405 
   1406   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
   1407     int BlockIndice = BottomUpIndex2Block[i];
   1408     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
   1409     if (Block->getSuccs().empty())
   1410       Block->Height = 0;
   1411     else {
   1412       unsigned Height = 0;
   1413       for (const auto &Succ : Block->getSuccs())
   1414         Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
   1415       Block->Height = Height;
   1416     }
   1417   }
   1418 }
   1419 
   1420 // SIScheduleBlockScheduler //
   1421 
   1422 SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
   1423                                                    SISchedulerBlockSchedulerVariant Variant,
   1424                                                    SIScheduleBlocks  BlocksStruct) :
   1425   DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
   1426   LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
   1427   SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
   1428 
   1429   // Fill the usage of every output
   1430   // Warning: while by construction we always have a link between two blocks
   1431   // when one needs a result from the other, the number of users of an output
   1432   // is not the sum of child blocks having as input the same virtual register.
   1433   // Here is an example. A produces x and y. B eats x and produces x'.
   1434   // C eats x' and y. The register coalescer may have attributed the same
   1435   // virtual register to x and x'.
   1436   // To count accurately, we do a topological sort. In case the register is
   1437   // found for several parents, we increment the usage of the one with the
   1438   // highest topological index.
   1439   LiveOutRegsNumUsages.resize(Blocks.size());
   1440   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
   1441     SIScheduleBlock *Block = Blocks[i];
   1442     for (unsigned Reg : Block->getInRegs()) {
   1443       bool Found = false;
   1444       int topoInd = -1;
   1445       for (SIScheduleBlock* Pred: Block->getPreds()) {
   1446         std::set<unsigned> PredOutRegs = Pred->getOutRegs();
   1447         std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
   1448 
   1449         if (RegPos != PredOutRegs.end()) {
   1450           Found = true;
   1451           if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
   1452             topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
   1453           }
   1454         }
   1455       }
   1456 
   1457       if (!Found)
   1458         continue;
   1459 
   1460       int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
   1461       ++LiveOutRegsNumUsages[PredID][Reg];
   1462     }
   1463   }
   1464 
   1465   LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
   1466   BlockNumPredsLeft.resize(Blocks.size());
   1467   BlockNumSuccsLeft.resize(Blocks.size());
   1468 
   1469   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
   1470     SIScheduleBlock *Block = Blocks[i];
   1471     BlockNumPredsLeft[i] = Block->getPreds().size();
   1472     BlockNumSuccsLeft[i] = Block->getSuccs().size();
   1473   }
   1474 
   1475 #ifndef NDEBUG
   1476   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
   1477     SIScheduleBlock *Block = Blocks[i];
   1478     assert(Block->getID() == i);
   1479   }
   1480 #endif
   1481 
   1482   std::set<unsigned> InRegs = DAG->getInRegs();
   1483   addLiveRegs(InRegs);
   1484 
   1485   // Increase LiveOutRegsNumUsages for blocks
   1486   // producing registers consumed in another
   1487   // scheduling region.
   1488   for (unsigned Reg : DAG->getOutRegs()) {
   1489     for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
   1490       // Do reverse traversal
   1491       int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
   1492       SIScheduleBlock *Block = Blocks[ID];
   1493       const std::set<unsigned> &OutRegs = Block->getOutRegs();
   1494 
   1495       if (OutRegs.find(Reg) == OutRegs.end())
   1496         continue;
   1497 
   1498       ++LiveOutRegsNumUsages[ID][Reg];
   1499       break;
   1500     }
   1501   }
   1502 
   1503   // Fill LiveRegsConsumers for regs that were already
   1504   // defined before scheduling.
   1505   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
   1506     SIScheduleBlock *Block = Blocks[i];
   1507     for (unsigned Reg : Block->getInRegs()) {
   1508       bool Found = false;
   1509       for (SIScheduleBlock* Pred: Block->getPreds()) {
   1510         std::set<unsigned> PredOutRegs = Pred->getOutRegs();
   1511         std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
   1512 
   1513         if (RegPos != PredOutRegs.end()) {
   1514           Found = true;
   1515           break;
   1516         }
   1517       }
   1518 
   1519       if (!Found)
   1520         ++LiveRegsConsumers[Reg];
   1521     }
   1522   }
   1523 
   1524   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
   1525     SIScheduleBlock *Block = Blocks[i];
   1526     if (BlockNumPredsLeft[i] == 0) {
   1527       ReadyBlocks.push_back(Block);
   1528     }
   1529   }
   1530 
   1531   while (SIScheduleBlock *Block = pickBlock()) {
   1532     BlocksScheduled.push_back(Block);
   1533     blockScheduled(Block);
   1534   }
   1535 
   1536   LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
   1537                                             : BlocksScheduled) {
   1538     dbgs() << ' ' << Block->getID();
   1539   } dbgs() << '\n';);
   1540 }
   1541 
   1542 bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
   1543                                                    SIBlockSchedCandidate &TryCand) {
   1544   if (!Cand.isValid()) {
   1545     TryCand.Reason = NodeOrder;
   1546     return true;
   1547   }
   1548 
   1549   // Try to hide high latencies.
   1550   if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
   1551                  Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
   1552     return true;
   1553   // Schedule high latencies early so you can hide them better.
   1554   if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
   1555                           TryCand, Cand, Latency))
   1556     return true;
   1557   if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
   1558                                                    TryCand, Cand, Depth))
   1559     return true;
   1560   if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
   1561                           Cand.NumHighLatencySuccessors,
   1562                           TryCand, Cand, Successor))
   1563     return true;
   1564   return false;
   1565 }
   1566 
   1567 bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
   1568                                                     SIBlockSchedCandidate &TryCand) {
   1569   if (!Cand.isValid()) {
   1570     TryCand.Reason = NodeOrder;
   1571     return true;
   1572   }
   1573 
   1574   if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
   1575                        TryCand, Cand, RegUsage))
   1576     return true;
   1577   if (SISched::tryGreater(TryCand.NumSuccessors > 0,
   1578                           Cand.NumSuccessors > 0,
   1579                           TryCand, Cand, Successor))
   1580     return true;
   1581   if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
   1582     return true;
   1583   if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
   1584                        TryCand, Cand, RegUsage))
   1585     return true;
   1586   return false;
   1587 }
   1588 
   1589 SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
   1590   SIBlockSchedCandidate Cand;
   1591   std::vector<SIScheduleBlock*>::iterator Best;
   1592   SIScheduleBlock *Block;
   1593   if (ReadyBlocks.empty())
   1594     return nullptr;
   1595 
   1596   DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
   1597                         VregCurrentUsage, SregCurrentUsage);
   1598   if (VregCurrentUsage > maxVregUsage)
   1599     maxVregUsage = VregCurrentUsage;
   1600   if (SregCurrentUsage > maxSregUsage)
   1601     maxSregUsage = SregCurrentUsage;
   1602   LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
   1603              for (SIScheduleBlock *Block
   1604                   : ReadyBlocks) dbgs()
   1605              << Block->getID() << ' ';
   1606              dbgs() << "\nCurrent Live:\n";
   1607              for (unsigned Reg
   1608                   : LiveRegs) dbgs()
   1609              << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
   1610              dbgs() << '\n';
   1611              dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
   1612              dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
   1613 
   1614   Cand.Block = nullptr;
   1615   for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
   1616        E = ReadyBlocks.end(); I != E; ++I) {
   1617     SIBlockSchedCandidate TryCand;
   1618     TryCand.Block = *I;
   1619     TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
   1620     TryCand.VGPRUsageDiff =
   1621       checkRegUsageImpact(TryCand.Block->getInRegs(),
   1622           TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
   1623     TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
   1624     TryCand.NumHighLatencySuccessors =
   1625       TryCand.Block->getNumHighLatencySuccessors();
   1626     TryCand.LastPosHighLatParentScheduled =
   1627       (unsigned int) std::max<int> (0,
   1628          LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
   1629            LastPosWaitedHighLatency);
   1630     TryCand.Height = TryCand.Block->Height;
   1631     // Try not to increase VGPR usage too much, else we may spill.
   1632     if (VregCurrentUsage > 120 ||
   1633         Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
   1634       if (!tryCandidateRegUsage(Cand, TryCand) &&
   1635           Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
   1636         tryCandidateLatency(Cand, TryCand);
   1637     } else {
   1638       if (!tryCandidateLatency(Cand, TryCand))
   1639         tryCandidateRegUsage(Cand, TryCand);
   1640     }
   1641     if (TryCand.Reason != NoCand) {
   1642       Cand.setBest(TryCand);
   1643       Best = I;
   1644       LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
   1645                         << getReasonStr(Cand.Reason) << '\n');
   1646     }
   1647   }
   1648 
   1649   LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
   1650              dbgs() << "Is a block with high latency instruction: "
   1651                     << (Cand.IsHighLatency ? "yes\n" : "no\n");
   1652              dbgs() << "Position of last high latency dependency: "
   1653                     << Cand.LastPosHighLatParentScheduled << '\n';
   1654              dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
   1655              dbgs() << '\n';);
   1656 
   1657   Block = Cand.Block;
   1658   ReadyBlocks.erase(Best);
   1659   return Block;
   1660 }
   1661 
   1662 // Tracking of currently alive registers to determine VGPR Usage.
   1663 
   1664 void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
   1665   for (Register Reg : Regs) {
   1666     // For now only track virtual registers.
   1667     if (!Reg.isVirtual())
   1668       continue;
   1669     // If not already in the live set, then add it.
   1670     (void) LiveRegs.insert(Reg);
   1671   }
   1672 }
   1673 
   1674 void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
   1675                                        std::set<unsigned> &Regs) {
   1676   for (unsigned Reg : Regs) {
   1677     // For now only track virtual registers.
   1678     std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
   1679     assert (Pos != LiveRegs.end() && // Reg must be live.
   1680                LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
   1681                LiveRegsConsumers[Reg] >= 1);
   1682     --LiveRegsConsumers[Reg];
   1683     if (LiveRegsConsumers[Reg] == 0)
   1684       LiveRegs.erase(Pos);
   1685   }
   1686 }
   1687 
   1688 void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
   1689   for (const auto &Block : Parent->getSuccs()) {
   1690     if (--BlockNumPredsLeft[Block.first->getID()] == 0)
   1691       ReadyBlocks.push_back(Block.first);
   1692 
   1693     if (Parent->isHighLatencyBlock() &&
   1694         Block.second == SIScheduleBlockLinkKind::Data)
   1695       LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
   1696   }
   1697 }
   1698 
   1699 void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
   1700   decreaseLiveRegs(Block, Block->getInRegs());
   1701   addLiveRegs(Block->getOutRegs());
   1702   releaseBlockSuccs(Block);
   1703   for (std::map<unsigned, unsigned>::iterator RegI =
   1704        LiveOutRegsNumUsages[Block->getID()].begin(),
   1705        E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
   1706     std::pair<unsigned, unsigned> RegP = *RegI;
   1707     // We produce this register, thus it must not be previously alive.
   1708     assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
   1709            LiveRegsConsumers[RegP.first] == 0);
   1710     LiveRegsConsumers[RegP.first] += RegP.second;
   1711   }
   1712   if (LastPosHighLatencyParentScheduled[Block->getID()] >
   1713         (unsigned)LastPosWaitedHighLatency)
   1714     LastPosWaitedHighLatency =
   1715       LastPosHighLatencyParentScheduled[Block->getID()];
   1716   ++NumBlockScheduled;
   1717 }
   1718 
   1719 std::vector<int>
   1720 SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
   1721                                      std::set<unsigned> &OutRegs) {
   1722   std::vector<int> DiffSetPressure;
   1723   DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
   1724 
   1725   for (Register Reg : InRegs) {
   1726     // For now only track virtual registers.
   1727     if (!Reg.isVirtual())
   1728       continue;
   1729     if (LiveRegsConsumers[Reg] > 1)
   1730       continue;
   1731     PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
   1732     for (; PSetI.isValid(); ++PSetI) {
   1733       DiffSetPressure[*PSetI] -= PSetI.getWeight();
   1734     }
   1735   }
   1736 
   1737   for (Register Reg : OutRegs) {
   1738     // For now only track virtual registers.
   1739     if (!Reg.isVirtual())
   1740       continue;
   1741     PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
   1742     for (; PSetI.isValid(); ++PSetI) {
   1743       DiffSetPressure[*PSetI] += PSetI.getWeight();
   1744     }
   1745   }
   1746 
   1747   return DiffSetPressure;
   1748 }
   1749 
   1750 // SIScheduler //
   1751 
   1752 struct SIScheduleBlockResult
   1753 SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
   1754                              SISchedulerBlockSchedulerVariant ScheduleVariant) {
   1755   SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
   1756   SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
   1757   std::vector<SIScheduleBlock*> ScheduledBlocks;
   1758   struct SIScheduleBlockResult Res;
   1759 
   1760   ScheduledBlocks = Scheduler.getBlocks();
   1761 
   1762   for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
   1763     SIScheduleBlock *Block = ScheduledBlocks[b];
   1764     std::vector<SUnit*> SUs = Block->getScheduledUnits();
   1765 
   1766     for (SUnit* SU : SUs)
   1767       Res.SUs.push_back(SU->NodeNum);
   1768   }
   1769 
   1770   Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
   1771   Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
   1772   return Res;
   1773 }
   1774 
   1775 // SIScheduleDAGMI //
   1776 
   1777 SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
   1778   ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
   1779   SITII = static_cast<const SIInstrInfo*>(TII);
   1780   SITRI = static_cast<const SIRegisterInfo*>(TRI);
   1781 }
   1782 
   1783 SIScheduleDAGMI::~SIScheduleDAGMI() = default;
   1784 
   1785 // Code adapted from scheduleDAG.cpp
   1786 // Does a topological sort over the SUs.
   1787 // Both TopDown and BottomUp
   1788 void SIScheduleDAGMI::topologicalSort() {
   1789   Topo.InitDAGTopologicalSorting();
   1790 
   1791   TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
   1792   BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
   1793 }
   1794 
   1795 // Move low latencies further from their user without
   1796 // increasing SGPR usage (in general)
   1797 // This is to be replaced by a better pass that would
   1798 // take into account SGPR usage (based on VGPR Usage
   1799 // and the corresponding wavefront count), that would
   1800 // try to merge groups of loads if it make sense, etc
   1801 void SIScheduleDAGMI::moveLowLatencies() {
   1802    unsigned DAGSize = SUnits.size();
   1803    int LastLowLatencyUser = -1;
   1804    int LastLowLatencyPos = -1;
   1805 
   1806    for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
   1807     SUnit *SU = &SUnits[ScheduledSUnits[i]];
   1808     bool IsLowLatencyUser = false;
   1809     unsigned MinPos = 0;
   1810 
   1811     for (SDep& PredDep : SU->Preds) {
   1812       SUnit *Pred = PredDep.getSUnit();
   1813       if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
   1814         IsLowLatencyUser = true;
   1815       }
   1816       if (Pred->NodeNum >= DAGSize)
   1817         continue;
   1818       unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
   1819       if (PredPos >= MinPos)
   1820         MinPos = PredPos + 1;
   1821     }
   1822 
   1823     if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
   1824       unsigned BestPos = LastLowLatencyUser + 1;
   1825       if ((int)BestPos <= LastLowLatencyPos)
   1826         BestPos = LastLowLatencyPos + 1;
   1827       if (BestPos < MinPos)
   1828         BestPos = MinPos;
   1829       if (BestPos < i) {
   1830         for (unsigned u = i; u > BestPos; --u) {
   1831           ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
   1832           ScheduledSUnits[u] = ScheduledSUnits[u-1];
   1833         }
   1834         ScheduledSUnits[BestPos] = SU->NodeNum;
   1835         ScheduledSUnitsInv[SU->NodeNum] = BestPos;
   1836       }
   1837       LastLowLatencyPos = BestPos;
   1838       if (IsLowLatencyUser)
   1839         LastLowLatencyUser = BestPos;
   1840     } else if (IsLowLatencyUser) {
   1841       LastLowLatencyUser = i;
   1842     // Moves COPY instructions on which depends
   1843     // the low latency instructions too.
   1844     } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
   1845       bool CopyForLowLat = false;
   1846       for (SDep& SuccDep : SU->Succs) {
   1847         SUnit *Succ = SuccDep.getSUnit();
   1848         if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
   1849           continue;
   1850         if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
   1851           CopyForLowLat = true;
   1852         }
   1853       }
   1854       if (!CopyForLowLat)
   1855         continue;
   1856       if (MinPos < i) {
   1857         for (unsigned u = i; u > MinPos; --u) {
   1858           ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
   1859           ScheduledSUnits[u] = ScheduledSUnits[u-1];
   1860         }
   1861         ScheduledSUnits[MinPos] = SU->NodeNum;
   1862         ScheduledSUnitsInv[SU->NodeNum] = MinPos;
   1863       }
   1864     }
   1865   }
   1866 }
   1867 
   1868 void SIScheduleDAGMI::restoreSULinksLeft() {
   1869   for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
   1870     SUnits[i].isScheduled = false;
   1871     SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
   1872     SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
   1873     SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
   1874     SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
   1875   }
   1876 }
   1877 
   1878 // Return the Vgpr and Sgpr usage corresponding to some virtual registers.
   1879 template<typename _Iterator> void
   1880 SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
   1881                                   unsigned &VgprUsage, unsigned &SgprUsage) {
   1882   VgprUsage = 0;
   1883   SgprUsage = 0;
   1884   for (_Iterator RegI = First; RegI != End; ++RegI) {
   1885     Register Reg = *RegI;
   1886     // For now only track virtual registers
   1887     if (!Reg.isVirtual())
   1888       continue;
   1889     PSetIterator PSetI = MRI.getPressureSets(Reg);
   1890     for (; PSetI.isValid(); ++PSetI) {
   1891       if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
   1892         VgprUsage += PSetI.getWeight();
   1893       else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
   1894         SgprUsage += PSetI.getWeight();
   1895     }
   1896   }
   1897 }
   1898 
   1899 void SIScheduleDAGMI::schedule()
   1900 {
   1901   SmallVector<SUnit*, 8> TopRoots, BotRoots;
   1902   SIScheduleBlockResult Best, Temp;
   1903   LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
   1904 
   1905   buildDAGWithRegPressure();
   1906   LLVM_DEBUG(dump());
   1907 
   1908   topologicalSort();
   1909   findRootsAndBiasEdges(TopRoots, BotRoots);
   1910   // We reuse several ScheduleDAGMI and ScheduleDAGMILive
   1911   // functions, but to make them happy we must initialize
   1912   // the default Scheduler implementation (even if we do not
   1913   // run it)
   1914   SchedImpl->initialize(this);
   1915   initQueues(TopRoots, BotRoots);
   1916 
   1917   // Fill some stats to help scheduling.
   1918 
   1919   SUnitsLinksBackup = SUnits;
   1920   IsLowLatencySU.clear();
   1921   LowLatencyOffset.clear();
   1922   IsHighLatencySU.clear();
   1923 
   1924   IsLowLatencySU.resize(SUnits.size(), 0);
   1925   LowLatencyOffset.resize(SUnits.size(), 0);
   1926   IsHighLatencySU.resize(SUnits.size(), 0);
   1927 
   1928   for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
   1929     SUnit *SU = &SUnits[i];
   1930     const MachineOperand *BaseLatOp;
   1931     int64_t OffLatReg;
   1932     if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
   1933       IsLowLatencySU[i] = 1;
   1934       bool OffsetIsScalable;
   1935       if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
   1936                                          OffsetIsScalable, TRI))
   1937         LowLatencyOffset[i] = OffLatReg;
   1938     } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
   1939       IsHighLatencySU[i] = 1;
   1940   }
   1941 
   1942   SIScheduler Scheduler(this);
   1943   Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
   1944                                    SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
   1945 
   1946   // if VGPR usage is extremely high, try other good performing variants
   1947   // which could lead to lower VGPR usage
   1948   if (Best.MaxVGPRUsage > 180) {
   1949     static const std::pair<SISchedulerBlockCreatorVariant,
   1950                            SISchedulerBlockSchedulerVariant>
   1951         Variants[] = {
   1952       { LatenciesAlone, BlockRegUsageLatency },
   1953 //      { LatenciesAlone, BlockRegUsage },
   1954       { LatenciesGrouped, BlockLatencyRegUsage },
   1955 //      { LatenciesGrouped, BlockRegUsageLatency },
   1956 //      { LatenciesGrouped, BlockRegUsage },
   1957       { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
   1958 //      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
   1959 //      { LatenciesAlonePlusConsecutive, BlockRegUsage }
   1960     };
   1961     for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
   1962       Temp = Scheduler.scheduleVariant(v.first, v.second);
   1963       if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
   1964         Best = Temp;
   1965     }
   1966   }
   1967   // if VGPR usage is still extremely high, we may spill. Try other variants
   1968   // which are less performing, but that could lead to lower VGPR usage.
   1969   if (Best.MaxVGPRUsage > 200) {
   1970     static const std::pair<SISchedulerBlockCreatorVariant,
   1971                            SISchedulerBlockSchedulerVariant>
   1972         Variants[] = {
   1973 //      { LatenciesAlone, BlockRegUsageLatency },
   1974       { LatenciesAlone, BlockRegUsage },
   1975 //      { LatenciesGrouped, BlockLatencyRegUsage },
   1976       { LatenciesGrouped, BlockRegUsageLatency },
   1977       { LatenciesGrouped, BlockRegUsage },
   1978 //      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
   1979       { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
   1980       { LatenciesAlonePlusConsecutive, BlockRegUsage }
   1981     };
   1982     for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
   1983       Temp = Scheduler.scheduleVariant(v.first, v.second);
   1984       if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
   1985         Best = Temp;
   1986     }
   1987   }
   1988 
   1989   ScheduledSUnits = Best.SUs;
   1990   ScheduledSUnitsInv.resize(SUnits.size());
   1991 
   1992   for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
   1993     ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
   1994   }
   1995 
   1996   moveLowLatencies();
   1997 
   1998   // Tell the outside world about the result of the scheduling.
   1999 
   2000   assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
   2001   TopRPTracker.setPos(CurrentTop);
   2002 
   2003   for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
   2004        E = ScheduledSUnits.end(); I != E; ++I) {
   2005     SUnit *SU = &SUnits[*I];
   2006 
   2007     scheduleMI(SU, true);
   2008 
   2009     LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
   2010                       << *SU->getInstr());
   2011   }
   2012 
   2013   assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
   2014 
   2015   placeDebugValues();
   2016 
   2017   LLVM_DEBUG({
   2018     dbgs() << "*** Final schedule for "
   2019            << printMBBReference(*begin()->getParent()) << " ***\n";
   2020     dumpSchedule();
   2021     dbgs() << '\n';
   2022   });
   2023 }
   2024