Home | History | Annotate | Line # | Download | only in Utils
      1 //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file contains an optimization for div and rem on architectures that
     10 // execute short instructions significantly faster than longer instructions.
     11 // For example, on Intel Atom 32-bit divides are slow enough that during
     12 // runtime it is profitable to check the value of the operands, and if they are
     13 // positive and less than 256 use an unsigned 8-bit divide.
     14 //
     15 //===----------------------------------------------------------------------===//
     16 
     17 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
     18 #include "llvm/ADT/DenseMap.h"
     19 #include "llvm/ADT/None.h"
     20 #include "llvm/ADT/Optional.h"
     21 #include "llvm/ADT/STLExtras.h"
     22 #include "llvm/ADT/SmallPtrSet.h"
     23 #include "llvm/Transforms/Utils/Local.h"
     24 #include "llvm/Analysis/ValueTracking.h"
     25 #include "llvm/IR/BasicBlock.h"
     26 #include "llvm/IR/Constants.h"
     27 #include "llvm/IR/DerivedTypes.h"
     28 #include "llvm/IR/Function.h"
     29 #include "llvm/IR/IRBuilder.h"
     30 #include "llvm/IR/Instruction.h"
     31 #include "llvm/IR/Instructions.h"
     32 #include "llvm/IR/Module.h"
     33 #include "llvm/IR/Type.h"
     34 #include "llvm/IR/Value.h"
     35 #include "llvm/Support/Casting.h"
     36 #include "llvm/Support/KnownBits.h"
     37 #include <cassert>
     38 #include <cstdint>
     39 
     40 using namespace llvm;
     41 
     42 #define DEBUG_TYPE "bypass-slow-division"
     43 
     44 namespace {
     45 
     46   struct QuotRemPair {
     47     Value *Quotient;
     48     Value *Remainder;
     49 
     50     QuotRemPair(Value *InQuotient, Value *InRemainder)
     51         : Quotient(InQuotient), Remainder(InRemainder) {}
     52   };
     53 
     54   /// A quotient and remainder, plus a BB from which they logically "originate".
     55   /// If you use Quotient or Remainder in a Phi node, you should use BB as its
     56   /// corresponding predecessor.
     57   struct QuotRemWithBB {
     58     BasicBlock *BB = nullptr;
     59     Value *Quotient = nullptr;
     60     Value *Remainder = nullptr;
     61   };
     62 
     63 using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
     64 using BypassWidthsTy = DenseMap<unsigned, unsigned>;
     65 using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
     66 
     67 enum ValueRange {
     68   /// Operand definitely fits into BypassType. No runtime checks are needed.
     69   VALRNG_KNOWN_SHORT,
     70   /// A runtime check is required, as value range is unknown.
     71   VALRNG_UNKNOWN,
     72   /// Operand is unlikely to fit into BypassType. The bypassing should be
     73   /// disabled.
     74   VALRNG_LIKELY_LONG
     75 };
     76 
     77 class FastDivInsertionTask {
     78   bool IsValidTask = false;
     79   Instruction *SlowDivOrRem = nullptr;
     80   IntegerType *BypassType = nullptr;
     81   BasicBlock *MainBB = nullptr;
     82 
     83   bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
     84   ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
     85   QuotRemWithBB createSlowBB(BasicBlock *Successor);
     86   QuotRemWithBB createFastBB(BasicBlock *Successor);
     87   QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
     88                                    BasicBlock *PhiBB);
     89   Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
     90   Optional<QuotRemPair> insertFastDivAndRem();
     91 
     92   bool isSignedOp() {
     93     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
     94            SlowDivOrRem->getOpcode() == Instruction::SRem;
     95   }
     96 
     97   bool isDivisionOp() {
     98     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
     99            SlowDivOrRem->getOpcode() == Instruction::UDiv;
    100   }
    101 
    102   Type *getSlowType() { return SlowDivOrRem->getType(); }
    103 
    104 public:
    105   FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
    106 
    107   Value *getReplacement(DivCacheTy &Cache);
    108 };
    109 
    110 } // end anonymous namespace
    111 
    112 FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
    113                                            const BypassWidthsTy &BypassWidths) {
    114   switch (I->getOpcode()) {
    115   case Instruction::UDiv:
    116   case Instruction::SDiv:
    117   case Instruction::URem:
    118   case Instruction::SRem:
    119     SlowDivOrRem = I;
    120     break;
    121   default:
    122     // I is not a div/rem operation.
    123     return;
    124   }
    125 
    126   // Skip division on vector types. Only optimize integer instructions.
    127   IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
    128   if (!SlowType)
    129     return;
    130 
    131   // Skip if this bitwidth is not bypassed.
    132   auto BI = BypassWidths.find(SlowType->getBitWidth());
    133   if (BI == BypassWidths.end())
    134     return;
    135 
    136   // Get type for div/rem instruction with bypass bitwidth.
    137   IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
    138   BypassType = BT;
    139 
    140   // The original basic block.
    141   MainBB = I->getParent();
    142 
    143   // The instruction is indeed a slow div or rem operation.
    144   IsValidTask = true;
    145 }
    146 
    147 /// Reuses previously-computed dividend or remainder from the current BB if
    148 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to
    149 /// perform the optimization and caches the resulting dividend and remainder.
    150 /// If no replacement can be generated, nullptr is returned.
    151 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
    152   // First, make sure that the task is valid.
    153   if (!IsValidTask)
    154     return nullptr;
    155 
    156   // Then, look for a value in Cache.
    157   Value *Dividend = SlowDivOrRem->getOperand(0);
    158   Value *Divisor = SlowDivOrRem->getOperand(1);
    159   DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
    160   auto CacheI = Cache.find(Key);
    161 
    162   if (CacheI == Cache.end()) {
    163     // If previous instance does not exist, try to insert fast div.
    164     Optional<QuotRemPair> OptResult = insertFastDivAndRem();
    165     // Bail out if insertFastDivAndRem has failed.
    166     if (!OptResult)
    167       return nullptr;
    168     CacheI = Cache.insert({Key, *OptResult}).first;
    169   }
    170 
    171   QuotRemPair &Value = CacheI->second;
    172   return isDivisionOp() ? Value.Quotient : Value.Remainder;
    173 }
    174 
    175 /// Check if a value looks like a hash.
    176 ///
    177 /// The routine is expected to detect values computed using the most common hash
    178 /// algorithms. Typically, hash computations end with one of the following
    179 /// instructions:
    180 ///
    181 /// 1) MUL with a constant wider than BypassType
    182 /// 2) XOR instruction
    183 ///
    184 /// And even if we are wrong and the value is not a hash, it is still quite
    185 /// unlikely that such values will fit into BypassType.
    186 ///
    187 /// To detect string hash algorithms like FNV we have to look through PHI-nodes.
    188 /// It is implemented as a depth-first search for values that look neither long
    189 /// nor hash-like.
    190 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
    191   Instruction *I = dyn_cast<Instruction>(V);
    192   if (!I)
    193     return false;
    194 
    195   switch (I->getOpcode()) {
    196   case Instruction::Xor:
    197     return true;
    198   case Instruction::Mul: {
    199     // After Constant Hoisting pass, long constants may be represented as
    200     // bitcast instructions. As a result, some constants may look like an
    201     // instruction at first, and an additional check is necessary to find out if
    202     // an operand is actually a constant.
    203     Value *Op1 = I->getOperand(1);
    204     ConstantInt *C = dyn_cast<ConstantInt>(Op1);
    205     if (!C && isa<BitCastInst>(Op1))
    206       C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
    207     return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
    208   }
    209   case Instruction::PHI:
    210     // Stop IR traversal in case of a crazy input code. This limits recursion
    211     // depth.
    212     if (Visited.size() >= 16)
    213       return false;
    214     // Do not visit nodes that have been visited already. We return true because
    215     // it means that we couldn't find any value that doesn't look hash-like.
    216     if (!Visited.insert(I).second)
    217       return true;
    218     return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
    219       // Ignore undef values as they probably don't affect the division
    220       // operands.
    221       return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
    222              isa<UndefValue>(V);
    223     });
    224   default:
    225     return false;
    226   }
    227 }
    228 
    229 /// Check if an integer value fits into our bypass type.
    230 ValueRange FastDivInsertionTask::getValueRange(Value *V,
    231                                                VisitedSetTy &Visited) {
    232   unsigned ShortLen = BypassType->getBitWidth();
    233   unsigned LongLen = V->getType()->getIntegerBitWidth();
    234 
    235   assert(LongLen > ShortLen && "Value type must be wider than BypassType");
    236   unsigned HiBits = LongLen - ShortLen;
    237 
    238   const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
    239   KnownBits Known(LongLen);
    240 
    241   computeKnownBits(V, Known, DL);
    242 
    243   if (Known.countMinLeadingZeros() >= HiBits)
    244     return VALRNG_KNOWN_SHORT;
    245 
    246   if (Known.countMaxLeadingZeros() < HiBits)
    247     return VALRNG_LIKELY_LONG;
    248 
    249   // Long integer divisions are often used in hashtable implementations. It's
    250   // not worth bypassing such divisions because hash values are extremely
    251   // unlikely to have enough leading zeros. The call below tries to detect
    252   // values that are unlikely to fit BypassType (including hashes).
    253   if (isHashLikeValue(V, Visited))
    254     return VALRNG_LIKELY_LONG;
    255 
    256   return VALRNG_UNKNOWN;
    257 }
    258 
    259 /// Add new basic block for slow div and rem operations and put it before
    260 /// SuccessorBB.
    261 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
    262   QuotRemWithBB DivRemPair;
    263   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
    264                                      MainBB->getParent(), SuccessorBB);
    265   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
    266   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
    267 
    268   Value *Dividend = SlowDivOrRem->getOperand(0);
    269   Value *Divisor = SlowDivOrRem->getOperand(1);
    270 
    271   if (isSignedOp()) {
    272     DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
    273     DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
    274   } else {
    275     DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
    276     DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
    277   }
    278 
    279   Builder.CreateBr(SuccessorBB);
    280   return DivRemPair;
    281 }
    282 
    283 /// Add new basic block for fast div and rem operations and put it before
    284 /// SuccessorBB.
    285 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
    286   QuotRemWithBB DivRemPair;
    287   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
    288                                      MainBB->getParent(), SuccessorBB);
    289   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
    290   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
    291 
    292   Value *Dividend = SlowDivOrRem->getOperand(0);
    293   Value *Divisor = SlowDivOrRem->getOperand(1);
    294   Value *ShortDivisorV =
    295       Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
    296   Value *ShortDividendV =
    297       Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
    298 
    299   // udiv/urem because this optimization only handles positive numbers.
    300   Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
    301   Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
    302   DivRemPair.Quotient =
    303       Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
    304   DivRemPair.Remainder =
    305       Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
    306   Builder.CreateBr(SuccessorBB);
    307 
    308   return DivRemPair;
    309 }
    310 
    311 /// Creates Phi nodes for result of Div and Rem.
    312 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
    313                                                        QuotRemWithBB &RHS,
    314                                                        BasicBlock *PhiBB) {
    315   IRBuilder<> Builder(PhiBB, PhiBB->begin());
    316   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
    317   PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
    318   QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
    319   QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
    320   PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
    321   RemPhi->addIncoming(LHS.Remainder, LHS.BB);
    322   RemPhi->addIncoming(RHS.Remainder, RHS.BB);
    323   return QuotRemPair(QuoPhi, RemPhi);
    324 }
    325 
    326 /// Creates a runtime check to test whether both the divisor and dividend fit
    327 /// into BypassType. The check is inserted at the end of MainBB. True return
    328 /// value means that the operands fit. Either of the operands may be NULL if it
    329 /// doesn't need a runtime check.
    330 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
    331   assert((Op1 || Op2) && "Nothing to check");
    332   IRBuilder<> Builder(MainBB, MainBB->end());
    333   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
    334 
    335   Value *OrV;
    336   if (Op1 && Op2)
    337     OrV = Builder.CreateOr(Op1, Op2);
    338   else
    339     OrV = Op1 ? Op1 : Op2;
    340 
    341   // BitMask is inverted to check if the operands are
    342   // larger than the bypass type
    343   uint64_t BitMask = ~BypassType->getBitMask();
    344   Value *AndV = Builder.CreateAnd(OrV, BitMask);
    345 
    346   // Compare operand values
    347   Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
    348   return Builder.CreateICmpEQ(AndV, ZeroV);
    349 }
    350 
    351 /// Substitutes the div/rem instruction with code that checks the value of the
    352 /// operands and uses a shorter-faster div/rem instruction when possible.
    353 Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
    354   Value *Dividend = SlowDivOrRem->getOperand(0);
    355   Value *Divisor = SlowDivOrRem->getOperand(1);
    356 
    357   VisitedSetTy SetL;
    358   ValueRange DividendRange = getValueRange(Dividend, SetL);
    359   if (DividendRange == VALRNG_LIKELY_LONG)
    360     return None;
    361 
    362   VisitedSetTy SetR;
    363   ValueRange DivisorRange = getValueRange(Divisor, SetR);
    364   if (DivisorRange == VALRNG_LIKELY_LONG)
    365     return None;
    366 
    367   bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
    368   bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
    369 
    370   if (DividendShort && DivisorShort) {
    371     // If both operands are known to be short then just replace the long
    372     // division with a short one in-place.  Since we're not introducing control
    373     // flow in this case, narrowing the division is always a win, even if the
    374     // divisor is a constant (and will later get replaced by a multiplication).
    375 
    376     IRBuilder<> Builder(SlowDivOrRem);
    377     Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
    378     Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
    379     Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
    380     Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
    381     Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
    382     Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
    383     return QuotRemPair(ExtDiv, ExtRem);
    384   }
    385 
    386   if (isa<ConstantInt>(Divisor)) {
    387     // If the divisor is not a constant, DAGCombiner will convert it to a
    388     // multiplication by a magic constant.  It isn't clear if it is worth
    389     // introducing control flow to get a narrower multiply.
    390     return None;
    391   }
    392 
    393   // After Constant Hoisting pass, long constants may be represented as
    394   // bitcast instructions. As a result, some constants may look like an
    395   // instruction at first, and an additional check is necessary to find out if
    396   // an operand is actually a constant.
    397   if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
    398     if (BCI->getParent() == SlowDivOrRem->getParent() &&
    399         isa<ConstantInt>(BCI->getOperand(0)))
    400       return None;
    401 
    402   IRBuilder<> Builder(MainBB, MainBB->end());
    403   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
    404 
    405   if (DividendShort && !isSignedOp()) {
    406     // If the division is unsigned and Dividend is known to be short, then
    407     // either
    408     // 1) Divisor is less or equal to Dividend, and the result can be computed
    409     //    with a short division.
    410     // 2) Divisor is greater than Dividend. In this case, no division is needed
    411     //    at all: The quotient is 0 and the remainder is equal to Dividend.
    412     //
    413     // So instead of checking at runtime whether Divisor fits into BypassType,
    414     // we emit a runtime check to differentiate between these two cases. This
    415     // lets us entirely avoid a long div.
    416 
    417     // Split the basic block before the div/rem.
    418     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    419     // Remove the unconditional branch from MainBB to SuccessorBB.
    420     MainBB->getInstList().back().eraseFromParent();
    421     QuotRemWithBB Long;
    422     Long.BB = MainBB;
    423     Long.Quotient = ConstantInt::get(getSlowType(), 0);
    424     Long.Remainder = Dividend;
    425     QuotRemWithBB Fast = createFastBB(SuccessorBB);
    426     QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
    427     Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
    428     Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
    429     return Result;
    430   } else {
    431     // General case. Create both slow and fast div/rem pairs and choose one of
    432     // them at runtime.
    433 
    434     // Split the basic block before the div/rem.
    435     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    436     // Remove the unconditional branch from MainBB to SuccessorBB.
    437     MainBB->getInstList().back().eraseFromParent();
    438     QuotRemWithBB Fast = createFastBB(SuccessorBB);
    439     QuotRemWithBB Slow = createSlowBB(SuccessorBB);
    440     QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
    441     Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
    442                                             DivisorShort ? nullptr : Divisor);
    443     Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
    444     return Result;
    445   }
    446 }
    447 
    448 /// This optimization identifies DIV/REM instructions in a BB that can be
    449 /// profitably bypassed and carried out with a shorter, faster divide.
    450 bool llvm::bypassSlowDivision(BasicBlock *BB,
    451                               const BypassWidthsTy &BypassWidths) {
    452   DivCacheTy PerBBDivCache;
    453 
    454   bool MadeChange = false;
    455   Instruction *Next = &*BB->begin();
    456   while (Next != nullptr) {
    457     // We may add instructions immediately after I, but we want to skip over
    458     // them.
    459     Instruction *I = Next;
    460     Next = Next->getNextNode();
    461 
    462     // Ignore dead code to save time and avoid bugs.
    463     if (I->hasNUses(0))
    464       continue;
    465 
    466     FastDivInsertionTask Task(I, BypassWidths);
    467     if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
    468       I->replaceAllUsesWith(Replacement);
    469       I->eraseFromParent();
    470       MadeChange = true;
    471     }
    472   }
    473 
    474   // Above we eagerly create divs and rems, as pairs, so that we can efficiently
    475   // create divrem machine instructions.  Now erase any unused divs / rems so we
    476   // don't leave extra instructions sitting around.
    477   for (auto &KV : PerBBDivCache)
    478     for (Value *V : {KV.second.Quotient, KV.second.Remainder})
    479       RecursivelyDeleteTriviallyDeadInstructions(V);
    480 
    481   return MadeChange;
    482 }
    483