Home | History | Annotate | Line # | Download | only in Vectorize
      1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
     10 // stores that can be put together into vector-stores. Next, it attempts to
     11 // construct vectorizable tree using the use-def chains. If a profitable tree
     12 // was found, the SLP vectorizer performs vectorization on the tree.
     13 //
     14 // The pass is inspired by the work described in the paper:
     15 //  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
     16 //
     17 //===----------------------------------------------------------------------===//
     18 
     19 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
     20 #include "llvm/ADT/DenseMap.h"
     21 #include "llvm/ADT/DenseSet.h"
     22 #include "llvm/ADT/Optional.h"
     23 #include "llvm/ADT/PostOrderIterator.h"
     24 #include "llvm/ADT/STLExtras.h"
     25 #include "llvm/ADT/SetVector.h"
     26 #include "llvm/ADT/SmallBitVector.h"
     27 #include "llvm/ADT/SmallPtrSet.h"
     28 #include "llvm/ADT/SmallSet.h"
     29 #include "llvm/ADT/SmallString.h"
     30 #include "llvm/ADT/Statistic.h"
     31 #include "llvm/ADT/iterator.h"
     32 #include "llvm/ADT/iterator_range.h"
     33 #include "llvm/Analysis/AliasAnalysis.h"
     34 #include "llvm/Analysis/AssumptionCache.h"
     35 #include "llvm/Analysis/CodeMetrics.h"
     36 #include "llvm/Analysis/DemandedBits.h"
     37 #include "llvm/Analysis/GlobalsModRef.h"
     38 #include "llvm/Analysis/IVDescriptors.h"
     39 #include "llvm/Analysis/LoopAccessAnalysis.h"
     40 #include "llvm/Analysis/LoopInfo.h"
     41 #include "llvm/Analysis/MemoryLocation.h"
     42 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
     43 #include "llvm/Analysis/ScalarEvolution.h"
     44 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
     45 #include "llvm/Analysis/TargetLibraryInfo.h"
     46 #include "llvm/Analysis/TargetTransformInfo.h"
     47 #include "llvm/Analysis/ValueTracking.h"
     48 #include "llvm/Analysis/VectorUtils.h"
     49 #include "llvm/IR/Attributes.h"
     50 #include "llvm/IR/BasicBlock.h"
     51 #include "llvm/IR/Constant.h"
     52 #include "llvm/IR/Constants.h"
     53 #include "llvm/IR/DataLayout.h"
     54 #include "llvm/IR/DebugLoc.h"
     55 #include "llvm/IR/DerivedTypes.h"
     56 #include "llvm/IR/Dominators.h"
     57 #include "llvm/IR/Function.h"
     58 #include "llvm/IR/IRBuilder.h"
     59 #include "llvm/IR/InstrTypes.h"
     60 #include "llvm/IR/Instruction.h"
     61 #include "llvm/IR/Instructions.h"
     62 #include "llvm/IR/IntrinsicInst.h"
     63 #include "llvm/IR/Intrinsics.h"
     64 #include "llvm/IR/Module.h"
     65 #include "llvm/IR/NoFolder.h"
     66 #include "llvm/IR/Operator.h"
     67 #include "llvm/IR/PatternMatch.h"
     68 #include "llvm/IR/Type.h"
     69 #include "llvm/IR/Use.h"
     70 #include "llvm/IR/User.h"
     71 #include "llvm/IR/Value.h"
     72 #include "llvm/IR/ValueHandle.h"
     73 #include "llvm/IR/Verifier.h"
     74 #include "llvm/InitializePasses.h"
     75 #include "llvm/Pass.h"
     76 #include "llvm/Support/Casting.h"
     77 #include "llvm/Support/CommandLine.h"
     78 #include "llvm/Support/Compiler.h"
     79 #include "llvm/Support/DOTGraphTraits.h"
     80 #include "llvm/Support/Debug.h"
     81 #include "llvm/Support/ErrorHandling.h"
     82 #include "llvm/Support/GraphWriter.h"
     83 #include "llvm/Support/InstructionCost.h"
     84 #include "llvm/Support/KnownBits.h"
     85 #include "llvm/Support/MathExtras.h"
     86 #include "llvm/Support/raw_ostream.h"
     87 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
     88 #include "llvm/Transforms/Utils/LoopUtils.h"
     89 #include "llvm/Transforms/Vectorize.h"
     90 #include <algorithm>
     91 #include <cassert>
     92 #include <cstdint>
     93 #include <iterator>
     94 #include <memory>
     95 #include <set>
     96 #include <string>
     97 #include <tuple>
     98 #include <utility>
     99 #include <vector>
    100 
    101 using namespace llvm;
    102 using namespace llvm::PatternMatch;
    103 using namespace slpvectorizer;
    104 
    105 #define SV_NAME "slp-vectorizer"
    106 #define DEBUG_TYPE "SLP"
    107 
// Pass statistic; the description string states what is counted.
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

// "vectorize-slp" switch, enabled by default. Unlike the options below it is
// not declared static.
cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                                  cl::desc("Run the SLP vectorization passes"));

// "slp-threshold": gain that must be exceeded before vectorizing (default 0).
static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

// "slp-vectorize-hor": enables horizontal reduction vectorization (default on).
static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

// "slp-vectorize-hor-store": enables horizontal reductions that feed a store
// (default off).
static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

// "slp-max-reg-size": register size in bits. The BoUpSLP constructor only
// reads this option when it was given on the command line; otherwise the
// value comes from the target.
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

// "slp-max-vf": upper bound on the vectorization factor; 0 means unlimited.
static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

// "slp-max-store-lookup": depth limit for the consecutive-store lookup
// (default 32).
static cl::opt<int>
MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
    cl::desc("Maximum depth of the lookup for consecutive stores."));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

// "slp-min-reg-size": minimum register size in bits. The BoUpSLP constructor
// only reads this option when it was given on the command line.
// NOTE(review): the description string is identical to the one used for
// "slp-max-reg-size"; presumably it should mention the minimum - confirm.
static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

// "slp-recursion-max-depth": recursion limit for tree construction
// (default 12).
static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

// "slp-min-tree-size": trees smaller than this are only vectorized when fully
// vectorizable, per the description (default 3).
static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The look-ahead heuristic goes through the users of the bundle to calculate
// the users' cost in getExternalUsesCost(). To avoid a compilation time
// increase we limit the number of users visited to this value.
static cl::opt<unsigned> LookAheadUsersBudget(
    "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
    cl::desc("The maximum number of users to visit while visiting the "
             "predecessors. This prevents compilation time increase."));

// "view-slp-tree": requests a Graphviz display of the SLP trees. No initial
// value is given here.
static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: the maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;
    189 
    190 /// Predicate for the element types that the SLP vectorizer supports.
    191 ///
    192 /// The most important thing to filter here are types which are invalid in LLVM
    193 /// vectors. We also filter target specific types which have absolutely no
    194 /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
    195 /// avoids spending time checking the cost model and realizing that they will
    196 /// be inevitably scalarized.
    197 static bool isValidElementType(Type *Ty) {
    198   return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
    199          !Ty->isPPC_FP128Ty();
    200 }
    201 
    202 /// \returns true if all of the instructions in \p VL are in the same block or
    203 /// false otherwise.
    204 static bool allSameBlock(ArrayRef<Value *> VL) {
    205   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
    206   if (!I0)
    207     return false;
    208   BasicBlock *BB = I0->getParent();
    209   for (int I = 1, E = VL.size(); I < E; I++) {
    210     auto *II = dyn_cast<Instruction>(VL[I]);
    211     if (!II)
    212       return false;
    213 
    214     if (BB != II->getParent())
    215       return false;
    216   }
    217   return true;
    218 }
    219 
    220 /// \returns True if all of the values in \p VL are constants (but not
    221 /// globals/constant expressions).
    222 static bool allConstant(ArrayRef<Value *> VL) {
    223   // Constant expressions and globals can't be vectorized like normal integer/FP
    224   // constants.
    225   for (Value *i : VL)
    226     if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i))
    227       return false;
    228   return true;
    229 }
    230 
    231 /// \returns True if all of the values in \p VL are identical.
    232 static bool isSplat(ArrayRef<Value *> VL) {
    233   for (unsigned i = 1, e = VL.size(); i < e; ++i)
    234     if (VL[i] != VL[0])
    235       return false;
    236   return true;
    237 }
    238 
    239 /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
    240 static bool isCommutative(Instruction *I) {
    241   if (auto *Cmp = dyn_cast<CmpInst>(I))
    242     return Cmp->isCommutative();
    243   if (auto *BO = dyn_cast<BinaryOperator>(I))
    244     return BO->isCommutative();
    245   // TODO: This should check for generic Instruction::isCommutative(), but
    246   //       we need to confirm that the caller code correctly handles Intrinsics
    247   //       for example (does not have 2 operands).
    248   return false;
    249 }
    250 
    251 /// Checks if the vector of instructions can be represented as a shuffle, like:
    252 /// %x0 = extractelement <4 x i8> %x, i32 0
    253 /// %x3 = extractelement <4 x i8> %x, i32 3
    254 /// %y1 = extractelement <4 x i8> %y, i32 1
    255 /// %y2 = extractelement <4 x i8> %y, i32 2
    256 /// %x0x0 = mul i8 %x0, %x0
    257 /// %x3x3 = mul i8 %x3, %x3
    258 /// %y1y1 = mul i8 %y1, %y1
    259 /// %y2y2 = mul i8 %y2, %y2
    260 /// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
    261 /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
    262 /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
    263 /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
    264 /// ret <4 x i8> %ins4
    265 /// can be transformed into:
    266 /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
    267 ///                                                         i32 6>
    268 /// %2 = mul <4 x i8> %1, %1
    269 /// ret <4 x i8> %2
    270 /// We convert this initially to something like:
    271 /// %x0 = extractelement <4 x i8> %x, i32 0
    272 /// %x3 = extractelement <4 x i8> %x, i32 3
    273 /// %y1 = extractelement <4 x i8> %y, i32 1
    274 /// %y2 = extractelement <4 x i8> %y, i32 2
    275 /// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
    276 /// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
    277 /// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
    278 /// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
    279 /// %5 = mul <4 x i8> %4, %4
    280 /// %6 = extractelement <4 x i8> %5, i32 0
    281 /// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
    282 /// %7 = extractelement <4 x i8> %5, i32 1
    283 /// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
    284 /// %8 = extractelement <4 x i8> %5, i32 2
    285 /// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
    286 /// %9 = extractelement <4 x i8> %5, i32 3
    287 /// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
    288 /// ret <4 x i8> %ins4
    289 /// InstCombiner transforms this into a shuffle and vector mul
    290 /// Mask will return the Shuffle Mask equivalent to the extracted elements.
    291 /// TODO: Can we split off and reuse the shuffle mask detection from
    292 /// TargetTransformInfo::getInstructionThroughput?
    293 static Optional<TargetTransformInfo::ShuffleKind>
    294 isShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
    295   auto *EI0 = cast<ExtractElementInst>(VL[0]);
    296   unsigned Size =
    297       cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
    298   Value *Vec1 = nullptr;
    299   Value *Vec2 = nullptr;
    300   enum ShuffleMode { Unknown, Select, Permute };
    301   ShuffleMode CommonShuffleMode = Unknown;
    302   for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    303     auto *EI = cast<ExtractElementInst>(VL[I]);
    304     auto *Vec = EI->getVectorOperand();
    305     // All vector operands must have the same number of vector elements.
    306     if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
    307       return None;
    308     auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    309     if (!Idx)
    310       return None;
    311     // Undefined behavior if Idx is negative or >= Size.
    312     if (Idx->getValue().uge(Size)) {
    313       Mask.push_back(UndefMaskElem);
    314       continue;
    315     }
    316     unsigned IntIdx = Idx->getValue().getZExtValue();
    317     Mask.push_back(IntIdx);
    318     // We can extractelement from undef or poison vector.
    319     if (isa<UndefValue>(Vec))
    320       continue;
    321     // For correct shuffling we have to have at most 2 different vector operands
    322     // in all extractelement instructions.
    323     if (!Vec1 || Vec1 == Vec)
    324       Vec1 = Vec;
    325     else if (!Vec2 || Vec2 == Vec)
    326       Vec2 = Vec;
    327     else
    328       return None;
    329     if (CommonShuffleMode == Permute)
    330       continue;
    331     // If the extract index is not the same as the operation number, it is a
    332     // permutation.
    333     if (IntIdx != I) {
    334       CommonShuffleMode = Permute;
    335       continue;
    336     }
    337     CommonShuffleMode = Select;
    338   }
    339   // If we're not crossing lanes in different vectors, consider it as blending.
    340   if (CommonShuffleMode == Select && Vec2)
    341     return TargetTransformInfo::SK_Select;
    342   // If Vec2 was never used, we have a permutation of a single vector, otherwise
    343   // we have permutation of 2 vectors.
    344   return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
    345               : TargetTransformInfo::SK_PermuteSingleSrc;
    346 }
    347 
    348 namespace {
    349 
    350 /// Main data required for vectorization of instructions.
    351 struct InstructionsState {
    352   /// The very first instruction in the list with the main opcode.
    353   Value *OpValue = nullptr;
    354 
    355   /// The main/alternate instruction.
    356   Instruction *MainOp = nullptr;
    357   Instruction *AltOp = nullptr;
    358 
    359   /// The main/alternate opcodes for the list of instructions.
    360   unsigned getOpcode() const {
    361     return MainOp ? MainOp->getOpcode() : 0;
    362   }
    363 
    364   unsigned getAltOpcode() const {
    365     return AltOp ? AltOp->getOpcode() : 0;
    366   }
    367 
    368   /// Some of the instructions in the list have alternate opcodes.
    369   bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
    370 
    371   bool isOpcodeOrAlt(Instruction *I) const {
    372     unsigned CheckedOpcode = I->getOpcode();
    373     return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
    374   }
    375 
    376   InstructionsState() = delete;
    377   InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
    378       : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
    379 };
    380 
    381 } // end anonymous namespace
    382 
    383 /// Chooses the correct key for scheduling data. If \p Op has the same (or
    384 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
    385 /// OpValue.
    386 static Value *isOneOf(const InstructionsState &S, Value *Op) {
    387   auto *I = dyn_cast<Instruction>(Op);
    388   if (I && S.isOpcodeOrAlt(I))
    389     return Op;
    390   return S.OpValue;
    391 }
    392 
    393 /// \returns true if \p Opcode is allowed as part of of the main/alternate
    394 /// instruction for SLP vectorization.
    395 ///
    396 /// Example of unsupported opcode is SDIV that can potentially cause UB if the
    397 /// "shuffled out" lane would result in division by zero.
    398 static bool isValidForAlternation(unsigned Opcode) {
    399   if (Instruction::isIntDivRem(Opcode))
    400     return false;
    401 
    402   return true;
    403 }
    404 
    405 /// \returns analysis of the Instructions in \p VL described in
    406 /// InstructionsState, the Opcode that we suppose the whole list
    407 /// could be vectorized even if its structure is diverse.
    408 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
    409                                        unsigned BaseIndex = 0) {
    410   // Make sure these are all Instructions.
    411   if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    412     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
    413 
    414   bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
    415   bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
    416   unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
    417   unsigned AltOpcode = Opcode;
    418   unsigned AltIndex = BaseIndex;
    419 
    420   // Check for one alternate opcode from another BinaryOperator.
    421   // TODO - generalize to support all operators (types, calls etc.).
    422   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    423     unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
    424     if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
    425       if (InstOpcode == Opcode || InstOpcode == AltOpcode)
    426         continue;
    427       if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
    428           isValidForAlternation(Opcode)) {
    429         AltOpcode = InstOpcode;
    430         AltIndex = Cnt;
    431         continue;
    432       }
    433     } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
    434       Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
    435       Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
    436       if (Ty0 == Ty1) {
    437         if (InstOpcode == Opcode || InstOpcode == AltOpcode)
    438           continue;
    439         if (Opcode == AltOpcode) {
    440           assert(isValidForAlternation(Opcode) &&
    441                  isValidForAlternation(InstOpcode) &&
    442                  "Cast isn't safe for alternation, logic needs to be updated!");
    443           AltOpcode = InstOpcode;
    444           AltIndex = Cnt;
    445           continue;
    446         }
    447       }
    448     } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
    449       continue;
    450     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
    451   }
    452 
    453   return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
    454                            cast<Instruction>(VL[AltIndex]));
    455 }
    456 
    457 /// \returns true if all of the values in \p VL have the same type or false
    458 /// otherwise.
    459 static bool allSameType(ArrayRef<Value *> VL) {
    460   Type *Ty = VL[0]->getType();
    461   for (int i = 1, e = VL.size(); i < e; i++)
    462     if (VL[i]->getType() != Ty)
    463       return false;
    464 
    465   return true;
    466 }
    467 
    468 /// \returns True if Extract{Value,Element} instruction extracts element Idx.
    469 static Optional<unsigned> getExtractIndex(Instruction *E) {
    470   unsigned Opcode = E->getOpcode();
    471   assert((Opcode == Instruction::ExtractElement ||
    472           Opcode == Instruction::ExtractValue) &&
    473          "Expected extractelement or extractvalue instruction.");
    474   if (Opcode == Instruction::ExtractElement) {
    475     auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    476     if (!CI)
    477       return None;
    478     return CI->getZExtValue();
    479   }
    480   ExtractValueInst *EI = cast<ExtractValueInst>(E);
    481   if (EI->getNumIndices() != 1)
    482     return None;
    483   return *EI->idx_begin();
    484 }
    485 
    486 /// \returns True if in-tree use also needs extract. This refers to
    487 /// possible scalar operand in vectorized instruction.
    488 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
    489                                     TargetLibraryInfo *TLI) {
    490   unsigned Opcode = UserInst->getOpcode();
    491   switch (Opcode) {
    492   case Instruction::Load: {
    493     LoadInst *LI = cast<LoadInst>(UserInst);
    494     return (LI->getPointerOperand() == Scalar);
    495   }
    496   case Instruction::Store: {
    497     StoreInst *SI = cast<StoreInst>(UserInst);
    498     return (SI->getPointerOperand() == Scalar);
    499   }
    500   case Instruction::Call: {
    501     CallInst *CI = cast<CallInst>(UserInst);
    502     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    503     for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
    504       if (hasVectorInstrinsicScalarOpd(ID, i))
    505         return (CI->getArgOperand(i) == Scalar);
    506     }
    507     LLVM_FALLTHROUGH;
    508   }
    509   default:
    510     return false;
    511   }
    512 }
    513 
    514 /// \returns the AA location that is being access by the instruction.
    515 static MemoryLocation getLocation(Instruction *I, AAResults *AA) {
    516   if (StoreInst *SI = dyn_cast<StoreInst>(I))
    517     return MemoryLocation::get(SI);
    518   if (LoadInst *LI = dyn_cast<LoadInst>(I))
    519     return MemoryLocation::get(LI);
    520   return MemoryLocation();
    521 }
    522 
    523 /// \returns True if the instruction is not a volatile or atomic load/store.
    524 static bool isSimple(Instruction *I) {
    525   if (LoadInst *LI = dyn_cast<LoadInst>(I))
    526     return LI->isSimple();
    527   if (StoreInst *SI = dyn_cast<StoreInst>(I))
    528     return SI->isSimple();
    529   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    530     return !MI->isVolatile();
    531   return true;
    532 }
    533 
    534 namespace llvm {
    535 
    536 static void inversePermutation(ArrayRef<unsigned> Indices,
    537                                SmallVectorImpl<int> &Mask) {
    538   Mask.clear();
    539   const unsigned E = Indices.size();
    540   Mask.resize(E, E + 1);
    541   for (unsigned I = 0; I < E; ++I)
    542     Mask[Indices[I]] = I;
    543 }
    544 
    545 /// \returns inserting index of InsertElement or InsertValue instruction,
    546 /// using Offset as base offset for index.
    547 static Optional<int> getInsertIndex(Value *InsertInst, unsigned Offset) {
    548   int Index = Offset;
    549   if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    550     if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
    551       auto *VT = cast<FixedVectorType>(IE->getType());
    552       if (CI->getValue().uge(VT->getNumElements()))
    553         return UndefMaskElem;
    554       Index *= VT->getNumElements();
    555       Index += CI->getZExtValue();
    556       return Index;
    557     }
    558     if (isa<UndefValue>(IE->getOperand(2)))
    559       return UndefMaskElem;
    560     return None;
    561   }
    562 
    563   auto *IV = cast<InsertValueInst>(InsertInst);
    564   Type *CurrentType = IV->getType();
    565   for (unsigned I : IV->indices()) {
    566     if (auto *ST = dyn_cast<StructType>(CurrentType)) {
    567       Index *= ST->getNumElements();
    568       CurrentType = ST->getElementType(I);
    569     } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
    570       Index *= AT->getNumElements();
    571       CurrentType = AT->getElementType();
    572     } else {
    573       return None;
    574     }
    575     Index += I;
    576   }
    577   return Index;
    578 }
    579 
    580 namespace slpvectorizer {
    581 
    582 /// Bottom Up SLP Vectorizer.
    583 class BoUpSLP {
    584   struct TreeEntry;
    585   struct ScheduleData;
    586 
    587 public:
    588   using ValueList = SmallVector<Value *, 8>;
    589   using InstrList = SmallVector<Instruction *, 16>;
    590   using ValueSet = SmallPtrSet<Value *, 16>;
    591   using StoreList = SmallVector<StoreInst *, 8>;
    592   using ExtraValueToDebugLocsMap =
    593       MapVector<Value *, SmallVector<Instruction *, 2>>;
    594   using OrdersType = SmallVector<unsigned, 4>;
    595 
    596   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
    597           TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
    598           DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
    599           const DataLayout *DL, OptimizationRemarkEmitter *ORE)
    600       : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
    601         DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
    602     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    603     // Use the vector register size specified by the target unless overridden
    604     // by a command-line option.
    605     // TODO: It would be better to limit the vectorization factor based on
    606     //       data type rather than just register size. For example, x86 AVX has
    607     //       256-bit registers, but it does not support integer operations
    608     //       at that width (that requires AVX2).
    609     if (MaxVectorRegSizeOption.getNumOccurrences())
    610       MaxVecRegSize = MaxVectorRegSizeOption;
    611     else
    612       MaxVecRegSize =
    613           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
    614               .getFixedSize();
    615 
    616     if (MinVectorRegSizeOption.getNumOccurrences())
    617       MinVecRegSize = MinVectorRegSizeOption;
    618     else
    619       MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
    620   }
    621 
    622   /// Vectorize the tree that starts with the elements in \p VL.
    623   /// Returns the vectorized root.
    624   Value *vectorizeTree();
    625 
  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
  /// generated extractvalue instructions.
    629   Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
    630 
    631   /// \returns the cost incurred by unwanted spills and fills, caused by
    632   /// holding live values over call sites.
    633   InstructionCost getSpillCost() const;
    634 
    635   /// \returns the vectorization cost of the subtree that starts at \p VL.
    636   /// A negative number means that this is profitable.
    637   InstructionCost getTreeCost();
    638 
    639   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
    640   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
    641   void buildTree(ArrayRef<Value *> Roots,
    642                  ArrayRef<Value *> UserIgnoreLst = None);
    643 
    644   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
    645   /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
    646   /// into account (and updating it, if required) list of externally used
    647   /// values stored in \p ExternallyUsedValues.
    648   void buildTree(ArrayRef<Value *> Roots,
    649                  ExtraValueToDebugLocsMap &ExternallyUsedValues,
    650                  ArrayRef<Value *> UserIgnoreLst = None);
    651 
    652   /// Clear the internal data structures that are created by 'buildTree'.
    653   void deleteTree() {
    654     VectorizableTree.clear();
    655     ScalarToTreeEntry.clear();
    656     MustGather.clear();
    657     ExternalUses.clear();
    658     NumOpsWantToKeepOrder.clear();
    659     NumOpsWantToKeepOriginalOrder = 0;
    660     for (auto &Iter : BlocksSchedules) {
    661       BlockScheduling *BS = Iter.second.get();
    662       BS->clear();
    663     }
    664     MinBWs.clear();
    665     InstrElementSize.clear();
    666   }
    667 
    668   unsigned getTreeSize() const { return VectorizableTree.size(); }
    669 
    670   /// Perform LICM and CSE on the newly generated gather sequences.
    671   void optimizeGatherSequence();
    672 
    673   /// \returns The best order of instructions for vectorization.
    674   Optional<ArrayRef<unsigned>> bestOrder() const {
    675     assert(llvm::all_of(
    676                NumOpsWantToKeepOrder,
    677                [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) {
    678                  return D.getFirst().size() ==
    679                         VectorizableTree[0]->Scalars.size();
    680                }) &&
    681            "All orders must have the same size as number of instructions in "
    682            "tree node.");
    683     auto I = std::max_element(
    684         NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
    685         [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
    686            const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
    687           return D1.second < D2.second;
    688         });
    689     if (I == NumOpsWantToKeepOrder.end() ||
    690         I->getSecond() <= NumOpsWantToKeepOriginalOrder)
    691       return None;
    692 
    693     return makeArrayRef(I->getFirst());
    694   }
    695 
    696   /// Builds the correct order for root instructions.
    697   /// If some leaves have the same instructions to be vectorized, we may
    698   /// incorrectly evaluate the best order for the root node (it is built for the
    699   /// vector of instructions without repeated instructions and, thus, has less
    700   /// elements than the root node). This function builds the correct order for
    701   /// the root node.
    702   /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves
    703   /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first
    704   /// leaf, it will be shrink to \<a, b\>. If instructions in this leaf should
    705   /// be reordered, the best order will be \<1, 0\>. We need to extend this
    706   /// order for the root node. For the root node this order should look like
    707   /// \<3, 0, 1, 2\>. This function extends the order for the reused
    708   /// instructions.
    709   void findRootOrder(OrdersType &Order) {
    710     // If the leaf has the same number of instructions to vectorize as the root
    711     // - order must be set already.
    712     unsigned RootSize = VectorizableTree[0]->Scalars.size();
    713     if (Order.size() == RootSize)
    714       return;
    715     SmallVector<unsigned, 4> RealOrder(Order.size());
    716     std::swap(Order, RealOrder);
    717     SmallVector<int, 4> Mask;
    718     inversePermutation(RealOrder, Mask);
    719     Order.assign(Mask.begin(), Mask.end());
    720     // The leaf has less number of instructions - need to find the true order of
    721     // the root.
    722     // Scan the nodes starting from the leaf back to the root.
    723     const TreeEntry *PNode = VectorizableTree.back().get();
    724     SmallVector<const TreeEntry *, 4> Nodes(1, PNode);
    725     SmallPtrSet<const TreeEntry *, 4> Visited;
    726     while (!Nodes.empty() && Order.size() != RootSize) {
    727       const TreeEntry *PNode = Nodes.pop_back_val();
    728       if (!Visited.insert(PNode).second)
    729         continue;
    730       const TreeEntry &Node = *PNode;
    731       for (const EdgeInfo &EI : Node.UserTreeIndices)
    732         if (EI.UserTE)
    733           Nodes.push_back(EI.UserTE);
    734       if (Node.ReuseShuffleIndices.empty())
    735         continue;
    736       // Build the order for the parent node.
    737       OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize);
    738       SmallVector<unsigned, 4> OrderCounter(Order.size(), 0);
    739       // The algorithm of the order extension is:
    740       // 1. Calculate the number of the same instructions for the order.
    741       // 2. Calculate the index of the new order: total number of instructions
    742       // with order less than the order of the current instruction + reuse
    743       // number of the current instruction.
    744       // 3. The new order is just the index of the instruction in the original
    745       // vector of the instructions.
    746       for (unsigned I : Node.ReuseShuffleIndices)
    747         ++OrderCounter[Order[I]];
    748       SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0);
    749       for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) {
    750         unsigned ReusedIdx = Node.ReuseShuffleIndices[I];
    751         unsigned OrderIdx = Order[ReusedIdx];
    752         unsigned NewIdx = 0;
    753         for (unsigned J = 0; J < OrderIdx; ++J)
    754           NewIdx += OrderCounter[J];
    755         NewIdx += CurrentCounter[OrderIdx];
    756         ++CurrentCounter[OrderIdx];
    757         assert(NewOrder[NewIdx] == RootSize &&
    758                "The order index should not be written already.");
    759         NewOrder[NewIdx] = I;
    760       }
    761       std::swap(Order, NewOrder);
    762     }
    763     assert(Order.size() == RootSize &&
    764            "Root node is expected or the size of the order must be the same as "
    765            "the number of elements in the root node.");
    766     assert(llvm::all_of(Order,
    767                         [RootSize](unsigned Val) { return Val != RootSize; }) &&
    768            "All indices must be initialized");
    769   }
    770 
    771   /// \return The vector element size in bits to use when vectorizing the
    772   /// expression tree ending at \p V. If V is a store, the size is the width of
    773   /// the stored value. Otherwise, the size is the width of the largest loaded
    774   /// value reaching V. This method is used by the vectorizer to calculate
    775   /// vectorization factors.
    776   unsigned getVectorElementSize(Value *V);
    777 
    778   /// Compute the minimum type sizes required to represent the entries in a
    779   /// vectorizable tree.
    780   void computeMinimumValueSizes();
    781 
    782   // \returns maximum vector register size as set by TTI or overridden by cl::opt.
    783   unsigned getMaxVecRegSize() const {
    784     return MaxVecRegSize;
    785   }
    786 
    787   // \returns minimum vector register size as set by cl::opt.
    788   unsigned getMinVecRegSize() const {
    789     return MinVecRegSize;
    790   }
    791 
    792   unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    793     unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
    794       MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
    795     return MaxVF ? MaxVF : UINT_MAX;
    796   }
    797 
    798   /// Check if homogeneous aggregate is isomorphic to some VectorType.
    799   /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
    800   /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
    801   /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
    802   ///
    803   /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
    804   unsigned canMapToVector(Type *T, const DataLayout &DL) const;
    805 
    806   /// \returns True if the VectorizableTree is both tiny and not fully
    807   /// vectorizable. We do not vectorize such trees.
    808   bool isTreeTinyAndNotFullyVectorizable() const;
    809 
    810   /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
    811   /// can be load combined in the backend. Load combining may not be allowed in
    812   /// the IR optimizer, so we do not want to alter the pattern. For example,
    813   /// partially transforming a scalar bswap() pattern into vector code is
    814   /// effectively impossible for the backend to undo.
    815   /// TODO: If load combining is allowed in the IR optimizer, this analysis
    816   ///       may not be necessary.
    817   bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
    818 
    819   /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
    820   /// can be load combined in the backend. Load combining may not be allowed in
    821   /// the IR optimizer, so we do not want to alter the pattern. For example,
    822   /// partially transforming a scalar bswap() pattern into vector code is
    823   /// effectively impossible for the backend to undo.
    824   /// TODO: If load combining is allowed in the IR optimizer, this analysis
    825   ///       may not be necessary.
    826   bool isLoadCombineCandidate() const;
    827 
    828   OptimizationRemarkEmitter *getORE() { return ORE; }
    829 
    830   /// This structure holds any data we need about the edges being traversed
    831   /// during buildTree_rec(). We keep track of:
    832   /// (i) the user TreeEntry index, and
    833   /// (ii) the index of the edge.
    834   struct EdgeInfo {
    835     EdgeInfo() = default;
    836     EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
    837         : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    838     /// The user TreeEntry.
    839     TreeEntry *UserTE = nullptr;
    840     /// The operand index of the use.
    841     unsigned EdgeIdx = UINT_MAX;
    842 #ifndef NDEBUG
    843     friend inline raw_ostream &operator<<(raw_ostream &OS,
    844                                           const BoUpSLP::EdgeInfo &EI) {
    845       EI.dump(OS);
    846       return OS;
    847     }
    848     /// Debug print.
    849     void dump(raw_ostream &OS) const {
    850       OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
    851          << " EdgeIdx:" << EdgeIdx << "}";
    852     }
    853     LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
    854 #endif
    855   };
    856 
    857   /// A helper data structure to hold the operands of a vector of instructions.
    858   /// This supports a fixed vector length for all operand vectors.
    859   class VLOperands {
    860     /// For each operand we need (i) the value, and (ii) the opcode that it
    861     /// would be attached to if the expression was in a left-linearized form.
    862     /// This is required to avoid illegal operand reordering.
    863     /// For example:
    864     /// \verbatim
    865     ///                         0 Op1
    866     ///                         |/
    867     /// Op1 Op2   Linearized    + Op2
    868     ///   \ /     ---------->   |/
    869     ///    -                    -
    870     ///
    871     /// Op1 - Op2            (0 + Op1) - Op2
    872     /// \endverbatim
    873     ///
    874     /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
    875     ///
    876     /// Another way to think of this is to track all the operations across the
    877     /// path from the operand all the way to the root of the tree and to
    878     /// calculate the operation that corresponds to this path. For example, the
    879     /// path from Op2 to the root crosses the RHS of the '-', therefore the
    880     /// corresponding operation is a '-' (which matches the one in the
    881     /// linearized tree, as shown above).
    882     ///
    883     /// For lack of a better term, we refer to this operation as Accumulated
    884     /// Path Operation (APO).
    885     struct OperandData {
    886       OperandData() = default;
    887       OperandData(Value *V, bool APO, bool IsUsed)
    888           : V(V), APO(APO), IsUsed(IsUsed) {}
    889       /// The operand value.
    890       Value *V = nullptr;
    891       /// TreeEntries only allow a single opcode, or an alternate sequence of
    892       /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
    893       /// APO. It is set to 'true' if 'V' is attached to an inverse operation
    894       /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
    895       /// (e.g., Add/Mul)
    896       bool APO = false;
    897       /// Helper data for the reordering function.
    898       bool IsUsed = false;
    899     };
    900 
    901     /// During operand reordering, we are trying to select the operand at lane
    902     /// that matches best with the operand at the neighboring lane. Our
    903     /// selection is based on the type of value we are looking for. For example,
    904     /// if the neighboring lane has a load, we need to look for a load that is
    905     /// accessing a consecutive address. These strategies are summarized in the
    906     /// 'ReorderingMode' enumerator.
    907     enum class ReorderingMode {
    908       Load,     ///< Matching loads to consecutive memory addresses
    909       Opcode,   ///< Matching instructions based on opcode (same or alternate)
    910       Constant, ///< Matching constants
    911       Splat,    ///< Matching the same instruction multiple times (broadcast)
    912       Failed,   ///< We failed to create a vectorizable group
    913     };
    914 
    915     using OperandDataVec = SmallVector<OperandData, 2>;
    916 
    917     /// A vector of operand vectors.
    918     SmallVector<OperandDataVec, 4> OpsVec;
    919 
    920     const DataLayout &DL;
    921     ScalarEvolution &SE;
    922     const BoUpSLP &R;
    923 
    924     /// \returns the operand data at \p OpIdx and \p Lane.
    925     OperandData &getData(unsigned OpIdx, unsigned Lane) {
    926       return OpsVec[OpIdx][Lane];
    927     }
    928 
    929     /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    930     const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    931       return OpsVec[OpIdx][Lane];
    932     }
    933 
    934     /// Clears the used flag for all entries.
    935     void clearUsed() {
    936       for (unsigned OpIdx = 0, NumOperands = getNumOperands();
    937            OpIdx != NumOperands; ++OpIdx)
    938         for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
    939              ++Lane)
    940           OpsVec[OpIdx][Lane].IsUsed = false;
    941     }
    942 
    943     /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
    944     void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    945       std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    946     }
    947 
    948     // The hard-coded scores listed here are not very important. When computing
    949     // the scores of matching one sub-tree with another, we are basically
    950     // counting the number of values that are matching. So even if all scores
    951     // are set to 1, we would still get a decent matching result.
    952     // However, sometimes we have to break ties. For example we may have to
    953     // choose between matching loads vs matching opcodes. This is what these
    954     // scores are helping us with: they provide the order of preference.
    955 
    956     /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
    957     static const int ScoreConsecutiveLoads = 3;
    958     /// ExtractElementInst from same vector and consecutive indexes.
    959     static const int ScoreConsecutiveExtracts = 3;
    960     /// Constants.
    961     static const int ScoreConstants = 2;
    962     /// Instructions with the same opcode.
    963     static const int ScoreSameOpcode = 2;
    964     /// Instructions with alt opcodes (e.g, add + sub).
    965     static const int ScoreAltOpcodes = 1;
    966     /// Identical instructions (a.k.a. splat or broadcast).
    967     static const int ScoreSplat = 1;
    968     /// Matching with an undef is preferable to failing.
    969     static const int ScoreUndef = 1;
    970     /// Score for failing to find a decent match.
    971     static const int ScoreFail = 0;
    972     /// User exteranl to the vectorized code.
    973     static const int ExternalUseCost = 1;
    974     /// The user is internal but in a different lane.
    975     static const int UserInDiffLaneCost = ExternalUseCost;
    976 
    977     /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    978     static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
    979                                ScalarEvolution &SE) {
    980       auto *LI1 = dyn_cast<LoadInst>(V1);
    981       auto *LI2 = dyn_cast<LoadInst>(V2);
    982       if (LI1 && LI2) {
    983         if (LI1->getParent() != LI2->getParent())
    984           return VLOperands::ScoreFail;
    985 
    986         Optional<int> Dist =
    987             getPointersDiff(LI1->getPointerOperand(), LI2->getPointerOperand(),
    988                             DL, SE, /*StrictCheck=*/true);
    989         return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads
    990                                     : VLOperands::ScoreFail;
    991       }
    992 
    993       auto *C1 = dyn_cast<Constant>(V1);
    994       auto *C2 = dyn_cast<Constant>(V2);
    995       if (C1 && C2)
    996         return VLOperands::ScoreConstants;
    997 
    998       // Extracts from consecutive indexes of the same vector better score as
    999       // the extracts could be optimized away.
   1000       Value *EV;
   1001       ConstantInt *Ex1Idx, *Ex2Idx;
   1002       if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
   1003           match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
   1004           Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
   1005         return VLOperands::ScoreConsecutiveExtracts;
   1006 
   1007       auto *I1 = dyn_cast<Instruction>(V1);
   1008       auto *I2 = dyn_cast<Instruction>(V2);
   1009       if (I1 && I2) {
   1010         if (I1 == I2)
   1011           return VLOperands::ScoreSplat;
   1012         InstructionsState S = getSameOpcode({I1, I2});
   1013         // Note: Only consider instructions with <= 2 operands to avoid
   1014         // complexity explosion.
   1015         if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
   1016           return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
   1017                                   : VLOperands::ScoreSameOpcode;
   1018       }
   1019 
   1020       if (isa<UndefValue>(V2))
   1021         return VLOperands::ScoreUndef;
   1022 
   1023       return VLOperands::ScoreFail;
   1024     }
   1025 
   1026     /// Holds the values and their lane that are taking part in the look-ahead
   1027     /// score calculation. This is used in the external uses cost calculation.
   1028     SmallDenseMap<Value *, int> InLookAheadValues;
   1029 
   1030     /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are
   1031     /// either external to the vectorized code, or require shuffling.
   1032     int getExternalUsesCost(const std::pair<Value *, int> &LHS,
   1033                             const std::pair<Value *, int> &RHS) {
   1034       int Cost = 0;
   1035       std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
   1036       for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
   1037         Value *V = Values[Idx].first;
   1038         if (isa<Constant>(V)) {
   1039           // Since this is a function pass, it doesn't make semantic sense to
   1040           // walk the users of a subclass of Constant. The users could be in
   1041           // another function, or even another module that happens to be in
   1042           // the same LLVMContext.
   1043           continue;
   1044         }
   1045 
   1046         // Calculate the absolute lane, using the minimum relative lane of LHS
   1047         // and RHS as base and Idx as the offset.
   1048         int Ln = std::min(LHS.second, RHS.second) + Idx;
   1049         assert(Ln >= 0 && "Bad lane calculation");
   1050         unsigned UsersBudget = LookAheadUsersBudget;
   1051         for (User *U : V->users()) {
   1052           if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
   1053             // The user is in the VectorizableTree. Check if we need to insert.
   1054             auto It = llvm::find(UserTE->Scalars, U);
   1055             assert(It != UserTE->Scalars.end() && "U is in UserTE");
   1056             int UserLn = std::distance(UserTE->Scalars.begin(), It);
   1057             assert(UserLn >= 0 && "Bad lane");
   1058             if (UserLn != Ln)
   1059               Cost += UserInDiffLaneCost;
   1060           } else {
   1061             // Check if the user is in the look-ahead code.
   1062             auto It2 = InLookAheadValues.find(U);
   1063             if (It2 != InLookAheadValues.end()) {
   1064               // The user is in the look-ahead code. Check the lane.
   1065               if (It2->second != Ln)
   1066                 Cost += UserInDiffLaneCost;
   1067             } else {
   1068               // The user is neither in SLP tree nor in the look-ahead code.
   1069               Cost += ExternalUseCost;
   1070             }
   1071           }
   1072           // Limit the number of visited uses to cap compilation time.
   1073           if (--UsersBudget == 0)
   1074             break;
   1075         }
   1076       }
   1077       return Cost;
   1078     }
   1079 
   1080     /// Go through the operands of \p LHS and \p RHS recursively until \p
   1081     /// MaxLevel, and return the cummulative score. For example:
   1082     /// \verbatim
   1083     ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
   1084     ///     \ /         \ /         \ /        \ /
   1085     ///      +           +           +          +
   1086     ///     G1          G2          G3         G4
   1087     /// \endverbatim
   1088     /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
   1089     /// each level recursively, accumulating the score. It starts from matching
   1090     /// the additions at level 0, then moves on to the loads (level 1). The
   1091     /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
   1092     /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
   1093     /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
   1094     /// Please note that the order of the operands does not matter, as we
   1095     /// evaluate the score of all profitable combinations of operands. In
   1096     /// other words the score of G1 and G4 is the same as G1 and G2. This
   1097     /// heuristic is based on ideas described in:
   1098     ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
   1099     ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
   1100     ///   Lus F. W. Ges
   1101     int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
   1102                            const std::pair<Value *, int> &RHS, int CurrLevel,
   1103                            int MaxLevel) {
   1104 
   1105       Value *V1 = LHS.first;
   1106       Value *V2 = RHS.first;
   1107       // Get the shallow score of V1 and V2.
   1108       int ShallowScoreAtThisLevel =
   1109           std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
   1110                                        getExternalUsesCost(LHS, RHS));
   1111       int Lane1 = LHS.second;
   1112       int Lane2 = RHS.second;
   1113 
   1114       // If reached MaxLevel,
   1115       //  or if V1 and V2 are not instructions,
   1116       //  or if they are SPLAT,
   1117       //  or if they are not consecutive, early return the current cost.
   1118       auto *I1 = dyn_cast<Instruction>(V1);
   1119       auto *I2 = dyn_cast<Instruction>(V2);
   1120       if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
   1121           ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
   1122           (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
   1123         return ShallowScoreAtThisLevel;
   1124       assert(I1 && I2 && "Should have early exited.");
   1125 
   1126       // Keep track of in-tree values for determining the external-use cost.
   1127       InLookAheadValues[V1] = Lane1;
   1128       InLookAheadValues[V2] = Lane2;
   1129 
   1130       // Contains the I2 operand indexes that got matched with I1 operands.
   1131       SmallSet<unsigned, 4> Op2Used;
   1132 
   1133       // Recursion towards the operands of I1 and I2. We are trying all possbile
   1134       // operand pairs, and keeping track of the best score.
   1135       for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
   1136            OpIdx1 != NumOperands1; ++OpIdx1) {
   1137         // Try to pair op1I with the best operand of I2.
   1138         int MaxTmpScore = 0;
   1139         unsigned MaxOpIdx2 = 0;
   1140         bool FoundBest = false;
   1141         // If I2 is commutative try all combinations.
   1142         unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
   1143         unsigned ToIdx = isCommutative(I2)
   1144                              ? I2->getNumOperands()
   1145                              : std::min(I2->getNumOperands(), OpIdx1 + 1);
   1146         assert(FromIdx <= ToIdx && "Bad index");
   1147         for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
   1148           // Skip operands already paired with OpIdx1.
   1149           if (Op2Used.count(OpIdx2))
   1150             continue;
   1151           // Recursively calculate the cost at each level
   1152           int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
   1153                                             {I2->getOperand(OpIdx2), Lane2},
   1154                                             CurrLevel + 1, MaxLevel);
   1155           // Look for the best score.
   1156           if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
   1157             MaxTmpScore = TmpScore;
   1158             MaxOpIdx2 = OpIdx2;
   1159             FoundBest = true;
   1160           }
   1161         }
   1162         if (FoundBest) {
   1163           // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
   1164           Op2Used.insert(MaxOpIdx2);
   1165           ShallowScoreAtThisLevel += MaxTmpScore;
   1166         }
   1167       }
   1168       return ShallowScoreAtThisLevel;
   1169     }
   1170 
   1171     /// \Returns the look-ahead score, which tells us how much the sub-trees
   1172     /// rooted at \p LHS and \p RHS match, the more they match the higher the
   1173     /// score. This helps break ties in an informed way when we cannot decide on
   1174     /// the order of the operands by just considering the immediate
   1175     /// predecessors.
   1176     int getLookAheadScore(const std::pair<Value *, int> &LHS,
   1177                           const std::pair<Value *, int> &RHS) {
   1178       InLookAheadValues.clear();
   1179       return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
   1180     }
   1181 
   1182     // Search all operands in Ops[*][Lane] for the one that matches best
   1183     // Ops[OpIdx][LastLane] and return its opreand index.
   1184     // If no good match can be found, return None.
   1185     Optional<unsigned>
   1186     getBestOperand(unsigned OpIdx, int Lane, int LastLane,
   1187                    ArrayRef<ReorderingMode> ReorderingModes) {
   1188       unsigned NumOperands = getNumOperands();
   1189 
   1190       // The operand of the previous lane at OpIdx.
   1191       Value *OpLastLane = getData(OpIdx, LastLane).V;
   1192 
   1193       // Our strategy mode for OpIdx.
   1194       ReorderingMode RMode = ReorderingModes[OpIdx];
   1195 
   1196       // The linearized opcode of the operand at OpIdx, Lane.
   1197       bool OpIdxAPO = getData(OpIdx, Lane).APO;
   1198 
   1199       // The best operand index and its score.
   1200       // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
   1201       // are using the score to differentiate between the two.
   1202       struct BestOpData {
   1203         Optional<unsigned> Idx = None;
   1204         unsigned Score = 0;
   1205       } BestOp;
   1206 
   1207       // Iterate through all unused operands and look for the best.
   1208       for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
   1209         // Get the operand at Idx and Lane.
   1210         OperandData &OpData = getData(Idx, Lane);
   1211         Value *Op = OpData.V;
   1212         bool OpAPO = OpData.APO;
   1213 
   1214         // Skip already selected operands.
   1215         if (OpData.IsUsed)
   1216           continue;
   1217 
   1218         // Skip if we are trying to move the operand to a position with a
   1219         // different opcode in the linearized tree form. This would break the
   1220         // semantics.
   1221         if (OpAPO != OpIdxAPO)
   1222           continue;
   1223 
   1224         // Look for an operand that matches the current mode.
   1225         switch (RMode) {
   1226         case ReorderingMode::Load:
   1227         case ReorderingMode::Constant:
   1228         case ReorderingMode::Opcode: {
   1229           bool LeftToRight = Lane > LastLane;
   1230           Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
   1231           Value *OpRight = (LeftToRight) ? Op : OpLastLane;
   1232           unsigned Score =
   1233               getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
   1234           if (Score > BestOp.Score) {
   1235             BestOp.Idx = Idx;
   1236             BestOp.Score = Score;
   1237           }
   1238           break;
   1239         }
   1240         case ReorderingMode::Splat:
   1241           if (Op == OpLastLane)
   1242             BestOp.Idx = Idx;
   1243           break;
   1244         case ReorderingMode::Failed:
   1245           return None;
   1246         }
   1247       }
   1248 
   1249       if (BestOp.Idx) {
   1250         getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
   1251         return BestOp.Idx;
   1252       }
   1253       // If we could not find a good match return None.
   1254       return None;
   1255     }
   1256 
   1257     /// Helper for reorderOperandVecs. \Returns the lane that we should start
   1258     /// reordering from. This is the one which has the least number of operands
   1259     /// that can freely move about.
   1260     unsigned getBestLaneToStartReordering() const {
   1261       unsigned BestLane = 0;
   1262       unsigned Min = UINT_MAX;
   1263       for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
   1264            ++Lane) {
   1265         unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
   1266         if (NumFreeOps < Min) {
   1267           Min = NumFreeOps;
   1268           BestLane = Lane;
   1269         }
   1270       }
   1271       return BestLane;
   1272     }
   1273 
   1274     /// \Returns the maximum number of operands that are allowed to be reordered
   1275     /// for \p Lane. This is used as a heuristic for selecting the first lane to
   1276     /// start operand reordering.
   1277     unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
   1278       unsigned CntTrue = 0;
   1279       unsigned NumOperands = getNumOperands();
   1280       // Operands with the same APO can be reordered. We therefore need to count
   1281       // how many of them we have for each APO, like this: Cnt[APO] = x.
   1282       // Since we only have two APOs, namely true and false, we can avoid using
   1283       // a map. Instead we can simply count the number of operands that
   1284       // correspond to one of them (in this case the 'true' APO), and calculate
   1285       // the other by subtracting it from the total number of operands.
   1286       for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
   1287         if (getData(OpIdx, Lane).APO)
   1288           ++CntTrue;
   1289       unsigned CntFalse = NumOperands - CntTrue;
   1290       return std::max(CntTrue, CntFalse);
   1291     }
   1292 
   1293     /// Go through the instructions in VL and append their operands.
   1294     void appendOperandsOfVL(ArrayRef<Value *> VL) {
   1295       assert(!VL.empty() && "Bad VL");
   1296       assert((empty() || VL.size() == getNumLanes()) &&
   1297              "Expected same number of lanes");
   1298       assert(isa<Instruction>(VL[0]) && "Expected instruction");
   1299       unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
   1300       OpsVec.resize(NumOperands);
   1301       unsigned NumLanes = VL.size();
   1302       for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
   1303         OpsVec[OpIdx].resize(NumLanes);
   1304         for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
   1305           assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
   1306           // Our tree has just 3 nodes: the root and two operands.
   1307           // It is therefore trivial to get the APO. We only need to check the
   1308           // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
   1309           // RHS operand. The LHS operand of both add and sub is never attached
   1310           // to an inversese operation in the linearized form, therefore its APO
   1311           // is false. The RHS is true only if VL[Lane] is an inverse operation.
   1312 
   1313           // Since operand reordering is performed on groups of commutative
   1314           // operations or alternating sequences (e.g., +, -), we can safely
   1315           // tell the inverse operations by checking commutativity.
   1316           bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
   1317           bool APO = (OpIdx == 0) ? false : IsInverseOperation;
   1318           OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
   1319                                  APO, false};
   1320         }
   1321       }
   1322     }
   1323 
   1324     /// \returns the number of operands.
   1325     unsigned getNumOperands() const { return OpsVec.size(); }
   1326 
   1327     /// \returns the number of lanes.
   1328     unsigned getNumLanes() const { return OpsVec[0].size(); }
   1329 
   1330     /// \returns the operand value at \p OpIdx and \p Lane.
   1331     Value *getValue(unsigned OpIdx, unsigned Lane) const {
   1332       return getData(OpIdx, Lane).V;
   1333     }
   1334 
   1335     /// \returns true if the data structure is empty.
   1336     bool empty() const { return OpsVec.empty(); }
   1337 
   1338     /// Clears the data.
   1339     void clear() { OpsVec.clear(); }
   1340 
   1341     /// \Returns true if there are enough operands identical to \p Op to fill
   1342     /// the whole vector.
   1343     /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
   1344     bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
   1345       bool OpAPO = getData(OpIdx, Lane).APO;
   1346       for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
   1347         if (Ln == Lane)
   1348           continue;
   1349         // This is set to true if we found a candidate for broadcast at Lane.
   1350         bool FoundCandidate = false;
   1351         for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
   1352           OperandData &Data = getData(OpI, Ln);
   1353           if (Data.APO != OpAPO || Data.IsUsed)
   1354             continue;
   1355           if (Data.V == Op) {
   1356             FoundCandidate = true;
   1357             Data.IsUsed = true;
   1358             break;
   1359           }
   1360         }
   1361         if (!FoundCandidate)
   1362           return false;
   1363       }
   1364       return true;
   1365     }
   1366 
   1367   public:
   1368     /// Initialize with all the operands of the instruction vector \p RootVL.
   1369     VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
   1370                ScalarEvolution &SE, const BoUpSLP &R)
   1371         : DL(DL), SE(SE), R(R) {
   1372       // Append all the operands of RootVL.
   1373       appendOperandsOfVL(RootVL);
   1374     }
   1375 
   1376     /// \Returns a value vector with the operands across all lanes for the
   1377     /// opearnd at \p OpIdx.
   1378     ValueList getVL(unsigned OpIdx) const {
   1379       ValueList OpVL(OpsVec[OpIdx].size());
   1380       assert(OpsVec[OpIdx].size() == getNumLanes() &&
   1381              "Expected same num of lanes across all operands");
   1382       for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
   1383         OpVL[Lane] = OpsVec[OpIdx][Lane].V;
   1384       return OpVL;
   1385     }
   1386 
   1387     // Performs operand reordering for 2 or more operands.
   1388     // The original operands are in OrigOps[OpIdx][Lane].
   1389     // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
   1390     void reorder() {
   1391       unsigned NumOperands = getNumOperands();
   1392       unsigned NumLanes = getNumLanes();
   1393       // Each operand has its own mode. We are using this mode to help us select
   1394       // the instructions for each lane, so that they match best with the ones
   1395       // we have selected so far.
   1396       SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
   1397 
   1398       // This is a greedy single-pass algorithm. We are going over each lane
   1399       // once and deciding on the best order right away with no back-tracking.
   1400       // However, in order to increase its effectiveness, we start with the lane
   1401       // that has operands that can move the least. For example, given the
   1402       // following lanes:
   1403       //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
   1404       //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
   1405       //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
   1406       //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
   1407       // we will start at Lane 1, since the operands of the subtraction cannot
   1408       // be reordered. Then we will visit the rest of the lanes in a circular
   1409       // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
   1410 
   1411       // Find the first lane that we will start our search from.
   1412       unsigned FirstLane = getBestLaneToStartReordering();
   1413 
   1414       // Initialize the modes.
   1415       for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
   1416         Value *OpLane0 = getValue(OpIdx, FirstLane);
   1417         // Keep track if we have instructions with all the same opcode on one
   1418         // side.
   1419         if (isa<LoadInst>(OpLane0))
   1420           ReorderingModes[OpIdx] = ReorderingMode::Load;
   1421         else if (isa<Instruction>(OpLane0)) {
   1422           // Check if OpLane0 should be broadcast.
   1423           if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
   1424             ReorderingModes[OpIdx] = ReorderingMode::Splat;
   1425           else
   1426             ReorderingModes[OpIdx] = ReorderingMode::Opcode;
   1427         }
   1428         else if (isa<Constant>(OpLane0))
   1429           ReorderingModes[OpIdx] = ReorderingMode::Constant;
   1430         else if (isa<Argument>(OpLane0))
   1431           // Our best hope is a Splat. It may save some cost in some cases.
   1432           ReorderingModes[OpIdx] = ReorderingMode::Splat;
   1433         else
   1434           // NOTE: This should be unreachable.
   1435           ReorderingModes[OpIdx] = ReorderingMode::Failed;
   1436       }
   1437 
   1438       // If the initial strategy fails for any of the operand indexes, then we
   1439       // perform reordering again in a second pass. This helps avoid assigning
   1440       // high priority to the failed strategy, and should improve reordering for
   1441       // the non-failed operand indexes.
   1442       for (int Pass = 0; Pass != 2; ++Pass) {
   1443         // Skip the second pass if the first pass did not fail.
   1444         bool StrategyFailed = false;
   1445         // Mark all operand data as free to use.
   1446         clearUsed();
   1447         // We keep the original operand order for the FirstLane, so reorder the
   1448         // rest of the lanes. We are visiting the nodes in a circular fashion,
   1449         // using FirstLane as the center point and increasing the radius
   1450         // distance.
   1451         for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
   1452           // Visit the lane on the right and then the lane on the left.
   1453           for (int Direction : {+1, -1}) {
   1454             int Lane = FirstLane + Direction * Distance;
   1455             if (Lane < 0 || Lane >= (int)NumLanes)
   1456               continue;
   1457             int LastLane = Lane - Direction;
   1458             assert(LastLane >= 0 && LastLane < (int)NumLanes &&
   1459                    "Out of bounds");
   1460             // Look for a good match for each operand.
   1461             for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
   1462               // Search for the operand that matches SortedOps[OpIdx][Lane-1].
   1463               Optional<unsigned> BestIdx =
   1464                   getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
   1465               // By not selecting a value, we allow the operands that follow to
   1466               // select a better matching value. We will get a non-null value in
   1467               // the next run of getBestOperand().
   1468               if (BestIdx) {
   1469                 // Swap the current operand with the one returned by
   1470                 // getBestOperand().
   1471                 swap(OpIdx, BestIdx.getValue(), Lane);
   1472               } else {
   1473                 // We failed to find a best operand, set mode to 'Failed'.
   1474                 ReorderingModes[OpIdx] = ReorderingMode::Failed;
   1475                 // Enable the second pass.
   1476                 StrategyFailed = true;
   1477               }
   1478             }
   1479           }
   1480         }
   1481         // Skip second pass if the strategy did not fail.
   1482         if (!StrategyFailed)
   1483           break;
   1484       }
   1485     }
   1486 
   1487 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   1488     LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
   1489       switch (RMode) {
   1490       case ReorderingMode::Load:
   1491         return "Load";
   1492       case ReorderingMode::Opcode:
   1493         return "Opcode";
   1494       case ReorderingMode::Constant:
   1495         return "Constant";
   1496       case ReorderingMode::Splat:
   1497         return "Splat";
   1498       case ReorderingMode::Failed:
   1499         return "Failed";
   1500       }
   1501       llvm_unreachable("Unimplemented Reordering Type");
   1502     }
   1503 
   1504     LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
   1505                                                    raw_ostream &OS) {
   1506       return OS << getModeStr(RMode);
   1507     }
   1508 
   1509     /// Debug print.
   1510     LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
   1511       printMode(RMode, dbgs());
   1512     }
   1513 
   1514     friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
   1515       return printMode(RMode, OS);
   1516     }
   1517 
   1518     LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
   1519       const unsigned Indent = 2;
   1520       unsigned Cnt = 0;
   1521       for (const OperandDataVec &OpDataVec : OpsVec) {
   1522         OS << "Operand " << Cnt++ << "\n";
   1523         for (const OperandData &OpData : OpDataVec) {
   1524           OS.indent(Indent) << "{";
   1525           if (Value *V = OpData.V)
   1526             OS << *V;
   1527           else
   1528             OS << "null";
   1529           OS << ", APO:" << OpData.APO << "}\n";
   1530         }
   1531         OS << "\n";
   1532       }
   1533       return OS;
   1534     }
   1535 
   1536     /// Debug print.
   1537     LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
   1538 #endif
   1539   };
   1540 
   1541   /// Checks if the instruction is marked for deletion.
   1542   bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
   1543 
   1544   /// Marks the values' operands for later deletion by replacing them with Undefs.
   1545   void eraseInstructions(ArrayRef<Value *> AV);
   1546 
   1547   ~BoUpSLP();
   1548 
   1549 private:
   1550   /// Checks if all users of \p I are the part of the vectorization tree.
   1551   bool areAllUsersVectorized(Instruction *I) const;
   1552 
   1553   /// \returns the cost of the vectorizable entry.
   1554   InstructionCost getEntryCost(const TreeEntry *E);
   1555 
   1556   /// This is the recursive part of buildTree.
   1557   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
   1558                      const EdgeInfo &EI);
   1559 
   1560   /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
   1561   /// be vectorized to use the original vector (or aggregate "bitcast" to a
   1562   /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
   1563   /// returns false, setting \p CurrentOrder to either an empty vector or a
   1564   /// non-identity permutation that allows to reuse extract instructions.
   1565   bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
   1566                        SmallVectorImpl<unsigned> &CurrentOrder) const;
   1567 
   1568   /// Vectorize a single entry in the tree.
   1569   Value *vectorizeTree(TreeEntry *E);
   1570 
   1571   /// Vectorize a single entry in the tree, starting in \p VL.
   1572   Value *vectorizeTree(ArrayRef<Value *> VL);
   1573 
   1574   /// \returns the scalarization cost for this type. Scalarization in this
   1575   /// context means the creation of vectors from a group of scalars.
   1576   InstructionCost
   1577   getGatherCost(FixedVectorType *Ty,
   1578                 const DenseSet<unsigned> &ShuffledIndices) const;
   1579 
   1580   /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
   1581   /// tree entries.
   1582   /// \returns ShuffleKind, if gathered values can be represented as shuffles of
   1583   /// previous tree entries. \p Mask is filled with the shuffle mask.
   1584   Optional<TargetTransformInfo::ShuffleKind>
   1585   isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
   1586                         SmallVectorImpl<const TreeEntry *> &Entries);
   1587 
   1588   /// \returns the scalarization cost for this list of values. Assuming that
   1589   /// this subtree gets vectorized, we may need to extract the values from the
   1590   /// roots. This method calculates the cost of extracting the values.
   1591   InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
   1592 
   1593   /// Set the Builder insert point to one after the last instruction in
   1594   /// the bundle
   1595   void setInsertPointAfterBundle(const TreeEntry *E);
   1596 
   1597   /// \returns a vector from a collection of scalars in \p VL.
   1598   Value *gather(ArrayRef<Value *> VL);
   1599 
   1600   /// \returns whether the VectorizableTree is fully vectorizable and will
   1601   /// be beneficial even if the tree height is tiny.
   1602   bool isFullyVectorizableTinyTree() const;
   1603 
   1604   /// Reorder commutative or alt operands to get better probability of
   1605   /// generating vectorized code.
   1606   static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
   1607                                              SmallVectorImpl<Value *> &Left,
   1608                                              SmallVectorImpl<Value *> &Right,
   1609                                              const DataLayout &DL,
   1610                                              ScalarEvolution &SE,
   1611                                              const BoUpSLP &R);
   1612   struct TreeEntry {
   1613     using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
   1614     TreeEntry(VecTreeTy &Container) : Container(Container) {}
   1615 
   1616     /// \returns true if the scalars in VL are equal to this entry.
   1617     bool isSame(ArrayRef<Value *> VL) const {
   1618       if (VL.size() == Scalars.size())
   1619         return std::equal(VL.begin(), VL.end(), Scalars.begin());
   1620       return VL.size() == ReuseShuffleIndices.size() &&
   1621              std::equal(
   1622                  VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
   1623                  [this](Value *V, int Idx) { return V == Scalars[Idx]; });
   1624     }
   1625 
   1626     /// A vector of scalars.
   1627     ValueList Scalars;
   1628 
   1629     /// The Scalars are vectorized into this value. It is initialized to Null.
   1630     Value *VectorizedValue = nullptr;
   1631 
   1632     /// Do we need to gather this sequence or vectorize it
   1633     /// (either with vector instruction or with scatter/gather
   1634     /// intrinsics for store/load)?
   1635     enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
   1636     EntryState State;
   1637 
   1638     /// Does this sequence require some shuffling?
   1639     SmallVector<int, 4> ReuseShuffleIndices;
   1640 
   1641     /// Does this entry require reordering?
   1642     SmallVector<unsigned, 4> ReorderIndices;
   1643 
   1644     /// Points back to the VectorizableTree.
   1645     ///
   1646     /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
   1647     /// to be a pointer and needs to be able to initialize the child iterator.
   1648     /// Thus we need a reference back to the container to translate the indices
   1649     /// to entries.
   1650     VecTreeTy &Container;
   1651 
   1652     /// The TreeEntry index containing the user of this entry.  We can actually
   1653     /// have multiple users so the data structure is not truly a tree.
   1654     SmallVector<EdgeInfo, 1> UserTreeIndices;
   1655 
   1656     /// The index of this treeEntry in VectorizableTree.
   1657     int Idx = -1;
   1658 
   1659   private:
   1660     /// The operands of each instruction in each lane Operands[op_index][lane].
   1661     /// Note: This helps avoid the replication of the code that performs the
   1662     /// reordering of operands during buildTree_rec() and vectorizeTree().
   1663     SmallVector<ValueList, 2> Operands;
   1664 
   1665     /// The main/alternate instruction.
   1666     Instruction *MainOp = nullptr;
   1667     Instruction *AltOp = nullptr;
   1668 
   1669   public:
   1670     /// Set this bundle's \p OpIdx'th operand to \p OpVL.
   1671     void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
   1672       if (Operands.size() < OpIdx + 1)
   1673         Operands.resize(OpIdx + 1);
   1674       assert(Operands[OpIdx].empty() && "Already resized?");
   1675       Operands[OpIdx].resize(Scalars.size());
   1676       for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
   1677         Operands[OpIdx][Lane] = OpVL[Lane];
   1678     }
   1679 
   1680     /// Set the operands of this bundle in their original order.
   1681     void setOperandsInOrder() {
   1682       assert(Operands.empty() && "Already initialized?");
   1683       auto *I0 = cast<Instruction>(Scalars[0]);
   1684       Operands.resize(I0->getNumOperands());
   1685       unsigned NumLanes = Scalars.size();
   1686       for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
   1687            OpIdx != NumOperands; ++OpIdx) {
   1688         Operands[OpIdx].resize(NumLanes);
   1689         for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
   1690           auto *I = cast<Instruction>(Scalars[Lane]);
   1691           assert(I->getNumOperands() == NumOperands &&
   1692                  "Expected same number of operands");
   1693           Operands[OpIdx][Lane] = I->getOperand(OpIdx);
   1694         }
   1695       }
   1696     }
   1697 
   1698     /// \returns the \p OpIdx operand of this TreeEntry.
   1699     ValueList &getOperand(unsigned OpIdx) {
   1700       assert(OpIdx < Operands.size() && "Off bounds");
   1701       return Operands[OpIdx];
   1702     }
   1703 
   1704     /// \returns the number of operands.
   1705     unsigned getNumOperands() const { return Operands.size(); }
   1706 
   1707     /// \return the single \p OpIdx operand.
   1708     Value *getSingleOperand(unsigned OpIdx) const {
   1709       assert(OpIdx < Operands.size() && "Off bounds");
   1710       assert(!Operands[OpIdx].empty() && "No operand available");
   1711       return Operands[OpIdx][0];
   1712     }
   1713 
   1714     /// Some of the instructions in the list have alternate opcodes.
   1715     bool isAltShuffle() const {
   1716       return getOpcode() != getAltOpcode();
   1717     }
   1718 
   1719     bool isOpcodeOrAlt(Instruction *I) const {
   1720       unsigned CheckedOpcode = I->getOpcode();
   1721       return (getOpcode() == CheckedOpcode ||
   1722               getAltOpcode() == CheckedOpcode);
   1723     }
   1724 
   1725     /// Chooses the correct key for scheduling data. If \p Op has the same (or
   1726     /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
   1727     /// \p OpValue.
   1728     Value *isOneOf(Value *Op) const {
   1729       auto *I = dyn_cast<Instruction>(Op);
   1730       if (I && isOpcodeOrAlt(I))
   1731         return Op;
   1732       return MainOp;
   1733     }
   1734 
   1735     void setOperations(const InstructionsState &S) {
   1736       MainOp = S.MainOp;
   1737       AltOp = S.AltOp;
   1738     }
   1739 
   1740     Instruction *getMainOp() const {
   1741       return MainOp;
   1742     }
   1743 
   1744     Instruction *getAltOp() const {
   1745       return AltOp;
   1746     }
   1747 
   1748     /// The main/alternate opcodes for the list of instructions.
   1749     unsigned getOpcode() const {
   1750       return MainOp ? MainOp->getOpcode() : 0;
   1751     }
   1752 
   1753     unsigned getAltOpcode() const {
   1754       return AltOp ? AltOp->getOpcode() : 0;
   1755     }
   1756 
   1757     /// Update operations state of this entry if reorder occurred.
   1758     bool updateStateIfReorder() {
   1759       if (ReorderIndices.empty())
   1760         return false;
   1761       InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
   1762       setOperations(S);
   1763       return true;
   1764     }
   1765 
   1766 #ifndef NDEBUG
   1767     /// Debug printer.
   1768     LLVM_DUMP_METHOD void dump() const {
   1769       dbgs() << Idx << ".\n";
   1770       for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
   1771         dbgs() << "Operand " << OpI << ":\n";
   1772         for (const Value *V : Operands[OpI])
   1773           dbgs().indent(2) << *V << "\n";
   1774       }
   1775       dbgs() << "Scalars: \n";
   1776       for (Value *V : Scalars)
   1777         dbgs().indent(2) << *V << "\n";
   1778       dbgs() << "State: ";
   1779       switch (State) {
   1780       case Vectorize:
   1781         dbgs() << "Vectorize\n";
   1782         break;
   1783       case ScatterVectorize:
   1784         dbgs() << "ScatterVectorize\n";
   1785         break;
   1786       case NeedToGather:
   1787         dbgs() << "NeedToGather\n";
   1788         break;
   1789       }
   1790       dbgs() << "MainOp: ";
   1791       if (MainOp)
   1792         dbgs() << *MainOp << "\n";
   1793       else
   1794         dbgs() << "NULL\n";
   1795       dbgs() << "AltOp: ";
   1796       if (AltOp)
   1797         dbgs() << *AltOp << "\n";
   1798       else
   1799         dbgs() << "NULL\n";
   1800       dbgs() << "VectorizedValue: ";
   1801       if (VectorizedValue)
   1802         dbgs() << *VectorizedValue << "\n";
   1803       else
   1804         dbgs() << "NULL\n";
   1805       dbgs() << "ReuseShuffleIndices: ";
   1806       if (ReuseShuffleIndices.empty())
   1807         dbgs() << "Empty";
   1808       else
   1809         for (unsigned ReuseIdx : ReuseShuffleIndices)
   1810           dbgs() << ReuseIdx << ", ";
   1811       dbgs() << "\n";
   1812       dbgs() << "ReorderIndices: ";
   1813       for (unsigned ReorderIdx : ReorderIndices)
   1814         dbgs() << ReorderIdx << ", ";
   1815       dbgs() << "\n";
   1816       dbgs() << "UserTreeIndices: ";
   1817       for (const auto &EInfo : UserTreeIndices)
   1818         dbgs() << EInfo << ", ";
   1819       dbgs() << "\n";
   1820     }
   1821 #endif
   1822   };
   1823 
   1824 #ifndef NDEBUG
   1825   void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
   1826                      InstructionCost VecCost,
   1827                      InstructionCost ScalarCost) const {
   1828     dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump();
   1829     dbgs() << "SLP: Costs:\n";
   1830     dbgs() << "SLP:     ReuseShuffleCost = " << ReuseShuffleCost << "\n";
   1831     dbgs() << "SLP:     VectorCost = " << VecCost << "\n";
   1832     dbgs() << "SLP:     ScalarCost = " << ScalarCost << "\n";
   1833     dbgs() << "SLP:     ReuseShuffleCost + VecCost - ScalarCost = " <<
   1834                ReuseShuffleCost + VecCost - ScalarCost << "\n";
   1835   }
   1836 #endif
   1837 
   1838   /// Create a new VectorizableTree entry.
   1839   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
   1840                           const InstructionsState &S,
   1841                           const EdgeInfo &UserTreeIdx,
   1842                           ArrayRef<unsigned> ReuseShuffleIndices = None,
   1843                           ArrayRef<unsigned> ReorderIndices = None) {
   1844     TreeEntry::EntryState EntryState =
   1845         Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
   1846     return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
   1847                         ReuseShuffleIndices, ReorderIndices);
   1848   }
   1849 
   1850   TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
   1851                           TreeEntry::EntryState EntryState,
   1852                           Optional<ScheduleData *> Bundle,
   1853                           const InstructionsState &S,
   1854                           const EdgeInfo &UserTreeIdx,
   1855                           ArrayRef<unsigned> ReuseShuffleIndices = None,
   1856                           ArrayRef<unsigned> ReorderIndices = None) {
   1857     assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
   1858             (Bundle && EntryState != TreeEntry::NeedToGather)) &&
   1859            "Need to vectorize gather entry?");
   1860     VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
   1861     TreeEntry *Last = VectorizableTree.back().get();
   1862     Last->Idx = VectorizableTree.size() - 1;
   1863     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
   1864     Last->State = EntryState;
   1865     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
   1866                                      ReuseShuffleIndices.end());
   1867     Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
   1868     Last->setOperations(S);
   1869     if (Last->State != TreeEntry::NeedToGather) {
   1870       for (Value *V : VL) {
   1871         assert(!getTreeEntry(V) && "Scalar already in tree!");
   1872         ScalarToTreeEntry[V] = Last;
   1873       }
   1874       // Update the scheduler bundle to point to this TreeEntry.
   1875       unsigned Lane = 0;
   1876       for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
   1877            BundleMember = BundleMember->NextInBundle) {
   1878         BundleMember->TE = Last;
   1879         BundleMember->Lane = Lane;
   1880         ++Lane;
   1881       }
   1882       assert((!Bundle.getValue() || Lane == VL.size()) &&
   1883              "Bundle and VL out of sync");
   1884     } else {
   1885       MustGather.insert(VL.begin(), VL.end());
   1886     }
   1887 
   1888     if (UserTreeIdx.UserTE)
   1889       Last->UserTreeIndices.push_back(UserTreeIdx);
   1890 
   1891     return Last;
   1892   }
   1893 
   1894   /// -- Vectorization State --
   1895   /// Holds all of the tree entries.
   1896   TreeEntry::VecTreeTy VectorizableTree;
   1897 
   1898 #ifndef NDEBUG
   1899   /// Debug printer.
   1900   LLVM_DUMP_METHOD void dumpVectorizableTree() const {
   1901     for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
   1902       VectorizableTree[Id]->dump();
   1903       dbgs() << "\n";
   1904     }
   1905   }
   1906 #endif
   1907 
   1908   TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
   1909 
   1910   const TreeEntry *getTreeEntry(Value *V) const {
   1911     return ScalarToTreeEntry.lookup(V);
   1912   }
   1913 
   1914   /// Maps a specific scalar to its tree entry.
   1915   SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
   1916 
   1917   /// Maps a value to the proposed vectorizable size.
   1918   SmallDenseMap<Value *, unsigned> InstrElementSize;
   1919 
   1920   /// A list of scalars that we found that we need to keep as scalars.
   1921   ValueSet MustGather;
   1922 
   1923   /// This POD struct describes one external user in the vectorized tree.
   1924   struct ExternalUser {
   1925     ExternalUser(Value *S, llvm::User *U, int L)
   1926         : Scalar(S), User(U), Lane(L) {}
   1927 
   1928     // Which scalar in our function.
   1929     Value *Scalar;
   1930 
   1931     // Which user that uses the scalar.
   1932     llvm::User *User;
   1933 
   1934     // Which lane does the scalar belong to.
   1935     int Lane;
   1936   };
   1937   using UserList = SmallVector<ExternalUser, 16>;
   1938 
   1939   /// Checks if two instructions may access the same memory.
   1940   ///
   1941   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
   1942   /// is invariant in the calling loop.
   1943   bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
   1944                  Instruction *Inst2) {
   1945     // First check if the result is already in the cache.
   1946     AliasCacheKey key = std::make_pair(Inst1, Inst2);
   1947     Optional<bool> &result = AliasCache[key];
   1948     if (result.hasValue()) {
   1949       return result.getValue();
   1950     }
   1951     MemoryLocation Loc2 = getLocation(Inst2, AA);
   1952     bool aliased = true;
   1953     if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
   1954       // Do the alias check.
   1955       aliased = !AA->isNoAlias(Loc1, Loc2);
   1956     }
   1957     // Store the result in the cache.
   1958     result = aliased;
   1959     return aliased;
   1960   }
   1961 
   1962   using AliasCacheKey = std::pair<Instruction *, Instruction *>;
   1963 
   1964   /// Cache for alias results.
   1965   /// TODO: consider moving this to the AliasAnalysis itself.
   1966   DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
   1967 
   1968   /// Removes an instruction from its block and eventually deletes it.
   1969   /// It's like Instruction::eraseFromParent() except that the actual deletion
   1970   /// is delayed until BoUpSLP is destructed.
   1971   /// This is required to ensure that there are no incorrect collisions in the
   1972   /// AliasCache, which can happen if a new instruction is allocated at the
   1973   /// same address as a previously deleted instruction.
   1974   void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
   1975     auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
   1976     It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
   1977   }
   1978 
   1979   /// Temporary store for deleted instructions. Instructions will be deleted
   1980   /// eventually when the BoUpSLP is destructed.
   1981   DenseMap<Instruction *, bool> DeletedInstructions;
   1982 
   1983   /// A list of values that need to be extracted out of the tree.
   1984   /// This list holds pairs of (Internal Scalar : External User). External User
   1985   /// can be nullptr, it means that this Internal Scalar will be used later,
   1986   /// after vectorization.
   1987   UserList ExternalUses;
   1988 
   1989   /// Values used only by @llvm.assume calls.
   1990   SmallPtrSet<const Value *, 32> EphValues;
   1991 
   1992   /// Holds all of the instructions that we gathered.
   1993   SetVector<Instruction *> GatherSeq;
   1994 
   1995   /// A list of blocks that we are going to CSE.
   1996   SetVector<BasicBlock *> CSEBlocks;
   1997 
   1998   /// Contains all scheduling relevant data for an instruction.
   1999   /// A ScheduleData either represents a single instruction or a member of an
   2000   /// instruction bundle (= a group of instructions which is combined into a
   2001   /// vector instruction).
   2002   struct ScheduleData {
   2003     // The initial value for the dependency counters. It means that the
   2004     // dependencies are not calculated yet.
   2005     enum { InvalidDeps = -1 };
   2006 
   2007     ScheduleData() = default;
   2008 
   2009     void init(int BlockSchedulingRegionID, Value *OpVal) {
   2010       FirstInBundle = this;
   2011       NextInBundle = nullptr;
   2012       NextLoadStore = nullptr;
   2013       IsScheduled = false;
   2014       SchedulingRegionID = BlockSchedulingRegionID;
   2015       UnscheduledDepsInBundle = UnscheduledDeps;
   2016       clearDependencies();
   2017       OpValue = OpVal;
   2018       TE = nullptr;
   2019       Lane = -1;
   2020     }
   2021 
   2022     /// Returns true if the dependency information has been calculated.
   2023     bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
   2024 
   2025     /// Returns true for single instructions and for bundle representatives
   2026     /// (= the head of a bundle).
   2027     bool isSchedulingEntity() const { return FirstInBundle == this; }
   2028 
   2029     /// Returns true if it represents an instruction bundle and not only a
   2030     /// single instruction.
   2031     bool isPartOfBundle() const {
   2032       return NextInBundle != nullptr || FirstInBundle != this;
   2033     }
   2034 
   2035     /// Returns true if it is ready for scheduling, i.e. it has no more
   2036     /// unscheduled depending instructions/bundles.
   2037     bool isReady() const {
   2038       assert(isSchedulingEntity() &&
   2039              "can't consider non-scheduling entity for ready list");
   2040       return UnscheduledDepsInBundle == 0 && !IsScheduled;
   2041     }
   2042 
   2043     /// Modifies the number of unscheduled dependencies, also updating it for
   2044     /// the whole bundle.
   2045     int incrementUnscheduledDeps(int Incr) {
   2046       UnscheduledDeps += Incr;
   2047       return FirstInBundle->UnscheduledDepsInBundle += Incr;
   2048     }
   2049 
   2050     /// Sets the number of unscheduled dependencies to the number of
   2051     /// dependencies.
   2052     void resetUnscheduledDeps() {
   2053       incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
   2054     }
   2055 
   2056     /// Clears all dependency information.
   2057     void clearDependencies() {
   2058       Dependencies = InvalidDeps;
   2059       resetUnscheduledDeps();
   2060       MemoryDependencies.clear();
   2061     }
   2062 
   2063     void dump(raw_ostream &os) const {
   2064       if (!isSchedulingEntity()) {
   2065         os << "/ " << *Inst;
   2066       } else if (NextInBundle) {
   2067         os << '[' << *Inst;
   2068         ScheduleData *SD = NextInBundle;
   2069         while (SD) {
   2070           os << ';' << *SD->Inst;
   2071           SD = SD->NextInBundle;
   2072         }
   2073         os << ']';
   2074       } else {
   2075         os << *Inst;
   2076       }
   2077     }
   2078 
   2079     Instruction *Inst = nullptr;
   2080 
   2081     /// Points to the head in an instruction bundle (and always to this for
   2082     /// single instructions).
   2083     ScheduleData *FirstInBundle = nullptr;
   2084 
   2085     /// Single linked list of all instructions in a bundle. Null if it is a
   2086     /// single instruction.
   2087     ScheduleData *NextInBundle = nullptr;
   2088 
   2089     /// Single linked list of all memory instructions (e.g. load, store, call)
   2090     /// in the block - until the end of the scheduling region.
   2091     ScheduleData *NextLoadStore = nullptr;
   2092 
   2093     /// The dependent memory instructions.
   2094     /// This list is derived on demand in calculateDependencies().
   2095     SmallVector<ScheduleData *, 4> MemoryDependencies;
   2096 
   2097     /// This ScheduleData is in the current scheduling region if this matches
   2098     /// the current SchedulingRegionID of BlockScheduling.
   2099     int SchedulingRegionID = 0;
   2100 
   2101     /// Used for getting a "good" final ordering of instructions.
   2102     int SchedulingPriority = 0;
   2103 
   2104     /// The number of dependencies. Constitutes of the number of users of the
   2105     /// instruction plus the number of dependent memory instructions (if any).
   2106     /// This value is calculated on demand.
   2107     /// If InvalidDeps, the number of dependencies is not calculated yet.
   2108     int Dependencies = InvalidDeps;
   2109 
   2110     /// The number of dependencies minus the number of dependencies of scheduled
   2111     /// instructions. As soon as this is zero, the instruction/bundle gets ready
   2112     /// for scheduling.
   2113     /// Note that this is negative as long as Dependencies is not calculated.
   2114     int UnscheduledDeps = InvalidDeps;
   2115 
   2116     /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
   2117     /// single instructions.
   2118     int UnscheduledDepsInBundle = InvalidDeps;
   2119 
   2120     /// True if this instruction is scheduled (or considered as scheduled in the
   2121     /// dry-run).
   2122     bool IsScheduled = false;
   2123 
   2124     /// Opcode of the current instruction in the schedule data.
   2125     Value *OpValue = nullptr;
   2126 
   2127     /// The TreeEntry that this instruction corresponds to.
   2128     TreeEntry *TE = nullptr;
   2129 
   2130     /// The lane of this node in the TreeEntry.
   2131     int Lane = -1;
   2132   };
   2133 
   2134 #ifndef NDEBUG
   2135   friend inline raw_ostream &operator<<(raw_ostream &os,
   2136                                         const BoUpSLP::ScheduleData &SD) {
   2137     SD.dump(os);
   2138     return os;
   2139   }
   2140 #endif
   2141 
   2142   friend struct GraphTraits<BoUpSLP *>;
   2143   friend struct DOTGraphTraits<BoUpSLP *>;
   2144 
   2145   /// Contains all scheduling data for a basic block.
   2146   struct BlockScheduling {
   2147     BlockScheduling(BasicBlock *BB)
   2148         : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
   2149 
   2150     void clear() {
   2151       ReadyInsts.clear();
   2152       ScheduleStart = nullptr;
   2153       ScheduleEnd = nullptr;
   2154       FirstLoadStoreInRegion = nullptr;
   2155       LastLoadStoreInRegion = nullptr;
   2156 
   2157       // Reduce the maximum schedule region size by the size of the
   2158       // previous scheduling run.
   2159       ScheduleRegionSizeLimit -= ScheduleRegionSize;
   2160       if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
   2161         ScheduleRegionSizeLimit = MinScheduleRegionSize;
   2162       ScheduleRegionSize = 0;
   2163 
   2164       // Make a new scheduling region, i.e. all existing ScheduleData is not
   2165       // in the new region yet.
   2166       ++SchedulingRegionID;
   2167     }
   2168 
   2169     ScheduleData *getScheduleData(Value *V) {
   2170       ScheduleData *SD = ScheduleDataMap[V];
   2171       if (SD && SD->SchedulingRegionID == SchedulingRegionID)
   2172         return SD;
   2173       return nullptr;
   2174     }
   2175 
   2176     ScheduleData *getScheduleData(Value *V, Value *Key) {
   2177       if (V == Key)
   2178         return getScheduleData(V);
   2179       auto I = ExtraScheduleDataMap.find(V);
   2180       if (I != ExtraScheduleDataMap.end()) {
   2181         ScheduleData *SD = I->second[Key];
   2182         if (SD && SD->SchedulingRegionID == SchedulingRegionID)
   2183           return SD;
   2184       }
   2185       return nullptr;
   2186     }
   2187 
   2188     bool isInSchedulingRegion(ScheduleData *SD) const {
   2189       return SD->SchedulingRegionID == SchedulingRegionID;
   2190     }
   2191 
   2192     /// Marks an instruction as scheduled and puts all dependent ready
   2193     /// instructions into the ready-list.
   2194     template <typename ReadyListType>
   2195     void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
   2196       SD->IsScheduled = true;
   2197       LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
   2198 
   2199       ScheduleData *BundleMember = SD;
   2200       while (BundleMember) {
   2201         if (BundleMember->Inst != BundleMember->OpValue) {
   2202           BundleMember = BundleMember->NextInBundle;
   2203           continue;
   2204         }
   2205         // Handle the def-use chain dependencies.
   2206 
   2207         // Decrement the unscheduled counter and insert to ready list if ready.
   2208         auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
   2209           doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
   2210             if (OpDef && OpDef->hasValidDependencies() &&
   2211                 OpDef->incrementUnscheduledDeps(-1) == 0) {
   2212               // There are no more unscheduled dependencies after
   2213               // decrementing, so we can put the dependent instruction
   2214               // into the ready list.
   2215               ScheduleData *DepBundle = OpDef->FirstInBundle;
   2216               assert(!DepBundle->IsScheduled &&
   2217                      "already scheduled bundle gets ready");
   2218               ReadyList.insert(DepBundle);
   2219               LLVM_DEBUG(dbgs()
   2220                          << "SLP:    gets ready (def): " << *DepBundle << "\n");
   2221             }
   2222           });
   2223         };
   2224 
   2225         // If BundleMember is a vector bundle, its operands may have been
   2226         // reordered duiring buildTree(). We therefore need to get its operands
   2227         // through the TreeEntry.
   2228         if (TreeEntry *TE = BundleMember->TE) {
   2229           int Lane = BundleMember->Lane;
   2230           assert(Lane >= 0 && "Lane not set");
   2231 
   2232           // Since vectorization tree is being built recursively this assertion
   2233           // ensures that the tree entry has all operands set before reaching
   2234           // this code. Couple of exceptions known at the moment are extracts
   2235           // where their second (immediate) operand is not added. Since
   2236           // immediates do not affect scheduler behavior this is considered
   2237           // okay.
   2238           auto *In = TE->getMainOp();
   2239           assert(In &&
   2240                  (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
   2241                   isa<InsertElementInst>(In) ||
   2242                   In->getNumOperands() == TE->getNumOperands()) &&
   2243                  "Missed TreeEntry operands?");
   2244           (void)In; // fake use to avoid build failure when assertions disabled
   2245 
   2246           for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
   2247                OpIdx != NumOperands; ++OpIdx)
   2248             if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
   2249               DecrUnsched(I);
   2250         } else {
   2251           // If BundleMember is a stand-alone instruction, no operand reordering
   2252           // has taken place, so we directly access its operands.
   2253           for (Use &U : BundleMember->Inst->operands())
   2254             if (auto *I = dyn_cast<Instruction>(U.get()))
   2255               DecrUnsched(I);
   2256         }
   2257         // Handle the memory dependencies.
   2258         for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
   2259           if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
   2260             // There are no more unscheduled dependencies after decrementing,
   2261             // so we can put the dependent instruction into the ready list.
   2262             ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
   2263             assert(!DepBundle->IsScheduled &&
   2264                    "already scheduled bundle gets ready");
   2265             ReadyList.insert(DepBundle);
   2266             LLVM_DEBUG(dbgs()
   2267                        << "SLP:    gets ready (mem): " << *DepBundle << "\n");
   2268           }
   2269         }
   2270         BundleMember = BundleMember->NextInBundle;
   2271       }
   2272     }
   2273 
   2274     void doForAllOpcodes(Value *V,
   2275                          function_ref<void(ScheduleData *SD)> Action) {
   2276       if (ScheduleData *SD = getScheduleData(V))
   2277         Action(SD);
   2278       auto I = ExtraScheduleDataMap.find(V);
   2279       if (I != ExtraScheduleDataMap.end())
   2280         for (auto &P : I->second)
   2281           if (P.second->SchedulingRegionID == SchedulingRegionID)
   2282             Action(P.second);
   2283     }
   2284 
   2285     /// Put all instructions into the ReadyList which are ready for scheduling.
   2286     template <typename ReadyListType>
   2287     void initialFillReadyList(ReadyListType &ReadyList) {
   2288       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
   2289         doForAllOpcodes(I, [&](ScheduleData *SD) {
   2290           if (SD->isSchedulingEntity() && SD->isReady()) {
   2291             ReadyList.insert(SD);
   2292             LLVM_DEBUG(dbgs()
   2293                        << "SLP:    initially in ready list: " << *I << "\n");
   2294           }
   2295         });
   2296       }
   2297     }
   2298 
   2299     /// Checks if a bundle of instructions can be scheduled, i.e. has no
   2300     /// cyclic dependencies. This is only a dry-run, no instructions are
   2301     /// actually moved at this stage.
   2302     /// \returns the scheduling bundle. The returned Optional value is non-None
   2303     /// if \p VL is allowed to be scheduled.
   2304     Optional<ScheduleData *>
   2305     tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
   2306                       const InstructionsState &S);
   2307 
   2308     /// Un-bundles a group of instructions.
   2309     void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
   2310 
   2311     /// Allocates schedule data chunk.
   2312     ScheduleData *allocateScheduleDataChunks();
   2313 
   2314     /// Extends the scheduling region so that V is inside the region.
   2315     /// \returns true if the region size is within the limit.
   2316     bool extendSchedulingRegion(Value *V, const InstructionsState &S);
   2317 
   2318     /// Initialize the ScheduleData structures for new instructions in the
   2319     /// scheduling region.
   2320     void initScheduleData(Instruction *FromI, Instruction *ToI,
   2321                           ScheduleData *PrevLoadStore,
   2322                           ScheduleData *NextLoadStore);
   2323 
   2324     /// Updates the dependency information of a bundle and of all instructions/
   2325     /// bundles which depend on the original bundle.
   2326     void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
   2327                                BoUpSLP *SLP);
   2328 
   2329     /// Sets all instruction in the scheduling region to un-scheduled.
   2330     void resetSchedule();
   2331 
   2332     BasicBlock *BB;
   2333 
   2334     /// Simple memory allocation for ScheduleData.
   2335     std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
   2336 
   2337     /// The size of a ScheduleData array in ScheduleDataChunks.
   2338     int ChunkSize;
   2339 
   2340     /// The allocator position in the current chunk, which is the last entry
   2341     /// of ScheduleDataChunks.
   2342     int ChunkPos;
   2343 
   2344     /// Attaches ScheduleData to Instruction.
   2345     /// Note that the mapping survives during all vectorization iterations, i.e.
   2346     /// ScheduleData structures are recycled.
   2347     DenseMap<Value *, ScheduleData *> ScheduleDataMap;
   2348 
   2349     /// Attaches ScheduleData to Instruction with the leading key.
   2350     DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
   2351         ExtraScheduleDataMap;
   2352 
   2353     struct ReadyList : SmallVector<ScheduleData *, 8> {
   2354       void insert(ScheduleData *SD) { push_back(SD); }
   2355     };
   2356 
   2357     /// The ready-list for scheduling (only used for the dry-run).
   2358     ReadyList ReadyInsts;
   2359 
   2360     /// The first instruction of the scheduling region.
   2361     Instruction *ScheduleStart = nullptr;
   2362 
   2363     /// The first instruction _after_ the scheduling region.
   2364     Instruction *ScheduleEnd = nullptr;
   2365 
   2366     /// The first memory accessing instruction in the scheduling region
   2367     /// (can be null).
   2368     ScheduleData *FirstLoadStoreInRegion = nullptr;
   2369 
   2370     /// The last memory accessing instruction in the scheduling region
   2371     /// (can be null).
   2372     ScheduleData *LastLoadStoreInRegion = nullptr;
   2373 
   2374     /// The current size of the scheduling region.
   2375     int ScheduleRegionSize = 0;
   2376 
   2377     /// The maximum size allowed for the scheduling region.
   2378     int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
   2379 
   2380     /// The ID of the scheduling region. For a new vectorization iteration this
   2381     /// is incremented which "removes" all ScheduleData from the region.
   2382     // Make sure that the initial SchedulingRegionID is greater than the
   2383     // initial SchedulingRegionID in ScheduleData (which is 0).
   2384     int SchedulingRegionID = 1;
   2385   };
   2386 
   2387   /// Attaches the BlockScheduling structures to basic blocks.
   2388   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
   2389 
   2390   /// Performs the "real" scheduling. Done before vectorization is actually
   2391   /// performed in a basic block.
   2392   void scheduleBlock(BlockScheduling *BS);
   2393 
   2394   /// List of users to ignore during scheduling and that don't need extracting.
   2395   ArrayRef<Value *> UserIgnoreList;
   2396 
   2397   /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
   2398   /// sorted SmallVectors of unsigned.
   2399   struct OrdersTypeDenseMapInfo {
   2400     static OrdersType getEmptyKey() {
   2401       OrdersType V;
   2402       V.push_back(~1U);
   2403       return V;
   2404     }
   2405 
   2406     static OrdersType getTombstoneKey() {
   2407       OrdersType V;
   2408       V.push_back(~2U);
   2409       return V;
   2410     }
   2411 
   2412     static unsigned getHashValue(const OrdersType &V) {
   2413       return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
   2414     }
   2415 
   2416     static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
   2417       return LHS == RHS;
   2418     }
   2419   };
   2420 
   2421   /// Contains orders of operations along with the number of bundles that have
   2422   /// operations in this order. It stores only those orders that require
   2423   /// reordering, if reordering is not required it is counted using \a
   2424   /// NumOpsWantToKeepOriginalOrder.
   2425   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
   2426   /// Number of bundles that do not require reordering.
   2427   unsigned NumOpsWantToKeepOriginalOrder = 0;
   2428 
   2429   // Analysis and block reference.
   2430   Function *F;
   2431   ScalarEvolution *SE;
   2432   TargetTransformInfo *TTI;
   2433   TargetLibraryInfo *TLI;
   2434   AAResults *AA;
   2435   LoopInfo *LI;
   2436   DominatorTree *DT;
   2437   AssumptionCache *AC;
   2438   DemandedBits *DB;
   2439   const DataLayout *DL;
   2440   OptimizationRemarkEmitter *ORE;
   2441 
   2442   unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
   2443   unsigned MinVecRegSize; // Set by cl::opt (default: 128).
   2444 
   2445   /// Instruction builder to construct the vectorized tree.
   2446   IRBuilder<> Builder;
   2447 
   2448   /// A map of scalar integer values to the smallest bit width with which they
   2449   /// can legally be represented. The values map to (width, signed) pairs,
   2450   /// where "width" indicates the minimum bit width and "signed" is True if the
   2451   /// value must be signed-extended, rather than zero-extended, back to its
   2452   /// original width.
   2453   MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
   2454 };
   2455 
   2456 } // end namespace slpvectorizer
   2457 
   2458 template <> struct GraphTraits<BoUpSLP *> {
   2459   using TreeEntry = BoUpSLP::TreeEntry;
   2460 
   2461   /// NodeRef has to be a pointer per the GraphWriter.
   2462   using NodeRef = TreeEntry *;
   2463 
   2464   using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
   2465 
   2466   /// Add the VectorizableTree to the index iterator to be able to return
   2467   /// TreeEntry pointers.
   2468   struct ChildIteratorType
   2469       : public iterator_adaptor_base<
   2470             ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
   2471     ContainerTy &VectorizableTree;
   2472 
   2473     ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
   2474                       ContainerTy &VT)
   2475         : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
   2476 
   2477     NodeRef operator*() { return I->UserTE; }
   2478   };
   2479 
   2480   static NodeRef getEntryNode(BoUpSLP &R) {
   2481     return R.VectorizableTree[0].get();
   2482   }
   2483 
   2484   static ChildIteratorType child_begin(NodeRef N) {
   2485     return {N->UserTreeIndices.begin(), N->Container};
   2486   }
   2487 
   2488   static ChildIteratorType child_end(NodeRef N) {
   2489     return {N->UserTreeIndices.end(), N->Container};
   2490   }
   2491 
   2492   /// For the node iterator we just need to turn the TreeEntry iterator into a
   2493   /// TreeEntry* iterator so that it dereferences to NodeRef.
   2494   class nodes_iterator {
   2495     using ItTy = ContainerTy::iterator;
   2496     ItTy It;
   2497 
   2498   public:
   2499     nodes_iterator(const ItTy &It2) : It(It2) {}
   2500     NodeRef operator*() { return It->get(); }
   2501     nodes_iterator operator++() {
   2502       ++It;
   2503       return *this;
   2504     }
   2505     bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
   2506   };
   2507 
   2508   static nodes_iterator nodes_begin(BoUpSLP *R) {
   2509     return nodes_iterator(R->VectorizableTree.begin());
   2510   }
   2511 
   2512   static nodes_iterator nodes_end(BoUpSLP *R) {
   2513     return nodes_iterator(R->VectorizableTree.end());
   2514   }
   2515 
   2516   static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
   2517 };
   2518 
   2519 template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
   2520   using TreeEntry = BoUpSLP::TreeEntry;
   2521 
   2522   DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
   2523 
   2524   std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
   2525     std::string Str;
   2526     raw_string_ostream OS(Str);
   2527     if (isSplat(Entry->Scalars)) {
   2528       OS << "<splat> " << *Entry->Scalars[0];
   2529       return Str;
   2530     }
   2531     for (auto V : Entry->Scalars) {
   2532       OS << *V;
   2533       if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
   2534             return EU.Scalar == V;
   2535           }))
   2536         OS << " <extract>";
   2537       OS << "\n";
   2538     }
   2539     return Str;
   2540   }
   2541 
   2542   static std::string getNodeAttributes(const TreeEntry *Entry,
   2543                                        const BoUpSLP *) {
   2544     if (Entry->State == TreeEntry::NeedToGather)
   2545       return "color=red";
   2546     return "";
   2547   }
   2548 };
   2549 
   2550 } // end namespace llvm
   2551 
   2552 BoUpSLP::~BoUpSLP() {
   2553   for (const auto &Pair : DeletedInstructions) {
   2554     // Replace operands of ignored instructions with Undefs in case if they were
   2555     // marked for deletion.
   2556     if (Pair.getSecond()) {
   2557       Value *Undef = UndefValue::get(Pair.getFirst()->getType());
   2558       Pair.getFirst()->replaceAllUsesWith(Undef);
   2559     }
   2560     Pair.getFirst()->dropAllReferences();
   2561   }
   2562   for (const auto &Pair : DeletedInstructions) {
   2563     assert(Pair.getFirst()->use_empty() &&
   2564            "trying to erase instruction with users.");
   2565     Pair.getFirst()->eraseFromParent();
   2566   }
   2567 #ifdef EXPENSIVE_CHECKS
   2568   // If we could guarantee that this call is not extremely slow, we could
   2569   // remove the ifdef limitation (see PR47712).
   2570   assert(!verifyFunction(*F, &dbgs()));
   2571 #endif
   2572 }
   2573 
   2574 void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
   2575   for (auto *V : AV) {
   2576     if (auto *I = dyn_cast<Instruction>(V))
   2577       eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
   2578   };
   2579 }
   2580 
   2581 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
   2582                         ArrayRef<Value *> UserIgnoreLst) {
   2583   ExtraValueToDebugLocsMap ExternallyUsedValues;
   2584   buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
   2585 }
   2586 
   2587 static int findLaneForValue(ArrayRef<Value *> Scalars,
   2588                             ArrayRef<int> ReuseShuffleIndices, Value *V) {
   2589   unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
   2590   assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
   2591   if (!ReuseShuffleIndices.empty()) {
   2592     FoundLane = std::distance(ReuseShuffleIndices.begin(),
   2593                               find(ReuseShuffleIndices, FoundLane));
   2594   }
   2595   return FoundLane;
   2596 }
   2597 
   2598 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
   2599                         ExtraValueToDebugLocsMap &ExternallyUsedValues,
   2600                         ArrayRef<Value *> UserIgnoreLst) {
   2601   deleteTree();
   2602   UserIgnoreList = UserIgnoreLst;
   2603   if (!allSameType(Roots))
   2604     return;
   2605   buildTree_rec(Roots, 0, EdgeInfo());
   2606 
   2607   // Collect the values that we need to extract from the tree.
   2608   for (auto &TEPtr : VectorizableTree) {
   2609     TreeEntry *Entry = TEPtr.get();
   2610 
   2611     // No need to handle users of gathered values.
   2612     if (Entry->State == TreeEntry::NeedToGather)
   2613       continue;
   2614 
   2615     // For each lane:
   2616     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
   2617       Value *Scalar = Entry->Scalars[Lane];
   2618       int FoundLane =
   2619           findLaneForValue(Entry->Scalars, Entry->ReuseShuffleIndices, Scalar);
   2620 
   2621       // Check if the scalar is externally used as an extra arg.
   2622       auto ExtI = ExternallyUsedValues.find(Scalar);
   2623       if (ExtI != ExternallyUsedValues.end()) {
   2624         LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
   2625                           << Lane << " from " << *Scalar << ".\n");
   2626         ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
   2627       }
   2628       for (User *U : Scalar->users()) {
   2629         LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
   2630 
   2631         Instruction *UserInst = dyn_cast<Instruction>(U);
   2632         if (!UserInst)
   2633           continue;
   2634 
   2635         // Skip in-tree scalars that become vectors
   2636         if (TreeEntry *UseEntry = getTreeEntry(U)) {
   2637           Value *UseScalar = UseEntry->Scalars[0];
   2638           // Some in-tree scalars will remain as scalar in vectorized
   2639           // instructions. If that is the case, the one in Lane 0 will
   2640           // be used.
   2641           if (UseScalar != U ||
   2642               UseEntry->State == TreeEntry::ScatterVectorize ||
   2643               !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
   2644             LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
   2645                               << ".\n");
   2646             assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
   2647             continue;
   2648           }
   2649         }
   2650 
   2651         // Ignore users in the user ignore list.
   2652         if (is_contained(UserIgnoreList, UserInst))
   2653           continue;
   2654 
   2655         LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
   2656                           << Lane << " from " << *Scalar << ".\n");
   2657         ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
   2658       }
   2659     }
   2660   }
   2661 }
   2662 
   2663 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   2664                             const EdgeInfo &UserTreeIdx) {
   2665   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
   2666 
   2667   InstructionsState S = getSameOpcode(VL);
   2668   if (Depth == RecursionMaxDepth) {
   2669     LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
   2670     newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
   2671     return;
   2672   }
   2673 
   2674   // Don't handle vectors.
   2675   if (S.OpValue->getType()->isVectorTy() &&
   2676       !isa<InsertElementInst>(S.OpValue)) {
   2677     LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
   2678     newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
   2679     return;
   2680   }
   2681 
   2682   if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
   2683     if (SI->getValueOperand()->getType()->isVectorTy()) {
   2684       LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
   2685       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
   2686       return;
   2687     }
   2688 
   2689   // If all of the operands are identical or constant we have a simple solution.
   2690   if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
   2691     LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
   2692     newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
   2693     return;
   2694   }
   2695 
   2696   // We now know that this is a vector of instructions of the same type from
   2697   // the same block.
   2698 
   2699   // Don't vectorize ephemeral values.
   2700   for (Value *V : VL) {
   2701     if (EphValues.count(V)) {
   2702       LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
   2703                         << ") is ephemeral.\n");
   2704       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
   2705       return;
   2706     }
   2707   }
   2708 
   2709   // Check if this is a duplicate of another entry.
   2710   if (TreeEntry *E = getTreeEntry(S.OpValue)) {
   2711     LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
   2712     if (!E->isSame(VL)) {
   2713       LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
   2714       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
   2715       return;
   2716     }
   2717     // Record the reuse of the tree node.  FIXME, currently this is only used to
   2718     // properly draw the graph rather than for the actual vectorization.
   2719     E->UserTreeIndices.push_back(UserTreeIdx);
   2720     LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
   2721                       << ".\n");
   2722     return;
   2723   }
   2724 
   2725   // Check that none of the instructions in the bundle are already in the tree.
   2726   for (Value *V : VL) {
   2727     auto *I = dyn_cast<Instruction>(V);
   2728     if (!I)
   2729       continue;
   2730     if (getTreeEntry(I)) {
   2731       LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
   2732                         << ") is already in tree.\n");
   2733       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
   2734       return;
   2735     }
   2736   }
   2737 
   2738   // If any of the scalars is marked as a value that needs to stay scalar, then
   2739   // we need to gather the scalars.
   2740   // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
   2741   for (Value *V : VL) {
   2742     if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
   2743       LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
   2744       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
   2745       return;
   2746     }
   2747   }
   2748 
   2749   // Check that all of the users of the scalars that we want to vectorize are
   2750   // schedulable.
   2751   auto *VL0 = cast<Instruction>(S.OpValue);
   2752   BasicBlock *BB = VL0->getParent();
   2753 
   2754   if (!DT->isReachableFromEntry(BB)) {
   2755     // Don't go into unreachable blocks. They may contain instructions with
   2756     // dependency cycles which confuse the final scheduling.
   2757     LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
   2758     newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
   2759     return;
   2760   }
   2761 
   2762   // Check that every instruction appears once in this bundle.
   2763   SmallVector<unsigned, 4> ReuseShuffleIndicies;
   2764   SmallVector<Value *, 4> UniqueValues;
   2765   DenseMap<Value *, unsigned> UniquePositions;
   2766   for (Value *V : VL) {
   2767     auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
   2768     ReuseShuffleIndicies.emplace_back(Res.first->second);
   2769     if (Res.second)
   2770       UniqueValues.emplace_back(V);
   2771   }
   2772   size_t NumUniqueScalarValues = UniqueValues.size();
   2773   if (NumUniqueScalarValues == VL.size()) {
   2774     ReuseShuffleIndicies.clear();
   2775   } else {
   2776     LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
   2777     if (NumUniqueScalarValues <= 1 ||
   2778         !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
   2779       LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
   2780       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
   2781       return;
   2782     }
   2783     VL = UniqueValues;
   2784   }
   2785 
   2786   auto &BSRef = BlocksSchedules[BB];
   2787   if (!BSRef)
   2788     BSRef = std::make_unique<BlockScheduling>(BB);
   2789 
   2790   BlockScheduling &BS = *BSRef.get();
   2791 
   2792   Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
   2793   if (!Bundle) {
   2794     LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
   2795     assert((!BS.getScheduleData(VL0) ||
   2796             !BS.getScheduleData(VL0)->isPartOfBundle()) &&
   2797            "tryScheduleBundle should cancelScheduling on failure");
   2798     newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   2799                  ReuseShuffleIndicies);
   2800     return;
   2801   }
   2802   LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
   2803 
   2804   unsigned ShuffleOrOp = S.isAltShuffle() ?
   2805                 (unsigned) Instruction::ShuffleVector : S.getOpcode();
   2806   switch (ShuffleOrOp) {
   2807     case Instruction::PHI: {
   2808       auto *PH = cast<PHINode>(VL0);
   2809 
   2810       // Check for terminator values (e.g. invoke).
   2811       for (Value *V : VL)
   2812         for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
   2813           Instruction *Term = dyn_cast<Instruction>(
   2814               cast<PHINode>(V)->getIncomingValueForBlock(
   2815                   PH->getIncomingBlock(I)));
   2816           if (Term && Term->isTerminator()) {
   2817             LLVM_DEBUG(dbgs()
   2818                        << "SLP: Need to swizzle PHINodes (terminator use).\n");
   2819             BS.cancelScheduling(VL, VL0);
   2820             newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   2821                          ReuseShuffleIndicies);
   2822             return;
   2823           }
   2824         }
   2825 
   2826       TreeEntry *TE =
   2827           newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
   2828       LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
   2829 
   2830       // Keeps the reordered operands to avoid code duplication.
   2831       SmallVector<ValueList, 2> OperandsVec;
   2832       for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
   2833         ValueList Operands;
   2834         // Prepare the operand vector.
   2835         for (Value *V : VL)
   2836           Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
   2837               PH->getIncomingBlock(I)));
   2838         TE->setOperand(I, Operands);
   2839         OperandsVec.push_back(Operands);
   2840       }
   2841       for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
   2842         buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
   2843       return;
   2844     }
   2845     case Instruction::ExtractValue:
   2846     case Instruction::ExtractElement: {
   2847       OrdersType CurrentOrder;
   2848       bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
   2849       if (Reuse) {
   2850         LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
   2851         ++NumOpsWantToKeepOriginalOrder;
   2852         newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
   2853                      ReuseShuffleIndicies);
   2854         // This is a special case, as it does not gather, but at the same time
   2855         // we are not extending buildTree_rec() towards the operands.
   2856         ValueList Op0;
   2857         Op0.assign(VL.size(), VL0->getOperand(0));
   2858         VectorizableTree.back()->setOperand(0, Op0);
   2859         return;
   2860       }
   2861       if (!CurrentOrder.empty()) {
   2862         LLVM_DEBUG({
   2863           dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
   2864                     "with order";
   2865           for (unsigned Idx : CurrentOrder)
   2866             dbgs() << " " << Idx;
   2867           dbgs() << "\n";
   2868         });
   2869         // Insert new order with initial value 0, if it does not exist,
   2870         // otherwise return the iterator to the existing one.
   2871         newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
   2872                      ReuseShuffleIndicies, CurrentOrder);
   2873         findRootOrder(CurrentOrder);
   2874         ++NumOpsWantToKeepOrder[CurrentOrder];
   2875         // This is a special case, as it does not gather, but at the same time
   2876         // we are not extending buildTree_rec() towards the operands.
   2877         ValueList Op0;
   2878         Op0.assign(VL.size(), VL0->getOperand(0));
   2879         VectorizableTree.back()->setOperand(0, Op0);
   2880         return;
   2881       }
   2882       LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
   2883       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   2884                    ReuseShuffleIndicies);
   2885       BS.cancelScheduling(VL, VL0);
   2886       return;
   2887     }
   2888     case Instruction::InsertElement: {
   2889       assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
   2890 
   2891       // Check that we have a buildvector and not a shuffle of 2 or more
   2892       // different vectors.
   2893       ValueSet SourceVectors;
   2894       for (Value *V : VL)
   2895         SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
   2896 
   2897       if (count_if(VL, [&SourceVectors](Value *V) {
   2898             return !SourceVectors.contains(V);
   2899           }) >= 2) {
   2900         // Found 2nd source vector - cancel.
   2901         LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
   2902                              "different source vectors.\n");
   2903         newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   2904                      ReuseShuffleIndicies);
   2905         BS.cancelScheduling(VL, VL0);
   2906         return;
   2907       }
   2908 
   2909       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx);
   2910       LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
   2911 
   2912       constexpr int NumOps = 2;
   2913       ValueList VectorOperands[NumOps];
   2914       for (int I = 0; I < NumOps; ++I) {
   2915         for (Value *V : VL)
   2916           VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
   2917 
   2918         TE->setOperand(I, VectorOperands[I]);
   2919       }
   2920       buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, 0});
   2921       return;
   2922     }
   2923     case Instruction::Load: {
   2924       // Check that a vectorized load would load the same memory as a scalar
   2925       // load. For example, we don't want to vectorize loads that are smaller
   2926       // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
   2927       // treats loading/storing it as an i8 struct. If we vectorize loads/stores
   2928       // from such a struct, we read/write packed bits disagreeing with the
   2929       // unvectorized version.
   2930       Type *ScalarTy = VL0->getType();
   2931 
   2932       if (DL->getTypeSizeInBits(ScalarTy) !=
   2933           DL->getTypeAllocSizeInBits(ScalarTy)) {
   2934         BS.cancelScheduling(VL, VL0);
   2935         newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   2936                      ReuseShuffleIndicies);
   2937         LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
   2938         return;
   2939       }
   2940 
   2941       // Make sure all loads in the bundle are simple - we can't vectorize
   2942       // atomic or volatile loads.
   2943       SmallVector<Value *, 4> PointerOps(VL.size());
   2944       auto POIter = PointerOps.begin();
   2945       for (Value *V : VL) {
   2946         auto *L = cast<LoadInst>(V);
   2947         if (!L->isSimple()) {
   2948           BS.cancelScheduling(VL, VL0);
   2949           newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   2950                        ReuseShuffleIndicies);
   2951           LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
   2952           return;
   2953         }
   2954         *POIter = L->getPointerOperand();
   2955         ++POIter;
   2956       }
   2957 
   2958       OrdersType CurrentOrder;
   2959       // Check the order of pointer operands.
   2960       if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
   2961         Value *Ptr0;
   2962         Value *PtrN;
   2963         if (CurrentOrder.empty()) {
   2964           Ptr0 = PointerOps.front();
   2965           PtrN = PointerOps.back();
   2966         } else {
   2967           Ptr0 = PointerOps[CurrentOrder.front()];
   2968           PtrN = PointerOps[CurrentOrder.back()];
   2969         }
   2970         Optional<int> Diff = getPointersDiff(Ptr0, PtrN, *DL, *SE);
   2971         // Check that the sorted loads are consecutive.
   2972         if (static_cast<unsigned>(*Diff) == VL.size() - 1) {
   2973           if (CurrentOrder.empty()) {
   2974             // Original loads are consecutive and does not require reordering.
   2975             ++NumOpsWantToKeepOriginalOrder;
   2976             TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
   2977                                          UserTreeIdx, ReuseShuffleIndicies);
   2978             TE->setOperandsInOrder();
   2979             LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
   2980           } else {
   2981             // Need to reorder.
   2982             TreeEntry *TE =
   2983                 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
   2984                              ReuseShuffleIndicies, CurrentOrder);
   2985             TE->setOperandsInOrder();
   2986             LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
   2987             findRootOrder(CurrentOrder);
   2988             ++NumOpsWantToKeepOrder[CurrentOrder];
   2989           }
   2990           return;
   2991         }
   2992         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
   2993         TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
   2994                                      UserTreeIdx, ReuseShuffleIndicies);
   2995         TE->setOperandsInOrder();
   2996         buildTree_rec(PointerOps, Depth + 1, {TE, 0});
   2997         LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
   2998         return;
   2999       }
   3000 
   3001       LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
   3002       BS.cancelScheduling(VL, VL0);
   3003       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3004                    ReuseShuffleIndicies);
   3005       return;
   3006     }
   3007     case Instruction::ZExt:
   3008     case Instruction::SExt:
   3009     case Instruction::FPToUI:
   3010     case Instruction::FPToSI:
   3011     case Instruction::FPExt:
   3012     case Instruction::PtrToInt:
   3013     case Instruction::IntToPtr:
   3014     case Instruction::SIToFP:
   3015     case Instruction::UIToFP:
   3016     case Instruction::Trunc:
   3017     case Instruction::FPTrunc:
   3018     case Instruction::BitCast: {
   3019       Type *SrcTy = VL0->getOperand(0)->getType();
   3020       for (Value *V : VL) {
   3021         Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
   3022         if (Ty != SrcTy || !isValidElementType(Ty)) {
   3023           BS.cancelScheduling(VL, VL0);
   3024           newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3025                        ReuseShuffleIndicies);
   3026           LLVM_DEBUG(dbgs()
   3027                      << "SLP: Gathering casts with different src types.\n");
   3028           return;
   3029         }
   3030       }
   3031       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
   3032                                    ReuseShuffleIndicies);
   3033       LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
   3034 
   3035       TE->setOperandsInOrder();
   3036       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
   3037         ValueList Operands;
   3038         // Prepare the operand vector.
   3039         for (Value *V : VL)
   3040           Operands.push_back(cast<Instruction>(V)->getOperand(i));
   3041 
   3042         buildTree_rec(Operands, Depth + 1, {TE, i});
   3043       }
   3044       return;
   3045     }
   3046     case Instruction::ICmp:
   3047     case Instruction::FCmp: {
   3048       // Check that all of the compares have the same predicate.
   3049       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
   3050       CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
   3051       Type *ComparedTy = VL0->getOperand(0)->getType();
   3052       for (Value *V : VL) {
   3053         CmpInst *Cmp = cast<CmpInst>(V);
   3054         if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
   3055             Cmp->getOperand(0)->getType() != ComparedTy) {
   3056           BS.cancelScheduling(VL, VL0);
   3057           newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3058                        ReuseShuffleIndicies);
   3059           LLVM_DEBUG(dbgs()
   3060                      << "SLP: Gathering cmp with different predicate.\n");
   3061           return;
   3062         }
   3063       }
   3064 
   3065       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
   3066                                    ReuseShuffleIndicies);
   3067       LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
   3068 
   3069       ValueList Left, Right;
   3070       if (cast<CmpInst>(VL0)->isCommutative()) {
   3071         // Commutative predicate - collect + sort operands of the instructions
   3072         // so that each side is more likely to have the same opcode.
   3073         assert(P0 == SwapP0 && "Commutative Predicate mismatch");
   3074         reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
   3075       } else {
   3076         // Collect operands - commute if it uses the swapped predicate.
   3077         for (Value *V : VL) {
   3078           auto *Cmp = cast<CmpInst>(V);
   3079           Value *LHS = Cmp->getOperand(0);
   3080           Value *RHS = Cmp->getOperand(1);
   3081           if (Cmp->getPredicate() != P0)
   3082             std::swap(LHS, RHS);
   3083           Left.push_back(LHS);
   3084           Right.push_back(RHS);
   3085         }
   3086       }
   3087       TE->setOperand(0, Left);
   3088       TE->setOperand(1, Right);
   3089       buildTree_rec(Left, Depth + 1, {TE, 0});
   3090       buildTree_rec(Right, Depth + 1, {TE, 1});
   3091       return;
   3092     }
   3093     case Instruction::Select:
   3094     case Instruction::FNeg:
   3095     case Instruction::Add:
   3096     case Instruction::FAdd:
   3097     case Instruction::Sub:
   3098     case Instruction::FSub:
   3099     case Instruction::Mul:
   3100     case Instruction::FMul:
   3101     case Instruction::UDiv:
   3102     case Instruction::SDiv:
   3103     case Instruction::FDiv:
   3104     case Instruction::URem:
   3105     case Instruction::SRem:
   3106     case Instruction::FRem:
   3107     case Instruction::Shl:
   3108     case Instruction::LShr:
   3109     case Instruction::AShr:
   3110     case Instruction::And:
   3111     case Instruction::Or:
   3112     case Instruction::Xor: {
   3113       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
   3114                                    ReuseShuffleIndicies);
   3115       LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
   3116 
   3117       // Sort operands of the instructions so that each side is more likely to
   3118       // have the same opcode.
   3119       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
   3120         ValueList Left, Right;
   3121         reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
   3122         TE->setOperand(0, Left);
   3123         TE->setOperand(1, Right);
   3124         buildTree_rec(Left, Depth + 1, {TE, 0});
   3125         buildTree_rec(Right, Depth + 1, {TE, 1});
   3126         return;
   3127       }
   3128 
   3129       TE->setOperandsInOrder();
   3130       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
   3131         ValueList Operands;
   3132         // Prepare the operand vector.
   3133         for (Value *V : VL)
   3134           Operands.push_back(cast<Instruction>(V)->getOperand(i));
   3135 
   3136         buildTree_rec(Operands, Depth + 1, {TE, i});
   3137       }
   3138       return;
   3139     }
   3140     case Instruction::GetElementPtr: {
   3141       // We don't combine GEPs with complicated (nested) indexing.
   3142       for (Value *V : VL) {
   3143         if (cast<Instruction>(V)->getNumOperands() != 2) {
   3144           LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
   3145           BS.cancelScheduling(VL, VL0);
   3146           newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3147                        ReuseShuffleIndicies);
   3148           return;
   3149         }
   3150       }
   3151 
   3152       // We can't combine several GEPs into one vector if they operate on
   3153       // different types.
   3154       Type *Ty0 = VL0->getOperand(0)->getType();
   3155       for (Value *V : VL) {
   3156         Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
   3157         if (Ty0 != CurTy) {
   3158           LLVM_DEBUG(dbgs()
   3159                      << "SLP: not-vectorizable GEP (different types).\n");
   3160           BS.cancelScheduling(VL, VL0);
   3161           newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3162                        ReuseShuffleIndicies);
   3163           return;
   3164         }
   3165       }
   3166 
   3167       // We don't combine GEPs with non-constant indexes.
   3168       Type *Ty1 = VL0->getOperand(1)->getType();
   3169       for (Value *V : VL) {
   3170         auto Op = cast<Instruction>(V)->getOperand(1);
   3171         if (!isa<ConstantInt>(Op) ||
   3172             (Op->getType() != Ty1 &&
   3173              Op->getType()->getScalarSizeInBits() >
   3174                  DL->getIndexSizeInBits(
   3175                      V->getType()->getPointerAddressSpace()))) {
   3176           LLVM_DEBUG(dbgs()
   3177                      << "SLP: not-vectorizable GEP (non-constant indexes).\n");
   3178           BS.cancelScheduling(VL, VL0);
   3179           newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3180                        ReuseShuffleIndicies);
   3181           return;
   3182         }
   3183       }
   3184 
   3185       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
   3186                                    ReuseShuffleIndicies);
   3187       LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
   3188       TE->setOperandsInOrder();
   3189       for (unsigned i = 0, e = 2; i < e; ++i) {
   3190         ValueList Operands;
   3191         // Prepare the operand vector.
   3192         for (Value *V : VL)
   3193           Operands.push_back(cast<Instruction>(V)->getOperand(i));
   3194 
   3195         buildTree_rec(Operands, Depth + 1, {TE, i});
   3196       }
   3197       return;
   3198     }
   3199     case Instruction::Store: {
   3200       // Check if the stores are consecutive or if we need to swizzle them.
   3201       llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
   3202       // Avoid types that are padded when being allocated as scalars, while
   3203       // being packed together in a vector (such as i1).
   3204       if (DL->getTypeSizeInBits(ScalarTy) !=
   3205           DL->getTypeAllocSizeInBits(ScalarTy)) {
   3206         BS.cancelScheduling(VL, VL0);
   3207         newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3208                      ReuseShuffleIndicies);
   3209         LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
   3210         return;
   3211       }
   3212       // Make sure all stores in the bundle are simple - we can't vectorize
   3213       // atomic or volatile stores.
   3214       SmallVector<Value *, 4> PointerOps(VL.size());
   3215       ValueList Operands(VL.size());
   3216       auto POIter = PointerOps.begin();
   3217       auto OIter = Operands.begin();
   3218       for (Value *V : VL) {
   3219         auto *SI = cast<StoreInst>(V);
   3220         if (!SI->isSimple()) {
   3221           BS.cancelScheduling(VL, VL0);
   3222           newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3223                        ReuseShuffleIndicies);
   3224           LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
   3225           return;
   3226         }
   3227         *POIter = SI->getPointerOperand();
   3228         *OIter = SI->getValueOperand();
   3229         ++POIter;
   3230         ++OIter;
   3231       }
   3232 
   3233       OrdersType CurrentOrder;
   3234       // Check the order of pointer operands.
   3235       if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
   3236         Value *Ptr0;
   3237         Value *PtrN;
   3238         if (CurrentOrder.empty()) {
   3239           Ptr0 = PointerOps.front();
   3240           PtrN = PointerOps.back();
   3241         } else {
   3242           Ptr0 = PointerOps[CurrentOrder.front()];
   3243           PtrN = PointerOps[CurrentOrder.back()];
   3244         }
   3245         Optional<int> Dist = getPointersDiff(Ptr0, PtrN, *DL, *SE);
   3246         // Check that the sorted pointer operands are consecutive.
   3247         if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
   3248           if (CurrentOrder.empty()) {
   3249             // Original stores are consecutive and does not require reordering.
   3250             ++NumOpsWantToKeepOriginalOrder;
   3251             TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
   3252                                          UserTreeIdx, ReuseShuffleIndicies);
   3253             TE->setOperandsInOrder();
   3254             buildTree_rec(Operands, Depth + 1, {TE, 0});
   3255             LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
   3256           } else {
   3257             TreeEntry *TE =
   3258                 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
   3259                              ReuseShuffleIndicies, CurrentOrder);
   3260             TE->setOperandsInOrder();
   3261             buildTree_rec(Operands, Depth + 1, {TE, 0});
   3262             LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
   3263             findRootOrder(CurrentOrder);
   3264             ++NumOpsWantToKeepOrder[CurrentOrder];
   3265           }
   3266           return;
   3267         }
   3268       }
   3269 
   3270       BS.cancelScheduling(VL, VL0);
   3271       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3272                    ReuseShuffleIndicies);
   3273       LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
   3274       return;
   3275     }
   3276     case Instruction::Call: {
   3277       // Check if the calls are all to the same vectorizable intrinsic or
   3278       // library function.
   3279       CallInst *CI = cast<CallInst>(VL0);
   3280       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   3281 
   3282       VFShape Shape = VFShape::get(
   3283           *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
   3284           false /*HasGlobalPred*/);
   3285       Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
   3286 
   3287       if (!VecFunc && !isTriviallyVectorizable(ID)) {
   3288         BS.cancelScheduling(VL, VL0);
   3289         newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3290                      ReuseShuffleIndicies);
   3291         LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
   3292         return;
   3293       }
   3294       Function *F = CI->getCalledFunction();
   3295       unsigned NumArgs = CI->getNumArgOperands();
   3296       SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
   3297       for (unsigned j = 0; j != NumArgs; ++j)
   3298         if (hasVectorInstrinsicScalarOpd(ID, j))
   3299           ScalarArgs[j] = CI->getArgOperand(j);
   3300       for (Value *V : VL) {
   3301         CallInst *CI2 = dyn_cast<CallInst>(V);
   3302         if (!CI2 || CI2->getCalledFunction() != F ||
   3303             getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
   3304             (VecFunc &&
   3305              VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
   3306             !CI->hasIdenticalOperandBundleSchema(*CI2)) {
   3307           BS.cancelScheduling(VL, VL0);
   3308           newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3309                        ReuseShuffleIndicies);
   3310           LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
   3311                             << "\n");
   3312           return;
   3313         }
   3314         // Some intrinsics have scalar arguments and should be same in order for
   3315         // them to be vectorized.
   3316         for (unsigned j = 0; j != NumArgs; ++j) {
   3317           if (hasVectorInstrinsicScalarOpd(ID, j)) {
   3318             Value *A1J = CI2->getArgOperand(j);
   3319             if (ScalarArgs[j] != A1J) {
   3320               BS.cancelScheduling(VL, VL0);
   3321               newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3322                            ReuseShuffleIndicies);
   3323               LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
   3324                                 << " argument " << ScalarArgs[j] << "!=" << A1J
   3325                                 << "\n");
   3326               return;
   3327             }
   3328           }
   3329         }
   3330         // Verify that the bundle operands are identical between the two calls.
   3331         if (CI->hasOperandBundles() &&
   3332             !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
   3333                         CI->op_begin() + CI->getBundleOperandsEndIndex(),
   3334                         CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
   3335           BS.cancelScheduling(VL, VL0);
   3336           newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3337                        ReuseShuffleIndicies);
   3338           LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
   3339                             << *CI << "!=" << *V << '\n');
   3340           return;
   3341         }
   3342       }
   3343 
   3344       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
   3345                                    ReuseShuffleIndicies);
   3346       TE->setOperandsInOrder();
   3347       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
   3348         ValueList Operands;
   3349         // Prepare the operand vector.
   3350         for (Value *V : VL) {
   3351           auto *CI2 = cast<CallInst>(V);
   3352           Operands.push_back(CI2->getArgOperand(i));
   3353         }
   3354         buildTree_rec(Operands, Depth + 1, {TE, i});
   3355       }
   3356       return;
   3357     }
   3358     case Instruction::ShuffleVector: {
   3359       // If this is not an alternate sequence of opcode like add-sub
   3360       // then do not vectorize this instruction.
   3361       if (!S.isAltShuffle()) {
   3362         BS.cancelScheduling(VL, VL0);
   3363         newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3364                      ReuseShuffleIndicies);
   3365         LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
   3366         return;
   3367       }
   3368       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
   3369                                    ReuseShuffleIndicies);
   3370       LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
   3371 
   3372       // Reorder operands if reordering would enable vectorization.
   3373       if (isa<BinaryOperator>(VL0)) {
   3374         ValueList Left, Right;
   3375         reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
   3376         TE->setOperand(0, Left);
   3377         TE->setOperand(1, Right);
   3378         buildTree_rec(Left, Depth + 1, {TE, 0});
   3379         buildTree_rec(Right, Depth + 1, {TE, 1});
   3380         return;
   3381       }
   3382 
   3383       TE->setOperandsInOrder();
   3384       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
   3385         ValueList Operands;
   3386         // Prepare the operand vector.
   3387         for (Value *V : VL)
   3388           Operands.push_back(cast<Instruction>(V)->getOperand(i));
   3389 
   3390         buildTree_rec(Operands, Depth + 1, {TE, i});
   3391       }
   3392       return;
   3393     }
   3394     default:
   3395       BS.cancelScheduling(VL, VL0);
   3396       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
   3397                    ReuseShuffleIndicies);
   3398       LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
   3399       return;
   3400   }
   3401 }
   3402 
   3403 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   3404   unsigned N = 1;
   3405   Type *EltTy = T;
   3406 
   3407   while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
   3408          isa<VectorType>(EltTy)) {
   3409     if (auto *ST = dyn_cast<StructType>(EltTy)) {
   3410       // Check that struct is homogeneous.
   3411       for (const auto *Ty : ST->elements())
   3412         if (Ty != *ST->element_begin())
   3413           return 0;
   3414       N *= ST->getNumElements();
   3415       EltTy = *ST->element_begin();
   3416     } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
   3417       N *= AT->getNumElements();
   3418       EltTy = AT->getElementType();
   3419     } else {
   3420       auto *VT = cast<FixedVectorType>(EltTy);
   3421       N *= VT->getNumElements();
   3422       EltTy = VT->getElementType();
   3423     }
   3424   }
   3425 
   3426   if (!isValidElementType(EltTy))
   3427     return 0;
   3428   uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
   3429   if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
   3430     return 0;
   3431   return N;
   3432 }
   3433 
   3434 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
   3435                               SmallVectorImpl<unsigned> &CurrentOrder) const {
   3436   Instruction *E0 = cast<Instruction>(OpValue);
   3437   assert(E0->getOpcode() == Instruction::ExtractElement ||
   3438          E0->getOpcode() == Instruction::ExtractValue);
   3439   assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
   3440   // Check if all of the extracts come from the same vector and from the
   3441   // correct offset.
   3442   Value *Vec = E0->getOperand(0);
   3443 
   3444   CurrentOrder.clear();
   3445 
   3446   // We have to extract from a vector/aggregate with the same number of elements.
   3447   unsigned NElts;
   3448   if (E0->getOpcode() == Instruction::ExtractValue) {
   3449     const DataLayout &DL = E0->getModule()->getDataLayout();
   3450     NElts = canMapToVector(Vec->getType(), DL);
   3451     if (!NElts)
   3452       return false;
   3453     // Check if load can be rewritten as load of vector.
   3454     LoadInst *LI = dyn_cast<LoadInst>(Vec);
   3455     if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
   3456       return false;
   3457   } else {
   3458     NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
   3459   }
   3460 
   3461   if (NElts != VL.size())
   3462     return false;
   3463 
   3464   // Check that all of the indices extract from the correct offset.
   3465   bool ShouldKeepOrder = true;
   3466   unsigned E = VL.size();
   3467   // Assign to all items the initial value E + 1 so we can check if the extract
   3468   // instruction index was used already.
   3469   // Also, later we can check that all the indices are used and we have a
   3470   // consecutive access in the extract instructions, by checking that no
   3471   // element of CurrentOrder still has value E + 1.
   3472   CurrentOrder.assign(E, E + 1);
   3473   unsigned I = 0;
   3474   for (; I < E; ++I) {
   3475     auto *Inst = cast<Instruction>(VL[I]);
   3476     if (Inst->getOperand(0) != Vec)
   3477       break;
   3478     Optional<unsigned> Idx = getExtractIndex(Inst);
   3479     if (!Idx)
   3480       break;
   3481     const unsigned ExtIdx = *Idx;
   3482     if (ExtIdx != I) {
   3483       if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
   3484         break;
   3485       ShouldKeepOrder = false;
   3486       CurrentOrder[ExtIdx] = I;
   3487     } else {
   3488       if (CurrentOrder[I] != E + 1)
   3489         break;
   3490       CurrentOrder[I] = I;
   3491     }
   3492   }
   3493   if (I < E) {
   3494     CurrentOrder.clear();
   3495     return false;
   3496   }
   3497 
   3498   return ShouldKeepOrder;
   3499 }
   3500 
   3501 bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
   3502   return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) {
   3503            return ScalarToTreeEntry.count(U) > 0;
   3504          });
   3505 }
   3506 
   3507 static std::pair<InstructionCost, InstructionCost>
   3508 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
   3509                    TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {
   3510   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   3511 
   3512   // Calculate the cost of the scalar and vector calls.
   3513   SmallVector<Type *, 4> VecTys;
   3514   for (Use &Arg : CI->args())
   3515     VecTys.push_back(
   3516         FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
   3517   FastMathFlags FMF;
   3518   if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
   3519     FMF = FPCI->getFastMathFlags();
   3520   SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
   3521   IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
   3522                                     dyn_cast<IntrinsicInst>(CI));
   3523   auto IntrinsicCost =
   3524     TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
   3525 
   3526   auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
   3527                                      VecTy->getNumElements())),
   3528                             false /*HasGlobalPred*/);
   3529   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
   3530   auto LibCost = IntrinsicCost;
   3531   if (!CI->isNoBuiltin() && VecFunc) {
   3532     // Calculate the cost of the vector library call.
   3533     // If the corresponding vector call is cheaper, return its cost.
   3534     LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
   3535                                     TTI::TCK_RecipThroughput);
   3536   }
   3537   return {IntrinsicCost, LibCost};
   3538 }
   3539 
   3540 /// Compute the cost of creating a vector of type \p VecTy containing the
   3541 /// extracted values from \p VL.
   3542 static InstructionCost
   3543 computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
   3544                    TargetTransformInfo::ShuffleKind ShuffleKind,
   3545                    ArrayRef<int> Mask, TargetTransformInfo &TTI) {
   3546   unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
   3547 
   3548   if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts ||
   3549       VecTy->getNumElements() < NumOfParts)
   3550     return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
   3551 
   3552   bool AllConsecutive = true;
   3553   unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
   3554   unsigned Idx = -1;
   3555   InstructionCost Cost = 0;
   3556 
   3557   // Process extracts in blocks of EltsPerVector to check if the source vector
   3558   // operand can be re-used directly. If not, add the cost of creating a shuffle
   3559   // to extract the values into a vector register.
   3560   for (auto *V : VL) {
   3561     ++Idx;
   3562 
   3563     // Reached the start of a new vector registers.
   3564     if (Idx % EltsPerVector == 0) {
   3565       AllConsecutive = true;
   3566       continue;
   3567     }
   3568 
   3569     // Check all extracts for a vector register on the target directly
   3570     // extract values in order.
   3571     unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
   3572     unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
   3573     AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
   3574                       CurrentIdx % EltsPerVector == Idx % EltsPerVector;
   3575 
   3576     if (AllConsecutive)
   3577       continue;
   3578 
   3579     // Skip all indices, except for the last index per vector block.
   3580     if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
   3581       continue;
   3582 
   3583     // If we have a series of extracts which are not consecutive and hence
   3584     // cannot re-use the source vector register directly, compute the shuffle
   3585     // cost to extract the a vector with EltsPerVector elements.
   3586     Cost += TTI.getShuffleCost(
   3587         TargetTransformInfo::SK_PermuteSingleSrc,
   3588         FixedVectorType::get(VecTy->getElementType(), EltsPerVector));
   3589   }
   3590   return Cost;
   3591 }
   3592 
   3593 InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
   3594   ArrayRef<Value*> VL = E->Scalars;
   3595 
   3596   Type *ScalarTy = VL[0]->getType();
   3597   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
   3598     ScalarTy = SI->getValueOperand()->getType();
   3599   else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
   3600     ScalarTy = CI->getOperand(0)->getType();
   3601   else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
   3602     ScalarTy = IE->getOperand(1)->getType();
   3603   auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
   3604   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   3605 
   3606   // If we have computed a smaller type for the expression, update VecTy so
   3607   // that the costs will be accurate.
   3608   if (MinBWs.count(VL[0]))
   3609     VecTy = FixedVectorType::get(
   3610         IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
   3611 
   3612   unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
   3613   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
   3614   InstructionCost ReuseShuffleCost = 0;
   3615   if (NeedToShuffleReuses) {
   3616     ReuseShuffleCost =
   3617         TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
   3618                             E->ReuseShuffleIndices);
   3619   }
   3620   // FIXME: it tries to fix a problem with MSVC buildbots.
   3621   TargetTransformInfo &TTIRef = *TTI;
   3622   auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL,
   3623                                VecTy](InstructionCost &Cost, bool IsGather) {
   3624     DenseMap<Value *, int> ExtractVectorsTys;
   3625     for (auto *V : VL) {
   3626       // If all users of instruction are going to be vectorized and this
   3627       // instruction itself is not going to be vectorized, consider this
   3628       // instruction as dead and remove its cost from the final cost of the
   3629       // vectorized tree.
   3630       if (IsGather && (!areAllUsersVectorized(cast<Instruction>(V)) ||
   3631                        ScalarToTreeEntry.count(V)))
   3632         continue;
   3633       auto *EE = cast<ExtractElementInst>(V);
   3634       unsigned Idx = *getExtractIndex(EE);
   3635       if (TTIRef.getNumberOfParts(VecTy) !=
   3636           TTIRef.getNumberOfParts(EE->getVectorOperandType())) {
   3637         auto It =
   3638             ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
   3639         It->getSecond() = std::min<int>(It->second, Idx);
   3640       }
   3641       // Take credit for instruction that will become dead.
   3642       if (EE->hasOneUse()) {
   3643         Instruction *Ext = EE->user_back();
   3644         if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
   3645             all_of(Ext->users(),
   3646                    [](User *U) { return isa<GetElementPtrInst>(U); })) {
   3647           // Use getExtractWithExtendCost() to calculate the cost of
   3648           // extractelement/ext pair.
   3649           Cost -=
   3650               TTIRef.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
   3651                                               EE->getVectorOperandType(), Idx);
   3652           // Add back the cost of s|zext which is subtracted separately.
   3653           Cost += TTIRef.getCastInstrCost(
   3654               Ext->getOpcode(), Ext->getType(), EE->getType(),
   3655               TTI::getCastContextHint(Ext), CostKind, Ext);
   3656           continue;
   3657         }
   3658       }
   3659       Cost -= TTIRef.getVectorInstrCost(Instruction::ExtractElement,
   3660                                         EE->getVectorOperandType(), Idx);
   3661     }
   3662     // Add a cost for subvector extracts/inserts if required.
   3663     for (const auto &Data : ExtractVectorsTys) {
   3664       auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
   3665       unsigned NumElts = VecTy->getNumElements();
   3666       if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) {
   3667         unsigned Idx = (Data.second / NumElts) * NumElts;
   3668         unsigned EENumElts = EEVTy->getNumElements();
   3669         if (Idx + NumElts <= EENumElts) {
   3670           Cost +=
   3671               TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
   3672                                     EEVTy, None, Idx, VecTy);
   3673         } else {
   3674           // Need to round up the subvector type vectorization factor to avoid a
   3675           // crash in cost model functions. Make SubVT so that Idx + VF of SubVT
   3676           // <= EENumElts.
   3677           auto *SubVT =
   3678               FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
   3679           Cost +=
   3680               TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
   3681                                     EEVTy, None, Idx, SubVT);
   3682         }
   3683       } else {
   3684         Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
   3685                                       VecTy, None, 0, EEVTy);
   3686       }
   3687     }
   3688   };
   3689   if (E->State == TreeEntry::NeedToGather) {
   3690     if (allConstant(VL))
   3691       return 0;
   3692     if (isSplat(VL)) {
   3693       return ReuseShuffleCost +
   3694              TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
   3695                                  0);
   3696     }
   3697     if (E->getOpcode() == Instruction::ExtractElement &&
   3698         allSameType(VL) && allSameBlock(VL)) {
   3699       SmallVector<int> Mask;
   3700       Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
   3701           isShuffle(VL, Mask);
   3702       if (ShuffleKind.hasValue()) {
   3703         InstructionCost Cost =
   3704             computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
   3705         AdjustExtractsCost(Cost, /*IsGather=*/true);
   3706         return ReuseShuffleCost + Cost;
   3707       }
   3708     }
   3709     InstructionCost GatherCost = 0;
   3710     SmallVector<int> Mask;
   3711     SmallVector<const TreeEntry *> Entries;
   3712     Optional<TargetTransformInfo::ShuffleKind> Shuffle =
   3713         isGatherShuffledEntry(E, Mask, Entries);
   3714     if (Shuffle.hasValue()) {
   3715       if (ShuffleVectorInst::isIdentityMask(Mask)) {
   3716         LLVM_DEBUG(
   3717             dbgs()
   3718             << "SLP: perfect diamond match for gather bundle that starts with "
   3719             << *VL.front() << ".\n");
   3720       } else {
   3721         LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
   3722                           << " entries for bundle that starts with "
   3723                           << *VL.front() << ".\n");
   3724         GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
   3725       }
   3726     } else {
   3727       GatherCost = getGatherCost(VL);
   3728     }
   3729     return ReuseShuffleCost + GatherCost;
   3730   }
   3731   assert((E->State == TreeEntry::Vectorize ||
   3732           E->State == TreeEntry::ScatterVectorize) &&
   3733          "Unhandled state");
   3734   assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
   3735   Instruction *VL0 = E->getMainOp();
   3736   unsigned ShuffleOrOp =
   3737       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
   3738   switch (ShuffleOrOp) {
   3739     case Instruction::PHI:
   3740       return 0;
   3741 
   3742     case Instruction::ExtractValue:
   3743     case Instruction::ExtractElement: {
   3744       // The common cost of removal ExtractElement/ExtractValue instructions +
   3745       // the cost of shuffles, if required to resuffle the original vector.
   3746       InstructionCost CommonCost = 0;
   3747       if (NeedToShuffleReuses) {
   3748         unsigned Idx = 0;
   3749         for (unsigned I : E->ReuseShuffleIndices) {
   3750           if (ShuffleOrOp == Instruction::ExtractElement) {
   3751             auto *EE = cast<ExtractElementInst>(VL[I]);
   3752             ReuseShuffleCost -= TTI->getVectorInstrCost(
   3753                 Instruction::ExtractElement, EE->getVectorOperandType(),
   3754                 *getExtractIndex(EE));
   3755           } else {
   3756             ReuseShuffleCost -= TTI->getVectorInstrCost(
   3757                 Instruction::ExtractElement, VecTy, Idx);
   3758             ++Idx;
   3759           }
   3760         }
   3761         Idx = ReuseShuffleNumbers;
   3762         for (Value *V : VL) {
   3763           if (ShuffleOrOp == Instruction::ExtractElement) {
   3764             auto *EE = cast<ExtractElementInst>(V);
   3765             ReuseShuffleCost += TTI->getVectorInstrCost(
   3766                 Instruction::ExtractElement, EE->getVectorOperandType(),
   3767                 *getExtractIndex(EE));
   3768           } else {
   3769             --Idx;
   3770             ReuseShuffleCost += TTI->getVectorInstrCost(
   3771                 Instruction::ExtractElement, VecTy, Idx);
   3772           }
   3773         }
   3774         CommonCost = ReuseShuffleCost;
   3775       } else if (!E->ReorderIndices.empty()) {
   3776         SmallVector<int> NewMask;
   3777         inversePermutation(E->ReorderIndices, NewMask);
   3778         CommonCost = TTI->getShuffleCost(
   3779             TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
   3780       }
   3781       if (ShuffleOrOp == Instruction::ExtractValue) {
   3782         for (unsigned I = 0, E = VL.size(); I < E; ++I) {
   3783           auto *EI = cast<Instruction>(VL[I]);
   3784           // Take credit for instruction that will become dead.
   3785           if (EI->hasOneUse()) {
   3786             Instruction *Ext = EI->user_back();
   3787             if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
   3788                 all_of(Ext->users(),
   3789                        [](User *U) { return isa<GetElementPtrInst>(U); })) {
   3790               // Use getExtractWithExtendCost() to calculate the cost of
   3791               // extractelement/ext pair.
   3792               CommonCost -= TTI->getExtractWithExtendCost(
   3793                   Ext->getOpcode(), Ext->getType(), VecTy, I);
   3794               // Add back the cost of s|zext which is subtracted separately.
   3795               CommonCost += TTI->getCastInstrCost(
   3796                   Ext->getOpcode(), Ext->getType(), EI->getType(),
   3797                   TTI::getCastContextHint(Ext), CostKind, Ext);
   3798               continue;
   3799             }
   3800           }
   3801           CommonCost -=
   3802               TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
   3803         }
   3804       } else {
   3805         AdjustExtractsCost(CommonCost, /*IsGather=*/false);
   3806       }
   3807       return CommonCost;
   3808     }
   3809     case Instruction::InsertElement: {
   3810       auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
   3811 
   3812       unsigned const NumElts = SrcVecTy->getNumElements();
   3813       unsigned const NumScalars = VL.size();
   3814       APInt DemandedElts = APInt::getNullValue(NumElts);
   3815       // TODO: Add support for Instruction::InsertValue.
   3816       unsigned Offset = UINT_MAX;
   3817       bool IsIdentity = true;
   3818       SmallVector<int> ShuffleMask(NumElts, UndefMaskElem);
   3819       for (unsigned I = 0; I < NumScalars; ++I) {
   3820         Optional<int> InsertIdx = getInsertIndex(VL[I], 0);
   3821         if (!InsertIdx || *InsertIdx == UndefMaskElem)
   3822           continue;
   3823         unsigned Idx = *InsertIdx;
   3824         DemandedElts.setBit(Idx);
   3825         if (Idx < Offset) {
   3826           Offset = Idx;
   3827           IsIdentity &= I == 0;
   3828         } else {
   3829           assert(Idx >= Offset && "Failed to find vector index offset");
   3830           IsIdentity &= Idx - Offset == I;
   3831         }
   3832         ShuffleMask[Idx] = I;
   3833       }
   3834       assert(Offset < NumElts && "Failed to find vector index offset");
   3835 
   3836       InstructionCost Cost = 0;
   3837       Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
   3838                                             /*Insert*/ true, /*Extract*/ false);
   3839 
   3840       if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0)
   3841         Cost += TTI->getShuffleCost(
   3842             TargetTransformInfo::SK_InsertSubvector, SrcVecTy, /*Mask*/ None,
   3843             Offset,
   3844             FixedVectorType::get(SrcVecTy->getElementType(), NumScalars));
   3845       else if (!IsIdentity)
   3846         Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy,
   3847                                     ShuffleMask);
   3848 
   3849       return Cost;
   3850     }
   3851     case Instruction::ZExt:
   3852     case Instruction::SExt:
   3853     case Instruction::FPToUI:
   3854     case Instruction::FPToSI:
   3855     case Instruction::FPExt:
   3856     case Instruction::PtrToInt:
   3857     case Instruction::IntToPtr:
   3858     case Instruction::SIToFP:
   3859     case Instruction::UIToFP:
   3860     case Instruction::Trunc:
   3861     case Instruction::FPTrunc:
   3862     case Instruction::BitCast: {
   3863       Type *SrcTy = VL0->getOperand(0)->getType();
   3864       InstructionCost ScalarEltCost =
   3865           TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
   3866                                 TTI::getCastContextHint(VL0), CostKind, VL0);
   3867       if (NeedToShuffleReuses) {
   3868         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
   3869       }
   3870 
   3871       // Calculate the cost of this instruction.
   3872       InstructionCost ScalarCost = VL.size() * ScalarEltCost;
   3873 
   3874       auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
   3875       InstructionCost VecCost = 0;
   3876       // Check if the values are candidates to demote.
   3877       if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
   3878         VecCost =
   3879             ReuseShuffleCost +
   3880             TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
   3881                                   TTI::getCastContextHint(VL0), CostKind, VL0);
   3882       }
   3883       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
   3884       return VecCost - ScalarCost;
   3885     }
   3886     case Instruction::FCmp:
   3887     case Instruction::ICmp:
   3888     case Instruction::Select: {
   3889       // Calculate the cost of this instruction.
   3890       InstructionCost ScalarEltCost =
   3891           TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
   3892                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
   3893       if (NeedToShuffleReuses) {
   3894         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
   3895       }
   3896       auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
   3897       InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
   3898 
   3899       // Check if all entries in VL are either compares or selects with compares
   3900       // as condition that have the same predicates.
   3901       CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE;
   3902       bool First = true;
   3903       for (auto *V : VL) {
   3904         CmpInst::Predicate CurrentPred;
   3905         auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
   3906         if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) &&
   3907              !match(V, MatchCmp)) ||
   3908             (!First && VecPred != CurrentPred)) {
   3909           VecPred = CmpInst::BAD_ICMP_PREDICATE;
   3910           break;
   3911         }
   3912         First = false;
   3913         VecPred = CurrentPred;
   3914       }
   3915 
   3916       InstructionCost VecCost = TTI->getCmpSelInstrCost(
   3917           E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
   3918       // Check if it is possible and profitable to use min/max for selects in
   3919       // VL.
   3920       //
   3921       auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
   3922       if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
   3923         IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
   3924                                           {VecTy, VecTy});
   3925         InstructionCost IntrinsicCost =
   3926             TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
   3927         // If the selects are the only uses of the compares, they will be dead
   3928         // and we can adjust the cost by removing their cost.
   3929         if (IntrinsicAndUse.second)
   3930           IntrinsicCost -=
   3931               TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy,
   3932                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
   3933         VecCost = std::min(VecCost, IntrinsicCost);
   3934       }
   3935       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
   3936       return ReuseShuffleCost + VecCost - ScalarCost;
   3937     }
   3938     case Instruction::FNeg:
   3939     case Instruction::Add:
   3940     case Instruction::FAdd:
   3941     case Instruction::Sub:
   3942     case Instruction::FSub:
   3943     case Instruction::Mul:
   3944     case Instruction::FMul:
   3945     case Instruction::UDiv:
   3946     case Instruction::SDiv:
   3947     case Instruction::FDiv:
   3948     case Instruction::URem:
   3949     case Instruction::SRem:
   3950     case Instruction::FRem:
   3951     case Instruction::Shl:
   3952     case Instruction::LShr:
   3953     case Instruction::AShr:
   3954     case Instruction::And:
   3955     case Instruction::Or:
   3956     case Instruction::Xor: {
   3957       // Certain instructions can be cheaper to vectorize if they have a
   3958       // constant second vector operand.
   3959       TargetTransformInfo::OperandValueKind Op1VK =
   3960           TargetTransformInfo::OK_AnyValue;
   3961       TargetTransformInfo::OperandValueKind Op2VK =
   3962           TargetTransformInfo::OK_UniformConstantValue;
   3963       TargetTransformInfo::OperandValueProperties Op1VP =
   3964           TargetTransformInfo::OP_None;
   3965       TargetTransformInfo::OperandValueProperties Op2VP =
   3966           TargetTransformInfo::OP_PowerOf2;
   3967 
   3968       // If all operands are exactly the same ConstantInt then set the
   3969       // operand kind to OK_UniformConstantValue.
   3970       // If instead not all operands are constants, then set the operand kind
   3971       // to OK_AnyValue. If all operands are constants but not the same,
   3972       // then set the operand kind to OK_NonUniformConstantValue.
   3973       ConstantInt *CInt0 = nullptr;
   3974       for (unsigned i = 0, e = VL.size(); i < e; ++i) {
   3975         const Instruction *I = cast<Instruction>(VL[i]);
   3976         unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
   3977         ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
   3978         if (!CInt) {
   3979           Op2VK = TargetTransformInfo::OK_AnyValue;
   3980           Op2VP = TargetTransformInfo::OP_None;
   3981           break;
   3982         }
   3983         if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
   3984             !CInt->getValue().isPowerOf2())
   3985           Op2VP = TargetTransformInfo::OP_None;
   3986         if (i == 0) {
   3987           CInt0 = CInt;
   3988           continue;
   3989         }
   3990         if (CInt0 != CInt)
   3991           Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
   3992       }
   3993 
   3994       SmallVector<const Value *, 4> Operands(VL0->operand_values());
   3995       InstructionCost ScalarEltCost =
   3996           TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
   3997                                       Op2VK, Op1VP, Op2VP, Operands, VL0);
   3998       if (NeedToShuffleReuses) {
   3999         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
   4000       }
   4001       InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
   4002       InstructionCost VecCost =
   4003           TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
   4004                                       Op2VK, Op1VP, Op2VP, Operands, VL0);
   4005       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
   4006       return ReuseShuffleCost + VecCost - ScalarCost;
   4007     }
   4008     case Instruction::GetElementPtr: {
   4009       TargetTransformInfo::OperandValueKind Op1VK =
   4010           TargetTransformInfo::OK_AnyValue;
   4011       TargetTransformInfo::OperandValueKind Op2VK =
   4012           TargetTransformInfo::OK_UniformConstantValue;
   4013 
   4014       InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
   4015           Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
   4016       if (NeedToShuffleReuses) {
   4017         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
   4018       }
   4019       InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
   4020       InstructionCost VecCost = TTI->getArithmeticInstrCost(
   4021           Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
   4022       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
   4023       return ReuseShuffleCost + VecCost - ScalarCost;
   4024     }
   4025     case Instruction::Load: {
   4026       // Cost of wide load - cost of scalar loads.
   4027       Align alignment = cast<LoadInst>(VL0)->getAlign();
   4028       InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
   4029           Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0);
   4030       if (NeedToShuffleReuses) {
   4031         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
   4032       }
   4033       InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
   4034       InstructionCost VecLdCost;
   4035       if (E->State == TreeEntry::Vectorize) {
   4036         VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
   4037                                          CostKind, VL0);
   4038       } else {
   4039         assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
   4040         VecLdCost = TTI->getGatherScatterOpCost(
   4041             Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
   4042             /*VariableMask=*/false, alignment, CostKind, VL0);
   4043       }
   4044       if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) {
   4045         SmallVector<int> NewMask;
   4046         inversePermutation(E->ReorderIndices, NewMask);
   4047         VecLdCost += TTI->getShuffleCost(
   4048             TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
   4049       }
   4050       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
   4051       return ReuseShuffleCost + VecLdCost - ScalarLdCost;
   4052     }
   4053     case Instruction::Store: {
   4054       // We know that we can merge the stores. Calculate the cost.
   4055       bool IsReorder = !E->ReorderIndices.empty();
   4056       auto *SI =
   4057           cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
   4058       Align Alignment = SI->getAlign();
   4059       InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
   4060           Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
   4061       InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
   4062       InstructionCost VecStCost = TTI->getMemoryOpCost(
   4063           Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
   4064       if (IsReorder) {
   4065         SmallVector<int> NewMask;
   4066         inversePermutation(E->ReorderIndices, NewMask);
   4067         VecStCost += TTI->getShuffleCost(
   4068             TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
   4069       }
   4070       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
   4071       return VecStCost - ScalarStCost;
   4072     }
   4073     case Instruction::Call: {
   4074       CallInst *CI = cast<CallInst>(VL0);
   4075       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   4076 
   4077       // Calculate the cost of the scalar and vector calls.
   4078       IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
   4079       InstructionCost ScalarEltCost =
   4080           TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
   4081       if (NeedToShuffleReuses) {
   4082         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
   4083       }
   4084       InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
   4085 
   4086       auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
   4087       InstructionCost VecCallCost =
   4088           std::min(VecCallCosts.first, VecCallCosts.second);
   4089 
   4090       LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
   4091                         << " (" << VecCallCost << "-" << ScalarCallCost << ")"
   4092                         << " for " << *CI << "\n");
   4093 
   4094       return ReuseShuffleCost + VecCallCost - ScalarCallCost;
   4095     }
   4096     case Instruction::ShuffleVector: {
   4097       assert(E->isAltShuffle() &&
   4098              ((Instruction::isBinaryOp(E->getOpcode()) &&
   4099                Instruction::isBinaryOp(E->getAltOpcode())) ||
   4100               (Instruction::isCast(E->getOpcode()) &&
   4101                Instruction::isCast(E->getAltOpcode()))) &&
   4102              "Invalid Shuffle Vector Operand");
   4103       InstructionCost ScalarCost = 0;
   4104       if (NeedToShuffleReuses) {
   4105         for (unsigned Idx : E->ReuseShuffleIndices) {
   4106           Instruction *I = cast<Instruction>(VL[Idx]);
   4107           ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
   4108         }
   4109         for (Value *V : VL) {
   4110           Instruction *I = cast<Instruction>(V);
   4111           ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
   4112         }
   4113       }
   4114       for (Value *V : VL) {
   4115         Instruction *I = cast<Instruction>(V);
   4116         assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
   4117         ScalarCost += TTI->getInstructionCost(I, CostKind);
   4118       }
   4119       // VecCost is equal to sum of the cost of creating 2 vectors
   4120       // and the cost of creating shuffle.
   4121       InstructionCost VecCost = 0;
   4122       if (Instruction::isBinaryOp(E->getOpcode())) {
   4123         VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
   4124         VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
   4125                                                CostKind);
   4126       } else {
   4127         Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
   4128         Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
   4129         auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
   4130         auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
   4131         VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
   4132                                         TTI::CastContextHint::None, CostKind);
   4133         VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
   4134                                          TTI::CastContextHint::None, CostKind);
   4135       }
   4136 
   4137       SmallVector<int> Mask(E->Scalars.size());
   4138       for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I) {
   4139         auto *OpInst = cast<Instruction>(E->Scalars[I]);
   4140         assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
   4141         Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? End : 0);
   4142       }
   4143       VecCost +=
   4144           TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0);
   4145       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
   4146       return ReuseShuffleCost + VecCost - ScalarCost;
   4147     }
   4148     default:
   4149       llvm_unreachable("Unknown instruction");
   4150   }
   4151 }
   4152 
   4153 bool BoUpSLP::isFullyVectorizableTinyTree() const {
   4154   LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
   4155                     << VectorizableTree.size() << " is fully vectorizable .\n");
   4156 
   4157   // We only handle trees of heights 1 and 2.
   4158   if (VectorizableTree.size() == 1 &&
   4159       VectorizableTree[0]->State == TreeEntry::Vectorize)
   4160     return true;
   4161 
   4162   if (VectorizableTree.size() != 2)
   4163     return false;
   4164 
   4165   // Handle splat and all-constants stores. Also try to vectorize tiny trees
   4166   // with the second gather nodes if they have less scalar operands rather than
   4167   // the initial tree element (may be profitable to shuffle the second gather)
   4168   // or they are extractelements, which form shuffle.
   4169   SmallVector<int> Mask;
   4170   if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
   4171       (allConstant(VectorizableTree[1]->Scalars) ||
   4172        isSplat(VectorizableTree[1]->Scalars) ||
   4173        (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
   4174         VectorizableTree[1]->Scalars.size() <
   4175             VectorizableTree[0]->Scalars.size()) ||
   4176        (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
   4177         VectorizableTree[1]->getOpcode() == Instruction::ExtractElement &&
   4178         isShuffle(VectorizableTree[1]->Scalars, Mask))))
   4179     return true;
   4180 
   4181   // Gathering cost would be too much for tiny trees.
   4182   if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
   4183       VectorizableTree[1]->State == TreeEntry::NeedToGather)
   4184     return false;
   4185 
   4186   return true;
   4187 }
   4188 
   4189 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
   4190                                        TargetTransformInfo *TTI,
   4191                                        bool MustMatchOrInst) {
   4192   // Look past the root to find a source value. Arbitrarily follow the
   4193   // path through operand 0 of any 'or'. Also, peek through optional
   4194   // shift-left-by-multiple-of-8-bits.
   4195   Value *ZextLoad = Root;
   4196   const APInt *ShAmtC;
   4197   bool FoundOr = false;
   4198   while (!isa<ConstantExpr>(ZextLoad) &&
   4199          (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
   4200           (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
   4201            ShAmtC->urem(8) == 0))) {
   4202     auto *BinOp = cast<BinaryOperator>(ZextLoad);
   4203     ZextLoad = BinOp->getOperand(0);
   4204     if (BinOp->getOpcode() == Instruction::Or)
   4205       FoundOr = true;
   4206   }
   4207   // Check if the input is an extended load of the required or/shift expression.
   4208   Value *LoadPtr;
   4209   if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
   4210       !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
   4211     return false;
   4212 
   4213   // Require that the total load bit width is a legal integer type.
   4214   // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
   4215   // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
   4216   Type *SrcTy = LoadPtr->getType()->getPointerElementType();
   4217   unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
   4218   if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
   4219     return false;
   4220 
   4221   // Everything matched - assume that we can fold the whole sequence using
   4222   // load combining.
   4223   LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
   4224              << *(cast<Instruction>(Root)) << "\n");
   4225 
   4226   return true;
   4227 }
   4228 
   4229 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
   4230   if (RdxKind != RecurKind::Or)
   4231     return false;
   4232 
   4233   unsigned NumElts = VectorizableTree[0]->Scalars.size();
   4234   Value *FirstReduced = VectorizableTree[0]->Scalars[0];
   4235   return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
   4236                                     /* MatchOr */ false);
   4237 }
   4238 
   4239 bool BoUpSLP::isLoadCombineCandidate() const {
   4240   // Peek through a final sequence of stores and check if all operations are
   4241   // likely to be load-combined.
   4242   unsigned NumElts = VectorizableTree[0]->Scalars.size();
   4243   for (Value *Scalar : VectorizableTree[0]->Scalars) {
   4244     Value *X;
   4245     if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
   4246         !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
   4247       return false;
   4248   }
   4249   return true;
   4250 }
   4251 
   4252 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
   4253   // No need to vectorize inserts of gathered values.
   4254   if (VectorizableTree.size() == 2 &&
   4255       isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
   4256       VectorizableTree[1]->State == TreeEntry::NeedToGather)
   4257     return true;
   4258 
   4259   // We can vectorize the tree if its size is greater than or equal to the
   4260   // minimum size specified by the MinTreeSize command line option.
   4261   if (VectorizableTree.size() >= MinTreeSize)
   4262     return false;
   4263 
   4264   // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
   4265   // can vectorize it if we can prove it fully vectorizable.
   4266   if (isFullyVectorizableTinyTree())
   4267     return false;
   4268 
   4269   assert(VectorizableTree.empty()
   4270              ? ExternalUses.empty()
   4271              : true && "We shouldn't have any external users");
   4272 
   4273   // Otherwise, we can't vectorize the tree. It is both tiny and not fully
   4274   // vectorizable.
   4275   return true;
   4276 }
   4277 
   4278 InstructionCost BoUpSLP::getSpillCost() const {
   4279   // Walk from the bottom of the tree to the top, tracking which values are
   4280   // live. When we see a call instruction that is not part of our tree,
   4281   // query TTI to see if there is a cost to keeping values live over it
   4282   // (for example, if spills and fills are required).
   4283   unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
   4284   InstructionCost Cost = 0;
   4285 
   4286   SmallPtrSet<Instruction*, 4> LiveValues;
   4287   Instruction *PrevInst = nullptr;
   4288 
   4289   // The entries in VectorizableTree are not necessarily ordered by their
   4290   // position in basic blocks. Collect them and order them by dominance so later
   4291   // instructions are guaranteed to be visited first. For instructions in
   4292   // different basic blocks, we only scan to the beginning of the block, so
   4293   // their order does not matter, as long as all instructions in a basic block
   4294   // are grouped together. Using dominance ensures a deterministic order.
   4295   SmallVector<Instruction *, 16> OrderedScalars;
   4296   for (const auto &TEPtr : VectorizableTree) {
   4297     Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
   4298     if (!Inst)
   4299       continue;
   4300     OrderedScalars.push_back(Inst);
   4301   }
   4302   llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) {
   4303     return DT->dominates(B, A);
   4304   });
   4305 
   4306   for (Instruction *Inst : OrderedScalars) {
   4307     if (!PrevInst) {
   4308       PrevInst = Inst;
   4309       continue;
   4310     }
   4311 
   4312     // Update LiveValues.
   4313     LiveValues.erase(PrevInst);
   4314     for (auto &J : PrevInst->operands()) {
   4315       if (isa<Instruction>(&*J) && getTreeEntry(&*J))
   4316         LiveValues.insert(cast<Instruction>(&*J));
   4317     }
   4318 
   4319     LLVM_DEBUG({
   4320       dbgs() << "SLP: #LV: " << LiveValues.size();
   4321       for (auto *X : LiveValues)
   4322         dbgs() << " " << X->getName();
   4323       dbgs() << ", Looking at ";
   4324       Inst->dump();
   4325     });
   4326 
   4327     // Now find the sequence of instructions between PrevInst and Inst.
   4328     unsigned NumCalls = 0;
   4329     BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
   4330                                  PrevInstIt =
   4331                                      PrevInst->getIterator().getReverse();
   4332     while (InstIt != PrevInstIt) {
   4333       if (PrevInstIt == PrevInst->getParent()->rend()) {
   4334         PrevInstIt = Inst->getParent()->rbegin();
   4335         continue;
   4336       }
   4337 
   4338       // Debug information does not impact spill cost.
   4339       if ((isa<CallInst>(&*PrevInstIt) &&
   4340            !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
   4341           &*PrevInstIt != PrevInst)
   4342         NumCalls++;
   4343 
   4344       ++PrevInstIt;
   4345     }
   4346 
   4347     if (NumCalls) {
   4348       SmallVector<Type*, 4> V;
   4349       for (auto *II : LiveValues) {
   4350         auto *ScalarTy = II->getType();
   4351         if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
   4352           ScalarTy = VectorTy->getElementType();
   4353         V.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
   4354       }
   4355       Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
   4356     }
   4357 
   4358     PrevInst = Inst;
   4359   }
   4360 
   4361   return Cost;
   4362 }
   4363 
   4364 InstructionCost BoUpSLP::getTreeCost() {
   4365   InstructionCost Cost = 0;
   4366   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
   4367                     << VectorizableTree.size() << ".\n");
   4368 
   4369   unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
   4370 
   4371   for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
   4372     TreeEntry &TE = *VectorizableTree[I].get();
   4373 
   4374     InstructionCost C = getEntryCost(&TE);
   4375     Cost += C;
   4376     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
   4377                       << " for bundle that starts with " << *TE.Scalars[0]
   4378                       << ".\n"
   4379                       << "SLP: Current total cost = " << Cost << "\n");
   4380   }
   4381 
   4382   SmallPtrSet<Value *, 16> ExtractCostCalculated;
   4383   InstructionCost ExtractCost = 0;
   4384   SmallBitVector IsIdentity;
   4385   SmallVector<unsigned> VF;
   4386   SmallVector<SmallVector<int>> ShuffleMask;
   4387   SmallVector<Value *> FirstUsers;
   4388   SmallVector<APInt> DemandedElts;
   4389   for (ExternalUser &EU : ExternalUses) {
   4390     // We only add extract cost once for the same scalar.
   4391     if (!ExtractCostCalculated.insert(EU.Scalar).second)
   4392       continue;
   4393 
   4394     // Uses by ephemeral values are free (because the ephemeral value will be
   4395     // removed prior to code generation, and so the extraction will be
   4396     // removed as well).
   4397     if (EphValues.count(EU.User))
   4398       continue;
   4399 
   4400     // No extract cost for vector "scalar"
   4401     if (isa<FixedVectorType>(EU.Scalar->getType()))
   4402       continue;
   4403 
   4404     // If found user is an insertelement, do not calculate extract cost but try
   4405     // to detect it as a final shuffled/identity match.
   4406     if (EU.User && isa<InsertElementInst>(EU.User)) {
   4407       if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
   4408         Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
   4409         if (!InsertIdx || *InsertIdx == UndefMaskElem)
   4410           continue;
   4411         Value *VU = EU.User;
   4412         auto *It = find_if(FirstUsers, [VU](Value *V) {
   4413           // Checks if 2 insertelements are from the same buildvector.
   4414           if (VU->getType() != V->getType())
   4415             return false;
   4416           auto *IE1 = cast<InsertElementInst>(VU);
   4417           auto *IE2 = cast<InsertElementInst>(V);
   4418           do {
   4419             if (IE1 == VU || IE2 == V)
   4420               return true;
   4421             if (IE1)
   4422               IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
   4423             if (IE2)
   4424               IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
   4425           } while (IE1 || IE2);
   4426           return false;
   4427         });
   4428         int VecId = -1;
   4429         if (It == FirstUsers.end()) {
   4430           VF.push_back(FTy->getNumElements());
   4431           ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
   4432           FirstUsers.push_back(EU.User);
   4433           DemandedElts.push_back(APInt::getNullValue(VF.back()));
   4434           IsIdentity.push_back(true);
   4435           VecId = FirstUsers.size() - 1;
   4436         } else {
   4437           VecId = std::distance(FirstUsers.begin(), It);
   4438         }
   4439         int Idx = *InsertIdx;
   4440         ShuffleMask[VecId][Idx] = EU.Lane;
   4441         IsIdentity.set(IsIdentity.test(VecId) &
   4442                        (EU.Lane == Idx || EU.Lane == UndefMaskElem));
   4443         DemandedElts[VecId].setBit(Idx);
   4444       }
   4445     }
   4446 
   4447     // If we plan to rewrite the tree in a smaller type, we will need to sign
   4448     // extend the extracted value back to the original type. Here, we account
   4449     // for the extract and the added cost of the sign extend if needed.
   4450     auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
   4451     auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
   4452     if (MinBWs.count(ScalarRoot)) {
   4453       auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
   4454       auto Extend =
   4455           MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
   4456       VecTy = FixedVectorType::get(MinTy, BundleWidth);
   4457       ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
   4458                                                    VecTy, EU.Lane);
   4459     } else {
   4460       ExtractCost +=
   4461           TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
   4462     }
   4463   }
   4464 
   4465   InstructionCost SpillCost = getSpillCost();
   4466   Cost += SpillCost + ExtractCost;
   4467   for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
   4468     if (!IsIdentity.test(I)) {
   4469       InstructionCost C = TTI->getShuffleCost(
   4470           TTI::SK_PermuteSingleSrc,
   4471           cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]);
   4472       LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
   4473                         << " for final shuffle of insertelement external users "
   4474                         << *VectorizableTree.front()->Scalars.front() << ".\n"
   4475                         << "SLP: Current total cost = " << Cost << "\n");
   4476       Cost += C;
   4477     }
   4478     unsigned VF = ShuffleMask[I].size();
   4479     for (int &Mask : ShuffleMask[I])
   4480       Mask = (Mask == UndefMaskElem ? 0 : VF) + Mask;
   4481     InstructionCost C = TTI->getShuffleCost(
   4482         TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()),
   4483         ShuffleMask[I]);
   4484     LLVM_DEBUG(
   4485         dbgs()
   4486         << "SLP: Adding cost " << C
   4487         << " for final shuffle of vector node and external insertelement users "
   4488         << *VectorizableTree.front()->Scalars.front() << ".\n"
   4489         << "SLP: Current total cost = " << Cost << "\n");
   4490     Cost += C;
   4491     InstructionCost InsertCost = TTI->getScalarizationOverhead(
   4492         cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
   4493         /*Insert*/ true,
   4494         /*Extract*/ false);
   4495     Cost -= InsertCost;
   4496     LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
   4497                       << " for insertelements gather.\n"
   4498                       << "SLP: Current total cost = " << Cost << "\n");
   4499   }
   4500 
   4501 #ifndef NDEBUG
   4502   SmallString<256> Str;
   4503   {
   4504     raw_svector_ostream OS(Str);
   4505     OS << "SLP: Spill Cost = " << SpillCost << ".\n"
   4506        << "SLP: Extract Cost = " << ExtractCost << ".\n"
   4507        << "SLP: Total Cost = " << Cost << ".\n";
   4508   }
   4509   LLVM_DEBUG(dbgs() << Str);
   4510   if (ViewSLPTree)
   4511     ViewGraph(this, "SLP" + F->getName(), false, Str);
   4512 #endif
   4513 
   4514   return Cost;
   4515 }
   4516 
   4517 Optional<TargetTransformInfo::ShuffleKind>
   4518 BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
   4519                                SmallVectorImpl<const TreeEntry *> &Entries) {
   4520   Mask.assign(TE->Scalars.size(), UndefMaskElem);
   4521   Entries.clear();
   4522   DenseMap<Value *, const TreeEntry *> UsedValuesEntry;
   4523   unsigned VF = 0;
   4524   // FIXME: Shall be replaced by GetVF function once non-power-2 patch is
   4525   // landed.
   4526   auto &&GetVF = [](const TreeEntry *TE) {
   4527     if (!TE->ReuseShuffleIndices.empty())
   4528       return TE->ReuseShuffleIndices.size();
   4529     return TE->Scalars.size();
   4530   };
   4531   for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
   4532     Value *V = TE->Scalars[I];
   4533     if (isa<UndefValue>(V))
   4534       continue;
   4535     const TreeEntry *VTE = UsedValuesEntry.lookup(V);
   4536     if (!VTE) {
   4537       if (Entries.size() == 2)
   4538         return None;
   4539       VTE = getTreeEntry(V);
   4540       if (!VTE || find_if(
   4541                       VectorizableTree,
   4542                       [VTE, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
   4543                         return EntryPtr.get() == VTE || EntryPtr.get() == TE;
   4544                       })->get() == TE) {
   4545         // Check if it is used in one of the gathered entries.
   4546         const auto *It =
   4547             find_if(VectorizableTree,
   4548                     [V, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
   4549                       return EntryPtr.get() == TE ||
   4550                              (EntryPtr->State == TreeEntry::NeedToGather &&
   4551                               is_contained(EntryPtr->Scalars, V));
   4552                     });
   4553         // The vector factor of shuffled entries must be the same.
   4554         if (It->get() == TE)
   4555           return None;
   4556         VTE = It->get();
   4557       }
   4558       Entries.push_back(VTE);
   4559       if (Entries.size() == 1) {
   4560         VF = GetVF(VTE);
   4561       } else if (VF != GetVF(VTE)) {
   4562         assert(Entries.size() == 2 && "Expected shuffle of 1 or 2 entries.");
   4563         assert(VF > 0 && "Expected non-zero vector factor.");
   4564         return None;
   4565       }
   4566       for (Value *SV : VTE->Scalars)
   4567         UsedValuesEntry.try_emplace(SV, VTE);
   4568     }
   4569     int FoundLane = findLaneForValue(VTE->Scalars, VTE->ReuseShuffleIndices, V);
   4570     Mask[I] = (Entries.front() == VTE ? 0 : VF) + FoundLane;
   4571     // Extra check required by isSingleSourceMaskImpl function (called by
   4572     // ShuffleVectorInst::isSingleSourceMask).
   4573     if (Mask[I] >= 2 * E)
   4574       return None;
   4575   }
   4576   switch (Entries.size()) {
   4577   case 1:
   4578     return TargetTransformInfo::SK_PermuteSingleSrc;
   4579   case 2:
   4580     return TargetTransformInfo::SK_PermuteTwoSrc;
   4581   default:
   4582     break;
   4583   }
   4584   return None;
   4585 }
   4586 
   4587 InstructionCost
   4588 BoUpSLP::getGatherCost(FixedVectorType *Ty,
   4589                        const DenseSet<unsigned> &ShuffledIndices) const {
   4590   unsigned NumElts = Ty->getNumElements();
   4591   APInt DemandedElts = APInt::getNullValue(NumElts);
   4592   for (unsigned I = 0; I < NumElts; ++I)
   4593     if (!ShuffledIndices.count(I))
   4594       DemandedElts.setBit(I);
   4595   InstructionCost Cost =
   4596       TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
   4597                                     /*Extract*/ false);
   4598   if (!ShuffledIndices.empty())
   4599     Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
   4600   return Cost;
   4601 }
   4602 
   4603 InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
   4604   // Find the type of the operands in VL.
   4605   Type *ScalarTy = VL[0]->getType();
   4606   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
   4607     ScalarTy = SI->getValueOperand()->getType();
   4608   auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
   4609   // Find the cost of inserting/extracting values from the vector.
   4610   // Check if the same elements are inserted several times and count them as
   4611   // shuffle candidates.
   4612   DenseSet<unsigned> ShuffledElements;
   4613   DenseSet<Value *> UniqueElements;
   4614   // Iterate in reverse order to consider insert elements with the high cost.
   4615   for (unsigned I = VL.size(); I > 0; --I) {
   4616     unsigned Idx = I - 1;
   4617     if (!UniqueElements.insert(VL[Idx]).second)
   4618       ShuffledElements.insert(Idx);
   4619   }
   4620   return getGatherCost(VecTy, ShuffledElements);
   4621 }
   4622 
   4623 // Perform operand reordering on the instructions in VL and return the reordered
   4624 // operands in Left and Right.
   4625 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
   4626                                              SmallVectorImpl<Value *> &Left,
   4627                                              SmallVectorImpl<Value *> &Right,
   4628                                              const DataLayout &DL,
   4629                                              ScalarEvolution &SE,
   4630                                              const BoUpSLP &R) {
   4631   if (VL.empty())
   4632     return;
   4633   VLOperands Ops(VL, DL, SE, R);
   4634   // Reorder the operands in place.
   4635   Ops.reorder();
   4636   Left = Ops.getVL(0);
   4637   Right = Ops.getVL(1);
   4638 }
   4639 
   4640 void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
   4641   // Get the basic block this bundle is in. All instructions in the bundle
   4642   // should be in this block.
   4643   auto *Front = E->getMainOp();
   4644   auto *BB = Front->getParent();
   4645   assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
   4646     auto *I = cast<Instruction>(V);
   4647     return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
   4648   }));
   4649 
   4650   // The last instruction in the bundle in program order.
   4651   Instruction *LastInst = nullptr;
   4652 
   4653   // Find the last instruction. The common case should be that BB has been
   4654   // scheduled, and the last instruction is VL.back(). So we start with
   4655   // VL.back() and iterate over schedule data until we reach the end of the
   4656   // bundle. The end of the bundle is marked by null ScheduleData.
   4657   if (BlocksSchedules.count(BB)) {
   4658     auto *Bundle =
   4659         BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
   4660     if (Bundle && Bundle->isPartOfBundle())
   4661       for (; Bundle; Bundle = Bundle->NextInBundle)
   4662         if (Bundle->OpValue == Bundle->Inst)
   4663           LastInst = Bundle->Inst;
   4664   }
   4665 
   4666   // LastInst can still be null at this point if there's either not an entry
   4667   // for BB in BlocksSchedules or there's no ScheduleData available for
   4668   // VL.back(). This can be the case if buildTree_rec aborts for various
   4669   // reasons (e.g., the maximum recursion depth is reached, the maximum region
   4670   // size is reached, etc.). ScheduleData is initialized in the scheduling
   4671   // "dry-run".
   4672   //
   4673   // If this happens, we can still find the last instruction by brute force. We
   4674   // iterate forwards from Front (inclusive) until we either see all
   4675   // instructions in the bundle or reach the end of the block. If Front is the
   4676   // last instruction in program order, LastInst will be set to Front, and we
   4677   // will visit all the remaining instructions in the block.
   4678   //
   4679   // One of the reasons we exit early from buildTree_rec is to place an upper
   4680   // bound on compile-time. Thus, taking an additional compile-time hit here is
   4681   // not ideal. However, this should be exceedingly rare since it requires that
   4682   // we both exit early from buildTree_rec and that the bundle be out-of-order
   4683   // (causing us to iterate all the way to the end of the block).
   4684   if (!LastInst) {
   4685     SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
   4686     for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
   4687       if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
   4688         LastInst = &I;
   4689       if (Bundle.empty())
   4690         break;
   4691     }
   4692   }
   4693   assert(LastInst && "Failed to find last instruction in bundle");
   4694 
   4695   // Set the insertion point after the last instruction in the bundle. Set the
   4696   // debug location to Front.
   4697   Builder.SetInsertPoint(BB, ++LastInst->getIterator());
   4698   Builder.SetCurrentDebugLocation(Front->getDebugLoc());
   4699 }
   4700 
   4701 Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
          // Builds a vector from the scalars in VL by inserting them one by one,
          // lane 0 first, into a poison vector at the current insertion point of
          // Builder, and returns the final vector. VL must not be empty: VL[0] is
          // read unconditionally to determine the element type.
          //
          // The element type is the type of VL[0], or the type of the stored value
          // when VL[0] is a store.
   4702   Value *Val0 =
   4703       isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
   4704   FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
   4705   Value *Vec = PoisonValue::get(VecTy);
   4706   unsigned InsIndex = 0;
   4707   for (Value *Val : VL) {
   4708     Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++));
            // The builder may hand back something that is not an InsertElementInst
            // (presumably a folded constant - TODO confirm); there is nothing to
            // record in that case.
   4709     auto *InsElt = dyn_cast<InsertElementInst>(Vec);
   4710     if (!InsElt)
   4711       continue;
            // Remember the new instruction and its block. NOTE(review): GatherSeq
            // and CSEBlocks are consumed outside this chunk, presumably by a later
            // clean-up of gather sequences - confirm.
   4712     GatherSeq.insert(InsElt);
   4713     CSEBlocks.insert(InsElt->getParent());
   4714     // Add to our 'need-to-extract' list.
            // Val also belongs to a tree entry, so the insertelement is registered
            // as an external user of that scalar together with the lane it has in
            // the entry.
   4715     if (TreeEntry *Entry = getTreeEntry(Val)) {
   4716       // Find which lane we need to extract.
   4717       int FoundLane =
   4718           findLaneForValue(Entry->Scalars, Entry->ReuseShuffleIndices, Val);
   4719       ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane));
   4720     }
   4721   }
   4722 
   4723   return Vec;
   4724 }
   4725 
   4726 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
          // Returns a vector that holds the values of VL.
          //
          // If the values share an opcode and the tree entry of the representative
          // value reports isSame(VL), that entry is vectorized through
          // vectorizeTree(TreeEntry *) and its value is returned, possibly shrunk to
          // unique elements (see below). Otherwise the values are gathered with
          // gather(), after removing duplicates when that is worthwhile.
   4727   InstructionsState S = getSameOpcode(VL);
   4728   if (S.getOpcode()) {
   4729     if (TreeEntry *E = getTreeEntry(S.OpValue)) {
   4730       if (E->isSame(VL)) {
   4731         Value *V = vectorizeTree(E);
   4732         if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
   4733           // Reshuffle to get only unique values.
   4734           // If some of the scalars are duplicated in the vectorization tree
   4735           // entry, we do not vectorize them but instead generate a mask for the
   4736           // reuses. But if there are several users of the same entry, they may
   4737           // have different vectorization factors. This is especially important
   4738           // for PHI nodes. In this case, we need to adapt the resulting
   4739           // instruction for the user vectorization factor and have to reshuffle
   4740           // it again to take only unique elements of the vector. Without this
   4741           // code the function incorrectly returns reduced vector instruction
   4742           // with the same elements, not with the unique ones.
   4743           // block:
   4744           // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
   4745           // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1>
   4746           // ... (use %2)
   4747           // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2}
   4748           // br %block
                  // UniqueIdxs collects, in order, the position of the first
                  // occurrence of every distinct value in ReuseShuffleIndices.
   4749           SmallVector<int, 4> UniqueIdxs;
   4750           SmallSet<int, 4> UsedIdxs;
   4751           int Pos = 0;
   4752           for (int Idx : E->ReuseShuffleIndices) {
   4753             if (UsedIdxs.insert(Idx).second)
   4754               UniqueIdxs.emplace_back(Pos);
   4755             ++Pos;
   4756           }
   4757           V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle");
   4758         }
   4759         return V;
   4760       }
   4761     }
   4762   }
   4763 
   4764   // Check that every instruction appears once in this bundle.
          // Only bundles with more than two values are deduplicated.
          // ReuseShuffleIndicies gets, for every value of VL, the position recorded
          // for its first occurrence; UniqueValues gets the values to gather.
   4765   SmallVector<int, 4> ReuseShuffleIndicies;
   4766   SmallVector<Value *, 4> UniqueValues;
   4767   if (VL.size() > 2) {
   4768     DenseMap<Value *, unsigned> UniquePositions;
   4769     for (Value *V : VL) {
   4770       auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
   4771       ReuseShuffleIndicies.emplace_back(Res.first->second);
              // NOTE(review): a repeated constant is appended to UniqueValues
              // again, while its reuse index above still refers to the first
              // position - confirm this is intended.
   4772       if (Res.second || isa<Constant>(V))
   4773         UniqueValues.emplace_back(V);
   4774     }
   4775     // Do not shuffle single element or if number of unique values is not power
   4776     // of 2.
   4777     if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
   4778         !llvm::isPowerOf2_32(UniqueValues.size()))
   4779       ReuseShuffleIndicies.clear();
   4780     else
              // VL now refers to the local UniqueValues, which stays alive until the
              // gather below has used it.
   4781       VL = UniqueValues;
   4782   }
   4783 
   4784   Value *Vec = gather(VL);
          // A non-empty reuse mask means duplicates were dropped before gathering;
          // shuffle the gathered vector back to the original number of lanes and
          // record the shuffle like the other gather instructions.
   4785   if (!ReuseShuffleIndicies.empty()) {
   4786     Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle");
   4787     if (auto *I = dyn_cast<Instruction>(Vec)) {
   4788       GatherSeq.insert(I);
   4789       CSEBlocks.insert(I->getParent());
   4790     }
   4791   }
   4792   return Vec;
   4793 }
   4794 
   4795 namespace {
   4796 /// Merges shuffle masks and emits final shuffle instruction, if required.
        ///
        /// Usage as seen in this file: add zero or more masks with addMask() /
        /// addInversedMask(), then call finalize() once to obtain the (possibly
        /// shuffled) value. The destructor asserts that a non-empty mask was not
        /// left unconsumed.
   4797 class ShuffleInstructionBuilder {
          // Builder used by finalize() to create the shuffle.
   4798   IRBuilderBase &Builder;
          // Set by finalize(); only read by the assertion in the destructor.
   4799   bool IsFinalized = false;
          // Mask accumulated so far. Empty means that no shuffle is required.
   4800   SmallVector<int, 4> Mask;
   4801 
   4802 public:
   4803   ShuffleInstructionBuilder(IRBuilderBase &Builder) : Builder(Builder) {}
   4804 
   4805   /// Adds a mask, inverting it before applying.
          /// The inverse is produced by inversePermutation() and merged through
          /// addMask(). An empty SubMask is ignored.
   4806   void addInversedMask(ArrayRef<unsigned> SubMask) {
   4807     if (SubMask.empty())
   4808       return;
   4809     SmallVector<int, 4> NewMask;
   4810     inversePermutation(SubMask, NewMask);
   4811     addMask(NewMask);
   4812   }
   4813 
   4814   /// Functions adds masks, merging them into  single one.
          /// This overload converts the unsigned indices to int and forwards to the
          /// ArrayRef<int> overload.
   4815   void addMask(ArrayRef<unsigned> SubMask) {
   4816     SmallVector<int, 4> NewMask(SubMask.begin(), SubMask.end());
   4817     addMask(NewMask);
   4818   }
   4819 
          /// Merges SubMask into the accumulated mask.
          ///
          /// An empty SubMask is ignored and the first non-empty one is stored as
          /// is. For every later SubMask the new mask has SubMask.size() lanes, and
          /// lane I becomes Mask[SubMask[I]]. If SubMask[I] or the looked-up value
          /// is not below min(Mask.size(), SubMask.size()), lane I is set to
          /// SubMask.size() instead.
          ///
          /// NOTE(review): SubMask.size() is out of range for a one-input shuffle
          /// of that width; presumably it selects a lane of the implicit second
          /// shuffle operand - confirm.
          /// NOTE(review): a negative SubMask[I] passes the range check and would
          /// be used as an index into Mask; confirm that callers never pass
          /// negative elements here.
   4820   void addMask(ArrayRef<int> SubMask) {
   4821     if (SubMask.empty())
   4822       return;
   4823     if (Mask.empty()) {
   4824       Mask.append(SubMask.begin(), SubMask.end());
   4825       return;
   4826     }
            // Every lane starts out as the out-of-range value SubMask.size().
   4827     SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size());
   4828     int TermValue = std::min(Mask.size(), SubMask.size());
   4829     for (int I = 0, E = SubMask.size(); I < E; ++I) {
   4830       if (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue) {
   4831         NewMask[I] = E;
   4832         continue;
   4833       }
   4834       NewMask[I] = Mask[SubMask[I]];
   4835     }
   4836     Mask.swap(NewMask);
   4837   }
   4838 
          /// Marks the builder as finalized. Returns V unchanged when no mask was
          /// accumulated, otherwise a shuffle of V with the accumulated mask.
   4839   Value *finalize(Value *V) {
   4840     IsFinalized = true;
   4841     if (Mask.empty())
   4842       return V;
   4843     return Builder.CreateShuffleVector(V, Mask, "shuffle");
   4844   }
   4845 
          /// Asserts that an accumulated mask was consumed by finalize().
   4846   ~ShuffleInstructionBuilder() {
   4847     assert((IsFinalized || Mask.empty()) &&
   4848            "Shuffle construction must be finalized.");
   4849   }
   4850 };
   4851 } // namespace
   4852 
   4853 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
          // Emits the vector code for tree entry E and returns the resulting value.
          //
          // The value is stored in E->VectorizedValue; an entry that already has
          // one is returned as is and nothing is emitted. Operand entries are
          // vectorized through recursive calls. Gather entries are materialized
          // with gather() or with a shuffle of other entries; entries in state
          // Vectorize or ScatterVectorize are dispatched on their opcode in the
          // switch below.
          //
          // The guard restores the insertion point of Builder when this call
          // returns, so the SetInsertPoint calls below do not leak to the caller.
   4854   IRBuilder<>::InsertPointGuard Guard(Builder);
   4855 
   4856   if (E->VectorizedValue) {
   4857     LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
   4858     return E->VectorizedValue;
   4859   }
   4860 
   4861   ShuffleInstructionBuilder ShuffleBuilder(Builder);
   4862   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
   4863   if (E->State == TreeEntry::NeedToGather) {
   4864     setInsertPointAfterBundle(E);
   4865     Value *Vec;
   4866     SmallVector<int> Mask;
   4867     SmallVector<const TreeEntry *> Entries;
   4868     Optional<TargetTransformInfo::ShuffleKind> Shuffle =
   4869         isGatherShuffledEntry(E, Mask, Entries);
            // If the query succeeds, the gather is expressed as a shuffle of the
            // one or two entries returned in Entries (with one entry, front() and
            // back() are the same vector). Their VectorizedValue is read directly,
            // so they are assumed to be vectorized already - TODO confirm.
   4870     if (Shuffle.hasValue()) {
   4871       assert((Entries.size() == 1 || Entries.size() == 2) &&
   4872              "Expected shuffle of 1 or 2 entries.");
   4873       Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
   4874                                         Entries.back()->VectorizedValue, Mask);
   4875     } else {
   4876       Vec = gather(E->Scalars);
   4877     }
            // Apply the reuse mask and record the resulting shuffle the same way
            // gather() records its insertelements.
   4878     if (NeedToShuffleReuses) {
   4879       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
   4880       Vec = ShuffleBuilder.finalize(Vec);
   4881       if (auto *I = dyn_cast<Instruction>(Vec)) {
   4882         GatherSeq.insert(I);
   4883         CSEBlocks.insert(I->getParent());
   4884       }
   4885     }
   4886     E->VectorizedValue = Vec;
   4887     return Vec;
   4888   }
   4889 
   4890   assert((E->State == TreeEntry::Vectorize ||
   4891           E->State == TreeEntry::ScatterVectorize) &&
   4892          "Unhandled state");
          // Alternate-opcode entries are handled by the ShuffleVector case, all
          // other entries by the case of their main opcode.
   4893   unsigned ShuffleOrOp =
   4894       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
   4895   Instruction *VL0 = E->getMainOp();
          // For stores and insertelements the element type is the type of the
          // stored / inserted scalar rather than the type of VL0 itself.
   4896   Type *ScalarTy = VL0->getType();
   4897   if (auto *Store = dyn_cast<StoreInst>(VL0))
   4898     ScalarTy = Store->getValueOperand()->getType();
   4899   else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
   4900     ScalarTy = IE->getOperand(1)->getType();
   4901   auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
   4902   switch (ShuffleOrOp) {
   4903     case Instruction::PHI: {
              // The vector PHI is created before the first non-PHI instruction of
              // the block and starts without incoming values.
   4904       auto *PH = cast<PHINode>(VL0);
   4905       Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
   4906       Builder.SetCurrentDebugLocation(PH->getDebugLoc());
   4907       PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
   4908       Value *V = NewPhi;
   4909       if (NeedToShuffleReuses)
   4910         V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
   4911 
              // The result is published before the incoming values are vectorized,
              // so a recursive call that reaches E again takes the early return at
              // the top. Presumably required for PHIs that are part of a cycle -
              // TODO confirm.
   4912       E->VectorizedValue = V;
   4913 
   4914       // PHINodes may have multiple entries from the same block. We want to
   4915       // visit every block once.
   4916       SmallPtrSet<BasicBlock*, 4> VisitedBBs;
   4917 
   4918       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
   4919         ValueList Operands;
   4920         BasicBlock *IBB = PH->getIncomingBlock(i);
   4921 
                // Repeated predecessor: reuse the vector that was already added
                // for this block.
   4922         if (!VisitedBBs.insert(IBB).second) {
   4923           NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
   4924           continue;
   4925         }
   4926 
                // Operand i is vectorized in its incoming block, right before the
                // terminator.
   4927         Builder.SetInsertPoint(IBB->getTerminator());
   4928         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
   4929         Value *Vec = vectorizeTree(E->getOperand(i));
   4930         NewPhi->addIncoming(Vec, IBB);
   4931       }
   4932 
   4933       assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
   4934              "Invalid number of incoming values");
   4935       return V;
   4936     }
   4937 
   4938     case Instruction::ExtractElement: {
              // No new vector is built. The result is the value returned by
              // E->getSingleOperand(0) - presumably the vector that all scalars
              // extract from, TODO confirm - shuffled by the inverse reorder mask
              // and the reuse mask when those are set.
   4939       Value *V = E->getSingleOperand(0);
   4940       Builder.SetInsertPoint(VL0);
   4941       ShuffleBuilder.addInversedMask(E->ReorderIndices);
   4942       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
   4943       V = ShuffleBuilder.finalize(V);
   4944       E->VectorizedValue = V;
   4945       return V;
   4946     }
   4947     case Instruction::ExtractValue: {
              // The single operand must be a load (the cast asserts otherwise). A
              // load of VecTy from the same address is emitted right before it,
              // with the alignment of the original load and through a pointer
              // bitcast in the same address space.
   4948       auto *LI = cast<LoadInst>(E->getSingleOperand(0));
   4949       Builder.SetInsertPoint(LI);
   4950       auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
   4951       Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
   4952       LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
   4953       Value *NewV = propagateMetadata(V, E->Scalars);
   4954       ShuffleBuilder.addInversedMask(E->ReorderIndices);
   4955       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
   4956       NewV = ShuffleBuilder.finalize(NewV);
   4957       E->VectorizedValue = NewV;
   4958       return NewV;
   4959     }
   4960     case Instruction::InsertElement: {
              // V is the vector form of operand 1 of the entry (the inserted
              // scalars). NumElts is the width of the vector type of VL0 and
              // NumScalars the number of insertelements in this entry.
   4961       Builder.SetInsertPoint(VL0);
   4962       Value *V = vectorizeTree(E->getOperand(1));
   4963 
   4964       const unsigned NumElts =
   4965           cast<FixedVectorType>(VL0->getType())->getNumElements();
   4966       const unsigned NumScalars = E->Scalars.size();
   4967 
   4968       // Create InsertVector shuffle if necessary
              // One pass over the scalars computes:
              //  - FirstInsert: the first scalar whose vector operand (operand 0)
              //    is not itself one of the scalars of this entry;
              //  - Offset: the smallest insert index seen;
              //  - IsIdentity: whether scalar I inserts at index Offset + I.
              // Scalars without a known insert index are skipped.
   4969       Instruction *FirstInsert = nullptr;
   4970       bool IsIdentity = true;
   4971       unsigned Offset = UINT_MAX;
   4972       for (unsigned I = 0; I < NumScalars; ++I) {
   4973         Value *Scalar = E->Scalars[I];
   4974         if (!FirstInsert &&
   4975             !is_contained(E->Scalars, cast<Instruction>(Scalar)->getOperand(0)))
   4976           FirstInsert = cast<Instruction>(Scalar);
   4977         Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
   4978         if (!InsertIdx || *InsertIdx == UndefMaskElem)
   4979           continue;
   4980         unsigned Idx = *InsertIdx;
   4981         if (Idx < Offset) {
   4982           Offset = Idx;
   4983           IsIdentity &= I == 0;
   4984         } else {
   4985           assert(Idx >= Offset && "Failed to find vector index offset");
   4986           IsIdentity &= Idx - Offset == I;
   4987         }
   4988       }
   4989       assert(Offset < NumElts && "Failed to find vector index offset");
   4990 
   4991       // Create shuffle to resize vector
              // Mask has NumElts lanes. Lane (insert index - Offset) names the lane
              // of V that provides the element; lanes never written stay undefined.
   4992       SmallVector<int> Mask(NumElts, UndefMaskElem);
   4993       if (!IsIdentity) {
   4994         for (unsigned I = 0; I < NumScalars; ++I) {
   4995           Value *Scalar = E->Scalars[I];
   4996           Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
   4997           if (!InsertIdx || *InsertIdx == UndefMaskElem)
   4998             continue;
   4999           Mask[*InsertIdx - Offset] = I;
   5000         }
   5001       } else {
   5002         std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
   5003       }
   5004       if (!IsIdentity || NumElts != NumScalars)
   5005         V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask);
   5006 
              // The bundle does not fill the whole destination vector: blend it
              // into operand 0 of FirstInsert. InsertMask starts as the identity
              // over that vector (indices below NumElts) and redirects every lane
              // provided by the bundle to the second shuffle input (NumElts + I).
   5007       if (NumElts != NumScalars) {
   5008         SmallVector<int> InsertMask(NumElts);
   5009         std::iota(InsertMask.begin(), InsertMask.end(), 0);
   5010         for (unsigned I = 0; I < NumElts; I++) {
   5011           if (Mask[I] != UndefMaskElem)
   5012             InsertMask[Offset + I] = NumElts + I;
   5013         }
   5014 
   5015         V = Builder.CreateShuffleVector(
   5016             FirstInsert->getOperand(0), V, InsertMask,
   5017             cast<Instruction>(E->Scalars.back())->getName());
   5018       }
   5019 
   5020       ++NumVectorInstructions;
   5021       E->VectorizedValue = V;
   5022       return V;
   5023     }
   5024     case Instruction::ZExt:
   5025     case Instruction::SExt:
   5026     case Instruction::FPToUI:
   5027     case Instruction::FPToSI:
   5028     case Instruction::FPExt:
   5029     case Instruction::PtrToInt:
   5030     case Instruction::IntToPtr:
   5031     case Instruction::SIToFP:
   5032     case Instruction::UIToFP:
   5033     case Instruction::Trunc:
   5034     case Instruction::FPTrunc:
   5035     case Instruction::BitCast: {
   5036       setInsertPointAfterBundle(E);
   5037 
   5038       Value *InVec = vectorizeTree(E->getOperand(0));
   5039 
              // E may have received its vector value while the operand was being
              // vectorized; reuse it instead of emitting a second cast. The same
              // check follows the operand vectorization in several cases below.
   5040       if (E->VectorizedValue) {
   5041         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
   5042         return E->VectorizedValue;
   5043       }
   5044 
   5045       auto *CI = cast<CastInst>(VL0);
   5046       Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
   5047       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
   5048       V = ShuffleBuilder.finalize(V);
   5049 
   5050       E->VectorizedValue = V;
   5051       ++NumVectorInstructions;
   5052       return V;
   5053     }
   5054     case Instruction::FCmp:
   5055     case Instruction::ICmp: {
   5056       setInsertPointAfterBundle(E);
   5057 
   5058       Value *L = vectorizeTree(E->getOperand(0));
   5059       Value *R = vectorizeTree(E->getOperand(1));
   5060 
   5061       if (E->VectorizedValue) {
   5062         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
   5063         return E->VectorizedValue;
   5064       }
   5065 
              // The predicate of VL0 is used for the whole vector compare.
   5066       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
   5067       Value *V = Builder.CreateCmp(P0, L, R);
   5068       propagateIRFlags(V, E->Scalars, VL0);
   5069       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
   5070       V = ShuffleBuilder.finalize(V);
   5071 
   5072       E->VectorizedValue = V;
   5073       ++NumVectorInstructions;
   5074       return V;
   5075     }
   5076     case Instruction::Select: {
   5077       setInsertPointAfterBundle(E);
   5078 
   5079       Value *Cond = vectorizeTree(E->getOperand(0));
   5080       Value *True = vectorizeTree(E->getOperand(1));
   5081       Value *False = vectorizeTree(E->getOperand(2));
   5082 
   5083       if (E->VectorizedValue) {
   5084         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
   5085         return E->VectorizedValue;
   5086       }
   5087 
   5088       Value *V = Builder.CreateSelect(Cond, True, False);
   5089       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
   5090       V = ShuffleBuilder.finalize(V);
   5091 
   5092       E->VectorizedValue = V;
   5093       ++NumVectorInstructions;
   5094       return V;
   5095     }
   5096     case Instruction::FNeg: {
   5097       setInsertPointAfterBundle(E);
   5098 
   5099       Value *Op = vectorizeTree(E->getOperand(0));
   5100 
   5101       if (E->VectorizedValue) {
   5102         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
   5103         return E->VectorizedValue;
   5104       }
   5105 
   5106       Value *V = Builder.CreateUnOp(
   5107           static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
   5108       propagateIRFlags(V, E->Scalars, VL0);
              // Metadata is only propagated when the builder returned an
              // instruction (it may return something else, presumably a folded
              // constant - TODO confirm).
   5109       if (auto *I = dyn_cast<Instruction>(V))
   5110         V = propagateMetadata(I, E->Scalars);
   5111 
   5112       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
   5113       V = ShuffleBuilder.finalize(V);
   5114 
   5115       E->VectorizedValue = V;
   5116       ++NumVectorInstructions;
   5117 
   5118       return V;
   5119     }
   5120     case Instruction::Add:
   5121     case Instruction::FAdd:
   5122     case Instruction::Sub:
   5123     case Instruction::FSub:
   5124     case Instruction::Mul:
   5125     case Instruction::FMul:
   5126     case Instruction::UDiv:
   5127     case Instruction::SDiv:
   5128     case Instruction::FDiv:
   5129     case Instruction::URem:
   5130     case Instruction::SRem:
   5131     case Instruction::FRem:
   5132     case Instruction::Shl:
   5133     case Instruction::LShr:
   5134     case Instruction::AShr:
   5135     case Instruction::And:
   5136     case Instruction::Or:
   5137     case Instruction::Xor: {
   5138       setInsertPointAfterBundle(E);
   5139 
   5140       Value *LHS = vectorizeTree(E->getOperand(0));
   5141       Value *RHS = vectorizeTree(E->getOperand(1));
   5142 
   5143       if (E->VectorizedValue) {
   5144         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
   5145         return E->VectorizedValue;
   5146       }
   5147 
   5148       Value *V = Builder.CreateBinOp(
   5149           static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
   5150           RHS);
   5151       propagateIRFlags(V, E->Scalars, VL0);
   5152       if (auto *I = dyn_cast<Instruction>(V))
   5153         V = propagateMetadata(I, E->Scalars);
   5154 
   5155       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
   5156       V = ShuffleBuilder.finalize(V);
   5157 
   5158       E->VectorizedValue = V;
   5159       ++NumVectorInstructions;
   5160 
   5161       return V;
   5162     }
   5163     case Instruction::Load: {
   5164       // Loads are inserted at the head of the tree because we don't want to
   5165       // sink them all the way down past store instructions.
              // updateStateIfReorder() may change the main op of the entry, so VL0
              // is refreshed when it reports a reorder.
   5166       bool IsReorder = E->updateStateIfReorder();
   5167       if (IsReorder)
   5168         VL0 = E->getMainOp();
   5169       setInsertPointAfterBundle(E);
   5170 
   5171       LoadInst *LI = cast<LoadInst>(VL0);
   5172       Instruction *NewLI;
   5173       unsigned AS = LI->getPointerAddressSpace();
   5174       Value *PO = LI->getPointerOperand();
              // Vectorize state: a single load of VecTy through the pointer operand
              // of LI, bitcast to a vector pointer, with the alignment of LI.
   5175       if (E->State == TreeEntry::Vectorize) {
   5176 
   5177         Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
   5178 
   5179         // The pointer operand uses an in-tree scalar so we add the new BitCast
   5180         // to ExternalUses list to make sure that an extract will be generated
   5181         // in the future.
   5182         if (getTreeEntry(PO))
   5183           ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0);
   5184 
   5185         NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
   5186       } else {
                // ScatterVectorize state: the pointers are vectorized as operand 0
                // of the entry and the load becomes a masked gather.
   5187         assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
   5188         Value *VecPtr = vectorizeTree(E->getOperand(0));
   5189         // Use the minimum alignment of the gathered loads.
   5190         Align CommonAlignment = LI->getAlign();
   5191         for (Value *V : E->Scalars)
   5192           CommonAlignment =
   5193               commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
   5194         NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment);
   5195       }
   5196       Value *V = propagateMetadata(NewLI, E->Scalars);
   5197 
   5198       ShuffleBuilder.addInversedMask(E->ReorderIndices);
   5199       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
   5200       V = ShuffleBuilder.finalize(V);
   5201       E->VectorizedValue = V;
   5202       ++NumVectorInstructions;
   5203       return V;
   5204     }
   5205     case Instruction::Store: {
              // With a reorder, the store that provides the address is the scalar
              // at index ReorderIndices.front(); otherwise it is VL0.
   5206       bool IsReorder = !E->ReorderIndices.empty();
   5207       auto *SI = cast<StoreInst>(
   5208           IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
   5209       unsigned AS = SI->getPointerAddressSpace();
   5210 
   5211       setInsertPointAfterBundle(E);
   5212 
              // ReorderIndices is applied to the stored value as is, whereas the
              // load and extract cases apply its inverse.
   5213       Value *VecValue = vectorizeTree(E->getOperand(0));
   5214       ShuffleBuilder.addMask(E->ReorderIndices);
   5215       VecValue = ShuffleBuilder.finalize(VecValue);
   5216 
   5217       Value *ScalarPtr = SI->getPointerOperand();
   5218       Value *VecPtr = Builder.CreateBitCast(
   5219           ScalarPtr, VecValue->getType()->getPointerTo(AS));
   5220       StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
   5221                                                  SI->getAlign());
   5222 
   5223       // The pointer operand uses an in-tree scalar, so add the new BitCast to
   5224       // ExternalUses to make sure that an extract will be generated in the
   5225       // future.
   5226       if (getTreeEntry(ScalarPtr))
   5227         ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
   5228 
   5229       Value *V = propagateMetadata(ST, E->Scalars);
   5230 
   5231       E->VectorizedValue = V;
   5232       ++NumVectorInstructions;
   5233       return V;
   5234     }
   5235     case Instruction::GetElementPtr: {
   5236       setInsertPointAfterBundle(E);
   5237 
              // Operand 0 is vectorized as a tree operand; the remaining operands
              // are handled one by one in the loop below.
   5238       Value *Op0 = vectorizeTree(E->getOperand(0));
   5239 
   5240       std::vector<Value *> OpVecs;
   5241       for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
   5242            ++j) {
   5243         ValueList &VL = E->getOperand(j);
   5244         // Need to cast all elements to the same type before vectorization to
   5245         // avoid crash.
                // The common type is the type used by VL0 when all values already
                // have it, otherwise the index type that the DataLayout gives for
                // the pointer operand type of VL0.
   5246         Type *VL0Ty = VL0->getOperand(j)->getType();
   5247         Type *Ty = llvm::all_of(
   5248                        VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); })
   5249                        ? VL0Ty
   5250                        : DL->getIndexType(cast<GetElementPtrInst>(VL0)
   5251                                               ->getPointerOperandType()
   5252                                               ->getScalarType());
                // Every value must be a ConstantInt (the cast asserts otherwise).
                // The constants are rewritten in place in the operand list of E;
                // values with the sign bit set are cast as signed.
   5253         for (Value *&V : VL) {
   5254           auto *CI = cast<ConstantInt>(V);
   5255           V = ConstantExpr::getIntegerCast(CI, Ty,
   5256                                            CI->getValue().isSignBitSet());
   5257         }
   5258         Value *OpVec = vectorizeTree(VL);
   5259         OpVecs.push_back(OpVec);
   5260       }
   5261 
   5262       Value *V = Builder.CreateGEP(
   5263           cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
   5264       if (Instruction *I = dyn_cast<Instruction>(V))
   5265         V = propagateMetadata(I, E->Scalars);
   5266 
   5267       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
   5268       V = ShuffleBuilder.finalize(V);
   5269 
   5270       E->VectorizedValue = V;
   5271       ++NumVectorInstructions;
   5272 
   5273       return V;
   5274     }
   5275     case Instruction::Call: {
   5276       CallInst *CI = cast<CallInst>(VL0);
   5277       setInsertPointAfterBundle(E);
   5278 
              // IID is the intrinsic ID of the scalar callee and stays
              // not_intrinsic when getCalledFunction() returns null. ID is the
              // result of getVectorIntrinsicIDForCall for this call.
   5279       Intrinsic::ID IID  = Intrinsic::not_intrinsic;
   5280       if (Function *FI = CI->getCalledFunction())
   5281         IID = FI->getIntrinsicID();
   5282 
   5283       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   5284 
              // The intrinsic form is used only when ID is a real intrinsic and
              // VecCallCosts.first does not exceed VecCallCosts.second.
              // NOTE(review): presumably first is the intrinsic cost and second the
              // vector library call cost - confirm in getVectorCallCosts.
   5285       auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
   5286       bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
   5287                           VecCallCosts.first <= VecCallCosts.second;
   5288 
   5289       Value *ScalarArg = nullptr;
   5290       std::vector<Value *> OpVecs;
   5291       for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
   5292         ValueList OpVL;
   5293         // Some intrinsics have scalar arguments. This argument should not be
   5294         // vectorized.
                // CEI is the same instruction as CI; the argument of VL0 is passed
                // through unchanged and remembered in ScalarArg.
   5295         if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
   5296           CallInst *CEI = cast<CallInst>(VL0);
   5297           ScalarArg = CEI->getArgOperand(j);
   5298           OpVecs.push_back(CEI->getArgOperand(j));
   5299           continue;
   5300         }
   5301 
   5302         Value *OpVec = vectorizeTree(E->getOperand(j));
   5303         LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
   5304         OpVecs.push_back(OpVec);
   5305       }
   5306 
              // Without an intrinsic, the callee is looked up in VFDatabase for a
              // fixed-width shape with VecTy->getNumElements() lanes; with one, the
              // intrinsic is declared for the vector of the call result type.
              // NOTE(review): CF is passed to CreateCall without a null check -
              // presumably the cost query above guarantees that a mapping exists;
              // confirm.
   5307       Function *CF;
   5308       if (!UseIntrinsic) {
   5309         VFShape Shape =
   5310             VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
   5311                                   VecTy->getNumElements())),
   5312                          false /*HasGlobalPred*/);
   5313         CF = VFDatabase(*CI).getVectorizedFunction(Shape);
   5314       } else {
   5315         Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
   5316         CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
   5317       }
   5318 
              // The operand bundles of the scalar call are copied to the new call.
   5319       SmallVector<OperandBundleDef, 1> OpBundles;
   5320       CI->getOperandBundlesAsDefs(OpBundles);
   5321       Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
   5322 
   5323       // The scalar argument uses an in-tree scalar so we add the new vectorized
   5324       // call to ExternalUses list to make sure that an extract will be
   5325       // generated in the future.
   5326       if (ScalarArg && getTreeEntry(ScalarArg))
   5327         ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
   5328 
   5329       propagateIRFlags(V, E->Scalars, VL0);
   5330       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
   5331       V = ShuffleBuilder.finalize(V);
   5332 
   5333       E->VectorizedValue = V;
   5334       ++NumVectorInstructions;
   5335       return V;
   5336     }
   5337     case Instruction::ShuffleVector: {
              // Alternate-opcode entry. Both opcodes are applied to the complete
              // operand vectors (V0 with the main opcode, V1 with the alternate
              // one) and a shuffle then selects, lane by lane, the result that
              // matches the opcode of the corresponding scalar. Only pairs of
              // binary operators or pairs of casts are accepted.
   5338       assert(E->isAltShuffle() &&
   5339              ((Instruction::isBinaryOp(E->getOpcode()) &&
   5340                Instruction::isBinaryOp(E->getAltOpcode())) ||
   5341               (Instruction::isCast(E->getOpcode()) &&
   5342                Instruction::isCast(E->getAltOpcode()))) &&
   5343              "Invalid Shuffle Vector Operand");
   5344 
   5345       Value *LHS = nullptr, *RHS = nullptr;
   5346       if (Instruction::isBinaryOp(E->getOpcode())) {
   5347         setInsertPointAfterBundle(E);
   5348         LHS = vectorizeTree(E->getOperand(0));
   5349         RHS = vectorizeTree(E->getOperand(1));
   5350       } else {
   5351         setInsertPointAfterBundle(E);
   5352         LHS = vectorizeTree(E->getOperand(0));
   5353       }
   5354 
   5355       if (E->VectorizedValue) {
   5356         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
   5357         return E->VectorizedValue;
   5358       }
   5359 
   5360       Value *V0, *V1;
   5361       if (Instruction::isBinaryOp(E->getOpcode())) {
   5362         V0 = Builder.CreateBinOp(
   5363             static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
   5364         V1 = Builder.CreateBinOp(
   5365             static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
   5366       } else {
   5367         V0 = Builder.CreateCast(
   5368             static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
   5369         V1 = Builder.CreateCast(
   5370             static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
   5371       }
   5372 
   5373       // Create shuffle to take alternate operations from the vector.
   5374       // Also, gather up main and alt scalar ops to propagate IR flags to
   5375       // each vector operation.
              // Lane i of the result comes from V1 (mask value e + i) when scalar i
              // has the alternate opcode, otherwise from V0 (mask value i).
   5376       ValueList OpScalars, AltScalars;
   5377       unsigned e = E->Scalars.size();
   5378       SmallVector<int, 8> Mask(e);
   5379       for (unsigned i = 0; i < e; ++i) {
   5380         auto *OpInst = cast<Instruction>(E->Scalars[i]);
   5381         assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
   5382         if (OpInst->getOpcode() == E->getAltOpcode()) {
   5383           Mask[i] = e + i;
   5384           AltScalars.push_back(E->Scalars[i]);
   5385         } else {
   5386           Mask[i] = i;
   5387           OpScalars.push_back(E->Scalars[i]);
   5388         }
   5389       }
   5390 
   5391       propagateIRFlags(V0, OpScalars);
   5392       propagateIRFlags(V1, AltScalars);
   5393 
   5394       Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
   5395       if (Instruction *I = dyn_cast<Instruction>(V))
   5396         V = propagateMetadata(I, E->Scalars);
   5397       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
   5398       V = ShuffleBuilder.finalize(V);
   5399 
   5400       E->VectorizedValue = V;
   5401       ++NumVectorInstructions;
   5402 
   5403       return V;
   5404     }
   5405     default:
   5406     llvm_unreachable("unknown inst");
   5407   }
   5408   return nullptr;
   5409 }
   5410 
   5411 Value *BoUpSLP::vectorizeTree() {
          // Convenience overload: forwards to the overload that takes a map of
          // externally used values, passing a default-constructed map, and returns
          // its result.
   5412   ExtraValueToDebugLocsMap ExternallyUsedValues;
   5413   return vectorizeTree(ExternallyUsedValues);
   5414 }
   5415 
   5416 Value *
   5417 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   5418   // All blocks must be scheduled before any instructions are inserted.
   5419   for (auto &BSIter : BlocksSchedules) {
   5420     scheduleBlock(BSIter.second.get());
   5421   }
   5422 
   5423   Builder.SetInsertPoint(&F->getEntryBlock().front());
   5424   auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
   5425 
   5426   // If the vectorized tree can be rewritten in a smaller type, we truncate the
   5427   // vectorized root. InstCombine will then rewrite the entire expression. We
   5428   // sign extend the extracted values below.
   5429   auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
   5430   if (MinBWs.count(ScalarRoot)) {
   5431     if (auto *I = dyn_cast<Instruction>(VectorRoot)) {
   5432       // If current instr is a phi and not the last phi, insert it after the
   5433       // last phi node.
   5434       if (isa<PHINode>(I))
   5435         Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt());
   5436       else
   5437         Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
   5438     }
   5439     auto BundleWidth = VectorizableTree[0]->Scalars.size();
   5440     auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
   5441     auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
   5442     auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
   5443     VectorizableTree[0]->VectorizedValue = Trunc;
   5444   }
   5445 
   5446   LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
   5447                     << " values .\n");
   5448 
   5449   // Extract all of the elements with the external uses.
   5450   for (const auto &ExternalUse : ExternalUses) {
   5451     Value *Scalar = ExternalUse.Scalar;
   5452     llvm::User *User = ExternalUse.User;
   5453 
   5454     // Skip users that we already RAUW. This happens when one instruction
   5455     // has multiple uses of the same value.
   5456     if (User && !is_contained(Scalar->users(), User))
   5457       continue;
   5458     TreeEntry *E = getTreeEntry(Scalar);
   5459     assert(E && "Invalid scalar");
   5460     assert(E->State != TreeEntry::NeedToGather &&
   5461            "Extracting from a gather list");
   5462 
   5463     Value *Vec = E->VectorizedValue;
   5464     assert(Vec && "Can't find vectorizable value");
   5465 
   5466     Value *Lane = Builder.getInt32(ExternalUse.Lane);
   5467     auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
   5468       if (Scalar->getType() != Vec->getType()) {
   5469         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
   5470         // If necessary, sign-extend or zero-extend ScalarRoot
   5471         // to the larger type.
   5472         if (!MinBWs.count(ScalarRoot))
   5473           return Ex;
   5474         if (MinBWs[ScalarRoot].second)
   5475           return Builder.CreateSExt(Ex, Scalar->getType());
   5476         return Builder.CreateZExt(Ex, Scalar->getType());
   5477       } else {
   5478         assert(isa<FixedVectorType>(Scalar->getType()) &&
   5479                isa<InsertElementInst>(Scalar) &&
   5480                "In-tree scalar of vector type is not insertelement?");
   5481         return Vec;
   5482       }
   5483     };
   5484     // If User == nullptr, the Scalar is used as extra arg. Generate
   5485     // ExtractElement instruction and update the record for this scalar in
   5486     // ExternallyUsedValues.
   5487     if (!User) {
   5488       assert(ExternallyUsedValues.count(Scalar) &&
   5489              "Scalar with nullptr as an external user must be registered in "
   5490              "ExternallyUsedValues map");
   5491       if (auto *VecI = dyn_cast<Instruction>(Vec)) {
   5492         Builder.SetInsertPoint(VecI->getParent(),
   5493                                std::next(VecI->getIterator()));
   5494       } else {
   5495         Builder.SetInsertPoint(&F->getEntryBlock().front());
   5496       }
   5497       Value *NewInst = ExtractAndExtendIfNeeded(Vec);
   5498       CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
   5499       auto &Locs = ExternallyUsedValues[Scalar];
   5500       ExternallyUsedValues.insert({NewInst, Locs});
   5501       ExternallyUsedValues.erase(Scalar);
   5502       // Required to update internally referenced instructions.
   5503       Scalar->replaceAllUsesWith(NewInst);
   5504       continue;
   5505     }
   5506 
   5507     // Generate extracts for out-of-tree users.
   5508     // Find the insertion point for the extractelement lane.
   5509     if (auto *VecI = dyn_cast<Instruction>(Vec)) {
   5510       if (PHINode *PH = dyn_cast<PHINode>(User)) {
   5511         for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
   5512           if (PH->getIncomingValue(i) == Scalar) {
   5513             Instruction *IncomingTerminator =
   5514                 PH->getIncomingBlock(i)->getTerminator();
   5515             if (isa<CatchSwitchInst>(IncomingTerminator)) {
   5516               Builder.SetInsertPoint(VecI->getParent(),
   5517                                      std::next(VecI->getIterator()));
   5518             } else {
   5519               Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
   5520             }
   5521             Value *NewInst = ExtractAndExtendIfNeeded(Vec);
   5522             CSEBlocks.insert(PH->getIncomingBlock(i));
   5523             PH->setOperand(i, NewInst);
   5524           }
   5525         }
   5526       } else {
   5527         Builder.SetInsertPoint(cast<Instruction>(User));
   5528         Value *NewInst = ExtractAndExtendIfNeeded(Vec);
   5529         CSEBlocks.insert(cast<Instruction>(User)->getParent());
   5530         User->replaceUsesOfWith(Scalar, NewInst);
   5531       }
   5532     } else {
   5533       Builder.SetInsertPoint(&F->getEntryBlock().front());
   5534       Value *NewInst = ExtractAndExtendIfNeeded(Vec);
   5535       CSEBlocks.insert(&F->getEntryBlock());
   5536       User->replaceUsesOfWith(Scalar, NewInst);
   5537     }
   5538 
   5539     LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
   5540   }
   5541 
   5542   // For each vectorized value:
   5543   for (auto &TEPtr : VectorizableTree) {
   5544     TreeEntry *Entry = TEPtr.get();
   5545 
   5546     // No need to handle users of gathered values.
   5547     if (Entry->State == TreeEntry::NeedToGather)
   5548       continue;
   5549 
   5550     assert(Entry->VectorizedValue && "Can't find vectorizable value");
   5551 
   5552     // For each lane:
   5553     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
   5554       Value *Scalar = Entry->Scalars[Lane];
   5555 
   5556 #ifndef NDEBUG
   5557       Type *Ty = Scalar->getType();
   5558       if (!Ty->isVoidTy()) {
   5559         for (User *U : Scalar->users()) {
   5560           LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
   5561 
   5562           // It is legal to delete users in the ignorelist.
   5563           assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
   5564                  "Deleting out-of-tree value");
   5565         }
   5566       }
   5567 #endif
   5568       LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
   5569       eraseInstruction(cast<Instruction>(Scalar));
   5570     }
   5571   }
   5572 
   5573   Builder.ClearInsertionPoint();
   5574   InstrElementSize.clear();
   5575 
   5576   return VectorizableTree[0]->VectorizedValue;
   5577 }
   5578 
   5579 void BoUpSLP::optimizeGatherSequence() {
   5580   LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
   5581                     << " gather sequences instructions.\n");
   5582   // LICM InsertElementInst sequences.
   5583   for (Instruction *I : GatherSeq) {
   5584     if (isDeleted(I))
   5585       continue;
   5586 
   5587     // Check if this block is inside a loop.
   5588     Loop *L = LI->getLoopFor(I->getParent());
   5589     if (!L)
   5590       continue;
   5591 
   5592     // Check if it has a preheader.
   5593     BasicBlock *PreHeader = L->getLoopPreheader();
   5594     if (!PreHeader)
   5595       continue;
   5596 
   5597     // If the vector or the element that we insert into it are
   5598     // instructions that are defined in this basic block then we can't
   5599     // hoist this instruction.
   5600     auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
   5601     auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
   5602     if (Op0 && L->contains(Op0))
   5603       continue;
   5604     if (Op1 && L->contains(Op1))
   5605       continue;
   5606 
   5607     // We can hoist this instruction. Move it to the pre-header.
   5608     I->moveBefore(PreHeader->getTerminator());
   5609   }
   5610 
   5611   // Make a list of all reachable blocks in our CSE queue.
   5612   SmallVector<const DomTreeNode *, 8> CSEWorkList;
   5613   CSEWorkList.reserve(CSEBlocks.size());
   5614   for (BasicBlock *BB : CSEBlocks)
   5615     if (DomTreeNode *N = DT->getNode(BB)) {
   5616       assert(DT->isReachableFromEntry(N));
   5617       CSEWorkList.push_back(N);
   5618     }
   5619 
   5620   // Sort blocks by domination. This ensures we visit a block after all blocks
   5621   // dominating it are visited.
   5622   llvm::stable_sort(CSEWorkList,
   5623                     [this](const DomTreeNode *A, const DomTreeNode *B) {
   5624                       return DT->properlyDominates(A, B);
   5625                     });
   5626 
   5627   // Perform O(N^2) search over the gather sequences and merge identical
   5628   // instructions. TODO: We can further optimize this scan if we split the
   5629   // instructions into different buckets based on the insert lane.
   5630   SmallVector<Instruction *, 16> Visited;
   5631   for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
   5632     assert(*I &&
   5633            (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
   5634            "Worklist not sorted properly!");
   5635     BasicBlock *BB = (*I)->getBlock();
   5636     // For all instructions in blocks containing gather sequences:
   5637     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
   5638       Instruction *In = &*it++;
   5639       if (isDeleted(In))
   5640         continue;
   5641       if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
   5642         continue;
   5643 
   5644       // Check if we can replace this instruction with any of the
   5645       // visited instructions.
   5646       for (Instruction *v : Visited) {
   5647         if (In->isIdenticalTo(v) &&
   5648             DT->dominates(v->getParent(), In->getParent())) {
   5649           In->replaceAllUsesWith(v);
   5650           eraseInstruction(In);
   5651           In = nullptr;
   5652           break;
   5653         }
   5654       }
   5655       if (In) {
   5656         assert(!is_contained(Visited, In));
   5657         Visited.push_back(In);
   5658       }
   5659     }
   5660   }
   5661   CSEBlocks.clear();
   5662   GatherSeq.clear();
   5663 }
   5664 
   5665 // Groups the instructions to a bundle (which is then a single scheduling entity)
   5666 // and schedules instructions until the bundle gets ready.
   5667 Optional<BoUpSLP::ScheduleData *>
   5668 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
   5669                                             const InstructionsState &S) {
   5670   if (isa<PHINode>(S.OpValue))
   5671     return nullptr;
   5672 
   5673   // Initialize the instruction bundle.
   5674   Instruction *OldScheduleEnd = ScheduleEnd;
   5675   ScheduleData *PrevInBundle = nullptr;
   5676   ScheduleData *Bundle = nullptr;
   5677   bool ReSchedule = false;
   5678   LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.OpValue << "\n");
   5679 
   5680   auto &&TryScheduleBundle = [this, OldScheduleEnd, SLP](bool ReSchedule,
   5681                                                          ScheduleData *Bundle) {
   5682     // The scheduling region got new instructions at the lower end (or it is a
   5683     // new region for the first bundle). This makes it necessary to
   5684     // recalculate all dependencies.
   5685     // It is seldom that this needs to be done a second time after adding the
   5686     // initial bundle to the region.
   5687     if (ScheduleEnd != OldScheduleEnd) {
   5688       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
   5689         doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
   5690       ReSchedule = true;
   5691     }
   5692     if (ReSchedule) {
   5693       resetSchedule();
   5694       initialFillReadyList(ReadyInsts);
   5695     }
   5696     if (Bundle) {
   5697       LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
   5698                         << " in block " << BB->getName() << "\n");
   5699       calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
   5700     }
   5701 
   5702     // Now try to schedule the new bundle or (if no bundle) just calculate
   5703     // dependencies. As soon as the bundle is "ready" it means that there are no
   5704     // cyclic dependencies and we can schedule it. Note that's important that we
   5705     // don't "schedule" the bundle yet (see cancelScheduling).
   5706     while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
   5707            !ReadyInsts.empty()) {
   5708       ScheduleData *Picked = ReadyInsts.pop_back_val();
   5709       if (Picked->isSchedulingEntity() && Picked->isReady())
   5710         schedule(Picked, ReadyInsts);
   5711     }
   5712   };
   5713 
   5714   // Make sure that the scheduling region contains all
   5715   // instructions of the bundle.
   5716   for (Value *V : VL) {
   5717     if (!extendSchedulingRegion(V, S)) {
   5718       // If the scheduling region got new instructions at the lower end (or it
   5719       // is a new region for the first bundle). This makes it necessary to
   5720       // recalculate all dependencies.
   5721       // Otherwise the compiler may crash trying to incorrectly calculate
   5722       // dependencies and emit instruction in the wrong order at the actual
   5723       // scheduling.
   5724       TryScheduleBundle(/*ReSchedule=*/false, nullptr);
   5725       return None;
   5726     }
   5727   }
   5728 
   5729   for (Value *V : VL) {
   5730     ScheduleData *BundleMember = getScheduleData(V);
   5731     assert(BundleMember &&
   5732            "no ScheduleData for bundle member (maybe not in same basic block)");
   5733     if (BundleMember->IsScheduled) {
   5734       // A bundle member was scheduled as single instruction before and now
   5735       // needs to be scheduled as part of the bundle. We just get rid of the
   5736       // existing schedule.
   5737       LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
   5738                         << " was already scheduled\n");
   5739       ReSchedule = true;
   5740     }
   5741     assert(BundleMember->isSchedulingEntity() &&
   5742            "bundle member already part of other bundle");
   5743     if (PrevInBundle) {
   5744       PrevInBundle->NextInBundle = BundleMember;
   5745     } else {
   5746       Bundle = BundleMember;
   5747     }
   5748     BundleMember->UnscheduledDepsInBundle = 0;
   5749     Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
   5750 
   5751     // Group the instructions to a bundle.
   5752     BundleMember->FirstInBundle = Bundle;
   5753     PrevInBundle = BundleMember;
   5754   }
   5755   assert(Bundle && "Failed to find schedule bundle");
   5756   TryScheduleBundle(ReSchedule, Bundle);
   5757   if (!Bundle->isReady()) {
   5758     cancelScheduling(VL, S.OpValue);
   5759     return None;
   5760   }
   5761   return Bundle;
   5762 }
   5763 
   5764 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
   5765                                                 Value *OpValue) {
   5766   if (isa<PHINode>(OpValue))
   5767     return;
   5768 
   5769   ScheduleData *Bundle = getScheduleData(OpValue);
   5770   LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
   5771   assert(!Bundle->IsScheduled &&
   5772          "Can't cancel bundle which is already scheduled");
   5773   assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
   5774          "tried to unbundle something which is not a bundle");
   5775 
   5776   // Un-bundle: make single instructions out of the bundle.
   5777   ScheduleData *BundleMember = Bundle;
   5778   while (BundleMember) {
   5779     assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
   5780     BundleMember->FirstInBundle = BundleMember;
   5781     ScheduleData *Next = BundleMember->NextInBundle;
   5782     BundleMember->NextInBundle = nullptr;
   5783     BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
   5784     if (BundleMember->UnscheduledDepsInBundle == 0) {
   5785       ReadyInsts.insert(BundleMember);
   5786     }
   5787     BundleMember = Next;
   5788   }
   5789 }
   5790 
   5791 BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
   5792   // Allocate a new ScheduleData for the instruction.
   5793   if (ChunkPos >= ChunkSize) {
   5794     ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
   5795     ChunkPos = 0;
   5796   }
   5797   return &(ScheduleDataChunks.back()[ChunkPos++]);
   5798 }
   5799 
   5800 bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
   5801                                                       const InstructionsState &S) {
   5802   if (getScheduleData(V, isOneOf(S, V)))
   5803     return true;
   5804   Instruction *I = dyn_cast<Instruction>(V);
   5805   assert(I && "bundle member must be an instruction");
   5806   assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
   5807   auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
   5808     ScheduleData *ISD = getScheduleData(I);
   5809     if (!ISD)
   5810       return false;
   5811     assert(isInSchedulingRegion(ISD) &&
   5812            "ScheduleData not in scheduling region");
   5813     ScheduleData *SD = allocateScheduleDataChunks();
   5814     SD->Inst = I;
   5815     SD->init(SchedulingRegionID, S.OpValue);
   5816     ExtraScheduleDataMap[I][S.OpValue] = SD;
   5817     return true;
   5818   };
   5819   if (CheckSheduleForI(I))
   5820     return true;
   5821   if (!ScheduleStart) {
   5822     // It's the first instruction in the new region.
   5823     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
   5824     ScheduleStart = I;
   5825     ScheduleEnd = I->getNextNode();
   5826     if (isOneOf(S, I) != I)
   5827       CheckSheduleForI(I);
   5828     assert(ScheduleEnd && "tried to vectorize a terminator?");
   5829     LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
   5830     return true;
   5831   }
   5832   // Search up and down at the same time, because we don't know if the new
   5833   // instruction is above or below the existing scheduling region.
   5834   BasicBlock::reverse_iterator UpIter =
   5835       ++ScheduleStart->getIterator().getReverse();
   5836   BasicBlock::reverse_iterator UpperEnd = BB->rend();
   5837   BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
   5838   BasicBlock::iterator LowerEnd = BB->end();
   5839   while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
   5840          &*DownIter != I) {
   5841     if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
   5842       LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
   5843       return false;
   5844     }
   5845 
   5846     ++UpIter;
   5847     ++DownIter;
   5848   }
   5849   if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
   5850     assert(I->getParent() == ScheduleStart->getParent() &&
   5851            "Instruction is in wrong basic block.");
   5852     initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
   5853     ScheduleStart = I;
   5854     if (isOneOf(S, I) != I)
   5855       CheckSheduleForI(I);
   5856     LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
   5857                       << "\n");
   5858     return true;
   5859   }
   5860   assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
   5861          "Expected to reach top of the basic block or instruction down the "
   5862          "lower end.");
   5863   assert(I->getParent() == ScheduleEnd->getParent() &&
   5864          "Instruction is in wrong basic block.");
   5865   initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
   5866                    nullptr);
   5867   ScheduleEnd = I->getNextNode();
   5868   if (isOneOf(S, I) != I)
   5869     CheckSheduleForI(I);
   5870   assert(ScheduleEnd && "tried to vectorize a terminator?");
   5871   LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
   5872   return true;
   5873 }
   5874 
   5875 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
   5876                                                 Instruction *ToI,
   5877                                                 ScheduleData *PrevLoadStore,
   5878                                                 ScheduleData *NextLoadStore) {
   5879   ScheduleData *CurrentLoadStore = PrevLoadStore;
   5880   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
   5881     ScheduleData *SD = ScheduleDataMap[I];
   5882     if (!SD) {
   5883       SD = allocateScheduleDataChunks();
   5884       ScheduleDataMap[I] = SD;
   5885       SD->Inst = I;
   5886     }
   5887     assert(!isInSchedulingRegion(SD) &&
   5888            "new ScheduleData already in scheduling region");
   5889     SD->init(SchedulingRegionID, I);
   5890 
   5891     if (I->mayReadOrWriteMemory() &&
   5892         (!isa<IntrinsicInst>(I) ||
   5893          (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
   5894           cast<IntrinsicInst>(I)->getIntrinsicID() !=
   5895               Intrinsic::pseudoprobe))) {
   5896       // Update the linked list of memory accessing instructions.
   5897       if (CurrentLoadStore) {
   5898         CurrentLoadStore->NextLoadStore = SD;
   5899       } else {
   5900         FirstLoadStoreInRegion = SD;
   5901       }
   5902       CurrentLoadStore = SD;
   5903     }
   5904   }
   5905   if (NextLoadStore) {
   5906     if (CurrentLoadStore)
   5907       CurrentLoadStore->NextLoadStore = NextLoadStore;
   5908   } else {
   5909     LastLoadStoreInRegion = CurrentLoadStore;
   5910   }
   5911 }
   5912 
   5913 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
   5914                                                      bool InsertInReadyList,
   5915                                                      BoUpSLP *SLP) {
   5916   assert(SD->isSchedulingEntity());
   5917 
   5918   SmallVector<ScheduleData *, 10> WorkList;
   5919   WorkList.push_back(SD);
   5920 
   5921   while (!WorkList.empty()) {
   5922     ScheduleData *SD = WorkList.pop_back_val();
   5923 
   5924     ScheduleData *BundleMember = SD;
   5925     while (BundleMember) {
   5926       assert(isInSchedulingRegion(BundleMember));
   5927       if (!BundleMember->hasValidDependencies()) {
   5928 
   5929         LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember
   5930                           << "\n");
   5931         BundleMember->Dependencies = 0;
   5932         BundleMember->resetUnscheduledDeps();
   5933 
   5934         // Handle def-use chain dependencies.
   5935         if (BundleMember->OpValue != BundleMember->Inst) {
   5936           ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
   5937           if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
   5938             BundleMember->Dependencies++;
   5939             ScheduleData *DestBundle = UseSD->FirstInBundle;
   5940             if (!DestBundle->IsScheduled)
   5941               BundleMember->incrementUnscheduledDeps(1);
   5942             if (!DestBundle->hasValidDependencies())
   5943               WorkList.push_back(DestBundle);
   5944           }
   5945         } else {
   5946           for (User *U : BundleMember->Inst->users()) {
   5947             if (isa<Instruction>(U)) {
   5948               ScheduleData *UseSD = getScheduleData(U);
   5949               if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle) &&
   5950                   // Ignore inner deps for insertelement
   5951                   !(UseSD->FirstInBundle == SD &&
   5952                     isa<InsertElementInst>(BundleMember->Inst))) {
   5953                 BundleMember->Dependencies++;
   5954                 ScheduleData *DestBundle = UseSD->FirstInBundle;
   5955                 if (!DestBundle->IsScheduled)
   5956                   BundleMember->incrementUnscheduledDeps(1);
   5957                 if (!DestBundle->hasValidDependencies())
   5958                   WorkList.push_back(DestBundle);
   5959               }
   5960             } else {
   5961               // I'm not sure if this can ever happen. But we need to be safe.
   5962               // This lets the instruction/bundle never be scheduled and
   5963               // eventually disable vectorization.
   5964               BundleMember->Dependencies++;
   5965               BundleMember->incrementUnscheduledDeps(1);
   5966             }
   5967           }
   5968         }
   5969 
   5970         // Handle the memory dependencies.
   5971         ScheduleData *DepDest = BundleMember->NextLoadStore;
   5972         if (DepDest) {
   5973           Instruction *SrcInst = BundleMember->Inst;
   5974           MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
   5975           bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
   5976           unsigned numAliased = 0;
   5977           unsigned DistToSrc = 1;
   5978 
   5979           while (DepDest) {
   5980             assert(isInSchedulingRegion(DepDest));
   5981 
   5982             // We have two limits to reduce the complexity:
   5983             // 1) AliasedCheckLimit: It's a small limit to reduce calls to
   5984             //    SLP->isAliased (which is the expensive part in this loop).
   5985             // 2) MaxMemDepDistance: It's for very large blocks and it aborts
   5986             //    the whole loop (even if the loop is fast, it's quadratic).
   5987             //    It's important for the loop break condition (see below) to
   5988             //    check this limit even between two read-only instructions.
   5989             if (DistToSrc >= MaxMemDepDistance ||
   5990                     ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
   5991                      (numAliased >= AliasedCheckLimit ||
   5992                       SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
   5993 
   5994               // We increment the counter only if the locations are aliased
   5995               // (instead of counting all alias checks). This gives a better
   5996               // balance between reduced runtime and accurate dependencies.
   5997               numAliased++;
   5998 
   5999               DepDest->MemoryDependencies.push_back(BundleMember);
   6000               BundleMember->Dependencies++;
   6001               ScheduleData *DestBundle = DepDest->FirstInBundle;
   6002               if (!DestBundle->IsScheduled) {
   6003                 BundleMember->incrementUnscheduledDeps(1);
   6004               }
   6005               if (!DestBundle->hasValidDependencies()) {
   6006                 WorkList.push_back(DestBundle);
   6007               }
   6008             }
   6009             DepDest = DepDest->NextLoadStore;
   6010 
   6011             // Example, explaining the loop break condition: Let's assume our
   6012             // starting instruction is i0 and MaxMemDepDistance = 3.
   6013             //
   6014             //                      +--------v--v--v
   6015             //             i0,i1,i2,i3,i4,i5,i6,i7,i8
   6016             //             +--------^--^--^
   6017             //
   6018             // MaxMemDepDistance let us stop alias-checking at i3 and we add
   6019             // dependencies from i0 to i3,i4,.. (even if they are not aliased).
   6020             // Previously we already added dependencies from i3 to i6,i7,i8
   6021             // (because of MaxMemDepDistance). As we added a dependency from
   6022             // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
   6023             // and we can abort this loop at i6.
   6024             if (DistToSrc >= 2 * MaxMemDepDistance)
   6025               break;
   6026             DistToSrc++;
   6027           }
   6028         }
   6029       }
   6030       BundleMember = BundleMember->NextInBundle;
   6031     }
   6032     if (InsertInReadyList && SD->isReady()) {
   6033       ReadyInsts.push_back(SD);
   6034       LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst
   6035                         << "\n");
   6036     }
   6037   }
   6038 }
   6039 
   6040 void BoUpSLP::BlockScheduling::resetSchedule() {
   6041   assert(ScheduleStart &&
   6042          "tried to reset schedule on block which has not been scheduled");
   6043   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
   6044     doForAllOpcodes(I, [&](ScheduleData *SD) {
   6045       assert(isInSchedulingRegion(SD) &&
   6046              "ScheduleData not in scheduling region");
   6047       SD->IsScheduled = false;
   6048       SD->resetUnscheduledDeps();
   6049     });
   6050   }
   6051   ReadyInsts.clear();
   6052 }
   6053 
   6054 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   6055   if (!BS->ScheduleStart)
   6056     return;
   6057 
   6058   LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
   6059 
   6060   BS->resetSchedule();
   6061 
   6062   // For the real scheduling we use a more sophisticated ready-list: it is
   6063   // sorted by the original instruction location. This lets the final schedule
   6064   // be as  close as possible to the original instruction order.
   6065   struct ScheduleDataCompare {
   6066     bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
   6067       return SD2->SchedulingPriority < SD1->SchedulingPriority;
   6068     }
   6069   };
   6070   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
   6071 
   6072   // Ensure that all dependency data is updated and fill the ready-list with
   6073   // initial instructions.
   6074   int Idx = 0;
   6075   int NumToSchedule = 0;
   6076   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
   6077        I = I->getNextNode()) {
   6078     BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
   6079       assert(SD->isPartOfBundle() ==
   6080                  (getTreeEntry(SD->Inst) != nullptr) &&
   6081              "scheduler and vectorizer bundle mismatch");
   6082       SD->FirstInBundle->SchedulingPriority = Idx++;
   6083       if (SD->isSchedulingEntity()) {
   6084         BS->calculateDependencies(SD, false, this);
   6085         NumToSchedule++;
   6086       }
   6087     });
   6088   }
   6089   BS->initialFillReadyList(ReadyInsts);
   6090 
   6091   Instruction *LastScheduledInst = BS->ScheduleEnd;
   6092 
   6093   // Do the "real" scheduling.
   6094   while (!ReadyInsts.empty()) {
   6095     ScheduleData *picked = *ReadyInsts.begin();
   6096     ReadyInsts.erase(ReadyInsts.begin());
   6097 
   6098     // Move the scheduled instruction(s) to their dedicated places, if not
   6099     // there yet.
   6100     ScheduleData *BundleMember = picked;
   6101     while (BundleMember) {
   6102       Instruction *pickedInst = BundleMember->Inst;
   6103       if (LastScheduledInst->getNextNode() != pickedInst) {
   6104         BS->BB->getInstList().remove(pickedInst);
   6105         BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
   6106                                      pickedInst);
   6107       }
   6108       LastScheduledInst = pickedInst;
   6109       BundleMember = BundleMember->NextInBundle;
   6110     }
   6111 
   6112     BS->schedule(picked, ReadyInsts);
   6113     NumToSchedule--;
   6114   }
   6115   assert(NumToSchedule == 0 && "could not schedule all instructions");
   6116 
   6117   // Avoid duplicate scheduling of the block.
   6118   BS->ScheduleStart = nullptr;
   6119 }
   6120 
   6121 unsigned BoUpSLP::getVectorElementSize(Value *V) {
   6122   // If V is a store, just return the width of the stored value (or value
   6123   // truncated just before storing) without traversing the expression tree.
   6124   // This is the common case.
   6125   if (auto *Store = dyn_cast<StoreInst>(V)) {
   6126     if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
   6127       return DL->getTypeSizeInBits(Trunc->getSrcTy());
   6128     return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
   6129   }
   6130 
   6131   if (auto *IEI = dyn_cast<InsertElementInst>(V))
   6132     return getVectorElementSize(IEI->getOperand(1));
   6133 
   6134   auto E = InstrElementSize.find(V);
   6135   if (E != InstrElementSize.end())
   6136     return E->second;
   6137 
   6138   // If V is not a store, we can traverse the expression tree to find loads
   6139   // that feed it. The type of the loaded value may indicate a more suitable
   6140   // width than V's type. We want to base the vector element size on the width
   6141   // of memory operations where possible.
   6142   SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
   6143   SmallPtrSet<Instruction *, 16> Visited;
   6144   if (auto *I = dyn_cast<Instruction>(V)) {
   6145     Worklist.emplace_back(I, I->getParent());
   6146     Visited.insert(I);
   6147   }
   6148 
   6149   // Traverse the expression tree in bottom-up order looking for loads. If we
   6150   // encounter an instruction we don't yet handle, we give up.
   6151   auto Width = 0u;
   6152   while (!Worklist.empty()) {
   6153     Instruction *I;
   6154     BasicBlock *Parent;
   6155     std::tie(I, Parent) = Worklist.pop_back_val();
   6156 
   6157     // We should only be looking at scalar instructions here. If the current
   6158     // instruction has a vector type, skip.
   6159     auto *Ty = I->getType();
   6160     if (isa<VectorType>(Ty))
   6161       continue;
   6162 
   6163     // If the current instruction is a load, update MaxWidth to reflect the
   6164     // width of the loaded value.
   6165     if (isa<LoadInst>(I) || isa<ExtractElementInst>(I) ||
   6166         isa<ExtractValueInst>(I))
   6167       Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
   6168 
   6169     // Otherwise, we need to visit the operands of the instruction. We only
   6170     // handle the interesting cases from buildTree here. If an operand is an
   6171     // instruction we haven't yet visited and from the same basic block as the
   6172     // user or the use is a PHI node, we add it to the worklist.
   6173     else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
   6174              isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I) ||
   6175              isa<UnaryOperator>(I)) {
   6176       for (Use &U : I->operands())
   6177         if (auto *J = dyn_cast<Instruction>(U.get()))
   6178           if (Visited.insert(J).second &&
   6179               (isa<PHINode>(I) || J->getParent() == Parent))
   6180             Worklist.emplace_back(J, J->getParent());
   6181     } else {
   6182       break;
   6183     }
   6184   }
   6185 
   6186   // If we didn't encounter a memory access in the expression tree, or if we
   6187   // gave up for some reason, just return the width of V. Otherwise, return the
   6188   // maximum width we found.
   6189   if (!Width) {
   6190     if (auto *CI = dyn_cast<CmpInst>(V))
   6191       V = CI->getOperand(0);
   6192     Width = DL->getTypeSizeInBits(V->getType());
   6193   }
   6194 
   6195   for (Instruction *I : Visited)
   6196     InstrElementSize[I] = Width;
   6197 
   6198   return Width;
   6199 }
   6200 
   6201 // Determine if a value V in a vectorizable expression Expr can be demoted to a
   6202 // smaller type with a truncation. We collect the values that will be demoted
   6203 // in ToDemote and additional roots that require investigating in Roots.
   6204 static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
   6205                                   SmallVectorImpl<Value *> &ToDemote,
   6206                                   SmallVectorImpl<Value *> &Roots) {
   6207   // We can always demote constants.
   6208   if (isa<Constant>(V)) {
   6209     ToDemote.push_back(V);
   6210     return true;
   6211   }
   6212 
   6213   // If the value is not an instruction in the expression with only one use, it
   6214   // cannot be demoted.
   6215   auto *I = dyn_cast<Instruction>(V);
   6216   if (!I || !I->hasOneUse() || !Expr.count(I))
   6217     return false;
   6218 
   6219   switch (I->getOpcode()) {
   6220 
   6221   // We can always demote truncations and extensions. Since truncations can
   6222   // seed additional demotion, we save the truncated value.
   6223   case Instruction::Trunc:
   6224     Roots.push_back(I->getOperand(0));
   6225     break;
   6226   case Instruction::ZExt:
   6227   case Instruction::SExt:
   6228     if (isa<ExtractElementInst>(I->getOperand(0)) ||
   6229         isa<InsertElementInst>(I->getOperand(0)))
   6230       return false;
   6231     break;
   6232 
   6233   // We can demote certain binary operations if we can demote both of their
   6234   // operands.
   6235   case Instruction::Add:
   6236   case Instruction::Sub:
   6237   case Instruction::Mul:
   6238   case Instruction::And:
   6239   case Instruction::Or:
   6240   case Instruction::Xor:
   6241     if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
   6242         !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
   6243       return false;
   6244     break;
   6245 
   6246   // We can demote selects if we can demote their true and false values.
   6247   case Instruction::Select: {
   6248     SelectInst *SI = cast<SelectInst>(I);
   6249     if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
   6250         !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
   6251       return false;
   6252     break;
   6253   }
   6254 
   6255   // We can demote phis if we can demote all their incoming operands. Note that
   6256   // we don't need to worry about cycles since we ensure single use above.
   6257   case Instruction::PHI: {
   6258     PHINode *PN = cast<PHINode>(I);
   6259     for (Value *IncValue : PN->incoming_values())
   6260       if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
   6261         return false;
   6262     break;
   6263   }
   6264 
   6265   // Otherwise, conservatively give up.
   6266   default:
   6267     return false;
   6268   }
   6269 
   6270   // Record the value that we can demote.
   6271   ToDemote.push_back(V);
   6272   return true;
   6273 }
   6274 
   6275 void BoUpSLP::computeMinimumValueSizes() {
   6276   // If there are no external uses, the expression tree must be rooted by a
   6277   // store. We can't demote in-memory values, so there is nothing to do here.
   6278   if (ExternalUses.empty())
   6279     return;
   6280 
   6281   // We only attempt to truncate integer expressions.
   6282   auto &TreeRoot = VectorizableTree[0]->Scalars;
   6283   auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
   6284   if (!TreeRootIT)
   6285     return;
   6286 
   6287   // If the expression is not rooted by a store, these roots should have
   6288   // external uses. We will rely on InstCombine to rewrite the expression in
   6289   // the narrower type. However, InstCombine only rewrites single-use values.
   6290   // This means that if a tree entry other than a root is used externally, it
   6291   // must have multiple uses and InstCombine will not rewrite it. The code
   6292   // below ensures that only the roots are used externally.
   6293   SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
   6294   for (auto &EU : ExternalUses)
   6295     if (!Expr.erase(EU.Scalar))
   6296       return;
   6297   if (!Expr.empty())
   6298     return;
   6299 
   6300   // Collect the scalar values of the vectorizable expression. We will use this
   6301   // context to determine which values can be demoted. If we see a truncation,
   6302   // we mark it as seeding another demotion.
   6303   for (auto &EntryPtr : VectorizableTree)
   6304     Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
   6305 
   6306   // Ensure the roots of the vectorizable tree don't form a cycle. They must
   6307   // have a single external user that is not in the vectorizable tree.
   6308   for (auto *Root : TreeRoot)
   6309     if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
   6310       return;
   6311 
   6312   // Conservatively determine if we can actually truncate the roots of the
   6313   // expression. Collect the values that can be demoted in ToDemote and
   6314   // additional roots that require investigating in Roots.
   6315   SmallVector<Value *, 32> ToDemote;
   6316   SmallVector<Value *, 4> Roots;
   6317   for (auto *Root : TreeRoot)
   6318     if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
   6319       return;
   6320 
   6321   // The maximum bit width required to represent all the values that can be
   6322   // demoted without loss of precision. It would be safe to truncate the roots
   6323   // of the expression to this width.
   6324   auto MaxBitWidth = 8u;
   6325 
   6326   // We first check if all the bits of the roots are demanded. If they're not,
   6327   // we can truncate the roots to this narrower type.
   6328   for (auto *Root : TreeRoot) {
   6329     auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
   6330     MaxBitWidth = std::max<unsigned>(
   6331         Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
   6332   }
   6333 
   6334   // True if the roots can be zero-extended back to their original type, rather
   6335   // than sign-extended. We know that if the leading bits are not demanded, we
   6336   // can safely zero-extend. So we initialize IsKnownPositive to True.
   6337   bool IsKnownPositive = true;
   6338 
   6339   // If all the bits of the roots are demanded, we can try a little harder to
   6340   // compute a narrower type. This can happen, for example, if the roots are
   6341   // getelementptr indices. InstCombine promotes these indices to the pointer
   6342   // width. Thus, all their bits are technically demanded even though the
   6343   // address computation might be vectorized in a smaller type.
   6344   //
   6345   // We start by looking at each entry that can be demoted. We compute the
   6346   // maximum bit width required to store the scalar by using ValueTracking to
   6347   // compute the number of high-order bits we can truncate.
   6348   if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
   6349       llvm::all_of(TreeRoot, [](Value *R) {
   6350         assert(R->hasOneUse() && "Root should have only one use!");
   6351         return isa<GetElementPtrInst>(R->user_back());
   6352       })) {
   6353     MaxBitWidth = 8u;
   6354 
   6355     // Determine if the sign bit of all the roots is known to be zero. If not,
   6356     // IsKnownPositive is set to False.
   6357     IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
   6358       KnownBits Known = computeKnownBits(R, *DL);
   6359       return Known.isNonNegative();
   6360     });
   6361 
   6362     // Determine the maximum number of bits required to store the scalar
   6363     // values.
   6364     for (auto *Scalar : ToDemote) {
   6365       auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
   6366       auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
   6367       MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
   6368     }
   6369 
   6370     // If we can't prove that the sign bit is zero, we must add one to the
   6371     // maximum bit width to account for the unknown sign bit. This preserves
   6372     // the existing sign bit so we can safely sign-extend the root back to the
   6373     // original type. Otherwise, if we know the sign bit is zero, we will
   6374     // zero-extend the root instead.
   6375     //
   6376     // FIXME: This is somewhat suboptimal, as there will be cases where adding
   6377     //        one to the maximum bit width will yield a larger-than-necessary
   6378     //        type. In general, we need to add an extra bit only if we can't
   6379     //        prove that the upper bit of the original type is equal to the
   6380     //        upper bit of the proposed smaller type. If these two bits are the
   6381     //        same (either zero or one) we know that sign-extending from the
   6382     //        smaller type will result in the same value. Here, since we can't
   6383     //        yet prove this, we are just making the proposed smaller type
   6384     //        larger to ensure correctness.
   6385     if (!IsKnownPositive)
   6386       ++MaxBitWidth;
   6387   }
   6388 
   6389   // Round MaxBitWidth up to the next power-of-two.
   6390   if (!isPowerOf2_64(MaxBitWidth))
   6391     MaxBitWidth = NextPowerOf2(MaxBitWidth);
   6392 
   6393   // If the maximum bit width we compute is less than the with of the roots'
   6394   // type, we can proceed with the narrowing. Otherwise, do nothing.
   6395   if (MaxBitWidth >= TreeRootIT->getBitWidth())
   6396     return;
   6397 
   6398   // If we can truncate the root, we must collect additional values that might
   6399   // be demoted as a result. That is, those seeded by truncations we will
   6400   // modify.
   6401   while (!Roots.empty())
   6402     collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
   6403 
   6404   // Finally, map the values we can demote to the maximum bit with we computed.
   6405   for (auto *Scalar : ToDemote)
   6406     MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
   6407 }
   6408 
   6409 namespace {
   6410 
   6411 /// The SLPVectorizer Pass.
   6412 struct SLPVectorizer : public FunctionPass {
   6413   SLPVectorizerPass Impl;
   6414 
   6415   /// Pass identification, replacement for typeid
   6416   static char ID;
   6417 
   6418   explicit SLPVectorizer() : FunctionPass(ID) {
   6419     initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
   6420   }
   6421 
   6422   bool doInitialization(Module &M) override {
   6423     return false;
   6424   }
   6425 
   6426   bool runOnFunction(Function &F) override {
   6427     if (skipFunction(F))
   6428       return false;
   6429 
   6430     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
   6431     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   6432     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
   6433     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
   6434     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   6435     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   6436     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   6437     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   6438     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
   6439     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
   6440 
   6441     return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
   6442   }
   6443 
   6444   void getAnalysisUsage(AnalysisUsage &AU) const override {
   6445     FunctionPass::getAnalysisUsage(AU);
   6446     AU.addRequired<AssumptionCacheTracker>();
   6447     AU.addRequired<ScalarEvolutionWrapperPass>();
   6448     AU.addRequired<AAResultsWrapperPass>();
   6449     AU.addRequired<TargetTransformInfoWrapperPass>();
   6450     AU.addRequired<LoopInfoWrapperPass>();
   6451     AU.addRequired<DominatorTreeWrapperPass>();
   6452     AU.addRequired<DemandedBitsWrapperPass>();
   6453     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
   6454     AU.addRequired<InjectTLIMappingsLegacy>();
   6455     AU.addPreserved<LoopInfoWrapperPass>();
   6456     AU.addPreserved<DominatorTreeWrapperPass>();
   6457     AU.addPreserved<AAResultsWrapperPass>();
   6458     AU.addPreserved<GlobalsAAWrapperPass>();
   6459     AU.setPreservesCFG();
   6460   }
   6461 };
   6462 
   6463 } // end anonymous namespace
   6464 
   6465 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
   6466   auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
   6467   auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
   6468   auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
   6469   auto *AA = &AM.getResult<AAManager>(F);
   6470   auto *LI = &AM.getResult<LoopAnalysis>(F);
   6471   auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
   6472   auto *AC = &AM.getResult<AssumptionAnalysis>(F);
   6473   auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
   6474   auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
   6475 
   6476   bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
   6477   if (!Changed)
   6478     return PreservedAnalyses::all();
   6479 
   6480   PreservedAnalyses PA;
   6481   PA.preserveSet<CFGAnalyses>();
   6482   return PA;
   6483 }
   6484 
   6485 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
   6486                                 TargetTransformInfo *TTI_,
   6487                                 TargetLibraryInfo *TLI_, AAResults *AA_,
   6488                                 LoopInfo *LI_, DominatorTree *DT_,
   6489                                 AssumptionCache *AC_, DemandedBits *DB_,
   6490                                 OptimizationRemarkEmitter *ORE_) {
   6491   if (!RunSLPVectorization)
   6492     return false;
   6493   SE = SE_;
   6494   TTI = TTI_;
   6495   TLI = TLI_;
   6496   AA = AA_;
   6497   LI = LI_;
   6498   DT = DT_;
   6499   AC = AC_;
   6500   DB = DB_;
   6501   DL = &F.getParent()->getDataLayout();
   6502 
   6503   Stores.clear();
   6504   GEPs.clear();
   6505   bool Changed = false;
   6506 
   6507   // If the target claims to have no vector registers don't attempt
   6508   // vectorization.
   6509   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
   6510     return false;
   6511 
   6512   // Don't vectorize when the attribute NoImplicitFloat is used.
   6513   if (F.hasFnAttribute(Attribute::NoImplicitFloat))
   6514     return false;
   6515 
   6516   LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
   6517 
   6518   // Use the bottom up slp vectorizer to construct chains that start with
   6519   // store instructions.
   6520   BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
   6521 
   6522   // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
   6523   // delete instructions.
   6524 
   6525   // Scan the blocks in the function in post order.
   6526   for (auto BB : post_order(&F.getEntryBlock())) {
   6527     collectSeedInstructions(BB);
   6528 
   6529     // Vectorize trees that end at stores.
   6530     if (!Stores.empty()) {
   6531       LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
   6532                         << " underlying objects.\n");
   6533       Changed |= vectorizeStoreChains(R);
   6534     }
   6535 
   6536     // Vectorize trees that end at reductions.
   6537     Changed |= vectorizeChainsInBlock(BB, R);
   6538 
   6539     // Vectorize the index computations of getelementptr instructions. This
   6540     // is primarily intended to catch gather-like idioms ending at
   6541     // non-consecutive loads.
   6542     if (!GEPs.empty()) {
   6543       LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
   6544                         << " underlying objects.\n");
   6545       Changed |= vectorizeGEPIndices(BB, R);
   6546     }
   6547   }
   6548 
   6549   if (Changed) {
   6550     R.optimizeGatherSequence();
   6551     LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
   6552   }
   6553   return Changed;
   6554 }
   6555 
   6556 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   6557                                             unsigned Idx) {
   6558   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
   6559                     << "\n");
   6560   const unsigned Sz = R.getVectorElementSize(Chain[0]);
   6561   const unsigned MinVF = R.getMinVecRegSize() / Sz;
   6562   unsigned VF = Chain.size();
   6563 
   6564   if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
   6565     return false;
   6566 
   6567   LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
   6568                     << "\n");
   6569 
   6570   R.buildTree(Chain);
   6571   Optional<ArrayRef<unsigned>> Order = R.bestOrder();
   6572   // TODO: Handle orders of size less than number of elements in the vector.
   6573   if (Order && Order->size() == Chain.size()) {
   6574     // TODO: reorder tree nodes without tree rebuilding.
   6575     SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend());
   6576     llvm::transform(*Order, ReorderedOps.begin(),
   6577                     [Chain](const unsigned Idx) { return Chain[Idx]; });
   6578     R.buildTree(ReorderedOps);
   6579   }
   6580   if (R.isTreeTinyAndNotFullyVectorizable())
   6581     return false;
   6582   if (R.isLoadCombineCandidate())
   6583     return false;
   6584 
   6585   R.computeMinimumValueSizes();
   6586 
   6587   InstructionCost Cost = R.getTreeCost();
   6588 
   6589   LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n");
   6590   if (Cost < -SLPCostThreshold) {
   6591     LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
   6592 
   6593     using namespace ore;
   6594 
   6595     R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
   6596                                         cast<StoreInst>(Chain[0]))
   6597                      << "Stores SLP vectorized with cost " << NV("Cost", Cost)
   6598                      << " and with tree size "
   6599                      << NV("TreeSize", R.getTreeSize()));
   6600 
   6601     R.vectorizeTree();
   6602     return true;
   6603   }
   6604 
   6605   return false;
   6606 }
   6607 
   6608 bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
   6609                                         BoUpSLP &R) {
          // Links the given stores into chains based on the pointer differences
          // reported by getPointersDiff, then hands slices of each chain to
          // vectorizeStoreChain. Returns true if at least one slice was
          // vectorized.
   6610   // We may run into multiple chains that merge into a single chain. We mark the
   6611   // stores that we vectorized so that we don't visit the same store twice.
   6612   BoUpSLP::ValueSet VectorizedStores;
   6613   bool Changed = false;
   6614 
          // E is the number of stores; it also serves as the "no successor"
          // marker in ConsecutiveChain.
   6615   int E = Stores.size();
          // Tails[I] is set once store I has been recorded as the successor of
          // another store (and, later, once I has been put into a chain).
   6616   SmallBitVector Tails(E, false);
   6617   int MaxIter = MaxStoreLookup.getValue();
          // ConsecutiveChain[I] = (index of the closest following store found so
          // far, its pointer difference). (E, INT_MAX) means nothing found yet.
   6618   SmallVector<std::pair<int, int>, 16> ConsecutiveChain(
   6619       E, std::make_pair(E, INT_MAX));
          // CheckedPairs[A].test(B) records that the pair (A, B) was already
          // compared; it is set symmetrically.
   6620   SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false));
          // Number of new pair comparisons done for the current Idx; reset in
          // the search loop below before the lambda is called.
   6621   int IterCnt;
          // Compares stores K and Idx and updates ConsecutiveChain / Tails.
          // Returns true when the caller should stop searching for Idx: either
          // the comparison budget MaxIter is used up, or Idx is recorded as
          // the successor of K at a difference of exactly 1.
   6622   auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
   6623                                   &CheckedPairs,
   6624                                   &ConsecutiveChain](int K, int Idx) {
   6625     if (IterCnt >= MaxIter)
   6626       return true;
            // Already compared: just report what was recorded for K.
   6627     if (CheckedPairs[Idx].test(K))
   6628       return ConsecutiveChain[K].second == 1 &&
   6629              ConsecutiveChain[K].first == Idx;
   6630     ++IterCnt;
   6631     CheckedPairs[Idx].set(K);
   6632     CheckedPairs[K].set(Idx);
   6633     Optional<int> Diff = getPointersDiff(Stores[K]->getPointerOperand(),
   6634                                          Stores[Idx]->getPointerOperand(), *DL,
   6635                                          *SE, /*StrictCheck=*/true);
            // No known difference, or a difference of zero: the pair cannot be
            // linked.
   6636     if (!Diff || *Diff == 0)
   6637       return false;
   6638     int Val = *Diff;
            // A negative difference is treated as K following Idx. K becomes
            // Idx's successor only if it is closer than the one already
            // recorded. NOTE(review): this relies on the sign convention of
            // getPointersDiff, which is not visible here -- confirm.
   6639     if (Val < 0) {
   6640       if (ConsecutiveChain[Idx].second > -Val) {
   6641         Tails.set(K);
   6642         ConsecutiveChain[Idx] = std::make_pair(K, -Val);
   6643       }
   6644       return false;
   6645     }
            // Positive difference: Idx follows K. Keep the existing successor
            // of K unless Idx is strictly closer.
   6646     if (ConsecutiveChain[K].second <= Val)
   6647       return false;
   6648 
   6649     Tails.set(Idx);
   6650     ConsecutiveChain[K] = std::make_pair(Idx, Val);
   6651     return Val == 1;
   6652   };
   6653   // Do a quadratic search on all of the given stores in reverse order and find
   6654   // all of the pairs of stores that follow each other.
   6655   for (int Idx = E - 1; Idx >= 0; --Idx) {
   6656     // If a store has multiple consecutive store candidates, search according
   6657     // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
   6658     // This is because usually pairing with immediate succeeding or preceding
   6659     // candidate create the best chance to find slp vectorization opportunity.
   6660     const int MaxLookDepth = std::max(E - Idx, Idx + 1);
   6661     IterCnt = 0;
   6662     for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
   6663       if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
   6664           (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
   6665         break;
   6666   }
   6667 
   6668   // Tracks if we tried to vectorize stores starting from the given tail
   6669   // already.
   6670   SmallBitVector TriedTails(E, false);
   6671   // For stores that start but don't end a link in the chain:
   6672   for (int Cnt = E; Cnt > 0; --Cnt) {
   6673     int I = Cnt - 1;
            // Skip stores with no successor and stores that are marked as
            // tails.
   6674     if (ConsecutiveChain[I].first == E || Tails.test(I))
   6675       continue;
   6676     // We found a store instr that starts a chain. Now follow the chain and try
   6677     // to vectorize it.
   6678     BoUpSLP::ValueList Operands;
   6679     // Collect the chain into a list.
   6680     while (I != E && !VectorizedStores.count(Stores[I])) {
   6681       Operands.push_back(Stores[I]);
   6682       Tails.set(I);
              // The link to the next store is not at difference 1, so the chain
              // ends at I.
   6683       if (ConsecutiveChain[I].second != 1) {
   6684         // Mark the new end in the chain and go back, if required. It might be
   6685         // required if the original stores come in reversed order, for example.
   6686         if (ConsecutiveChain[I].first != E &&
   6687             Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) &&
   6688             !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) {
   6689           TriedTails.set(I);
   6690           Tails.reset(ConsecutiveChain[I].first);
                  // Rewind the outer loop if needed: with Cnt set to first + 2,
                  // the loop's --Cnt makes the next iteration start at store
                  // ConsecutiveChain[I].first.
   6691           if (Cnt < ConsecutiveChain[I].first + 2)
   6692             Cnt = ConsecutiveChain[I].first + 2;
   6693         }
   6694         break;
   6695       }
   6696       // Move to the next value in the chain.
   6697       I = ConsecutiveChain[I].first;
   6698     }
   6699     assert(!Operands.empty() && "Expected non-empty list of stores.");
   6700 
            // Derive the range of slice sizes to try from the vector register
            // sizes and the element size reported for the first store of the
            // chain.
   6701     unsigned MaxVecRegSize = R.getMaxVecRegSize();
   6702     unsigned EltSize = R.getVectorElementSize(Operands[0]);
   6703     unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize);
   6704 
   6705     unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize);
   6706     unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
   6707                               MaxElts);
   6708 
   6709     // FIXME: Is division-by-2 the correct step? Should we assert that the
   6710     // register size is a power-of-2?
            // StartIdx is the position in Operands up to which a leading run
            // has already been vectorized.
   6711     unsigned StartIdx = 0;
   6712     for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
              // Slide a window of Size stores over the chain. Note that Cnt and E
              // here shadow the variables of the enclosing scopes.
   6713       for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
   6714         ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
   6715         if (!VectorizedStores.count(Slice.front()) &&
   6716             !VectorizedStores.count(Slice.back()) &&
   6717             vectorizeStoreChain(Slice, R, Cnt)) {
   6718           // Mark the vectorized stores so that we don't vectorize them again.
   6719           VectorizedStores.insert(Slice.begin(), Slice.end());
   6720           Changed = true;
   6721           // If we vectorized initial block, no need to try to vectorize it
   6722           // again.
   6723           if (Cnt == StartIdx)
   6724             StartIdx += Size;
   6725           Cnt += Size;
   6726           continue;
   6727         }
                // This window was not vectorized: advance it by one store.
   6728         ++Cnt;
   6729       }
   6730       // Check if the whole array was vectorized already - exit.
   6731       if (StartIdx >= Operands.size())
   6732         break;
   6733     }
   6734   }
   6735 
   6736   return Changed;
   6737 }
   6738 
   6739 void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
   6740   // Initialize the collections. We will make a single pass over the block.
   6741   Stores.clear();
   6742   GEPs.clear();
   6743 
   6744   // Visit the store and getelementptr instructions in BB and organize them in
   6745   // Stores and GEPs according to the underlying objects of their pointer
   6746   // operands.
   6747   for (Instruction &I : *BB) {
   6748     // Ignore store instructions that are volatile or have a pointer operand
   6749     // that doesn't point to a scalar type.
   6750     if (auto *SI = dyn_cast<StoreInst>(&I)) {
   6751       if (!SI->isSimple())
   6752         continue;
   6753       if (!isValidElementType(SI->getValueOperand()->getType()))
   6754         continue;
   6755       Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
   6756     }
   6757 
   6758     // Ignore getelementptr instructions that have more than one index, a
   6759     // constant index, or a pointer operand that doesn't point to a scalar
   6760     // type.
   6761     else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
   6762       auto Idx = GEP->idx_begin()->get();
   6763       if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
   6764         continue;
   6765       if (!isValidElementType(Idx->getType()))
   6766         continue;
   6767       if (GEP->getType()->isVectorTy())
   6768         continue;
   6769       GEPs[GEP->getPointerOperand()].push_back(GEP);
   6770     }
   6771   }
   6772 }
   6773 
   6774 bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
   6775   if (!A || !B)
   6776     return false;
   6777   Value *VL[] = {A, B};
   6778   return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
   6779 }
   6780 
   6781 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   6782                                            bool AllowReorder) {
   6783   if (VL.size() < 2)
   6784     return false;
   6785 
   6786   LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
   6787                     << VL.size() << ".\n");
   6788 
   6789   // Check that all of the parts are instructions of the same type,
   6790   // we permit an alternate opcode via InstructionsState.
   6791   InstructionsState S = getSameOpcode(VL);
   6792   if (!S.getOpcode())
   6793     return false;
   6794 
   6795   Instruction *I0 = cast<Instruction>(S.OpValue);
   6796   // Make sure invalid types (including vector type) are rejected before
   6797   // determining vectorization factor for scalar instructions.
   6798   for (Value *V : VL) {
   6799     Type *Ty = V->getType();
   6800     if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
   6801       // NOTE: the following will give user internal llvm type name, which may
   6802       // not be useful.
   6803       R.getORE()->emit([&]() {
   6804         std::string type_str;
   6805         llvm::raw_string_ostream rso(type_str);
   6806         Ty->print(rso);
   6807         return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
   6808                << "Cannot SLP vectorize list: type "
   6809                << rso.str() + " is unsupported by vectorizer";
   6810       });
   6811       return false;
   6812     }
   6813   }
   6814 
   6815   unsigned Sz = R.getVectorElementSize(I0);
   6816   unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
   6817   unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
   6818   MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
   6819   if (MaxVF < 2) {
   6820     R.getORE()->emit([&]() {
   6821       return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
   6822              << "Cannot SLP vectorize list: vectorization factor "
   6823              << "less than 2 is not supported";
   6824     });
   6825     return false;
   6826   }
   6827 
   6828   bool Changed = false;
   6829   bool CandidateFound = false;
   6830   InstructionCost MinCost = SLPCostThreshold.getValue();
   6831   Type *ScalarTy = VL[0]->getType();
   6832   if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
   6833     ScalarTy = IE->getOperand(1)->getType();
   6834 
   6835   unsigned NextInst = 0, MaxInst = VL.size();
   6836   for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
   6837     // No actual vectorization should happen, if number of parts is the same as
   6838     // provided vectorization factor (i.e. the scalar type is used for vector
   6839     // code during codegen).
   6840     auto *VecTy = FixedVectorType::get(ScalarTy, VF);
   6841     if (TTI->getNumberOfParts(VecTy) == VF)
   6842       continue;
   6843     for (unsigned I = NextInst; I < MaxInst; ++I) {
   6844       unsigned OpsWidth = 0;
   6845 
   6846       if (I + VF > MaxInst)
   6847         OpsWidth = MaxInst - I;
   6848       else
   6849         OpsWidth = VF;
   6850 
   6851       if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
   6852         break;
   6853 
   6854       ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
   6855       // Check that a previous iteration of this loop did not delete the Value.
   6856       if (llvm::any_of(Ops, [&R](Value *V) {
   6857             auto *I = dyn_cast<Instruction>(V);
   6858             return I && R.isDeleted(I);
   6859           }))
   6860         continue;
   6861 
   6862       LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
   6863                         << "\n");
   6864 
   6865       R.buildTree(Ops);
   6866       Optional<ArrayRef<unsigned>> Order = R.bestOrder();
   6867       // TODO: check if we can allow reordering for more cases.
   6868       if (AllowReorder && Order) {
   6869         // TODO: reorder tree nodes without tree rebuilding.
   6870         // Conceptually, there is nothing actually preventing us from trying to
   6871         // reorder a larger list. In fact, we do exactly this when vectorizing
   6872         // reductions. However, at this point, we only expect to get here when
   6873         // there are exactly two operations.
   6874         assert(Ops.size() == 2);
   6875         Value *ReorderedOps[] = {Ops[1], Ops[0]};
   6876         R.buildTree(ReorderedOps, None);
   6877       }
   6878       if (R.isTreeTinyAndNotFullyVectorizable())
   6879         continue;
   6880 
   6881       R.computeMinimumValueSizes();
   6882       InstructionCost Cost = R.getTreeCost();
   6883       CandidateFound = true;
   6884       MinCost = std::min(MinCost, Cost);
   6885 
   6886       if (Cost < -SLPCostThreshold) {
   6887         LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
   6888         R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
   6889                                                     cast<Instruction>(Ops[0]))
   6890                                  << "SLP vectorized with cost " << ore::NV("Cost", Cost)
   6891                                  << " and with tree size "
   6892                                  << ore::NV("TreeSize", R.getTreeSize()));
   6893 
   6894         R.vectorizeTree();
   6895         // Move to the next bundle.
   6896         I += VF - 1;
   6897         NextInst = I + 1;
   6898         Changed = true;
   6899       }
   6900     }
   6901   }
   6902 
   6903   if (!Changed && CandidateFound) {
   6904     R.getORE()->emit([&]() {
   6905       return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
   6906              << "List vectorization was possible but not beneficial with cost "
   6907              << ore::NV("Cost", MinCost) << " >= "
   6908              << ore::NV("Treshold", -SLPCostThreshold);
   6909     });
   6910   } else if (!Changed) {
   6911     R.getORE()->emit([&]() {
   6912       return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
   6913              << "Cannot SLP vectorize list: vectorization was impossible"
   6914              << " with available vectorization factors";
   6915     });
   6916   }
   6917   return Changed;
   6918 }
   6919 
   6920 bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
   6921   if (!I)
   6922     return false;
   6923 
   6924   if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
   6925     return false;
   6926 
   6927   Value *P = I->getParent();
   6928 
   6929   // Vectorize in current basic block only.
   6930   auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
   6931   auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
   6932   if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
   6933     return false;
   6934 
   6935   // Try to vectorize V.
   6936   if (tryToVectorizePair(Op0, Op1, R))
   6937     return true;
   6938 
   6939   auto *A = dyn_cast<BinaryOperator>(Op0);
   6940   auto *B = dyn_cast<BinaryOperator>(Op1);
   6941   // Try to skip B.
   6942   if (B && B->hasOneUse()) {
   6943     auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
   6944     auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
   6945     if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
   6946       return true;
   6947     if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
   6948       return true;
   6949   }
   6950 
   6951   // Try to skip A.
   6952   if (A && A->hasOneUse()) {
   6953     auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
   6954     auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
   6955     if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
   6956       return true;
   6957     if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
   6958       return true;
   6959   }
   6960   return false;
   6961 }
   6962 
   6963 namespace {
   6964 
   6965 /// Model horizontal reductions.
   6966 ///
   6967 /// A horizontal reduction is a tree of reduction instructions that has values
   6968 /// that can be put into a vector as its leaves. For example:
   6969 ///
   6970 /// mul mul mul mul
   6971 ///  \  /    \  /
   6972 ///   +       +
   6973 ///    \     /
   6974 ///       +
   6975 /// This tree has "mul" as its leaf values and "+" as its reduction
   6976 /// instructions. A reduction can feed into a store or a binary operation
   6977 /// feeding a phi.
   6978 ///    ...
   6979 ///    \  /
   6980 ///     +
   6981 ///     |
   6982 ///  phi +=
   6983 ///
   6984 ///  Or:
   6985 ///    ...
   6986 ///    \  /
   6987 ///     +
   6988 ///     |
   6989 ///   *p =
   6990 ///
   6991 class HorizontalReduction {
   6992   using ReductionOpsType = SmallVector<Value *, 16>;
   6993   using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
   6994   ReductionOpsListType ReductionOps;
   6995   SmallVector<Value *, 32> ReducedVals;
   6996   // Use map vector to make stable output.
   6997   MapVector<Instruction *, Value *> ExtraArgs;
   6998   WeakTrackingVH ReductionRoot;
   6999   /// The type of reduction operation.
   7000   RecurKind RdxKind;
   7001 
   7002   /// Checks if instruction is associative and can be vectorized.
   7003   static bool isVectorizable(RecurKind Kind, Instruction *I) {
   7004     if (Kind == RecurKind::None)
   7005       return false;
   7006     if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind))
   7007       return true;
   7008 
   7009     if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
   7010       // FP min/max are associative except for NaN and -0.0. We do not
   7011       // have to rule out -0.0 here because the intrinsic semantics do not
   7012       // specify a fixed result for it.
   7013       return I->getFastMathFlags().noNaNs();
   7014     }
   7015 
   7016     return I->isAssociative();
   7017   }
   7018 
   7019   /// Checks if the ParentStackElem.first should be marked as a reduction
   7020   /// operation with an extra argument or as extra argument itself.
   7021   void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
   7022                     Value *ExtraArg) {
   7023     if (ExtraArgs.count(ParentStackElem.first)) {
   7024       ExtraArgs[ParentStackElem.first] = nullptr;
   7025       // We ran into something like:
   7026       // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
   7027       // The whole ParentStackElem.first should be considered as an extra value
   7028       // in this case.
   7029       // Do not perform analysis of remaining operands of ParentStackElem.first
   7030       // instruction, this whole instruction is an extra argument.
   7031       ParentStackElem.second = getNumberOfOperands(ParentStackElem.first);
   7032     } else {
   7033       // We ran into something like:
   7034       // ParentStackElem.first += ... + ExtraArg + ...
   7035       ExtraArgs[ParentStackElem.first] = ExtraArg;
   7036     }
   7037   }
   7038 
   7039   /// Creates reduction operation with the current opcode.
   7040   static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
   7041                          Value *RHS, const Twine &Name, bool UseSelect) {
   7042     unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
   7043     switch (Kind) {
   7044     case RecurKind::Add:
   7045     case RecurKind::Mul:
   7046     case RecurKind::Or:
   7047     case RecurKind::And:
   7048     case RecurKind::Xor:
   7049     case RecurKind::FAdd:
   7050     case RecurKind::FMul:
   7051       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
   7052                                  Name);
   7053     case RecurKind::FMax:
   7054       return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
   7055     case RecurKind::FMin:
   7056       return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
   7057     case RecurKind::SMax:
   7058       if (UseSelect) {
   7059         Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
   7060         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
   7061       }
   7062       return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
   7063     case RecurKind::SMin:
   7064       if (UseSelect) {
   7065         Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
   7066         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
   7067       }
   7068       return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
   7069     case RecurKind::UMax:
   7070       if (UseSelect) {
   7071         Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
   7072         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
   7073       }
   7074       return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
   7075     case RecurKind::UMin:
   7076       if (UseSelect) {
   7077         Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
   7078         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
   7079       }
   7080       return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
   7081     default:
   7082       llvm_unreachable("Unknown reduction operation.");
   7083     }
   7084   }
   7085 
   7086   /// Creates reduction operation with the current opcode with the IR flags
   7087   /// from \p ReductionOps.
   7088   static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
   7089                          Value *RHS, const Twine &Name,
   7090                          const ReductionOpsListType &ReductionOps) {
   7091     bool UseSelect = ReductionOps.size() == 2;
   7092     assert((!UseSelect || isa<SelectInst>(ReductionOps[1][0])) &&
   7093            "Expected cmp + select pairs for reduction");
   7094     Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
   7095     if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
   7096       if (auto *Sel = dyn_cast<SelectInst>(Op)) {
   7097         propagateIRFlags(Sel->getCondition(), ReductionOps[0]);
   7098         propagateIRFlags(Op, ReductionOps[1]);
   7099         return Op;
   7100       }
   7101     }
   7102     propagateIRFlags(Op, ReductionOps[0]);
   7103     return Op;
   7104   }
   7105 
   7106   /// Creates reduction operation with the current opcode with the IR flags
   7107   /// from \p I.
   7108   static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
   7109                          Value *RHS, const Twine &Name, Instruction *I) {
   7110     auto *SelI = dyn_cast<SelectInst>(I);
   7111     Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr);
   7112     if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
   7113       if (auto *Sel = dyn_cast<SelectInst>(Op))
   7114         propagateIRFlags(Sel->getCondition(), SelI->getCondition());
   7115     }
   7116     propagateIRFlags(Op, I);
   7117     return Op;
   7118   }
   7119 
   7120   static RecurKind getRdxKind(Instruction *I) {
   7121     assert(I && "Expected instruction for reduction matching");
   7122     TargetTransformInfo::ReductionFlags RdxFlags;
   7123     if (match(I, m_Add(m_Value(), m_Value())))
   7124       return RecurKind::Add;
   7125     if (match(I, m_Mul(m_Value(), m_Value())))
   7126       return RecurKind::Mul;
   7127     if (match(I, m_And(m_Value(), m_Value())))
   7128       return RecurKind::And;
   7129     if (match(I, m_Or(m_Value(), m_Value())))
   7130       return RecurKind::Or;
   7131     if (match(I, m_Xor(m_Value(), m_Value())))
   7132       return RecurKind::Xor;
   7133     if (match(I, m_FAdd(m_Value(), m_Value())))
   7134       return RecurKind::FAdd;
   7135     if (match(I, m_FMul(m_Value(), m_Value())))
   7136       return RecurKind::FMul;
   7137 
   7138     if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
   7139       return RecurKind::FMax;
   7140     if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
   7141       return RecurKind::FMin;
   7142 
   7143     // This matches either cmp+select or intrinsics. SLP is expected to handle
   7144     // either form.
   7145     // TODO: If we are canonicalizing to intrinsics, we can remove several
   7146     //       special-case paths that deal with selects.
   7147     if (match(I, m_SMax(m_Value(), m_Value())))
   7148       return RecurKind::SMax;
   7149     if (match(I, m_SMin(m_Value(), m_Value())))
   7150       return RecurKind::SMin;
   7151     if (match(I, m_UMax(m_Value(), m_Value())))
   7152       return RecurKind::UMax;
   7153     if (match(I, m_UMin(m_Value(), m_Value())))
   7154       return RecurKind::UMin;
   7155 
   7156     if (auto *Select = dyn_cast<SelectInst>(I)) {
   7157       // Try harder: look for min/max pattern based on instructions producing
   7158       // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
   7159       // During the intermediate stages of SLP, it's very common to have
   7160       // pattern like this (since optimizeGatherSequence is run only once
   7161       // at the end):
   7162       // %1 = extractelement <2 x i32> %a, i32 0
   7163       // %2 = extractelement <2 x i32> %a, i32 1
   7164       // %cond = icmp sgt i32 %1, %2
   7165       // %3 = extractelement <2 x i32> %a, i32 0
   7166       // %4 = extractelement <2 x i32> %a, i32 1
   7167       // %select = select i1 %cond, i32 %3, i32 %4
   7168       CmpInst::Predicate Pred;
   7169       Instruction *L1;
   7170       Instruction *L2;
   7171 
   7172       Value *LHS = Select->getTrueValue();
   7173       Value *RHS = Select->getFalseValue();
   7174       Value *Cond = Select->getCondition();
   7175 
   7176       // TODO: Support inverse predicates.
   7177       if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
   7178         if (!isa<ExtractElementInst>(RHS) ||
   7179             !L2->isIdenticalTo(cast<Instruction>(RHS)))
   7180           return RecurKind::None;
   7181       } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
   7182         if (!isa<ExtractElementInst>(LHS) ||
   7183             !L1->isIdenticalTo(cast<Instruction>(LHS)))
   7184           return RecurKind::None;
   7185       } else {
   7186         if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
   7187           return RecurKind::None;
   7188         if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
   7189             !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
   7190             !L2->isIdenticalTo(cast<Instruction>(RHS)))
   7191           return RecurKind::None;
   7192       }
   7193 
   7194       TargetTransformInfo::ReductionFlags RdxFlags;
   7195       switch (Pred) {
   7196       default:
   7197         return RecurKind::None;
   7198       case CmpInst::ICMP_SGT:
   7199       case CmpInst::ICMP_SGE:
   7200         return RecurKind::SMax;
   7201       case CmpInst::ICMP_SLT:
   7202       case CmpInst::ICMP_SLE:
   7203         return RecurKind::SMin;
   7204       case CmpInst::ICMP_UGT:
   7205       case CmpInst::ICMP_UGE:
   7206         return RecurKind::UMax;
   7207       case CmpInst::ICMP_ULT:
   7208       case CmpInst::ICMP_ULE:
   7209         return RecurKind::UMin;
   7210       }
   7211     }
   7212     return RecurKind::None;
   7213   }
   7214 
   7215   /// Get the index of the first operand.
   7216   static unsigned getFirstOperandIndex(Instruction *I) {
   7217     return isa<SelectInst>(I) ? 1 : 0;
   7218   }
   7219 
   7220   /// Total number of operands in the reduction operation.
   7221   static unsigned getNumberOfOperands(Instruction *I) {
   7222     return isa<SelectInst>(I) ? 3 : 2;
   7223   }
   7224 
   7225   /// Checks if the instruction is in basic block \p BB.
   7226   /// For a min/max reduction check that both compare and select are in \p BB.
   7227   static bool hasSameParent(Instruction *I, BasicBlock *BB, bool IsRedOp) {
   7228     auto *Sel = dyn_cast<SelectInst>(I);
   7229     if (IsRedOp && Sel) {
   7230       auto *Cmp = cast<Instruction>(Sel->getCondition());
   7231       return Sel->getParent() == BB && Cmp->getParent() == BB;
   7232     }
   7233     return I->getParent() == BB;
   7234   }
   7235 
   7236   /// Expected number of uses for reduction operations/reduced values.
   7237   static bool hasRequiredNumberOfUses(bool MatchCmpSel, Instruction *I) {
   7238     // SelectInst must be used twice while the condition op must have single
   7239     // use only.
   7240     if (MatchCmpSel) {
   7241       if (auto *Sel = dyn_cast<SelectInst>(I))
   7242         return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
   7243       return I->hasNUses(2);
   7244     }
   7245 
   7246     // Arithmetic reduction operation must be used once only.
   7247     return I->hasOneUse();
   7248   }
   7249 
   7250   /// Initializes the list of reduction operations.
   7251   void initReductionOps(Instruction *I) {
   7252     if (isa<SelectInst>(I))
   7253       ReductionOps.assign(2, ReductionOpsType());
   7254     else
   7255       ReductionOps.assign(1, ReductionOpsType());
   7256   }
   7257 
   7258   /// Add all reduction operations for the reduction instruction \p I.
   7259   void addReductionOps(Instruction *I) {
   7260     if (auto *Sel = dyn_cast<SelectInst>(I)) {
   7261       ReductionOps[0].emplace_back(Sel->getCondition());
   7262       ReductionOps[1].emplace_back(Sel);
   7263     } else {
   7264       ReductionOps[0].emplace_back(I);
   7265     }
   7266   }
   7267 
   7268   static Value *getLHS(RecurKind Kind, Instruction *I) {
   7269     if (Kind == RecurKind::None)
   7270       return nullptr;
   7271     return I->getOperand(getFirstOperandIndex(I));
   7272   }
   7273   static Value *getRHS(RecurKind Kind, Instruction *I) {
   7274     if (Kind == RecurKind::None)
   7275       return nullptr;
   7276     return I->getOperand(getFirstOperandIndex(I) + 1);
   7277   }
   7278 
   7279 public:
   7280   HorizontalReduction() = default;
   7281 
   7282   /// Try to find a reduction tree.
   7283   bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
   7284     assert((!Phi || is_contained(Phi->operands(), B)) &&
   7285            "Phi needs to use the binary operator");
   7286 
   7287     RdxKind = getRdxKind(B);
   7288 
   7289     // We could have a initial reductions that is not an add.
   7290     //  r *= v1 + v2 + v3 + v4
   7291     // In such a case start looking for a tree rooted in the first '+'.
   7292     if (Phi) {
   7293       if (getLHS(RdxKind, B) == Phi) {
   7294         Phi = nullptr;
   7295         B = dyn_cast<Instruction>(getRHS(RdxKind, B));
   7296         if (!B)
   7297           return false;
   7298         RdxKind = getRdxKind(B);
   7299       } else if (getRHS(RdxKind, B) == Phi) {
   7300         Phi = nullptr;
   7301         B = dyn_cast<Instruction>(getLHS(RdxKind, B));
   7302         if (!B)
   7303           return false;
   7304         RdxKind = getRdxKind(B);
   7305       }
   7306     }
   7307 
   7308     if (!isVectorizable(RdxKind, B))
   7309       return false;
   7310 
   7311     // Analyze "regular" integer/FP types for reductions - no target-specific
   7312     // types or pointers.
   7313     Type *Ty = B->getType();
   7314     if (!isValidElementType(Ty) || Ty->isPointerTy())
   7315       return false;
   7316 
   7317     // Though the ultimate reduction may have multiple uses, its condition must
   7318     // have only single use.
   7319     if (auto *SI = dyn_cast<SelectInst>(B))
   7320       if (!SI->getCondition()->hasOneUse())
   7321         return false;
   7322 
   7323     ReductionRoot = B;
   7324 
   7325     // The opcode for leaf values that we perform a reduction on.
   7326     // For example: load(x) + load(y) + load(z) + fptoui(w)
   7327     // The leaf opcode for 'w' does not match, so we don't include it as a
   7328     // potential candidate for the reduction.
   7329     unsigned LeafOpcode = 0;
   7330 
   7331     // Post order traverse the reduction tree starting at B. We only handle true
   7332     // trees containing only binary operators.
   7333     SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
   7334     Stack.push_back(std::make_pair(B, getFirstOperandIndex(B)));
   7335     initReductionOps(B);
   7336     while (!Stack.empty()) {
   7337       Instruction *TreeN = Stack.back().first;
   7338       unsigned EdgeToVisit = Stack.back().second++;
   7339       const RecurKind TreeRdxKind = getRdxKind(TreeN);
   7340       bool IsReducedValue = TreeRdxKind != RdxKind;
   7341 
   7342       // Postorder visit.
   7343       if (IsReducedValue || EdgeToVisit == getNumberOfOperands(TreeN)) {
   7344         if (IsReducedValue)
   7345           ReducedVals.push_back(TreeN);
   7346         else {
   7347           auto ExtraArgsIter = ExtraArgs.find(TreeN);
   7348           if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) {
   7349             // Check if TreeN is an extra argument of its parent operation.
   7350             if (Stack.size() <= 1) {
   7351               // TreeN can't be an extra argument as it is a root reduction
   7352               // operation.
   7353               return false;
   7354             }
   7355             // Yes, TreeN is an extra argument, do not add it to a list of
   7356             // reduction operations.
   7357             // Stack[Stack.size() - 2] always points to the parent operation.
   7358             markExtraArg(Stack[Stack.size() - 2], TreeN);
   7359             ExtraArgs.erase(TreeN);
   7360           } else
   7361             addReductionOps(TreeN);
   7362         }
   7363         // Retract.
   7364         Stack.pop_back();
   7365         continue;
   7366       }
   7367 
   7368       // Visit left or right.
   7369       Value *EdgeVal = TreeN->getOperand(EdgeToVisit);
   7370       auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
   7371       if (!EdgeInst) {
   7372         // Edge value is not a reduction instruction or a leaf instruction.
   7373         // (It may be a constant, function argument, or something else.)
   7374         markExtraArg(Stack.back(), EdgeVal);
   7375         continue;
   7376       }
   7377       RecurKind EdgeRdxKind = getRdxKind(EdgeInst);
   7378       // Continue analysis if the next operand is a reduction operation or
   7379       // (possibly) a leaf value. If the leaf value opcode is not set,
   7380       // the first met operation != reduction operation is considered as the
   7381       // leaf opcode.
   7382       // Only handle trees in the current basic block.
   7383       // Each tree node needs to have minimal number of users except for the
   7384       // ultimate reduction.
   7385       const bool IsRdxInst = EdgeRdxKind == RdxKind;
   7386       if (EdgeInst != Phi && EdgeInst != B &&
   7387           hasSameParent(EdgeInst, B->getParent(), IsRdxInst) &&
   7388           hasRequiredNumberOfUses(isa<SelectInst>(B), EdgeInst) &&
   7389           (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) {
   7390         if (IsRdxInst) {
   7391           // We need to be able to reassociate the reduction operations.
   7392           if (!isVectorizable(EdgeRdxKind, EdgeInst)) {
   7393             // I is an extra argument for TreeN (its parent operation).
   7394             markExtraArg(Stack.back(), EdgeInst);
   7395             continue;
   7396           }
   7397         } else if (!LeafOpcode) {
   7398           LeafOpcode = EdgeInst->getOpcode();
   7399         }
   7400         Stack.push_back(
   7401             std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst)));
   7402         continue;
   7403       }
   7404       // I is an extra argument for TreeN (its parent operation).
   7405       markExtraArg(Stack.back(), EdgeInst);
   7406     }
   7407     return true;
   7408   }
   7409 
   7410   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   7411   bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
   7412     // If there are a sufficient number of reduction values, reduce
   7413     // to a nearby power-of-2. We can safely generate oversized
   7414     // vectors and rely on the backend to split them to legal sizes.
   7415     unsigned NumReducedVals = ReducedVals.size();
   7416     if (NumReducedVals < 4)
   7417       return false;
   7418 
   7419     // Intersect the fast-math-flags from all reduction operations.
   7420     FastMathFlags RdxFMF;
   7421     RdxFMF.set();
   7422     for (ReductionOpsType &RdxOp : ReductionOps) {
   7423       for (Value *RdxVal : RdxOp) {
   7424         if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal))
   7425           RdxFMF &= FPMO->getFastMathFlags();
   7426       }
   7427     }
   7428 
   7429     IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
   7430     Builder.setFastMathFlags(RdxFMF);
   7431 
   7432     BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
   7433     // The same extra argument may be used several times, so log each attempt
   7434     // to use it.
   7435     for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
   7436       assert(Pair.first && "DebugLoc must be set.");
   7437       ExternallyUsedValues[Pair.second].push_back(Pair.first);
   7438     }
   7439 
   7440     // The compare instruction of a min/max is the insertion point for new
   7441     // instructions and may be replaced with a new compare instruction.
   7442     auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
   7443       assert(isa<SelectInst>(RdxRootInst) &&
   7444              "Expected min/max reduction to have select root instruction");
   7445       Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
   7446       assert(isa<Instruction>(ScalarCond) &&
   7447              "Expected min/max reduction to have compare condition");
   7448       return cast<Instruction>(ScalarCond);
   7449     };
   7450 
   7451     // The reduction root is used as the insertion point for new instructions,
   7452     // so set it as externally used to prevent it from being deleted.
   7453     ExternallyUsedValues[ReductionRoot];
   7454     SmallVector<Value *, 16> IgnoreList;
   7455     for (ReductionOpsType &RdxOp : ReductionOps)
   7456       IgnoreList.append(RdxOp.begin(), RdxOp.end());
   7457 
   7458     unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
   7459     if (NumReducedVals > ReduxWidth) {
   7460       // In the loop below, we are building a tree based on a window of
   7461       // 'ReduxWidth' values.
   7462       // If the operands of those values have common traits (compare predicate,
   7463       // constant operand, etc), then we want to group those together to
   7464       // minimize the cost of the reduction.
   7465 
   7466       // TODO: This should be extended to count common operands for
   7467       //       compares and binops.
   7468 
   7469       // Step 1: Count the number of times each compare predicate occurs.
   7470       SmallDenseMap<unsigned, unsigned> PredCountMap;
   7471       for (Value *RdxVal : ReducedVals) {
   7472         CmpInst::Predicate Pred;
   7473         if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value())))
   7474           ++PredCountMap[Pred];
   7475       }
   7476       // Step 2: Sort the values so the most common predicates come first.
   7477       stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) {
   7478         CmpInst::Predicate PredA, PredB;
   7479         if (match(A, m_Cmp(PredA, m_Value(), m_Value())) &&
   7480             match(B, m_Cmp(PredB, m_Value(), m_Value()))) {
   7481           return PredCountMap[PredA] > PredCountMap[PredB];
   7482         }
   7483         return false;
   7484       });
   7485     }
   7486 
   7487     Value *VectorizedTree = nullptr;
   7488     unsigned i = 0;
   7489     while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
   7490       ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
   7491       V.buildTree(VL, ExternallyUsedValues, IgnoreList);
   7492       Optional<ArrayRef<unsigned>> Order = V.bestOrder();
   7493       if (Order) {
   7494         assert(Order->size() == VL.size() &&
   7495                "Order size must be the same as number of vectorized "
   7496                "instructions.");
   7497         // TODO: reorder tree nodes without tree rebuilding.
   7498         SmallVector<Value *, 4> ReorderedOps(VL.size());
   7499         llvm::transform(*Order, ReorderedOps.begin(),
   7500                         [VL](const unsigned Idx) { return VL[Idx]; });
   7501         V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
   7502       }
   7503       if (V.isTreeTinyAndNotFullyVectorizable())
   7504         break;
   7505       if (V.isLoadCombineReductionCandidate(RdxKind))
   7506         break;
   7507 
   7508       V.computeMinimumValueSizes();
   7509 
   7510       // Estimate cost.
   7511       InstructionCost TreeCost = V.getTreeCost();
   7512       InstructionCost ReductionCost =
   7513           getReductionCost(TTI, ReducedVals[i], ReduxWidth);
   7514       InstructionCost Cost = TreeCost + ReductionCost;
   7515       if (!Cost.isValid()) {
   7516         LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
   7517         return false;
   7518       }
   7519       if (Cost >= -SLPCostThreshold) {
   7520         V.getORE()->emit([&]() {
   7521           return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
   7522                                           cast<Instruction>(VL[0]))
   7523                  << "Vectorizing horizontal reduction is possible"
   7524                  << "but not beneficial with cost " << ore::NV("Cost", Cost)
   7525                  << " and threshold "
   7526                  << ore::NV("Threshold", -SLPCostThreshold);
   7527         });
   7528         break;
   7529       }
   7530 
   7531       LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
   7532                         << Cost << ". (HorRdx)\n");
   7533       V.getORE()->emit([&]() {
   7534         return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
   7535                                   cast<Instruction>(VL[0]))
   7536                << "Vectorized horizontal reduction with cost "
   7537                << ore::NV("Cost", Cost) << " and with tree size "
   7538                << ore::NV("TreeSize", V.getTreeSize());
   7539       });
   7540 
   7541       // Vectorize a tree.
   7542       DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
   7543       Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
   7544 
   7545       // Emit a reduction. If the root is a select (min/max idiom), the insert
   7546       // point is the compare condition of that select.
   7547       Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
   7548       if (isa<SelectInst>(RdxRootInst))
   7549         Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
   7550       else
   7551         Builder.SetInsertPoint(RdxRootInst);
   7552 
   7553       Value *ReducedSubTree =
   7554           emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
   7555 
   7556       if (!VectorizedTree) {
   7557         // Initialize the final value in the reduction.
   7558         VectorizedTree = ReducedSubTree;
   7559       } else {
   7560         // Update the final value in the reduction.
   7561         Builder.SetCurrentDebugLocation(Loc);
   7562         VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
   7563                                   ReducedSubTree, "op.rdx", ReductionOps);
   7564       }
   7565       i += ReduxWidth;
   7566       ReduxWidth = PowerOf2Floor(NumReducedVals - i);
   7567     }
   7568 
   7569     if (VectorizedTree) {
   7570       // Finish the reduction.
   7571       for (; i < NumReducedVals; ++i) {
   7572         auto *I = cast<Instruction>(ReducedVals[i]);
   7573         Builder.SetCurrentDebugLocation(I->getDebugLoc());
   7574         VectorizedTree =
   7575             createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps);
   7576       }
   7577       for (auto &Pair : ExternallyUsedValues) {
   7578         // Add each externally used value to the final reduction.
   7579         for (auto *I : Pair.second) {
   7580           Builder.SetCurrentDebugLocation(I->getDebugLoc());
   7581           VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
   7582                                     Pair.first, "op.extra", I);
   7583         }
   7584       }
   7585 
   7586       ReductionRoot->replaceAllUsesWith(VectorizedTree);
   7587 
   7588       // Mark all scalar reduction ops for deletion, they are replaced by the
   7589       // vector reductions.
   7590       V.eraseInstructions(IgnoreList);
   7591     }
   7592     return VectorizedTree != nullptr;
   7593   }
   7594 
   7595   unsigned numReductionValues() const { return ReducedVals.size(); }
   7596 
   7597 private:
   7598   /// Calculate the cost of a reduction.
   7599   InstructionCost getReductionCost(TargetTransformInfo *TTI,
   7600                                    Value *FirstReducedVal,
   7601                                    unsigned ReduxWidth) {
   7602     Type *ScalarTy = FirstReducedVal->getType();
   7603     FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
   7604     InstructionCost VectorCost, ScalarCost;
   7605     switch (RdxKind) {
   7606     case RecurKind::Add:
   7607     case RecurKind::Mul:
   7608     case RecurKind::Or:
   7609     case RecurKind::And:
   7610     case RecurKind::Xor:
   7611     case RecurKind::FAdd:
   7612     case RecurKind::FMul: {
   7613       unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
   7614       VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
   7615                                                    /*IsPairwiseForm=*/false);
   7616       ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
   7617       break;
   7618     }
   7619     case RecurKind::FMax:
   7620     case RecurKind::FMin: {
   7621       auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
   7622       VectorCost =
   7623           TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
   7624                                       /*pairwise=*/false, /*unsigned=*/false);
   7625       ScalarCost =
   7626           TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
   7627           TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
   7628                                   CmpInst::makeCmpResultType(ScalarTy));
   7629       break;
   7630     }
   7631     case RecurKind::SMax:
   7632     case RecurKind::SMin:
   7633     case RecurKind::UMax:
   7634     case RecurKind::UMin: {
   7635       auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
   7636       bool IsUnsigned =
   7637           RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
   7638       VectorCost =
   7639           TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
   7640                                       /*IsPairwiseForm=*/false, IsUnsigned);
   7641       ScalarCost =
   7642           TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) +
   7643           TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
   7644                                   CmpInst::makeCmpResultType(ScalarTy));
   7645       break;
   7646     }
   7647     default:
   7648       llvm_unreachable("Expected arithmetic or min/max reduction operation");
   7649     }
   7650 
   7651     // Scalar cost is repeated for N-1 elements.
   7652     ScalarCost *= (ReduxWidth - 1);
   7653     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
   7654                       << " for reduction that starts with " << *FirstReducedVal
   7655                       << " (It is a splitting reduction)\n");
   7656     return VectorCost - ScalarCost;
   7657   }
   7658 
   7659   /// Emit a horizontal reduction of the vectorized value.
   7660   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
   7661                        unsigned ReduxWidth, const TargetTransformInfo *TTI) {
   7662     assert(VectorizedValue && "Need to have a vectorized tree node");
   7663     assert(isPowerOf2_32(ReduxWidth) &&
   7664            "We only handle power-of-two reductions for now");
   7665 
   7666     return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
   7667                                        ReductionOps.back());
   7668   }
   7669 };
   7670 
   7671 } // end anonymous namespace
   7672 
   7673 static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
   7674   if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
   7675     return cast<FixedVectorType>(IE->getType())->getNumElements();
   7676 
   7677   unsigned AggregateSize = 1;
   7678   auto *IV = cast<InsertValueInst>(InsertInst);
   7679   Type *CurrentType = IV->getType();
   7680   do {
   7681     if (auto *ST = dyn_cast<StructType>(CurrentType)) {
   7682       for (auto *Elt : ST->elements())
   7683         if (Elt != ST->getElementType(0)) // check homogeneity
   7684           return None;
   7685       AggregateSize *= ST->getNumElements();
   7686       CurrentType = ST->getElementType(0);
   7687     } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
   7688       AggregateSize *= AT->getNumElements();
   7689       CurrentType = AT->getElementType();
   7690     } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
   7691       AggregateSize *= VT->getNumElements();
   7692       return AggregateSize;
   7693     } else if (CurrentType->isSingleValueType()) {
   7694       return AggregateSize;
   7695     } else {
   7696       return None;
   7697     }
   7698   } while (true);
   7699 }
   7700 
   7701 static bool findBuildAggregate_rec(Instruction *LastInsertInst,
   7702                                    TargetTransformInfo *TTI,
   7703                                    SmallVectorImpl<Value *> &BuildVectorOpds,
   7704                                    SmallVectorImpl<Value *> &InsertElts,
   7705                                    unsigned OperandOffset) {
   7706   do {
   7707     Value *InsertedOperand = LastInsertInst->getOperand(1);
   7708     Optional<int> OperandIndex = getInsertIndex(LastInsertInst, OperandOffset);
   7709     if (!OperandIndex)
   7710       return false;
   7711     if (isa<InsertElementInst>(InsertedOperand) ||
   7712         isa<InsertValueInst>(InsertedOperand)) {
   7713       if (!findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
   7714                                   BuildVectorOpds, InsertElts, *OperandIndex))
   7715         return false;
   7716     } else {
   7717       BuildVectorOpds[*OperandIndex] = InsertedOperand;
   7718       InsertElts[*OperandIndex] = LastInsertInst;
   7719     }
   7720     LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
   7721   } while (LastInsertInst != nullptr &&
   7722            (isa<InsertValueInst>(LastInsertInst) ||
   7723             isa<InsertElementInst>(LastInsertInst)));
   7724   return true;
   7725 }
   7726 
   7727 /// Recognize construction of vectors like
   7728 ///  %ra = insertelement <4 x float> poison, float %s0, i32 0
   7729 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
   7730 ///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
   7731 ///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
   7732 ///  starting from the last insertelement or insertvalue instruction.
   7733 ///
   7734 /// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
   7735 /// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
   7736 /// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
   7737 ///
   7738 /// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
   7739 ///
   7740 /// \return true if it matches.
   7741 static bool findBuildAggregate(Instruction *LastInsertInst,
   7742                                TargetTransformInfo *TTI,
   7743                                SmallVectorImpl<Value *> &BuildVectorOpds,
   7744                                SmallVectorImpl<Value *> &InsertElts) {
   7745 
   7746   assert((isa<InsertElementInst>(LastInsertInst) ||
   7747           isa<InsertValueInst>(LastInsertInst)) &&
   7748          "Expected insertelement or insertvalue instruction!");
   7749 
   7750   assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
   7751          "Expected empty result vectors!");
   7752 
   7753   Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
   7754   if (!AggregateSize)
   7755     return false;
   7756   BuildVectorOpds.resize(*AggregateSize);
   7757   InsertElts.resize(*AggregateSize);
   7758 
   7759   if (findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts,
   7760                              0)) {
   7761     llvm::erase_value(BuildVectorOpds, nullptr);
   7762     llvm::erase_value(InsertElts, nullptr);
   7763     if (BuildVectorOpds.size() >= 2)
   7764       return true;
   7765   }
   7766 
   7767   return false;
   7768 }
   7769 
   7770 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
   7771   return V->getType() < V2->getType();
   7772 }
   7773 
   7774 /// Try and get a reduction value from a phi node.
   7775 ///
   7776 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions
   7777 /// if they come from either \p ParentBB or a containing loop latch.
   7778 ///
   7779 /// \returns A candidate reduction value if possible, or \code nullptr \endcode
   7780 /// if not possible.
   7781 static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
   7782                                 BasicBlock *ParentBB, LoopInfo *LI) {
   7783   // There are situations where the reduction value is not dominated by the
   7784   // reduction phi. Vectorizing such cases has been reported to cause
   7785   // miscompiles. See PR25787.
   7786   auto DominatedReduxValue = [&](Value *R) {
   7787     return isa<Instruction>(R) &&
   7788            DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
   7789   };
   7790 
   7791   Value *Rdx = nullptr;
   7792 
   7793   // Return the incoming value if it comes from the same BB as the phi node.
   7794   if (P->getIncomingBlock(0) == ParentBB) {
   7795     Rdx = P->getIncomingValue(0);
   7796   } else if (P->getIncomingBlock(1) == ParentBB) {
   7797     Rdx = P->getIncomingValue(1);
   7798   }
   7799 
   7800   if (Rdx && DominatedReduxValue(Rdx))
   7801     return Rdx;
   7802 
   7803   // Otherwise, check whether we have a loop latch to look at.
   7804   Loop *BBL = LI->getLoopFor(ParentBB);
   7805   if (!BBL)
   7806     return nullptr;
   7807   BasicBlock *BBLatch = BBL->getLoopLatch();
   7808   if (!BBLatch)
   7809     return nullptr;
   7810 
   7811   // There is a loop latch, return the incoming value if it comes from
   7812   // that. This reduction pattern occasionally turns up.
   7813   if (P->getIncomingBlock(0) == BBLatch) {
   7814     Rdx = P->getIncomingValue(0);
   7815   } else if (P->getIncomingBlock(1) == BBLatch) {
   7816     Rdx = P->getIncomingValue(1);
   7817   }
   7818 
   7819   if (Rdx && DominatedReduxValue(Rdx))
   7820     return Rdx;
   7821 
   7822   return nullptr;
   7823 }
   7824 
   7825 static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
   7826   if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
   7827     return true;
   7828   if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
   7829     return true;
   7830   if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
   7831     return true;
   7832   if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
   7833     return true;
   7834   if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
   7835     return true;
   7836   if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
   7837     return true;
   7838   if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
   7839     return true;
   7840   return false;
   7841 }
   7842 
   7843 /// Attempt to reduce a horizontal reduction.
   7844 /// If it is legal to match a horizontal reduction feeding the phi node \a P
   7845 /// with reduction operators \a Root (or one of its operands) in a basic block
   7846 /// \a BB, then check if it can be done. If horizontal reduction is not found
   7847 /// and root instruction is a binary operation, vectorization of the operands is
   7848 /// attempted.
        ///
        /// \param P Reduction phi candidate, may be null. It is only handed to the
        /// reduction matcher for the first instruction taken off the worklist and is
        /// cleared afterwards.
        /// \param Root Instruction the traversal starts from. A null \a Root, a phi
        /// node, or an instruction outside \a BB makes the function return false.
        /// \param BB Only operands located in this block are added to the worklist.
        /// \param R SLP tree builder forwarded to the reduction attempt and to
        /// \a Vectorize; also queried for deleted instructions.
        /// \param TTI Target information forwarded to the reduction attempt.
        /// \param Vectorize Callback tried on every visited non-compare instruction
        /// for which no reduction was emitted.
   7849 /// \returns true if a horizontal reduction was matched and reduced or operands
   7850 /// of one of the binary instructions were vectorized.
   7851 /// \returns false if a horizontal reduction was not matched (or not possible)
   7852 /// or no vectorization of any binary operation feeding \a Root instruction was
   7853 /// performed.
   7854 static bool tryToVectorizeHorReductionOrInstOperands(
   7855     PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
   7856     TargetTransformInfo *TTI,
   7857     const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
          // ShouldVectorizeHor is defined outside this function; when it is false
          // nothing is attempted.
   7858   if (!ShouldVectorizeHor)
   7859     return false;
   7860
   7861   if (!Root)
   7862     return false;
   7863
   7864   if (Root->getParent() != BB || isa<PHINode>(Root))
   7865     return false;
   7866   // Start analysis starting from Root instruction. If horizontal reduction is
   7867   // found, try to vectorize it. If it is not a horizontal reduction or
   7868   // vectorization is not possible or not effective, and currently analyzed
   7869   // instruction is a binary operation, try to vectorize the operands, using
   7870   // pre-order DFS traversal order. If the operands were not vectorized, repeat
   7871   // the same procedure considering each operand as a possible root of the
   7872   // horizontal reduction.
   7873   // Interrupt the process if the Root instruction itself was vectorized or all
   7874   // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
   7875   // Skip the analysis of CmpInsts. The compiler implements post-analysis of
   7876   // the CmpInsts so we can skip extra attempts in
   7877   // tryToVectorizeHorReductionOrInstOperands and save compile time.
          // Worklist of (instruction, depth) pairs, seeded with Root at depth 0.
   7878   SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
          // Operands that were already considered for the worklist.
   7879   SmallPtrSet<Value *, 8> VisitedInstrs;
   7880   bool Res = false;
   7881   while (!Stack.empty()) {
   7882     Instruction *Inst;
   7883     unsigned Level;
   7884     std::tie(Inst, Level) = Stack.pop_back_val();
   7885     Value *B0, *B1;
            // IsBinop covers binary operators and the min/max intrinsics accepted by
            // matchRdxBop; B0/B1 are only bound when it is true.
   7886     bool IsBinop = matchRdxBop(Inst, B0, B1);
   7887     bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
   7888     if (IsBinop || IsSelect) {
   7889       HorizontalReduction HorRdx;
   7890       if (HorRdx.matchAssociativeReduction(P, Inst)) {
   7891         if (HorRdx.tryToReduce(R, TTI)) {
   7892           Res = true;
   7893           // Set P to nullptr to avoid re-analysis of phi node in
   7894           // matchAssociativeReduction function unless this is the root node.
   7895           P = nullptr;
   7896           continue;
   7897         }
   7898       }
              // No reduction was emitted. While P is still set, continue with the
              // operand of the binary operation that is not P itself.
   7899       if (P && IsBinop) {
   7900         Inst = dyn_cast<Instruction>(B0);
   7901         if (Inst == P)
   7902           Inst = dyn_cast<Instruction>(B1);
   7903         if (!Inst) {
   7904           // Set P to nullptr to avoid re-analysis of phi node in
   7905           // matchAssociativeReduction function unless this is the root node.
   7906           P = nullptr;
   7907           continue;
   7908         }
   7909       }
   7910     }
   7911     // Set P to nullptr to avoid re-analysis of phi node in
   7912     // matchAssociativeReduction function unless this is the root node.
   7913     P = nullptr;
   7914     // Do not try to vectorize CmpInst operands, this is done separately.
   7915     if (!isa<CmpInst>(Inst) && Vectorize(Inst, R)) {
   7916       Res = true;
   7917       continue;
   7918     }
   7919
   7920     // Try to vectorize operands.
   7921     // Continue analysis for the instruction from the same basic block only to
   7922     // save compile time.
            // Operands are queued one level deeper, as long as that level stays below
            // RecursionMaxDepth.
   7923     if (++Level < RecursionMaxDepth)
   7924       for (auto *Op : Inst->operand_values())
   7925         if (VisitedInstrs.insert(Op).second)
   7926           if (auto *I = dyn_cast<Instruction>(Op))
   7927             // Do not try to vectorize CmpInst operands,  this is done
   7928             // separately.
   7929             if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) &&
   7930                 I->getParent() == BB)
   7931               Stack.emplace_back(I, Level);
   7932   }
   7933   return Res;
   7934 }
   7935 
   7936 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
   7937                                                  BasicBlock *BB, BoUpSLP &R,
   7938                                                  TargetTransformInfo *TTI) {
   7939   auto *I = dyn_cast_or_null<Instruction>(V);
   7940   if (!I)
   7941     return false;
   7942 
   7943   if (!isa<BinaryOperator>(I))
   7944     P = nullptr;
   7945   // Try to match and vectorize a horizontal reduction.
   7946   auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
   7947     return tryToVectorize(I, R);
   7948   };
   7949   return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
   7950                                                   ExtraVectorization);
   7951 }
   7952 
   7953 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
   7954                                                  BasicBlock *BB, BoUpSLP &R) {
   7955   const DataLayout &DL = BB->getModule()->getDataLayout();
   7956   if (!R.canMapToVector(IVI->getType(), DL))
   7957     return false;
   7958 
   7959   SmallVector<Value *, 16> BuildVectorOpds;
   7960   SmallVector<Value *, 16> BuildVectorInsts;
   7961   if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
   7962     return false;
   7963 
   7964   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
   7965   // Aggregate value is unlikely to be processed in vector register, we need to
   7966   // extract scalars into scalar registers, so NeedExtraction is set true.
   7967   return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false);
   7968 }
   7969 
   7970 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
   7971                                                    BasicBlock *BB, BoUpSLP &R) {
   7972   SmallVector<Value *, 16> BuildVectorInsts;
   7973   SmallVector<Value *, 16> BuildVectorOpds;
   7974   SmallVector<int> Mask;
   7975   if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
   7976       (llvm::all_of(BuildVectorOpds,
   7977                     [](Value *V) { return isa<ExtractElementInst>(V); }) &&
   7978        isShuffle(BuildVectorOpds, Mask)))
   7979     return false;
   7980 
   7981   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
   7982   return tryToVectorizeList(BuildVectorInsts, R, /*AllowReorder=*/false);
   7983 }
   7984 
   7985 bool SLPVectorizerPass::vectorizeSimpleInstructions(
   7986     SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
   7987     bool AtTerminator) {
   7988   bool OpsChanged = false;
   7989   SmallVector<Instruction *, 4> PostponedCmps;
   7990   for (auto *I : reverse(Instructions)) {
   7991     if (R.isDeleted(I))
   7992       continue;
   7993     if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
   7994       OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
   7995     else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
   7996       OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
   7997     else if (isa<CmpInst>(I))
   7998       PostponedCmps.push_back(I);
   7999   }
   8000   if (AtTerminator) {
   8001     // Try to find reductions first.
   8002     for (Instruction *I : PostponedCmps) {
   8003       if (R.isDeleted(I))
   8004         continue;
   8005       for (Value *Op : I->operands())
   8006         OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI);
   8007     }
   8008     // Try to vectorize operands as vector bundles.
   8009     for (Instruction *I : PostponedCmps) {
   8010       if (R.isDeleted(I))
   8011         continue;
   8012       OpsChanged |= tryToVectorize(I, R);
   8013     }
   8014     Instructions.clear();
   8015   } else {
   8016     // Insert in reverse order since the PostponedCmps vector was filled in
   8017     // reverse order.
   8018     Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend());
   8019   }
   8020   return OpsChanged;
   8021 }
   8022
        /// Try to vectorize code in \p BB starting from several kinds of seeds.
        ///
        /// First, runs of same-typed PHI nodes at the top of the block are handed to
        /// tryToVectorizeList, restarting after every success. Then every
        /// instruction of the block is visited: PHIs serve as roots for horizontal
        /// reductions, use-less void/call/invoke instructions ("key nodes") trigger
        /// reduction matching on their operands, and insertelement, insertvalue and
        /// compare instructions are queued for vectorizeSimpleInstructions.
        ///
        /// \returns true if any of these attempts reported a change.
   8023 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   8024   bool Changed = false;
   8025   SmallVector<Value *, 4> Incoming;
   8026   SmallPtrSet<Value *, 16> VisitedInstrs;
   8027
   8028   bool HaveVectorizedPhiNodes = true;
   8029   while (HaveVectorizedPhiNodes) {
   8030     HaveVectorizedPhiNodes = false;
   8031
   8032     // Collect the leading PHIs that were neither tried before nor deleted.
   8033     Incoming.clear();
   8034     for (Instruction &I : *BB) {
   8035       PHINode *P = dyn_cast<PHINode>(&I);
   8036       if (!P)
   8037         break;
   8038
   8039       if (!VisitedInstrs.count(P) && !R.isDeleted(P))
   8040         Incoming.push_back(P);
   8041     }
   8042
   8043     // Sort by type.
   8044     llvm::stable_sort(Incoming, PhiTypeSorterFunc);
   8045
   8046     // Try to vectorize elements based on their type.
   8047     for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
   8048                                            E = Incoming.end();
   8049          IncIt != E;) {
   8050
   8051       // Look for the next elements with the same type.
   8052       SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
   8053       while (SameTypeIt != E &&
   8054              (*SameTypeIt)->getType() == (*IncIt)->getType()) {
   8055         VisitedInstrs.insert(*SameTypeIt);
   8056         ++SameTypeIt;
   8057       }
   8058
   8059       // Try to vectorize them.
   8060       unsigned NumElts = (SameTypeIt - IncIt);
   8061       LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
   8062                         << NumElts << ")\n");
   8063       // The order in which the phi nodes appear in the program does not matter.
   8064       // So allow tryToVectorizeList to reorder them if it is beneficial. This
   8065       // is done when there are exactly two elements since tryToVectorizeList
   8066       // asserts that there are only two values when AllowReorder is true.
   8067       bool AllowReorder = NumElts == 2;
   8068       if (NumElts > 1 &&
   8069           tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
   8070         // Success start over because instructions might have been changed.
   8071         HaveVectorizedPhiNodes = true;
   8072         Changed = true;
   8073         break;
   8074       }
   8075
   8076       // Start over at the next instruction of a different type (or the end).
   8077       IncIt = SameTypeIt;
   8078     }
   8079   }
   8080
          // The PHI phase is done; the set is reused to track the instructions seen
          // by the walk below.
   8081   VisitedInstrs.clear();
   8082
          // Insertelement/insertvalue/compare instructions seen so far, handed to
          // vectorizeSimpleInstructions at key nodes.
   8083   SmallVector<Instruction *, 8> PostProcessInstructions;
          // Use-less void/call/invoke instructions that were already processed once.
   8084   SmallDenseSet<Instruction *, 4> KeyNodes;
   8085   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
   8086     // Skip instructions with scalable type. The num of elements is unknown at
   8087     // compile-time for scalable type.
   8088     if (isa<ScalableVectorType>(it->getType()))
   8089       continue;
   8090
   8091     // Skip instructions marked for the deletion.
   8092     if (R.isDeleted(&*it))
   8093       continue;
   8094     // We may go through BB multiple times so skip the one we have checked.
            // A key node reached again still flushes the post-process list.
   8095     if (!VisitedInstrs.insert(&*it).second) {
   8096       if (it->use_empty() && KeyNodes.contains(&*it) &&
   8097           vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
   8098                                       it->isTerminator())) {
   8099         // We would like to start over since some instructions are deleted
   8100         // and the iterator may become invalid value.
   8101         Changed = true;
   8102         it = BB->begin();
   8103         e = BB->end();
   8104       }
   8105       continue;
   8106     }
   8107
   8108     if (isa<DbgInfoIntrinsic>(it))
   8109       continue;
   8110
   8111     // Try to vectorize reductions that use PHINodes.
   8112     if (PHINode *P = dyn_cast<PHINode>(it)) {
   8113       // Check that the PHI is a reduction PHI.
   8114       if (P->getNumIncomingValues() == 2) {
   8115         // Try to match and vectorize a horizontal reduction.
   8116         if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
   8117                                      TTI)) {
   8118           Changed = true;
   8119           it = BB->begin();
   8120           e = BB->end();
   8121           continue;
   8122         }
   8123       }
   8124       // Try to vectorize the incoming values of the PHI, to catch reductions
   8125       // that feed into PHIs.
   8126       for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
   8127         // Skip if the incoming block is the current BB for now. Also, bypass
   8128         // unreachable IR for efficiency and to avoid crashing.
   8129         // TODO: Collect the skipped incoming values and try to vectorize them
   8130         // after processing BB.
   8131         if (BB == P->getIncomingBlock(I) ||
   8132             !DT->isReachableFromEntry(P->getIncomingBlock(I)))
   8133           continue;
   8134
   8135         Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
   8136                                             P->getIncomingBlock(I), R, TTI);
   8137       }
   8138       continue;
   8139     }
   8140
   8141     // Ran into an instruction without users, like terminator, or function call
   8142     // with ignored return value, store. Ignore unused instructions (basing on
   8143     // instruction type, except for CallInst and InvokeInst).
   8144     if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
   8145                             isa<InvokeInst>(it))) {
   8146       KeyNodes.insert(&*it);
   8147       bool OpsChanged = false;
              // Stores are only used as reduction seeds when
              // ShouldStartVectorizeHorAtStore is set.
   8148       if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
   8149         for (auto *V : it->operand_values()) {
   8150           // Try to match and vectorize a horizontal reduction.
   8151           OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
   8152         }
   8153       }
   8154       // Start vectorization of post-process list of instructions from the
   8155       // top-tree instructions to try to vectorize as many instructions as
   8156       // possible.
   8157       OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
   8158                                                 it->isTerminator());
   8159       if (OpsChanged) {
   8160         // We would like to start over since some instructions are deleted
   8161         // and the iterator may become invalid value.
   8162         Changed = true;
   8163         it = BB->begin();
   8164         e = BB->end();
   8165         continue;
   8166       }
   8167     }
   8168
            // Queue the instruction for a later vectorizeSimpleInstructions call.
   8169     if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
   8170         isa<InsertValueInst>(it))
   8171       PostProcessInstructions.push_back(&*it);
   8172   }
   8173
   8174   return Changed;
   8175 }
   8176 
   8177 bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
   8178   auto Changed = false;
   8179   for (auto &Entry : GEPs) {
   8180     // If the getelementptr list has fewer than two elements, there's nothing
   8181     // to do.
   8182     if (Entry.second.size() < 2)
   8183       continue;
   8184 
   8185     LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
   8186                       << Entry.second.size() << ".\n");
   8187 
   8188     // Process the GEP list in chunks suitable for the target's supported
   8189     // vector size. If a vector register can't hold 1 element, we are done. We
   8190     // are trying to vectorize the index computations, so the maximum number of
   8191     // elements is based on the size of the index expression, rather than the
   8192     // size of the GEP itself (the target's pointer size).
   8193     unsigned MaxVecRegSize = R.getMaxVecRegSize();
   8194     unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
   8195     if (MaxVecRegSize < EltSize)
   8196       continue;
   8197 
   8198     unsigned MaxElts = MaxVecRegSize / EltSize;
   8199     for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
   8200       auto Len = std::min<unsigned>(BE - BI, MaxElts);
   8201       ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
   8202 
   8203       // Initialize a set a candidate getelementptrs. Note that we use a
   8204       // SetVector here to preserve program order. If the index computations
   8205       // are vectorizable and begin with loads, we want to minimize the chance
   8206       // of having to reorder them later.
   8207       SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
   8208 
   8209       // Some of the candidates may have already been vectorized after we
   8210       // initially collected them. If so, they are marked as deleted, so remove
   8211       // them from the set of candidates.
   8212       Candidates.remove_if(
   8213           [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });
   8214 
   8215       // Remove from the set of candidates all pairs of getelementptrs with
   8216       // constant differences. Such getelementptrs are likely not good
   8217       // candidates for vectorization in a bottom-up phase since one can be
   8218       // computed from the other. We also ensure all candidate getelementptr
   8219       // indices are unique.
   8220       for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
   8221         auto *GEPI = GEPList[I];
   8222         if (!Candidates.count(GEPI))
   8223           continue;
   8224         auto *SCEVI = SE->getSCEV(GEPList[I]);
   8225         for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
   8226           auto *GEPJ = GEPList[J];
   8227           auto *SCEVJ = SE->getSCEV(GEPList[J]);
   8228           if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
   8229             Candidates.remove(GEPI);
   8230             Candidates.remove(GEPJ);
   8231           } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
   8232             Candidates.remove(GEPJ);
   8233           }
   8234         }
   8235       }
   8236 
   8237       // We break out of the above computation as soon as we know there are
   8238       // fewer than two candidates remaining.
   8239       if (Candidates.size() < 2)
   8240         continue;
   8241 
   8242       // Add the single, non-constant index of each candidate to the bundle. We
   8243       // ensured the indices met these constraints when we originally collected
   8244       // the getelementptrs.
   8245       SmallVector<Value *, 16> Bundle(Candidates.size());
   8246       auto BundleIndex = 0u;
   8247       for (auto *V : Candidates) {
   8248         auto *GEP = cast<GetElementPtrInst>(V);
   8249         auto *GEPIdx = GEP->idx_begin()->get();
   8250         assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
   8251         Bundle[BundleIndex++] = GEPIdx;
   8252       }
   8253 
   8254       // Try and vectorize the indices. We are currently only interested in
   8255       // gather-like cases of the form:
   8256       //
   8257       // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
   8258       //
   8259       // where the loads of "a", the loads of "b", and the subtractions can be
   8260       // performed in parallel. It's likely that detecting this pattern in a
   8261       // bottom-up phase will be simpler and less costly than building a
   8262       // full-blown top-down phase beginning at the consecutive loads.
   8263       Changed |= tryToVectorizeList(Bundle, R);
   8264     }
   8265   }
   8266   return Changed;
   8267 }
   8268 
   8269 bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
   8270   bool Changed = false;
   8271   // Attempt to sort and vectorize each of the store-groups.
   8272   for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
   8273        ++it) {
   8274     if (it->second.size() < 2)
   8275       continue;
   8276 
   8277     LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
   8278                       << it->second.size() << ".\n");
   8279 
   8280     Changed |= vectorizeStores(it->second, R);
   8281   }
   8282   return Changed;
   8283 }
   8284 
   8285 char SLPVectorizer::ID = 0; // Out-of-class definition of the pass's static ID.
   8286 // Display name handed to the INITIALIZE_PASS_BEGIN/END macros below.
   8287 static const char lv_name[] = "SLP Vectorizer";
   8288 // Register SLPVectorizer under SV_NAME and list the passes it depends on.
   8289 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
   8290 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
   8291 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
   8292 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
   8293 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
   8294 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
   8295 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
   8296 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
   8297 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
   8298 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
   8299 
   8300 Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); } // Returns a newly allocated SLPVectorizer as a raw Pass pointer.
   8301