Home | History | Annotate | Line # | Download | only in CodeGen
      1 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 /// \file
     10 /// This file provides a helper that implements much of the TTI interface in
     11 /// terms of the target-independent code generator and TargetLowering
     12 /// interfaces.
     13 //
     14 //===----------------------------------------------------------------------===//
     15 
     16 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H
     17 #define LLVM_CODEGEN_BASICTTIIMPL_H
     18 
     19 #include "llvm/ADT/APInt.h"
     20 #include "llvm/ADT/ArrayRef.h"
     21 #include "llvm/ADT/BitVector.h"
     22 #include "llvm/ADT/SmallPtrSet.h"
     23 #include "llvm/ADT/SmallVector.h"
     24 #include "llvm/Analysis/LoopInfo.h"
     25 #include "llvm/Analysis/TargetTransformInfo.h"
     26 #include "llvm/Analysis/TargetTransformInfoImpl.h"
     27 #include "llvm/CodeGen/ISDOpcodes.h"
     28 #include "llvm/CodeGen/TargetLowering.h"
     29 #include "llvm/CodeGen/TargetSubtargetInfo.h"
     30 #include "llvm/CodeGen/ValueTypes.h"
     31 #include "llvm/IR/BasicBlock.h"
     32 #include "llvm/IR/Constant.h"
     33 #include "llvm/IR/Constants.h"
     34 #include "llvm/IR/DataLayout.h"
     35 #include "llvm/IR/DerivedTypes.h"
     36 #include "llvm/IR/InstrTypes.h"
     37 #include "llvm/IR/Instruction.h"
     38 #include "llvm/IR/Instructions.h"
     39 #include "llvm/IR/Intrinsics.h"
     40 #include "llvm/IR/Operator.h"
     41 #include "llvm/IR/Type.h"
     42 #include "llvm/IR/Value.h"
     43 #include "llvm/Support/Casting.h"
     44 #include "llvm/Support/CommandLine.h"
     45 #include "llvm/Support/ErrorHandling.h"
     46 #include "llvm/Support/MachineValueType.h"
     47 #include "llvm/Support/MathExtras.h"
     48 #include "llvm/Target/TargetMachine.h"
     49 #include <algorithm>
     50 #include <cassert>
     51 #include <cstdint>
     52 #include <limits>
     53 #include <utility>
     54 
     55 namespace llvm {
     56 
     57 class Function;
     58 class GlobalValue;
     59 class LLVMContext;
     60 class ScalarEvolution;
     61 class SCEV;
     62 class TargetMachine;
     63 
     64 extern cl::opt<unsigned> PartialUnrollingThreshold;
     65 
     66 /// Base class which can be used to help build a TTI implementation.
     67 ///
     68 /// This class provides as much implementation of the TTI interface as is
     69 /// possible using the target independent parts of the code generator.
     70 ///
     71 /// In order to subclass it, your class must implement a getST() method to
     72 /// return the subtarget, and a getTLI() method to return the target lowering.
     73 /// We need these methods implemented in the derived class so that this class
     74 /// doesn't have to duplicate storage for them.
     75 template <typename T>
     76 class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     77 private:
     78   using BaseT = TargetTransformInfoImplCRTPBase<T>;
     79   using TTI = TargetTransformInfo;
     80 
     81   /// Helper function to access this as a T.
     82   T *thisT() { return static_cast<T *>(this); }
     83 
     84   /// Estimate a cost of Broadcast as an extract and sequence of insert
     85   /// operations.
     86   InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
     87     InstructionCost Cost = 0;
     88     // Broadcast cost is equal to the cost of extracting the zero'th element
     89     // plus the cost of inserting it into every element of the result vector.
     90     Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
     91 
     92     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
     93       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
     94     }
     95     return Cost;
     96   }
     97 
     98   /// Estimate a cost of shuffle as a sequence of extract and insert
     99   /// operations.
    100   InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
    101     InstructionCost Cost = 0;
    102     // Shuffle cost is equal to the cost of extracting element from its argument
    103     // plus the cost of inserting them onto the result vector.
    104 
    105     // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from
    106     // index 0 of first vector, index 1 of second vector,index 2 of first
    107     // vector and finally index 3 of second vector and insert them at index
    108     // <0,1,2,3> of result vector.
    109     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
    110       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
    111       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
    112     }
    113     return Cost;
    114   }
    115 
    116   /// Estimate a cost of subvector extraction as a sequence of extract and
    117   /// insert operations.
    118   InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
    119                                        FixedVectorType *SubVTy) {
    120     assert(VTy && SubVTy &&
    121            "Can only extract subvectors from vectors");
    122     int NumSubElts = SubVTy->getNumElements();
    123     assert((!isa<FixedVectorType>(VTy) ||
    124             (Index + NumSubElts) <=
    125                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
    126            "SK_ExtractSubvector index out of range");
    127 
    128     InstructionCost Cost = 0;
    129     // Subvector extraction cost is equal to the cost of extracting element from
    130     // the source type plus the cost of inserting them into the result vector
    131     // type.
    132     for (int i = 0; i != NumSubElts; ++i) {
    133       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
    134                                           i + Index);
    135       Cost +=
    136           thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
    137     }
    138     return Cost;
    139   }
    140 
    141   /// Estimate a cost of subvector insertion as a sequence of extract and
    142   /// insert operations.
    143   InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
    144                                       FixedVectorType *SubVTy) {
    145     assert(VTy && SubVTy &&
    146            "Can only insert subvectors into vectors");
    147     int NumSubElts = SubVTy->getNumElements();
    148     assert((!isa<FixedVectorType>(VTy) ||
    149             (Index + NumSubElts) <=
    150                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
    151            "SK_InsertSubvector index out of range");
    152 
    153     InstructionCost Cost = 0;
    154     // Subvector insertion cost is equal to the cost of extracting element from
    155     // the source type plus the cost of inserting them into the result vector
    156     // type.
    157     for (int i = 0; i != NumSubElts; ++i) {
    158       Cost +=
    159           thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
    160       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
    161                                           i + Index);
    162     }
    163     return Cost;
    164   }
    165 
    166   /// Local query method delegates up to T which *must* implement this!
    167   const TargetSubtargetInfo *getST() const {
    168     return static_cast<const T *>(this)->getST();
    169   }
    170 
    171   /// Local query method delegates up to T which *must* implement this!
    172   const TargetLoweringBase *getTLI() const {
    173     return static_cast<const T *>(this)->getTLI();
    174   }
    175 
    176   static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    177     switch (M) {
    178       case TTI::MIM_Unindexed:
    179         return ISD::UNINDEXED;
    180       case TTI::MIM_PreInc:
    181         return ISD::PRE_INC;
    182       case TTI::MIM_PreDec:
    183         return ISD::PRE_DEC;
    184       case TTI::MIM_PostInc:
    185         return ISD::POST_INC;
    186       case TTI::MIM_PostDec:
    187         return ISD::POST_DEC;
    188     }
    189     llvm_unreachable("Unexpected MemIndexedMode");
    190   }
    191 
    192   InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
    193                                               Align Alignment,
    194                                               bool VariableMask,
    195                                               bool IsGatherScatter,
    196                                               TTI::TargetCostKind CostKind) {
    197     auto *VT = cast<FixedVectorType>(DataTy);
    198     // Assume the target does not have support for gather/scatter operations
    199     // and provide a rough estimate.
    200     //
    201     // First, compute the cost of the individual memory operations.
    202     InstructionCost AddrExtractCost =
    203         IsGatherScatter
    204             ? getVectorInstrCost(Instruction::ExtractElement,
    205                                  FixedVectorType::get(
    206                                      PointerType::get(VT->getElementType(), 0),
    207                                      VT->getNumElements()),
    208                                  -1)
    209             : 0;
    210     InstructionCost LoadCost =
    211         VT->getNumElements() *
    212         (AddrExtractCost +
    213          getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
    214 
    215     // Next, compute the cost of packing the result in a vector.
    216     InstructionCost PackingCost = getScalarizationOverhead(
    217         VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
    218 
    219     InstructionCost ConditionalCost = 0;
    220     if (VariableMask) {
    221       // Compute the cost of conditionally executing the memory operations with
    222       // variable masks. This includes extracting the individual conditions, a
    223       // branches and PHIs to combine the results.
    224       // NOTE: Estimating the cost of conditionally executing the memory
    225       // operations accurately is quite difficult and the current solution
    226       // provides a very rough estimate only.
    227       ConditionalCost =
    228           VT->getNumElements() *
    229           (getVectorInstrCost(
    230                Instruction::ExtractElement,
    231                FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
    232                                     VT->getNumElements()),
    233                -1) +
    234            getCFInstrCost(Instruction::Br, CostKind) +
    235            getCFInstrCost(Instruction::PHI, CostKind));
    236     }
    237 
    238     return LoadCost + PackingCost + ConditionalCost;
    239   }
    240 
    241 protected:
    242   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
    243       : BaseT(DL) {}
    244   virtual ~BasicTTIImplBase() = default;
    245 
    246   using TargetTransformInfoImplBase::DL;
    247 
    248 public:
    249   /// \name Scalar TTI Implementations
    250   /// @{
    251   bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
    252                                       unsigned AddressSpace, Align Alignment,
    253                                       bool *Fast) const {
    254     EVT E = EVT::getIntegerVT(Context, BitWidth);
    255     return getTLI()->allowsMisalignedMemoryAccesses(
    256         E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
    257   }
    258 
    259   bool hasBranchDivergence() { return false; }
    260 
    261   bool useGPUDivergenceAnalysis() { return false; }
    262 
    263   bool isSourceOfDivergence(const Value *V) { return false; }
    264 
    265   bool isAlwaysUniform(const Value *V) { return false; }
    266 
    267   unsigned getFlatAddressSpace() {
    268     // Return an invalid address space.
    269     return -1;
    270   }
    271 
    272   bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
    273                                   Intrinsic::ID IID) const {
    274     return false;
    275   }
    276 
    277   bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    278     return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
    279   }
    280 
    281   unsigned getAssumedAddrSpace(const Value *V) const {
    282     return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
    283   }
    284 
    285   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
    286                                           Value *NewV) const {
    287     return nullptr;
    288   }
    289 
    290   bool isLegalAddImmediate(int64_t imm) {
    291     return getTLI()->isLegalAddImmediate(imm);
    292   }
    293 
    294   bool isLegalICmpImmediate(int64_t imm) {
    295     return getTLI()->isLegalICmpImmediate(imm);
    296   }
    297 
    298   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
    299                              bool HasBaseReg, int64_t Scale,
    300                              unsigned AddrSpace, Instruction *I = nullptr) {
    301     TargetLoweringBase::AddrMode AM;
    302     AM.BaseGV = BaseGV;
    303     AM.BaseOffs = BaseOffset;
    304     AM.HasBaseReg = HasBaseReg;
    305     AM.Scale = Scale;
    306     return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
    307   }
    308 
    309   bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
    310                           const DataLayout &DL) const {
    311     EVT VT = getTLI()->getValueType(DL, Ty);
    312     return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
    313   }
    314 
    315   bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
    316                            const DataLayout &DL) const {
    317     EVT VT = getTLI()->getValueType(DL, Ty);
    318     return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
    319   }
    320 
    321   bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    322     return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
    323   }
    324 
    325   bool isNumRegsMajorCostOfLSR() {
    326     return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
    327   }
    328 
    329   bool isProfitableLSRChainElement(Instruction *I) {
    330     return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
    331   }
    332 
    333   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
    334                                        int64_t BaseOffset, bool HasBaseReg,
    335                                        int64_t Scale, unsigned AddrSpace) {
    336     TargetLoweringBase::AddrMode AM;
    337     AM.BaseGV = BaseGV;
    338     AM.BaseOffs = BaseOffset;
    339     AM.HasBaseReg = HasBaseReg;
    340     AM.Scale = Scale;
    341     return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
    342   }
    343 
    344   bool isTruncateFree(Type *Ty1, Type *Ty2) {
    345     return getTLI()->isTruncateFree(Ty1, Ty2);
    346   }
    347 
    348   bool isProfitableToHoist(Instruction *I) {
    349     return getTLI()->isProfitableToHoist(I);
    350   }
    351 
    352   bool useAA() const { return getST()->useAA(); }
    353 
    354   bool isTypeLegal(Type *Ty) {
    355     EVT VT = getTLI()->getValueType(DL, Ty);
    356     return getTLI()->isTypeLegal(VT);
    357   }
    358 
    359   InstructionCost getRegUsageForType(Type *Ty) {
    360     InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
    361     assert(Val >= 0 && "Negative cost!");
    362     return Val;
    363   }
    364 
    365   InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
    366                              ArrayRef<const Value *> Operands) {
    367     return BaseT::getGEPCost(PointeeType, Ptr, Operands);
    368   }
    369 
    370   unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
    371                                             unsigned &JumpTableSize,
    372                                             ProfileSummaryInfo *PSI,
    373                                             BlockFrequencyInfo *BFI) {
    374     /// Try to find the estimated number of clusters. Note that the number of
    375     /// clusters identified in this function could be different from the actual
    376     /// numbers found in lowering. This function ignore switches that are
    377     /// lowered with a mix of jump table / bit test / BTree. This function was
    378     /// initially intended to be used when estimating the cost of switch in
    379     /// inline cost heuristic, but it's a generic cost model to be used in other
    380     /// places (e.g., in loop unrolling).
    381     unsigned N = SI.getNumCases();
    382     const TargetLoweringBase *TLI = getTLI();
    383     const DataLayout &DL = this->getDataLayout();
    384 
    385     JumpTableSize = 0;
    386     bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
    387 
    388     // Early exit if both a jump table and bit test are not allowed.
    389     if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
    390       return N;
    391 
    392     APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    393     APInt MinCaseVal = MaxCaseVal;
    394     for (auto CI : SI.cases()) {
    395       const APInt &CaseVal = CI.getCaseValue()->getValue();
    396       if (CaseVal.sgt(MaxCaseVal))
    397         MaxCaseVal = CaseVal;
    398       if (CaseVal.slt(MinCaseVal))
    399         MinCaseVal = CaseVal;
    400     }
    401 
    402     // Check if suitable for a bit test
    403     if (N <= DL.getIndexSizeInBits(0u)) {
    404       SmallPtrSet<const BasicBlock *, 4> Dests;
    405       for (auto I : SI.cases())
    406         Dests.insert(I.getCaseSuccessor());
    407 
    408       if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
    409                                      DL))
    410         return 1;
    411     }
    412 
    413     // Check if suitable for a jump table.
    414     if (IsJTAllowed) {
    415       if (N < 2 || N < TLI->getMinimumJumpTableEntries())
    416         return N;
    417       uint64_t Range =
    418           (MaxCaseVal - MinCaseVal)
    419               .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
    420       // Check whether a range of clusters is dense enough for a jump table
    421       if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
    422         JumpTableSize = Range;
    423         return 1;
    424       }
    425     }
    426     return N;
    427   }
    428 
    429   bool shouldBuildLookupTables() {
    430     const TargetLoweringBase *TLI = getTLI();
    431     return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
    432            TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
    433   }
    434 
    435   bool shouldBuildRelLookupTables() const {
    436     const TargetMachine &TM = getTLI()->getTargetMachine();
    437     // If non-PIC mode, do not generate a relative lookup table.
    438     if (!TM.isPositionIndependent())
    439       return false;
    440 
    441     /// Relative lookup table entries consist of 32-bit offsets.
    442     /// Do not generate relative lookup tables for large code models
    443     /// in 64-bit achitectures where 32-bit offsets might not be enough.
    444     if (TM.getCodeModel() == CodeModel::Medium ||
    445         TM.getCodeModel() == CodeModel::Large)
    446       return false;
    447 
    448     Triple TargetTriple = TM.getTargetTriple();
    449     if (!TargetTriple.isArch64Bit())
    450       return false;
    451 
    452     // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
    453     // there.
    454     if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
    455       return false;
    456 
    457     return true;
    458   }
    459 
    460   bool haveFastSqrt(Type *Ty) {
    461     const TargetLoweringBase *TLI = getTLI();
    462     EVT VT = TLI->getValueType(DL, Ty);
    463     return TLI->isTypeLegal(VT) &&
    464            TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
    465   }
    466 
    467   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    468     return true;
    469   }
    470 
    471   InstructionCost getFPOpCost(Type *Ty) {
    472     // Check whether FADD is available, as a proxy for floating-point in
    473     // general.
    474     const TargetLoweringBase *TLI = getTLI();
    475     EVT VT = TLI->getValueType(DL, Ty);
    476     if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
    477       return TargetTransformInfo::TCC_Basic;
    478     return TargetTransformInfo::TCC_Expensive;
    479   }
    480 
    481   unsigned getInliningThresholdMultiplier() { return 1; }
    482   unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
    483 
    484   int getInlinerVectorBonusPercent() { return 150; }
    485 
    486   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
    487                                TTI::UnrollingPreferences &UP) {
    488     // This unrolling functionality is target independent, but to provide some
    489     // motivation for its intended use, for x86:
    490 
    491     // According to the Intel 64 and IA-32 Architectures Optimization Reference
    492     // Manual, Intel Core models and later have a loop stream detector (and
    493     // associated uop queue) that can benefit from partial unrolling.
    494     // The relevant requirements are:
    495     //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    496     //    taken, and none of them may be calls.
    497     //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
    498 
    499     // According to the Software Optimization Guide for AMD Family 15h
    500     // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    501     // and loop buffer which can benefit from partial unrolling.
    502     // The relevant requirements are:
    503     //  - The loop must have fewer than 16 branches
    504     //  - The loop must have less than 40 uops in all executed loop branches
    505 
    506     // The number of taken branches in a loop is hard to estimate here, and
    507     // benchmarking has revealed that it is better not to be conservative when
    508     // estimating the branch count. As a result, we'll ignore the branch limits
    509     // until someone finds a case where it matters in practice.
    510 
    511     unsigned MaxOps;
    512     const TargetSubtargetInfo *ST = getST();
    513     if (PartialUnrollingThreshold.getNumOccurrences() > 0)
    514       MaxOps = PartialUnrollingThreshold;
    515     else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
    516       MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    517     else
    518       return;
    519 
    520     // Scan the loop: don't unroll loops with calls.
    521     for (BasicBlock *BB : L->blocks()) {
    522       for (Instruction &I : *BB) {
    523         if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
    524           if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
    525             if (!thisT()->isLoweredToCall(F))
    526               continue;
    527           }
    528 
    529           return;
    530         }
    531       }
    532     }
    533 
    534     // Enable runtime and partial unrolling up to the specified size.
    535     // Enable using trip count upper bound to unroll loops.
    536     UP.Partial = UP.Runtime = UP.UpperBound = true;
    537     UP.PartialThreshold = MaxOps;
    538 
    539     // Avoid unrolling when optimizing for size.
    540     UP.OptSizeThreshold = 0;
    541     UP.PartialOptSizeThreshold = 0;
    542 
    543     // Set number of instructions optimized when "back edge"
    544     // becomes "fall through" to default value of 2.
    545     UP.BEInsns = 2;
    546   }
    547 
    548   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
    549                              TTI::PeelingPreferences &PP) {
    550     PP.PeelCount = 0;
    551     PP.AllowPeeling = true;
    552     PP.AllowLoopNestsPeeling = false;
    553     PP.PeelProfiledIterations = true;
    554   }
    555 
    556   bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
    557                                 AssumptionCache &AC,
    558                                 TargetLibraryInfo *LibInfo,
    559                                 HardwareLoopInfo &HWLoopInfo) {
    560     return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
    561   }
    562 
    563   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
    564                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
    565                                    DominatorTree *DT,
    566                                    const LoopAccessInfo *LAI) {
    567     return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
    568   }
    569 
    570   bool emitGetActiveLaneMask() {
    571     return BaseT::emitGetActiveLaneMask();
    572   }
    573 
    574   Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
    575                                                IntrinsicInst &II) {
    576     return BaseT::instCombineIntrinsic(IC, II);
    577   }
    578 
    579   Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
    580                                                      IntrinsicInst &II,
    581                                                      APInt DemandedMask,
    582                                                      KnownBits &Known,
    583                                                      bool &KnownBitsComputed) {
    584     return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
    585                                                    KnownBitsComputed);
    586   }
    587 
    588   Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
    589       InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    590       APInt &UndefElts2, APInt &UndefElts3,
    591       std::function<void(Instruction *, unsigned, APInt, APInt &)>
    592           SimplifyAndSetOp) {
    593     return BaseT::simplifyDemandedVectorEltsIntrinsic(
    594         IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
    595         SimplifyAndSetOp);
    596   }
    597 
    598   InstructionCost getInstructionLatency(const Instruction *I) {
    599     if (isa<LoadInst>(I))
    600       return getST()->getSchedModel().DefaultLoadLatency;
    601 
    602     return BaseT::getInstructionLatency(I);
    603   }
    604 
    605   virtual Optional<unsigned>
    606   getCacheSize(TargetTransformInfo::CacheLevel Level) const {
    607     return Optional<unsigned>(
    608       getST()->getCacheSize(static_cast<unsigned>(Level)));
    609   }
    610 
    611   virtual Optional<unsigned>
    612   getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
    613     Optional<unsigned> TargetResult =
    614         getST()->getCacheAssociativity(static_cast<unsigned>(Level));
    615 
    616     if (TargetResult)
    617       return TargetResult;
    618 
    619     return BaseT::getCacheAssociativity(Level);
    620   }
    621 
    622   virtual unsigned getCacheLineSize() const {
    623     return getST()->getCacheLineSize();
    624   }
    625 
    626   virtual unsigned getPrefetchDistance() const {
    627     return getST()->getPrefetchDistance();
    628   }
    629 
    630   virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
    631                                         unsigned NumStridedMemAccesses,
    632                                         unsigned NumPrefetches,
    633                                         bool HasCall) const {
    634     return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
    635                                          NumPrefetches, HasCall);
    636   }
    637 
    638   virtual unsigned getMaxPrefetchIterationsAhead() const {
    639     return getST()->getMaxPrefetchIterationsAhead();
    640   }
    641 
    642   virtual bool enableWritePrefetching() const {
    643     return getST()->enableWritePrefetching();
    644   }
    645 
    646   /// @}
    647 
    648   /// \name Vector TTI Implementations
    649   /// @{
    650 
    651   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
    652     return TypeSize::getFixed(32);
    653   }
    654 
    655   Optional<unsigned> getMaxVScale() const { return None; }
    656 
    657   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
    658   /// are set if the demanded result elements need to be inserted and/or
    659   /// extracted from vectors.
    660   InstructionCost getScalarizationOverhead(VectorType *InTy,
    661                                            const APInt &DemandedElts,
    662                                            bool Insert, bool Extract) {
    663     /// FIXME: a bitfield is not a reasonable abstraction for talking about
    664     /// which elements are needed from a scalable vector
    665     auto *Ty = cast<FixedVectorType>(InTy);
    666 
    667     assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
    668            "Vector size mismatch");
    669 
    670     InstructionCost Cost = 0;
    671 
    672     for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
    673       if (!DemandedElts[i])
    674         continue;
    675       if (Insert)
    676         Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
    677       if (Extract)
    678         Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    679     }
    680 
    681     return Cost;
    682   }
    683 
    684   /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
    685   InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
    686                                            bool Extract) {
    687     auto *Ty = cast<FixedVectorType>(InTy);
    688 
    689     APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
    690     return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
    691   }
    692 
    693   /// Estimate the overhead of scalarizing an instructions unique
    694   /// non-constant operands. The (potentially vector) types to use for each of
    695   /// argument are passes via Tys.
    696   InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
    697                                                    ArrayRef<Type *> Tys) {
    698     assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
    699 
    700     InstructionCost Cost = 0;
    701     SmallPtrSet<const Value*, 4> UniqueOperands;
    702     for (int I = 0, E = Args.size(); I != E; I++) {
    703       // Disregard things like metadata arguments.
    704       const Value *A = Args[I];
    705       Type *Ty = Tys[I];
    706       if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
    707           !Ty->isPtrOrPtrVectorTy())
    708         continue;
    709 
    710       if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
    711         if (auto *VecTy = dyn_cast<VectorType>(Ty))
    712           Cost += getScalarizationOverhead(VecTy, false, true);
    713       }
    714     }
    715 
    716     return Cost;
    717   }
    718 
    719   /// Estimate the overhead of scalarizing the inputs and outputs of an
    720   /// instruction, with return type RetTy and arguments Args of type Tys. If
    721   /// Args are unknown (empty), then the cost associated with one argument is
    722   /// added as a heuristic.
    723   InstructionCost getScalarizationOverhead(VectorType *RetTy,
    724                                            ArrayRef<const Value *> Args,
    725                                            ArrayRef<Type *> Tys) {
    726     InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
    727     if (!Args.empty())
    728       Cost += getOperandsScalarizationOverhead(Args, Tys);
    729     else
    730       // When no information on arguments is provided, we add the cost
    731       // associated with one argument as a heuristic.
    732       Cost += getScalarizationOverhead(RetTy, false, true);
    733 
    734     return Cost;
    735   }
    736 
    737   unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
    738 
    739   InstructionCost getArithmeticInstrCost(
    740       unsigned Opcode, Type *Ty,
    741       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
    742       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
    743       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
    744       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
    745       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
    746       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
    747       const Instruction *CxtI = nullptr) {
    748     // Check if any of the operands are vector operands.
    749     const TargetLoweringBase *TLI = getTLI();
    750     int ISD = TLI->InstructionOpcodeToISD(Opcode);
    751     assert(ISD && "Invalid opcode");
    752 
    753     // TODO: Handle more cost kinds.
    754     if (CostKind != TTI::TCK_RecipThroughput)
    755       return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
    756                                            Opd1Info, Opd2Info,
    757                                            Opd1PropInfo, Opd2PropInfo,
    758                                            Args, CxtI);
    759 
    760     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
    761 
    762     bool IsFloat = Ty->isFPOrFPVectorTy();
    763     // Assume that floating point arithmetic operations cost twice as much as
    764     // integer operations.
    765     InstructionCost OpCost = (IsFloat ? 2 : 1);
    766 
    767     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
    768       // The operation is legal. Assume it costs 1.
    769       // TODO: Once we have extract/insert subvector cost we need to use them.
    770       return LT.first * OpCost;
    771     }
    772 
    773     if (!TLI->isOperationExpand(ISD, LT.second)) {
    774       // If the operation is custom lowered, then assume that the code is twice
    775       // as expensive.
    776       return LT.first * 2 * OpCost;
    777     }
    778 
    779     // Else, assume that we need to scalarize this op.
    780     // TODO: If one of the types get legalized by splitting, handle this
    781     // similarly to what getCastInstrCost() does.
    782     if (auto *VTy = dyn_cast<VectorType>(Ty)) {
    783       unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
    784       InstructionCost Cost = thisT()->getArithmeticInstrCost(
    785           Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
    786           Opd1PropInfo, Opd2PropInfo, Args, CxtI);
    787       // Return the cost of multiple scalar invocation plus the cost of
    788       // inserting and extracting the values.
    789       SmallVector<Type *> Tys(Args.size(), Ty);
    790       return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
    791     }
    792 
    793     // We don't know anything about this scalar instruction.
    794     return OpCost;
    795   }
    796 
    797   TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
    798                                               ArrayRef<int> Mask) const {
    799     int Limit = Mask.size() * 2;
    800     if (Mask.empty() ||
    801         // Extra check required by isSingleSourceMaskImpl function (called by
    802         // ShuffleVectorInst::isSingleSourceMask).
    803         any_of(Mask, [Limit](int I) { return I >= Limit; }))
    804       return Kind;
    805     switch (Kind) {
    806     case TTI::SK_PermuteSingleSrc:
    807       if (ShuffleVectorInst::isReverseMask(Mask))
    808         return TTI::SK_Reverse;
    809       if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
    810         return TTI::SK_Broadcast;
    811       break;
    812     case TTI::SK_PermuteTwoSrc:
    813       if (ShuffleVectorInst::isSelectMask(Mask))
    814         return TTI::SK_Select;
    815       if (ShuffleVectorInst::isTransposeMask(Mask))
    816         return TTI::SK_Transpose;
    817       break;
    818     case TTI::SK_Select:
    819     case TTI::SK_Reverse:
    820     case TTI::SK_Broadcast:
    821     case TTI::SK_Transpose:
    822     case TTI::SK_InsertSubvector:
    823     case TTI::SK_ExtractSubvector:
    824       break;
    825     }
    826     return Kind;
    827   }
    828 
    829   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
    830                                  ArrayRef<int> Mask, int Index,
    831                                  VectorType *SubTp) {
    832 
    833     switch (improveShuffleKindFromMask(Kind, Mask)) {
    834     case TTI::SK_Broadcast:
    835       return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
    836     case TTI::SK_Select:
    837     case TTI::SK_Reverse:
    838     case TTI::SK_Transpose:
    839     case TTI::SK_PermuteSingleSrc:
    840     case TTI::SK_PermuteTwoSrc:
    841       return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
    842     case TTI::SK_ExtractSubvector:
    843       return getExtractSubvectorOverhead(Tp, Index,
    844                                          cast<FixedVectorType>(SubTp));
    845     case TTI::SK_InsertSubvector:
    846       return getInsertSubvectorOverhead(Tp, Index,
    847                                         cast<FixedVectorType>(SubTp));
    848     }
    849     llvm_unreachable("Unknown TTI::ShuffleKind");
    850   }
    851 
    852   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
    853                                    TTI::CastContextHint CCH,
    854                                    TTI::TargetCostKind CostKind,
    855                                    const Instruction *I = nullptr) {
    856     if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
    857       return 0;
    858 
    859     const TargetLoweringBase *TLI = getTLI();
    860     int ISD = TLI->InstructionOpcodeToISD(Opcode);
    861     assert(ISD && "Invalid opcode");
    862     std::pair<InstructionCost, MVT> SrcLT =
    863         TLI->getTypeLegalizationCost(DL, Src);
    864     std::pair<InstructionCost, MVT> DstLT =
    865         TLI->getTypeLegalizationCost(DL, Dst);
    866 
    867     TypeSize SrcSize = SrcLT.second.getSizeInBits();
    868     TypeSize DstSize = DstLT.second.getSizeInBits();
    869     bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
    870     bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
    871 
    872     switch (Opcode) {
    873     default:
    874       break;
    875     case Instruction::Trunc:
    876       // Check for NOOP conversions.
    877       if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
    878         return 0;
    879       LLVM_FALLTHROUGH;
    880     case Instruction::BitCast:
    881       // Bitcast between types that are legalized to the same type are free and
    882       // assume int to/from ptr of the same size is also free.
    883       if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
    884           SrcSize == DstSize)
    885         return 0;
    886       break;
    887     case Instruction::FPExt:
    888       if (I && getTLI()->isExtFree(I))
    889         return 0;
    890       break;
    891     case Instruction::ZExt:
    892       if (TLI->isZExtFree(SrcLT.second, DstLT.second))
    893         return 0;
    894       LLVM_FALLTHROUGH;
    895     case Instruction::SExt:
    896       if (I && getTLI()->isExtFree(I))
    897         return 0;
    898 
    899       // If this is a zext/sext of a load, return 0 if the corresponding
    900       // extending load exists on target and the result type is legal.
    901       if (CCH == TTI::CastContextHint::Normal) {
    902         EVT ExtVT = EVT::getEVT(Dst);
    903         EVT LoadVT = EVT::getEVT(Src);
    904         unsigned LType =
    905           ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
    906         if (DstLT.first == SrcLT.first &&
    907             TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
    908           return 0;
    909       }
    910       break;
    911     case Instruction::AddrSpaceCast:
    912       if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
    913                                    Dst->getPointerAddressSpace()))
    914         return 0;
    915       break;
    916     }
    917 
    918     auto *SrcVTy = dyn_cast<VectorType>(Src);
    919     auto *DstVTy = dyn_cast<VectorType>(Dst);
    920 
    921     // If the cast is marked as legal (or promote) then assume low cost.
    922     if (SrcLT.first == DstLT.first &&
    923         TLI->isOperationLegalOrPromote(ISD, DstLT.second))
    924       return SrcLT.first;
    925 
    926     // Handle scalar conversions.
    927     if (!SrcVTy && !DstVTy) {
    928       // Just check the op cost. If the operation is legal then assume it costs
    929       // 1.
    930       if (!TLI->isOperationExpand(ISD, DstLT.second))
    931         return 1;
    932 
    933       // Assume that illegal scalar instruction are expensive.
    934       return 4;
    935     }
    936 
    937     // Check vector-to-vector casts.
    938     if (DstVTy && SrcVTy) {
    939       // If the cast is between same-sized registers, then the check is simple.
    940       if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
    941 
    942         // Assume that Zext is done using AND.
    943         if (Opcode == Instruction::ZExt)
    944           return SrcLT.first;
    945 
    946         // Assume that sext is done using SHL and SRA.
    947         if (Opcode == Instruction::SExt)
    948           return SrcLT.first * 2;
    949 
    950         // Just check the op cost. If the operation is legal then assume it
    951         // costs
    952         // 1 and multiply by the type-legalization overhead.
    953         if (!TLI->isOperationExpand(ISD, DstLT.second))
    954           return SrcLT.first * 1;
    955       }
    956 
    957       // If we are legalizing by splitting, query the concrete TTI for the cost
    958       // of casting the original vector twice. We also need to factor in the
    959       // cost of the split itself. Count that as 1, to be consistent with
    960       // TLI->getTypeLegalizationCost().
    961       bool SplitSrc =
    962           TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
    963           TargetLowering::TypeSplitVector;
    964       bool SplitDst =
    965           TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
    966           TargetLowering::TypeSplitVector;
    967       if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
    968           DstVTy->getElementCount().isVector()) {
    969         Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
    970         Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
    971         T *TTI = static_cast<T *>(this);
    972         // If both types need to be split then the split is free.
    973         InstructionCost SplitCost =
    974             (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
    975         return SplitCost +
    976                (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
    977                                           CostKind, I));
    978       }
    979 
    980       // In other cases where the source or destination are illegal, assume
    981       // the operation will get scalarized.
    982       unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
    983       InstructionCost Cost = thisT()->getCastInstrCost(
    984           Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
    985 
    986       // Return the cost of multiple scalar invocation plus the cost of
    987       // inserting and extracting the values.
    988       return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
    989     }
    990 
    991     // We already handled vector-to-vector and scalar-to-scalar conversions.
    992     // This
    993     // is where we handle bitcast between vectors and scalars. We need to assume
    994     //  that the conversion is scalarized in one way or another.
    995     if (Opcode == Instruction::BitCast) {
    996       // Illegal bitcasts are done by storing and loading from a stack slot.
    997       return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
    998              (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
    999     }
   1000 
   1001     llvm_unreachable("Unhandled cast");
   1002   }
   1003 
   1004   InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
   1005                                            VectorType *VecTy, unsigned Index) {
   1006     return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
   1007                                        Index) +
   1008            thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
   1009                                      TTI::CastContextHint::None,
   1010                                      TTI::TCK_RecipThroughput);
   1011   }
   1012 
   1013   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
   1014                                  const Instruction *I = nullptr) {
   1015     return BaseT::getCFInstrCost(Opcode, CostKind, I);
   1016   }
   1017 
   1018   InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
   1019                                      CmpInst::Predicate VecPred,
   1020                                      TTI::TargetCostKind CostKind,
   1021                                      const Instruction *I = nullptr) {
   1022     const TargetLoweringBase *TLI = getTLI();
   1023     int ISD = TLI->InstructionOpcodeToISD(Opcode);
   1024     assert(ISD && "Invalid opcode");
   1025 
   1026     // TODO: Handle other cost kinds.
   1027     if (CostKind != TTI::TCK_RecipThroughput)
   1028       return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
   1029                                        I);
   1030 
   1031     // Selects on vectors are actually vector selects.
   1032     if (ISD == ISD::SELECT) {
   1033       assert(CondTy && "CondTy must exist");
   1034       if (CondTy->isVectorTy())
   1035         ISD = ISD::VSELECT;
   1036     }
   1037     std::pair<InstructionCost, MVT> LT =
   1038         TLI->getTypeLegalizationCost(DL, ValTy);
   1039 
   1040     if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
   1041         !TLI->isOperationExpand(ISD, LT.second)) {
   1042       // The operation is legal. Assume it costs 1. Multiply
   1043       // by the type-legalization overhead.
   1044       return LT.first * 1;
   1045     }
   1046 
   1047     // Otherwise, assume that the cast is scalarized.
   1048     // TODO: If one of the types get legalized by splitting, handle this
   1049     // similarly to what getCastInstrCost() does.
   1050     if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
   1051       unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
   1052       if (CondTy)
   1053         CondTy = CondTy->getScalarType();
   1054       InstructionCost Cost = thisT()->getCmpSelInstrCost(
   1055           Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
   1056 
   1057       // Return the cost of multiple scalar invocation plus the cost of
   1058       // inserting and extracting the values.
   1059       return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
   1060     }
   1061 
   1062     // Unknown scalar opcode.
   1063     return 1;
   1064   }
   1065 
   1066   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
   1067                                      unsigned Index) {
   1068     std::pair<InstructionCost, MVT> LT =
   1069         getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
   1070 
   1071     return LT.first;
   1072   }
   1073 
   1074   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
   1075                                   MaybeAlign Alignment, unsigned AddressSpace,
   1076                                   TTI::TargetCostKind CostKind,
   1077                                   const Instruction *I = nullptr) {
   1078     assert(!Src->isVoidTy() && "Invalid type");
   1079     // Assume types, such as structs, are expensive.
   1080     if (getTLI()->getValueType(DL, Src,  true) == MVT::Other)
   1081       return 4;
   1082     std::pair<InstructionCost, MVT> LT =
   1083         getTLI()->getTypeLegalizationCost(DL, Src);
   1084 
   1085     // Assuming that all loads of legal types cost 1.
   1086     InstructionCost Cost = LT.first;
   1087     if (CostKind != TTI::TCK_RecipThroughput)
   1088       return Cost;
   1089 
   1090     if (Src->isVectorTy() &&
   1091         // In practice it's not currently possible to have a change in lane
   1092         // length for extending loads or truncating stores so both types should
   1093         // have the same scalable property.
   1094         TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
   1095                             LT.second.getSizeInBits())) {
   1096       // This is a vector load that legalizes to a larger type than the vector
   1097       // itself. Unless the corresponding extending load or truncating store is
   1098       // legal, then this will scalarize.
   1099       TargetLowering::LegalizeAction LA = TargetLowering::Expand;
   1100       EVT MemVT = getTLI()->getValueType(DL, Src);
   1101       if (Opcode == Instruction::Store)
   1102         LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
   1103       else
   1104         LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
   1105 
   1106       if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
   1107         // This is a vector load/store for some illegal type that is scalarized.
   1108         // We must account for the cost of building or decomposing the vector.
   1109         Cost += getScalarizationOverhead(cast<VectorType>(Src),
   1110                                          Opcode != Instruction::Store,
   1111                                          Opcode == Instruction::Store);
   1112       }
   1113     }
   1114 
   1115     return Cost;
   1116   }
   1117 
   1118   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
   1119                                         Align Alignment, unsigned AddressSpace,
   1120                                         TTI::TargetCostKind CostKind) {
   1121     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
   1122                                        CostKind);
   1123   }
   1124 
   1125   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
   1126                                          const Value *Ptr, bool VariableMask,
   1127                                          Align Alignment,
   1128                                          TTI::TargetCostKind CostKind,
   1129                                          const Instruction *I = nullptr) {
   1130     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
   1131                                        true, CostKind);
   1132   }
   1133 
   1134   InstructionCost getInterleavedMemoryOpCost(
   1135       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
   1136       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
   1137       bool UseMaskForCond = false, bool UseMaskForGaps = false) {
   1138     auto *VT = cast<FixedVectorType>(VecTy);
   1139 
   1140     unsigned NumElts = VT->getNumElements();
   1141     assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
   1142 
   1143     unsigned NumSubElts = NumElts / Factor;
   1144     auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
   1145 
   1146     // Firstly, the cost of load/store operation.
   1147     InstructionCost Cost;
   1148     if (UseMaskForCond || UseMaskForGaps)
   1149       Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
   1150                                             AddressSpace, CostKind);
   1151     else
   1152       Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
   1153                                       CostKind);
   1154 
   1155     // Legalize the vector type, and get the legalized and unlegalized type
   1156     // sizes.
   1157     MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
   1158     unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
   1159     unsigned VecTyLTSize = VecTyLT.getStoreSize();
   1160 
   1161     // Scale the cost of the memory operation by the fraction of legalized
   1162     // instructions that will actually be used. We shouldn't account for the
   1163     // cost of dead instructions since they will be removed.
   1164     //
   1165     // E.g., An interleaved load of factor 8:
   1166     //       %vec = load <16 x i64>, <16 x i64>* %ptr
   1167     //       %v0 = shufflevector %vec, undef, <0, 8>
   1168     //
   1169     // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
   1170     // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
   1171     // type). The other loads are unused.
   1172     //
   1173     // We only scale the cost of loads since interleaved store groups aren't
   1174     // allowed to have gaps.
   1175     if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
   1176       // The number of loads of a legal type it will take to represent a load
   1177       // of the unlegalized vector type.
   1178       unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
   1179 
   1180       // The number of elements of the unlegalized type that correspond to a
   1181       // single legal instruction.
   1182       unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
   1183 
   1184       // Determine which legal instructions will be used.
   1185       BitVector UsedInsts(NumLegalInsts, false);
   1186       for (unsigned Index : Indices)
   1187         for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
   1188           UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
   1189 
   1190       // Scale the cost of the load by the fraction of legal instructions that
   1191       // will be used.
   1192       Cost *= UsedInsts.count() / NumLegalInsts;
   1193     }
   1194 
   1195     // Then plus the cost of interleave operation.
   1196     if (Opcode == Instruction::Load) {
   1197       // The interleave cost is similar to extract sub vectors' elements
   1198       // from the wide vector, and insert them into sub vectors.
   1199       //
   1200       // E.g. An interleaved load of factor 2 (with one member of index 0):
   1201       //      %vec = load <8 x i32>, <8 x i32>* %ptr
   1202       //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
   1203       // The cost is estimated as extract elements at 0, 2, 4, 6 from the
   1204       // <8 x i32> vector and insert them into a <4 x i32> vector.
   1205 
   1206       assert(Indices.size() <= Factor &&
   1207              "Interleaved memory op has too many members");
   1208 
   1209       for (unsigned Index : Indices) {
   1210         assert(Index < Factor && "Invalid index for interleaved memory op");
   1211 
   1212         // Extract elements from loaded vector for each sub vector.
   1213         for (unsigned i = 0; i < NumSubElts; i++)
   1214           Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
   1215                                               Index + i * Factor);
   1216       }
   1217 
   1218       InstructionCost InsSubCost = 0;
   1219       for (unsigned i = 0; i < NumSubElts; i++)
   1220         InsSubCost +=
   1221             thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i);
   1222 
   1223       Cost += Indices.size() * InsSubCost;
   1224     } else {
   1225       // The interleave cost is extract all elements from sub vectors, and
   1226       // insert them into the wide vector.
   1227       //
   1228       // E.g. An interleaved store of factor 2:
   1229       //      %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
   1230       //      store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
   1231       // The cost is estimated as extract all elements from both <4 x i32>
   1232       // vectors and insert into the <8 x i32> vector.
   1233 
   1234       InstructionCost ExtSubCost = 0;
   1235       for (unsigned i = 0; i < NumSubElts; i++)
   1236         ExtSubCost +=
   1237             thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
   1238       Cost += ExtSubCost * Factor;
   1239 
   1240       for (unsigned i = 0; i < NumElts; i++)
   1241         Cost += static_cast<T *>(this)
   1242                     ->getVectorInstrCost(Instruction::InsertElement, VT, i);
   1243     }
   1244 
   1245     if (!UseMaskForCond)
   1246       return Cost;
   1247 
   1248     Type *I8Type = Type::getInt8Ty(VT->getContext());
   1249     auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
   1250     SubVT = FixedVectorType::get(I8Type, NumSubElts);
   1251 
   1252     // The Mask shuffling cost is extract all the elements of the Mask
   1253     // and insert each of them Factor times into the wide vector:
   1254     //
   1255     // E.g. an interleaved group with factor 3:
   1256     //    %mask = icmp ult <8 x i32> %vec1, %vec2
   1257     //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
   1258     //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
   1259     // The cost is estimated as extract all mask elements from the <8xi1> mask
   1260     // vector and insert them factor times into the <24xi1> shuffled mask
   1261     // vector.
   1262     for (unsigned i = 0; i < NumSubElts; i++)
   1263       Cost +=
   1264           thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
   1265 
   1266     for (unsigned i = 0; i < NumElts; i++)
   1267       Cost +=
   1268           thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
   1269 
   1270     // The Gaps mask is invariant and created outside the loop, therefore the
   1271     // cost of creating it is not accounted for here. However if we have both
   1272     // a MaskForGaps and some other mask that guards the execution of the
   1273     // memory access, we need to account for the cost of And-ing the two masks
   1274     // inside the loop.
   1275     if (UseMaskForGaps)
   1276       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
   1277                                               CostKind);
   1278 
   1279     return Cost;
   1280   }
   1281 
   1282   /// Get intrinsic cost based on arguments.
   1283   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   1284                                         TTI::TargetCostKind CostKind) {
   1285     // Check for generically free intrinsics.
   1286     if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
   1287       return 0;
   1288 
   1289     // Assume that target intrinsics are cheap.
   1290     Intrinsic::ID IID = ICA.getID();
   1291     if (Function::isTargetIntrinsic(IID))
   1292       return TargetTransformInfo::TCC_Basic;
   1293 
   1294     if (ICA.isTypeBasedOnly())
   1295       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
   1296 
   1297     Type *RetTy = ICA.getReturnType();
   1298 
   1299     ElementCount RetVF =
   1300         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
   1301                              : ElementCount::getFixed(1));
   1302     const IntrinsicInst *I = ICA.getInst();
   1303     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
   1304     FastMathFlags FMF = ICA.getFlags();
   1305     switch (IID) {
   1306     default:
   1307       break;
   1308 
   1309     case Intrinsic::cttz:
   1310       // FIXME: If necessary, this should go in target-specific overrides.
   1311       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
   1312         return TargetTransformInfo::TCC_Basic;
   1313       break;
   1314 
   1315     case Intrinsic::ctlz:
   1316       // FIXME: If necessary, this should go in target-specific overrides.
   1317       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
   1318         return TargetTransformInfo::TCC_Basic;
   1319       break;
   1320 
   1321     case Intrinsic::memcpy:
   1322       return thisT()->getMemcpyCost(ICA.getInst());
   1323 
   1324     case Intrinsic::masked_scatter: {
   1325       const Value *Mask = Args[3];
   1326       bool VarMask = !isa<Constant>(Mask);
   1327       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
   1328       return thisT()->getGatherScatterOpCost(Instruction::Store,
   1329                                              ICA.getArgTypes()[0], Args[1],
   1330                                              VarMask, Alignment, CostKind, I);
   1331     }
   1332     case Intrinsic::masked_gather: {
   1333       const Value *Mask = Args[2];
   1334       bool VarMask = !isa<Constant>(Mask);
   1335       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
   1336       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
   1337                                              VarMask, Alignment, CostKind, I);
   1338     }
   1339     case Intrinsic::experimental_stepvector: {
   1340       if (isa<ScalableVectorType>(RetTy))
   1341         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
   1342       // The cost of materialising a constant integer vector.
   1343       return TargetTransformInfo::TCC_Basic;
   1344     }
   1345     case Intrinsic::experimental_vector_extract: {
   1346       // FIXME: Handle case where a scalable vector is extracted from a scalable
   1347       // vector
   1348       if (isa<ScalableVectorType>(RetTy))
   1349         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
   1350       unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
   1351       return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
   1352                                      cast<VectorType>(Args[0]->getType()), None,
   1353                                      Index, cast<VectorType>(RetTy));
   1354     }
   1355     case Intrinsic::experimental_vector_insert: {
   1356       // FIXME: Handle case where a scalable vector is inserted into a scalable
   1357       // vector
   1358       if (isa<ScalableVectorType>(Args[1]->getType()))
   1359         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
   1360       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
   1361       return thisT()->getShuffleCost(
   1362           TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
   1363           Index, cast<VectorType>(Args[1]->getType()));
   1364     }
   1365     case Intrinsic::experimental_vector_reverse: {
   1366       return thisT()->getShuffleCost(TTI::SK_Reverse,
   1367                                      cast<VectorType>(Args[0]->getType()), None,
   1368                                      0, cast<VectorType>(RetTy));
   1369     }
   1370     case Intrinsic::vector_reduce_add:
   1371     case Intrinsic::vector_reduce_mul:
   1372     case Intrinsic::vector_reduce_and:
   1373     case Intrinsic::vector_reduce_or:
   1374     case Intrinsic::vector_reduce_xor:
   1375     case Intrinsic::vector_reduce_smax:
   1376     case Intrinsic::vector_reduce_smin:
   1377     case Intrinsic::vector_reduce_fmax:
   1378     case Intrinsic::vector_reduce_fmin:
   1379     case Intrinsic::vector_reduce_umax:
   1380     case Intrinsic::vector_reduce_umin: {
   1381       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
   1382       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
   1383     }
   1384     case Intrinsic::vector_reduce_fadd:
   1385     case Intrinsic::vector_reduce_fmul: {
   1386       IntrinsicCostAttributes Attrs(
   1387           IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
   1388       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
   1389     }
   1390     case Intrinsic::fshl:
   1391     case Intrinsic::fshr: {
   1392       if (isa<ScalableVectorType>(RetTy))
   1393         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
   1394       const Value *X = Args[0];
   1395       const Value *Y = Args[1];
   1396       const Value *Z = Args[2];
   1397       TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
   1398       TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
   1399       TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
   1400       TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
   1401       TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
   1402       OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
   1403                                                               : TTI::OP_None;
   1404       // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
   1405       // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
   1406       InstructionCost Cost = 0;
   1407       Cost +=
   1408           thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
   1409       Cost +=
   1410           thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
   1411       Cost += thisT()->getArithmeticInstrCost(
   1412           BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
   1413       Cost += thisT()->getArithmeticInstrCost(
   1414           BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
   1415       // Non-constant shift amounts requires a modulo.
   1416       if (OpKindZ != TTI::OK_UniformConstantValue &&
   1417           OpKindZ != TTI::OK_NonUniformConstantValue)
   1418         Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
   1419                                                 CostKind, OpKindZ, OpKindBW,
   1420                                                 OpPropsZ, OpPropsBW);
   1421       // For non-rotates (X != Y) we must add shift-by-zero handling costs.
   1422       if (X != Y) {
   1423         Type *CondTy = RetTy->getWithNewBitWidth(1);
   1424         Cost +=
   1425             thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
   1426                                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
   1427         Cost +=
   1428             thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
   1429                                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
   1430       }
   1431       return Cost;
   1432     }
   1433     }
   1434 
   1435     // Assume that we need to scalarize this intrinsic.
   1436     // Compute the scalarization overhead based on Args for a vector
   1437     // intrinsic.
   1438     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
   1439     if (RetVF.isVector() && !RetVF.isScalable()) {
   1440       ScalarizationCost = 0;
   1441       if (!RetTy->isVoidTy())
   1442         ScalarizationCost +=
   1443             getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
   1444       ScalarizationCost +=
   1445           getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
   1446     }
   1447 
   1448     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
   1449                                   ScalarizationCost);
   1450     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
   1451   }
   1452 
   1453   /// Get intrinsic cost based on argument types.
   1454   /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
   1455   /// cost of scalarizing the arguments and the return value will be computed
   1456   /// based on types.
   1457   InstructionCost
   1458   getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   1459                                  TTI::TargetCostKind CostKind) {
   1460     Intrinsic::ID IID = ICA.getID();
   1461     Type *RetTy = ICA.getReturnType();
   1462     const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
   1463     FastMathFlags FMF = ICA.getFlags();
   1464     InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
   1465     bool SkipScalarizationCost = ICA.skipScalarizationCost();
   1466 
   1467     VectorType *VecOpTy = nullptr;
   1468     if (!Tys.empty()) {
   1469       // The vector reduction operand is operand 0 except for fadd/fmul.
   1470       // Their operand 0 is a scalar start value, so the vector op is operand 1.
   1471       unsigned VecTyIndex = 0;
   1472       if (IID == Intrinsic::vector_reduce_fadd ||
   1473           IID == Intrinsic::vector_reduce_fmul)
   1474         VecTyIndex = 1;
   1475       assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
   1476       VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
   1477     }
   1478 
   1479     // Library call cost - other than size, make it expensive.
   1480     unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
   1481     SmallVector<unsigned, 2> ISDs;
   1482     switch (IID) {
   1483     default: {
   1484       // Scalable vectors cannot be scalarized, so return Invalid.
   1485       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
   1486             return isa<ScalableVectorType>(Ty);
   1487           }))
   1488         return InstructionCost::getInvalid();
   1489 
   1490       // Assume that we need to scalarize this intrinsic.
   1491       InstructionCost ScalarizationCost =
   1492           SkipScalarizationCost ? ScalarizationCostPassed : 0;
   1493       unsigned ScalarCalls = 1;
   1494       Type *ScalarRetTy = RetTy;
   1495       if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
   1496         if (!SkipScalarizationCost)
   1497           ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
   1498         ScalarCalls = std::max(ScalarCalls,
   1499                                cast<FixedVectorType>(RetVTy)->getNumElements());
   1500         ScalarRetTy = RetTy->getScalarType();
   1501       }
   1502       SmallVector<Type *, 4> ScalarTys;
   1503       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
   1504         Type *Ty = Tys[i];
   1505         if (auto *VTy = dyn_cast<VectorType>(Ty)) {
   1506           if (!SkipScalarizationCost)
   1507             ScalarizationCost += getScalarizationOverhead(VTy, false, true);
   1508           ScalarCalls = std::max(ScalarCalls,
   1509                                  cast<FixedVectorType>(VTy)->getNumElements());
   1510           Ty = Ty->getScalarType();
   1511         }
   1512         ScalarTys.push_back(Ty);
   1513       }
   1514       if (ScalarCalls == 1)
   1515         return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
   1516 
   1517       IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
   1518       InstructionCost ScalarCost =
   1519           thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
   1520 
   1521       return ScalarCalls * ScalarCost + ScalarizationCost;
   1522     }
   1523     // Look for intrinsics that can be lowered directly or turned into a scalar
   1524     // intrinsic call.
   1525     case Intrinsic::sqrt:
   1526       ISDs.push_back(ISD::FSQRT);
   1527       break;
   1528     case Intrinsic::sin:
   1529       ISDs.push_back(ISD::FSIN);
   1530       break;
   1531     case Intrinsic::cos:
   1532       ISDs.push_back(ISD::FCOS);
   1533       break;
   1534     case Intrinsic::exp:
   1535       ISDs.push_back(ISD::FEXP);
   1536       break;
   1537     case Intrinsic::exp2:
   1538       ISDs.push_back(ISD::FEXP2);
   1539       break;
   1540     case Intrinsic::log:
   1541       ISDs.push_back(ISD::FLOG);
   1542       break;
   1543     case Intrinsic::log10:
   1544       ISDs.push_back(ISD::FLOG10);
   1545       break;
   1546     case Intrinsic::log2:
   1547       ISDs.push_back(ISD::FLOG2);
   1548       break;
   1549     case Intrinsic::fabs:
   1550       ISDs.push_back(ISD::FABS);
   1551       break;
   1552     case Intrinsic::canonicalize:
   1553       ISDs.push_back(ISD::FCANONICALIZE);
   1554       break;
   1555     case Intrinsic::minnum:
   1556       ISDs.push_back(ISD::FMINNUM);
   1557       break;
   1558     case Intrinsic::maxnum:
   1559       ISDs.push_back(ISD::FMAXNUM);
   1560       break;
   1561     case Intrinsic::minimum:
   1562       ISDs.push_back(ISD::FMINIMUM);
   1563       break;
   1564     case Intrinsic::maximum:
   1565       ISDs.push_back(ISD::FMAXIMUM);
   1566       break;
   1567     case Intrinsic::copysign:
   1568       ISDs.push_back(ISD::FCOPYSIGN);
   1569       break;
   1570     case Intrinsic::floor:
   1571       ISDs.push_back(ISD::FFLOOR);
   1572       break;
   1573     case Intrinsic::ceil:
   1574       ISDs.push_back(ISD::FCEIL);
   1575       break;
   1576     case Intrinsic::trunc:
   1577       ISDs.push_back(ISD::FTRUNC);
   1578       break;
   1579     case Intrinsic::nearbyint:
   1580       ISDs.push_back(ISD::FNEARBYINT);
   1581       break;
   1582     case Intrinsic::rint:
   1583       ISDs.push_back(ISD::FRINT);
   1584       break;
   1585     case Intrinsic::round:
   1586       ISDs.push_back(ISD::FROUND);
   1587       break;
   1588     case Intrinsic::roundeven:
   1589       ISDs.push_back(ISD::FROUNDEVEN);
   1590       break;
   1591     case Intrinsic::pow:
   1592       ISDs.push_back(ISD::FPOW);
   1593       break;
   1594     case Intrinsic::fma:
   1595       ISDs.push_back(ISD::FMA);
   1596       break;
   1597     case Intrinsic::fmuladd:
   1598       ISDs.push_back(ISD::FMA);
   1599       break;
   1600     case Intrinsic::experimental_constrained_fmuladd:
   1601       ISDs.push_back(ISD::STRICT_FMA);
   1602       break;
   1603     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
   1604     case Intrinsic::lifetime_start:
   1605     case Intrinsic::lifetime_end:
   1606     case Intrinsic::sideeffect:
   1607     case Intrinsic::pseudoprobe:
   1608       return 0;
   1609     case Intrinsic::masked_store: {
   1610       Type *Ty = Tys[0];
   1611       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
   1612       return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
   1613                                             CostKind);
   1614     }
   1615     case Intrinsic::masked_load: {
   1616       Type *Ty = RetTy;
   1617       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
   1618       return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
   1619                                             CostKind);
   1620     }
   1621     case Intrinsic::vector_reduce_add:
   1622       return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
   1623                                                  /*IsPairwiseForm=*/false,
   1624                                                  CostKind);
   1625     case Intrinsic::vector_reduce_mul:
   1626       return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
   1627                                                  /*IsPairwiseForm=*/false,
   1628                                                  CostKind);
   1629     case Intrinsic::vector_reduce_and:
   1630       return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
   1631                                                  /*IsPairwiseForm=*/false,
   1632                                                  CostKind);
   1633     case Intrinsic::vector_reduce_or:
   1634       return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
   1635                                                  /*IsPairwiseForm=*/false,
   1636                                                  CostKind);
   1637     case Intrinsic::vector_reduce_xor:
   1638       return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
   1639                                                  /*IsPairwiseForm=*/false,
   1640                                                  CostKind);
   1641     case Intrinsic::vector_reduce_fadd:
   1642       // FIXME: Add new flag for cost of strict reductions.
   1643       return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
   1644                                                  /*IsPairwiseForm=*/false,
   1645                                                  CostKind);
   1646     case Intrinsic::vector_reduce_fmul:
   1647       // FIXME: Add new flag for cost of strict reductions.
   1648       return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
   1649                                                  /*IsPairwiseForm=*/false,
   1650                                                  CostKind);
   1651     case Intrinsic::vector_reduce_smax:
   1652     case Intrinsic::vector_reduce_smin:
   1653     case Intrinsic::vector_reduce_fmax:
   1654     case Intrinsic::vector_reduce_fmin:
   1655       return thisT()->getMinMaxReductionCost(
   1656           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
   1657           /*IsPairwiseForm=*/false,
   1658           /*IsUnsigned=*/false, CostKind);
   1659     case Intrinsic::vector_reduce_umax:
   1660     case Intrinsic::vector_reduce_umin:
   1661       return thisT()->getMinMaxReductionCost(
   1662           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
   1663           /*IsPairwiseForm=*/false,
   1664           /*IsUnsigned=*/true, CostKind);
   1665     case Intrinsic::abs:
   1666     case Intrinsic::smax:
   1667     case Intrinsic::smin:
   1668     case Intrinsic::umax:
   1669     case Intrinsic::umin: {
   1670       // abs(X) = select(icmp(X,0),X,sub(0,X))
   1671       // minmax(X,Y) = select(icmp(X,Y),X,Y)
   1672       Type *CondTy = RetTy->getWithNewBitWidth(1);
   1673       InstructionCost Cost = 0;
   1674       // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
   1675       Cost +=
   1676           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
   1677                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
   1678       Cost +=
   1679           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
   1680                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
   1681       // TODO: Should we add an OperandValueProperties::OP_Zero property?
   1682       if (IID == Intrinsic::abs)
   1683         Cost += thisT()->getArithmeticInstrCost(
   1684             BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
   1685       return Cost;
   1686     }
   1687     case Intrinsic::sadd_sat:
   1688     case Intrinsic::ssub_sat: {
   1689       Type *CondTy = RetTy->getWithNewBitWidth(1);
   1690 
   1691       Type *OpTy = StructType::create({RetTy, CondTy});
   1692       Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
   1693                                      ? Intrinsic::sadd_with_overflow
   1694                                      : Intrinsic::ssub_with_overflow;
   1695 
   1696       // SatMax -> Overflow && SumDiff < 0
   1697       // SatMin -> Overflow && SumDiff >= 0
   1698       InstructionCost Cost = 0;
   1699       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
   1700                                     nullptr, ScalarizationCostPassed);
   1701       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
   1702       Cost +=
   1703           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
   1704                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
   1705       Cost += 2 * thisT()->getCmpSelInstrCost(
   1706                       BinaryOperator::Select, RetTy, CondTy,
   1707                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
   1708       return Cost;
   1709     }
   1710     case Intrinsic::uadd_sat:
   1711     case Intrinsic::usub_sat: {
   1712       Type *CondTy = RetTy->getWithNewBitWidth(1);
   1713 
   1714       Type *OpTy = StructType::create({RetTy, CondTy});
   1715       Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
   1716                                      ? Intrinsic::uadd_with_overflow
   1717                                      : Intrinsic::usub_with_overflow;
   1718 
   1719       InstructionCost Cost = 0;
   1720       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
   1721                                     nullptr, ScalarizationCostPassed);
   1722       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
   1723       Cost +=
   1724           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
   1725                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
   1726       return Cost;
   1727     }
   1728     case Intrinsic::smul_fix:
   1729     case Intrinsic::umul_fix: {
   1730       unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
   1731       Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
   1732 
   1733       unsigned ExtOp =
   1734           IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
   1735       TTI::CastContextHint CCH = TTI::CastContextHint::None;
   1736 
   1737       InstructionCost Cost = 0;
   1738       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
   1739       Cost +=
   1740           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
   1741       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
   1742                                             CCH, CostKind);
   1743       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
   1744                                               CostKind, TTI::OK_AnyValue,
   1745                                               TTI::OK_UniformConstantValue);
   1746       Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
   1747                                               TTI::OK_AnyValue,
   1748                                               TTI::OK_UniformConstantValue);
   1749       Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
   1750       return Cost;
   1751     }
   1752     case Intrinsic::sadd_with_overflow:
   1753     case Intrinsic::ssub_with_overflow: {
   1754       Type *SumTy = RetTy->getContainedType(0);
   1755       Type *OverflowTy = RetTy->getContainedType(1);
   1756       unsigned Opcode = IID == Intrinsic::sadd_with_overflow
   1757                             ? BinaryOperator::Add
   1758                             : BinaryOperator::Sub;
   1759 
   1760       //   LHSSign -> LHS >= 0
   1761       //   RHSSign -> RHS >= 0
   1762       //   SumSign -> Sum >= 0
   1763       //
   1764       //   Add:
   1765       //   Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
   1766       //   Sub:
   1767       //   Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
   1768       InstructionCost Cost = 0;
   1769       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
   1770       Cost += 3 * thisT()->getCmpSelInstrCost(
   1771                       Instruction::ICmp, SumTy, OverflowTy,
   1772                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
   1773       Cost += 2 * thisT()->getCmpSelInstrCost(
   1774                       Instruction::Select, OverflowTy, OverflowTy,
   1775                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
   1776       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
   1777                                               CostKind);
   1778       return Cost;
   1779     }
   1780     case Intrinsic::uadd_with_overflow:
   1781     case Intrinsic::usub_with_overflow: {
   1782       Type *SumTy = RetTy->getContainedType(0);
   1783       Type *OverflowTy = RetTy->getContainedType(1);
   1784       unsigned Opcode = IID == Intrinsic::uadd_with_overflow
   1785                             ? BinaryOperator::Add
   1786                             : BinaryOperator::Sub;
   1787 
   1788       InstructionCost Cost = 0;
   1789       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
   1790       Cost +=
   1791           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
   1792                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
   1793       return Cost;
   1794     }
   1795     case Intrinsic::smul_with_overflow:
   1796     case Intrinsic::umul_with_overflow: {
   1797       Type *MulTy = RetTy->getContainedType(0);
   1798       Type *OverflowTy = RetTy->getContainedType(1);
   1799       unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
   1800       Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
   1801 
   1802       unsigned ExtOp =
   1803           IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
   1804       TTI::CastContextHint CCH = TTI::CastContextHint::None;
   1805 
   1806       InstructionCost Cost = 0;
   1807       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
   1808       Cost +=
   1809           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
   1810       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
   1811                                             CCH, CostKind);
   1812       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
   1813                                               CostKind, TTI::OK_AnyValue,
   1814                                               TTI::OK_UniformConstantValue);
   1815 
   1816       if (IID == Intrinsic::smul_with_overflow)
   1817         Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
   1818                                                 CostKind, TTI::OK_AnyValue,
   1819                                                 TTI::OK_UniformConstantValue);
   1820 
   1821       Cost +=
   1822           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
   1823                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
   1824       return Cost;
   1825     }
   1826     case Intrinsic::ctpop:
   1827       ISDs.push_back(ISD::CTPOP);
   1828       // In case of legalization use TCC_Expensive. This is cheaper than a
   1829       // library call but still not a cheap instruction.
   1830       SingleCallCost = TargetTransformInfo::TCC_Expensive;
   1831       break;
   1832     case Intrinsic::ctlz:
   1833       ISDs.push_back(ISD::CTLZ);
   1834       break;
   1835     case Intrinsic::cttz:
   1836       ISDs.push_back(ISD::CTTZ);
   1837       break;
   1838     case Intrinsic::bswap:
   1839       ISDs.push_back(ISD::BSWAP);
   1840       break;
   1841     case Intrinsic::bitreverse:
   1842       ISDs.push_back(ISD::BITREVERSE);
   1843       break;
   1844     }
   1845 
   1846     const TargetLoweringBase *TLI = getTLI();
   1847     std::pair<InstructionCost, MVT> LT =
   1848         TLI->getTypeLegalizationCost(DL, RetTy);
   1849 
   1850     SmallVector<InstructionCost, 2> LegalCost;
   1851     SmallVector<InstructionCost, 2> CustomCost;
   1852     for (unsigned ISD : ISDs) {
   1853       if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
   1854         if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
   1855             TLI->isFAbsFree(LT.second)) {
   1856           return 0;
   1857         }
   1858 
   1859         // The operation is legal. Assume it costs 1.
   1860         // If the type is split to multiple registers, assume that there is some
   1861         // overhead to this.
   1862         // TODO: Once we have extract/insert subvector cost we need to use them.
   1863         if (LT.first > 1)
   1864           LegalCost.push_back(LT.first * 2);
   1865         else
   1866           LegalCost.push_back(LT.first * 1);
   1867       } else if (!TLI->isOperationExpand(ISD, LT.second)) {
   1868         // If the operation is custom lowered then assume
   1869         // that the code is twice as expensive.
   1870         CustomCost.push_back(LT.first * 2);
   1871       }
   1872     }
   1873 
   1874     auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
   1875     if (MinLegalCostI != LegalCost.end())
   1876       return *MinLegalCostI;
   1877 
   1878     auto MinCustomCostI =
   1879         std::min_element(CustomCost.begin(), CustomCost.end());
   1880     if (MinCustomCostI != CustomCost.end())
   1881       return *MinCustomCostI;
   1882 
   1883     // If we can't lower fmuladd into an FMA estimate the cost as a floating
   1884     // point mul followed by an add.
   1885     if (IID == Intrinsic::fmuladd)
   1886       return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
   1887                                              CostKind) +
   1888              thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
   1889                                              CostKind);
   1890     if (IID == Intrinsic::experimental_constrained_fmuladd) {
   1891       IntrinsicCostAttributes FMulAttrs(
   1892         Intrinsic::experimental_constrained_fmul, RetTy, Tys);
   1893       IntrinsicCostAttributes FAddAttrs(
   1894         Intrinsic::experimental_constrained_fadd, RetTy, Tys);
   1895       return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
   1896              thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
   1897     }
   1898 
   1899     // Else, assume that we need to scalarize this intrinsic. For math builtins
   1900     // this will emit a costly libcall, adding call overhead and spills. Make it
   1901     // very expensive.
   1902     if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
   1903       // Scalable vectors cannot be scalarized, so return Invalid.
   1904       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
   1905             return isa<ScalableVectorType>(Ty);
   1906           }))
   1907         return InstructionCost::getInvalid();
   1908 
   1909       InstructionCost ScalarizationCost =
   1910           SkipScalarizationCost ? ScalarizationCostPassed
   1911                                 : getScalarizationOverhead(RetVTy, true, false);
   1912 
   1913       unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
   1914       SmallVector<Type *, 4> ScalarTys;
   1915       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
   1916         Type *Ty = Tys[i];
   1917         if (Ty->isVectorTy())
   1918           Ty = Ty->getScalarType();
   1919         ScalarTys.push_back(Ty);
   1920       }
   1921       IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
   1922       InstructionCost ScalarCost =
   1923           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
   1924       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
   1925         if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
   1926           if (!ICA.skipScalarizationCost())
   1927             ScalarizationCost += getScalarizationOverhead(VTy, false, true);
   1928           ScalarCalls = std::max(ScalarCalls,
   1929                                  cast<FixedVectorType>(VTy)->getNumElements());
   1930         }
   1931       }
   1932       return ScalarCalls * ScalarCost + ScalarizationCost;
   1933     }
   1934 
   1935     // This is going to be turned into a library call, make it expensive.
   1936     return SingleCallCost;
   1937   }
   1938 
   1939   /// Compute a cost of the given call instruction.
   1940   ///
   1941   /// Compute the cost of calling function F with return type RetTy and
   1942   /// argument types Tys. F might be nullptr, in this case the cost of an
   1943   /// arbitrary call with the specified signature will be returned.
   1944   /// This is used, for instance,  when we estimate call of a vector
   1945   /// counterpart of the given function.
   1946   /// \param F Called function, might be nullptr.
   1947   /// \param RetTy Return value types.
   1948   /// \param Tys Argument types.
   1949   /// \returns The cost of Call instruction.
   1950   InstructionCost
   1951   getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
   1952                    TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
   1953     return 10;
   1954   }
   1955 
   1956   unsigned getNumberOfParts(Type *Tp) {
   1957     std::pair<InstructionCost, MVT> LT =
   1958         getTLI()->getTypeLegalizationCost(DL, Tp);
   1959     return *LT.first.getValue();
   1960   }
   1961 
   1962   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
   1963                                             const SCEV *) {
   1964     return 0;
   1965   }
   1966 
   1967   /// Try to calculate arithmetic and shuffle op costs for reduction operations.
   1968   /// We're assuming that reduction operation are performing the following way:
   1969   /// 1. Non-pairwise reduction
   1970   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
   1971   /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
   1972   ///            \----------------v-------------/  \----------v------------/
   1973   ///                            n/2 elements               n/2 elements
   1974   /// %red1 = op <n x t> %val, <n x t> val1
   1975   /// After this operation we have a vector %red1 where only the first n/2
   1976   /// elements are meaningful, the second n/2 elements are undefined and can be
   1977   /// dropped. All other operations are actually working with the vector of
   1978   /// length n/2, not n, though the real vector length is still n.
   1979   /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
   1980   /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
   1981   ///            \----------------v-------------/  \----------v------------/
   1982   ///                            n/4 elements               3*n/4 elements
   1983   /// %red2 = op <n x t> %red1, <n x t> val2  - working with the vector of
   1984   /// length n/2, the resulting vector has length n/4 etc.
   1985   /// 2. Pairwise reduction:
   1986   /// Everything is the same except for an additional shuffle operation which
   1987   /// is used to produce operands for pairwise kind of reductions.
   1988   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
   1989   /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
   1990   ///            \-------------v----------/  \----------v------------/
   1991   ///                   n/2 elements               n/2 elements
   1992   /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
   1993   /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
   1994   ///            \-------------v----------/  \----------v------------/
   1995   ///                   n/2 elements               n/2 elements
   1996   /// %red1 = op <n x t> %val1, <n x t> val2
   1997   /// Again, the operation is performed on <n x t> vector, but the resulting
   1998   /// vector %red1 is <n/2 x t> vector.
   1999   ///
   2000   /// The cost model should take into account that the actual length of the
   2001   /// vector is reduced on each iteration.
   2002   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
   2003                                              bool IsPairwise,
   2004                                              TTI::TargetCostKind CostKind) {
   2005     Type *ScalarTy = Ty->getElementType();
   2006     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
   2007     if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
   2008         ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
   2009         NumVecElts >= 2) {
   2010       // Or reduction for i1 is represented as:
   2011       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
   2012       // %res = cmp ne iReduxWidth %val, 0
   2013       // And reduction for i1 is represented as:
   2014       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
   2015       // %res = cmp eq iReduxWidth %val, 11111
   2016       Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
   2017       return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
   2018                                        TTI::CastContextHint::None, CostKind) +
   2019              thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
   2020                                          CmpInst::makeCmpResultType(ValTy),
   2021                                          CmpInst::BAD_ICMP_PREDICATE, CostKind);
   2022     }
   2023     unsigned NumReduxLevels = Log2_32(NumVecElts);
   2024     InstructionCost ArithCost = 0;
   2025     InstructionCost ShuffleCost = 0;
   2026     std::pair<InstructionCost, MVT> LT =
   2027         thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
   2028     unsigned LongVectorCount = 0;
   2029     unsigned MVTLen =
   2030         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
   2031     while (NumVecElts > MVTLen) {
   2032       NumVecElts /= 2;
   2033       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
   2034       // Assume the pairwise shuffles add a cost.
   2035       ShuffleCost += (IsPairwise + 1) *
   2036                      thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
   2037                                              NumVecElts, SubTy);
   2038       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
   2039       Ty = SubTy;
   2040       ++LongVectorCount;
   2041     }
   2042 
   2043     NumReduxLevels -= LongVectorCount;
   2044 
   2045     // The minimal length of the vector is limited by the real length of vector
   2046     // operations performed on the current platform. That's why several final
   2047     // reduction operations are performed on the vectors with the same
   2048     // architecture-dependent length.
   2049 
   2050     // Non pairwise reductions need one shuffle per reduction level. Pairwise
   2051     // reductions need two shuffles on every level, but the last one. On that
   2052     // level one of the shuffles is <0, u, u, ...> which is identity.
   2053     unsigned NumShuffles = NumReduxLevels;
   2054     if (IsPairwise && NumReduxLevels >= 1)
   2055       NumShuffles += NumReduxLevels - 1;
   2056     ShuffleCost += NumShuffles * thisT()->getShuffleCost(
   2057                                      TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
   2058     ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
   2059     return ShuffleCost + ArithCost +
   2060            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   2061   }
   2062 
   2063   /// Try to calculate op costs for min/max reduction operations.
   2064   /// \param CondTy Conditional type for the Select instruction.
   2065   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
   2066                                          bool IsPairwise, bool IsUnsigned,
   2067                                          TTI::TargetCostKind CostKind) {
   2068     Type *ScalarTy = Ty->getElementType();
   2069     Type *ScalarCondTy = CondTy->getElementType();
   2070     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
   2071     unsigned NumReduxLevels = Log2_32(NumVecElts);
   2072     unsigned CmpOpcode;
   2073     if (Ty->isFPOrFPVectorTy()) {
   2074       CmpOpcode = Instruction::FCmp;
   2075     } else {
   2076       assert(Ty->isIntOrIntVectorTy() &&
   2077              "expecting floating point or integer type for min/max reduction");
   2078       CmpOpcode = Instruction::ICmp;
   2079     }
   2080     InstructionCost MinMaxCost = 0;
   2081     InstructionCost ShuffleCost = 0;
   2082     std::pair<InstructionCost, MVT> LT =
   2083         thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
   2084     unsigned LongVectorCount = 0;
   2085     unsigned MVTLen =
   2086         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
   2087     while (NumVecElts > MVTLen) {
   2088       NumVecElts /= 2;
   2089       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
   2090       CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
   2091 
   2092       // Assume the pairwise shuffles add a cost.
   2093       ShuffleCost += (IsPairwise + 1) *
   2094                      thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
   2095                                              NumVecElts, SubTy);
   2096       MinMaxCost +=
   2097           thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
   2098                                       CmpInst::BAD_ICMP_PREDICATE, CostKind) +
   2099           thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
   2100                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
   2101       Ty = SubTy;
   2102       ++LongVectorCount;
   2103     }
   2104 
   2105     NumReduxLevels -= LongVectorCount;
   2106 
   2107     // The minimal length of the vector is limited by the real length of vector
   2108     // operations performed on the current platform. That's why several final
   2109     // reduction opertions are perfomed on the vectors with the same
   2110     // architecture-dependent length.
   2111 
   2112     // Non pairwise reductions need one shuffle per reduction level. Pairwise
   2113     // reductions need two shuffles on every level, but the last one. On that
   2114     // level one of the shuffles is <0, u, u, ...> which is identity.
   2115     unsigned NumShuffles = NumReduxLevels;
   2116     if (IsPairwise && NumReduxLevels >= 1)
   2117       NumShuffles += NumReduxLevels - 1;
   2118     ShuffleCost += NumShuffles * thisT()->getShuffleCost(
   2119                                      TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
   2120     MinMaxCost +=
   2121         NumReduxLevels *
   2122         (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
   2123                                      CmpInst::BAD_ICMP_PREDICATE, CostKind) +
   2124          thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
   2125                                      CmpInst::BAD_ICMP_PREDICATE, CostKind));
   2126     // The last min/max should be in vector registers and we counted it above.
   2127     // So just need a single extractelement.
   2128     return ShuffleCost + MinMaxCost +
   2129            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   2130   }
   2131 
   2132   InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
   2133                                               Type *ResTy, VectorType *Ty,
   2134                                               TTI::TargetCostKind CostKind) {
   2135     // Without any native support, this is equivalent to the cost of
   2136     // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext))
   2137     VectorType *ExtTy = VectorType::get(ResTy, Ty);
   2138     InstructionCost RedCost = thisT()->getArithmeticReductionCost(
   2139         Instruction::Add, ExtTy, false, CostKind);
   2140     InstructionCost MulCost = 0;
   2141     InstructionCost ExtCost = thisT()->getCastInstrCost(
   2142         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
   2143         TTI::CastContextHint::None, CostKind);
   2144     if (IsMLA) {
   2145       MulCost =
   2146           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
   2147       ExtCost *= 2;
   2148     }
   2149 
   2150     return RedCost + MulCost + ExtCost;
   2151   }
   2152 
   2153   InstructionCost getVectorSplitCost() { return 1; }
   2154 
   2155   /// @}
   2156 };
   2157 
   2158 /// Concrete BasicTTIImpl that can be used if no further customization
   2159 /// is needed.
   2160 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
   2161   using BaseT = BasicTTIImplBase<BasicTTIImpl>;
   2162 
   2163   friend class BasicTTIImplBase<BasicTTIImpl>;
   2164 
   2165   const TargetSubtargetInfo *ST;
   2166   const TargetLoweringBase *TLI;
   2167 
   2168   const TargetSubtargetInfo *getST() const { return ST; }
   2169   const TargetLoweringBase *getTLI() const { return TLI; }
   2170 
   2171 public:
   2172   explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
   2173 };
   2174 
   2175 } // end namespace llvm
   2176 
   2177 #endif // LLVM_CODEGEN_BASICTTIIMPL_H
   2178