Home | History | Annotate | Line # | Download | only in Analysis
      1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file implements inline cost analysis.
     10 //
     11 //===----------------------------------------------------------------------===//
     12 
     13 #include "llvm/Analysis/InlineCost.h"
     14 #include "llvm/ADT/STLExtras.h"
     15 #include "llvm/ADT/SetVector.h"
     16 #include "llvm/ADT/SmallPtrSet.h"
     17 #include "llvm/ADT/SmallVector.h"
     18 #include "llvm/ADT/Statistic.h"
     19 #include "llvm/Analysis/AssumptionCache.h"
     20 #include "llvm/Analysis/BlockFrequencyInfo.h"
     21 #include "llvm/Analysis/CFG.h"
     22 #include "llvm/Analysis/CodeMetrics.h"
     23 #include "llvm/Analysis/ConstantFolding.h"
     24 #include "llvm/Analysis/InstructionSimplify.h"
     25 #include "llvm/Analysis/LoopInfo.h"
     26 #include "llvm/Analysis/ProfileSummaryInfo.h"
     27 #include "llvm/Analysis/TargetLibraryInfo.h"
     28 #include "llvm/Analysis/TargetTransformInfo.h"
     29 #include "llvm/Analysis/ValueTracking.h"
     30 #include "llvm/Config/llvm-config.h"
     31 #include "llvm/IR/AssemblyAnnotationWriter.h"
     32 #include "llvm/IR/CallingConv.h"
     33 #include "llvm/IR/DataLayout.h"
     34 #include "llvm/IR/Dominators.h"
     35 #include "llvm/IR/GetElementPtrTypeIterator.h"
     36 #include "llvm/IR/GlobalAlias.h"
     37 #include "llvm/IR/InstVisitor.h"
     38 #include "llvm/IR/IntrinsicInst.h"
     39 #include "llvm/IR/Operator.h"
     40 #include "llvm/IR/PatternMatch.h"
     41 #include "llvm/Support/CommandLine.h"
     42 #include "llvm/Support/Debug.h"
     43 #include "llvm/Support/FormattedStream.h"
     44 #include "llvm/Support/raw_ostream.h"
     45 
     46 using namespace llvm;
     47 
     48 #define DEBUG_TYPE "inline-cost"
     49 
     50 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
     51 
     52 static cl::opt<int>
     53     DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
     54                      cl::ZeroOrMore,
     55                      cl::desc("Default amount of inlining to perform"));
     56 
     57 static cl::opt<bool> PrintInstructionComments(
     58     "print-instruction-comments", cl::Hidden, cl::init(false),
     59     cl::desc("Prints comments for instruction based on inline cost analysis"));
     60 
     61 static cl::opt<int> InlineThreshold(
     62     "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
     63     cl::desc("Control the amount of inlining to perform (default = 225)"));
     64 
     65 static cl::opt<int> HintThreshold(
     66     "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
     67     cl::desc("Threshold for inlining functions with inline hint"));
     68 
     69 static cl::opt<int>
     70     ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
     71                           cl::init(45), cl::ZeroOrMore,
     72                           cl::desc("Threshold for inlining cold callsites"));
     73 
     74 static cl::opt<bool> InlineEnableCostBenefitAnalysis(
     75     "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
     76     cl::desc("Enable the cost-benefit analysis for the inliner"));
     77 
     78 static cl::opt<int> InlineSavingsMultiplier(
     79     "inline-savings-multiplier", cl::Hidden, cl::init(8), cl::ZeroOrMore,
     80     cl::desc("Multiplier to multiply cycle savings by during inlining"));
     81 
     82 static cl::opt<int>
     83     InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
     84                         cl::ZeroOrMore,
     85                         cl::desc("The maximum size of a callee that get's "
     86                                  "inlined without sufficient cycle savings"));
     87 
     88 // We introduce this threshold to help performance of instrumentation based
     89 // PGO before we actually hook up inliner with analysis passes such as BPI and
     90 // BFI.
     91 static cl::opt<int> ColdThreshold(
     92     "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
     93     cl::desc("Threshold for inlining functions with cold attribute"));
     94 
     95 static cl::opt<int>
     96     HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
     97                          cl::ZeroOrMore,
     98                          cl::desc("Threshold for hot callsites "));
     99 
    100 static cl::opt<int> LocallyHotCallSiteThreshold(
    101     "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    102     cl::desc("Threshold for locally hot callsites "));
    103 
    104 static cl::opt<int> ColdCallSiteRelFreq(
    105     "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    106     cl::desc("Maximum block frequency, expressed as a percentage of caller's "
    107              "entry frequency, for a callsite to be cold in the absence of "
    108              "profile information."));
    109 
    110 static cl::opt<int> HotCallSiteRelFreq(
    111     "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    112     cl::desc("Minimum block frequency, expressed as a multiple of caller's "
    113              "entry frequency, for a callsite to be hot in the absence of "
    114              "profile information."));
    115 
    116 static cl::opt<bool> OptComputeFullInlineCost(
    117     "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
    118     cl::desc("Compute the full inline cost of a call site even when the cost "
    119              "exceeds the threshold."));
    120 
    121 static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    122     "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    123     cl::ZeroOrMore,
    124     cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
    125              "attributes."));
    126 
    127 static cl::opt<bool> DisableGEPConstOperand(
    128     "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    129     cl::desc("Disables evaluation of GetElementPtr with constant operands"));
    130 
    131 namespace {
    132 class InlineCostCallAnalyzer;
    133 
    134 // This struct is used to store information about inline cost of a
    135 // particular instruction
    136 struct InstructionCostDetail {
    137   int CostBefore = 0;
    138   int CostAfter = 0;
    139   int ThresholdBefore = 0;
    140   int ThresholdAfter = 0;
    141 
    142   int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }
    143 
    144   int getCostDelta() const { return CostAfter - CostBefore; }
    145 
    146   bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
    147 };
    148 
    149 class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
    150 private:
    151   InlineCostCallAnalyzer *const ICCA;
    152 
    153 public:
    154   InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
    155   virtual void emitInstructionAnnot(const Instruction *I,
    156                                     formatted_raw_ostream &OS) override;
    157 };
    158 
    159 /// Carry out call site analysis, in order to evaluate inlinability.
    160 /// NOTE: the type is currently used as implementation detail of functions such
    161 /// as llvm::getInlineCost. Note the function_ref constructor parameters - the
    162 /// expectation is that they come from the outer scope, from the wrapper
    163 /// functions. If we want to support constructing CallAnalyzer objects where
    164 /// lambdas are provided inline at construction, or where the object needs to
    165 /// otherwise survive past the scope of the provided functions, we need to
    166 /// revisit the argument types.
    167 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    168   typedef InstVisitor<CallAnalyzer, bool> Base;
    169   friend class InstVisitor<CallAnalyzer, bool>;
    170 
    171 protected:
    172   virtual ~CallAnalyzer() {}
    173   /// The TargetTransformInfo available for this compilation.
    174   const TargetTransformInfo &TTI;
    175 
    176   /// Getter for the cache of @llvm.assume intrinsics.
    177   function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
    178 
    179   /// Getter for BlockFrequencyInfo
    180   function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
    181 
    182   /// Profile summary information.
    183   ProfileSummaryInfo *PSI;
    184 
    185   /// The called function.
    186   Function &F;
    187 
    188   // Cache the DataLayout since we use it a lot.
    189   const DataLayout &DL;
    190 
    191   /// The OptimizationRemarkEmitter available for this compilation.
    192   OptimizationRemarkEmitter *ORE;
    193 
    194   /// The candidate callsite being analyzed. Please do not use this to do
    195   /// analysis in the caller function; we want the inline cost query to be
    196   /// easily cacheable. Instead, use the cover function paramHasAttr.
    197   CallBase &CandidateCall;
    198 
    199   /// Extension points for handling callsite features.
    200   // Called before a basic block was analyzed.
    201   virtual void onBlockStart(const BasicBlock *BB) {}
    202 
    203   /// Called after a basic block was analyzed.
    204   virtual void onBlockAnalyzed(const BasicBlock *BB) {}
    205 
    206   /// Called before an instruction was analyzed
    207   virtual void onInstructionAnalysisStart(const Instruction *I) {}
    208 
    209   /// Called after an instruction was analyzed
    210   virtual void onInstructionAnalysisFinish(const Instruction *I) {}
    211 
    212   /// Called at the end of the analysis of the callsite. Return the outcome of
    213   /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
    214   /// the reason it can't.
    215   virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
    216   /// Called when we're about to start processing a basic block, and every time
    217   /// we are done processing an instruction. Return true if there is no point in
    218   /// continuing the analysis (e.g. we've determined already the call site is
    219   /// too expensive to inline)
    220   virtual bool shouldStop() { return false; }
    221 
    222   /// Called before the analysis of the callee body starts (with callsite
    223   /// contexts propagated).  It checks callsite-specific information. Return a
    224   /// reason analysis can't continue if that's the case, or 'true' if it may
    225   /// continue.
    226   virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
    227   /// Called if the analysis engine decides SROA cannot be done for the given
    228   /// alloca.
    229   virtual void onDisableSROA(AllocaInst *Arg) {}
    230 
    231   /// Called the analysis engine determines load elimination won't happen.
    232   virtual void onDisableLoadElimination() {}
    233 
    234   /// Called to account for a call.
    235   virtual void onCallPenalty() {}
    236 
    237   /// Called to account for the expectation the inlining would result in a load
    238   /// elimination.
    239   virtual void onLoadEliminationOpportunity() {}
    240 
    241   /// Called to account for the cost of argument setup for the Call in the
    242   /// callee's body (not the callsite currently under analysis).
    243   virtual void onCallArgumentSetup(const CallBase &Call) {}
    244 
    245   /// Called to account for a load relative intrinsic.
    246   virtual void onLoadRelativeIntrinsic() {}
    247 
    248   /// Called to account for a lowered call.
    249   virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
    250   }
    251 
    252   /// Account for a jump table of given size. Return false to stop further
    253   /// processing the switch instruction
    254   virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
    255 
    256   /// Account for a case cluster of given size. Return false to stop further
    257   /// processing of the instruction.
    258   virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
    259 
    260   /// Called at the end of processing a switch instruction, with the given
    261   /// number of case clusters.
    262   virtual void onFinalizeSwitch(unsigned JumpTableSize,
    263                                 unsigned NumCaseCluster) {}
    264 
    265   /// Called to account for any other instruction not specifically accounted
    266   /// for.
    267   virtual void onMissedSimplification() {}
    268 
    269   /// Start accounting potential benefits due to SROA for the given alloca.
    270   virtual void onInitializeSROAArg(AllocaInst *Arg) {}
    271 
    272   /// Account SROA savings for the AllocaInst value.
    273   virtual void onAggregateSROAUse(AllocaInst *V) {}
    274 
    275   bool handleSROA(Value *V, bool DoNotDisable) {
    276     // Check for SROA candidates in comparisons.
    277     if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
    278       if (DoNotDisable) {
    279         onAggregateSROAUse(SROAArg);
    280         return true;
    281       }
    282       disableSROAForArg(SROAArg);
    283     }
    284     return false;
    285   }
    286 
    287   bool IsCallerRecursive = false;
    288   bool IsRecursiveCall = false;
    289   bool ExposesReturnsTwice = false;
    290   bool HasDynamicAlloca = false;
    291   bool ContainsNoDuplicateCall = false;
    292   bool HasReturn = false;
    293   bool HasIndirectBr = false;
    294   bool HasUninlineableIntrinsic = false;
    295   bool InitsVargArgs = false;
    296 
    297   /// Number of bytes allocated statically by the callee.
    298   uint64_t AllocatedSize = 0;
    299   unsigned NumInstructions = 0;
    300   unsigned NumVectorInstructions = 0;
    301 
    302   /// While we walk the potentially-inlined instructions, we build up and
    303   /// maintain a mapping of simplified values specific to this callsite. The
    304   /// idea is to propagate any special information we have about arguments to
    305   /// this call through the inlinable section of the function, and account for
    306   /// likely simplifications post-inlining. The most important aspect we track
    307   /// is CFG altering simplifications -- when we prove a basic block dead, that
    308   /// can cause dramatic shifts in the cost of inlining a function.
    309   DenseMap<Value *, Constant *> SimplifiedValues;
    310 
    311   /// Keep track of the values which map back (through function arguments) to
    312   /// allocas on the caller stack which could be simplified through SROA.
    313   DenseMap<Value *, AllocaInst *> SROAArgValues;
    314 
    315   /// Keep track of Allocas for which we believe we may get SROA optimization.
    316   DenseSet<AllocaInst *> EnabledSROAAllocas;
    317 
    318   /// Keep track of values which map to a pointer base and constant offset.
    319   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
    320 
    321   /// Keep track of dead blocks due to the constant arguments.
    322   SetVector<BasicBlock *> DeadBlocks;
    323 
    324   /// The mapping of the blocks to their known unique successors due to the
    325   /// constant arguments.
    326   DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
    327 
    328   /// Model the elimination of repeated loads that is expected to happen
    329   /// whenever we simplify away the stores that would otherwise cause them to be
    330   /// loads.
    331   bool EnableLoadElimination;
    332   SmallPtrSet<Value *, 16> LoadAddrSet;
    333 
    334   AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    335     auto It = SROAArgValues.find(V);
    336     if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
    337       return nullptr;
    338     return It->second;
    339   }
    340 
    341   // Custom simplification helper routines.
    342   bool isAllocaDerivedArg(Value *V);
    343   void disableSROAForArg(AllocaInst *SROAArg);
    344   void disableSROA(Value *V);
    345   void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
    346   void disableLoadElimination();
    347   bool isGEPFree(GetElementPtrInst &GEP);
    348   bool canFoldInboundsGEP(GetElementPtrInst &I);
    349   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
    350   bool simplifyCallSite(Function *F, CallBase &Call);
    351   template <typename Callable>
    352   bool simplifyInstruction(Instruction &I, Callable Evaluate);
    353   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
    354 
    355   /// Return true if the given argument to the function being considered for
    356   /// inlining has the given attribute set either at the call site or the
    357   /// function declaration.  Primarily used to inspect call site specific
    358   /// attributes since these can be more precise than the ones on the callee
    359   /// itself.
    360   bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
    361 
    362   /// Return true if the given value is known non null within the callee if
    363   /// inlined through this particular callsite.
    364   bool isKnownNonNullInCallee(Value *V);
    365 
    366   /// Return true if size growth is allowed when inlining the callee at \p Call.
    367   bool allowSizeGrowth(CallBase &Call);
    368 
    369   // Custom analysis routines.
    370   InlineResult analyzeBlock(BasicBlock *BB,
    371                             SmallPtrSetImpl<const Value *> &EphValues);
    372 
    373   // Disable several entry points to the visitor so we don't accidentally use
    374   // them by declaring but not defining them here.
    375   void visit(Module *);
    376   void visit(Module &);
    377   void visit(Function *);
    378   void visit(Function &);
    379   void visit(BasicBlock *);
    380   void visit(BasicBlock &);
    381 
    382   // Provide base case for our instruction visit.
    383   bool visitInstruction(Instruction &I);
    384 
    385   // Our visit overrides.
    386   bool visitAlloca(AllocaInst &I);
    387   bool visitPHI(PHINode &I);
    388   bool visitGetElementPtr(GetElementPtrInst &I);
    389   bool visitBitCast(BitCastInst &I);
    390   bool visitPtrToInt(PtrToIntInst &I);
    391   bool visitIntToPtr(IntToPtrInst &I);
    392   bool visitCastInst(CastInst &I);
    393   bool visitCmpInst(CmpInst &I);
    394   bool visitSub(BinaryOperator &I);
    395   bool visitBinaryOperator(BinaryOperator &I);
    396   bool visitFNeg(UnaryOperator &I);
    397   bool visitLoad(LoadInst &I);
    398   bool visitStore(StoreInst &I);
    399   bool visitExtractValue(ExtractValueInst &I);
    400   bool visitInsertValue(InsertValueInst &I);
    401   bool visitCallBase(CallBase &Call);
    402   bool visitReturnInst(ReturnInst &RI);
    403   bool visitBranchInst(BranchInst &BI);
    404   bool visitSelectInst(SelectInst &SI);
    405   bool visitSwitchInst(SwitchInst &SI);
    406   bool visitIndirectBrInst(IndirectBrInst &IBI);
    407   bool visitResumeInst(ResumeInst &RI);
    408   bool visitCleanupReturnInst(CleanupReturnInst &RI);
    409   bool visitCatchReturnInst(CatchReturnInst &RI);
    410   bool visitUnreachableInst(UnreachableInst &I);
    411 
    412 public:
    413   CallAnalyzer(
    414       Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
    415       function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    416       function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
    417       ProfileSummaryInfo *PSI = nullptr,
    418       OptimizationRemarkEmitter *ORE = nullptr)
    419       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
    420         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
    421         CandidateCall(Call), EnableLoadElimination(true) {}
    422 
    423   InlineResult analyze();
    424 
    425   Optional<Constant*> getSimplifiedValue(Instruction *I) {
    426     if (SimplifiedValues.find(I) != SimplifiedValues.end())
    427       return SimplifiedValues[I];
    428     return None;
    429   }
    430 
    431   // Keep a bunch of stats about the cost savings found so we can print them
    432   // out when debugging.
    433   unsigned NumConstantArgs = 0;
    434   unsigned NumConstantOffsetPtrArgs = 0;
    435   unsigned NumAllocaArgs = 0;
    436   unsigned NumConstantPtrCmps = 0;
    437   unsigned NumConstantPtrDiffs = 0;
    438   unsigned NumInstructionsSimplified = 0;
    439 
    440   void dump();
    441 };
    442 
    443 /// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
    444 /// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
    445 class InlineCostCallAnalyzer final : public CallAnalyzer {
    446   const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
    447   const bool ComputeFullInlineCost;
    448   int LoadEliminationCost = 0;
    449   /// Bonus to be applied when percentage of vector instructions in callee is
    450   /// high (see more details in updateThreshold).
    451   int VectorBonus = 0;
    452   /// Bonus to be applied when the callee has only one reachable basic block.
    453   int SingleBBBonus = 0;
    454 
    455   /// Tunable parameters that control the analysis.
    456   const InlineParams &Params;
    457 
    458   // This DenseMap stores the delta change in cost and threshold after
    459   // accounting for the given instruction. The map is filled only with the
    460   // flag PrintInstructionComments on.
    461   DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;
    462 
    463   /// Upper bound for the inlining cost. Bonuses are being applied to account
    464   /// for speculative "expected profit" of the inlining decision.
    465   int Threshold = 0;
    466 
    467   /// Attempt to evaluate indirect calls to boost its inline cost.
    468   const bool BoostIndirectCalls;
    469 
    470   /// Ignore the threshold when finalizing analysis.
    471   const bool IgnoreThreshold;
    472 
    473   // True if the cost-benefit-analysis-based inliner is enabled.
    474   const bool CostBenefitAnalysisEnabled;
    475 
    476   /// Inlining cost measured in abstract units, accounts for all the
    477   /// instructions expected to be executed for a given function invocation.
    478   /// Instructions that are statically proven to be dead based on call-site
    479   /// arguments are not counted here.
    480   int Cost = 0;
    481 
    482   // The cumulative cost at the beginning of the basic block being analyzed.  At
    483   // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
    484   // the size of that basic block.
    485   int CostAtBBStart = 0;
    486 
    487   // The static size of live but cold basic blocks.  This is "static" in the
    488   // sense that it's not weighted by profile counts at all.
    489   int ColdSize = 0;
    490 
    491   // Whether inlining is decided by cost-benefit analysis.
    492   bool DecidedByCostBenefit = false;
    493 
    494   bool SingleBB = true;
    495 
    496   unsigned SROACostSavings = 0;
    497   unsigned SROACostSavingsLost = 0;
    498 
    499   /// The mapping of caller Alloca values to their accumulated cost savings. If
    500   /// we have to disable SROA for one of the allocas, this tells us how much
    501   /// cost must be added.
    502   DenseMap<AllocaInst *, int> SROAArgCosts;
    503 
    504   /// Return true if \p Call is a cold callsite.
    505   bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
    506 
    507   /// Update Threshold based on callsite properties such as callee
    508   /// attributes and callee hotness for PGO builds. The Callee is explicitly
    509   /// passed to support analyzing indirect calls whose target is inferred by
    510   /// analysis.
    511   void updateThreshold(CallBase &Call, Function &Callee);
    512   /// Return a higher threshold if \p Call is a hot callsite.
    513   Optional<int> getHotCallSiteThreshold(CallBase &Call,
    514                                         BlockFrequencyInfo *CallerBFI);
    515 
    516   /// Handle a capped 'int' increment for Cost.
    517   void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
    518     assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
    519     Cost = (int)std::min(UpperBound, Cost + Inc);
    520   }
    521 
    522   void onDisableSROA(AllocaInst *Arg) override {
    523     auto CostIt = SROAArgCosts.find(Arg);
    524     if (CostIt == SROAArgCosts.end())
    525       return;
    526     addCost(CostIt->second);
    527     SROACostSavings -= CostIt->second;
    528     SROACostSavingsLost += CostIt->second;
    529     SROAArgCosts.erase(CostIt);
    530   }
    531 
    532   void onDisableLoadElimination() override {
    533     addCost(LoadEliminationCost);
    534     LoadEliminationCost = 0;
    535   }
    536   void onCallPenalty() override { addCost(InlineConstants::CallPenalty); }
    537   void onCallArgumentSetup(const CallBase &Call) override {
    538     // Pay the price of the argument setup. We account for the average 1
    539     // instruction per call argument setup here.
    540     addCost(Call.arg_size() * InlineConstants::InstrCost);
    541   }
    542   void onLoadRelativeIntrinsic() override {
    543     // This is normally lowered to 4 LLVM instructions.
    544     addCost(3 * InlineConstants::InstrCost);
    545   }
    546   void onLoweredCall(Function *F, CallBase &Call,
    547                      bool IsIndirectCall) override {
    548     // We account for the average 1 instruction per call argument setup here.
    549     addCost(Call.arg_size() * InlineConstants::InstrCost);
    550 
    551     // If we have a constant that we are calling as a function, we can peer
    552     // through it and see the function target. This happens not infrequently
    553     // during devirtualization and so we want to give it a hefty bonus for
    554     // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    555     // Pretend to inline the function, with a custom threshold.
    556     if (IsIndirectCall && BoostIndirectCalls) {
    557       auto IndirectCallParams = Params;
    558       IndirectCallParams.DefaultThreshold =
    559           InlineConstants::IndirectCallThreshold;
    560       /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
    561       /// to instantiate the derived class.
    562       InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
    563                                 GetAssumptionCache, GetBFI, PSI, ORE, false);
    564       if (CA.analyze().isSuccess()) {
    565         // We were able to inline the indirect call! Subtract the cost from the
    566         // threshold to get the bonus we want to apply, but don't go below zero.
    567         Cost -= std::max(0, CA.getThreshold() - CA.getCost());
    568       }
    569     } else
    570       // Otherwise simply add the cost for merely making the call.
    571       addCost(InlineConstants::CallPenalty);
    572   }
    573 
    574   void onFinalizeSwitch(unsigned JumpTableSize,
    575                         unsigned NumCaseCluster) override {
    576     // If suitable for a jump table, consider the cost for the table size and
    577     // branch to destination.
    578     // Maximum valid cost increased in this function.
    579     if (JumpTableSize) {
    580       int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
    581                        4 * InlineConstants::InstrCost;
    582 
    583       addCost(JTCost, (int64_t)CostUpperBound);
    584       return;
    585     }
    586     // Considering forming a binary search, we should find the number of nodes
    587     // which is same as the number of comparisons when lowered. For a given
    588     // number of clusters, n, we can define a recursive function, f(n), to find
    589     // the number of nodes in the tree. The recursion is :
    590     // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
    591     // and f(n) = n, when n <= 3.
    592     // This will lead a binary tree where the leaf should be either f(2) or f(3)
    593     // when n > 3.  So, the number of comparisons from leaves should be n, while
    594     // the number of non-leaf should be :
    595     //   2^(log2(n) - 1) - 1
    596     //   = 2^log2(n) * 2^-1 - 1
    597     //   = n / 2 - 1.
    598     // Considering comparisons from leaf and non-leaf nodes, we can estimate the
    599     // number of comparisons in a simple closed form :
    600     //   n + n / 2 - 1 = n * 3 / 2 - 1
    601     if (NumCaseCluster <= 3) {
    602       // Suppose a comparison includes one compare and one conditional branch.
    603       addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
    604       return;
    605     }
    606 
    607     int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
    608     int64_t SwitchCost =
    609         ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
    610 
    611     addCost(SwitchCost, (int64_t)CostUpperBound);
    612   }
    613   void onMissedSimplification() override {
    614     addCost(InlineConstants::InstrCost);
    615   }
    616 
    617   void onInitializeSROAArg(AllocaInst *Arg) override {
    618     assert(Arg != nullptr &&
    619            "Should not initialize SROA costs for null value.");
    620     SROAArgCosts[Arg] = 0;
    621   }
    622 
    623   void onAggregateSROAUse(AllocaInst *SROAArg) override {
    624     auto CostIt = SROAArgCosts.find(SROAArg);
    625     assert(CostIt != SROAArgCosts.end() &&
    626            "expected this argument to have a cost");
    627     CostIt->second += InlineConstants::InstrCost;
    628     SROACostSavings += InlineConstants::InstrCost;
    629   }
    630 
    631   void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
    632 
    633   void onBlockAnalyzed(const BasicBlock *BB) override {
    634     if (CostBenefitAnalysisEnabled) {
    635       // Keep track of the static size of live but cold basic blocks.  For now,
    636       // we define a cold basic block to be one that's never executed.
    637       assert(GetBFI && "GetBFI must be available");
    638       BlockFrequencyInfo *BFI = &(GetBFI(F));
    639       assert(BFI && "BFI must be available");
    640       auto ProfileCount = BFI->getBlockProfileCount(BB);
    641       assert(ProfileCount.hasValue());
    642       if (ProfileCount.getValue() == 0)
    643         ColdSize += Cost - CostAtBBStart;
    644     }
    645 
    646     auto *TI = BB->getTerminator();
    647     // If we had any successors at this point, than post-inlining is likely to
    648     // have them as well. Note that we assume any basic blocks which existed
    649     // due to branches or switches which folded above will also fold after
    650     // inlining.
    651     if (SingleBB && TI->getNumSuccessors() > 1) {
    652       // Take off the bonus we applied to the threshold.
    653       Threshold -= SingleBBBonus;
    654       SingleBB = false;
    655     }
    656   }
    657 
    658   void onInstructionAnalysisStart(const Instruction *I) override {
    659     // This function is called to store the initial cost of inlining before
    660     // the given instruction was assessed.
    661     if (!PrintInstructionComments)
    662       return;
    663     InstructionCostDetailMap[I].CostBefore = Cost;
    664     InstructionCostDetailMap[I].ThresholdBefore = Threshold;
    665   }
    666 
    667   void onInstructionAnalysisFinish(const Instruction *I) override {
    668     // This function is called to find new values of cost and threshold after
    669     // the instruction has been assessed.
    670     if (!PrintInstructionComments)
    671       return;
    672     InstructionCostDetailMap[I].CostAfter = Cost;
    673     InstructionCostDetailMap[I].ThresholdAfter = Threshold;
    674   }
    675 
    676   bool isCostBenefitAnalysisEnabled() {
    677     if (!PSI || !PSI->hasProfileSummary())
    678       return false;
    679 
    680     if (!GetBFI)
    681       return false;
    682 
    683     if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
    684       // Honor the explicit request from the user.
    685       if (!InlineEnableCostBenefitAnalysis)
    686         return false;
    687     } else {
    688       // Otherwise, require instrumentation profile.
    689       if (!PSI->hasInstrumentationProfile())
    690         return false;
    691     }
    692 
    693     auto *Caller = CandidateCall.getParent()->getParent();
    694     if (!Caller->getEntryCount())
    695       return false;
    696 
    697     BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    698     if (!CallerBFI)
    699       return false;
    700 
    701     // For now, limit to hot call site.
    702     if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
    703       return false;
    704 
    705     // Make sure we have a nonzero entry count.
    706     auto EntryCount = F.getEntryCount();
    707     if (!EntryCount || !EntryCount.getCount())
    708       return false;
    709 
    710     BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    711     if (!CalleeBFI)
    712       return false;
    713 
    714     return true;
    715   }
    716 
    717   // Determine whether we should inline the given call site, taking into account
    718   // both the size cost and the cycle savings.  Return None if we don't have
    719   // suficient profiling information to determine.
    720   Optional<bool> costBenefitAnalysis() {
    721     if (!CostBenefitAnalysisEnabled)
    722       return None;
    723 
    724     // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    725     // for the prelink phase of the AutoFDO + ThinLTO build.  Honor the logic by
    726     // falling back to the cost-based metric.
    727     // TODO: Improve this hacky condition.
    728     if (Threshold == 0)
    729       return None;
    730 
    731     assert(GetBFI);
    732     BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    733     assert(CalleeBFI);
    734 
    735     // The cycle savings expressed as the sum of InlineConstants::InstrCost
    736     // multiplied by the estimated dynamic count of each instruction we can
    737     // avoid.  Savings come from the call site cost, such as argument setup and
    738     // the call instruction, as well as the instructions that are folded.
    739     //
    740     // We use 128-bit APInt here to avoid potential overflow.  This variable
    741     // should stay well below 10^^24 (or 2^^80) in practice.  This "worst" case
    742     // assumes that we can avoid or fold a billion instructions, each with a
    743     // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
    744     // period on a 4GHz machine.
    745     APInt CycleSavings(128, 0);
    746 
    747     for (auto &BB : F) {
    748       APInt CurrentSavings(128, 0);
    749       for (auto &I : BB) {
    750         if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
    751           // Count a conditional branch as savings if it becomes unconditional.
    752           if (BI->isConditional() &&
    753               dyn_cast_or_null<ConstantInt>(
    754                   SimplifiedValues.lookup(BI->getCondition()))) {
    755             CurrentSavings += InlineConstants::InstrCost;
    756           }
    757         } else if (Value *V = dyn_cast<Value>(&I)) {
    758           // Count an instruction as savings if we can fold it.
    759           if (SimplifiedValues.count(V)) {
    760             CurrentSavings += InlineConstants::InstrCost;
    761           }
    762         }
    763       }
    764 
    765       auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
    766       assert(ProfileCount.hasValue());
    767       CurrentSavings *= ProfileCount.getValue();
    768       CycleSavings += CurrentSavings;
    769     }
    770 
    771     // Compute the cycle savings per call.
    772     auto EntryProfileCount = F.getEntryCount();
    773     assert(EntryProfileCount.hasValue() && EntryProfileCount.getCount());
    774     auto EntryCount = EntryProfileCount.getCount();
    775     CycleSavings += EntryCount / 2;
    776     CycleSavings = CycleSavings.udiv(EntryCount);
    777 
    778     // Compute the total savings for the call site.
    779     auto *CallerBB = CandidateCall.getParent();
    780     BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    781     CycleSavings += getCallsiteCost(this->CandidateCall, DL);
    782     CycleSavings *= CallerBFI->getBlockProfileCount(CallerBB).getValue();
    783 
    784     // Remove the cost of the cold basic blocks.
    785     int Size = Cost - ColdSize;
    786 
    787     // Allow tiny callees to be inlined regardless of whether they meet the
    788     // savings threshold.
    789     Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
    790 
    791     // Return true if the savings justify the cost of inlining.  Specifically,
    792     // we evaluate the following inequality:
    793     //
    794     //  CycleSavings      PSI->getOrCompHotCountThreshold()
    795     // -------------- >= -----------------------------------
    796     //       Size              InlineSavingsMultiplier
    797     //
    798     // Note that the left hand side is specific to a call site.  The right hand
    799     // side is a constant for the entire executable.
    800     APInt LHS = CycleSavings;
    801     LHS *= InlineSavingsMultiplier;
    802     APInt RHS(128, PSI->getOrCompHotCountThreshold());
    803     RHS *= Size;
    804     return LHS.uge(RHS);
    805   }
    806 
    807   InlineResult finalizeAnalysis() override {
    808     // Loops generally act a lot like calls in that they act like barriers to
    809     // movement, require a certain amount of setup, etc. So when optimising for
    810     // size, we penalise any call sites that perform loops. We do this after all
    811     // other costs here, so will likely only be dealing with relatively small
    812     // functions (and hence DT and LI will hopefully be cheap).
    813     auto *Caller = CandidateCall.getFunction();
    814     if (Caller->hasMinSize()) {
    815       DominatorTree DT(F);
    816       LoopInfo LI(DT);
    817       int NumLoops = 0;
    818       for (Loop *L : LI) {
    819         // Ignore loops that will not be executed
    820         if (DeadBlocks.count(L->getHeader()))
    821           continue;
    822         NumLoops++;
    823       }
    824       addCost(NumLoops * InlineConstants::CallPenalty);
    825     }
    826 
    827     // We applied the maximum possible vector bonus at the beginning. Now,
    828     // subtract the excess bonus, if any, from the Threshold before
    829     // comparing against Cost.
    830     if (NumVectorInstructions <= NumInstructions / 10)
    831       Threshold -= VectorBonus;
    832     else if (NumVectorInstructions <= NumInstructions / 2)
    833       Threshold -= VectorBonus / 2;
    834 
    835     if (auto Result = costBenefitAnalysis()) {
    836       DecidedByCostBenefit = true;
    837       if (Result.getValue())
    838         return InlineResult::success();
    839       else
    840         return InlineResult::failure("Cost over threshold.");
    841     }
    842 
    843     if (IgnoreThreshold || Cost < std::max(1, Threshold))
    844       return InlineResult::success();
    845     return InlineResult::failure("Cost over threshold.");
    846   }
    847   bool shouldStop() override {
    848     // Bail out the moment we cross the threshold. This means we'll under-count
    849     // the cost, but only when undercounting doesn't matter.
    850     return !IgnoreThreshold && Cost >= Threshold && !ComputeFullInlineCost;
    851   }
    852 
    853   void onLoadEliminationOpportunity() override {
    854     LoadEliminationCost += InlineConstants::InstrCost;
    855   }
    856 
    857   InlineResult onAnalysisStart() override {
    858     // Perform some tweaks to the cost and threshold based on the direct
    859     // callsite information.
    860 
    861     // We want to more aggressively inline vector-dense kernels, so up the
    862     // threshold, and we'll lower it if the % of vector instructions gets too
    863     // low. Note that these bonuses are some what arbitrary and evolved over
    864     // time by accident as much as because they are principled bonuses.
    865     //
    866     // FIXME: It would be nice to remove all such bonuses. At least it would be
    867     // nice to base the bonus values on something more scientific.
    868     assert(NumInstructions == 0);
    869     assert(NumVectorInstructions == 0);
    870 
    871     // Update the threshold based on callsite properties
    872     updateThreshold(CandidateCall, F);
    873 
    874     // While Threshold depends on commandline options that can take negative
    875     // values, we want to enforce the invariant that the computed threshold and
    876     // bonuses are non-negative.
    877     assert(Threshold >= 0);
    878     assert(SingleBBBonus >= 0);
    879     assert(VectorBonus >= 0);
    880 
    881     // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    882     // this Threshold any time, and cost cannot decrease, we can stop processing
    883     // the rest of the function body.
    884     Threshold += (SingleBBBonus + VectorBonus);
    885 
    886     // Give out bonuses for the callsite, as the instructions setting them up
    887     // will be gone after inlining.
    888     addCost(-getCallsiteCost(this->CandidateCall, DL));
    889 
    890     // If this function uses the coldcc calling convention, prefer not to inline
    891     // it.
    892     if (F.getCallingConv() == CallingConv::Cold)
    893       Cost += InlineConstants::ColdccPenalty;
    894 
    895     // Check if we're done. This can happen due to bonuses and penalties.
    896     if (Cost >= Threshold && !ComputeFullInlineCost)
    897       return InlineResult::failure("high cost");
    898 
    899     return InlineResult::success();
    900   }
    901 
    902 public:
    903   InlineCostCallAnalyzer(
    904       Function &Callee, CallBase &Call, const InlineParams &Params,
    905       const TargetTransformInfo &TTI,
    906       function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    907       function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
    908       ProfileSummaryInfo *PSI = nullptr,
    909       OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
    910       bool IgnoreThreshold = false)
    911       : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
    912         ComputeFullInlineCost(OptComputeFullInlineCost ||
    913                               Params.ComputeFullInlineCost || ORE ||
    914                               isCostBenefitAnalysisEnabled()),
    915         Params(Params), Threshold(Params.DefaultThreshold),
    916         BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
    917         CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
    918         Writer(this) {}
    919 
    920   /// Annotation Writer for instruction details
    921   InlineCostAnnotationWriter Writer;
    922 
    923   void dump();
    924 
    925   // Prints the same analysis as dump(), but its definition is not dependent
    926   // on the build.
    927   void print();
    928 
    929   Optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
    930     if (InstructionCostDetailMap.find(I) != InstructionCostDetailMap.end())
    931       return InstructionCostDetailMap[I];
    932     return None;
    933   }
    934 
    935   virtual ~InlineCostCallAnalyzer() {}
    936   int getThreshold() { return Threshold; }
    937   int getCost() { return Cost; }
    938   bool wasDecidedByCostBenefit() { return DecidedByCostBenefit; }
    939 };
    940 } // namespace
    941 
    942 /// Test whether the given value is an Alloca-derived function argument.
    943 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
    944   return SROAArgValues.count(V);
    945 }
    946 
    947 void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
    948   onDisableSROA(SROAArg);
    949   EnabledSROAAllocas.erase(SROAArg);
    950   disableLoadElimination();
    951 }
    952 
    953 void InlineCostAnnotationWriter::emitInstructionAnnot(const Instruction *I,
    954                                                 formatted_raw_ostream &OS) {
    955   // The cost of inlining of the given instruction is printed always.
    956   // The threshold delta is printed only when it is non-zero. It happens
    957   // when we decided to give a bonus at a particular instruction.
    958   Optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
    959   if (!Record)
    960     OS << "; No analysis for the instruction";
    961   else {
    962     OS << "; cost before = " << Record->CostBefore
    963        << ", cost after = " << Record->CostAfter
    964        << ", threshold before = " << Record->ThresholdBefore
    965        << ", threshold after = " << Record->ThresholdAfter << ", ";
    966     OS << "cost delta = " << Record->getCostDelta();
    967     if (Record->hasThresholdChanged())
    968       OS << ", threshold delta = " << Record->getThresholdDelta();
    969   }
    970   auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
    971   if (C) {
    972     OS << ", simplified to ";
    973     C.getValue()->print(OS, true);
    974   }
    975   OS << "\n";
    976 }
    977 
    978 /// If 'V' maps to a SROA candidate, disable SROA for it.
    979 void CallAnalyzer::disableSROA(Value *V) {
    980   if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
    981     disableSROAForArg(SROAArg);
    982   }
    983 }
    984 
    985 void CallAnalyzer::disableLoadElimination() {
    986   if (EnableLoadElimination) {
    987     onDisableLoadElimination();
    988     EnableLoadElimination = false;
    989   }
    990 }
    991 
    992 /// Accumulate a constant GEP offset into an APInt if possible.
    993 ///
    994 /// Returns false if unable to compute the offset for any reason. Respects any
    995 /// simplified values known during the analysis of this callsite.
    996 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
    997   unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
    998   assert(IntPtrWidth == Offset.getBitWidth());
    999 
   1000   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
   1001        GTI != GTE; ++GTI) {
   1002     ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
   1003     if (!OpC)
   1004       if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
   1005         OpC = dyn_cast<ConstantInt>(SimpleOp);
   1006     if (!OpC)
   1007       return false;
   1008     if (OpC->isZero())
   1009       continue;
   1010 
   1011     // Handle a struct index, which adds its field offset to the pointer.
   1012     if (StructType *STy = GTI.getStructTypeOrNull()) {
   1013       unsigned ElementIdx = OpC->getZExtValue();
   1014       const StructLayout *SL = DL.getStructLayout(STy);
   1015       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
   1016       continue;
   1017     }
   1018 
   1019     APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
   1020     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
   1021   }
   1022   return true;
   1023 }
   1024 
   1025 /// Use TTI to check whether a GEP is free.
   1026 ///
   1027 /// Respects any simplified values known during the analysis of this callsite.
   1028 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
   1029   SmallVector<Value *, 4> Operands;
   1030   Operands.push_back(GEP.getOperand(0));
   1031   for (const Use &Op : GEP.indices())
   1032     if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
   1033       Operands.push_back(SimpleOp);
   1034     else
   1035       Operands.push_back(Op);
   1036   return TTI.getUserCost(&GEP, Operands,
   1037                          TargetTransformInfo::TCK_SizeAndLatency) ==
   1038          TargetTransformInfo::TCC_Free;
   1039 }
   1040 
   1041 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
   1042   disableSROA(I.getOperand(0));
   1043 
   1044   // Check whether inlining will turn a dynamic alloca into a static
   1045   // alloca and handle that case.
   1046   if (I.isArrayAllocation()) {
   1047     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
   1048     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
   1049       // Sometimes a dynamic alloca could be converted into a static alloca
   1050       // after this constant prop, and become a huge static alloca on an
   1051       // unconditional CFG path. Avoid inlining if this is going to happen above
   1052       // a threshold.
   1053       // FIXME: If the threshold is removed or lowered too much, we could end up
   1054       // being too pessimistic and prevent inlining non-problematic code. This
   1055       // could result in unintended perf regressions. A better overall strategy
   1056       // is needed to track stack usage during inlining.
   1057       Type *Ty = I.getAllocatedType();
   1058       AllocatedSize = SaturatingMultiplyAdd(
   1059           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty).getKnownMinSize(),
   1060           AllocatedSize);
   1061       if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
   1062         HasDynamicAlloca = true;
   1063       return false;
   1064     }
   1065   }
   1066 
   1067   // Accumulate the allocated size.
   1068   if (I.isStaticAlloca()) {
   1069     Type *Ty = I.getAllocatedType();
   1070     AllocatedSize =
   1071         SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinSize(), AllocatedSize);
   1072   }
   1073 
   1074   // FIXME: This is overly conservative. Dynamic allocas are inefficient for
   1075   // a variety of reasons, and so we would like to not inline them into
   1076   // functions which don't currently have a dynamic alloca. This simply
   1077   // disables inlining altogether in the presence of a dynamic alloca.
   1078   if (!I.isStaticAlloca())
   1079     HasDynamicAlloca = true;
   1080 
   1081   return false;
   1082 }
   1083 
   1084 bool CallAnalyzer::visitPHI(PHINode &I) {
   1085   // FIXME: We need to propagate SROA *disabling* through phi nodes, even
   1086   // though we don't want to propagate it's bonuses. The idea is to disable
   1087   // SROA if it *might* be used in an inappropriate manner.
   1088 
   1089   // Phi nodes are always zero-cost.
   1090   // FIXME: Pointer sizes may differ between different address spaces, so do we
   1091   // need to use correct address space in the call to getPointerSizeInBits here?
   1092   // Or could we skip the getPointerSizeInBits call completely? As far as I can
   1093   // see the ZeroOffset is used as a dummy value, so we can probably use any
   1094   // bit width for the ZeroOffset?
   1095   APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
   1096   bool CheckSROA = I.getType()->isPointerTy();
   1097 
   1098   // Track the constant or pointer with constant offset we've seen so far.
   1099   Constant *FirstC = nullptr;
   1100   std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
   1101   Value *FirstV = nullptr;
   1102 
   1103   for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
   1104     BasicBlock *Pred = I.getIncomingBlock(i);
   1105     // If the incoming block is dead, skip the incoming block.
   1106     if (DeadBlocks.count(Pred))
   1107       continue;
   1108     // If the parent block of phi is not the known successor of the incoming
   1109     // block, skip the incoming block.
   1110     BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
   1111     if (KnownSuccessor && KnownSuccessor != I.getParent())
   1112       continue;
   1113 
   1114     Value *V = I.getIncomingValue(i);
   1115     // If the incoming value is this phi itself, skip the incoming value.
   1116     if (&I == V)
   1117       continue;
   1118 
   1119     Constant *C = dyn_cast<Constant>(V);
   1120     if (!C)
   1121       C = SimplifiedValues.lookup(V);
   1122 
   1123     std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
   1124     if (!C && CheckSROA)
   1125       BaseAndOffset = ConstantOffsetPtrs.lookup(V);
   1126 
   1127     if (!C && !BaseAndOffset.first)
   1128       // The incoming value is neither a constant nor a pointer with constant
   1129       // offset, exit early.
   1130       return true;
   1131 
   1132     if (FirstC) {
   1133       if (FirstC == C)
   1134         // If we've seen a constant incoming value before and it is the same
   1135         // constant we see this time, continue checking the next incoming value.
   1136         continue;
   1137       // Otherwise early exit because we either see a different constant or saw
   1138       // a constant before but we have a pointer with constant offset this time.
   1139       return true;
   1140     }
   1141 
   1142     if (FirstV) {
   1143       // The same logic as above, but check pointer with constant offset here.
   1144       if (FirstBaseAndOffset == BaseAndOffset)
   1145         continue;
   1146       return true;
   1147     }
   1148 
   1149     if (C) {
   1150       // This is the 1st time we've seen a constant, record it.
   1151       FirstC = C;
   1152       continue;
   1153     }
   1154 
   1155     // The remaining case is that this is the 1st time we've seen a pointer with
   1156     // constant offset, record it.
   1157     FirstV = V;
   1158     FirstBaseAndOffset = BaseAndOffset;
   1159   }
   1160 
   1161   // Check if we can map phi to a constant.
   1162   if (FirstC) {
   1163     SimplifiedValues[&I] = FirstC;
   1164     return true;
   1165   }
   1166 
   1167   // Check if we can map phi to a pointer with constant offset.
   1168   if (FirstBaseAndOffset.first) {
   1169     ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
   1170 
   1171     if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
   1172       SROAArgValues[&I] = SROAArg;
   1173   }
   1174 
   1175   return true;
   1176 }
   1177 
   1178 /// Check we can fold GEPs of constant-offset call site argument pointers.
   1179 /// This requires target data and inbounds GEPs.
   1180 ///
   1181 /// \return true if the specified GEP can be folded.
   1182 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
   1183   // Check if we have a base + offset for the pointer.
   1184   std::pair<Value *, APInt> BaseAndOffset =
   1185       ConstantOffsetPtrs.lookup(I.getPointerOperand());
   1186   if (!BaseAndOffset.first)
   1187     return false;
   1188 
   1189   // Check if the offset of this GEP is constant, and if so accumulate it
   1190   // into Offset.
   1191   if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
   1192     return false;
   1193 
   1194   // Add the result as a new mapping to Base + Offset.
   1195   ConstantOffsetPtrs[&I] = BaseAndOffset;
   1196 
   1197   return true;
   1198 }
   1199 
   1200 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
   1201   auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
   1202 
   1203   // Lambda to check whether a GEP's indices are all constant.
   1204   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
   1205     for (const Use &Op : GEP.indices())
   1206       if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
   1207         return false;
   1208     return true;
   1209   };
   1210 
   1211   if (!DisableGEPConstOperand)
   1212     if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
   1213         SmallVector<Constant *, 2> Indices;
   1214         for (unsigned int Index = 1 ; Index < COps.size() ; ++Index)
   1215             Indices.push_back(COps[Index]);
   1216         return ConstantExpr::getGetElementPtr(I.getSourceElementType(), COps[0],
   1217                                               Indices, I.isInBounds());
   1218         }))
   1219       return true;
   1220 
   1221   if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
   1222     if (SROAArg)
   1223       SROAArgValues[&I] = SROAArg;
   1224 
   1225     // Constant GEPs are modeled as free.
   1226     return true;
   1227   }
   1228 
   1229   // Variable GEPs will require math and will disable SROA.
   1230   if (SROAArg)
   1231     disableSROAForArg(SROAArg);
   1232   return isGEPFree(I);
   1233 }
   1234 
   1235 /// Simplify \p I if its operands are constants and update SimplifiedValues.
   1236 /// \p Evaluate is a callable specific to instruction type that evaluates the
   1237 /// instruction when all the operands are constants.
   1238 template <typename Callable>
   1239 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
   1240   SmallVector<Constant *, 2> COps;
   1241   for (Value *Op : I.operands()) {
   1242     Constant *COp = dyn_cast<Constant>(Op);
   1243     if (!COp)
   1244       COp = SimplifiedValues.lookup(Op);
   1245     if (!COp)
   1246       return false;
   1247     COps.push_back(COp);
   1248   }
   1249   auto *C = Evaluate(COps);
   1250   if (!C)
   1251     return false;
   1252   SimplifiedValues[&I] = C;
   1253   return true;
   1254 }
   1255 
   1256 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
   1257   // Propagate constants through bitcasts.
   1258   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
   1259         return ConstantExpr::getBitCast(COps[0], I.getType());
   1260       }))
   1261     return true;
   1262 
   1263   // Track base/offsets through casts
   1264   std::pair<Value *, APInt> BaseAndOffset =
   1265       ConstantOffsetPtrs.lookup(I.getOperand(0));
   1266   // Casts don't change the offset, just wrap it up.
   1267   if (BaseAndOffset.first)
   1268     ConstantOffsetPtrs[&I] = BaseAndOffset;
   1269 
   1270   // Also look for SROA candidates here.
   1271   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
   1272     SROAArgValues[&I] = SROAArg;
   1273 
   1274   // Bitcasts are always zero cost.
   1275   return true;
   1276 }
   1277 
   1278 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
   1279   // Propagate constants through ptrtoint.
   1280   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
   1281         return ConstantExpr::getPtrToInt(COps[0], I.getType());
   1282       }))
   1283     return true;
   1284 
   1285   // Track base/offset pairs when converted to a plain integer provided the
   1286   // integer is large enough to represent the pointer.
   1287   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
   1288   unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
   1289   if (IntegerSize == DL.getPointerSizeInBits(AS)) {
   1290     std::pair<Value *, APInt> BaseAndOffset =
   1291         ConstantOffsetPtrs.lookup(I.getOperand(0));
   1292     if (BaseAndOffset.first)
   1293       ConstantOffsetPtrs[&I] = BaseAndOffset;
   1294   }
   1295 
   1296   // This is really weird. Technically, ptrtoint will disable SROA. However,
   1297   // unless that ptrtoint is *used* somewhere in the live basic blocks after
   1298   // inlining, it will be nuked, and SROA should proceed. All of the uses which
   1299   // would block SROA would also block SROA if applied directly to a pointer,
   1300   // and so we can just add the integer in here. The only places where SROA is
   1301   // preserved either cannot fire on an integer, or won't in-and-of themselves
   1302   // disable SROA (ext) w/o some later use that we would see and disable.
   1303   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
   1304     SROAArgValues[&I] = SROAArg;
   1305 
   1306   return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
   1307          TargetTransformInfo::TCC_Free;
   1308 }
   1309 
   1310 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
   1311   // Propagate constants through ptrtoint.
   1312   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
   1313         return ConstantExpr::getIntToPtr(COps[0], I.getType());
   1314       }))
   1315     return true;
   1316 
   1317   // Track base/offset pairs when round-tripped through a pointer without
   1318   // modifications provided the integer is not too large.
   1319   Value *Op = I.getOperand(0);
   1320   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
   1321   if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
   1322     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
   1323     if (BaseAndOffset.first)
   1324       ConstantOffsetPtrs[&I] = BaseAndOffset;
   1325   }
   1326 
   1327   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
   1328   if (auto *SROAArg = getSROAArgForValueOrNull(Op))
   1329     SROAArgValues[&I] = SROAArg;
   1330 
   1331   return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
   1332          TargetTransformInfo::TCC_Free;
   1333 }
   1334 
   1335 bool CallAnalyzer::visitCastInst(CastInst &I) {
   1336   // Propagate constants through casts.
   1337   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
   1338         return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
   1339       }))
   1340     return true;
   1341 
   1342   // Disable SROA in the face of arbitrary casts we don't explicitly list
   1343   // elsewhere.
   1344   disableSROA(I.getOperand(0));
   1345 
   1346   // If this is a floating-point cast, and the target says this operation
   1347   // is expensive, this may eventually become a library call. Treat the cost
   1348   // as such.
   1349   switch (I.getOpcode()) {
   1350   case Instruction::FPTrunc:
   1351   case Instruction::FPExt:
   1352   case Instruction::UIToFP:
   1353   case Instruction::SIToFP:
   1354   case Instruction::FPToUI:
   1355   case Instruction::FPToSI:
   1356     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
   1357       onCallPenalty();
   1358     break;
   1359   default:
   1360     break;
   1361   }
   1362 
   1363   return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
   1364          TargetTransformInfo::TCC_Free;
   1365 }
   1366 
   1367 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
   1368   return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
   1369 }
   1370 
   1371 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
   1372   // Does the *call site* have the NonNull attribute set on an argument?  We
   1373   // use the attribute on the call site to memoize any analysis done in the
   1374   // caller. This will also trip if the callee function has a non-null
   1375   // parameter attribute, but that's a less interesting case because hopefully
   1376   // the callee would already have been simplified based on that.
   1377   if (Argument *A = dyn_cast<Argument>(V))
   1378     if (paramHasAttr(A, Attribute::NonNull))
   1379       return true;
   1380 
   1381   // Is this an alloca in the caller?  This is distinct from the attribute case
   1382   // above because attributes aren't updated within the inliner itself and we
   1383   // always want to catch the alloca derived case.
   1384   if (isAllocaDerivedArg(V))
   1385     // We can actually predict the result of comparisons between an
   1386     // alloca-derived value and null. Note that this fires regardless of
   1387     // SROA firing.
   1388     return true;
   1389 
   1390   return false;
   1391 }
   1392 
   1393 bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
   1394   // If the normal destination of the invoke or the parent block of the call
   1395   // site is unreachable-terminated, there is little point in inlining this
   1396   // unless there is literally zero cost.
   1397   // FIXME: Note that it is possible that an unreachable-terminated block has a
   1398   // hot entry. For example, in below scenario inlining hot_call_X() may be
   1399   // beneficial :
   1400   // main() {
   1401   //   hot_call_1();
   1402   //   ...
   1403   //   hot_call_N()
   1404   //   exit(0);
   1405   // }
   1406   // For now, we are not handling this corner case here as it is rare in real
   1407   // code. In future, we should elaborate this based on BPI and BFI in more
   1408   // general threshold adjusting heuristics in updateThreshold().
   1409   if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
   1410     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
   1411       return false;
   1412   } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
   1413     return false;
   1414 
   1415   return true;
   1416 }
   1417 
   1418 bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
   1419                                             BlockFrequencyInfo *CallerBFI) {
   1420   // If global profile summary is available, then callsite's coldness is
   1421   // determined based on that.
   1422   if (PSI && PSI->hasProfileSummary())
   1423     return PSI->isColdCallSite(Call, CallerBFI);
   1424 
   1425   // Otherwise we need BFI to be available.
   1426   if (!CallerBFI)
   1427     return false;
   1428 
   1429   // Determine if the callsite is cold relative to caller's entry. We could
   1430   // potentially cache the computation of scaled entry frequency, but the added
   1431   // complexity is not worth it unless this scaling shows up high in the
   1432   // profiles.
   1433   const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
   1434   auto CallSiteBB = Call.getParent();
   1435   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
   1436   auto CallerEntryFreq =
   1437       CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
   1438   return CallSiteFreq < CallerEntryFreq * ColdProb;
   1439 }
   1440 
   1441 Optional<int>
   1442 InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
   1443                                                 BlockFrequencyInfo *CallerBFI) {
   1444 
   1445   // If global profile summary is available, then callsite's hotness is
   1446   // determined based on that.
   1447   if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
   1448     return Params.HotCallSiteThreshold;
   1449 
   1450   // Otherwise we need BFI to be available and to have a locally hot callsite
   1451   // threshold.
   1452   if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
   1453     return None;
   1454 
   1455   // Determine if the callsite is hot relative to caller's entry. We could
   1456   // potentially cache the computation of scaled entry frequency, but the added
   1457   // complexity is not worth it unless this scaling shows up high in the
   1458   // profiles.
   1459   auto CallSiteBB = Call.getParent();
   1460   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
   1461   auto CallerEntryFreq = CallerBFI->getEntryFreq();
   1462   if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
   1463     return Params.LocallyHotCallSiteThreshold;
   1464 
   1465   // Otherwise treat it normally.
   1466   return None;
   1467 }
   1468 
   1469 void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
   1470   // If no size growth is allowed for this inlining, set Threshold to 0.
   1471   if (!allowSizeGrowth(Call)) {
   1472     Threshold = 0;
   1473     return;
   1474   }
   1475 
   1476   Function *Caller = Call.getCaller();
   1477 
   1478   // return min(A, B) if B is valid.
   1479   auto MinIfValid = [](int A, Optional<int> B) {
   1480     return B ? std::min(A, B.getValue()) : A;
   1481   };
   1482 
   1483   // return max(A, B) if B is valid.
   1484   auto MaxIfValid = [](int A, Optional<int> B) {
   1485     return B ? std::max(A, B.getValue()) : A;
   1486   };
   1487 
   1488   // Various bonus percentages. These are multiplied by Threshold to get the
   1489   // bonus values.
   1490   // SingleBBBonus: This bonus is applied if the callee has a single reachable
   1491   // basic block at the given callsite context. This is speculatively applied
   1492   // and withdrawn if more than one basic block is seen.
   1493   //
   1494   // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
   1495   // of the last call to a static function as inlining such functions is
   1496   // guaranteed to reduce code size.
   1497   //
   1498   // These bonus percentages may be set to 0 based on properties of the caller
   1499   // and the callsite.
   1500   int SingleBBBonusPercent = 50;
   1501   int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
   1502   int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
   1503 
   1504   // Lambda to set all the above bonus and bonus percentages to 0.
   1505   auto DisallowAllBonuses = [&]() {
   1506     SingleBBBonusPercent = 0;
   1507     VectorBonusPercent = 0;
   1508     LastCallToStaticBonus = 0;
   1509   };
   1510 
   1511   // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
   1512   // and reduce the threshold if the caller has the necessary attribute.
   1513   if (Caller->hasMinSize()) {
   1514     Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
   1515     // For minsize, we want to disable the single BB bonus and the vector
   1516     // bonuses, but not the last-call-to-static bonus. Inlining the last call to
   1517     // a static function will, at the minimum, eliminate the parameter setup and
   1518     // call/return instructions.
   1519     SingleBBBonusPercent = 0;
   1520     VectorBonusPercent = 0;
   1521   } else if (Caller->hasOptSize())
   1522     Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
   1523 
   1524   // Adjust the threshold based on inlinehint attribute and profile based
   1525   // hotness information if the caller does not have MinSize attribute.
   1526   if (!Caller->hasMinSize()) {
   1527     if (Callee.hasFnAttribute(Attribute::InlineHint))
   1528       Threshold = MaxIfValid(Threshold, Params.HintThreshold);
   1529 
   1530     // FIXME: After switching to the new passmanager, simplify the logic below
   1531     // by checking only the callsite hotness/coldness as we will reliably
   1532     // have local profile information.
   1533     //
   1534     // Callsite hotness and coldness can be determined if sample profile is
   1535     // used (which adds hotness metadata to calls) or if caller's
   1536     // BlockFrequencyInfo is available.
   1537     BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
   1538     auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
   1539     if (!Caller->hasOptSize() && HotCallSiteThreshold) {
   1540       LLVM_DEBUG(dbgs() << "Hot callsite.\n");
   1541       // FIXME: This should update the threshold only if it exceeds the
   1542       // current threshold, but AutoFDO + ThinLTO currently relies on this
   1543       // behavior to prevent inlining of hot callsites during ThinLTO
   1544       // compile phase.
   1545       Threshold = HotCallSiteThreshold.getValue();
   1546     } else if (isColdCallSite(Call, CallerBFI)) {
   1547       LLVM_DEBUG(dbgs() << "Cold callsite.\n");
   1548       // Do not apply bonuses for a cold callsite including the
   1549       // LastCallToStatic bonus. While this bonus might result in code size
   1550       // reduction, it can cause the size of a non-cold caller to increase
   1551       // preventing it from being inlined.
   1552       DisallowAllBonuses();
   1553       Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
   1554     } else if (PSI) {
   1555       // Use callee's global profile information only if we have no way of
   1556       // determining this via callsite information.
   1557       if (PSI->isFunctionEntryHot(&Callee)) {
   1558         LLVM_DEBUG(dbgs() << "Hot callee.\n");
   1559         // If callsite hotness can not be determined, we may still know
   1560         // that the callee is hot and treat it as a weaker hint for threshold
   1561         // increase.
   1562         Threshold = MaxIfValid(Threshold, Params.HintThreshold);
   1563       } else if (PSI->isFunctionEntryCold(&Callee)) {
   1564         LLVM_DEBUG(dbgs() << "Cold callee.\n");
   1565         // Do not apply bonuses for a cold callee including the
   1566         // LastCallToStatic bonus. While this bonus might result in code size
   1567         // reduction, it can cause the size of a non-cold caller to increase
   1568         // preventing it from being inlined.
   1569         DisallowAllBonuses();
   1570         Threshold = MinIfValid(Threshold, Params.ColdThreshold);
   1571       }
   1572     }
   1573   }
   1574 
   1575   Threshold += TTI.adjustInliningThreshold(&Call);
   1576 
   1577   // Finally, take the target-specific inlining threshold multiplier into
   1578   // account.
   1579   Threshold *= TTI.getInliningThresholdMultiplier();
   1580 
   1581   SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
   1582   VectorBonus = Threshold * VectorBonusPercent / 100;
   1583 
   1584   bool OnlyOneCallAndLocalLinkage =
   1585       F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
   1586   // If there is only one call of the function, and it has internal linkage,
   1587   // the cost of inlining it drops dramatically. It may seem odd to update
   1588   // Cost in updateThreshold, but the bonus depends on the logic in this method.
   1589   if (OnlyOneCallAndLocalLinkage)
   1590     Cost -= LastCallToStaticBonus;
   1591 }
   1592 
   1593 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
   1594   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
   1595   // First try to handle simplified comparisons.
   1596   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
   1597         return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
   1598       }))
   1599     return true;
   1600 
   1601   if (I.getOpcode() == Instruction::FCmp)
   1602     return false;
   1603 
   1604   // Otherwise look for a comparison between constant offset pointers with
   1605   // a common base.
   1606   Value *LHSBase, *RHSBase;
   1607   APInt LHSOffset, RHSOffset;
   1608   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
   1609   if (LHSBase) {
   1610     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
   1611     if (RHSBase && LHSBase == RHSBase) {
   1612       // We have common bases, fold the icmp to a constant based on the
   1613       // offsets.
   1614       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
   1615       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
   1616       if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
   1617         SimplifiedValues[&I] = C;
   1618         ++NumConstantPtrCmps;
   1619         return true;
   1620       }
   1621     }
   1622   }
   1623 
   1624   // If the comparison is an equality comparison with null, we can simplify it
   1625   // if we know the value (argument) can't be null
   1626   if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
   1627       isKnownNonNullInCallee(I.getOperand(0))) {
   1628     bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
   1629     SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
   1630                                       : ConstantInt::getFalse(I.getType());
   1631     return true;
   1632   }
   1633   return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
   1634 }
   1635 
   1636 bool CallAnalyzer::visitSub(BinaryOperator &I) {
   1637   // Try to handle a special case: we can fold computing the difference of two
   1638   // constant-related pointers.
   1639   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
   1640   Value *LHSBase, *RHSBase;
   1641   APInt LHSOffset, RHSOffset;
   1642   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
   1643   if (LHSBase) {
   1644     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
   1645     if (RHSBase && LHSBase == RHSBase) {
   1646       // We have common bases, fold the subtract to a constant based on the
   1647       // offsets.
   1648       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
   1649       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
   1650       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
   1651         SimplifiedValues[&I] = C;
   1652         ++NumConstantPtrDiffs;
   1653         return true;
   1654       }
   1655     }
   1656   }
   1657 
   1658   // Otherwise, fall back to the generic logic for simplifying and handling
   1659   // instructions.
   1660   return Base::visitSub(I);
   1661 }
   1662 
   1663 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
   1664   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
   1665   Constant *CLHS = dyn_cast<Constant>(LHS);
   1666   if (!CLHS)
   1667     CLHS = SimplifiedValues.lookup(LHS);
   1668   Constant *CRHS = dyn_cast<Constant>(RHS);
   1669   if (!CRHS)
   1670     CRHS = SimplifiedValues.lookup(RHS);
   1671 
   1672   Value *SimpleV = nullptr;
   1673   if (auto FI = dyn_cast<FPMathOperator>(&I))
   1674     SimpleV = SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
   1675                             FI->getFastMathFlags(), DL);
   1676   else
   1677     SimpleV =
   1678         SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
   1679 
   1680   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
   1681     SimplifiedValues[&I] = C;
   1682 
   1683   if (SimpleV)
   1684     return true;
   1685 
   1686   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
   1687   disableSROA(LHS);
   1688   disableSROA(RHS);
   1689 
   1690   // If the instruction is floating point, and the target says this operation
   1691   // is expensive, this may eventually become a library call. Treat the cost
   1692   // as such. Unless it's fneg which can be implemented with an xor.
   1693   using namespace llvm::PatternMatch;
   1694   if (I.getType()->isFloatingPointTy() &&
   1695       TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
   1696       !match(&I, m_FNeg(m_Value())))
   1697     onCallPenalty();
   1698 
   1699   return false;
   1700 }
   1701 
   1702 bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
   1703   Value *Op = I.getOperand(0);
   1704   Constant *COp = dyn_cast<Constant>(Op);
   1705   if (!COp)
   1706     COp = SimplifiedValues.lookup(Op);
   1707 
   1708   Value *SimpleV = SimplifyFNegInst(
   1709       COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
   1710 
   1711   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
   1712     SimplifiedValues[&I] = C;
   1713 
   1714   if (SimpleV)
   1715     return true;
   1716 
   1717   // Disable any SROA on arguments to arbitrary, unsimplified fneg.
   1718   disableSROA(Op);
   1719 
   1720   return false;
   1721 }
   1722 
   1723 bool CallAnalyzer::visitLoad(LoadInst &I) {
   1724   if (handleSROA(I.getPointerOperand(), I.isSimple()))
   1725     return true;
   1726 
   1727   // If the data is already loaded from this address and hasn't been clobbered
   1728   // by any stores or calls, this load is likely to be redundant and can be
   1729   // eliminated.
   1730   if (EnableLoadElimination &&
   1731       !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
   1732     onLoadEliminationOpportunity();
   1733     return true;
   1734   }
   1735 
   1736   return false;
   1737 }
   1738 
   1739 bool CallAnalyzer::visitStore(StoreInst &I) {
   1740   if (handleSROA(I.getPointerOperand(), I.isSimple()))
   1741     return true;
   1742 
   1743   // The store can potentially clobber loads and prevent repeated loads from
   1744   // being eliminated.
   1745   // FIXME:
   1746   // 1. We can probably keep an initial set of eliminatable loads substracted
   1747   // from the cost even when we finally see a store. We just need to disable
   1748   // *further* accumulation of elimination savings.
   1749   // 2. We should probably at some point thread MemorySSA for the callee into
   1750   // this and then use that to actually compute *really* precise savings.
   1751   disableLoadElimination();
   1752   return false;
   1753 }
   1754 
   1755 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
   1756   // Constant folding for extract value is trivial.
   1757   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
   1758         return ConstantExpr::getExtractValue(COps[0], I.getIndices());
   1759       }))
   1760     return true;
   1761 
   1762   // SROA can't look through these, but they may be free.
   1763   return Base::visitExtractValue(I);
   1764 }
   1765 
   1766 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
   1767   // Constant folding for insert value is trivial.
   1768   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
   1769         return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
   1770                                             /*InsertedValueOperand*/ COps[1],
   1771                                             I.getIndices());
   1772       }))
   1773     return true;
   1774 
   1775   // SROA can't look through these, but they may be free.
   1776   return Base::visitInsertValue(I);
   1777 }
   1778 
   1779 /// Try to simplify a call site.
   1780 ///
   1781 /// Takes a concrete function and callsite and tries to actually simplify it by
   1782 /// analyzing the arguments and call itself with instsimplify. Returns true if
   1783 /// it has simplified the callsite to some other entity (a constant), making it
   1784 /// free.
   1785 bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
   1786   // FIXME: Using the instsimplify logic directly for this is inefficient
   1787   // because we have to continually rebuild the argument list even when no
   1788   // simplifications can be performed. Until that is fixed with remapping
   1789   // inside of instsimplify, directly constant fold calls here.
   1790   if (!canConstantFoldCallTo(&Call, F))
   1791     return false;
   1792 
   1793   // Try to re-map the arguments to constants.
   1794   SmallVector<Constant *, 4> ConstantArgs;
   1795   ConstantArgs.reserve(Call.arg_size());
   1796   for (Value *I : Call.args()) {
   1797     Constant *C = dyn_cast<Constant>(I);
   1798     if (!C)
   1799       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
   1800     if (!C)
   1801       return false; // This argument doesn't map to a constant.
   1802 
   1803     ConstantArgs.push_back(C);
   1804   }
   1805   if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
   1806     SimplifiedValues[&Call] = C;
   1807     return true;
   1808   }
   1809 
   1810   return false;
   1811 }
   1812 
   1813 bool CallAnalyzer::visitCallBase(CallBase &Call) {
   1814   if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
   1815       !F.hasFnAttribute(Attribute::ReturnsTwice)) {
   1816     // This aborts the entire analysis.
   1817     ExposesReturnsTwice = true;
   1818     return false;
   1819   }
   1820   if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
   1821     ContainsNoDuplicateCall = true;
   1822 
   1823   Value *Callee = Call.getCalledOperand();
   1824   Function *F = dyn_cast_or_null<Function>(Callee);
   1825   bool IsIndirectCall = !F;
   1826   if (IsIndirectCall) {
   1827     // Check if this happens to be an indirect function call to a known function
   1828     // in this inline context. If not, we've done all we can.
   1829     F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
   1830     if (!F) {
   1831       onCallArgumentSetup(Call);
   1832 
   1833       if (!Call.onlyReadsMemory())
   1834         disableLoadElimination();
   1835       return Base::visitCallBase(Call);
   1836     }
   1837   }
   1838 
   1839   assert(F && "Expected a call to a known function");
   1840 
   1841   // When we have a concrete function, first try to simplify it directly.
   1842   if (simplifyCallSite(F, Call))
   1843     return true;
   1844 
   1845   // Next check if it is an intrinsic we know about.
   1846   // FIXME: Lift this into part of the InstVisitor.
   1847   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
   1848     switch (II->getIntrinsicID()) {
   1849     default:
   1850       if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
   1851         disableLoadElimination();
   1852       return Base::visitCallBase(Call);
   1853 
   1854     case Intrinsic::load_relative:
   1855       onLoadRelativeIntrinsic();
   1856       return false;
   1857 
   1858     case Intrinsic::memset:
   1859     case Intrinsic::memcpy:
   1860     case Intrinsic::memmove:
   1861       disableLoadElimination();
   1862       // SROA can usually chew through these intrinsics, but they aren't free.
   1863       return false;
   1864     case Intrinsic::icall_branch_funnel:
   1865     case Intrinsic::localescape:
   1866       HasUninlineableIntrinsic = true;
   1867       return false;
   1868     case Intrinsic::vastart:
   1869       InitsVargArgs = true;
   1870       return false;
   1871     case Intrinsic::launder_invariant_group:
   1872     case Intrinsic::strip_invariant_group:
   1873       if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
   1874         SROAArgValues[II] = SROAArg;
   1875       return true;
   1876     }
   1877   }
   1878 
   1879   if (F == Call.getFunction()) {
   1880     // This flag will fully abort the analysis, so don't bother with anything
   1881     // else.
   1882     IsRecursiveCall = true;
   1883     return false;
   1884   }
   1885 
   1886   if (TTI.isLoweredToCall(F)) {
   1887     onLoweredCall(F, Call, IsIndirectCall);
   1888   }
   1889 
   1890   if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
   1891     disableLoadElimination();
   1892   return Base::visitCallBase(Call);
   1893 }
   1894 
   1895 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
   1896   // At least one return instruction will be free after inlining.
   1897   bool Free = !HasReturn;
   1898   HasReturn = true;
   1899   return Free;
   1900 }
   1901 
   1902 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
   1903   // We model unconditional branches as essentially free -- they really
   1904   // shouldn't exist at all, but handling them makes the behavior of the
   1905   // inliner more regular and predictable. Interestingly, conditional branches
   1906   // which will fold away are also free.
   1907   return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
   1908          dyn_cast_or_null<ConstantInt>(
   1909              SimplifiedValues.lookup(BI.getCondition()));
   1910 }
   1911 
   1912 bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
   1913   bool CheckSROA = SI.getType()->isPointerTy();
   1914   Value *TrueVal = SI.getTrueValue();
   1915   Value *FalseVal = SI.getFalseValue();
   1916 
   1917   Constant *TrueC = dyn_cast<Constant>(TrueVal);
   1918   if (!TrueC)
   1919     TrueC = SimplifiedValues.lookup(TrueVal);
   1920   Constant *FalseC = dyn_cast<Constant>(FalseVal);
   1921   if (!FalseC)
   1922     FalseC = SimplifiedValues.lookup(FalseVal);
   1923   Constant *CondC =
   1924       dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
   1925 
   1926   if (!CondC) {
   1927     // Select C, X, X => X
   1928     if (TrueC == FalseC && TrueC) {
   1929       SimplifiedValues[&SI] = TrueC;
   1930       return true;
   1931     }
   1932 
   1933     if (!CheckSROA)
   1934       return Base::visitSelectInst(SI);
   1935 
   1936     std::pair<Value *, APInt> TrueBaseAndOffset =
   1937         ConstantOffsetPtrs.lookup(TrueVal);
   1938     std::pair<Value *, APInt> FalseBaseAndOffset =
   1939         ConstantOffsetPtrs.lookup(FalseVal);
   1940     if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
   1941       ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
   1942 
   1943       if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
   1944         SROAArgValues[&SI] = SROAArg;
   1945       return true;
   1946     }
   1947 
   1948     return Base::visitSelectInst(SI);
   1949   }
   1950 
   1951   // Select condition is a constant.
   1952   Value *SelectedV = CondC->isAllOnesValue()
   1953                          ? TrueVal
   1954                          : (CondC->isNullValue()) ? FalseVal : nullptr;
   1955   if (!SelectedV) {
   1956     // Condition is a vector constant that is not all 1s or all 0s.  If all
   1957     // operands are constants, ConstantExpr::getSelect() can handle the cases
   1958     // such as select vectors.
   1959     if (TrueC && FalseC) {
   1960       if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
   1961         SimplifiedValues[&SI] = C;
   1962         return true;
   1963       }
   1964     }
   1965     return Base::visitSelectInst(SI);
   1966   }
   1967 
   1968   // Condition is either all 1s or all 0s. SI can be simplified.
   1969   if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
   1970     SimplifiedValues[&SI] = SelectedC;
   1971     return true;
   1972   }
   1973 
   1974   if (!CheckSROA)
   1975     return true;
   1976 
   1977   std::pair<Value *, APInt> BaseAndOffset =
   1978       ConstantOffsetPtrs.lookup(SelectedV);
   1979   if (BaseAndOffset.first) {
   1980     ConstantOffsetPtrs[&SI] = BaseAndOffset;
   1981 
   1982     if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
   1983       SROAArgValues[&SI] = SROAArg;
   1984   }
   1985 
   1986   return true;
   1987 }
   1988 
   1989 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
   1990   // We model unconditional switches as free, see the comments on handling
   1991   // branches.
   1992   if (isa<ConstantInt>(SI.getCondition()))
   1993     return true;
   1994   if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
   1995     if (isa<ConstantInt>(V))
   1996       return true;
   1997 
   1998   // Assume the most general case where the switch is lowered into
   1999   // either a jump table, bit test, or a balanced binary tree consisting of
   2000   // case clusters without merging adjacent clusters with the same
   2001   // destination. We do not consider the switches that are lowered with a mix
   2002   // of jump table/bit test/binary search tree. The cost of the switch is
   2003   // proportional to the size of the tree or the size of jump table range.
   2004   //
   2005   // NB: We convert large switches which are just used to initialize large phi
   2006   // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
   2007   // inlining those. It will prevent inlining in cases where the optimization
   2008   // does not (yet) fire.
   2009 
   2010   unsigned JumpTableSize = 0;
   2011   BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
   2012   unsigned NumCaseCluster =
   2013       TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
   2014 
   2015   onFinalizeSwitch(JumpTableSize, NumCaseCluster);
   2016   return false;
   2017 }
   2018 
   2019 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
   2020   // We never want to inline functions that contain an indirectbr.  This is
   2021   // incorrect because all the blockaddress's (in static global initializers
   2022   // for example) would be referring to the original function, and this
   2023   // indirect jump would jump from the inlined copy of the function into the
   2024   // original function which is extremely undefined behavior.
   2025   // FIXME: This logic isn't really right; we can safely inline functions with
   2026   // indirectbr's as long as no other function or global references the
   2027   // blockaddress of a block within the current function.
   2028   HasIndirectBr = true;
   2029   return false;
   2030 }
   2031 
   2032 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
   2033   // FIXME: It's not clear that a single instruction is an accurate model for
   2034   // the inline cost of a resume instruction.
   2035   return false;
   2036 }
   2037 
   2038 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
   2039   // FIXME: It's not clear that a single instruction is an accurate model for
   2040   // the inline cost of a cleanupret instruction.
   2041   return false;
   2042 }
   2043 
   2044 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
   2045   // FIXME: It's not clear that a single instruction is an accurate model for
   2046   // the inline cost of a catchret instruction.
   2047   return false;
   2048 }
   2049 
   2050 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
   2051   // FIXME: It might be reasonably to discount the cost of instructions leading
   2052   // to unreachable as they have the lowest possible impact on both runtime and
   2053   // code size.
   2054   return true; // No actual code is needed for unreachable.
   2055 }
   2056 
   2057 bool CallAnalyzer::visitInstruction(Instruction &I) {
   2058   // Some instructions are free. All of the free intrinsics can also be
   2059   // handled by SROA, etc.
   2060   if (TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
   2061       TargetTransformInfo::TCC_Free)
   2062     return true;
   2063 
   2064   // We found something we don't understand or can't handle. Mark any SROA-able
   2065   // values in the operand list as no longer viable.
   2066   for (const Use &Op : I.operands())
   2067     disableSROA(Op);
   2068 
   2069   return false;
   2070 }
   2071 
   2072 /// Analyze a basic block for its contribution to the inline cost.
   2073 ///
   2074 /// This method walks the analyzer over every instruction in the given basic
   2075 /// block and accounts for their cost during inlining at this callsite. It
   2076 /// aborts early if the threshold has been exceeded or an impossible to inline
   2077 /// construct has been detected. It returns false if inlining is no longer
   2078 /// viable, and true if inlining remains viable.
   2079 InlineResult
   2080 CallAnalyzer::analyzeBlock(BasicBlock *BB,
   2081                            SmallPtrSetImpl<const Value *> &EphValues) {
   2082   for (Instruction &I : *BB) {
   2083     // FIXME: Currently, the number of instructions in a function regardless of
   2084     // our ability to simplify them during inline to constants or dead code,
   2085     // are actually used by the vector bonus heuristic. As long as that's true,
   2086     // we have to special case debug intrinsics here to prevent differences in
   2087     // inlining due to debug symbols. Eventually, the number of unsimplified
   2088     // instructions shouldn't factor into the cost computation, but until then,
   2089     // hack around it here.
   2090     if (isa<DbgInfoIntrinsic>(I))
   2091       continue;
   2092 
   2093     // Skip pseudo-probes.
   2094     if (isa<PseudoProbeInst>(I))
   2095       continue;
   2096 
   2097     // Skip ephemeral values.
   2098     if (EphValues.count(&I))
   2099       continue;
   2100 
   2101     ++NumInstructions;
   2102     if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
   2103       ++NumVectorInstructions;
   2104 
   2105     // If the instruction simplified to a constant, there is no cost to this
   2106     // instruction. Visit the instructions using our InstVisitor to account for
   2107     // all of the per-instruction logic. The visit tree returns true if we
   2108     // consumed the instruction in any way, and false if the instruction's base
   2109     // cost should count against inlining.
   2110     onInstructionAnalysisStart(&I);
   2111 
   2112     if (Base::visit(&I))
   2113       ++NumInstructionsSimplified;
   2114     else
   2115       onMissedSimplification();
   2116 
   2117     onInstructionAnalysisFinish(&I);
   2118     using namespace ore;
   2119     // If the visit this instruction detected an uninlinable pattern, abort.
   2120     InlineResult IR = InlineResult::success();
   2121     if (IsRecursiveCall)
   2122       IR = InlineResult::failure("recursive");
   2123     else if (ExposesReturnsTwice)
   2124       IR = InlineResult::failure("exposes returns twice");
   2125     else if (HasDynamicAlloca)
   2126       IR = InlineResult::failure("dynamic alloca");
   2127     else if (HasIndirectBr)
   2128       IR = InlineResult::failure("indirect branch");
   2129     else if (HasUninlineableIntrinsic)
   2130       IR = InlineResult::failure("uninlinable intrinsic");
   2131     else if (InitsVargArgs)
   2132       IR = InlineResult::failure("varargs");
   2133     if (!IR.isSuccess()) {
   2134       if (ORE)
   2135         ORE->emit([&]() {
   2136           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
   2137                                           &CandidateCall)
   2138                  << NV("Callee", &F) << " has uninlinable pattern ("
   2139                  << NV("InlineResult", IR.getFailureReason())
   2140                  << ") and cost is not fully computed";
   2141         });
   2142       return IR;
   2143     }
   2144 
   2145     // If the caller is a recursive function then we don't want to inline
   2146     // functions which allocate a lot of stack space because it would increase
   2147     // the caller stack usage dramatically.
   2148     if (IsCallerRecursive &&
   2149         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
   2150       auto IR =
   2151           InlineResult::failure("recursive and allocates too much stack space");
   2152       if (ORE)
   2153         ORE->emit([&]() {
   2154           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
   2155                                           &CandidateCall)
   2156                  << NV("Callee", &F) << " is "
   2157                  << NV("InlineResult", IR.getFailureReason())
   2158                  << ". Cost is not fully computed";
   2159         });
   2160       return IR;
   2161     }
   2162 
   2163     if (shouldStop())
   2164       return InlineResult::failure(
   2165           "Call site analysis is not favorable to inlining.");
   2166   }
   2167 
   2168   return InlineResult::success();
   2169 }
   2170 
   2171 /// Compute the base pointer and cumulative constant offsets for V.
   2172 ///
   2173 /// This strips all constant offsets off of V, leaving it the base pointer, and
   2174 /// accumulates the total constant offset applied in the returned constant. It
   2175 /// returns 0 if V is not a pointer, and returns the constant '0' if there are
   2176 /// no constant offsets applied.
   2177 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
   2178   if (!V->getType()->isPointerTy())
   2179     return nullptr;
   2180 
   2181   unsigned AS = V->getType()->getPointerAddressSpace();
   2182   unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
   2183   APInt Offset = APInt::getNullValue(IntPtrWidth);
   2184 
   2185   // Even though we don't look through PHI nodes, we could be called on an
   2186   // instruction in an unreachable block, which may be on a cycle.
   2187   SmallPtrSet<Value *, 4> Visited;
   2188   Visited.insert(V);
   2189   do {
   2190     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
   2191       if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
   2192         return nullptr;
   2193       V = GEP->getPointerOperand();
   2194     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
   2195       V = cast<Operator>(V)->getOperand(0);
   2196     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
   2197       if (GA->isInterposable())
   2198         break;
   2199       V = GA->getAliasee();
   2200     } else {
   2201       break;
   2202     }
   2203     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
   2204   } while (Visited.insert(V).second);
   2205 
   2206   Type *IdxPtrTy = DL.getIndexType(V->getType());
   2207   return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
   2208 }
   2209 
   2210 /// Find dead blocks due to deleted CFG edges during inlining.
   2211 ///
   2212 /// If we know the successor of the current block, \p CurrBB, has to be \p
   2213 /// NextBB, the other successors of \p CurrBB are dead if these successors have
   2214 /// no live incoming CFG edges.  If one block is found to be dead, we can
   2215 /// continue growing the dead block list by checking the successors of the dead
   2216 /// blocks to see if all their incoming edges are dead or not.
   2217 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
   2218   auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
   2219     // A CFG edge is dead if the predecessor is dead or the predecessor has a
   2220     // known successor which is not the one under exam.
   2221     return (DeadBlocks.count(Pred) ||
   2222             (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
   2223   };
   2224 
   2225   auto IsNewlyDead = [&](BasicBlock *BB) {
   2226     // If all the edges to a block are dead, the block is also dead.
   2227     return (!DeadBlocks.count(BB) &&
   2228             llvm::all_of(predecessors(BB),
   2229                          [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
   2230   };
   2231 
   2232   for (BasicBlock *Succ : successors(CurrBB)) {
   2233     if (Succ == NextBB || !IsNewlyDead(Succ))
   2234       continue;
   2235     SmallVector<BasicBlock *, 4> NewDead;
   2236     NewDead.push_back(Succ);
   2237     while (!NewDead.empty()) {
   2238       BasicBlock *Dead = NewDead.pop_back_val();
   2239       if (DeadBlocks.insert(Dead))
   2240         // Continue growing the dead block lists.
   2241         for (BasicBlock *S : successors(Dead))
   2242           if (IsNewlyDead(S))
   2243             NewDead.push_back(S);
   2244     }
   2245   }
   2246 }
   2247 
   2248 /// Analyze a call site for potential inlining.
   2249 ///
   2250 /// Returns true if inlining this call is viable, and false if it is not
   2251 /// viable. It computes the cost and adjusts the threshold based on numerous
   2252 /// factors and heuristics. If this method returns false but the computed cost
   2253 /// is below the computed threshold, then inlining was forcibly disabled by
   2254 /// some artifact of the routine.
   2255 InlineResult CallAnalyzer::analyze() {
   2256   ++NumCallsAnalyzed;
   2257 
   2258   auto Result = onAnalysisStart();
   2259   if (!Result.isSuccess())
   2260     return Result;
   2261 
   2262   if (F.empty())
   2263     return InlineResult::success();
   2264 
   2265   Function *Caller = CandidateCall.getFunction();
   2266   // Check if the caller function is recursive itself.
   2267   for (User *U : Caller->users()) {
   2268     CallBase *Call = dyn_cast<CallBase>(U);
   2269     if (Call && Call->getFunction() == Caller) {
   2270       IsCallerRecursive = true;
   2271       break;
   2272     }
   2273   }
   2274 
   2275   // Populate our simplified values by mapping from function arguments to call
   2276   // arguments with known important simplifications.
   2277   auto CAI = CandidateCall.arg_begin();
   2278   for (Argument &FAI : F.args()) {
   2279     assert(CAI != CandidateCall.arg_end());
   2280     if (Constant *C = dyn_cast<Constant>(CAI))
   2281       SimplifiedValues[&FAI] = C;
   2282 
   2283     Value *PtrArg = *CAI;
   2284     if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
   2285       ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());
   2286 
   2287       // We can SROA any pointer arguments derived from alloca instructions.
   2288       if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
   2289         SROAArgValues[&FAI] = SROAArg;
   2290         onInitializeSROAArg(SROAArg);
   2291         EnabledSROAAllocas.insert(SROAArg);
   2292       }
   2293     }
   2294     ++CAI;
   2295   }
   2296   NumConstantArgs = SimplifiedValues.size();
   2297   NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
   2298   NumAllocaArgs = SROAArgValues.size();
   2299 
   2300   // FIXME: If a caller has multiple calls to a callee, we end up recomputing
   2301   // the ephemeral values multiple times (and they're completely determined by
   2302   // the callee, so this is purely duplicate work).
   2303   SmallPtrSet<const Value *, 32> EphValues;
   2304   CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
   2305 
   2306   // The worklist of live basic blocks in the callee *after* inlining. We avoid
   2307   // adding basic blocks of the callee which can be proven to be dead for this
   2308   // particular call site in order to get more accurate cost estimates. This
   2309   // requires a somewhat heavyweight iteration pattern: we need to walk the
   2310   // basic blocks in a breadth-first order as we insert live successors. To
   2311   // accomplish this, prioritizing for small iterations because we exit after
   2312   // crossing our threshold, we use a small-size optimized SetVector.
   2313   typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
   2314                     SmallPtrSet<BasicBlock *, 16>>
   2315       BBSetVector;
   2316   BBSetVector BBWorklist;
   2317   BBWorklist.insert(&F.getEntryBlock());
   2318 
   2319   // Note that we *must not* cache the size, this loop grows the worklist.
   2320   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
   2321     if (shouldStop())
   2322       break;
   2323 
   2324     BasicBlock *BB = BBWorklist[Idx];
   2325     if (BB->empty())
   2326       continue;
   2327 
   2328     onBlockStart(BB);
   2329 
   2330     // Disallow inlining a blockaddress with uses other than strictly callbr.
   2331     // A blockaddress only has defined behavior for an indirect branch in the
   2332     // same function, and we do not currently support inlining indirect
   2333     // branches.  But, the inliner may not see an indirect branch that ends up
   2334     // being dead code at a particular call site. If the blockaddress escapes
   2335     // the function, e.g., via a global variable, inlining may lead to an
   2336     // invalid cross-function reference.
   2337     // FIXME: pr/39560: continue relaxing this overt restriction.
   2338     if (BB->hasAddressTaken())
   2339       for (User *U : BlockAddress::get(&*BB)->users())
   2340         if (!isa<CallBrInst>(*U))
   2341           return InlineResult::failure("blockaddress used outside of callbr");
   2342 
   2343     // Analyze the cost of this block. If we blow through the threshold, this
   2344     // returns false, and we can bail on out.
   2345     InlineResult IR = analyzeBlock(BB, EphValues);
   2346     if (!IR.isSuccess())
   2347       return IR;
   2348 
   2349     Instruction *TI = BB->getTerminator();
   2350 
   2351     // Add in the live successors by first checking whether we have terminator
   2352     // that may be simplified based on the values simplified by this call.
   2353     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
   2354       if (BI->isConditional()) {
   2355         Value *Cond = BI->getCondition();
   2356         if (ConstantInt *SimpleCond =
   2357                 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
   2358           BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
   2359           BBWorklist.insert(NextBB);
   2360           KnownSuccessors[BB] = NextBB;
   2361           findDeadBlocks(BB, NextBB);
   2362           continue;
   2363         }
   2364       }
   2365     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
   2366       Value *Cond = SI->getCondition();
   2367       if (ConstantInt *SimpleCond =
   2368               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
   2369         BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
   2370         BBWorklist.insert(NextBB);
   2371         KnownSuccessors[BB] = NextBB;
   2372         findDeadBlocks(BB, NextBB);
   2373         continue;
   2374       }
   2375     }
   2376 
   2377     // If we're unable to select a particular successor, just count all of
   2378     // them.
   2379     for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
   2380          ++TIdx)
   2381       BBWorklist.insert(TI->getSuccessor(TIdx));
   2382 
   2383     onBlockAnalyzed(BB);
   2384   }
   2385 
   2386   bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
   2387                                     &F == CandidateCall.getCalledFunction();
   2388   // If this is a noduplicate call, we can still inline as long as
   2389   // inlining this would cause the removal of the caller (so the instruction
   2390   // is not actually duplicated, just moved).
   2391   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
   2392     return InlineResult::failure("noduplicate");
   2393 
   2394   return finalizeAnalysis();
   2395 }
   2396 
   2397 void InlineCostCallAnalyzer::print() {
   2398 #define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
   2399   if (PrintInstructionComments)
   2400     F.print(dbgs(), &Writer);
   2401   DEBUG_PRINT_STAT(NumConstantArgs);
   2402   DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
   2403   DEBUG_PRINT_STAT(NumAllocaArgs);
   2404   DEBUG_PRINT_STAT(NumConstantPtrCmps);
   2405   DEBUG_PRINT_STAT(NumConstantPtrDiffs);
   2406   DEBUG_PRINT_STAT(NumInstructionsSimplified);
   2407   DEBUG_PRINT_STAT(NumInstructions);
   2408   DEBUG_PRINT_STAT(SROACostSavings);
   2409   DEBUG_PRINT_STAT(SROACostSavingsLost);
   2410   DEBUG_PRINT_STAT(LoadEliminationCost);
   2411   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
   2412   DEBUG_PRINT_STAT(Cost);
   2413   DEBUG_PRINT_STAT(Threshold);
   2414 #undef DEBUG_PRINT_STAT
   2415 }
   2416 
   2417 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   2418 /// Dump stats about this call's analysis.
   2419 LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() {
   2420   print();
   2421 }
   2422 #endif
   2423 
   2424 /// Test that there are no attribute conflicts between Caller and Callee
   2425 ///        that prevent inlining.
   2426 static bool functionsHaveCompatibleAttributes(
   2427     Function *Caller, Function *Callee, TargetTransformInfo &TTI,
   2428     function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
   2429   // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
   2430   // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
   2431   // object, and always returns the same object (which is overwritten on each
   2432   // GetTLI call). Therefore we copy the first result.
   2433   auto CalleeTLI = GetTLI(*Callee);
   2434   return TTI.areInlineCompatible(Caller, Callee) &&
   2435          GetTLI(*Caller).areInlineCompatible(CalleeTLI,
   2436                                              InlineCallerSupersetNoBuiltin) &&
   2437          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
   2438 }
   2439 
   2440 int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) {
   2441   int Cost = 0;
   2442   for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
   2443     if (Call.isByValArgument(I)) {
   2444       // We approximate the number of loads and stores needed by dividing the
   2445       // size of the byval type by the target's pointer size.
   2446       PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
   2447       unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
   2448       unsigned AS = PTy->getAddressSpace();
   2449       unsigned PointerSize = DL.getPointerSizeInBits(AS);
   2450       // Ceiling division.
   2451       unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
   2452 
   2453       // If it generates more than 8 stores it is likely to be expanded as an
   2454       // inline memcpy so we take that as an upper bound. Otherwise we assume
   2455       // one load and one store per word copied.
   2456       // FIXME: The maxStoresPerMemcpy setting from the target should be used
   2457       // here instead of a magic number of 8, but it's not available via
   2458       // DataLayout.
   2459       NumStores = std::min(NumStores, 8U);
   2460 
   2461       Cost += 2 * NumStores * InlineConstants::InstrCost;
   2462     } else {
   2463       // For non-byval arguments subtract off one instruction per call
   2464       // argument.
   2465       Cost += InlineConstants::InstrCost;
   2466     }
   2467   }
   2468   // The call instruction also disappears after inlining.
   2469   Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
   2470   return Cost;
   2471 }
   2472 
   2473 InlineCost llvm::getInlineCost(
   2474     CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
   2475     function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
   2476     function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
   2477     function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
   2478     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
   2479   return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
   2480                        GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
   2481 }
   2482 
   2483 Optional<int> llvm::getInliningCostEstimate(
   2484     CallBase &Call, TargetTransformInfo &CalleeTTI,
   2485     function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
   2486     function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
   2487     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
   2488   const InlineParams Params = {/* DefaultThreshold*/ 0,
   2489                                /*HintThreshold*/ {},
   2490                                /*ColdThreshold*/ {},
   2491                                /*OptSizeThreshold*/ {},
   2492                                /*OptMinSizeThreshold*/ {},
   2493                                /*HotCallSiteThreshold*/ {},
   2494                                /*LocallyHotCallSiteThreshold*/ {},
   2495                                /*ColdCallSiteThreshold*/ {},
   2496                                /*ComputeFullInlineCost*/ true,
   2497                                /*EnableDeferral*/ true};
   2498 
   2499   InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
   2500                             GetAssumptionCache, GetBFI, PSI, ORE, true,
   2501                             /*IgnoreThreshold*/ true);
   2502   auto R = CA.analyze();
   2503   if (!R.isSuccess())
   2504     return None;
   2505   return CA.getCost();
   2506 }
   2507 
   2508 Optional<InlineResult> llvm::getAttributeBasedInliningDecision(
   2509     CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
   2510     function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
   2511 
   2512   // Cannot inline indirect calls.
   2513   if (!Callee)
   2514     return InlineResult::failure("indirect call");
   2515 
   2516   // When callee coroutine function is inlined into caller coroutine function
   2517   // before coro-split pass,
   2518   // coro-early pass can not handle this quiet well.
   2519   // So we won't inline the coroutine function if it have not been unsplited
   2520   if (Callee->isPresplitCoroutine())
   2521     return InlineResult::failure("unsplited coroutine call");
   2522 
   2523   // Never inline calls with byval arguments that does not have the alloca
   2524   // address space. Since byval arguments can be replaced with a copy to an
   2525   // alloca, the inlined code would need to be adjusted to handle that the
   2526   // argument is in the alloca address space (so it is a little bit complicated
   2527   // to solve).
   2528   unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
   2529   for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
   2530     if (Call.isByValArgument(I)) {
   2531       PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
   2532       if (PTy->getAddressSpace() != AllocaAS)
   2533         return InlineResult::failure("byval arguments without alloca"
   2534                                      " address space");
   2535     }
   2536 
   2537   // Calls to functions with always-inline attributes should be inlined
   2538   // whenever possible.
   2539   if (Call.hasFnAttr(Attribute::AlwaysInline)) {
   2540     auto IsViable = isInlineViable(*Callee);
   2541     if (IsViable.isSuccess())
   2542       return InlineResult::success();
   2543     return InlineResult::failure(IsViable.getFailureReason());
   2544   }
   2545 
   2546   // Never inline functions with conflicting attributes (unless callee has
   2547   // always-inline attribute).
   2548   Function *Caller = Call.getCaller();
   2549   if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
   2550     return InlineResult::failure("conflicting attributes");
   2551 
   2552   // Don't inline this call if the caller has the optnone attribute.
   2553   if (Caller->hasOptNone())
   2554     return InlineResult::failure("optnone attribute");
   2555 
   2556   // Don't inline a function that treats null pointer as valid into a caller
   2557   // that does not have this attribute.
   2558   if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
   2559     return InlineResult::failure("nullptr definitions incompatible");
   2560 
   2561   // Don't inline functions which can be interposed at link-time.
   2562   if (Callee->isInterposable())
   2563     return InlineResult::failure("interposable");
   2564 
   2565   // Don't inline functions marked noinline.
   2566   if (Callee->hasFnAttribute(Attribute::NoInline))
   2567     return InlineResult::failure("noinline function attribute");
   2568 
   2569   // Don't inline call sites marked noinline.
   2570   if (Call.isNoInline())
   2571     return InlineResult::failure("noinline call site attribute");
   2572 
   2573   // Don't inline functions if one does not have any stack protector attribute
   2574   // but the other does.
   2575   if (Caller->hasStackProtectorFnAttr() && !Callee->hasStackProtectorFnAttr())
   2576     return InlineResult::failure(
   2577         "stack protected caller but callee requested no stack protector");
   2578   if (Callee->hasStackProtectorFnAttr() && !Caller->hasStackProtectorFnAttr())
   2579     return InlineResult::failure(
   2580         "stack protected callee but caller requested no stack protector");
   2581 
   2582   return None;
   2583 }
   2584 
   2585 InlineCost llvm::getInlineCost(
   2586     CallBase &Call, Function *Callee, const InlineParams &Params,
   2587     TargetTransformInfo &CalleeTTI,
   2588     function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
   2589     function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
   2590     function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
   2591     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
   2592 
   2593   auto UserDecision =
   2594       llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
   2595 
   2596   if (UserDecision.hasValue()) {
   2597     if (UserDecision->isSuccess())
   2598       return llvm::InlineCost::getAlways("always inline attribute");
   2599     return llvm::InlineCost::getNever(UserDecision->getFailureReason());
   2600   }
   2601 
   2602   LLVM_DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
   2603                           << "... (caller:" << Call.getCaller()->getName()
   2604                           << ")\n");
   2605 
   2606   InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
   2607                             GetAssumptionCache, GetBFI, PSI, ORE);
   2608   InlineResult ShouldInline = CA.analyze();
   2609 
   2610   LLVM_DEBUG(CA.dump());
   2611 
   2612   // Always make cost benefit based decision explicit.
   2613   // We use always/never here since threshold is not meaningful,
   2614   // as it's not what drives cost-benefit analysis.
   2615   if (CA.wasDecidedByCostBenefit()) {
   2616     if (ShouldInline.isSuccess())
   2617       return InlineCost::getAlways("benefit over cost");
   2618     else
   2619       return InlineCost::getNever("cost over benefit");
   2620   }
   2621 
   2622   // Check if there was a reason to force inlining or no inlining.
   2623   if (!ShouldInline.isSuccess() && CA.getCost() < CA.getThreshold())
   2624     return InlineCost::getNever(ShouldInline.getFailureReason());
   2625   if (ShouldInline.isSuccess() && CA.getCost() >= CA.getThreshold())
   2626     return InlineCost::getAlways("empty function");
   2627 
   2628   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
   2629 }
   2630 
   2631 InlineResult llvm::isInlineViable(Function &F) {
   2632   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
   2633   for (BasicBlock &BB : F) {
   2634     // Disallow inlining of functions which contain indirect branches.
   2635     if (isa<IndirectBrInst>(BB.getTerminator()))
   2636       return InlineResult::failure("contains indirect branches");
   2637 
   2638     // Disallow inlining of blockaddresses which are used by non-callbr
   2639     // instructions.
   2640     if (BB.hasAddressTaken())
   2641       for (User *U : BlockAddress::get(&BB)->users())
   2642         if (!isa<CallBrInst>(*U))
   2643           return InlineResult::failure("blockaddress used outside of callbr");
   2644 
   2645     for (auto &II : BB) {
   2646       CallBase *Call = dyn_cast<CallBase>(&II);
   2647       if (!Call)
   2648         continue;
   2649 
   2650       // Disallow recursive calls.
   2651       Function *Callee = Call->getCalledFunction();
   2652       if (&F == Callee)
   2653         return InlineResult::failure("recursive call");
   2654 
   2655       // Disallow calls which expose returns-twice to a function not previously
   2656       // attributed as such.
   2657       if (!ReturnsTwice && isa<CallInst>(Call) &&
   2658           cast<CallInst>(Call)->canReturnTwice())
   2659         return InlineResult::failure("exposes returns-twice attribute");
   2660 
   2661       if (Callee)
   2662         switch (Callee->getIntrinsicID()) {
   2663         default:
   2664           break;
   2665         case llvm::Intrinsic::icall_branch_funnel:
   2666           // Disallow inlining of @llvm.icall.branch.funnel because current
   2667           // backend can't separate call targets from call arguments.
   2668           return InlineResult::failure(
   2669               "disallowed inlining of @llvm.icall.branch.funnel");
   2670         case llvm::Intrinsic::localescape:
   2671           // Disallow inlining functions that call @llvm.localescape. Doing this
   2672           // correctly would require major changes to the inliner.
   2673           return InlineResult::failure(
   2674               "disallowed inlining of @llvm.localescape");
   2675         case llvm::Intrinsic::vastart:
   2676           // Disallow inlining of functions that initialize VarArgs with
   2677           // va_start.
   2678           return InlineResult::failure(
   2679               "contains VarArgs initialized with va_start");
   2680         }
   2681     }
   2682   }
   2683 
   2684   return InlineResult::success();
   2685 }
   2686 
   2687 // APIs to create InlineParams based on command line flags and/or other
   2688 // parameters.
   2689 
   2690 InlineParams llvm::getInlineParams(int Threshold) {
   2691   InlineParams Params;
   2692 
   2693   // This field is the threshold to use for a callee by default. This is
   2694   // derived from one or more of:
   2695   //  * optimization or size-optimization levels,
   2696   //  * a value passed to createFunctionInliningPass function, or
   2697   //  * the -inline-threshold flag.
   2698   //  If the -inline-threshold flag is explicitly specified, that is used
   2699   //  irrespective of anything else.
   2700   if (InlineThreshold.getNumOccurrences() > 0)
   2701     Params.DefaultThreshold = InlineThreshold;
   2702   else
   2703     Params.DefaultThreshold = Threshold;
   2704 
   2705   // Set the HintThreshold knob from the -inlinehint-threshold.
   2706   Params.HintThreshold = HintThreshold;
   2707 
   2708   // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
   2709   Params.HotCallSiteThreshold = HotCallSiteThreshold;
   2710 
   2711   // If the -locally-hot-callsite-threshold is explicitly specified, use it to
   2712   // populate LocallyHotCallSiteThreshold. Later, we populate
   2713   // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
   2714   // we know that optimization level is O3 (in the getInlineParams variant that
   2715   // takes the opt and size levels).
   2716   // FIXME: Remove this check (and make the assignment unconditional) after
   2717   // addressing size regression issues at O2.
   2718   if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
   2719     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
   2720 
   2721   // Set the ColdCallSiteThreshold knob from the
   2722   // -inline-cold-callsite-threshold.
   2723   Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
   2724 
   2725   // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
   2726   // -inlinehint-threshold commandline option is not explicitly given. If that
   2727   // option is present, then its value applies even for callees with size and
   2728   // minsize attributes.
   2729   // If the -inline-threshold is not specified, set the ColdThreshold from the
   2730   // -inlinecold-threshold even if it is not explicitly passed. If
   2731   // -inline-threshold is specified, then -inlinecold-threshold needs to be
   2732   // explicitly specified to set the ColdThreshold knob
   2733   if (InlineThreshold.getNumOccurrences() == 0) {
   2734     Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
   2735     Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
   2736     Params.ColdThreshold = ColdThreshold;
   2737   } else if (ColdThreshold.getNumOccurrences() > 0) {
   2738     Params.ColdThreshold = ColdThreshold;
   2739   }
   2740   return Params;
   2741 }
   2742 
   2743 InlineParams llvm::getInlineParams() {
   2744   return getInlineParams(DefaultThreshold);
   2745 }
   2746 
   2747 // Compute the default threshold for inlining based on the opt level and the
   2748 // size opt level.
   2749 static int computeThresholdFromOptLevels(unsigned OptLevel,
   2750                                          unsigned SizeOptLevel) {
   2751   if (OptLevel > 2)
   2752     return InlineConstants::OptAggressiveThreshold;
   2753   if (SizeOptLevel == 1) // -Os
   2754     return InlineConstants::OptSizeThreshold;
   2755   if (SizeOptLevel == 2) // -Oz
   2756     return InlineConstants::OptMinSizeThreshold;
   2757   return DefaultThreshold;
   2758 }
   2759 
   2760 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
   2761   auto Params =
   2762       getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
   2763   // At O3, use the value of -locally-hot-callsite-threshold option to populate
   2764   // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
   2765   // when it is specified explicitly.
   2766   if (OptLevel > 2)
   2767     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
   2768   return Params;
   2769 }
   2770 
   2771 PreservedAnalyses
   2772 InlineCostAnnotationPrinterPass::run(Function &F,
   2773                                      FunctionAnalysisManager &FAM) {
   2774   PrintInstructionComments = true;
   2775   std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&](
   2776       Function &F) -> AssumptionCache & {
   2777     return FAM.getResult<AssumptionAnalysis>(F);
   2778   };
   2779   Module *M = F.getParent();
   2780   ProfileSummaryInfo PSI(*M);
   2781   DataLayout DL(M);
   2782   TargetTransformInfo TTI(DL);
   2783   // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
   2784   // In the current implementation, the type of InlineParams doesn't matter as
   2785   // the pass serves only for verification of inliner's decisions.
   2786   // We can add a flag which determines InlineParams for this run. Right now,
   2787   // the default InlineParams are used.
   2788   const InlineParams Params = llvm::getInlineParams();
   2789   for (BasicBlock &BB : F) {
   2790     for (Instruction &I : BB) {
   2791       if (CallInst *CI = dyn_cast<CallInst>(&I)) {
   2792         Function *CalledFunction = CI->getCalledFunction();
   2793         if (!CalledFunction || CalledFunction->isDeclaration())
   2794           continue;
   2795         OptimizationRemarkEmitter ORE(CalledFunction);
   2796         InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
   2797                                     GetAssumptionCache, nullptr, &PSI, &ORE);
   2798         ICCA.analyze();
   2799         OS << "      Analyzing call of " << CalledFunction->getName()
   2800            << "... (caller:" << CI->getCaller()->getName() << ")\n";
   2801         ICCA.print();
   2802       }
   2803     }
   2804   }
   2805   return PreservedAnalyses::all();
   2806 }
   2807