Home | History | Annotate | Line # | Download | only in IPO
      1 //===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file implements the SampleProfileLoader transformation. This pass
     10 // reads a profile file generated by a sampling profiler (e.g. Linux Perf -
     11 // http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
     12 // profile information in the given profile.
     13 //
     14 // This pass generates branch weight annotations on the IR:
     15 //
     16 // - prof: Represents branch weights. This annotation is added to branches
     17 //      to indicate the weights of each edge coming out of the branch.
     18 //      The weight of each edge is the weight of the target block for
     19 //      that edge. The weight of a block B is computed as the maximum
     20 //      number of samples found in B.
     21 //
     22 //===----------------------------------------------------------------------===//
     23 
     24 #include "llvm/Transforms/IPO/SampleProfile.h"
     25 #include "llvm/ADT/ArrayRef.h"
     26 #include "llvm/ADT/DenseMap.h"
     27 #include "llvm/ADT/DenseSet.h"
     28 #include "llvm/ADT/None.h"
     29 #include "llvm/ADT/PriorityQueue.h"
     30 #include "llvm/ADT/SCCIterator.h"
     31 #include "llvm/ADT/SmallPtrSet.h"
     32 #include "llvm/ADT/SmallSet.h"
     33 #include "llvm/ADT/SmallVector.h"
     34 #include "llvm/ADT/Statistic.h"
     35 #include "llvm/ADT/StringMap.h"
     36 #include "llvm/ADT/StringRef.h"
     37 #include "llvm/ADT/Twine.h"
     38 #include "llvm/Analysis/AssumptionCache.h"
     39 #include "llvm/Analysis/CallGraph.h"
     40 #include "llvm/Analysis/CallGraphSCCPass.h"
     41 #include "llvm/Analysis/InlineAdvisor.h"
     42 #include "llvm/Analysis/InlineCost.h"
     43 #include "llvm/Analysis/LoopInfo.h"
     44 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
     45 #include "llvm/Analysis/PostDominators.h"
     46 #include "llvm/Analysis/ProfileSummaryInfo.h"
     47 #include "llvm/Analysis/ReplayInlineAdvisor.h"
     48 #include "llvm/Analysis/TargetLibraryInfo.h"
     49 #include "llvm/Analysis/TargetTransformInfo.h"
     50 #include "llvm/IR/BasicBlock.h"
     51 #include "llvm/IR/CFG.h"
     52 #include "llvm/IR/DebugInfoMetadata.h"
     53 #include "llvm/IR/DebugLoc.h"
     54 #include "llvm/IR/DiagnosticInfo.h"
     55 #include "llvm/IR/Dominators.h"
     56 #include "llvm/IR/Function.h"
     57 #include "llvm/IR/GlobalValue.h"
     58 #include "llvm/IR/InstrTypes.h"
     59 #include "llvm/IR/Instruction.h"
     60 #include "llvm/IR/Instructions.h"
     61 #include "llvm/IR/IntrinsicInst.h"
     62 #include "llvm/IR/LLVMContext.h"
     63 #include "llvm/IR/MDBuilder.h"
     64 #include "llvm/IR/Module.h"
     65 #include "llvm/IR/PassManager.h"
     66 #include "llvm/IR/ValueSymbolTable.h"
     67 #include "llvm/InitializePasses.h"
     68 #include "llvm/Pass.h"
     69 #include "llvm/ProfileData/InstrProf.h"
     70 #include "llvm/ProfileData/SampleProf.h"
     71 #include "llvm/ProfileData/SampleProfReader.h"
     72 #include "llvm/Support/Casting.h"
     73 #include "llvm/Support/CommandLine.h"
     74 #include "llvm/Support/Debug.h"
     75 #include "llvm/Support/ErrorHandling.h"
     76 #include "llvm/Support/ErrorOr.h"
     77 #include "llvm/Support/GenericDomTree.h"
     78 #include "llvm/Support/raw_ostream.h"
     79 #include "llvm/Transforms/IPO.h"
     80 #include "llvm/Transforms/IPO/ProfiledCallGraph.h"
     81 #include "llvm/Transforms/IPO/SampleContextTracker.h"
     82 #include "llvm/Transforms/IPO/SampleProfileProbe.h"
     83 #include "llvm/Transforms/Instrumentation.h"
     84 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
     85 #include "llvm/Transforms/Utils/Cloning.h"
     86 #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h"
     87 #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h"
     88 #include <algorithm>
     89 #include <cassert>
     90 #include <cstdint>
     91 #include <functional>
     92 #include <limits>
     93 #include <map>
     94 #include <memory>
     95 #include <queue>
     96 #include <string>
     97 #include <system_error>
     98 #include <utility>
     99 #include <vector>
    100 
    101 using namespace llvm;
    102 using namespace sampleprof;
    103 using namespace llvm::sampleprofutil;
    104 using ProfileCount = Function::ProfileCount;
    105 #define DEBUG_TYPE "sample-profile"
    106 #define CSINLINE_DEBUG DEBUG_TYPE "-inline"
    107 
    108 STATISTIC(NumCSInlined,
    109           "Number of functions inlined with context sensitive profile");
    110 STATISTIC(NumCSNotInlined,
    111           "Number of functions not inlined with context sensitive profile");
    112 STATISTIC(NumMismatchedProfile,
    113           "Number of functions with CFG mismatched profile");
    114 STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
    115 STATISTIC(NumDuplicatedInlinesite,
    116           "Number of inlined callsites with a partial distribution factor");
    117 
    118 STATISTIC(NumCSInlinedHitMinLimit,
    119           "Number of functions with FDO inline stopped due to min size limit");
    120 STATISTIC(NumCSInlinedHitMaxLimit,
    121           "Number of functions with FDO inline stopped due to max size limit");
    122 STATISTIC(
    123     NumCSInlinedHitGrowthLimit,
    124     "Number of functions with FDO inline stopped due to growth size limit");
    125 
    126 // Command line option to specify the file to read samples from. This is
    127 // mainly used for debugging.
    128 static cl::opt<std::string> SampleProfileFile(
    129     "sample-profile-file", cl::init(""), cl::value_desc("filename"),
    130     cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
    131 
    132 // The named file contains a set of transformations that may have been applied
    133 // to the symbol names between the program from which the sample data was
    134 // collected and the current program's symbols.
    135 static cl::opt<std::string> SampleProfileRemappingFile(
    136     "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
    137     cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
    138 
    139 static cl::opt<bool> ProfileSampleAccurate(
    140     "profile-sample-accurate", cl::Hidden, cl::init(false),
    141     cl::desc("If the sample profile is accurate, we will mark all un-sampled "
    142              "callsite and function as having 0 samples. Otherwise, treat "
    143              "un-sampled callsites and functions conservatively as unknown. "));
    144 
    145 static cl::opt<bool> ProfileAccurateForSymsInList(
    146     "profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore,
    147     cl::init(true),
    148     cl::desc("For symbols in profile symbol list, regard their profiles to "
    149              "be accurate. It may be overriden by profile-sample-accurate. "));
    150 
    151 static cl::opt<bool> ProfileMergeInlinee(
    152     "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
    153     cl::desc("Merge past inlinee's profile to outline version if sample "
    154              "profile loader decided not to inline a call site. It will "
    155              "only be enabled when top-down order of profile loading is "
    156              "enabled. "));
    157 
    158 static cl::opt<bool> ProfileTopDownLoad(
    159     "sample-profile-top-down-load", cl::Hidden, cl::init(true),
    160     cl::desc("Do profile annotation and inlining for functions in top-down "
    161              "order of call graph during sample profile loading. It only "
    162              "works for new pass manager. "));
    163 
    164 static cl::opt<bool>
    165     UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,
    166                          cl::desc("Process functions in a top-down order "
    167                                   "defined by the profiled call graph when "
    168                                   "-sample-profile-top-down-load is on."));
    169 
    170 static cl::opt<bool> ProfileSizeInline(
    171     "sample-profile-inline-size", cl::Hidden, cl::init(false),
    172     cl::desc("Inline cold call sites in profile loader if it's beneficial "
    173              "for code size."));
    174 
    175 cl::opt<int> ProfileInlineGrowthLimit(
    176     "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
    177     cl::desc("The size growth ratio limit for proirity-based sample profile "
    178              "loader inlining."));
    179 
    180 cl::opt<int> ProfileInlineLimitMin(
    181     "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
    182     cl::desc("The lower bound of size growth limit for "
    183              "proirity-based sample profile loader inlining."));
    184 
    185 cl::opt<int> ProfileInlineLimitMax(
    186     "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
    187     cl::desc("The upper bound of size growth limit for "
    188              "proirity-based sample profile loader inlining."));
    189 
    190 cl::opt<int> SampleHotCallSiteThreshold(
    191     "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
    192     cl::desc("Hot callsite threshold for proirity-based sample profile loader "
    193              "inlining."));
    194 
    195 cl::opt<int> SampleColdCallSiteThreshold(
    196     "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
    197     cl::desc("Threshold for inlining cold callsites"));
    198 
    199 static cl::opt<int> ProfileICPThreshold(
    200     "sample-profile-icp-threshold", cl::Hidden, cl::init(5),
    201     cl::desc(
    202         "Relative hotness threshold for indirect "
    203         "call promotion in proirity-based sample profile loader inlining."));
    204 
    205 static cl::opt<bool> CallsitePrioritizedInline(
    206     "sample-profile-prioritized-inline", cl::Hidden, cl::ZeroOrMore,
    207     cl::init(false),
    208     cl::desc("Use call site prioritized inlining for sample profile loader."
    209              "Currently only CSSPGO is supported."));
    210 
    211 static cl::opt<std::string> ProfileInlineReplayFile(
    212     "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
    213     cl::desc(
    214         "Optimization remarks file containing inline remarks to be replayed "
    215         "by inlining from sample profile loader."),
    216     cl::Hidden);
    217 
    218 static cl::opt<unsigned>
    219     MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden,
    220                      cl::ZeroOrMore,
    221                      cl::desc("Max number of promotions for a single indirect "
    222                               "call callsite in sample profile loader"));
    223 
    224 static cl::opt<bool> OverwriteExistingWeights(
    225     "overwrite-existing-weights", cl::Hidden, cl::init(false),
    226     cl::desc("Ignore existing branch weights on IR and always overwrite."));
    227 
    228 namespace {
    229 
    230 using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
    231 using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
    232 using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
    233 using EdgeWeightMap = DenseMap<Edge, uint64_t>;
    234 using BlockEdgeMap =
    235     DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;
    236 
    237 class GUIDToFuncNameMapper {
    238 public:
    239   GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
    240                        DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
    241       : CurrentReader(Reader), CurrentModule(M),
    242         CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
    243     if (!CurrentReader.useMD5())
    244       return;
    245 
    246     for (const auto &F : CurrentModule) {
    247       StringRef OrigName = F.getName();
    248       CurrentGUIDToFuncNameMap.insert(
    249           {Function::getGUID(OrigName), OrigName});
    250 
    251       // Local to global var promotion used by optimization like thinlto
    252       // will rename the var and add suffix like ".llvm.xxx" to the
    253       // original local name. In sample profile, the suffixes of function
    254       // names are all stripped. Since it is possible that the mapper is
    255       // built in post-thin-link phase and var promotion has been done,
    256       // we need to add the substring of function name without the suffix
    257       // into the GUIDToFuncNameMap.
    258       StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
    259       if (CanonName != OrigName)
    260         CurrentGUIDToFuncNameMap.insert(
    261             {Function::getGUID(CanonName), CanonName});
    262     }
    263 
    264     // Update GUIDToFuncNameMap for each function including inlinees.
    265     SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
    266   }
    267 
    268   ~GUIDToFuncNameMapper() {
    269     if (!CurrentReader.useMD5())
    270       return;
    271 
    272     CurrentGUIDToFuncNameMap.clear();
    273 
    274     // Reset GUIDToFuncNameMap for of each function as they're no
    275     // longer valid at this point.
    276     SetGUIDToFuncNameMapForAll(nullptr);
    277   }
    278 
    279 private:
    280   void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
    281     std::queue<FunctionSamples *> FSToUpdate;
    282     for (auto &IFS : CurrentReader.getProfiles()) {
    283       FSToUpdate.push(&IFS.second);
    284     }
    285 
    286     while (!FSToUpdate.empty()) {
    287       FunctionSamples *FS = FSToUpdate.front();
    288       FSToUpdate.pop();
    289       FS->GUIDToFuncNameMap = Map;
    290       for (const auto &ICS : FS->getCallsiteSamples()) {
    291         const FunctionSamplesMap &FSMap = ICS.second;
    292         for (auto &IFS : FSMap) {
    293           FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
    294           FSToUpdate.push(&FS);
    295         }
    296       }
    297     }
    298   }
    299 
    300   SampleProfileReader &CurrentReader;
    301   Module &CurrentModule;
    302   DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
    303 };
    304 
    305 // Inline candidate used by iterative callsite prioritized inliner
    306 struct InlineCandidate {
    307   CallBase *CallInstr;
    308   const FunctionSamples *CalleeSamples;
    309   // Prorated callsite count, which will be used to guide inlining. For example,
    310   // if a callsite is duplicated in LTO prelink, then in LTO postlink the two
    311   // copies will get their own distribution factors and their prorated counts
    312   // will be used to decide if they should be inlined independently.
    313   uint64_t CallsiteCount;
    314   // Call site distribution factor to prorate the profile samples for a
    315   // duplicated callsite. Default value is 1.0.
    316   float CallsiteDistribution;
    317 };
    318 
    319 // Inline candidate comparer using call site weight
    320 struct CandidateComparer {
    321   bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
    322     if (LHS.CallsiteCount != RHS.CallsiteCount)
    323       return LHS.CallsiteCount < RHS.CallsiteCount;
    324 
    325     const FunctionSamples *LCS = LHS.CalleeSamples;
    326     const FunctionSamples *RCS = RHS.CalleeSamples;
    327     assert(LCS && RCS && "Expect non-null FunctionSamples");
    328 
    329     // Tie breaker using number of samples try to favor smaller functions first
    330     if (LCS->getBodySamples().size() != RCS->getBodySamples().size())
    331       return LCS->getBodySamples().size() > RCS->getBodySamples().size();
    332 
    333     // Tie breaker using GUID so we have stable/deterministic inlining order
    334     return LCS->getGUID(LCS->getName()) < RCS->getGUID(RCS->getName());
    335   }
    336 };
    337 
    338 using CandidateQueue =
    339     PriorityQueue<InlineCandidate, std::vector<InlineCandidate>,
    340                   CandidateComparer>;
    341 
    342 /// Sample profile pass.
    343 ///
    344 /// This pass reads profile data from the file specified by
    345 /// -sample-profile-file and annotates every affected function with the
    346 /// profile information found in that file.
    347 class SampleProfileLoader final
    348     : public SampleProfileLoaderBaseImpl<BasicBlock> {
    349 public:
    350   SampleProfileLoader(
    351       StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,
    352       std::function<AssumptionCache &(Function &)> GetAssumptionCache,
    353       std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
    354       std::function<const TargetLibraryInfo &(Function &)> GetTLI)
    355       : SampleProfileLoaderBaseImpl(std::string(Name)),
    356         GetAC(std::move(GetAssumptionCache)),
    357         GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
    358         RemappingFilename(std::string(RemapName)), LTOPhase(LTOPhase) {}
    359 
    360   bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
    361   bool runOnModule(Module &M, ModuleAnalysisManager *AM,
    362                    ProfileSummaryInfo *_PSI, CallGraph *CG);
    363 
    364 protected:
    365   bool runOnFunction(Function &F, ModuleAnalysisManager *AM);
    366   bool emitAnnotations(Function &F);
    367   ErrorOr<uint64_t> getInstWeight(const Instruction &I) override;
    368   ErrorOr<uint64_t> getProbeWeight(const Instruction &I);
    369   const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
    370   const FunctionSamples *
    371   findFunctionSamples(const Instruction &I) const override;
    372   std::vector<const FunctionSamples *>
    373   findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
    374   void findExternalInlineCandidate(const FunctionSamples *Samples,
    375                                    DenseSet<GlobalValue::GUID> &InlinedGUIDs,
    376                                    const StringMap<Function *> &SymbolMap,
    377                                    uint64_t Threshold);
    378   // Attempt to promote indirect call and also inline the promoted call
    379   bool tryPromoteAndInlineCandidate(
    380       Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
    381       uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
    382   bool inlineHotFunctions(Function &F,
    383                           DenseSet<GlobalValue::GUID> &InlinedGUIDs);
    384   InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
    385   bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
    386   bool
    387   tryInlineCandidate(InlineCandidate &Candidate,
    388                      SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
    389   bool
    390   inlineHotFunctionsWithPriority(Function &F,
    391                                  DenseSet<GlobalValue::GUID> &InlinedGUIDs);
    392   // Inline cold/small functions in addition to hot ones
    393   bool shouldInlineColdCallee(CallBase &CallInst);
    394   void emitOptimizationRemarksForInlineCandidates(
    395       const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
    396       bool Hot);
    397   std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG);
    398   std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(CallGraph &CG);
    399   void generateMDProfMetadata(Function &F);
    400 
    401   /// Map from function name to Function *. Used to find the function from
    402   /// the function name. If the function name contains suffix, additional
    403   /// entry is added to map from the stripped name to the function if there
    404   /// is one-to-one mapping.
    405   StringMap<Function *> SymbolMap;
    406 
    407   std::function<AssumptionCache &(Function &)> GetAC;
    408   std::function<TargetTransformInfo &(Function &)> GetTTI;
    409   std::function<const TargetLibraryInfo &(Function &)> GetTLI;
    410 
    411   /// Profile tracker for different context.
    412   std::unique_ptr<SampleContextTracker> ContextTracker;
    413 
    414   /// Name of the profile remapping file to load.
    415   std::string RemappingFilename;
    416 
    417   /// Flag indicating whether the profile input loaded successfully.
    418   bool ProfileIsValid = false;
    419 
    420   /// Flag indicating whether input profile is context-sensitive
    421   bool ProfileIsCS = false;
    422 
    423   /// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
    424   ///
    425   /// We need to know the LTO phase because for example in ThinLTOPrelink
    426   /// phase, in annotation, we should not promote indirect calls. Instead,
    427   /// we will mark GUIDs that needs to be annotated to the function.
    428   ThinOrFullLTOPhase LTOPhase;
    429 
    430   /// Profle Symbol list tells whether a function name appears in the binary
    431   /// used to generate the current profile.
    432   std::unique_ptr<ProfileSymbolList> PSL;
    433 
    434   /// Total number of samples collected in this profile.
    435   ///
    436   /// This is the sum of all the samples collected in all the functions executed
    437   /// at runtime.
    438   uint64_t TotalCollectedSamples = 0;
    439 
    440   // Information recorded when we declined to inline a call site
    441   // because we have determined it is too cold is accumulated for
    442   // each callee function. Initially this is just the entry count.
    443   struct NotInlinedProfileInfo {
    444     uint64_t entryCount;
    445   };
    446   DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo;
    447 
    448   // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
    449   // all the function symbols defined or declared in current module.
    450   DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
    451 
    452   // All the Names used in FunctionSamples including outline function
    453   // names, inline instance names and call target names.
    454   StringSet<> NamesInProfile;
    455 
    456   // For symbol in profile symbol list, whether to regard their profiles
    457   // to be accurate. It is mainly decided by existance of profile symbol
    458   // list and -profile-accurate-for-symsinlist flag, but it can be
    459   // overriden by -profile-sample-accurate or profile-sample-accurate
    460   // attribute.
    461   bool ProfAccForSymsInList;
    462 
    463   // External inline advisor used to replay inline decision from remarks.
    464   std::unique_ptr<ReplayInlineAdvisor> ExternalInlineAdvisor;
    465 
    466   // A pseudo probe helper to correlate the imported sample counts.
    467   std::unique_ptr<PseudoProbeManager> ProbeManager;
    468 };
    469 
    470 class SampleProfileLoaderLegacyPass : public ModulePass {
    471 public:
    472   // Class identification, replacement for typeinfo
    473   static char ID;
    474 
    475   SampleProfileLoaderLegacyPass(
    476       StringRef Name = SampleProfileFile,
    477       ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None)
    478       : ModulePass(ID), SampleLoader(
    479                             Name, SampleProfileRemappingFile, LTOPhase,
    480                             [&](Function &F) -> AssumptionCache & {
    481                               return ACT->getAssumptionCache(F);
    482                             },
    483                             [&](Function &F) -> TargetTransformInfo & {
    484                               return TTIWP->getTTI(F);
    485                             },
    486                             [&](Function &F) -> TargetLibraryInfo & {
    487                               return TLIWP->getTLI(F);
    488                             }) {
    489     initializeSampleProfileLoaderLegacyPassPass(
    490         *PassRegistry::getPassRegistry());
    491   }
    492 
    493   void dump() { SampleLoader.dump(); }
    494 
    495   bool doInitialization(Module &M) override {
    496     return SampleLoader.doInitialization(M);
    497   }
    498 
    499   StringRef getPassName() const override { return "Sample profile pass"; }
    500   bool runOnModule(Module &M) override;
    501 
    502   void getAnalysisUsage(AnalysisUsage &AU) const override {
    503     AU.addRequired<AssumptionCacheTracker>();
    504     AU.addRequired<TargetTransformInfoWrapperPass>();
    505     AU.addRequired<TargetLibraryInfoWrapperPass>();
    506     AU.addRequired<ProfileSummaryInfoWrapperPass>();
    507   }
    508 
    509 private:
    510   SampleProfileLoader SampleLoader;
    511   AssumptionCacheTracker *ACT = nullptr;
    512   TargetTransformInfoWrapperPass *TTIWP = nullptr;
    513   TargetLibraryInfoWrapperPass *TLIWP = nullptr;
    514 };
    515 
    516 } // end anonymous namespace
    517 
    518 ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
    519   if (FunctionSamples::ProfileIsProbeBased)
    520     return getProbeWeight(Inst);
    521 
    522   const DebugLoc &DLoc = Inst.getDebugLoc();
    523   if (!DLoc)
    524     return std::error_code();
    525 
    526   // Ignore all intrinsics, phinodes and branch instructions.
    527   // Branch and phinodes instruction usually contains debug info from sources
    528   // outside of the residing basic block, thus we ignore them during annotation.
    529   if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
    530     return std::error_code();
    531 
    532   // For non-CS profile, if a direct call/invoke instruction is inlined in
    533   // profile (findCalleeFunctionSamples returns non-empty result), but not
    534   // inlined here, it means that the inlined callsite has no sample, thus the
    535   // call instruction should have 0 count.
    536   // For CS profile, the callsite count of previously inlined callees is
    537   // populated with the entry count of the callees.
    538   if (!ProfileIsCS)
    539     if (const auto *CB = dyn_cast<CallBase>(&Inst))
    540       if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
    541         return 0;
    542 
    543   return getInstWeightImpl(Inst);
    544 }
    545 
    546 // Here use error_code to represent: 1) The dangling probe. 2) Ignore the weight
    547 // of non-probe instruction. So if all instructions of the BB give error_code,
    548 // tell the inference algorithm to infer the BB weight.
    549 ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) {
    550   assert(FunctionSamples::ProfileIsProbeBased &&
    551          "Profile is not pseudo probe based");
    552   Optional<PseudoProbe> Probe = extractProbe(Inst);
    553   // Ignore the non-probe instruction. If none of the instruction in the BB is
    554   // probe, we choose to infer the BB's weight.
    555   if (!Probe)
    556     return std::error_code();
    557 
    558   // This is not the dangling probe from the training pass but generated by the
    559   // current compilation. Ignore this since they are logically deleted and
    560   // should not consume any profile samples.
    561   if (Probe->isDangling())
    562     return std::error_code();
    563 
    564   const FunctionSamples *FS = findFunctionSamples(Inst);
    565   // If none of the instruction has FunctionSample, we choose to return zero
    566   // value sample to indicate the BB is cold. This could happen when the
    567   // instruction is from inlinee and no profile data is found.
    568   // FIXME: This should not be affected by the source drift issue as 1) if the
    569   // newly added function is top-level inliner, it won't match the CFG checksum
    570   // in the function profile or 2) if it's the inlinee, the inlinee should have
    571   // a profile, otherwise it wouldn't be inlined. For non-probe based profile,
    572   // we can improve it by adding a switch for profile-sample-block-accurate for
    573   // block level counts in the future.
    574   if (!FS)
    575     return 0;
    576 
    577   // For non-CS profile, If a direct call/invoke instruction is inlined in
    578   // profile (findCalleeFunctionSamples returns non-empty result), but not
    579   // inlined here, it means that the inlined callsite has no sample, thus the
    580   // call instruction should have 0 count.
    581   // For CS profile, the callsite count of previously inlined callees is
    582   // populated with the entry count of the callees.
    583   if (!ProfileIsCS)
    584     if (const auto *CB = dyn_cast<CallBase>(&Inst))
    585       if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
    586         return 0;
    587 
    588   const ErrorOr<uint64_t> &R = FS->findSamplesAt(Probe->Id, 0);
    589   if (R) {
    590     uint64_t Samples = R.get() * Probe->Factor;
    591     bool FirstMark = CoverageTracker.markSamplesUsed(FS, Probe->Id, 0, Samples);
    592     if (FirstMark) {
    593       ORE->emit([&]() {
    594         OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst);
    595         Remark << "Applied " << ore::NV("NumSamples", Samples);
    596         Remark << " samples from profile (ProbeId=";
    597         Remark << ore::NV("ProbeId", Probe->Id);
    598         Remark << ", Factor=";
    599         Remark << ore::NV("Factor", Probe->Factor);
    600         Remark << ", OriginalSamples=";
    601         Remark << ore::NV("OriginalSamples", R.get());
    602         Remark << ")";
    603         return Remark;
    604       });
    605     }
    606     LLVM_DEBUG(dbgs() << "    " << Probe->Id << ":" << Inst
    607                       << " - weight: " << R.get() << " - factor: "
    608                       << format("%0.2f", Probe->Factor) << ")\n");
    609     return Samples;
    610   }
    611   return R;
    612 }
    613 
    614 /// Get the FunctionSamples for a call instruction.
    615 ///
    616 /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
    617 /// instance in which that call instruction is calling to. It contains
    618 /// all samples that resides in the inlined instance. We first find the
    619 /// inlined instance in which the call instruction is from, then we
    620 /// traverse its children to find the callsite with the matching
    621 /// location.
    622 ///
    623 /// \param Inst Call/Invoke instruction to query.
    624 ///
    625 /// \returns The FunctionSamples pointer to the inlined instance.
    626 const FunctionSamples *
    627 SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
    628   const DILocation *DIL = Inst.getDebugLoc();
    629   if (!DIL) {
    630     return nullptr;
    631   }
    632 
    633   StringRef CalleeName;
    634   if (Function *Callee = Inst.getCalledFunction())
    635     CalleeName = Callee->getName();
    636 
    637   if (ProfileIsCS)
    638     return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
    639 
    640   const FunctionSamples *FS = findFunctionSamples(Inst);
    641   if (FS == nullptr)
    642     return nullptr;
    643 
    644   return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),
    645                                    CalleeName, Reader->getRemapper());
    646 }
    647 
    648 /// Returns a vector of FunctionSamples that are the indirect call targets
    649 /// of \p Inst. The vector is sorted by the total number of samples. Stores
    650 /// the total call count of the indirect call in \p Sum.
    651 std::vector<const FunctionSamples *>
    652 SampleProfileLoader::findIndirectCallFunctionSamples(
    653     const Instruction &Inst, uint64_t &Sum) const {
    654   const DILocation *DIL = Inst.getDebugLoc();
    655   std::vector<const FunctionSamples *> R;
    656 
    657   if (!DIL) {
    658     return R;
    659   }
    660 
    661   auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
    662     assert(L && R && "Expect non-null FunctionSamples");
    663     if (L->getEntrySamples() != R->getEntrySamples())
    664       return L->getEntrySamples() > R->getEntrySamples();
    665     return FunctionSamples::getGUID(L->getName()) <
    666            FunctionSamples::getGUID(R->getName());
    667   };
    668 
    669   if (ProfileIsCS) {
    670     auto CalleeSamples =
    671         ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
    672     if (CalleeSamples.empty())
    673       return R;
    674 
    675     // For CSSPGO, we only use target context profile's entry count
    676     // as that already includes both inlined callee and non-inlined ones..
    677     Sum = 0;
    678     for (const auto *const FS : CalleeSamples) {
    679       Sum += FS->getEntrySamples();
    680       R.push_back(FS);
    681     }
    682     llvm::sort(R, FSCompare);
    683     return R;
    684   }
    685 
    686   const FunctionSamples *FS = findFunctionSamples(Inst);
    687   if (FS == nullptr)
    688     return R;
    689 
    690   auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
    691   auto T = FS->findCallTargetMapAt(CallSite);
    692   Sum = 0;
    693   if (T)
    694     for (const auto &T_C : T.get())
    695       Sum += T_C.second;
    696   if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
    697     if (M->empty())
    698       return R;
    699     for (const auto &NameFS : *M) {
    700       Sum += NameFS.second.getEntrySamples();
    701       R.push_back(&NameFS.second);
    702     }
    703     llvm::sort(R, FSCompare);
    704   }
    705   return R;
    706 }
    707 
    708 const FunctionSamples *
    709 SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
    710   if (FunctionSamples::ProfileIsProbeBased) {
    711     Optional<PseudoProbe> Probe = extractProbe(Inst);
    712     if (!Probe)
    713       return nullptr;
    714   }
    715 
    716   const DILocation *DIL = Inst.getDebugLoc();
    717   if (!DIL)
    718     return Samples;
    719 
    720   auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
    721   if (it.second) {
    722     if (ProfileIsCS)
    723       it.first->second = ContextTracker->getContextSamplesFor(DIL);
    724     else
    725       it.first->second =
    726           Samples->findFunctionSamples(DIL, Reader->getRemapper());
    727   }
    728   return it.first->second;
    729 }
    730 
    731 /// Check whether the indirect call promotion history of \p Inst allows
    732 /// the promotion for \p Candidate.
    733 /// If the profile count for the promotion candidate \p Candidate is
    734 /// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted
    735 /// for \p Inst. If we already have at least MaxNumPromotions
    736 /// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we
    737 /// cannot promote for \p Inst anymore.
    738 static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) {
    739   uint32_t NumVals = 0;
    740   uint64_t TotalCount = 0;
    741   std::unique_ptr<InstrProfValueData[]> ValueData =
    742       std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
    743   bool Valid =
    744       getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
    745                                ValueData.get(), NumVals, TotalCount, true);
    746   // No valid value profile so no promoted targets have been recorded
    747   // before. Ok to do ICP.
    748   if (!Valid)
    749     return true;
    750 
    751   unsigned NumPromoted = 0;
    752   for (uint32_t I = 0; I < NumVals; I++) {
    753     if (ValueData[I].Count != NOMORE_ICP_MAGICNUM)
    754       continue;
    755 
    756     // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the
    757     // metadata, it means the candidate has been promoted for this
    758     // indirect call.
    759     if (ValueData[I].Value == Function::getGUID(Candidate))
    760       return false;
    761     NumPromoted++;
    762     // If already have MaxNumPromotions promotion, don't do it anymore.
    763     if (NumPromoted == MaxNumPromotions)
    764       return false;
    765   }
    766   return true;
    767 }
    768 
    769 /// Update indirect call target profile metadata for \p Inst.
    770 /// Usually \p Sum is the sum of counts of all the targets for \p Inst.
    771 /// If it is 0, it means updateIDTMetaData is used to mark a
    772 /// certain target to be promoted already. If it is not zero,
    773 /// we expect to use it to update the total count in the value profile.
    774 static void
    775 updateIDTMetaData(Instruction &Inst,
    776                   const SmallVectorImpl<InstrProfValueData> &CallTargets,
    777                   uint64_t Sum) {
    778   uint32_t NumVals = 0;
    779   // OldSum is the existing total count in the value profile data.
    780   uint64_t OldSum = 0;
    781   std::unique_ptr<InstrProfValueData[]> ValueData =
    782       std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
    783   bool Valid =
    784       getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
    785                                ValueData.get(), NumVals, OldSum, true);
    786 
    787   DenseMap<uint64_t, uint64_t> ValueCountMap;
    788   if (Sum == 0) {
    789     assert((CallTargets.size() == 1 &&
    790             CallTargets[0].Count == NOMORE_ICP_MAGICNUM) &&
    791            "If sum is 0, assume only one element in CallTargets "
    792            "with count being NOMORE_ICP_MAGICNUM");
    793     // Initialize ValueCountMap with existing value profile data.
    794     if (Valid) {
    795       for (uint32_t I = 0; I < NumVals; I++)
    796         ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
    797     }
    798     auto Pair =
    799         ValueCountMap.try_emplace(CallTargets[0].Value, CallTargets[0].Count);
    800     // If the target already exists in value profile, decrease the total
    801     // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM.
    802     if (!Pair.second) {
    803       OldSum -= Pair.first->second;
    804       Pair.first->second = NOMORE_ICP_MAGICNUM;
    805     }
    806     Sum = OldSum;
    807   } else {
    808     // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM
    809     // counts in the value profile.
    810     if (Valid) {
    811       for (uint32_t I = 0; I < NumVals; I++) {
    812         if (ValueData[I].Count == NOMORE_ICP_MAGICNUM)
    813           ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
    814       }
    815     }
    816 
    817     for (const auto &Data : CallTargets) {
    818       auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);
    819       if (Pair.second)
    820         continue;
    821       // The target represented by Data.Value has already been promoted.
    822       // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease
    823       // Sum by Data.Count.
    824       assert(Sum >= Data.Count && "Sum should never be less than Data.Count");
    825       Sum -= Data.Count;
    826     }
    827   }
    828 
    829   SmallVector<InstrProfValueData, 8> NewCallTargets;
    830   for (const auto &ValueCount : ValueCountMap) {
    831     NewCallTargets.emplace_back(
    832         InstrProfValueData{ValueCount.first, ValueCount.second});
    833   }
    834 
    835   llvm::sort(NewCallTargets,
    836              [](const InstrProfValueData &L, const InstrProfValueData &R) {
    837                if (L.Count != R.Count)
    838                  return L.Count > R.Count;
    839                return L.Value > R.Value;
    840              });
    841 
    842   uint32_t MaxMDCount =
    843       std::min(NewCallTargets.size(), static_cast<size_t>(MaxNumPromotions));
    844   annotateValueSite(*Inst.getParent()->getParent()->getParent(), Inst,
    845                     NewCallTargets, Sum, IPVK_IndirectCallTarget, MaxMDCount);
    846 }
    847 
    848 /// Attempt to promote indirect call and also inline the promoted call.
    849 ///
    850 /// \param F  Caller function.
    851 /// \param Candidate  ICP and inline candidate.
    852 /// \param SumOrigin  Original sum of target counts for indirect call before
    853 ///                   promoting given candidate.
    854 /// \param Sum        Prorated sum of remaining target counts for indirect call
    855 ///                   after promoting given candidate.
    856 /// \param InlinedCallSite  Output vector for new call sites exposed after
    857 /// inlining.
    858 bool SampleProfileLoader::tryPromoteAndInlineCandidate(
    859     Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
    860     SmallVector<CallBase *, 8> *InlinedCallSite) {
    861   auto CalleeFunctionName = Candidate.CalleeSamples->getFuncName();
    862   auto R = SymbolMap.find(CalleeFunctionName);
    863   if (R == SymbolMap.end() || !R->getValue())
    864     return false;
    865 
    866   auto &CI = *Candidate.CallInstr;
    867   if (!doesHistoryAllowICP(CI, R->getValue()->getName()))
    868     return false;
    869 
    870   const char *Reason = "Callee function not available";
    871   // R->getValue() != &F is to prevent promoting a recursive call.
    872   // If it is a recursive call, we do not inline it as it could bloat
    873   // the code exponentially. There is way to better handle this, e.g.
    874   // clone the caller first, and inline the cloned caller if it is
    875   // recursive. As llvm does not inline recursive calls, we will
    876   // simply ignore it instead of handling it explicitly.
    877   if (!R->getValue()->isDeclaration() && R->getValue()->getSubprogram() &&
    878       R->getValue()->hasFnAttribute("use-sample-profile") &&
    879       R->getValue() != &F && isLegalToPromote(CI, R->getValue(), &Reason)) {
    880     // For promoted target, set its value with NOMORE_ICP_MAGICNUM count
    881     // in the value profile metadata so the target won't be promoted again.
    882     SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{
    883         Function::getGUID(R->getValue()->getName()), NOMORE_ICP_MAGICNUM}};
    884     updateIDTMetaData(CI, SortedCallTargets, 0);
    885 
    886     auto *DI = &pgo::promoteIndirectCall(
    887         CI, R->getValue(), Candidate.CallsiteCount, Sum, false, ORE);
    888     if (DI) {
    889       Sum -= Candidate.CallsiteCount;
    890       // Do not prorate the indirect callsite distribution since the original
    891       // distribution will be used to scale down non-promoted profile target
    892       // counts later. By doing this we lose track of the real callsite count
    893       // for the leftover indirect callsite as a trade off for accurate call
    894       // target counts.
    895       // TODO: Ideally we would have two separate factors, one for call site
    896       // counts and one is used to prorate call target counts.
    897       // Do not update the promoted direct callsite distribution at this
    898       // point since the original distribution combined with the callee profile
    899       // will be used to prorate callsites from the callee if inlined. Once not
    900       // inlined, the direct callsite distribution should be prorated so that
    901       // the it will reflect the real callsite counts.
    902       Candidate.CallInstr = DI;
    903       if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
    904         bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
    905         if (!Inlined) {
    906           // Prorate the direct callsite distribution so that it reflects real
    907           // callsite counts.
    908           setProbeDistributionFactor(
    909               *DI, static_cast<float>(Candidate.CallsiteCount) / SumOrigin);
    910         }
    911         return Inlined;
    912       }
    913     }
    914   } else {
    915     LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
    916                       << Candidate.CalleeSamples->getFuncName() << " because "
    917                       << Reason << "\n");
    918   }
    919   return false;
    920 }
    921 
    922 bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
    923   if (!ProfileSizeInline)
    924     return false;
    925 
    926   Function *Callee = CallInst.getCalledFunction();
    927   if (Callee == nullptr)
    928     return false;
    929 
    930   InlineCost Cost = getInlineCost(CallInst, getInlineParams(), GetTTI(*Callee),
    931                                   GetAC, GetTLI);
    932 
    933   if (Cost.isNever())
    934     return false;
    935 
    936   if (Cost.isAlways())
    937     return true;
    938 
    939   return Cost.getCost() <= SampleColdCallSiteThreshold;
    940 }
    941 
    942 void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
    943     const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
    944     bool Hot) {
    945   for (auto I : Candidates) {
    946     Function *CalledFunction = I->getCalledFunction();
    947     if (CalledFunction) {
    948       ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineAttempt",
    949                                            I->getDebugLoc(), I->getParent())
    950                 << "previous inlining reattempted for "
    951                 << (Hot ? "hotness: '" : "size: '")
    952                 << ore::NV("Callee", CalledFunction) << "' into '"
    953                 << ore::NV("Caller", &F) << "'");
    954     }
    955   }
    956 }
    957 
    958 void SampleProfileLoader::findExternalInlineCandidate(
    959     const FunctionSamples *Samples, DenseSet<GlobalValue::GUID> &InlinedGUIDs,
    960     const StringMap<Function *> &SymbolMap, uint64_t Threshold) {
    961   assert(Samples && "expect non-null caller profile");
    962 
    963   // For AutoFDO profile, retrieve candidate profiles by walking over
    964   // the nested inlinee profiles.
    965   if (!ProfileIsCS) {
    966     Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
    967     return;
    968   }
    969 
    970   ContextTrieNode *Caller =
    971       ContextTracker->getContextFor(Samples->getContext());
    972   std::queue<ContextTrieNode *> CalleeList;
    973   CalleeList.push(Caller);
    974   while (!CalleeList.empty()) {
    975     ContextTrieNode *Node = CalleeList.front();
    976     CalleeList.pop();
    977     FunctionSamples *CalleeSample = Node->getFunctionSamples();
    978     // For CSSPGO profile, retrieve candidate profile by walking over the
    979     // trie built for context profile. Note that also take call targets
    980     // even if callee doesn't have a corresponding context profile.
    981     if (!CalleeSample || CalleeSample->getEntrySamples() < Threshold)
    982       continue;
    983 
    984     StringRef Name = CalleeSample->getFuncName();
    985     Function *Func = SymbolMap.lookup(Name);
    986     // Add to the import list only when it's defined out of module.
    987     if (!Func || Func->isDeclaration())
    988       InlinedGUIDs.insert(FunctionSamples::getGUID(Name));
    989 
    990     // Import hot CallTargets, which may not be available in IR because full
    991     // profile annotation cannot be done until backend compilation in ThinLTO.
    992     for (const auto &BS : CalleeSample->getBodySamples())
    993       for (const auto &TS : BS.second.getCallTargets())
    994         if (TS.getValue() > Threshold) {
    995           StringRef CalleeName = CalleeSample->getFuncName(TS.getKey());
    996           const Function *Callee = SymbolMap.lookup(CalleeName);
    997           if (!Callee || Callee->isDeclaration())
    998             InlinedGUIDs.insert(FunctionSamples::getGUID(CalleeName));
    999         }
   1000 
   1001     // Import hot child context profile associted with callees. Note that this
   1002     // may have some overlap with the call target loop above, but doing this
   1003     // based child context profile again effectively allow us to use the max of
   1004     // entry count and call target count to determine importing.
   1005     for (auto &Child : Node->getAllChildContext()) {
   1006       ContextTrieNode *CalleeNode = &Child.second;
   1007       CalleeList.push(CalleeNode);
   1008     }
   1009   }
   1010 }
   1011 
   1012 /// Iteratively inline hot callsites of a function.
   1013 ///
   1014 /// Iteratively traverse all callsites of the function \p F, and find if
   1015 /// the corresponding inlined instance exists and is hot in profile. If
   1016 /// it is hot enough, inline the callsites and adds new callsites of the
   1017 /// callee into the caller. If the call is an indirect call, first promote
   1018 /// it to direct call. Each indirect call is limited with a single target.
   1019 ///
   1020 /// \param F function to perform iterative inlining.
   1021 /// \param InlinedGUIDs a set to be updated to include all GUIDs that are
   1022 ///     inlined in the profiled binary.
   1023 ///
   1024 /// \returns True if there is any inline happened.
   1025 bool SampleProfileLoader::inlineHotFunctions(
   1026     Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
   1027   // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
   1028   // Profile symbol list is ignored when profile-sample-accurate is on.
   1029   assert((!ProfAccForSymsInList ||
   1030           (!ProfileSampleAccurate &&
   1031            !F.hasFnAttribute("profile-sample-accurate"))) &&
   1032          "ProfAccForSymsInList should be false when profile-sample-accurate "
   1033          "is enabled");
   1034 
   1035   DenseMap<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
   1036   bool Changed = false;
   1037   bool LocalChanged = true;
   1038   while (LocalChanged) {
   1039     LocalChanged = false;
   1040     SmallVector<CallBase *, 10> CIS;
   1041     for (auto &BB : F) {
   1042       bool Hot = false;
   1043       SmallVector<CallBase *, 10> AllCandidates;
   1044       SmallVector<CallBase *, 10> ColdCandidates;
   1045       for (auto &I : BB.getInstList()) {
   1046         const FunctionSamples *FS = nullptr;
   1047         if (auto *CB = dyn_cast<CallBase>(&I)) {
   1048           if (!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(*CB))) {
   1049             assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
   1050                    "GUIDToFuncNameMap has to be populated");
   1051             AllCandidates.push_back(CB);
   1052             if (FS->getEntrySamples() > 0 || ProfileIsCS)
   1053               LocalNotInlinedCallSites.try_emplace(CB, FS);
   1054             if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
   1055               Hot = true;
   1056             else if (shouldInlineColdCallee(*CB))
   1057               ColdCandidates.push_back(CB);
   1058           }
   1059         }
   1060       }
   1061       if (Hot || ExternalInlineAdvisor) {
   1062         CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
   1063         emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
   1064       } else {
   1065         CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
   1066         emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
   1067       }
   1068     }
   1069     for (CallBase *I : CIS) {
   1070       Function *CalledFunction = I->getCalledFunction();
   1071       InlineCandidate Candidate = {
   1072           I,
   1073           LocalNotInlinedCallSites.count(I) ? LocalNotInlinedCallSites[I]
   1074                                             : nullptr,
   1075           0 /* dummy count */, 1.0 /* dummy distribution factor */};
   1076       // Do not inline recursive calls.
   1077       if (CalledFunction == &F)
   1078         continue;
   1079       if (I->isIndirectCall()) {
   1080         uint64_t Sum;
   1081         for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
   1082           uint64_t SumOrigin = Sum;
   1083           if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
   1084             findExternalInlineCandidate(FS, InlinedGUIDs, SymbolMap,
   1085                                         PSI->getOrCompHotCountThreshold());
   1086             continue;
   1087           }
   1088           if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))
   1089             continue;
   1090 
   1091           Candidate = {I, FS, FS->getEntrySamples(), 1.0};
   1092           if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {
   1093             LocalNotInlinedCallSites.erase(I);
   1094             LocalChanged = true;
   1095           }
   1096         }
   1097       } else if (CalledFunction && CalledFunction->getSubprogram() &&
   1098                  !CalledFunction->isDeclaration()) {
   1099         if (tryInlineCandidate(Candidate)) {
   1100           LocalNotInlinedCallSites.erase(I);
   1101           LocalChanged = true;
   1102         }
   1103       } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
   1104         findExternalInlineCandidate(findCalleeFunctionSamples(*I), InlinedGUIDs,
   1105                                     SymbolMap,
   1106                                     PSI->getOrCompHotCountThreshold());
   1107       }
   1108     }
   1109     Changed |= LocalChanged;
   1110   }
   1111 
   1112   // For CS profile, profile for not inlined context will be merged when
   1113   // base profile is being trieved
   1114   if (ProfileIsCS)
   1115     return Changed;
   1116 
   1117   // Accumulate not inlined callsite information into notInlinedSamples
   1118   for (const auto &Pair : LocalNotInlinedCallSites) {
   1119     CallBase *I = Pair.getFirst();
   1120     Function *Callee = I->getCalledFunction();
   1121     if (!Callee || Callee->isDeclaration())
   1122       continue;
   1123 
   1124     ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline",
   1125                                          I->getDebugLoc(), I->getParent())
   1126               << "previous inlining not repeated: '"
   1127               << ore::NV("Callee", Callee) << "' into '"
   1128               << ore::NV("Caller", &F) << "'");
   1129 
   1130     ++NumCSNotInlined;
   1131     const FunctionSamples *FS = Pair.getSecond();
   1132     if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) {
   1133       continue;
   1134     }
   1135 
   1136     if (ProfileMergeInlinee) {
   1137       // A function call can be replicated by optimizations like callsite
   1138       // splitting or jump threading and the replicates end up sharing the
   1139       // sample nested callee profile instead of slicing the original inlinee's
   1140       // profile. We want to do merge exactly once by filtering out callee
   1141       // profiles with a non-zero head sample count.
   1142       if (FS->getHeadSamples() == 0) {
   1143         // Use entry samples as head samples during the merge, as inlinees
   1144         // don't have head samples.
   1145         const_cast<FunctionSamples *>(FS)->addHeadSamples(
   1146             FS->getEntrySamples());
   1147 
   1148         // Note that we have to do the merge right after processing function.
   1149         // This allows OutlineFS's profile to be used for annotation during
   1150         // top-down processing of functions' annotation.
   1151         FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee);
   1152         OutlineFS->merge(*FS);
   1153       }
   1154     } else {
   1155       auto pair =
   1156           notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
   1157       pair.first->second.entryCount += FS->getEntrySamples();
   1158     }
   1159   }
   1160   return Changed;
   1161 }
   1162 
   1163 bool SampleProfileLoader::tryInlineCandidate(
   1164     InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
   1165 
   1166   CallBase &CB = *Candidate.CallInstr;
   1167   Function *CalledFunction = CB.getCalledFunction();
   1168   assert(CalledFunction && "Expect a callee with definition");
   1169   DebugLoc DLoc = CB.getDebugLoc();
   1170   BasicBlock *BB = CB.getParent();
   1171 
   1172   InlineCost Cost = shouldInlineCandidate(Candidate);
   1173   if (Cost.isNever()) {
   1174     ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB)
   1175               << "incompatible inlining");
   1176     return false;
   1177   }
   1178 
   1179   if (!Cost)
   1180     return false;
   1181 
   1182   InlineFunctionInfo IFI(nullptr, GetAC);
   1183   IFI.UpdateProfile = false;
   1184   if (InlineFunction(CB, IFI).isSuccess()) {
   1185     // The call to InlineFunction erases I, so we can't pass it here.
   1186     emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost,
   1187                     true, CSINLINE_DEBUG);
   1188 
   1189     // Now populate the list of newly exposed call sites.
   1190     if (InlinedCallSites) {
   1191       InlinedCallSites->clear();
   1192       for (auto &I : IFI.InlinedCallSites)
   1193         InlinedCallSites->push_back(I);
   1194     }
   1195 
   1196     if (ProfileIsCS)
   1197       ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
   1198     ++NumCSInlined;
   1199 
   1200     // Prorate inlined probes for a duplicated inlining callsite which probably
   1201     // has a distribution less than 100%. Samples for an inlinee should be
   1202     // distributed among the copies of the original callsite based on each
   1203     // callsite's distribution factor for counts accuracy. Note that an inlined
   1204     // probe may come with its own distribution factor if it has been duplicated
   1205     // in the inlinee body. The two factor are multiplied to reflect the
   1206     // aggregation of duplication.
   1207     if (Candidate.CallsiteDistribution < 1) {
   1208       for (auto &I : IFI.InlinedCallSites) {
   1209         if (Optional<PseudoProbe> Probe = extractProbe(*I))
   1210           setProbeDistributionFactor(*I, Probe->Factor *
   1211                                              Candidate.CallsiteDistribution);
   1212       }
   1213       NumDuplicatedInlinesite++;
   1214     }
   1215 
   1216     return true;
   1217   }
   1218   return false;
   1219 }
   1220 
   1221 bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
   1222                                              CallBase *CB) {
   1223   assert(CB && "Expect non-null call instruction");
   1224 
   1225   if (isa<IntrinsicInst>(CB))
   1226     return false;
   1227 
   1228   // Find the callee's profile. For indirect call, find hottest target profile.
   1229   const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
   1230   if (!CalleeSamples)
   1231     return false;
   1232 
   1233   float Factor = 1.0;
   1234   if (Optional<PseudoProbe> Probe = extractProbe(*CB))
   1235     Factor = Probe->Factor;
   1236 
   1237   uint64_t CallsiteCount = 0;
   1238   ErrorOr<uint64_t> Weight = getBlockWeight(CB->getParent());
   1239   if (Weight)
   1240     CallsiteCount = Weight.get();
   1241   if (CalleeSamples)
   1242     CallsiteCount = std::max(
   1243         CallsiteCount, uint64_t(CalleeSamples->getEntrySamples() * Factor));
   1244 
   1245   *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
   1246   return true;
   1247 }
   1248 
   1249 InlineCost
   1250 SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
   1251   std::unique_ptr<InlineAdvice> Advice = nullptr;
   1252   if (ExternalInlineAdvisor) {
   1253     Advice = ExternalInlineAdvisor->getAdvice(*Candidate.CallInstr);
   1254     if (!Advice->isInliningRecommended()) {
   1255       Advice->recordUnattemptedInlining();
   1256       return InlineCost::getNever("not previously inlined");
   1257     }
   1258     Advice->recordInlining();
   1259     return InlineCost::getAlways("previously inlined");
   1260   }
   1261 
   1262   // Adjust threshold based on call site hotness, only do this for callsite
   1263   // prioritized inliner because otherwise cost-benefit check is done earlier.
   1264   int SampleThreshold = SampleColdCallSiteThreshold;
   1265   if (CallsitePrioritizedInline) {
   1266     if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
   1267       SampleThreshold = SampleHotCallSiteThreshold;
   1268     else if (!ProfileSizeInline)
   1269       return InlineCost::getNever("cold callsite");
   1270   }
   1271 
   1272   Function *Callee = Candidate.CallInstr->getCalledFunction();
   1273   assert(Callee && "Expect a definition for inline candidate of direct call");
   1274 
   1275   InlineParams Params = getInlineParams();
   1276   Params.ComputeFullInlineCost = true;
   1277   // Checks if there is anything in the reachable portion of the callee at
   1278   // this callsite that makes this inlining potentially illegal. Need to
   1279   // set ComputeFullInlineCost, otherwise getInlineCost may return early
   1280   // when cost exceeds threshold without checking all IRs in the callee.
   1281   // The acutal cost does not matter because we only checks isNever() to
   1282   // see if it is legal to inline the callsite.
   1283   InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
   1284                                   GetTTI(*Callee), GetAC, GetTLI);
   1285 
   1286   // Honor always inline and never inline from call analyzer
   1287   if (Cost.isNever() || Cost.isAlways())
   1288     return Cost;
   1289 
   1290   // For old FDO inliner, we inline the call site as long as cost is not
   1291   // "Never". The cost-benefit check is done earlier.
   1292   if (!CallsitePrioritizedInline) {
   1293     return InlineCost::get(Cost.getCost(), INT_MAX);
   1294   }
   1295 
   1296   // Otherwise only use the cost from call analyzer, but overwite threshold with
   1297   // Sample PGO threshold.
   1298   return InlineCost::get(Cost.getCost(), SampleThreshold);
   1299 }
   1300 
   1301 bool SampleProfileLoader::inlineHotFunctionsWithPriority(
   1302     Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
   1303   assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now");
   1304 
   1305   // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
   1306   // Profile symbol list is ignored when profile-sample-accurate is on.
   1307   assert((!ProfAccForSymsInList ||
   1308           (!ProfileSampleAccurate &&
   1309            !F.hasFnAttribute("profile-sample-accurate"))) &&
   1310          "ProfAccForSymsInList should be false when profile-sample-accurate "
   1311          "is enabled");
   1312 
   1313   // Populating worklist with initial call sites from root inliner, along
   1314   // with call site weights.
   1315   CandidateQueue CQueue;
   1316   InlineCandidate NewCandidate;
   1317   for (auto &BB : F) {
   1318     for (auto &I : BB.getInstList()) {
   1319       auto *CB = dyn_cast<CallBase>(&I);
   1320       if (!CB)
   1321         continue;
   1322       if (getInlineCandidate(&NewCandidate, CB))
   1323         CQueue.push(NewCandidate);
   1324     }
   1325   }
   1326 
   1327   // Cap the size growth from profile guided inlining. This is needed even
   1328   // though cost of each inline candidate already accounts for callee size,
   1329   // because with top-down inlining, we can grow inliner size significantly
   1330   // with large number of smaller inlinees each pass the cost check.
   1331   assert(ProfileInlineLimitMax >= ProfileInlineLimitMin &&
   1332          "Max inline size limit should not be smaller than min inline size "
   1333          "limit.");
   1334   unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
   1335   SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
   1336   SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
   1337   if (ExternalInlineAdvisor)
   1338     SizeLimit = std::numeric_limits<unsigned>::max();
   1339 
   1340   // Perform iterative BFS call site prioritized inlining
   1341   bool Changed = false;
   1342   while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
   1343     InlineCandidate Candidate = CQueue.top();
   1344     CQueue.pop();
   1345     CallBase *I = Candidate.CallInstr;
   1346     Function *CalledFunction = I->getCalledFunction();
   1347 
   1348     if (CalledFunction == &F)
   1349       continue;
   1350     if (I->isIndirectCall()) {
   1351       uint64_t Sum = 0;
   1352       auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
   1353       uint64_t SumOrigin = Sum;
   1354       Sum *= Candidate.CallsiteDistribution;
   1355       for (const auto *FS : CalleeSamples) {
   1356         // TODO: Consider disable pre-lTO ICP for MonoLTO as well
   1357         if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
   1358           findExternalInlineCandidate(FS, InlinedGUIDs, SymbolMap,
   1359                                       PSI->getOrCompHotCountThreshold());
   1360           continue;
   1361         }
   1362         uint64_t EntryCountDistributed =
   1363             FS->getEntrySamples() * Candidate.CallsiteDistribution;
   1364         // In addition to regular inline cost check, we also need to make sure
   1365         // ICP isn't introducing excessive speculative checks even if individual
   1366         // target looks beneficial to promote and inline. That means we should
   1367         // only do ICP when there's a small number dominant targets.
   1368         if (EntryCountDistributed < SumOrigin / ProfileICPThreshold)
   1369           break;
   1370         // TODO: Fix CallAnalyzer to handle all indirect calls.
   1371         // For indirect call, we don't run CallAnalyzer to get InlineCost
   1372         // before actual inlining. This is because we could see two different
   1373         // types from the same definition, which makes CallAnalyzer choke as
   1374         // it's expecting matching parameter type on both caller and callee
   1375         // side. See example from PR18962 for the triggering cases (the bug was
   1376         // fixed, but we generate different types).
   1377         if (!PSI->isHotCount(EntryCountDistributed))
   1378           break;
   1379         SmallVector<CallBase *, 8> InlinedCallSites;
   1380         // Attach function profile for promoted indirect callee, and update
   1381         // call site count for the promoted inline candidate too.
   1382         Candidate = {I, FS, EntryCountDistributed,
   1383                      Candidate.CallsiteDistribution};
   1384         if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
   1385                                          &InlinedCallSites)) {
   1386           for (auto *CB : InlinedCallSites) {
   1387             if (getInlineCandidate(&NewCandidate, CB))
   1388               CQueue.emplace(NewCandidate);
   1389           }
   1390           Changed = true;
   1391         }
   1392       }
   1393     } else if (CalledFunction && CalledFunction->getSubprogram() &&
   1394                !CalledFunction->isDeclaration()) {
   1395       SmallVector<CallBase *, 8> InlinedCallSites;
   1396       if (tryInlineCandidate(Candidate, &InlinedCallSites)) {
   1397         for (auto *CB : InlinedCallSites) {
   1398           if (getInlineCandidate(&NewCandidate, CB))
   1399             CQueue.emplace(NewCandidate);
   1400         }
   1401         Changed = true;
   1402       }
   1403     } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
   1404       findExternalInlineCandidate(Candidate.CalleeSamples, InlinedGUIDs,
   1405                                   SymbolMap, PSI->getOrCompHotCountThreshold());
   1406     }
   1407   }
   1408 
   1409   if (!CQueue.empty()) {
   1410     if (SizeLimit == (unsigned)ProfileInlineLimitMax)
   1411       ++NumCSInlinedHitMaxLimit;
   1412     else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
   1413       ++NumCSInlinedHitMinLimit;
   1414     else
   1415       ++NumCSInlinedHitGrowthLimit;
   1416   }
   1417 
   1418   return Changed;
   1419 }
   1420 
   1421 /// Returns the sorted CallTargetMap \p M by count in descending order.
   1422 static SmallVector<InstrProfValueData, 2>
   1423 GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M) {
   1424   SmallVector<InstrProfValueData, 2> R;
   1425   for (const auto &I : SampleRecord::SortCallTargets(M)) {
   1426     R.emplace_back(
   1427         InstrProfValueData{FunctionSamples::getGUID(I.first), I.second});
   1428   }
   1429   return R;
   1430 }
   1431 
   1432 // Generate MD_prof metadata for every branch instruction using the
   1433 // edge weights computed during propagation.
   1434 void SampleProfileLoader::generateMDProfMetadata(Function &F) {
   1435   // Generate MD_prof metadata for every branch instruction using the
   1436   // edge weights computed during propagation.
   1437   LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
   1438   LLVMContext &Ctx = F.getContext();
   1439   MDBuilder MDB(Ctx);
   1440   for (auto &BI : F) {
   1441     BasicBlock *BB = &BI;
   1442 
   1443     if (BlockWeights[BB]) {
   1444       for (auto &I : BB->getInstList()) {
   1445         if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
   1446           continue;
   1447         if (!cast<CallBase>(I).getCalledFunction()) {
   1448           const DebugLoc &DLoc = I.getDebugLoc();
   1449           if (!DLoc)
   1450             continue;
   1451           const DILocation *DIL = DLoc;
   1452           const FunctionSamples *FS = findFunctionSamples(I);
   1453           if (!FS)
   1454             continue;
   1455           auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
   1456           auto T = FS->findCallTargetMapAt(CallSite);
   1457           if (!T || T.get().empty())
   1458             continue;
   1459           if (FunctionSamples::ProfileIsProbeBased) {
   1460             // Prorate the callsite counts based on the pre-ICP distribution
   1461             // factor to reflect what is already done to the callsite before
   1462             // ICP, such as calliste cloning.
   1463             if (Optional<PseudoProbe> Probe = extractProbe(I)) {
   1464               if (Probe->Factor < 1)
   1465                 T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
   1466             }
   1467           }
   1468           SmallVector<InstrProfValueData, 2> SortedCallTargets =
   1469               GetSortedValueDataFromCallTargets(T.get());
   1470           uint64_t Sum = 0;
   1471           for (const auto &C : T.get())
   1472             Sum += C.second;
   1473           // With CSSPGO all indirect call targets are counted torwards the
   1474           // original indirect call site in the profile, including both
   1475           // inlined and non-inlined targets.
   1476           if (!FunctionSamples::ProfileIsCS) {
   1477             if (const FunctionSamplesMap *M =
   1478                     FS->findFunctionSamplesMapAt(CallSite)) {
   1479               for (const auto &NameFS : *M)
   1480                 Sum += NameFS.second.getEntrySamples();
   1481             }
   1482           }
   1483           if (Sum)
   1484             updateIDTMetaData(I, SortedCallTargets, Sum);
   1485           else if (OverwriteExistingWeights)
   1486             I.setMetadata(LLVMContext::MD_prof, nullptr);
   1487         } else if (!isa<IntrinsicInst>(&I)) {
   1488           I.setMetadata(LLVMContext::MD_prof,
   1489                         MDB.createBranchWeights(
   1490                             {static_cast<uint32_t>(BlockWeights[BB])}));
   1491         }
   1492       }
   1493     } else if (OverwriteExistingWeights) {
   1494       // Set profile metadata (possibly annotated by LTO prelink) to zero or
   1495       // clear it for cold code.
   1496       for (auto &I : BB->getInstList()) {
   1497         if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
   1498           if (cast<CallBase>(I).isIndirectCall())
   1499             I.setMetadata(LLVMContext::MD_prof, nullptr);
   1500           else
   1501             I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(0));
   1502         }
   1503       }
   1504     }
   1505 
   1506     Instruction *TI = BB->getTerminator();
   1507     if (TI->getNumSuccessors() == 1)
   1508       continue;
   1509     if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI) &&
   1510         !isa<IndirectBrInst>(TI))
   1511       continue;
   1512 
   1513     DebugLoc BranchLoc = TI->getDebugLoc();
   1514     LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
   1515                       << ((BranchLoc) ? Twine(BranchLoc.getLine())
   1516                                       : Twine("<UNKNOWN LOCATION>"))
   1517                       << ".\n");
   1518     SmallVector<uint32_t, 4> Weights;
   1519     uint32_t MaxWeight = 0;
   1520     Instruction *MaxDestInst;
   1521     for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
   1522       BasicBlock *Succ = TI->getSuccessor(I);
   1523       Edge E = std::make_pair(BB, Succ);
   1524       uint64_t Weight = EdgeWeights[E];
   1525       LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
   1526       // Use uint32_t saturated arithmetic to adjust the incoming weights,
   1527       // if needed. Sample counts in profiles are 64-bit unsigned values,
   1528       // but internally branch weights are expressed as 32-bit values.
   1529       if (Weight > std::numeric_limits<uint32_t>::max()) {
   1530         LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
   1531         Weight = std::numeric_limits<uint32_t>::max();
   1532       }
   1533       // Weight is added by one to avoid propagation errors introduced by
   1534       // 0 weights.
   1535       Weights.push_back(static_cast<uint32_t>(Weight + 1));
   1536       if (Weight != 0) {
   1537         if (Weight > MaxWeight) {
   1538           MaxWeight = Weight;
   1539           MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
   1540         }
   1541       }
   1542     }
   1543 
   1544     uint64_t TempWeight;
   1545     // Only set weights if there is at least one non-zero weight.
   1546     // In any other case, let the analyzer set weights.
   1547     // Do not set weights if the weights are present unless under
   1548     // OverwriteExistingWeights. In ThinLTO, the profile annotation is done
   1549     // twice. If the first annotation already set the weights, the second pass
   1550     // does not need to set it. With OverwriteExistingWeights, Blocks with zero
   1551     // weight should have their existing metadata (possibly annotated by LTO
   1552     // prelink) cleared.
   1553     if (MaxWeight > 0 &&
   1554         (!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {
   1555       LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
   1556       TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
   1557       ORE->emit([&]() {
   1558         return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
   1559                << "most popular destination for conditional branches at "
   1560                << ore::NV("CondBranchesLoc", BranchLoc);
   1561       });
   1562     } else {
   1563       if (OverwriteExistingWeights) {
   1564         TI->setMetadata(LLVMContext::MD_prof, nullptr);
   1565         LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n");
   1566       } else {
   1567         LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
   1568       }
   1569     }
   1570   }
   1571 }
   1572 
   1573 /// Once all the branch weights are computed, we emit the MD_prof
   1574 /// metadata on BB using the computed values for each of its branches.
   1575 ///
   1576 /// \param F The function to query.
   1577 ///
   1578 /// \returns true if \p F was modified. Returns false, otherwise.
   1579 bool SampleProfileLoader::emitAnnotations(Function &F) {
   1580   bool Changed = false;
   1581 
   1582   if (FunctionSamples::ProfileIsProbeBased) {
   1583     if (!ProbeManager->profileIsValid(F, *Samples)) {
   1584       LLVM_DEBUG(
   1585           dbgs() << "Profile is invalid due to CFG mismatch for Function "
   1586                  << F.getName());
   1587       ++NumMismatchedProfile;
   1588       return false;
   1589     }
   1590     ++NumMatchedProfile;
   1591   } else {
   1592     if (getFunctionLoc(F) == 0)
   1593       return false;
   1594 
   1595     LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
   1596                       << F.getName() << ": " << getFunctionLoc(F) << "\n");
   1597   }
   1598 
   1599   DenseSet<GlobalValue::GUID> InlinedGUIDs;
   1600   if (ProfileIsCS && CallsitePrioritizedInline)
   1601     Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
   1602   else
   1603     Changed |= inlineHotFunctions(F, InlinedGUIDs);
   1604 
   1605   Changed |= computeAndPropagateWeights(F, InlinedGUIDs);
   1606 
   1607   if (Changed)
   1608     generateMDProfMetadata(F);
   1609 
   1610   emitCoverageRemarks(F);
   1611   return Changed;
   1612 }
   1613 
   1614 char SampleProfileLoaderLegacyPass::ID = 0;
   1615 
   1616 INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
   1617                       "Sample Profile loader", false, false)
   1618 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
   1619 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
   1620 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
   1621 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
   1622 INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
   1623                     "Sample Profile loader", false, false)
   1624 
   1625 std::unique_ptr<ProfiledCallGraph>
   1626 SampleProfileLoader::buildProfiledCallGraph(CallGraph &CG) {
   1627   std::unique_ptr<ProfiledCallGraph> ProfiledCG;
   1628   if (ProfileIsCS)
   1629     ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker);
   1630   else
   1631     ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles());
   1632 
   1633   // Add all functions into the profiled call graph even if they are not in
   1634   // the profile. This makes sure functions missing from the profile still
   1635   // gets a chance to be processed.
   1636   for (auto &Node : CG) {
   1637     const auto *F = Node.first;
   1638     if (!F || F->isDeclaration() || !F->hasFnAttribute("use-sample-profile"))
   1639       continue;
   1640     ProfiledCG->addProfiledFunction(FunctionSamples::getCanonicalFnName(*F));
   1641   }
   1642 
   1643   return ProfiledCG;
   1644 }
   1645 
   1646 std::vector<Function *>
   1647 SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
   1648   std::vector<Function *> FunctionOrderList;
   1649   FunctionOrderList.reserve(M.size());
   1650 
   1651   if (!ProfileTopDownLoad && UseProfiledCallGraph)
   1652     errs() << "WARNING: -use-profiled-call-graph ignored, should be used "
   1653               "together with -sample-profile-top-down-load.\n";
   1654 
   1655   if (!ProfileTopDownLoad || CG == nullptr) {
   1656     if (ProfileMergeInlinee) {
   1657       // Disable ProfileMergeInlinee if profile is not loaded in top down order,
   1658       // because the profile for a function may be used for the profile
   1659       // annotation of its outline copy before the profile merging of its
   1660       // non-inlined inline instances, and that is not the way how
   1661       // ProfileMergeInlinee is supposed to work.
   1662       ProfileMergeInlinee = false;
   1663     }
   1664 
   1665     for (Function &F : M)
   1666       if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
   1667         FunctionOrderList.push_back(&F);
   1668     return FunctionOrderList;
   1669   }
   1670 
   1671   assert(&CG->getModule() == &M);
   1672 
   1673   if (UseProfiledCallGraph ||
   1674       (ProfileIsCS && !UseProfiledCallGraph.getNumOccurrences())) {
   1675     // Use profiled call edges to augment the top-down order. There are cases
   1676     // that the top-down order computed based on the static call graph doesn't
   1677     // reflect real execution order. For example
   1678     //
   1679     // 1. Incomplete static call graph due to unknown indirect call targets.
   1680     //    Adjusting the order by considering indirect call edges from the
   1681     //    profile can enable the inlining of indirect call targets by allowing
   1682     //    the caller processed before them.
   1683     // 2. Mutual call edges in an SCC. The static processing order computed for
   1684     //    an SCC may not reflect the call contexts in the context-sensitive
   1685     //    profile, thus may cause potential inlining to be overlooked. The
   1686     //    function order in one SCC is being adjusted to a top-down order based
   1687     //    on the profile to favor more inlining. This is only a problem with CS
   1688     //    profile.
   1689     // 3. Transitive indirect call edges due to inlining. When a callee function
   1690     //    (say B) is inlined into into a caller function (say A) in LTO prelink,
   1691     //    every call edge originated from the callee B will be transferred to
   1692     //    the caller A. If any transferred edge (say A->C) is indirect, the
   1693     //    original profiled indirect edge B->C, even if considered, would not
   1694     //    enforce a top-down order from the caller A to the potential indirect
   1695     //    call target C in LTO postlink since the inlined callee B is gone from
   1696     //    the static call graph.
   1697     // 4. #3 can happen even for direct call targets, due to functions defined
   1698     //    in header files. A header function (say A), when included into source
   1699     //    files, is defined multiple times but only one definition survives due
   1700     //    to ODR. Therefore, the LTO prelink inlining done on those dropped
   1701     //    definitions can be useless based on a local file scope. More
   1702     //    importantly, the inlinee (say B), once fully inlined to a
   1703     //    to-be-dropped A, will have no profile to consume when its outlined
   1704     //    version is compiled. This can lead to a profile-less prelink
   1705     //    compilation for the outlined version of B which may be called from
   1706     //    external modules. while this isn't easy to fix, we rely on the
   1707     //    postlink AutoFDO pipeline to optimize B. Since the survived copy of
   1708     //    the A can be inlined in its local scope in prelink, it may not exist
   1709     //    in the merged IR in postlink, and we'll need the profiled call edges
   1710     //    to enforce a top-down order for the rest of the functions.
   1711     //
   1712     // Considering those cases, a profiled call graph completely independent of
   1713     // the static call graph is constructed based on profile data, where
   1714     // function objects are not even needed to handle case #3 and case 4.
   1715     //
   1716     // Note that static callgraph edges are completely ignored since they
   1717     // can be conflicting with profiled edges for cyclic SCCs and may result in
   1718     // an SCC order incompatible with profile-defined one. Using strictly
   1719     // profile order ensures a maximum inlining experience. On the other hand,
   1720     // static call edges are not so important when they don't correspond to a
   1721     // context in the profile.
   1722 
   1723     std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(*CG);
   1724     scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());
   1725     while (!CGI.isAtEnd()) {
   1726       for (ProfiledCallGraphNode *Node : *CGI) {
   1727         Function *F = SymbolMap.lookup(Node->Name);
   1728         if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
   1729           FunctionOrderList.push_back(F);
   1730       }
   1731       ++CGI;
   1732     }
   1733   } else {
   1734     scc_iterator<CallGraph *> CGI = scc_begin(CG);
   1735     while (!CGI.isAtEnd()) {
   1736       for (CallGraphNode *Node : *CGI) {
   1737         auto *F = Node->getFunction();
   1738         if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
   1739           FunctionOrderList.push_back(F);
   1740       }
   1741       ++CGI;
   1742     }
   1743   }
   1744 
   1745   LLVM_DEBUG({
   1746     dbgs() << "Function processing order:\n";
   1747     for (auto F : reverse(FunctionOrderList)) {
   1748       dbgs() << F->getName() << "\n";
   1749     }
   1750   });
   1751 
   1752   std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
   1753   return FunctionOrderList;
   1754 }
   1755 
   1756 bool SampleProfileLoader::doInitialization(Module &M,
   1757                                            FunctionAnalysisManager *FAM) {
   1758   auto &Ctx = M.getContext();
   1759 
   1760   auto ReaderOrErr =
   1761       SampleProfileReader::create(Filename, Ctx, RemappingFilename);
   1762   if (std::error_code EC = ReaderOrErr.getError()) {
   1763     std::string Msg = "Could not open profile: " + EC.message();
   1764     Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
   1765     return false;
   1766   }
   1767   Reader = std::move(ReaderOrErr.get());
   1768   Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);
   1769   // set module before reading the profile so reader may be able to only
   1770   // read the function profiles which are used by the current module.
   1771   Reader->setModule(&M);
   1772   if (std::error_code EC = Reader->read()) {
   1773     std::string Msg = "profile reading failed: " + EC.message();
   1774     Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
   1775     return false;
   1776   }
   1777 
   1778   PSL = Reader->getProfileSymbolList();
   1779 
   1780   // While profile-sample-accurate is on, ignore symbol list.
   1781   ProfAccForSymsInList =
   1782       ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate;
   1783   if (ProfAccForSymsInList) {
   1784     NamesInProfile.clear();
   1785     if (auto NameTable = Reader->getNameTable())
   1786       NamesInProfile.insert(NameTable->begin(), NameTable->end());
   1787     CoverageTracker.setProfAccForSymsInList(true);
   1788   }
   1789 
   1790   if (FAM && !ProfileInlineReplayFile.empty()) {
   1791     ExternalInlineAdvisor = std::make_unique<ReplayInlineAdvisor>(
   1792         M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr, ProfileInlineReplayFile,
   1793         /*EmitRemarks=*/false);
   1794     if (!ExternalInlineAdvisor->areReplayRemarksLoaded())
   1795       ExternalInlineAdvisor.reset();
   1796   }
   1797 
   1798   // Apply tweaks if context-sensitive profile is available.
   1799   if (Reader->profileIsCS()) {
   1800     ProfileIsCS = true;
   1801     FunctionSamples::ProfileIsCS = true;
   1802 
   1803     // Enable priority-base inliner and size inline by default for CSSPGO.
   1804     if (!ProfileSizeInline.getNumOccurrences())
   1805       ProfileSizeInline = true;
   1806     if (!CallsitePrioritizedInline.getNumOccurrences())
   1807       CallsitePrioritizedInline = true;
   1808 
   1809     // Tracker for profiles under different context
   1810     ContextTracker =
   1811         std::make_unique<SampleContextTracker>(Reader->getProfiles());
   1812   }
   1813 
   1814   // Load pseudo probe descriptors for probe-based function samples.
   1815   if (Reader->profileIsProbeBased()) {
   1816     ProbeManager = std::make_unique<PseudoProbeManager>(M);
   1817     if (!ProbeManager->moduleIsProbed(M)) {
   1818       const char *Msg =
   1819           "Pseudo-probe-based profile requires SampleProfileProbePass";
   1820       Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
   1821       return false;
   1822     }
   1823   }
   1824 
   1825   return true;
   1826 }
   1827 
   1828 ModulePass *llvm::createSampleProfileLoaderPass() {
   1829   return new SampleProfileLoaderLegacyPass();
   1830 }
   1831 
   1832 ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
   1833   return new SampleProfileLoaderLegacyPass(Name);
   1834 }
   1835 
   1836 bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
   1837                                       ProfileSummaryInfo *_PSI, CallGraph *CG) {
   1838   GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
   1839 
   1840   PSI = _PSI;
   1841   if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
   1842     M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
   1843                         ProfileSummary::PSK_Sample);
   1844     PSI->refresh();
   1845   }
   1846   // Compute the total number of samples collected in this profile.
   1847   for (const auto &I : Reader->getProfiles())
   1848     TotalCollectedSamples += I.second.getTotalSamples();
   1849 
   1850   auto Remapper = Reader->getRemapper();
   1851   // Populate the symbol map.
   1852   for (const auto &N_F : M.getValueSymbolTable()) {
   1853     StringRef OrigName = N_F.getKey();
   1854     Function *F = dyn_cast<Function>(N_F.getValue());
   1855     if (F == nullptr || OrigName.empty())
   1856       continue;
   1857     SymbolMap[OrigName] = F;
   1858     StringRef NewName = FunctionSamples::getCanonicalFnName(*F);
   1859     if (OrigName != NewName && !NewName.empty()) {
   1860       auto r = SymbolMap.insert(std::make_pair(NewName, F));
   1861       // Failiing to insert means there is already an entry in SymbolMap,
   1862       // thus there are multiple functions that are mapped to the same
   1863       // stripped name. In this case of name conflicting, set the value
   1864       // to nullptr to avoid confusion.
   1865       if (!r.second)
   1866         r.first->second = nullptr;
   1867       OrigName = NewName;
   1868     }
   1869     // Insert the remapped names into SymbolMap.
   1870     if (Remapper) {
   1871       if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {
   1872         if (*MapName != OrigName && !MapName->empty())
   1873           SymbolMap.insert(std::make_pair(*MapName, F));
   1874       }
   1875     }
   1876   }
   1877   assert(SymbolMap.count(StringRef()) == 0 &&
   1878          "No empty StringRef should be added in SymbolMap");
   1879 
   1880   bool retval = false;
   1881   for (auto F : buildFunctionOrder(M, CG)) {
   1882     assert(!F->isDeclaration());
   1883     clearFunctionData();
   1884     retval |= runOnFunction(*F, AM);
   1885   }
   1886 
   1887   // Account for cold calls not inlined....
   1888   if (!ProfileIsCS)
   1889     for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
   1890          notInlinedCallInfo)
   1891       updateProfileCallee(pair.first, pair.second.entryCount);
   1892 
   1893   return retval;
   1894 }
   1895 
   1896 bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
   1897   ACT = &getAnalysis<AssumptionCacheTracker>();
   1898   TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
   1899   TLIWP = &getAnalysis<TargetLibraryInfoWrapperPass>();
   1900   ProfileSummaryInfo *PSI =
   1901       &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   1902   return SampleLoader.runOnModule(M, nullptr, PSI, nullptr);
   1903 }
   1904 
   1905 bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
   1906   LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
   1907   DILocation2SampleMap.clear();
   1908   // By default the entry count is initialized to -1, which will be treated
   1909   // conservatively by getEntryCount as the same as unknown (None). This is
   1910   // to avoid newly added code to be treated as cold. If we have samples
   1911   // this will be overwritten in emitAnnotations.
   1912   uint64_t initialEntryCount = -1;
   1913 
   1914   ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
   1915   if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
   1916     // initialize all the function entry counts to 0. It means all the
   1917     // functions without profile will be regarded as cold.
   1918     initialEntryCount = 0;
   1919     // profile-sample-accurate is a user assertion which has a higher precedence
   1920     // than symbol list. When profile-sample-accurate is on, ignore symbol list.
   1921     ProfAccForSymsInList = false;
   1922   }
   1923   CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList);
   1924 
   1925   // PSL -- profile symbol list include all the symbols in sampled binary.
   1926   // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
   1927   // old functions without samples being cold, without having to worry
   1928   // about new and hot functions being mistakenly treated as cold.
   1929   if (ProfAccForSymsInList) {
   1930     // Initialize the entry count to 0 for functions in the list.
   1931     if (PSL->contains(F.getName()))
   1932       initialEntryCount = 0;
   1933 
   1934     // Function in the symbol list but without sample will be regarded as
   1935     // cold. To minimize the potential negative performance impact it could
   1936     // have, we want to be a little conservative here saying if a function
   1937     // shows up in the profile, no matter as outline function, inline instance
   1938     // or call targets, treat the function as not being cold. This will handle
   1939     // the cases such as most callsites of a function are inlined in sampled
   1940     // binary but not inlined in current build (because of source code drift,
   1941     // imprecise debug information, or the callsites are all cold individually
   1942     // but not cold accumulatively...), so the outline function showing up as
   1943     // cold in sampled binary will actually not be cold after current build.
   1944     StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
   1945     if (NamesInProfile.count(CanonName))
   1946       initialEntryCount = -1;
   1947   }
   1948 
   1949   // Initialize entry count when the function has no existing entry
   1950   // count value.
   1951   if (!F.getEntryCount().hasValue())
   1952     F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
   1953   std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
   1954   if (AM) {
   1955     auto &FAM =
   1956         AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())
   1957             .getManager();
   1958     ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
   1959   } else {
   1960     OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
   1961     ORE = OwnedORE.get();
   1962   }
   1963 
   1964   if (ProfileIsCS)
   1965     Samples = ContextTracker->getBaseSamplesFor(F);
   1966   else
   1967     Samples = Reader->getSamplesFor(F);
   1968 
   1969   if (Samples && !Samples->empty())
   1970     return emitAnnotations(F);
   1971   return false;
   1972 }
   1973 
   1974 PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
   1975                                                ModuleAnalysisManager &AM) {
   1976   FunctionAnalysisManager &FAM =
   1977       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
   1978 
   1979   auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
   1980     return FAM.getResult<AssumptionAnalysis>(F);
   1981   };
   1982   auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
   1983     return FAM.getResult<TargetIRAnalysis>(F);
   1984   };
   1985   auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
   1986     return FAM.getResult<TargetLibraryAnalysis>(F);
   1987   };
   1988 
   1989   SampleProfileLoader SampleLoader(
   1990       ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
   1991       ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
   1992                                        : ProfileRemappingFileName,
   1993       LTOPhase, GetAssumptionCache, GetTTI, GetTLI);
   1994 
   1995   if (!SampleLoader.doInitialization(M, &FAM))
   1996     return PreservedAnalyses::all();
   1997 
   1998   ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
   1999   CallGraph &CG = AM.getResult<CallGraphAnalysis>(M);
   2000   if (!SampleLoader.runOnModule(M, &AM, PSI, &CG))
   2001     return PreservedAnalyses::all();
   2002 
   2003   return PreservedAnalyses::none();
   2004 }
   2005