Home | History | Annotate | Line # | Download | only in Analysis
      1 //===- IRSimilarityIdentifier.h - Find similarity in a module --------------==//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // \file
     10 // Interface file for the IRSimilarityIdentifier for identifying similarities in
     11 // IR including the IRInstructionMapper, which maps an Instruction to unsigned
     12 // integers.
     13 //
     14 // Two sequences of instructions are called "similar" if they perform the same
     15 // series of operations for all inputs.
     16 //
     17 // \code
     18 // %1 = add i32 %a, 10
     19 // %2 = add i32 %a, %1
     20 // %3 = icmp slt icmp %1, %2
     21 // \endcode
     22 //
     23 // and
     24 //
     25 // \code
     26 // %1 = add i32 11, %a
     27 // %2 = sub i32 %a, %1
     28 // %3 = icmp sgt icmp %2, %1
     29 // \endcode
     30 //
     31 // ultimately have the same result, even if the inputs, and structure are
     32 // slightly different.
     33 //
     34 // For instructions, we do not worry about operands that do not have fixed
     35 // semantic meaning to the program.  We consider the opcode that the instruction
     36 // has, the types, parameters, and extra information such as the function name,
     37 // or comparison predicate.  These are used to create a hash to map instructions
     38 // to integers to be used in similarity matching in sequences of instructions
     39 //
     40 // Terminology:
     41 // An IRSimilarityCandidate is a region of IRInstructionData (wrapped
     42 // Instructions), usually used to denote a region of similarity has been found.
     43 //
     44 // A SimilarityGroup is a set of IRSimilarityCandidates that are structurally
     45 // similar to one another.
     46 //
     47 //===----------------------------------------------------------------------===//
     48 
     49 #ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
     50 #define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
     51 
     52 #include "llvm/IR/InstVisitor.h"
     53 #include "llvm/IR/Instructions.h"
     54 #include "llvm/IR/Module.h"
     55 #include "llvm/IR/PassManager.h"
     56 #include "llvm/Pass.h"
     57 #include "llvm/Support/Allocator.h"
     58 
     59 namespace llvm {
     60 namespace IRSimilarity {
     61 
     62 struct IRInstructionDataList;
     63 
     64 /// This represents what is and is not supported when finding similarity in
     65 /// Instructions.
     66 ///
     67 /// Legal Instructions are considered when looking at similarity between
     68 /// Instructions.
     69 ///
     70 /// Illegal Instructions cannot be considered when looking for similarity
     71 /// between Instructions. They act as boundaries between similarity regions.
     72 ///
     73 /// Invisible Instructions are skipped over during analysis.
     74 // TODO: Shared with MachineOutliner
     75 enum InstrType { Legal, Illegal, Invisible };
     76 
     77 /// This provides the utilities for hashing an Instruction to an unsigned
     78 /// integer. Two IRInstructionDatas produce the same hash value when their
     79 /// underlying Instructions perform the same operation (even if they don't have
     80 /// the same input operands.)
     81 /// As a more concrete example, consider the following:
     82 ///
     83 /// \code
     84 /// %add1 = add i32 %a, %b
     85 /// %add2 = add i32 %c, %d
     86 /// %add3 = add i64 %e, %f
     87 /// \endcode
     88 ///
     89 // Then the IRInstructionData wrappers for these Instructions may be hashed like
     90 /// so:
     91 ///
     92 /// \code
     93 /// ; These two adds have the same types and operand types, so they hash to the
     94 /// ; same number.
     95 /// %add1 = add i32 %a, %b ; Hash: 1
     96 /// %add2 = add i32 %c, %d ; Hash: 1
     97 /// ; This add produces an i64. This differentiates it from %add1 and %add2. So,
     98 /// ; it hashes to a different number.
     99 /// %add3 = add i64 %e, %f; Hash: 2
    100 /// \endcode
    101 ///
    102 ///
    103 /// This hashing scheme will be used to represent the program as a very long
    104 /// string. This string can then be placed in a data structure which can be used
    105 /// for similarity queries.
    106 ///
    107 /// TODO: Handle types of Instructions which can be equal even with different
    108 /// operands. (E.g. comparisons with swapped predicates.)
    109 /// TODO: Handle CallInsts, which are only checked for function type
    110 /// by \ref isSameOperationAs.
    111 /// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
    112 /// exact same, and some do not.
    113 struct IRInstructionData : ilist_node<IRInstructionData> {
    114 
    115   /// The source Instruction that is being wrapped.
    116   Instruction *Inst = nullptr;
    117   /// The values of the operands in the Instruction.
    118   SmallVector<Value *, 4> OperVals;
    119   /// The legality of the wrapped instruction. This is informed by InstrType,
    120   /// and is used when checking when two instructions are considered similar.
    121   /// If either instruction is not legal, the instructions are automatically not
    122   /// considered similar.
    123   bool Legal;
    124 
    125   /// This is only relevant if we are wrapping a CmpInst where we needed to
    126   /// change the predicate of a compare instruction from a greater than form
    127   /// to a less than form.  It is None otherwise.
    128   Optional<CmpInst::Predicate> RevisedPredicate;
    129 
    130   /// Gather the information that is difficult to gather for an Instruction, or
    131   /// is changed. i.e. the operands of an Instruction and the Types of those
    132   /// operands. This extra information allows for similarity matching to make
    133   /// assertions that allow for more flexibility when checking for whether an
    134   /// Instruction performs the same operation.
    135   IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL);
    136 
    137   /// Get the predicate that the compare instruction is using for hashing the
    138   /// instruction. the IRInstructionData must be wrapping a CmpInst.
    139   CmpInst::Predicate getPredicate() const;
    140 
    141   /// A function that swaps the predicates to their less than form if they are
    142   /// in a greater than form. Otherwise, the predicate is unchanged.
    143   ///
    144   /// \param CI - The comparison operation to find a consistent preidcate for.
    145   /// \return the consistent comparison predicate.
    146   static CmpInst::Predicate predicateForConsistency(CmpInst *CI);
    147 
    148   /// Hashes \p Value based on its opcode, types, and operand types.
    149   /// Two IRInstructionData instances produce the same hash when they perform
    150   /// the same operation.
    151   ///
    152   /// As a simple example, consider the following instructions.
    153   ///
    154   /// \code
    155   /// %add1 = add i32 %x1, %y1
    156   /// %add2 = add i32 %x2, %y2
    157   ///
    158   /// %sub = sub i32 %x1, %y1
    159   ///
    160   /// %add_i64 = add i64 %x2, %y2
    161   /// \endcode
    162   ///
    163   /// Because the first two adds operate the same types, and are performing the
    164   /// same action, they will be hashed to the same value.
    165   ///
    166   /// However, the subtraction instruction is not the same as an addition, and
    167   /// will be hashed to a different value.
    168   ///
    169   /// Finally, the last add has a different type compared to the first two add
    170   /// instructions, so it will also be hashed to a different value that any of
    171   /// the previous instructions.
    172   ///
    173   /// \param [in] ID - The IRInstructionData instance to be hashed.
    174   /// \returns A hash_value of the IRInstructionData.
    175   friend hash_code hash_value(const IRInstructionData &ID) {
    176     SmallVector<Type *, 4> OperTypes;
    177     for (Value *V : ID.OperVals)
    178       OperTypes.push_back(V->getType());
    179 
    180     if (isa<CmpInst>(ID.Inst))
    181       return llvm::hash_combine(
    182           llvm::hash_value(ID.Inst->getOpcode()),
    183           llvm::hash_value(ID.Inst->getType()),
    184           llvm::hash_value(ID.getPredicate()),
    185           llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
    186     else if (CallInst *CI = dyn_cast<CallInst>(ID.Inst))
    187       return llvm::hash_combine(
    188           llvm::hash_value(ID.Inst->getOpcode()),
    189           llvm::hash_value(ID.Inst->getType()),
    190           llvm::hash_value(CI->getCalledFunction()->getName().str()),
    191           llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
    192     return llvm::hash_combine(
    193         llvm::hash_value(ID.Inst->getOpcode()),
    194         llvm::hash_value(ID.Inst->getType()),
    195         llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
    196   }
    197 
    198   IRInstructionDataList *IDL = nullptr;
    199 };
    200 
    201 struct IRInstructionDataList : simple_ilist<IRInstructionData> {};
    202 
    203 /// Compare one IRInstructionData class to another IRInstructionData class for
    204 /// whether they are performing a the same operation, and can mapped to the
    205 /// same value. For regular instructions if the hash value is the same, then
    206 /// they will also be close.
    207 ///
    208 /// \param A - The first IRInstructionData class to compare
    209 /// \param B - The second IRInstructionData class to compare
    210 /// \returns true if \p A and \p B are similar enough to be mapped to the same
    211 /// value.
    212 bool isClose(const IRInstructionData &A, const IRInstructionData &B);
    213 
    214 struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> {
    215   static inline IRInstructionData *getEmptyKey() { return nullptr; }
    216   static inline IRInstructionData *getTombstoneKey() {
    217     return reinterpret_cast<IRInstructionData *>(-1);
    218   }
    219 
    220   static unsigned getHashValue(const IRInstructionData *E) {
    221     using llvm::hash_value;
    222     assert(E && "IRInstructionData is a nullptr?");
    223     return hash_value(*E);
    224   }
    225 
    226   static bool isEqual(const IRInstructionData *LHS,
    227                       const IRInstructionData *RHS) {
    228     if (RHS == getEmptyKey() || RHS == getTombstoneKey() ||
    229         LHS == getEmptyKey() || LHS == getTombstoneKey())
    230       return LHS == RHS;
    231 
    232     assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?");
    233     return isClose(*LHS, *RHS);
    234   }
    235 };
    236 
    237 /// Helper struct for converting the Instructions in a Module into a vector of
    238 /// unsigned integers. This vector of unsigned integers can be thought of as a
    239 /// "numeric string". This numeric string can then be queried by, for example,
    240 /// data structures that find repeated substrings.
    241 ///
    242 /// This hashing is done per BasicBlock in the module. To hash Instructions
    243 /// based off of their operations, each Instruction is wrapped in an
    244 /// IRInstructionData struct. The unsigned integer for an IRInstructionData
    245 /// depends on:
    246 /// - The hash provided by the IRInstructionData.
    247 /// - Which member of InstrType the IRInstructionData is classified as.
    248 // See InstrType for more details on the possible classifications, and how they
    249 // manifest in the numeric string.
    250 ///
    251 /// The numeric string for an individual BasicBlock is terminated by an unique
    252 /// unsigned integer. This prevents data structures which rely on repetition
    253 /// from matching across BasicBlocks. (For example, the SuffixTree.)
    254 /// As a concrete example, if we have the following two BasicBlocks:
    255 /// \code
    256 /// bb0:
    257 /// %add1 = add i32 %a, %b
    258 /// %add2 = add i32 %c, %d
    259 /// %add3 = add i64 %e, %f
    260 /// bb1:
    261 /// %sub = sub i32 %c, %d
    262 /// \endcode
    263 /// We may hash the Instructions like this (via IRInstructionData):
    264 /// \code
    265 /// bb0:
    266 /// %add1 = add i32 %a, %b ; Hash: 1
    267 /// %add2 = add i32 %c, %d; Hash: 1
    268 /// %add3 = add i64 %e, %f; Hash: 2
    269 /// bb1:
    270 /// %sub = sub i32 %c, %d; Hash: 3
    271 /// %add4 = add i32 %c, %d ; Hash: 1
    272 /// \endcode
    273 /// And produce a "numeric string representation" like so:
    274 /// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2
    275 ///
    276 /// TODO: This is very similar to the MachineOutliner, and should be
    277 /// consolidated into the same interface.
    278 struct IRInstructionMapper {
    279   /// The starting illegal instruction number to map to.
    280   ///
    281   /// Set to -3 for compatibility with DenseMapInfo<unsigned>.
    282   unsigned IllegalInstrNumber = static_cast<unsigned>(-3);
    283 
    284   /// The next available integer to assign to a legal Instruction to.
    285   unsigned LegalInstrNumber = 0;
    286 
    287   /// Correspondence from IRInstructionData to unsigned integers.
    288   DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>
    289       InstructionIntegerMap;
    290 
    291   /// Set if we added an illegal number in the previous step.
    292   /// Since each illegal number is unique, we only need one of them between
    293   /// each range of legal numbers. This lets us make sure we don't add more
    294   /// than one illegal number per range.
    295   bool AddedIllegalLastTime = false;
    296 
    297   /// Marks whether we found a illegal instruction in the previous step.
    298   bool CanCombineWithPrevInstr = false;
    299 
    300   /// Marks whether we have found a set of instructions that is long enough
    301   /// to be considered for similarity.
    302   bool HaveLegalRange = false;
    303 
    304   /// This allocator pointer is in charge of holding on to the IRInstructionData
    305   /// so it is not deallocated until whatever external tool is using it is done
    306   /// with the information.
    307   SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr;
    308 
    309   /// This allocator pointer is in charge of creating the IRInstructionDataList
    310   /// so it is not deallocated until whatever external tool is using it is done
    311   /// with the information.
    312   SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr;
    313 
    314   /// Get an allocated IRInstructionData struct using the InstDataAllocator.
    315   ///
    316   /// \param I - The Instruction to wrap with IRInstructionData.
    317   /// \param Legality - A boolean value that is true if the instruction is to
    318   /// be considered for similarity, and false if not.
    319   /// \param IDL - The InstructionDataList that the IRInstructionData is
    320   /// inserted into.
    321   /// \returns An allocated IRInstructionData struct.
    322   IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality,
    323                                                IRInstructionDataList &IDL);
    324 
    325   /// Get an allocated IRInstructionDataList object using the IDLAllocator.
    326   ///
    327   /// \returns An allocated IRInstructionDataList object.
    328   IRInstructionDataList *allocateIRInstructionDataList();
    329 
    330   IRInstructionDataList *IDL = nullptr;
    331 
    332   /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers
    333   /// determined by \p InstrType. Two Instructions are mapped to the same value
    334   /// if they are close as defined by the InstructionData class above.
    335   ///
    336   /// \param [in] BB - The BasicBlock to be mapped to integers.
    337   /// \param [in,out] InstrList - Vector of IRInstructionData to append to.
    338   /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to.
    339   void convertToUnsignedVec(BasicBlock &BB,
    340                             std::vector<IRInstructionData *> &InstrList,
    341                             std::vector<unsigned> &IntegerMapping);
    342 
    343   /// Maps an Instruction to a legal integer.
    344   ///
    345   /// \param [in] It - The Instruction to be mapped to an integer.
    346   /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
    347   /// append to.
    348   /// \param [in,out] InstrListForBB - Vector of InstructionData to append to.
    349   /// \returns The integer \p It was mapped to.
    350   unsigned mapToLegalUnsigned(BasicBlock::iterator &It,
    351                               std::vector<unsigned> &IntegerMappingForBB,
    352                               std::vector<IRInstructionData *> &InstrListForBB);
    353 
    354   /// Maps an Instruction to an illegal integer.
    355   ///
    356   /// \param [in] It - The \p Instruction to be mapped to an integer.
    357   /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
    358   /// append to.
    359   /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to.
    360   /// \param End - true if creating a dummy IRInstructionData at the end of a
    361   /// basic block.
    362   /// \returns The integer \p It was mapped to.
    363   unsigned mapToIllegalUnsigned(
    364       BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
    365       std::vector<IRInstructionData *> &InstrListForBB, bool End = false);
    366 
    367   IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA,
    368                       SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA)
    369       : InstDataAllocator(IDA), IDLAllocator(IDLA) {
    370     // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
    371     // changed.
    372     assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) &&
    373            "DenseMapInfo<unsigned>'s empty key isn't -1!");
    374     assert(DenseMapInfo<unsigned>::getTombstoneKey() ==
    375                static_cast<unsigned>(-2) &&
    376            "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
    377 
    378     IDL = new (IDLAllocator->Allocate())
    379         IRInstructionDataList();
    380   }
    381 
    382   /// Custom InstVisitor to classify different instructions for whether it can
    383   /// be analyzed for similarity.
    384   struct InstructionClassification
    385       : public InstVisitor<InstructionClassification, InstrType> {
    386     InstructionClassification() {}
    387 
    388     // TODO: Determine a scheme to resolve when the label is similar enough.
    389     InstrType visitBranchInst(BranchInst &BI) { return Illegal; }
    390     // TODO: Determine a scheme to resolve when the labels are similar enough.
    391     InstrType visitPHINode(PHINode &PN) { return Illegal; }
    392     // TODO: Handle allocas.
    393     InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; }
    394     // We exclude variable argument instructions since variable arguments
    395     // requires extra checking of the argument list.
    396     InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; }
    397     // We exclude all exception handling cases since they are so context
    398     // dependent.
    399     InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
    400     InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
    401     // DebugInfo should be included in the regions, but should not be
    402     // analyzed for similarity as it has no bearing on the outcome of the
    403     // program.
    404     InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
    405     // TODO: Handle specific intrinsics.
    406     InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; }
    407     // We only allow call instructions where the function has a name and
    408     // is not an indirect call.
    409     InstrType visitCallInst(CallInst &CI) {
    410       Function *F = CI.getCalledFunction();
    411       if (!F || CI.isIndirectCall() || !F->hasName())
    412         return Illegal;
    413       return Legal;
    414     }
    415     // TODO: We do not current handle similarity that changes the control flow.
    416     InstrType visitInvokeInst(InvokeInst &II) { return Illegal; }
    417     // TODO: We do not current handle similarity that changes the control flow.
    418     InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; }
    419     // TODO: Handle interblock similarity.
    420     InstrType visitTerminator(Instruction &I) { return Illegal; }
    421     InstrType visitInstruction(Instruction &I) { return Legal; }
    422   };
    423 
    424   /// Maps an Instruction to a member of InstrType.
    425   InstructionClassification InstClassifier;
    426 };
    427 
    428 /// This is a class that wraps a range of IRInstructionData from one point to
    429 /// another in the vector of IRInstructionData, which is a region of the
    430 /// program.  It is also responsible for defining the structure within this
    431 /// region of instructions.
    432 ///
    433 /// The structure of a region is defined through a value numbering system
    434 /// assigned to each unique value in a region at the creation of the
    435 /// IRSimilarityCandidate.
    436 ///
    437 /// For example, for each Instruction we add a mapping for each new
    438 /// value seen in that Instruction.
    439 /// IR:                    Mapping Added:
    440 /// %add1 = add i32 %a, c1    %add1 -> 3, %a -> 1, c1 -> 2
    441 /// %add2 = add i32 %a, %1    %add2 -> 4
    442 /// %add3 = add i32 c2, c1    %add3 -> 6, c2 -> 5
    443 ///
    444 /// We can compare IRSimilarityCandidates against one another.
    445 /// The \ref isSimilar function compares each IRInstructionData against one
    446 /// another and if we have the same sequences of IRInstructionData that would
    447 /// create the same hash, we have similar IRSimilarityCandidates.
    448 ///
    449 /// We can also compare the structure of IRSimilarityCandidates. If we can
    450 /// create a mapping of registers in the region contained by one
    451 /// IRSimilarityCandidate to the region contained by different
    452 /// IRSimilarityCandidate, they can be considered structurally similar.
    453 ///
    454 /// IRSimilarityCandidate1:   IRSimilarityCandidate2:
    455 /// %add1 = add i32 %a, %b    %add1 = add i32 %d, %e
    456 /// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f
    457 /// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4
    458 ///
    459 /// Can have the following mapping from candidate to candidate of:
    460 /// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4
    461 /// and can be considered similar.
    462 ///
    463 /// IRSimilarityCandidate1:   IRSimilarityCandidate2:
    464 /// %add1 = add i32 %a, %b    %add1 = add i32 %d, c4
    465 /// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f
    466 /// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4
    467 ///
    468 /// We cannot create the same mapping since the use of c4 is not used in the
    469 /// same way as %b or c2.
    470 class IRSimilarityCandidate {
    471 private:
    472   /// The start index of this IRSimilarityCandidate in the instruction list.
    473   unsigned StartIdx = 0;
    474 
    475   /// The number of instructions in this IRSimilarityCandidate.
    476   unsigned Len = 0;
    477 
    478   /// The first instruction in this IRSimilarityCandidate.
    479   IRInstructionData *FirstInst = nullptr;
    480 
    481   /// The last instruction in this IRSimilarityCandidate.
    482   IRInstructionData *LastInst = nullptr;
    483 
    484   /// Global Value Numbering structures
    485   /// @{
    486   /// Stores the mapping of the value to the number assigned to it in the
    487   /// IRSimilarityCandidate.
    488   DenseMap<Value *, unsigned> ValueToNumber;
    489   /// Stores the mapping of the number to the value assigned this number.
    490   DenseMap<unsigned, Value *> NumberToValue;
    491   /// @}
    492 
    493 public:
    494   /// \param StartIdx - The starting location of the region.
    495   /// \param Len - The length of the region.
    496   /// \param FirstInstIt - The starting IRInstructionData of the region.
    497   /// \param LastInstIt - The ending IRInstructionData of the region.
    498   IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
    499                         IRInstructionData *FirstInstIt,
    500                         IRInstructionData *LastInstIt);
    501 
    502   /// \param A - The first IRInstructionCandidate to compare.
    503   /// \param B - The second IRInstructionCandidate to compare.
    504   /// \returns True when every IRInstructionData in \p A is similar to every
    505   /// IRInstructionData in \p B.
    506   static bool isSimilar(const IRSimilarityCandidate &A,
    507                         const IRSimilarityCandidate &B);
    508 
    509   /// \param A - The first IRInstructionCandidate to compare.
    510   /// \param B - The second IRInstructionCandidate to compare.
    511   /// \returns True when every IRInstructionData in \p A is structurally similar
    512   /// to \p B.
    513   static bool compareStructure(const IRSimilarityCandidate &A,
    514                                const IRSimilarityCandidate &B);
    515 
    516   struct OperandMapping {
    517     /// The IRSimilarityCandidate that holds the instruction the OperVals were
    518     /// pulled from.
    519     const IRSimilarityCandidate &IRSC;
    520 
    521     /// The operand values to be analyzed.
    522     ArrayRef<Value *> &OperVals;
    523 
    524     /// The current mapping of global value numbers from one IRSimilarityCandidate
    525     /// to another IRSimilarityCandidate.
    526     DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping;
    527   };
    528 
    529   /// Compare the operands in \p A and \p B and check that the current mapping
    530   /// of global value numbers from \p A to \p B and \p B to \A is consistent.
    531   ///
    532   /// \param A - The first IRInstructionCandidate, operand values, and current
    533   /// operand mappings to compare.
    534   /// \param B - The second IRInstructionCandidate, operand values, and current
    535   /// operand mappings to compare.
    536   /// \returns true if the IRSimilarityCandidates operands are compatible.
    537   static bool compareNonCommutativeOperandMapping(OperandMapping A,
    538                                                   OperandMapping B);
    539 
    540   /// Compare the operands in \p A and \p B and check that the current mapping
    541   /// of global value numbers from \p A to \p B and \p B to \A is consistent
    542   /// given that the operands are commutative.
    543   ///
    544   /// \param A - The first IRInstructionCandidate, operand values, and current
    545   /// operand mappings to compare.
    546   /// \param B - The second IRInstructionCandidate, operand values, and current
    547   /// operand mappings to compare.
    548   /// \returns true if the IRSimilarityCandidates operands are compatible.
    549   static bool compareCommutativeOperandMapping(OperandMapping A,
    550                                                OperandMapping B);
    551 
    552   /// Compare the start and end indices of the two IRSimilarityCandidates for
    553   /// whether they overlap. If the start instruction of one
    554   /// IRSimilarityCandidate is less than the end instruction of the other, and
    555   /// the start instruction of one is greater than the start instruction of the
    556   /// other, they overlap.
    557   ///
    558   /// \returns true if the IRSimilarityCandidates do not have overlapping
    559   /// instructions.
    560   static bool overlap(const IRSimilarityCandidate &A,
    561                       const IRSimilarityCandidate &B);
    562 
    563   /// \returns the number of instructions in this Candidate.
    564   unsigned getLength() const { return Len; }
    565 
    566   /// \returns the start index of this IRSimilarityCandidate.
    567   unsigned getStartIdx() const { return StartIdx; }
    568 
    569   /// \returns the end index of this IRSimilarityCandidate.
    570   unsigned getEndIdx() const { return StartIdx + Len - 1; }
    571 
    572   /// \returns The first IRInstructionData.
    573   IRInstructionData *front() const { return FirstInst; }
    574   /// \returns The last IRInstructionData.
    575   IRInstructionData *back() const { return LastInst; }
    576 
    577   /// \returns The first Instruction.
    578   Instruction *frontInstruction() { return FirstInst->Inst; }
    579   /// \returns The last Instruction
    580   Instruction *backInstruction() { return LastInst->Inst; }
    581 
    582   /// \returns The BasicBlock the IRSimilarityCandidate starts in.
    583   BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); }
    584   /// \returns The BasicBlock the IRSimilarityCandidate ends in.
    585   BasicBlock *getEndBB() { return LastInst->Inst->getParent(); }
    586 
    587   /// \returns The Function that the IRSimilarityCandidate is located in.
    588   Function *getFunction() { return getStartBB()->getParent(); }
    589 
    590   /// Finds the positive number associated with \p V if it has been mapped.
    591   /// \param [in] V - the Value to find.
    592   /// \returns The positive number corresponding to the value.
    593   /// \returns None if not present.
    594   Optional<unsigned> getGVN(Value *V) {
    595     assert(V != nullptr && "Value is a nullptr?");
    596     DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V);
    597     if (VNIt == ValueToNumber.end())
    598       return None;
    599     return VNIt->second;
    600   }
    601 
    602   /// Finds the Value associate with \p Num if it exists.
    603   /// \param [in] Num - the number to find.
    604   /// \returns The Value associated with the number.
    605   /// \returns None if not present.
    606   Optional<Value *> fromGVN(unsigned Num) {
    607     DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num);
    608     if (VNIt == NumberToValue.end())
    609       return None;
    610     assert(VNIt->second != nullptr && "Found value is a nullptr!");
    611     return VNIt->second;
    612   }
    613 
    614   /// \param RHS -The IRSimilarityCandidate to compare against
    615   /// \returns true if the IRSimilarityCandidate is occurs after the
    616   /// IRSimilarityCandidate in the program.
    617   bool operator<(const IRSimilarityCandidate &RHS) const {
    618     return getStartIdx() > RHS.getStartIdx();
    619   }
    620 
    621   using iterator = IRInstructionDataList::iterator;
    622   iterator begin() const { return iterator(front()); }
    623   iterator end() const { return std::next(iterator(back())); }
    624 };
    625 
    626 typedef std::vector<IRSimilarityCandidate> SimilarityGroup;
    627 typedef std::vector<SimilarityGroup> SimilarityGroupList;
    628 
    629 /// This class puts all the pieces of the IRInstructionData,
    630 /// IRInstructionMapper, IRSimilarityCandidate together.
    631 ///
    632 /// It first feeds the Module or vector of Modules into the IRInstructionMapper,
    633 /// and puts all the mapped instructions into a single long list of
    634 /// IRInstructionData.
    635 ///
    636 /// The list of unsigned integers is given to the Suffix Tree or similar data
    637 /// structure to find repeated subsequences.  We construct an
    638 /// IRSimilarityCandidate for each instance of the subsequence.  We compare them
    639 /// against one another since  These repeated subsequences can have different
    640 /// structure.  For each different kind of structure found, we create a
    641 /// similarity group.
    642 ///
    643 /// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are
    644 /// structurally similar to one another, while C is different we would have two
    645 /// SimilarityGroups:
    646 ///
    647 /// SimilarityGroup 1:  SimilarityGroup 2
    648 /// A, B, D             C
    649 ///
    650 /// A list of the different similarity groups is then returned after
    651 /// analyzing the module.
    652 class IRSimilarityIdentifier {
    653 public:
    654   IRSimilarityIdentifier()
    655       : Mapper(&InstDataAllocator, &InstDataListAllocator) {}
    656 
    657   /// \param M the module to find similarity in.
    658   explicit IRSimilarityIdentifier(Module &M)
    659       : Mapper(&InstDataAllocator, &InstDataListAllocator) {
    660     findSimilarity(M);
    661   }
    662 
    663 private:
    664   /// Map the instructions in the module to unsigned integers, using mapping
    665   /// already present in the Mapper if possible.
    666   ///
    667   /// \param [in] M Module - To map to integers.
    668   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
    669   /// \param [in,out] IntegerMapping - The vector to append integers to.
    670   void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList,
    671                       std::vector<unsigned> &IntegerMapping);
    672 
    673   /// Map the instructions in the modules vector to unsigned integers, using
    674   /// mapping already present in the mapper if possible.
    675   ///
    676   /// \param [in] Modules - The list of modules to use to populate the mapper
    677   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
    678   /// \param [in,out] IntegerMapping - The vector to append integers to.
    679   void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules,
    680                       std::vector<IRInstructionData *> &InstrList,
    681                       std::vector<unsigned> &IntegerMapping);
    682 
    683   /// Find the similarity candidates in \p InstrList and corresponding
    684   /// \p UnsignedVec
    685   ///
    686   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
    687   /// \param [in,out] IntegerMapping - The vector to append integers to.
    688   /// candidates found in the program.
    689   void findCandidates(std::vector<IRInstructionData *> &InstrList,
    690                       std::vector<unsigned> &IntegerMapping);
    691 
    692 public:
    693   // Find the IRSimilarityCandidates in the \p Modules and group by structural
    694   // similarity in a SimilarityGroup, each group is returned in a
    695   // SimilarityGroupList.
    696   //
    697   // \param [in] Modules - the modules to analyze.
    698   // \returns The groups of similarity ranges found in the modules.
    699   SimilarityGroupList &
    700   findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules);
    701 
    702   // Find the IRSimilarityCandidates in the given Module grouped by structural
    703   // similarity in a SimilarityGroup, contained inside a SimilarityGroupList.
    704   //
    705   // \param [in] M - the module to analyze.
    706   // \returns The groups of similarity ranges found in the module.
    707   SimilarityGroupList &findSimilarity(Module &M);
    708 
    709   // Clears \ref SimilarityCandidates if it is already filled by a previous run.
    710   void resetSimilarityCandidates() {
    711     // If we've already analyzed a Module or set of Modules, so we must clear
    712     // the SimilarityCandidates to make sure we do not have only old values
    713     // hanging around.
    714     if (SimilarityCandidates.hasValue())
    715       SimilarityCandidates->clear();
    716     else
    717       SimilarityCandidates = SimilarityGroupList();
    718   }
    719 
    720   // \returns The groups of similarity ranges found in the most recently passed
    721   // set of modules.
    722   Optional<SimilarityGroupList> &getSimilarity() {
    723     return SimilarityCandidates;
    724   }
    725 
    726 private:
    727   /// The allocator for IRInstructionData.
    728   SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator;
    729 
    730   /// The allocator for IRInstructionDataLists.
    731   SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator;
    732 
    733   /// Map Instructions to unsigned integers and wraps the Instruction in an
    734   /// instance of IRInstructionData.
    735   IRInstructionMapper Mapper;
    736 
    737   /// The SimilarityGroups found with the most recent run of \ref
    738   /// findSimilarity. None if there is no recent run.
    739   Optional<SimilarityGroupList> SimilarityCandidates;
    740 };
    741 
    742 } // end namespace IRSimilarity
    743 
    744 /// An analysis pass based on legacy pass manager that runs and returns
    745 /// IRSimilarityIdentifier run on the Module.
    746 class IRSimilarityIdentifierWrapperPass : public ModulePass {
    747   std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI;
    748 
    749 public:
    750   static char ID;
    751   IRSimilarityIdentifierWrapperPass();
    752 
    753   IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; }
    754   const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; }
    755 
    756   bool doInitialization(Module &M) override;
    757   bool doFinalization(Module &M) override;
    758   bool runOnModule(Module &M) override;
    759   void getAnalysisUsage(AnalysisUsage &AU) const override {
    760     AU.setPreservesAll();
    761   }
    762 };
    763 
    764 /// An analysis pass that runs and returns the IRSimilarityIdentifier run on the
    765 /// Module.
    766 class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> {
    767 public:
    768   typedef IRSimilarity::IRSimilarityIdentifier Result;
    769 
    770   Result run(Module &M, ModuleAnalysisManager &);
    771 
    772 private:
    773   friend AnalysisInfoMixin<IRSimilarityAnalysis>;
    774   static AnalysisKey Key;
    775 };
    776 
    777 /// Printer pass that uses \c IRSimilarityAnalysis.
    778 class IRSimilarityAnalysisPrinterPass
    779     : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> {
    780   raw_ostream &OS;
    781 
    782 public:
    783   explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {}
    784   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
    785 };
    786 
    787 } // end namespace llvm
    788 
    789 #endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
    790