Home | History | Annotate | Line # | Download | only in InstCombine
      1 //===- InstCombineLoadStoreAlloca.cpp -------------------------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file implements the visit functions for load, store and alloca.
     10 //
     11 //===----------------------------------------------------------------------===//
     12 
     13 #include "InstCombineInternal.h"
     14 #include "llvm/ADT/MapVector.h"
     15 #include "llvm/ADT/SmallString.h"
     16 #include "llvm/ADT/Statistic.h"
     17 #include "llvm/Analysis/AliasAnalysis.h"
     18 #include "llvm/Analysis/Loads.h"
     19 #include "llvm/IR/ConstantRange.h"
     20 #include "llvm/IR/DataLayout.h"
     21 #include "llvm/IR/DebugInfoMetadata.h"
     22 #include "llvm/IR/IntrinsicInst.h"
     23 #include "llvm/IR/LLVMContext.h"
     24 #include "llvm/IR/MDBuilder.h"
     25 #include "llvm/IR/PatternMatch.h"
     26 #include "llvm/Transforms/InstCombine/InstCombiner.h"
     27 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
     28 #include "llvm/Transforms/Utils/Local.h"
     29 using namespace llvm;
     30 using namespace PatternMatch;
     31 
     32 #define DEBUG_TYPE "instcombine"
     33 
     34 STATISTIC(NumDeadStore,    "Number of dead stores eliminated");
     35 STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
     36 
     37 /// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
     38 /// pointer to an alloca.  Ignore any reads of the pointer, return false if we
     39 /// see any stores or other unknown uses.  If we see pointer arithmetic, keep
     40 /// track of whether it moves the pointer (with IsOffset) but otherwise traverse
     41 /// the uses.  If we see a memcpy/memmove that targets an unoffseted pointer to
     42 /// the alloca, and if the source pointer is a pointer to a constant global, we
     43 /// can optimize this.
     44 static bool
     45 isOnlyCopiedFromConstantMemory(AAResults *AA,
     46                                Value *V, MemTransferInst *&TheCopy,
     47                                SmallVectorImpl<Instruction *> &ToDelete) {
     48   // We track lifetime intrinsics as we encounter them.  If we decide to go
     49   // ahead and replace the value with the global, this lets the caller quickly
     50   // eliminate the markers.
     51 
     52   SmallVector<std::pair<Value *, bool>, 35> ValuesToInspect;
     53   ValuesToInspect.emplace_back(V, false);
     54   while (!ValuesToInspect.empty()) {
     55     auto ValuePair = ValuesToInspect.pop_back_val();
     56     const bool IsOffset = ValuePair.second;
     57     for (auto &U : ValuePair.first->uses()) {
     58       auto *I = cast<Instruction>(U.getUser());
     59 
     60       if (auto *LI = dyn_cast<LoadInst>(I)) {
     61         // Ignore non-volatile loads, they are always ok.
     62         if (!LI->isSimple()) return false;
     63         continue;
     64       }
     65 
     66       if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
     67         // If uses of the bitcast are ok, we are ok.
     68         ValuesToInspect.emplace_back(I, IsOffset);
     69         continue;
     70       }
     71       if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
     72         // If the GEP has all zero indices, it doesn't offset the pointer. If it
     73         // doesn't, it does.
     74         ValuesToInspect.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices());
     75         continue;
     76       }
     77 
     78       if (auto *Call = dyn_cast<CallBase>(I)) {
     79         // If this is the function being called then we treat it like a load and
     80         // ignore it.
     81         if (Call->isCallee(&U))
     82           continue;
     83 
     84         unsigned DataOpNo = Call->getDataOperandNo(&U);
     85         bool IsArgOperand = Call->isArgOperand(&U);
     86 
     87         // Inalloca arguments are clobbered by the call.
     88         if (IsArgOperand && Call->isInAllocaArgument(DataOpNo))
     89           return false;
     90 
     91         // If this is a readonly/readnone call site, then we know it is just a
     92         // load (but one that potentially returns the value itself), so we can
     93         // ignore it if we know that the value isn't captured.
     94         if (Call->onlyReadsMemory() &&
     95             (Call->use_empty() || Call->doesNotCapture(DataOpNo)))
     96           continue;
     97 
     98         // If this is being passed as a byval argument, the caller is making a
     99         // copy, so it is only a read of the alloca.
    100         if (IsArgOperand && Call->isByValArgument(DataOpNo))
    101           continue;
    102       }
    103 
    104       // Lifetime intrinsics can be handled by the caller.
    105       if (I->isLifetimeStartOrEnd()) {
    106         assert(I->use_empty() && "Lifetime markers have no result to use!");
    107         ToDelete.push_back(I);
    108         continue;
    109       }
    110 
    111       // If this is isn't our memcpy/memmove, reject it as something we can't
    112       // handle.
    113       MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
    114       if (!MI)
    115         return false;
    116 
    117       // If the transfer is using the alloca as a source of the transfer, then
    118       // ignore it since it is a load (unless the transfer is volatile).
    119       if (U.getOperandNo() == 1) {
    120         if (MI->isVolatile()) return false;
    121         continue;
    122       }
    123 
    124       // If we already have seen a copy, reject the second one.
    125       if (TheCopy) return false;
    126 
    127       // If the pointer has been offset from the start of the alloca, we can't
    128       // safely handle this.
    129       if (IsOffset) return false;
    130 
    131       // If the memintrinsic isn't using the alloca as the dest, reject it.
    132       if (U.getOperandNo() != 0) return false;
    133 
    134       // If the source of the memcpy/move is not a constant global, reject it.
    135       if (!AA->pointsToConstantMemory(MI->getSource()))
    136         return false;
    137 
    138       // Otherwise, the transform is safe.  Remember the copy instruction.
    139       TheCopy = MI;
    140     }
    141   }
    142   return true;
    143 }
    144 
    145 /// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only
    146 /// modified by a copy from a constant global.  If we can prove this, we can
    147 /// replace any uses of the alloca with uses of the global directly.
    148 static MemTransferInst *
    149 isOnlyCopiedFromConstantMemory(AAResults *AA,
    150                                AllocaInst *AI,
    151                                SmallVectorImpl<Instruction *> &ToDelete) {
    152   MemTransferInst *TheCopy = nullptr;
    153   if (isOnlyCopiedFromConstantMemory(AA, AI, TheCopy, ToDelete))
    154     return TheCopy;
    155   return nullptr;
    156 }
    157 
    158 /// Returns true if V is dereferenceable for size of alloca.
    159 static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
    160                                            const DataLayout &DL) {
    161   if (AI->isArrayAllocation())
    162     return false;
    163   uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
    164   if (!AllocaSize)
    165     return false;
    166   return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()),
    167                                             APInt(64, AllocaSize), DL);
    168 }
    169 
    170 static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC,
    171                                             AllocaInst &AI) {
    172   // Check for array size of 1 (scalar allocation).
    173   if (!AI.isArrayAllocation()) {
    174     // i32 1 is the canonical array size for scalar allocations.
    175     if (AI.getArraySize()->getType()->isIntegerTy(32))
    176       return nullptr;
    177 
    178     // Canonicalize it.
    179     return IC.replaceOperand(AI, 0, IC.Builder.getInt32(1));
    180   }
    181 
    182   // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
    183   if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
    184     if (C->getValue().getActiveBits() <= 64) {
    185       Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
    186       AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName());
    187       New->setAlignment(AI.getAlign());
    188 
    189       // Scan to the end of the allocation instructions, to skip over a block of
    190       // allocas if possible...also skip interleaved debug info
    191       //
    192       BasicBlock::iterator It(New);
    193       while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
    194         ++It;
    195 
    196       // Now that I is pointing to the first non-allocation-inst in the block,
    197       // insert our getelementptr instruction...
    198       //
    199       Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType());
    200       Value *NullIdx = Constant::getNullValue(IdxTy);
    201       Value *Idx[2] = {NullIdx, NullIdx};
    202       Instruction *GEP = GetElementPtrInst::CreateInBounds(
    203           NewTy, New, Idx, New->getName() + ".sub");
    204       IC.InsertNewInstBefore(GEP, *It);
    205 
    206       // Now make everything use the getelementptr instead of the original
    207       // allocation.
    208       return IC.replaceInstUsesWith(AI, GEP);
    209     }
    210   }
    211 
    212   if (isa<UndefValue>(AI.getArraySize()))
    213     return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
    214 
    215   // Ensure that the alloca array size argument has type intptr_t, so that
    216   // any casting is exposed early.
    217   Type *IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType());
    218   if (AI.getArraySize()->getType() != IntPtrTy) {
    219     Value *V = IC.Builder.CreateIntCast(AI.getArraySize(), IntPtrTy, false);
    220     return IC.replaceOperand(AI, 0, V);
    221   }
    222 
    223   return nullptr;
    224 }
    225 
    226 namespace {
    227 // If I and V are pointers in different address space, it is not allowed to
    228 // use replaceAllUsesWith since I and V have different types. A
    229 // non-target-specific transformation should not use addrspacecast on V since
    230 // the two address space may be disjoint depending on target.
    231 //
    232 // This class chases down uses of the old pointer until reaching the load
    233 // instructions, then replaces the old pointer in the load instructions with
    234 // the new pointer. If during the chasing it sees bitcast or GEP, it will
    235 // create new bitcast or GEP with the new pointer and use them in the load
    236 // instruction.
    237 class PointerReplacer {
    238 public:
    239   PointerReplacer(InstCombinerImpl &IC) : IC(IC) {}
    240 
    241   bool collectUsers(Instruction &I);
    242   void replacePointer(Instruction &I, Value *V);
    243 
    244 private:
    245   void replace(Instruction *I);
    246   Value *getReplacement(Value *I);
    247 
    248   SmallSetVector<Instruction *, 4> Worklist;
    249   MapVector<Value *, Value *> WorkMap;
    250   InstCombinerImpl &IC;
    251 };
    252 } // end anonymous namespace
    253 
    254 bool PointerReplacer::collectUsers(Instruction &I) {
    255   for (auto U : I.users()) {
    256     Instruction *Inst = cast<Instruction>(&*U);
    257     if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
    258       if (Load->isVolatile())
    259         return false;
    260       Worklist.insert(Load);
    261     } else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) {
    262       Worklist.insert(Inst);
    263       if (!collectUsers(*Inst))
    264         return false;
    265     } else if (isa<MemTransferInst>(Inst)) {
    266       Worklist.insert(Inst);
    267     } else {
    268       LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
    269       return false;
    270     }
    271   }
    272 
    273   return true;
    274 }
    275 
    276 Value *PointerReplacer::getReplacement(Value *V) { return WorkMap.lookup(V); }
    277 
    278 void PointerReplacer::replace(Instruction *I) {
    279   if (getReplacement(I))
    280     return;
    281 
    282   if (auto *LT = dyn_cast<LoadInst>(I)) {
    283     auto *V = getReplacement(LT->getPointerOperand());
    284     assert(V && "Operand not replaced");
    285     auto *NewI = new LoadInst(LT->getType(), V, "", LT->isVolatile(),
    286                               LT->getAlign(), LT->getOrdering(),
    287                               LT->getSyncScopeID());
    288     NewI->takeName(LT);
    289     copyMetadataForLoad(*NewI, *LT);
    290 
    291     IC.InsertNewInstWith(NewI, *LT);
    292     IC.replaceInstUsesWith(*LT, NewI);
    293     WorkMap[LT] = NewI;
    294   } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
    295     auto *V = getReplacement(GEP->getPointerOperand());
    296     assert(V && "Operand not replaced");
    297     SmallVector<Value *, 8> Indices;
    298     Indices.append(GEP->idx_begin(), GEP->idx_end());
    299     auto *NewI = GetElementPtrInst::Create(
    300         V->getType()->getPointerElementType(), V, Indices);
    301     IC.InsertNewInstWith(NewI, *GEP);
    302     NewI->takeName(GEP);
    303     WorkMap[GEP] = NewI;
    304   } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
    305     auto *V = getReplacement(BC->getOperand(0));
    306     assert(V && "Operand not replaced");
    307     auto *NewT = PointerType::get(BC->getType()->getPointerElementType(),
    308                                   V->getType()->getPointerAddressSpace());
    309     auto *NewI = new BitCastInst(V, NewT);
    310     IC.InsertNewInstWith(NewI, *BC);
    311     NewI->takeName(BC);
    312     WorkMap[BC] = NewI;
    313   } else if (auto *MemCpy = dyn_cast<MemTransferInst>(I)) {
    314     auto *SrcV = getReplacement(MemCpy->getRawSource());
    315     // The pointer may appear in the destination of a copy, but we don't want to
    316     // replace it.
    317     if (!SrcV) {
    318       assert(getReplacement(MemCpy->getRawDest()) &&
    319              "destination not in replace list");
    320       return;
    321     }
    322 
    323     IC.Builder.SetInsertPoint(MemCpy);
    324     auto *NewI = IC.Builder.CreateMemTransferInst(
    325         MemCpy->getIntrinsicID(), MemCpy->getRawDest(), MemCpy->getDestAlign(),
    326         SrcV, MemCpy->getSourceAlign(), MemCpy->getLength(),
    327         MemCpy->isVolatile());
    328     AAMDNodes AAMD;
    329     MemCpy->getAAMetadata(AAMD);
    330     if (AAMD)
    331       NewI->setAAMetadata(AAMD);
    332 
    333     IC.eraseInstFromFunction(*MemCpy);
    334     WorkMap[MemCpy] = NewI;
    335   } else {
    336     llvm_unreachable("should never reach here");
    337   }
    338 }
    339 
    340 void PointerReplacer::replacePointer(Instruction &I, Value *V) {
    341 #ifndef NDEBUG
    342   auto *PT = cast<PointerType>(I.getType());
    343   auto *NT = cast<PointerType>(V->getType());
    344   assert(PT != NT && PT->getElementType() == NT->getElementType() &&
    345          "Invalid usage");
    346 #endif
    347   WorkMap[&I] = V;
    348 
    349   for (Instruction *Workitem : Worklist)
    350     replace(Workitem);
    351 }
    352 
    353 Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
    354   if (auto *I = simplifyAllocaArraySize(*this, AI))
    355     return I;
    356 
    357   if (AI.getAllocatedType()->isSized()) {
    358     // Move all alloca's of zero byte objects to the entry block and merge them
    359     // together.  Note that we only do this for alloca's, because malloc should
    360     // allocate and return a unique pointer, even for a zero byte allocation.
    361     if (DL.getTypeAllocSize(AI.getAllocatedType()).getKnownMinSize() == 0) {
    362       // For a zero sized alloca there is no point in doing an array allocation.
    363       // This is helpful if the array size is a complicated expression not used
    364       // elsewhere.
    365       if (AI.isArrayAllocation())
    366         return replaceOperand(AI, 0,
    367             ConstantInt::get(AI.getArraySize()->getType(), 1));
    368 
    369       // Get the first instruction in the entry block.
    370       BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
    371       Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg();
    372       if (FirstInst != &AI) {
    373         // If the entry block doesn't start with a zero-size alloca then move
    374         // this one to the start of the entry block.  There is no problem with
    375         // dominance as the array size was forced to a constant earlier already.
    376         AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
    377         if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
    378             DL.getTypeAllocSize(EntryAI->getAllocatedType())
    379                     .getKnownMinSize() != 0) {
    380           AI.moveBefore(FirstInst);
    381           return &AI;
    382         }
    383 
    384         // Replace this zero-sized alloca with the one at the start of the entry
    385         // block after ensuring that the address will be aligned enough for both
    386         // types.
    387         const Align MaxAlign = std::max(EntryAI->getAlign(), AI.getAlign());
    388         EntryAI->setAlignment(MaxAlign);
    389         if (AI.getType() != EntryAI->getType())
    390           return new BitCastInst(EntryAI, AI.getType());
    391         return replaceInstUsesWith(AI, EntryAI);
    392       }
    393     }
    394   }
    395 
    396   // Check to see if this allocation is only modified by a memcpy/memmove from
    397   // a constant whose alignment is equal to or exceeds that of the allocation.
    398   // If this is the case, we can change all users to use the constant global
    399   // instead.  This is commonly produced by the CFE by constructs like "void
    400   // foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' is only subsequently
    401   // read.
    402   SmallVector<Instruction *, 4> ToDelete;
    403   if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) {
    404     Value *TheSrc = Copy->getSource();
    405     Align AllocaAlign = AI.getAlign();
    406     Align SourceAlign = getOrEnforceKnownAlignment(
    407       TheSrc, AllocaAlign, DL, &AI, &AC, &DT);
    408     if (AllocaAlign <= SourceAlign &&
    409         isDereferenceableForAllocaSize(TheSrc, &AI, DL) &&
    410         !isa<Instruction>(TheSrc)) {
    411       // FIXME: Can we sink instructions without violating dominance when TheSrc
    412       // is an instruction instead of a constant or argument?
    413       LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
    414       LLVM_DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');
    415       unsigned SrcAddrSpace = TheSrc->getType()->getPointerAddressSpace();
    416       auto *DestTy = PointerType::get(AI.getAllocatedType(), SrcAddrSpace);
    417       if (AI.getType()->getAddressSpace() == SrcAddrSpace) {
    418         for (Instruction *Delete : ToDelete)
    419           eraseInstFromFunction(*Delete);
    420 
    421         Value *Cast = Builder.CreateBitCast(TheSrc, DestTy);
    422         Instruction *NewI = replaceInstUsesWith(AI, Cast);
    423         eraseInstFromFunction(*Copy);
    424         ++NumGlobalCopies;
    425         return NewI;
    426       }
    427 
    428       PointerReplacer PtrReplacer(*this);
    429       if (PtrReplacer.collectUsers(AI)) {
    430         for (Instruction *Delete : ToDelete)
    431           eraseInstFromFunction(*Delete);
    432 
    433         Value *Cast = Builder.CreateBitCast(TheSrc, DestTy);
    434         PtrReplacer.replacePointer(AI, Cast);
    435         ++NumGlobalCopies;
    436       }
    437     }
    438   }
    439 
    440   // At last, use the generic allocation site handler to aggressively remove
    441   // unused allocas.
    442   return visitAllocSite(AI);
    443 }
    444 
    445 // Are we allowed to form a atomic load or store of this type?
    446 static bool isSupportedAtomicType(Type *Ty) {
    447   return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy();
    448 }
    449 
    450 /// Helper to combine a load to a new type.
    451 ///
    452 /// This just does the work of combining a load to a new type. It handles
    453 /// metadata, etc., and returns the new instruction. The \c NewTy should be the
    454 /// loaded *value* type. This will convert it to a pointer, cast the operand to
    455 /// that pointer type, load it, etc.
    456 ///
    457 /// Note that this will create all of the instructions with whatever insert
    458 /// point the \c InstCombinerImpl currently is using.
    459 LoadInst *InstCombinerImpl::combineLoadToNewType(LoadInst &LI, Type *NewTy,
    460                                                  const Twine &Suffix) {
    461   assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
    462          "can't fold an atomic load to requested type");
    463 
    464   Value *Ptr = LI.getPointerOperand();
    465   unsigned AS = LI.getPointerAddressSpace();
    466   Value *NewPtr = nullptr;
    467   if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) &&
    468         NewPtr->getType()->getPointerElementType() == NewTy &&
    469         NewPtr->getType()->getPointerAddressSpace() == AS))
    470     NewPtr = Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS));
    471 
    472   LoadInst *NewLoad = Builder.CreateAlignedLoad(
    473       NewTy, NewPtr, LI.getAlign(), LI.isVolatile(), LI.getName() + Suffix);
    474   NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
    475   copyMetadataForLoad(*NewLoad, LI);
    476   return NewLoad;
    477 }
    478 
    479 /// Combine a store to a new type.
    480 ///
    481 /// Returns the newly created store instruction.
    482 static StoreInst *combineStoreToNewValue(InstCombinerImpl &IC, StoreInst &SI,
    483                                          Value *V) {
    484   assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) &&
    485          "can't fold an atomic store of requested type");
    486 
    487   Value *Ptr = SI.getPointerOperand();
    488   unsigned AS = SI.getPointerAddressSpace();
    489   SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
    490   SI.getAllMetadata(MD);
    491 
    492   StoreInst *NewStore = IC.Builder.CreateAlignedStore(
    493       V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
    494       SI.getAlign(), SI.isVolatile());
    495   NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
    496   for (const auto &MDPair : MD) {
    497     unsigned ID = MDPair.first;
    498     MDNode *N = MDPair.second;
    499     // Note, essentially every kind of metadata should be preserved here! This
    500     // routine is supposed to clone a store instruction changing *only its
    501     // type*. The only metadata it makes sense to drop is metadata which is
    502     // invalidated when the pointer type changes. This should essentially
    503     // never be the case in LLVM, but we explicitly switch over only known
    504     // metadata to be conservatively correct. If you are adding metadata to
    505     // LLVM which pertains to stores, you almost certainly want to add it
    506     // here.
    507     switch (ID) {
    508     case LLVMContext::MD_dbg:
    509     case LLVMContext::MD_tbaa:
    510     case LLVMContext::MD_prof:
    511     case LLVMContext::MD_fpmath:
    512     case LLVMContext::MD_tbaa_struct:
    513     case LLVMContext::MD_alias_scope:
    514     case LLVMContext::MD_noalias:
    515     case LLVMContext::MD_nontemporal:
    516     case LLVMContext::MD_mem_parallel_loop_access:
    517     case LLVMContext::MD_access_group:
    518       // All of these directly apply.
    519       NewStore->setMetadata(ID, N);
    520       break;
    521     case LLVMContext::MD_invariant_load:
    522     case LLVMContext::MD_nonnull:
    523     case LLVMContext::MD_noundef:
    524     case LLVMContext::MD_range:
    525     case LLVMContext::MD_align:
    526     case LLVMContext::MD_dereferenceable:
    527     case LLVMContext::MD_dereferenceable_or_null:
    528       // These don't apply for stores.
    529       break;
    530     }
    531   }
    532 
    533   return NewStore;
    534 }
    535 
    536 /// Returns true if instruction represent minmax pattern like:
    537 ///   select ((cmp load V1, load V2), V1, V2).
    538 static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) {
    539   assert(V->getType()->isPointerTy() && "Expected pointer type.");
    540   // Ignore possible ty* to ixx* bitcast.
    541   V = InstCombiner::peekThroughBitcast(V);
    542   // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax
    543   // pattern.
    544   CmpInst::Predicate Pred;
    545   Instruction *L1;
    546   Instruction *L2;
    547   Value *LHS;
    548   Value *RHS;
    549   if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)),
    550                          m_Value(LHS), m_Value(RHS))))
    551     return false;
    552   LoadTy = L1->getType();
    553   return (match(L1, m_Load(m_Specific(LHS))) &&
    554           match(L2, m_Load(m_Specific(RHS)))) ||
    555          (match(L1, m_Load(m_Specific(RHS))) &&
    556           match(L2, m_Load(m_Specific(LHS))));
    557 }
    558 
    559 /// Combine loads to match the type of their uses' value after looking
    560 /// through intervening bitcasts.
    561 ///
    562 /// The core idea here is that if the result of a load is used in an operation,
    563 /// we should load the type most conducive to that operation. For example, when
    564 /// loading an integer and converting that immediately to a pointer, we should
    565 /// instead directly load a pointer.
    566 ///
    567 /// However, this routine must never change the width of a load or the number of
    568 /// loads as that would introduce a semantic change. This combine is expected to
    569 /// be a semantic no-op which just allows loads to more closely model the types
    570 /// of their consuming operations.
    571 ///
    572 /// Currently, we also refuse to change the precise type used for an atomic load
    573 /// or a volatile load. This is debatable, and might be reasonable to change
    574 /// later. However, it is risky in case some backend or other part of LLVM is
    575 /// relying on the exact type loaded to select appropriate atomic operations.
    576 static Instruction *combineLoadToOperationType(InstCombinerImpl &IC,
    577                                                LoadInst &LI) {
    578   // FIXME: We could probably with some care handle both volatile and ordered
    579   // atomic loads here but it isn't clear that this is important.
    580   if (!LI.isUnordered())
    581     return nullptr;
    582 
    583   if (LI.use_empty())
    584     return nullptr;
    585 
    586   // swifterror values can't be bitcasted.
    587   if (LI.getPointerOperand()->isSwiftError())
    588     return nullptr;
    589 
    590   const DataLayout &DL = IC.getDataLayout();
    591 
    592   // Fold away bit casts of the loaded value by loading the desired type.
    593   // Note that we should not do this for pointer<->integer casts,
    594   // because that would result in type punning.
    595   if (LI.hasOneUse()) {
    596     // Don't transform when the type is x86_amx, it makes the pass that lower
    597     // x86_amx type happy.
    598     if (auto *BC = dyn_cast<BitCastInst>(LI.user_back())) {
    599       assert(!LI.getType()->isX86_AMXTy() &&
    600              "load from x86_amx* should not happen!");
    601       if (BC->getType()->isX86_AMXTy())
    602         return nullptr;
    603     }
    604 
    605     if (auto* CI = dyn_cast<CastInst>(LI.user_back()))
    606       if (CI->isNoopCast(DL) && LI.getType()->isPtrOrPtrVectorTy() ==
    607                                     CI->getDestTy()->isPtrOrPtrVectorTy())
    608         if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) {
    609           LoadInst *NewLoad = IC.combineLoadToNewType(LI, CI->getDestTy());
    610           CI->replaceAllUsesWith(NewLoad);
    611           IC.eraseInstFromFunction(*CI);
    612           return &LI;
    613         }
    614   }
    615 
    616   // FIXME: We should also canonicalize loads of vectors when their elements are
    617   // cast to other types.
    618   return nullptr;
    619 }
    620 
    621 static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
    622   // FIXME: We could probably with some care handle both volatile and atomic
    623   // stores here but it isn't clear that this is important.
    624   if (!LI.isSimple())
    625     return nullptr;
    626 
    627   Type *T = LI.getType();
    628   if (!T->isAggregateType())
    629     return nullptr;
    630 
    631   StringRef Name = LI.getName();
    632   assert(LI.getAlignment() && "Alignment must be set at this point");
    633 
    634   if (auto *ST = dyn_cast<StructType>(T)) {
    635     // If the struct only have one element, we unpack.
    636     auto NumElements = ST->getNumElements();
    637     if (NumElements == 1) {
    638       LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U),
    639                                                   ".unpack");
    640       AAMDNodes AAMD;
    641       LI.getAAMetadata(AAMD);
    642       NewLoad->setAAMetadata(AAMD);
    643       return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
    644         UndefValue::get(T), NewLoad, 0, Name));
    645     }
    646 
    647     // We don't want to break loads with padding here as we'd loose
    648     // the knowledge that padding exists for the rest of the pipeline.
    649     const DataLayout &DL = IC.getDataLayout();
    650     auto *SL = DL.getStructLayout(ST);
    651     if (SL->hasPadding())
    652       return nullptr;
    653 
    654     const auto Align = LI.getAlign();
    655     auto *Addr = LI.getPointerOperand();
    656     auto *IdxType = Type::getInt32Ty(T->getContext());
    657     auto *Zero = ConstantInt::get(IdxType, 0);
    658 
    659     Value *V = UndefValue::get(T);
    660     for (unsigned i = 0; i < NumElements; i++) {
    661       Value *Indices[2] = {
    662         Zero,
    663         ConstantInt::get(IdxType, i),
    664       };
    665       auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
    666                                                Name + ".elt");
    667       auto *L = IC.Builder.CreateAlignedLoad(
    668           ST->getElementType(i), Ptr,
    669           commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack");
    670       // Propagate AA metadata. It'll still be valid on the narrowed load.
    671       AAMDNodes AAMD;
    672       LI.getAAMetadata(AAMD);
    673       L->setAAMetadata(AAMD);
    674       V = IC.Builder.CreateInsertValue(V, L, i);
    675     }
    676 
    677     V->setName(Name);
    678     return IC.replaceInstUsesWith(LI, V);
    679   }
    680 
    681   if (auto *AT = dyn_cast<ArrayType>(T)) {
    682     auto *ET = AT->getElementType();
    683     auto NumElements = AT->getNumElements();
    684     if (NumElements == 1) {
    685       LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack");
    686       AAMDNodes AAMD;
    687       LI.getAAMetadata(AAMD);
    688       NewLoad->setAAMetadata(AAMD);
    689       return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
    690         UndefValue::get(T), NewLoad, 0, Name));
    691     }
    692 
    693     // Bail out if the array is too large. Ideally we would like to optimize
    694     // arrays of arbitrary size but this has a terrible impact on compile time.
    695     // The threshold here is chosen arbitrarily, maybe needs a little bit of
    696     // tuning.
    697     if (NumElements > IC.MaxArraySizeForCombine)
    698       return nullptr;
    699 
    700     const DataLayout &DL = IC.getDataLayout();
    701     auto EltSize = DL.getTypeAllocSize(ET);
    702     const auto Align = LI.getAlign();
    703 
    704     auto *Addr = LI.getPointerOperand();
    705     auto *IdxType = Type::getInt64Ty(T->getContext());
    706     auto *Zero = ConstantInt::get(IdxType, 0);
    707 
    708     Value *V = UndefValue::get(T);
    709     uint64_t Offset = 0;
    710     for (uint64_t i = 0; i < NumElements; i++) {
    711       Value *Indices[2] = {
    712         Zero,
    713         ConstantInt::get(IdxType, i),
    714       };
    715       auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
    716                                                Name + ".elt");
    717       auto *L = IC.Builder.CreateAlignedLoad(AT->getElementType(), Ptr,
    718                                              commonAlignment(Align, Offset),
    719                                              Name + ".unpack");
    720       AAMDNodes AAMD;
    721       LI.getAAMetadata(AAMD);
    722       L->setAAMetadata(AAMD);
    723       V = IC.Builder.CreateInsertValue(V, L, i);
    724       Offset += EltSize;
    725     }
    726 
    727     V->setName(Name);
    728     return IC.replaceInstUsesWith(LI, V);
    729   }
    730 
    731   return nullptr;
    732 }
    733 
    734 // If we can determine that all possible objects pointed to by the provided
    735 // pointer value are, not only dereferenceable, but also definitively less than
    736 // or equal to the provided maximum size, then return true. Otherwise, return
    737 // false (constant global values and allocas fall into this category).
    738 //
    739 // FIXME: This should probably live in ValueTracking (or similar).
    740 static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
    741                                      const DataLayout &DL) {
    742   SmallPtrSet<Value *, 4> Visited;
    743   SmallVector<Value *, 4> Worklist(1, V);
    744 
    745   do {
    746     Value *P = Worklist.pop_back_val();
    747     P = P->stripPointerCasts();
    748 
    749     if (!Visited.insert(P).second)
    750       continue;
    751 
    752     if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
    753       Worklist.push_back(SI->getTrueValue());
    754       Worklist.push_back(SI->getFalseValue());
    755       continue;
    756     }
    757 
    758     if (PHINode *PN = dyn_cast<PHINode>(P)) {
    759       append_range(Worklist, PN->incoming_values());
    760       continue;
    761     }
    762 
    763     if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) {
    764       if (GA->isInterposable())
    765         return false;
    766       Worklist.push_back(GA->getAliasee());
    767       continue;
    768     }
    769 
    770     // If we know how big this object is, and it is less than MaxSize, continue
    771     // searching. Otherwise, return false.
    772     if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) {
    773       if (!AI->getAllocatedType()->isSized())
    774         return false;
    775 
    776       ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize());
    777       if (!CS)
    778         return false;
    779 
    780       uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType());
    781       // Make sure that, even if the multiplication below would wrap as an
    782       // uint64_t, we still do the right thing.
    783       if ((CS->getValue().zextOrSelf(128)*APInt(128, TypeSize)).ugt(MaxSize))
    784         return false;
    785       continue;
    786     }
    787 
    788     if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
    789       if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
    790         return false;
    791 
    792       uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
    793       if (InitSize > MaxSize)
    794         return false;
    795       continue;
    796     }
    797 
    798     return false;
    799   } while (!Worklist.empty());
    800 
    801   return true;
    802 }
    803 
    804 // If we're indexing into an object of a known size, and the outer index is
    805 // not a constant, but having any value but zero would lead to undefined
    806 // behavior, replace it with zero.
    807 //
    808 // For example, if we have:
    809 // @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
    810 // ...
    811 // %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
    812 // ... = load i32* %arrayidx, align 4
    813 // Then we know that we can replace %x in the GEP with i64 0.
    814 //
    815 // FIXME: We could fold any GEP index to zero that would cause UB if it were
    816 // not zero. Currently, we only handle the first such index. Also, we could
    817 // also search through non-zero constant indices if we kept track of the
    818 // offsets those indices implied.
    819 static bool canReplaceGEPIdxWithZero(InstCombinerImpl &IC,
    820                                      GetElementPtrInst *GEPI, Instruction *MemI,
    821                                      unsigned &Idx) {
    822   if (GEPI->getNumOperands() < 2)
    823     return false;
    824 
    825   // Find the first non-zero index of a GEP. If all indices are zero, return
    826   // one past the last index.
    827   auto FirstNZIdx = [](const GetElementPtrInst *GEPI) {
    828     unsigned I = 1;
    829     for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
    830       Value *V = GEPI->getOperand(I);
    831       if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
    832         if (CI->isZero())
    833           continue;
    834 
    835       break;
    836     }
    837 
    838     return I;
    839   };
    840 
    841   // Skip through initial 'zero' indices, and find the corresponding pointer
    842   // type. See if the next index is not a constant.
    843   Idx = FirstNZIdx(GEPI);
    844   if (Idx == GEPI->getNumOperands())
    845     return false;
    846   if (isa<Constant>(GEPI->getOperand(Idx)))
    847     return false;
    848 
    849   SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
    850   Type *SourceElementType = GEPI->getSourceElementType();
    851   // Size information about scalable vectors is not available, so we cannot
    852   // deduce whether indexing at n is undefined behaviour or not. Bail out.
    853   if (isa<ScalableVectorType>(SourceElementType))
    854     return false;
    855 
    856   Type *AllocTy = GetElementPtrInst::getIndexedType(SourceElementType, Ops);
    857   if (!AllocTy || !AllocTy->isSized())
    858     return false;
    859   const DataLayout &DL = IC.getDataLayout();
    860   uint64_t TyAllocSize = DL.getTypeAllocSize(AllocTy).getFixedSize();
    861 
    862   // If there are more indices after the one we might replace with a zero, make
    863   // sure they're all non-negative. If any of them are negative, the overall
    864   // address being computed might be before the base address determined by the
    865   // first non-zero index.
    866   auto IsAllNonNegative = [&]() {
    867     for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
    868       KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
    869       if (Known.isNonNegative())
    870         continue;
    871       return false;
    872     }
    873 
    874     return true;
    875   };
    876 
    877   // FIXME: If the GEP is not inbounds, and there are extra indices after the
    878   // one we'll replace, those could cause the address computation to wrap
    879   // (rendering the IsAllNonNegative() check below insufficient). We can do
    880   // better, ignoring zero indices (and other indices we can prove small
    881   // enough not to wrap).
    882   if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds())
    883     return false;
    884 
    885   // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
    886   // also known to be dereferenceable.
    887   return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
    888          IsAllNonNegative();
    889 }
    890 
    891 // If we're indexing into an object with a variable index for the memory
    892 // access, but the object has only one element, we can assume that the index
    893 // will always be zero. If we replace the GEP, return it.
    894 template <typename T>
    895 static Instruction *replaceGEPIdxWithZero(InstCombinerImpl &IC, Value *Ptr,
    896                                           T &MemI) {
    897   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
    898     unsigned Idx;
    899     if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
    900       Instruction *NewGEPI = GEPI->clone();
    901       NewGEPI->setOperand(Idx,
    902         ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
    903       NewGEPI->insertBefore(GEPI);
    904       MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI);
    905       return NewGEPI;
    906     }
    907   }
    908 
    909   return nullptr;
    910 }
    911 
    912 static bool canSimplifyNullStoreOrGEP(StoreInst &SI) {
    913   if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()))
    914     return false;
    915 
    916   auto *Ptr = SI.getPointerOperand();
    917   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
    918     Ptr = GEPI->getOperand(0);
    919   return (isa<ConstantPointerNull>(Ptr) &&
    920           !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()));
    921 }
    922 
    923 static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
    924   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
    925     const Value *GEPI0 = GEPI->getOperand(0);
    926     if (isa<ConstantPointerNull>(GEPI0) &&
    927         !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace()))
    928       return true;
    929   }
    930   if (isa<UndefValue>(Op) ||
    931       (isa<ConstantPointerNull>(Op) &&
    932        !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace())))
    933     return true;
    934   return false;
    935 }
    936 
    937 Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
    938   Value *Op = LI.getOperand(0);
    939 
    940   // Try to canonicalize the loaded type.
    941   if (Instruction *Res = combineLoadToOperationType(*this, LI))
    942     return Res;
    943 
    944   // Attempt to improve the alignment.
    945   Align KnownAlign = getOrEnforceKnownAlignment(
    946       Op, DL.getPrefTypeAlign(LI.getType()), DL, &LI, &AC, &DT);
    947   if (KnownAlign > LI.getAlign())
    948     LI.setAlignment(KnownAlign);
    949 
    950   // Replace GEP indices if possible.
    951   if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) {
    952       Worklist.push(NewGEPI);
    953       return &LI;
    954   }
    955 
    956   if (Instruction *Res = unpackLoadToAggregate(*this, LI))
    957     return Res;
    958 
    959   // Do really simple store-to-load forwarding and load CSE, to catch cases
    960   // where there are several consecutive memory accesses to the same location,
    961   // separated by a few arithmetic operations.
    962   bool IsLoadCSE = false;
    963   if (Value *AvailableVal = FindAvailableLoadedValue(&LI, *AA, &IsLoadCSE)) {
    964     if (IsLoadCSE)
    965       combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
    966 
    967     return replaceInstUsesWith(
    968         LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
    969                                            LI.getName() + ".cast"));
    970   }
    971 
    972   // None of the following transforms are legal for volatile/ordered atomic
    973   // loads.  Most of them do apply for unordered atomics.
    974   if (!LI.isUnordered()) return nullptr;
    975 
    976   // load(gep null, ...) -> unreachable
    977   // load null/undef -> unreachable
    978   // TODO: Consider a target hook for valid address spaces for this xforms.
    979   if (canSimplifyNullLoadOrGEP(LI, Op)) {
    980     // Insert a new store to null instruction before the load to indicate
    981     // that this code is not reachable.  We do this instead of inserting
    982     // an unreachable instruction directly because we cannot modify the
    983     // CFG.
    984     StoreInst *SI = new StoreInst(UndefValue::get(LI.getType()),
    985                                   Constant::getNullValue(Op->getType()), &LI);
    986     SI->setDebugLoc(LI.getDebugLoc());
    987     return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
    988   }
    989 
    990   if (Op->hasOneUse()) {
    991     // Change select and PHI nodes to select values instead of addresses: this
    992     // helps alias analysis out a lot, allows many others simplifications, and
    993     // exposes redundancy in the code.
    994     //
    995     // Note that we cannot do the transformation unless we know that the
    996     // introduced loads cannot trap!  Something like this is valid as long as
    997     // the condition is always false: load (select bool %C, int* null, int* %G),
    998     // but it would not be valid if we transformed it to load from null
    999     // unconditionally.
   1000     //
   1001     if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
   1002       // load (select (Cond, &V1, &V2))  --> select(Cond, load &V1, load &V2).
   1003       Align Alignment = LI.getAlign();
   1004       if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(),
   1005                                       Alignment, DL, SI) &&
   1006           isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(),
   1007                                       Alignment, DL, SI)) {
   1008         LoadInst *V1 =
   1009             Builder.CreateLoad(LI.getType(), SI->getOperand(1),
   1010                                SI->getOperand(1)->getName() + ".val");
   1011         LoadInst *V2 =
   1012             Builder.CreateLoad(LI.getType(), SI->getOperand(2),
   1013                                SI->getOperand(2)->getName() + ".val");
   1014         assert(LI.isUnordered() && "implied by above");
   1015         V1->setAlignment(Alignment);
   1016         V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
   1017         V2->setAlignment(Alignment);
   1018         V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
   1019         return SelectInst::Create(SI->getCondition(), V1, V2);
   1020       }
   1021 
   1022       // load (select (cond, null, P)) -> load P
   1023       if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
   1024           !NullPointerIsDefined(SI->getFunction(),
   1025                                 LI.getPointerAddressSpace()))
   1026         return replaceOperand(LI, 0, SI->getOperand(2));
   1027 
   1028       // load (select (cond, P, null)) -> load P
   1029       if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
   1030           !NullPointerIsDefined(SI->getFunction(),
   1031                                 LI.getPointerAddressSpace()))
   1032         return replaceOperand(LI, 0, SI->getOperand(1));
   1033     }
   1034   }
   1035   return nullptr;
   1036 }
   1037 
   1038 /// Look for extractelement/insertvalue sequence that acts like a bitcast.
   1039 ///
   1040 /// \returns underlying value that was "cast", or nullptr otherwise.
   1041 ///
   1042 /// For example, if we have:
   1043 ///
   1044 ///     %E0 = extractelement <2 x double> %U, i32 0
   1045 ///     %V0 = insertvalue [2 x double] undef, double %E0, 0
   1046 ///     %E1 = extractelement <2 x double> %U, i32 1
   1047 ///     %V1 = insertvalue [2 x double] %V0, double %E1, 1
   1048 ///
   1049 /// and the layout of a <2 x double> is isomorphic to a [2 x double],
   1050 /// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
   1051 /// Note that %U may contain non-undef values where %V1 has undef.
   1052 static Value *likeBitCastFromVector(InstCombinerImpl &IC, Value *V) {
   1053   Value *U = nullptr;
   1054   while (auto *IV = dyn_cast<InsertValueInst>(V)) {
   1055     auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
   1056     if (!E)
   1057       return nullptr;
   1058     auto *W = E->getVectorOperand();
   1059     if (!U)
   1060       U = W;
   1061     else if (U != W)
   1062       return nullptr;
   1063     auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand());
   1064     if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
   1065       return nullptr;
   1066     V = IV->getAggregateOperand();
   1067   }
   1068   if (!match(V, m_Undef()) || !U)
   1069     return nullptr;
   1070 
   1071   auto *UT = cast<VectorType>(U->getType());
   1072   auto *VT = V->getType();
   1073   // Check that types UT and VT are bitwise isomorphic.
   1074   const auto &DL = IC.getDataLayout();
   1075   if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
   1076     return nullptr;
   1077   }
   1078   if (auto *AT = dyn_cast<ArrayType>(VT)) {
   1079     if (AT->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
   1080       return nullptr;
   1081   } else {
   1082     auto *ST = cast<StructType>(VT);
   1083     if (ST->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
   1084       return nullptr;
   1085     for (const auto *EltT : ST->elements()) {
   1086       if (EltT != UT->getElementType())
   1087         return nullptr;
   1088     }
   1089   }
   1090   return U;
   1091 }
   1092 
   1093 /// Combine stores to match the type of value being stored.
   1094 ///
   1095 /// The core idea here is that the memory does not have any intrinsic type and
   1096 /// where we can we should match the type of a store to the type of value being
   1097 /// stored.
   1098 ///
   1099 /// However, this routine must never change the width of a store or the number of
   1100 /// stores as that would introduce a semantic change. This combine is expected to
   1101 /// be a semantic no-op which just allows stores to more closely model the types
   1102 /// of their incoming values.
   1103 ///
   1104 /// Currently, we also refuse to change the precise type used for an atomic or
   1105 /// volatile store. This is debatable, and might be reasonable to change later.
   1106 /// However, it is risky in case some backend or other part of LLVM is relying
   1107 /// on the exact type stored to select appropriate atomic operations.
   1108 ///
   1109 /// \returns true if the store was successfully combined away. This indicates
   1110 /// the caller must erase the store instruction. We have to let the caller erase
   1111 /// the store instruction as otherwise there is no way to signal whether it was
   1112 /// combined or not: IC.EraseInstFromFunction returns a null pointer.
   1113 static bool combineStoreToValueType(InstCombinerImpl &IC, StoreInst &SI) {
   1114   // FIXME: We could probably with some care handle both volatile and ordered
   1115   // atomic stores here but it isn't clear that this is important.
   1116   if (!SI.isUnordered())
   1117     return false;
   1118 
   1119   // swifterror values can't be bitcasted.
   1120   if (SI.getPointerOperand()->isSwiftError())
   1121     return false;
   1122 
   1123   Value *V = SI.getValueOperand();
   1124 
   1125   // Fold away bit casts of the stored value by storing the original type.
   1126   if (auto *BC = dyn_cast<BitCastInst>(V)) {
   1127     assert(!BC->getType()->isX86_AMXTy() &&
   1128            "store to x86_amx* should not happen!");
   1129     V = BC->getOperand(0);
   1130     // Don't transform when the type is x86_amx, it makes the pass that lower
   1131     // x86_amx type happy.
   1132     if (V->getType()->isX86_AMXTy())
   1133       return false;
   1134     if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
   1135       combineStoreToNewValue(IC, SI, V);
   1136       return true;
   1137     }
   1138   }
   1139 
   1140   if (Value *U = likeBitCastFromVector(IC, V))
   1141     if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
   1142       combineStoreToNewValue(IC, SI, U);
   1143       return true;
   1144     }
   1145 
   1146   // FIXME: We should also canonicalize stores of vectors when their elements
   1147   // are cast to other types.
   1148   return false;
   1149 }
   1150 
   1151 static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
   1152   // FIXME: We could probably with some care handle both volatile and atomic
   1153   // stores here but it isn't clear that this is important.
   1154   if (!SI.isSimple())
   1155     return false;
   1156 
   1157   Value *V = SI.getValueOperand();
   1158   Type *T = V->getType();
   1159 
   1160   if (!T->isAggregateType())
   1161     return false;
   1162 
   1163   if (auto *ST = dyn_cast<StructType>(T)) {
   1164     // If the struct only have one element, we unpack.
   1165     unsigned Count = ST->getNumElements();
   1166     if (Count == 1) {
   1167       V = IC.Builder.CreateExtractValue(V, 0);
   1168       combineStoreToNewValue(IC, SI, V);
   1169       return true;
   1170     }
   1171 
   1172     // We don't want to break loads with padding here as we'd loose
   1173     // the knowledge that padding exists for the rest of the pipeline.
   1174     const DataLayout &DL = IC.getDataLayout();
   1175     auto *SL = DL.getStructLayout(ST);
   1176     if (SL->hasPadding())
   1177       return false;
   1178 
   1179     const auto Align = SI.getAlign();
   1180 
   1181     SmallString<16> EltName = V->getName();
   1182     EltName += ".elt";
   1183     auto *Addr = SI.getPointerOperand();
   1184     SmallString<16> AddrName = Addr->getName();
   1185     AddrName += ".repack";
   1186 
   1187     auto *IdxType = Type::getInt32Ty(ST->getContext());
   1188     auto *Zero = ConstantInt::get(IdxType, 0);
   1189     for (unsigned i = 0; i < Count; i++) {
   1190       Value *Indices[2] = {
   1191         Zero,
   1192         ConstantInt::get(IdxType, i),
   1193       };
   1194       auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
   1195                                                AddrName);
   1196       auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
   1197       auto EltAlign = commonAlignment(Align, SL->getElementOffset(i));
   1198       llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
   1199       AAMDNodes AAMD;
   1200       SI.getAAMetadata(AAMD);
   1201       NS->setAAMetadata(AAMD);
   1202     }
   1203 
   1204     return true;
   1205   }
   1206 
   1207   if (auto *AT = dyn_cast<ArrayType>(T)) {
   1208     // If the array only have one element, we unpack.
   1209     auto NumElements = AT->getNumElements();
   1210     if (NumElements == 1) {
   1211       V = IC.Builder.CreateExtractValue(V, 0);
   1212       combineStoreToNewValue(IC, SI, V);
   1213       return true;
   1214     }
   1215 
   1216     // Bail out if the array is too large. Ideally we would like to optimize
   1217     // arrays of arbitrary size but this has a terrible impact on compile time.
   1218     // The threshold here is chosen arbitrarily, maybe needs a little bit of
   1219     // tuning.
   1220     if (NumElements > IC.MaxArraySizeForCombine)
   1221       return false;
   1222 
   1223     const DataLayout &DL = IC.getDataLayout();
   1224     auto EltSize = DL.getTypeAllocSize(AT->getElementType());
   1225     const auto Align = SI.getAlign();
   1226 
   1227     SmallString<16> EltName = V->getName();
   1228     EltName += ".elt";
   1229     auto *Addr = SI.getPointerOperand();
   1230     SmallString<16> AddrName = Addr->getName();
   1231     AddrName += ".repack";
   1232 
   1233     auto *IdxType = Type::getInt64Ty(T->getContext());
   1234     auto *Zero = ConstantInt::get(IdxType, 0);
   1235 
   1236     uint64_t Offset = 0;
   1237     for (uint64_t i = 0; i < NumElements; i++) {
   1238       Value *Indices[2] = {
   1239         Zero,
   1240         ConstantInt::get(IdxType, i),
   1241       };
   1242       auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
   1243                                                AddrName);
   1244       auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
   1245       auto EltAlign = commonAlignment(Align, Offset);
   1246       Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
   1247       AAMDNodes AAMD;
   1248       SI.getAAMetadata(AAMD);
   1249       NS->setAAMetadata(AAMD);
   1250       Offset += EltSize;
   1251     }
   1252 
   1253     return true;
   1254   }
   1255 
   1256   return false;
   1257 }
   1258 
   1259 /// equivalentAddressValues - Test if A and B will obviously have the same
   1260 /// value. This includes recognizing that %t0 and %t1 will have the same
   1261 /// value in code like this:
   1262 ///   %t0 = getelementptr \@a, 0, 3
   1263 ///   store i32 0, i32* %t0
   1264 ///   %t1 = getelementptr \@a, 0, 3
   1265 ///   %t2 = load i32* %t1
   1266 ///
   1267 static bool equivalentAddressValues(Value *A, Value *B) {
   1268   // Test if the values are trivially equivalent.
   1269   if (A == B) return true;
   1270 
   1271   // Test if the values come form identical arithmetic instructions.
   1272   // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
   1273   // its only used to compare two uses within the same basic block, which
   1274   // means that they'll always either have the same value or one of them
   1275   // will have an undefined value.
   1276   if (isa<BinaryOperator>(A) ||
   1277       isa<CastInst>(A) ||
   1278       isa<PHINode>(A) ||
   1279       isa<GetElementPtrInst>(A))
   1280     if (Instruction *BI = dyn_cast<Instruction>(B))
   1281       if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
   1282         return true;
   1283 
   1284   // Otherwise they may not be equivalent.
   1285   return false;
   1286 }
   1287 
   1288 /// Converts store (bitcast (load (bitcast (select ...)))) to
   1289 /// store (load (select ...)), where select is minmax:
   1290 /// select ((cmp load V1, load V2), V1, V2).
   1291 static bool removeBitcastsFromLoadStoreOnMinMax(InstCombinerImpl &IC,
   1292                                                 StoreInst &SI) {
   1293   // bitcast?
   1294   if (!match(SI.getPointerOperand(), m_BitCast(m_Value())))
   1295     return false;
   1296   // load? integer?
   1297   Value *LoadAddr;
   1298   if (!match(SI.getValueOperand(), m_Load(m_BitCast(m_Value(LoadAddr)))))
   1299     return false;
   1300   auto *LI = cast<LoadInst>(SI.getValueOperand());
   1301   if (!LI->getType()->isIntegerTy())
   1302     return false;
   1303   Type *CmpLoadTy;
   1304   if (!isMinMaxWithLoads(LoadAddr, CmpLoadTy))
   1305     return false;
   1306 
   1307   // Make sure the type would actually change.
   1308   // This condition can be hit with chains of bitcasts.
   1309   if (LI->getType() == CmpLoadTy)
   1310     return false;
   1311 
   1312   // Make sure we're not changing the size of the load/store.
   1313   const auto &DL = IC.getDataLayout();
   1314   if (DL.getTypeStoreSizeInBits(LI->getType()) !=
   1315       DL.getTypeStoreSizeInBits(CmpLoadTy))
   1316     return false;
   1317 
   1318   if (!all_of(LI->users(), [LI, LoadAddr](User *U) {
   1319         auto *SI = dyn_cast<StoreInst>(U);
   1320         return SI && SI->getPointerOperand() != LI &&
   1321                InstCombiner::peekThroughBitcast(SI->getPointerOperand()) !=
   1322                    LoadAddr &&
   1323                !SI->getPointerOperand()->isSwiftError();
   1324       }))
   1325     return false;
   1326 
   1327   IC.Builder.SetInsertPoint(LI);
   1328   LoadInst *NewLI = IC.combineLoadToNewType(*LI, CmpLoadTy);
   1329   // Replace all the stores with stores of the newly loaded value.
   1330   for (auto *UI : LI->users()) {
   1331     auto *USI = cast<StoreInst>(UI);
   1332     IC.Builder.SetInsertPoint(USI);
   1333     combineStoreToNewValue(IC, *USI, NewLI);
   1334   }
   1335   IC.replaceInstUsesWith(*LI, UndefValue::get(LI->getType()));
   1336   IC.eraseInstFromFunction(*LI);
   1337   return true;
   1338 }
   1339 
   1340 Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) {
   1341   Value *Val = SI.getOperand(0);
   1342   Value *Ptr = SI.getOperand(1);
   1343 
   1344   // Try to canonicalize the stored type.
   1345   if (combineStoreToValueType(*this, SI))
   1346     return eraseInstFromFunction(SI);
   1347 
   1348   // Attempt to improve the alignment.
   1349   const Align KnownAlign = getOrEnforceKnownAlignment(
   1350       Ptr, DL.getPrefTypeAlign(Val->getType()), DL, &SI, &AC, &DT);
   1351   if (KnownAlign > SI.getAlign())
   1352     SI.setAlignment(KnownAlign);
   1353 
   1354   // Try to canonicalize the stored type.
   1355   if (unpackStoreToAggregate(*this, SI))
   1356     return eraseInstFromFunction(SI);
   1357 
   1358   if (removeBitcastsFromLoadStoreOnMinMax(*this, SI))
   1359     return eraseInstFromFunction(SI);
   1360 
   1361   // Replace GEP indices if possible.
   1362   if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
   1363       Worklist.push(NewGEPI);
   1364       return &SI;
   1365   }
   1366 
   1367   // Don't hack volatile/ordered stores.
   1368   // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
   1369   if (!SI.isUnordered()) return nullptr;
   1370 
   1371   // If the RHS is an alloca with a single use, zapify the store, making the
   1372   // alloca dead.
   1373   if (Ptr->hasOneUse()) {
   1374     if (isa<AllocaInst>(Ptr))
   1375       return eraseInstFromFunction(SI);
   1376     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
   1377       if (isa<AllocaInst>(GEP->getOperand(0))) {
   1378         if (GEP->getOperand(0)->hasOneUse())
   1379           return eraseInstFromFunction(SI);
   1380       }
   1381     }
   1382   }
   1383 
   1384   // If we have a store to a location which is known constant, we can conclude
   1385   // that the store must be storing the constant value (else the memory
   1386   // wouldn't be constant), and this must be a noop.
   1387   if (AA->pointsToConstantMemory(Ptr))
   1388     return eraseInstFromFunction(SI);
   1389 
   1390   // Do really simple DSE, to catch cases where there are several consecutive
   1391   // stores to the same location, separated by a few arithmetic operations. This
   1392   // situation often occurs with bitfield accesses.
   1393   BasicBlock::iterator BBI(SI);
   1394   for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
   1395        --ScanInsts) {
   1396     --BBI;
   1397     // Don't count debug info directives, lest they affect codegen,
   1398     // and we skip pointer-to-pointer bitcasts, which are NOPs.
   1399     if (BBI->isDebugOrPseudoInst() ||
   1400         (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
   1401       ScanInsts++;
   1402       continue;
   1403     }
   1404 
   1405     if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
   1406       // Prev store isn't volatile, and stores to the same location?
   1407       if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1),
   1408                                                         SI.getOperand(1))) {
   1409         ++NumDeadStore;
   1410         // Manually add back the original store to the worklist now, so it will
   1411         // be processed after the operands of the removed store, as this may
   1412         // expose additional DSE opportunities.
   1413         Worklist.push(&SI);
   1414         eraseInstFromFunction(*PrevSI);
   1415         return nullptr;
   1416       }
   1417       break;
   1418     }
   1419 
   1420     // If this is a load, we have to stop.  However, if the loaded value is from
   1421     // the pointer we're loading and is producing the pointer we're storing,
   1422     // then *this* store is dead (X = load P; store X -> P).
   1423     if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
   1424       if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) {
   1425         assert(SI.isUnordered() && "can't eliminate ordering operation");
   1426         return eraseInstFromFunction(SI);
   1427       }
   1428 
   1429       // Otherwise, this is a load from some other location.  Stores before it
   1430       // may not be dead.
   1431       break;
   1432     }
   1433 
   1434     // Don't skip over loads, throws or things that can modify memory.
   1435     if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
   1436       break;
   1437   }
   1438 
   1439   // store X, null    -> turns into 'unreachable' in SimplifyCFG
   1440   // store X, GEP(null, Y) -> turns into 'unreachable' in SimplifyCFG
   1441   if (canSimplifyNullStoreOrGEP(SI)) {
   1442     if (!isa<UndefValue>(Val))
   1443       return replaceOperand(SI, 0, UndefValue::get(Val->getType()));
   1444     return nullptr;  // Do not modify these!
   1445   }
   1446 
   1447   // store undef, Ptr -> noop
   1448   if (isa<UndefValue>(Val))
   1449     return eraseInstFromFunction(SI);
   1450 
   1451   return nullptr;
   1452 }
   1453 
   1454 /// Try to transform:
   1455 ///   if () { *P = v1; } else { *P = v2 }
   1456 /// or:
   1457 ///   *P = v1; if () { *P = v2; }
   1458 /// into a phi node with a store in the successor.
   1459 bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
   1460   if (!SI.isUnordered())
   1461     return false; // This code has not been audited for volatile/ordered case.
   1462 
   1463   // Check if the successor block has exactly 2 incoming edges.
   1464   BasicBlock *StoreBB = SI.getParent();
   1465   BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
   1466   if (!DestBB->hasNPredecessors(2))
   1467     return false;
   1468 
   1469   // Capture the other block (the block that doesn't contain our store).
   1470   pred_iterator PredIter = pred_begin(DestBB);
   1471   if (*PredIter == StoreBB)
   1472     ++PredIter;
   1473   BasicBlock *OtherBB = *PredIter;
   1474 
   1475   // Bail out if all of the relevant blocks aren't distinct. This can happen,
   1476   // for example, if SI is in an infinite loop.
   1477   if (StoreBB == DestBB || OtherBB == DestBB)
   1478     return false;
   1479 
   1480   // Verify that the other block ends in a branch and is not otherwise empty.
   1481   BasicBlock::iterator BBI(OtherBB->getTerminator());
   1482   BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
   1483   if (!OtherBr || BBI == OtherBB->begin())
   1484     return false;
   1485 
   1486   // If the other block ends in an unconditional branch, check for the 'if then
   1487   // else' case. There is an instruction before the branch.
   1488   StoreInst *OtherStore = nullptr;
   1489   if (OtherBr->isUnconditional()) {
   1490     --BBI;
   1491     // Skip over debugging info.
   1492     while (isa<DbgInfoIntrinsic>(BBI) ||
   1493            (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
   1494       if (BBI==OtherBB->begin())
   1495         return false;
   1496       --BBI;
   1497     }
   1498     // If this isn't a store, isn't a store to the same location, or is not the
   1499     // right kind of store, bail out.
   1500     OtherStore = dyn_cast<StoreInst>(BBI);
   1501     if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
   1502         !SI.isSameOperationAs(OtherStore))
   1503       return false;
   1504   } else {
   1505     // Otherwise, the other block ended with a conditional branch. If one of the
   1506     // destinations is StoreBB, then we have the if/then case.
   1507     if (OtherBr->getSuccessor(0) != StoreBB &&
   1508         OtherBr->getSuccessor(1) != StoreBB)
   1509       return false;
   1510 
   1511     // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
   1512     // if/then triangle. See if there is a store to the same ptr as SI that
   1513     // lives in OtherBB.
   1514     for (;; --BBI) {
   1515       // Check to see if we find the matching store.
   1516       if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
   1517         if (OtherStore->getOperand(1) != SI.getOperand(1) ||
   1518             !SI.isSameOperationAs(OtherStore))
   1519           return false;
   1520         break;
   1521       }
   1522       // If we find something that may be using or overwriting the stored
   1523       // value, or if we run out of instructions, we can't do the transform.
   1524       if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
   1525           BBI->mayWriteToMemory() || BBI == OtherBB->begin())
   1526         return false;
   1527     }
   1528 
   1529     // In order to eliminate the store in OtherBr, we have to make sure nothing
   1530     // reads or overwrites the stored value in StoreBB.
   1531     for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
   1532       // FIXME: This should really be AA driven.
   1533       if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
   1534         return false;
   1535     }
   1536   }
   1537 
   1538   // Insert a PHI node now if we need it.
   1539   Value *MergedVal = OtherStore->getOperand(0);
   1540   // The debug locations of the original instructions might differ. Merge them.
   1541   DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(),
   1542                                                      OtherStore->getDebugLoc());
   1543   if (MergedVal != SI.getOperand(0)) {
   1544     PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge");
   1545     PN->addIncoming(SI.getOperand(0), SI.getParent());
   1546     PN->addIncoming(OtherStore->getOperand(0), OtherBB);
   1547     MergedVal = InsertNewInstBefore(PN, DestBB->front());
   1548     PN->setDebugLoc(MergedLoc);
   1549   }
   1550 
   1551   // Advance to a place where it is safe to insert the new store and insert it.
   1552   BBI = DestBB->getFirstInsertionPt();
   1553   StoreInst *NewSI =
   1554       new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), SI.getAlign(),
   1555                     SI.getOrdering(), SI.getSyncScopeID());
   1556   InsertNewInstBefore(NewSI, *BBI);
   1557   NewSI->setDebugLoc(MergedLoc);
   1558 
   1559   // If the two stores had AA tags, merge them.
   1560   AAMDNodes AATags;
   1561   SI.getAAMetadata(AATags);
   1562   if (AATags) {
   1563     OtherStore->getAAMetadata(AATags, /* Merge = */ true);
   1564     NewSI->setAAMetadata(AATags);
   1565   }
   1566 
   1567   // Nuke the old stores.
   1568   eraseInstFromFunction(SI);
   1569   eraseInstFromFunction(*OtherStore);
   1570   return true;
   1571 }
   1572