Home | History | Annotate | Line # | Download | only in CodeGen
      1 //===- AtomicExpandPass.cpp - Expand atomic instructions ------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file contains a pass (at IR level) to replace atomic instructions with
     10 // __atomic_* library calls, or target specific instruction which implement the
     11 // same semantics in a way which better fits the target backend.  This can
     12 // include the use of (intrinsic-based) load-linked/store-conditional loops,
     13 // AtomicCmpXchg, or type coercions.
     14 //
     15 //===----------------------------------------------------------------------===//
     16 
     17 #include "llvm/ADT/ArrayRef.h"
     18 #include "llvm/ADT/STLExtras.h"
     19 #include "llvm/ADT/SmallVector.h"
     20 #include "llvm/CodeGen/AtomicExpandUtils.h"
     21 #include "llvm/CodeGen/RuntimeLibcalls.h"
     22 #include "llvm/CodeGen/TargetLowering.h"
     23 #include "llvm/CodeGen/TargetPassConfig.h"
     24 #include "llvm/CodeGen/TargetSubtargetInfo.h"
     25 #include "llvm/CodeGen/ValueTypes.h"
     26 #include "llvm/IR/Attributes.h"
     27 #include "llvm/IR/BasicBlock.h"
     28 #include "llvm/IR/Constant.h"
     29 #include "llvm/IR/Constants.h"
     30 #include "llvm/IR/DataLayout.h"
     31 #include "llvm/IR/DerivedTypes.h"
     32 #include "llvm/IR/Function.h"
     33 #include "llvm/IR/IRBuilder.h"
     34 #include "llvm/IR/InstIterator.h"
     35 #include "llvm/IR/Instruction.h"
     36 #include "llvm/IR/Instructions.h"
     37 #include "llvm/IR/Module.h"
     38 #include "llvm/IR/Type.h"
     39 #include "llvm/IR/User.h"
     40 #include "llvm/IR/Value.h"
     41 #include "llvm/InitializePasses.h"
     42 #include "llvm/Pass.h"
     43 #include "llvm/Support/AtomicOrdering.h"
     44 #include "llvm/Support/Casting.h"
     45 #include "llvm/Support/Debug.h"
     46 #include "llvm/Support/ErrorHandling.h"
     47 #include "llvm/Support/raw_ostream.h"
     48 #include "llvm/Target/TargetMachine.h"
     49 #include <cassert>
     50 #include <cstdint>
     51 #include <iterator>
     52 
     53 using namespace llvm;
     54 
     55 #define DEBUG_TYPE "atomic-expand"
     56 
     57 namespace {
     58 
     59   class AtomicExpand: public FunctionPass {
     60     const TargetLowering *TLI = nullptr;
     61 
     62   public:
     63     static char ID; // Pass identification, replacement for typeid
     64 
     65     AtomicExpand() : FunctionPass(ID) {
     66       initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
     67     }
     68 
     69     bool runOnFunction(Function &F) override;
     70 
     71   private:
     72     bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
     73     IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
     74     LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
     75     bool tryExpandAtomicLoad(LoadInst *LI);
     76     bool expandAtomicLoadToLL(LoadInst *LI);
     77     bool expandAtomicLoadToCmpXchg(LoadInst *LI);
     78     StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
     79     bool expandAtomicStore(StoreInst *SI);
     80     bool tryExpandAtomicRMW(AtomicRMWInst *AI);
     81     Value *
     82     insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
     83                       Align AddrAlign, AtomicOrdering MemOpOrder,
     84                       function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
     85     void expandAtomicOpToLLSC(
     86         Instruction *I, Type *ResultTy, Value *Addr, Align AddrAlign,
     87         AtomicOrdering MemOpOrder,
     88         function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
     89     void expandPartwordAtomicRMW(
     90         AtomicRMWInst *I,
     91         TargetLoweringBase::AtomicExpansionKind ExpansionKind);
     92     AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI);
     93     bool expandPartwordCmpXchg(AtomicCmpXchgInst *I);
     94     void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI);
     95     void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI);
     96 
     97     AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
     98     static Value *insertRMWCmpXchgLoop(
     99         IRBuilder<> &Builder, Type *ResultType, Value *Addr, Align AddrAlign,
    100         AtomicOrdering MemOpOrder, SyncScope::ID SSID,
    101         function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
    102         CreateCmpXchgInstFun CreateCmpXchg);
    103     bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);
    104 
    105     bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
    106     bool isIdempotentRMW(AtomicRMWInst *RMWI);
    107     bool simplifyIdempotentRMW(AtomicRMWInst *RMWI);
    108 
    109     bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, Align Alignment,
    110                                  Value *PointerOperand, Value *ValueOperand,
    111                                  Value *CASExpected, AtomicOrdering Ordering,
    112                                  AtomicOrdering Ordering2,
    113                                  ArrayRef<RTLIB::Libcall> Libcalls);
    114     void expandAtomicLoadToLibcall(LoadInst *LI);
    115     void expandAtomicStoreToLibcall(StoreInst *LI);
    116     void expandAtomicRMWToLibcall(AtomicRMWInst *I);
    117     void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);
    118 
    119     friend bool
    120     llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
    121                                    CreateCmpXchgInstFun CreateCmpXchg);
    122   };
    123 
    124 } // end anonymous namespace
    125 
// Definition of the static pass ID member declared in AtomicExpand.
char AtomicExpand::ID = 0;

// Exposes the pass ID as llvm::AtomicExpandID so code outside this file can
// refer to the pass.
char &llvm::AtomicExpandID = AtomicExpand::ID;

// Registers the pass under the DEBUG_TYPE name ("atomic-expand").
// NOTE(review): the two trailing `false` arguments are presumably the
// cfg-only and is-analysis flags — confirm against INITIALIZE_PASS.
INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions",
                false, false)

/// Factory for the pass; returns a newly allocated AtomicExpand instance.
FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); }
    134 
    135 // Helper functions to retrieve the size of atomic instructions.
    136 static unsigned getAtomicOpSize(LoadInst *LI) {
    137   const DataLayout &DL = LI->getModule()->getDataLayout();
    138   return DL.getTypeStoreSize(LI->getType());
    139 }
    140 
    141 static unsigned getAtomicOpSize(StoreInst *SI) {
    142   const DataLayout &DL = SI->getModule()->getDataLayout();
    143   return DL.getTypeStoreSize(SI->getValueOperand()->getType());
    144 }
    145 
    146 static unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
    147   const DataLayout &DL = RMWI->getModule()->getDataLayout();
    148   return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
    149 }
    150 
    151 static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
    152   const DataLayout &DL = CASI->getModule()->getDataLayout();
    153   return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
    154 }
    155 
    156 // Determine if a particular atomic operation has a supported size,
    157 // and is of appropriate alignment, to be passed through for target
    158 // lowering. (Versus turning into a __atomic libcall)
    159 template <typename Inst>
    160 static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
    161   unsigned Size = getAtomicOpSize(I);
    162   Align Alignment = I->getAlign();
    163   return Alignment >= Size &&
    164          Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
    165 }
    166 
/// Entry point of the pass: gathers every atomic instruction in \p F other
/// than fences and rewrites each one in turn.
///
/// For each instruction the steps are, in order:
///   1. If atomicSizeSupported() rejects it, replace it with a libcall and
///      move on to the next instruction.
///   2. If the target wants explicit fences for it, weaken its ordering to
///      monotonic and bracket it with fences.
///   3. Apply the kind-specific coercion to an integer type and the
///      kind-specific expansion.
///
/// \returns true if anything was changed. Returns false immediately when no
/// TargetPassConfig is available or the subtarget disables atomic expansion.
bool AtomicExpand::runOnFunction(Function &F) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  auto &TM = TPC->getTM<TargetMachine>();
  if (!TM.getSubtargetImpl(F)->enableAtomicExpand())
    return false;
  // Cache the lowering hooks; the helper member functions read them via TLI.
  TLI = TM.getSubtargetImpl(F)->getTargetLowering();

  SmallVector<Instruction *, 1> AtomicInsts;

  // Changing control-flow while iterating through it is a bad idea, so gather a
  // list of all atomic instructions before we start.
  for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
    Instruction *I = &*II;
    if (I->isAtomic() && !isa<FenceInst>(I))
      AtomicInsts.push_back(I);
  }

  bool MadeChange = false;
  for (auto I : AtomicInsts) {
    // Exactly one of these four is expected to be non-null (asserted below).
    auto LI = dyn_cast<LoadInst>(I);
    auto SI = dyn_cast<StoreInst>(I);
    auto RMWI = dyn_cast<AtomicRMWInst>(I);
    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
    assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");

    // If the Size/Alignment is not supported, replace with a libcall.
    if (LI) {
      if (!atomicSizeSupported(TLI, LI)) {
        expandAtomicLoadToLibcall(LI);
        MadeChange = true;
        continue;
      }
    } else if (SI) {
      if (!atomicSizeSupported(TLI, SI)) {
        expandAtomicStoreToLibcall(SI);
        MadeChange = true;
        continue;
      }
    } else if (RMWI) {
      if (!atomicSizeSupported(TLI, RMWI)) {
        expandAtomicRMWToLibcall(RMWI);
        MadeChange = true;
        continue;
      }
    } else if (CASI) {
      if (!atomicSizeSupported(TLI, CASI)) {
        expandAtomicCASToLibcall(CASI);
        MadeChange = true;
        continue;
      }
    }

    if (TLI->shouldInsertFencesForAtomic(I)) {
      // Monotonic doubles as "no fence needed": it is only overwritten when
      // the instruction's own ordering is moved into fences below.
      auto FenceOrdering = AtomicOrdering::Monotonic;
      if (LI && isAcquireOrStronger(LI->getOrdering())) {
        FenceOrdering = LI->getOrdering();
        LI->setOrdering(AtomicOrdering::Monotonic);
      } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
        FenceOrdering = SI->getOrdering();
        SI->setOrdering(AtomicOrdering::Monotonic);
      } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
                          isAcquireOrStronger(RMWI->getOrdering()))) {
        FenceOrdering = RMWI->getOrdering();
        RMWI->setOrdering(AtomicOrdering::Monotonic);
      } else if (CASI &&
                 TLI->shouldExpandAtomicCmpXchgInIR(CASI) ==
                     TargetLoweringBase::AtomicExpansionKind::None &&
                 (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
                  isAcquireOrStronger(CASI->getSuccessOrdering()))) {
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
        // insertion, with a stronger one on the success path than on the
        // failure path. As a result, fence insertion is directly done by
        // expandAtomicCmpXchg in that case.
        FenceOrdering = CASI->getSuccessOrdering();
        CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
        CASI->setFailureOrdering(AtomicOrdering::Monotonic);
      }

      if (FenceOrdering != AtomicOrdering::Monotonic) {
        MadeChange |= bracketInstWithFences(I, FenceOrdering);
      }
    }

    if (LI) {
      if (LI->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        // The conversion erases the old load, so LI is rebound to the new
        // integer load.
        LI = convertAtomicLoadToIntegerType(LI);
        assert(LI->getType()->isIntegerTy() && "invariant broken");
        MadeChange = true;
      }

      MadeChange |= tryExpandAtomicLoad(LI);
    } else if (SI) {
      if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        // The conversion erases the old store, so SI is rebound to the new
        // integer store.
        SI = convertAtomicStoreToIntegerType(SI);
        assert(SI->getValueOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicStoreInIR(SI))
        MadeChange |= expandAtomicStore(SI);
    } else if (RMWI) {
      // There are two different ways of expanding RMW instructions:
      // - into a load if it is idempotent
      // - into a Cmpxchg/LL-SC loop otherwise
      // we try them in that order.

      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
        MadeChange = true;
      } else {
        unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
        unsigned ValueSize = getAtomicOpSize(RMWI);
        AtomicRMWInst::BinOp Op = RMWI->getOperation();
        // Only or/xor/and narrower than the minimum cmpxchg size are widened
        // here; other partword operations go straight to tryExpandAtomicRMW.
        if (ValueSize < MinCASSize &&
            (Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
             Op == AtomicRMWInst::And)) {
          RMWI = widenPartwordAtomicRMW(RMWI);
          MadeChange = true;
        }

        MadeChange |= tryExpandAtomicRMW(RMWI);
      }
    } else if (CASI) {
      // TODO: when we're ready to make the change at the IR level, we can
      // extend convertCmpXchgToInteger for floating point too.
      assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
             "unimplemented - floating point not legal at IR level");
      if (CASI->getCompareOperand()->getType()->isPointerTy() ) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        CASI = convertCmpXchgToIntegerType(CASI);
        assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      MadeChange |= tryExpandAtomicCmpXchg(CASI);
    }
  }
  return MadeChange;
}
    315 
    316 bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) {
    317   IRBuilder<> Builder(I);
    318 
    319   auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
    320 
    321   auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
    322   // We have a guard here because not every atomic operation generates a
    323   // trailing fence.
    324   if (TrailingFence)
    325     TrailingFence->moveAfter(I);
    326 
    327   return (LeadingFence || TrailingFence);
    328 }
    329 
    330 /// Get the iX type with the same bitwidth as T.
    331 IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
    332                                                        const DataLayout &DL) {
    333   EVT VT = TLI->getMemValueType(DL, T);
    334   unsigned BitWidth = VT.getStoreSizeInBits();
    335   assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
    336   return IntegerType::get(T->getContext(), BitWidth);
    337 }
    338 
    339 /// Convert an atomic load of a non-integral type to an integer load of the
    340 /// equivalent bitwidth.  See the function comment on
    341 /// convertAtomicStoreToIntegerType for background.
    342 LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
    343   auto *M = LI->getModule();
    344   Type *NewTy = getCorrespondingIntegerType(LI->getType(),
    345                                             M->getDataLayout());
    346 
    347   IRBuilder<> Builder(LI);
    348 
    349   Value *Addr = LI->getPointerOperand();
    350   Type *PT = PointerType::get(NewTy,
    351                               Addr->getType()->getPointerAddressSpace());
    352   Value *NewAddr = Builder.CreateBitCast(Addr, PT);
    353 
    354   auto *NewLI = Builder.CreateLoad(NewTy, NewAddr);
    355   NewLI->setAlignment(LI->getAlign());
    356   NewLI->setVolatile(LI->isVolatile());
    357   NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
    358   LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
    359 
    360   Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
    361   LI->replaceAllUsesWith(NewVal);
    362   LI->eraseFromParent();
    363   return NewLI;
    364 }
    365 
    366 bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
    367   switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
    368   case TargetLoweringBase::AtomicExpansionKind::None:
    369     return false;
    370   case TargetLoweringBase::AtomicExpansionKind::LLSC:
    371     expandAtomicOpToLLSC(
    372         LI, LI->getType(), LI->getPointerOperand(), LI->getAlign(),
    373         LI->getOrdering(),
    374         [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
    375     return true;
    376   case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    377     return expandAtomicLoadToLL(LI);
    378   case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    379     return expandAtomicLoadToCmpXchg(LI);
    380   default:
    381     llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
    382   }
    383 }
    384 
    385 bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
    386   IRBuilder<> Builder(LI);
    387 
    388   // On some architectures, load-linked instructions are atomic for larger
    389   // sizes than normal loads. For example, the only 64-bit load guaranteed
    390   // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
    391   Value *Val =
    392       TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
    393   TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
    394 
    395   LI->replaceAllUsesWith(Val);
    396   LI->eraseFromParent();
    397 
    398   return true;
    399 }
    400 
    401 bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
    402   IRBuilder<> Builder(LI);
    403   AtomicOrdering Order = LI->getOrdering();
    404   if (Order == AtomicOrdering::Unordered)
    405     Order = AtomicOrdering::Monotonic;
    406 
    407   Value *Addr = LI->getPointerOperand();
    408   Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
    409   Constant *DummyVal = Constant::getNullValue(Ty);
    410 
    411   Value *Pair = Builder.CreateAtomicCmpXchg(
    412       Addr, DummyVal, DummyVal, LI->getAlign(), Order,
    413       AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
    414   Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");
    415 
    416   LI->replaceAllUsesWith(Loaded);
    417   LI->eraseFromParent();
    418 
    419   return true;
    420 }
    421 
    422 /// Convert an atomic store of a non-integral type to an integer store of the
    423 /// equivalent bitwidth.  We used to not support floating point or vector
    424 /// atomics in the IR at all.  The backends learned to deal with the bitcast
    425 /// idiom because that was the only way of expressing the notion of a atomic
    426 /// float or vector store.  The long term plan is to teach each backend to
    427 /// instruction select from the original atomic store, but as a migration
    428 /// mechanism, we convert back to the old format which the backends understand.
    429 /// Each backend will need individual work to recognize the new format.
    430 StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
    431   IRBuilder<> Builder(SI);
    432   auto *M = SI->getModule();
    433   Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
    434                                             M->getDataLayout());
    435   Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);
    436 
    437   Value *Addr = SI->getPointerOperand();
    438   Type *PT = PointerType::get(NewTy,
    439                               Addr->getType()->getPointerAddressSpace());
    440   Value *NewAddr = Builder.CreateBitCast(Addr, PT);
    441 
    442   StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
    443   NewSI->setAlignment(SI->getAlign());
    444   NewSI->setVolatile(SI->isVolatile());
    445   NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
    446   LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
    447   SI->eraseFromParent();
    448   return NewSI;
    449 }
    450 
    451 bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
    452   // This function is only called on atomic stores that are too large to be
    453   // atomic if implemented as a native store. So we replace them by an
    454   // atomic swap, that can be implemented for example as a ldrex/strex on ARM
    455   // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
    456   // It is the responsibility of the target to only signal expansion via
    457   // shouldExpandAtomicRMW in cases where this is required and possible.
    458   IRBuilder<> Builder(SI);
    459   AtomicRMWInst *AI = Builder.CreateAtomicRMW(
    460       AtomicRMWInst::Xchg, SI->getPointerOperand(), SI->getValueOperand(),
    461       SI->getAlign(), SI->getOrdering());
    462   SI->eraseFromParent();
    463 
    464   // Now we have an appropriate swap instruction, lower it as usual.
    465   return tryExpandAtomicRMW(AI);
    466 }
    467 
    468 static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
    469                                  Value *Loaded, Value *NewVal, Align AddrAlign,
    470                                  AtomicOrdering MemOpOrder, SyncScope::ID SSID,
    471                                  Value *&Success, Value *&NewLoaded) {
    472   Type *OrigTy = NewVal->getType();
    473 
    474   // This code can go away when cmpxchg supports FP types.
    475   bool NeedBitcast = OrigTy->isFloatingPointTy();
    476   if (NeedBitcast) {
    477     IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits());
    478     unsigned AS = Addr->getType()->getPointerAddressSpace();
    479     Addr = Builder.CreateBitCast(Addr, IntTy->getPointerTo(AS));
    480     NewVal = Builder.CreateBitCast(NewVal, IntTy);
    481     Loaded = Builder.CreateBitCast(Loaded, IntTy);
    482   }
    483 
    484   Value *Pair = Builder.CreateAtomicCmpXchg(
    485       Addr, Loaded, NewVal, AddrAlign, MemOpOrder,
    486       AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
    487   Success = Builder.CreateExtractValue(Pair, 1, "success");
    488   NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
    489 
    490   if (NeedBitcast)
    491     NewLoaded = Builder.CreateBitCast(NewLoaded, OrigTy);
    492 }
    493 
    494 /// Emit IR to implement the given atomicrmw operation on values in registers,
    495 /// returning the new value.
    496 static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
    497                               Value *Loaded, Value *Inc) {
    498   Value *NewVal;
    499   switch (Op) {
    500   case AtomicRMWInst::Xchg:
    501     return Inc;
    502   case AtomicRMWInst::Add:
    503     return Builder.CreateAdd(Loaded, Inc, "new");
    504   case AtomicRMWInst::Sub:
    505     return Builder.CreateSub(Loaded, Inc, "new");
    506   case AtomicRMWInst::And:
    507     return Builder.CreateAnd(Loaded, Inc, "new");
    508   case AtomicRMWInst::Nand:
    509     return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
    510   case AtomicRMWInst::Or:
    511     return Builder.CreateOr(Loaded, Inc, "new");
    512   case AtomicRMWInst::Xor:
    513     return Builder.CreateXor(Loaded, Inc, "new");
    514   case AtomicRMWInst::Max:
    515     NewVal = Builder.CreateICmpSGT(Loaded, Inc);
    516     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
    517   case AtomicRMWInst::Min:
    518     NewVal = Builder.CreateICmpSLE(Loaded, Inc);
    519     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
    520   case AtomicRMWInst::UMax:
    521     NewVal = Builder.CreateICmpUGT(Loaded, Inc);
    522     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
    523   case AtomicRMWInst::UMin:
    524     NewVal = Builder.CreateICmpULE(Loaded, Inc);
    525     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
    526   case AtomicRMWInst::FAdd:
    527     return Builder.CreateFAdd(Loaded, Inc, "new");
    528   case AtomicRMWInst::FSub:
    529     return Builder.CreateFSub(Loaded, Inc, "new");
    530   default:
    531     llvm_unreachable("Unknown atomic op");
    532   }
    533 }
    534 
    535 bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
    536   switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
    537   case TargetLoweringBase::AtomicExpansionKind::None:
    538     return false;
    539   case TargetLoweringBase::AtomicExpansionKind::LLSC: {
    540     unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    541     unsigned ValueSize = getAtomicOpSize(AI);
    542     if (ValueSize < MinCASSize) {
    543       expandPartwordAtomicRMW(AI,
    544                               TargetLoweringBase::AtomicExpansionKind::LLSC);
    545     } else {
    546       auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
    547         return performAtomicOp(AI->getOperation(), Builder, Loaded,
    548                                AI->getValOperand());
    549       };
    550       expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(),
    551                            AI->getAlign(), AI->getOrdering(), PerformOp);
    552     }
    553     return true;
    554   }
    555   case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
    556     unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    557     unsigned ValueSize = getAtomicOpSize(AI);
    558     if (ValueSize < MinCASSize) {
    559       // TODO: Handle atomicrmw fadd/fsub
    560       if (AI->getType()->isFloatingPointTy())
    561         return false;
    562 
    563       expandPartwordAtomicRMW(AI,
    564                               TargetLoweringBase::AtomicExpansionKind::CmpXChg);
    565     } else {
    566       expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
    567     }
    568     return true;
    569   }
    570   case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic: {
    571     expandAtomicRMWToMaskedIntrinsic(AI);
    572     return true;
    573   }
    574   default:
    575     llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
    576   }
    577 }
    578 
    579 namespace {
    580 
/// Bundle of types and values describing how a (possibly narrower) value sits
/// inside the wider word that is actually accessed atomically. Filled in by
/// createMaskInstrs.
struct PartwordMaskValues {
  // These four fields are guaranteed to be set by createMaskInstrs.
  // WordType: type of the word that is accessed.
  Type *WordType = nullptr;
  // ValueType: type of the value the original operation works on.
  Type *ValueType = nullptr;
  // AlignedAddr: address used for the word-sized access.
  Value *AlignedAddr = nullptr;
  // AlignedAddrAlignment: alignment to use with AlignedAddr.
  Align AlignedAddrAlignment;
  // The remaining fields can be null. createMaskInstrs leaves them null when
  // ValueType and WordType are the same type.
  Value *ShiftAmt = nullptr;
  Value *Mask = nullptr;
  Value *Inv_Mask = nullptr;
};
    592 
    593 LLVM_ATTRIBUTE_UNUSED
    594 raw_ostream &operator<<(raw_ostream &O, const PartwordMaskValues &PMV) {
    595   auto PrintObj = [&O](auto *V) {
    596     if (V)
    597       O << *V;
    598     else
    599       O << "nullptr";
    600     O << '\n';
    601   };
    602   O << "PartwordMaskValues {\n";
    603   O << "  WordType: ";
    604   PrintObj(PMV.WordType);
    605   O << "  ValueType: ";
    606   PrintObj(PMV.ValueType);
    607   O << "  AlignedAddr: ";
    608   PrintObj(PMV.AlignedAddr);
    609   O << "  AlignedAddrAlignment: " << PMV.AlignedAddrAlignment.value() << '\n';
    610   O << "  ShiftAmt: ";
    611   PrintObj(PMV.ShiftAmt);
    612   O << "  Mask: ";
    613   PrintObj(PMV.Mask);
    614   O << "  Inv_Mask: ";
    615   PrintObj(PMV.Inv_Mask);
    616   O << "}\n";
    617   return O;
    618 }
    619 
    620 } // end anonymous namespace
    621 
    622 /// This is a helper function which builds instructions to provide
    623 /// values necessary for partword atomic operations. It takes an
    624 /// incoming address, Addr, and ValueType, and constructs the address,
    625 /// shift-amounts and masks needed to work with a larger value of size
    626 /// WordSize.
    627 ///
    628 /// AlignedAddr: Addr rounded down to a multiple of WordSize
    629 ///
    630 /// ShiftAmt: Number of bits to right-shift a WordSize value loaded
    631 ///           from AlignAddr for it to have the same value as if
    632 ///           ValueType was loaded from Addr.
    633 ///
    634 /// Mask: Value to mask with the value loaded from AlignAddr to
    635 ///       include only the part that would've been loaded from Addr.
    636 ///
    637 /// Inv_Mask: The inverse of Mask.
    638 static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
    639                                            Type *ValueType, Value *Addr,
    640                                            Align AddrAlign,
    641                                            unsigned MinWordSize) {
    642   PartwordMaskValues PMV;
    643 
    644   Module *M = I->getModule();
    645   LLVMContext &Ctx = M->getContext();
    646   const DataLayout &DL = M->getDataLayout();
    647   unsigned ValueSize = DL.getTypeStoreSize(ValueType);
    648 
    649   PMV.ValueType = ValueType;
    650   PMV.WordType = MinWordSize > ValueSize ? Type::getIntNTy(Ctx, MinWordSize * 8)
    651                                          : ValueType;
    652   if (PMV.ValueType == PMV.WordType) {
    653     PMV.AlignedAddr = Addr;
    654     PMV.AlignedAddrAlignment = AddrAlign;
    655     return PMV;
    656   }
    657 
    658   assert(ValueSize < MinWordSize);
    659 
    660   Type *WordPtrType =
    661       PMV.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());
    662 
    663   // TODO: we could skip some of this if AddrAlign >= MinWordSize.
    664   Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx));
    665   PMV.AlignedAddr = Builder.CreateIntToPtr(
    666       Builder.CreateAnd(AddrInt, ~(uint64_t)(MinWordSize - 1)), WordPtrType,
    667       "AlignedAddr");
    668   PMV.AlignedAddrAlignment = Align(MinWordSize);
    669 
    670   Value *PtrLSB = Builder.CreateAnd(AddrInt, MinWordSize - 1, "PtrLSB");
    671   if (DL.isLittleEndian()) {
    672     // turn bytes into bits
    673     PMV.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
    674   } else {
    675     // turn bytes into bits, and count from the other side.
    676     PMV.ShiftAmt = Builder.CreateShl(
    677         Builder.CreateXor(PtrLSB, MinWordSize - ValueSize), 3);
    678   }
    679 
    680   PMV.ShiftAmt = Builder.CreateTrunc(PMV.ShiftAmt, PMV.WordType, "ShiftAmt");
    681   PMV.Mask = Builder.CreateShl(
    682       ConstantInt::get(PMV.WordType, (1 << (ValueSize * 8)) - 1), PMV.ShiftAmt,
    683       "Mask");
    684   PMV.Inv_Mask = Builder.CreateNot(PMV.Mask, "Inv_Mask");
    685   return PMV;
    686 }
    687 
    688 static Value *extractMaskedValue(IRBuilder<> &Builder, Value *WideWord,
    689                                  const PartwordMaskValues &PMV) {
    690   assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
    691   if (PMV.WordType == PMV.ValueType)
    692     return WideWord;
    693 
    694   Value *Shift = Builder.CreateLShr(WideWord, PMV.ShiftAmt, "shifted");
    695   Value *Trunc = Builder.CreateTrunc(Shift, PMV.ValueType, "extracted");
    696   return Trunc;
    697 }
    698 
    699 static Value *insertMaskedValue(IRBuilder<> &Builder, Value *WideWord,
    700                                 Value *Updated, const PartwordMaskValues &PMV) {
    701   assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
    702   assert(Updated->getType() == PMV.ValueType && "Value type mismatch");
    703   if (PMV.WordType == PMV.ValueType)
    704     return Updated;
    705 
    706   Value *ZExt = Builder.CreateZExt(Updated, PMV.WordType, "extended");
    707   Value *Shift =
    708       Builder.CreateShl(ZExt, PMV.ShiftAmt, "shifted", /*HasNUW*/ true);
    709   Value *And = Builder.CreateAnd(WideWord, PMV.Inv_Mask, "unmasked");
    710   Value *Or = Builder.CreateOr(And, Shift, "inserted");
    711   return Or;
    712 }
    713 
    714 /// Emit IR to implement a masked version of a given atomicrmw
    715 /// operation. (That is, only the bits under the Mask should be
    716 /// affected by the operation)
    717 static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
    718                                     IRBuilder<> &Builder, Value *Loaded,
    719                                     Value *Shifted_Inc, Value *Inc,
    720                                     const PartwordMaskValues &PMV) {
    721   // TODO: update to use
    722   // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge in order
    723   // to merge bits from two values without requiring PMV.Inv_Mask.
    724   switch (Op) {
    725   case AtomicRMWInst::Xchg: {
    726     Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    727     Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc);
    728     return FinalVal;
    729   }
    730   case AtomicRMWInst::Or:
    731   case AtomicRMWInst::Xor:
    732   case AtomicRMWInst::And:
    733     llvm_unreachable("Or/Xor/And handled by widenPartwordAtomicRMW");
    734   case AtomicRMWInst::Add:
    735   case AtomicRMWInst::Sub:
    736   case AtomicRMWInst::Nand: {
    737     // The other arithmetic ops need to be masked into place.
    738     Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
    739     Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask);
    740     Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    741     Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked);
    742     return FinalVal;
    743   }
    744   case AtomicRMWInst::Max:
    745   case AtomicRMWInst::Min:
    746   case AtomicRMWInst::UMax:
    747   case AtomicRMWInst::UMin: {
    748     // Finally, comparison ops will operate on the full value, so
    749     // truncate down to the original size, and expand out again after
    750     // doing the operation.
    751     Value *Loaded_Extract = extractMaskedValue(Builder, Loaded, PMV);
    752     Value *NewVal = performAtomicOp(Op, Builder, Loaded_Extract, Inc);
    753     Value *FinalVal = insertMaskedValue(Builder, Loaded, NewVal, PMV);
    754     return FinalVal;
    755   }
    756   default:
    757     llvm_unreachable("Unknown atomic op");
    758   }
    759 }
    760 
    761 /// Expand a sub-word atomicrmw operation into an appropriate
    762 /// word-sized operation.
    763 ///
    764 /// It will create an LL/SC or cmpxchg loop, as appropriate, the same
    765 /// way as a typical atomicrmw expansion. The only difference here is
    766 /// that the operation inside of the loop may operate upon only a
    767 /// part of the value.
    768 void AtomicExpand::expandPartwordAtomicRMW(
    769     AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
    770   AtomicOrdering MemOpOrder = AI->getOrdering();
    771   SyncScope::ID SSID = AI->getSyncScopeID();
    772 
    773   IRBuilder<> Builder(AI);
    774 
    775   PartwordMaskValues PMV =
    776       createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
    777                        AI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);
    778 
    779   Value *ValOperand_Shifted =
    780       Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
    781                         PMV.ShiftAmt, "ValOperand_Shifted");
    782 
    783   auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) {
    784     return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded,
    785                                  ValOperand_Shifted, AI->getValOperand(), PMV);
    786   };
    787 
    788   Value *OldResult;
    789   if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
    790     OldResult = insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr,
    791                                      PMV.AlignedAddrAlignment, MemOpOrder,
    792                                      SSID, PerformPartwordOp,
    793                                      createCmpXchgInstFun);
    794   } else {
    795     assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
    796     OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr,
    797                                   PMV.AlignedAddrAlignment, MemOpOrder,
    798                                   PerformPartwordOp);
    799   }
    800 
    801   Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV);
    802   AI->replaceAllUsesWith(FinalOldResult);
    803   AI->eraseFromParent();
    804 }
    805 
    806 // Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width.
    807 AtomicRMWInst *AtomicExpand::widenPartwordAtomicRMW(AtomicRMWInst *AI) {
    808   IRBuilder<> Builder(AI);
    809   AtomicRMWInst::BinOp Op = AI->getOperation();
    810 
    811   assert((Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
    812           Op == AtomicRMWInst::And) &&
    813          "Unable to widen operation");
    814 
    815   PartwordMaskValues PMV =
    816       createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
    817                        AI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);
    818 
    819   Value *ValOperand_Shifted =
    820       Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
    821                         PMV.ShiftAmt, "ValOperand_Shifted");
    822 
    823   Value *NewOperand;
    824 
    825   if (Op == AtomicRMWInst::And)
    826     NewOperand =
    827         Builder.CreateOr(PMV.Inv_Mask, ValOperand_Shifted, "AndOperand");
    828   else
    829     NewOperand = ValOperand_Shifted;
    830 
    831   AtomicRMWInst *NewAI =
    832       Builder.CreateAtomicRMW(Op, PMV.AlignedAddr, NewOperand,
    833                               PMV.AlignedAddrAlignment, AI->getOrdering());
    834 
    835   Value *FinalOldResult = extractMaskedValue(Builder, NewAI, PMV);
    836   AI->replaceAllUsesWith(FinalOldResult);
    837   AI->eraseFromParent();
    838   return NewAI;
    839 }
    840 
    841 bool AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
    842   // The basic idea here is that we're expanding a cmpxchg of a
    843   // smaller memory size up to a word-sized cmpxchg. To do this, we
    844   // need to add a retry-loop for strong cmpxchg, so that
    845   // modifications to other parts of the word don't cause a spurious
    846   // failure.
    847 
    848   // This generates code like the following:
    849   //     [[Setup mask values PMV.*]]
    850   //     %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
    851   //     %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
    852   //     %InitLoaded = load i32* %addr
    853   //     %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
    854   //     br partword.cmpxchg.loop
    855   // partword.cmpxchg.loop:
    856   //     %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
    857   //        [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
    858   //     %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
    859   //     %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
    860   //     %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
    861   //        i32 %FullWord_NewVal success_ordering failure_ordering
    862   //     %OldVal = extractvalue { i32, i1 } %NewCI, 0
    863   //     %Success = extractvalue { i32, i1 } %NewCI, 1
    864   //     br i1 %Success, label %partword.cmpxchg.end,
    865   //        label %partword.cmpxchg.failure
    866   // partword.cmpxchg.failure:
    867   //     %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
    868   //     %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
    869   //     br i1 %ShouldContinue, label %partword.cmpxchg.loop,
    870   //         label %partword.cmpxchg.end
    871   // partword.cmpxchg.end:
    872   //    %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
    873   //    %FinalOldVal = trunc i32 %tmp1 to i8
    874   //    %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
    875   //    %Res = insertvalue { i8, i1 } %25, i1 %Success, 1
    876 
    877   Value *Addr = CI->getPointerOperand();
    878   Value *Cmp = CI->getCompareOperand();
    879   Value *NewVal = CI->getNewValOperand();
    880 
    881   BasicBlock *BB = CI->getParent();
    882   Function *F = BB->getParent();
    883   IRBuilder<> Builder(CI);
    884   LLVMContext &Ctx = Builder.getContext();
    885 
    886   BasicBlock *EndBB =
    887       BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end");
    888   auto FailureBB =
    889       BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB);
    890   auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB);
    891 
    892   // The split call above "helpfully" added a branch at the end of BB
    893   // (to the wrong place).
    894   std::prev(BB->end())->eraseFromParent();
    895   Builder.SetInsertPoint(BB);
    896 
    897   PartwordMaskValues PMV =
    898       createMaskInstrs(Builder, CI, CI->getCompareOperand()->getType(), Addr,
    899                        CI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);
    900 
    901   // Shift the incoming values over, into the right location in the word.
    902   Value *NewVal_Shifted =
    903       Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
    904   Value *Cmp_Shifted =
    905       Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt);
    906 
    907   // Load the entire current word, and mask into place the expected and new
    908   // values
    909   LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr);
    910   InitLoaded->setVolatile(CI->isVolatile());
    911   Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask);
    912   Builder.CreateBr(LoopBB);
    913 
    914   // partword.cmpxchg.loop:
    915   Builder.SetInsertPoint(LoopBB);
    916   PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2);
    917   Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB);
    918 
    919   // Mask/Or the expected and new values into place in the loaded word.
    920   Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted);
    921   Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);
    922   AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
    923       PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, PMV.AlignedAddrAlignment,
    924       CI->getSuccessOrdering(), CI->getFailureOrdering(), CI->getSyncScopeID());
    925   NewCI->setVolatile(CI->isVolatile());
    926   // When we're building a strong cmpxchg, we need a loop, so you
    927   // might think we could use a weak cmpxchg inside. But, using strong
    928   // allows the below comparison for ShouldContinue, and we're
    929   // expecting the underlying cmpxchg to be a machine instruction,
    930   // which is strong anyways.
    931   NewCI->setWeak(CI->isWeak());
    932 
    933   Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
    934   Value *Success = Builder.CreateExtractValue(NewCI, 1);
    935 
    936   if (CI->isWeak())
    937     Builder.CreateBr(EndBB);
    938   else
    939     Builder.CreateCondBr(Success, EndBB, FailureBB);
    940 
    941   // partword.cmpxchg.failure:
    942   Builder.SetInsertPoint(FailureBB);
    943   // Upon failure, verify that the masked-out part of the loaded value
    944   // has been modified.  If it didn't, abort the cmpxchg, since the
    945   // masked-in part must've.
    946   Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask);
    947   Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut);
    948   Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB);
    949 
    950   // Add the second value to the phi from above
    951   Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB);
    952 
    953   // partword.cmpxchg.end:
    954   Builder.SetInsertPoint(CI);
    955 
    956   Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV);
    957   Value *Res = UndefValue::get(CI->getType());
    958   Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
    959   Res = Builder.CreateInsertValue(Res, Success, 1);
    960 
    961   CI->replaceAllUsesWith(Res);
    962   CI->eraseFromParent();
    963   return true;
    964 }
    965 
    966 void AtomicExpand::expandAtomicOpToLLSC(
    967     Instruction *I, Type *ResultType, Value *Addr, Align AddrAlign,
    968     AtomicOrdering MemOpOrder,
    969     function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
    970   IRBuilder<> Builder(I);
    971   Value *Loaded = insertRMWLLSCLoop(Builder, ResultType, Addr, AddrAlign,
    972                                     MemOpOrder, PerformOp);
    973 
    974   I->replaceAllUsesWith(Loaded);
    975   I->eraseFromParent();
    976 }
    977 
    978 void AtomicExpand::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) {
    979   IRBuilder<> Builder(AI);
    980 
    981   PartwordMaskValues PMV =
    982       createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
    983                        AI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);
    984 
    985   // The value operand must be sign-extended for signed min/max so that the
    986   // target's signed comparison instructions can be used. Otherwise, just
    987   // zero-ext.
    988   Instruction::CastOps CastOp = Instruction::ZExt;
    989   AtomicRMWInst::BinOp RMWOp = AI->getOperation();
    990   if (RMWOp == AtomicRMWInst::Max || RMWOp == AtomicRMWInst::Min)
    991     CastOp = Instruction::SExt;
    992 
    993   Value *ValOperand_Shifted = Builder.CreateShl(
    994       Builder.CreateCast(CastOp, AI->getValOperand(), PMV.WordType),
    995       PMV.ShiftAmt, "ValOperand_Shifted");
    996   Value *OldResult = TLI->emitMaskedAtomicRMWIntrinsic(
    997       Builder, AI, PMV.AlignedAddr, ValOperand_Shifted, PMV.Mask, PMV.ShiftAmt,
    998       AI->getOrdering());
    999   Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV);
   1000   AI->replaceAllUsesWith(FinalOldResult);
   1001   AI->eraseFromParent();
   1002 }
   1003 
   1004 void AtomicExpand::expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI) {
   1005   IRBuilder<> Builder(CI);
   1006 
   1007   PartwordMaskValues PMV = createMaskInstrs(
   1008       Builder, CI, CI->getCompareOperand()->getType(), CI->getPointerOperand(),
   1009       CI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);
   1010 
   1011   Value *CmpVal_Shifted = Builder.CreateShl(
   1012       Builder.CreateZExt(CI->getCompareOperand(), PMV.WordType), PMV.ShiftAmt,
   1013       "CmpVal_Shifted");
   1014   Value *NewVal_Shifted = Builder.CreateShl(
   1015       Builder.CreateZExt(CI->getNewValOperand(), PMV.WordType), PMV.ShiftAmt,
   1016       "NewVal_Shifted");
   1017   Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic(
   1018       Builder, CI, PMV.AlignedAddr, CmpVal_Shifted, NewVal_Shifted, PMV.Mask,
   1019       CI->getSuccessOrdering());
   1020   Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV);
   1021   Value *Res = UndefValue::get(CI->getType());
   1022   Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
   1023   Value *Success = Builder.CreateICmpEQ(
   1024       CmpVal_Shifted, Builder.CreateAnd(OldVal, PMV.Mask), "Success");
   1025   Res = Builder.CreateInsertValue(Res, Success, 1);
   1026 
   1027   CI->replaceAllUsesWith(Res);
   1028   CI->eraseFromParent();
   1029 }
   1030 
   1031 Value *AtomicExpand::insertRMWLLSCLoop(
   1032     IRBuilder<> &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
   1033     AtomicOrdering MemOpOrder,
   1034     function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
   1035   LLVMContext &Ctx = Builder.getContext();
   1036   BasicBlock *BB = Builder.GetInsertBlock();
   1037   Function *F = BB->getParent();
   1038 
   1039   assert(AddrAlign >=
   1040              F->getParent()->getDataLayout().getTypeStoreSize(ResultTy) &&
   1041          "Expected at least natural alignment at this point.");
   1042 
   1043   // Given: atomicrmw some_op iN* %addr, iN %incr ordering
   1044   //
   1045   // The standard expansion we produce is:
   1046   //     [...]
   1047   // atomicrmw.start:
   1048   //     %loaded = @load.linked(%addr)
   1049   //     %new = some_op iN %loaded, %incr
   1050   //     %stored = @store_conditional(%new, %addr)
   1051   //     %try_again = icmp i32 ne %stored, 0
   1052   //     br i1 %try_again, label %loop, label %atomicrmw.end
   1053   // atomicrmw.end:
   1054   //     [...]
   1055   BasicBlock *ExitBB =
   1056       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
   1057   BasicBlock *LoopBB =  BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
   1058 
   1059   // The split call above "helpfully" added a branch at the end of BB (to the
   1060   // wrong place).
   1061   std::prev(BB->end())->eraseFromParent();
   1062   Builder.SetInsertPoint(BB);
   1063   Builder.CreateBr(LoopBB);
   1064 
   1065   // Start the main loop block now that we've taken care of the preliminaries.
   1066   Builder.SetInsertPoint(LoopBB);
   1067   Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
   1068 
   1069   Value *NewVal = PerformOp(Builder, Loaded);
   1070 
   1071   Value *StoreSuccess =
   1072       TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
   1073   Value *TryAgain = Builder.CreateICmpNE(
   1074       StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
   1075   Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
   1076 
   1077   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
   1078   return Loaded;
   1079 }
   1080 
   1081 /// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
   1082 /// the equivalent bitwidth.  We used to not support pointer cmpxchg in the
   1083 /// IR.  As a migration step, we convert back to what use to be the standard
   1084 /// way to represent a pointer cmpxchg so that we can update backends one by
   1085 /// one.
   1086 AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
   1087   auto *M = CI->getModule();
   1088   Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
   1089                                             M->getDataLayout());
   1090 
   1091   IRBuilder<> Builder(CI);
   1092 
   1093   Value *Addr = CI->getPointerOperand();
   1094   Type *PT = PointerType::get(NewTy,
   1095                               Addr->getType()->getPointerAddressSpace());
   1096   Value *NewAddr = Builder.CreateBitCast(Addr, PT);
   1097 
   1098   Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
   1099   Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);
   1100 
   1101   auto *NewCI = Builder.CreateAtomicCmpXchg(
   1102       NewAddr, NewCmp, NewNewVal, CI->getAlign(), CI->getSuccessOrdering(),
   1103       CI->getFailureOrdering(), CI->getSyncScopeID());
   1104   NewCI->setVolatile(CI->isVolatile());
   1105   NewCI->setWeak(CI->isWeak());
   1106   LLVM_DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");
   1107 
   1108   Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
   1109   Value *Succ = Builder.CreateExtractValue(NewCI, 1);
   1110 
   1111   OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());
   1112 
   1113   Value *Res = UndefValue::get(CI->getType());
   1114   Res = Builder.CreateInsertValue(Res, OldVal, 0);
   1115   Res = Builder.CreateInsertValue(Res, Succ, 1);
   1116 
   1117   CI->replaceAllUsesWith(Res);
   1118   CI->eraseFromParent();
   1119   return NewCI;
   1120 }
   1121 
/// Expand a cmpxchg into an explicit control-flow graph built around the
/// target's load-linked / store-conditional hooks (TLI->emitLoadLinked and
/// TLI->emitStoreConditional).
///
/// The access is performed on PMV.AlignedAddr with type PMV.WordType as
/// computed by createMaskInstrs, so the loaded word may be wider than the
/// cmpxchg's own type; extractMaskedValue / insertMaskedValue convert between
/// the two. Uses of \p CI are rewritten to the CFG-derived loaded value and
/// success flag, and \p CI is erased.
///
/// \returns true (the instruction is always expanded).
bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
  AtomicOrdering FailureOrder = CI->getFailureOrdering();
  Value *Addr = CI->getPointerOperand();
  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();
  // If shouldInsertFencesForAtomic() returns true, then the target does not
  // want to deal with memory orders, and emitLeading/TrailingFence should take
  // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
  // should preserve the ordering.
  bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
  AtomicOrdering MemOpOrder =
      ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;

  // In implementations which use a barrier to achieve release semantics, we can
  // delay emitting this barrier until we know a store is actually going to be
  // attempted. The cost of this delay is that we need 2 copies of the block
  // emitting the load-linked, affecting code size.
  //
  // Ideally, this logic would be unconditional except for the minsize check
  // since in other cases the extra blocks naturally collapse down to the
  // minimal loop. Unfortunately, this puts too much stress on later
  // optimisations so we avoid emitting the extra logic in those cases too.
  bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
                           SuccessOrder != AtomicOrdering::Monotonic &&
                           SuccessOrder != AtomicOrdering::Acquire &&
                           !F->hasMinSize();

  // There's no overhead for sinking the release barrier in a weak cmpxchg, so
  // do it even on minsize. (I.e. the leading fence is only emitted
  // unconditionally, in the entry block, for a strong cmpxchg in a minsize
  // function.)
  bool UseUnconditionalReleaseBarrier = F->hasMinSize() && !CI->isWeak();

  // Given: cmpxchg iN* %addr, iN %desired, iN %new success_ord fail_ord
  //
  // The full expansion we produce is:
  //     [...]
  // %aligned.addr = ...
  // cmpxchg.start:
  //     %unreleasedload = @load.linked(%aligned.addr)
  //     %unreleasedload.extract = extract value from %unreleasedload
  //     %should_store = icmp eq %unreleasedload.extract, %desired
  //     br i1 %should_store, label %cmpxchg.fencedstore,
  //                          label %cmpxchg.nostore
  // cmpxchg.fencedstore:
  //     fence?
  //     br label cmpxchg.trystore
  // cmpxchg.trystore:
  //     %loaded.trystore = phi [%unreleasedload, %cmpxchg.fencedstore],
  //                            [%releasedload, %cmpxchg.releasedload]
  //     %updated.new = insert %new into %loaded.trystore
  //     %stored = @store_conditional(%updated.new, %aligned.addr)
  //     %success = icmp eq i32 %stored, 0
  //     br i1 %success, label %cmpxchg.success,
  //                     label %cmpxchg.failure (weak) or
  //                           %cmpxchg.releasedload/%cmpxchg.start (strong)
  // cmpxchg.releasedload:
  //     %releasedload = @load.linked(%aligned.addr)
  //     %releasedload.extract = extract value from %releasedload
  //     %should_store = icmp eq %releasedload.extract, %desired
  //     br i1 %should_store, label %cmpxchg.trystore,
  //                          label %cmpxchg.nostore
  // cmpxchg.success:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.nostore:
  //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
  //                           [%releasedload, %cmpxchg.releasedload]
  //     @load_linked_fail_balance()?
  //     br label %cmpxchg.failure
  // cmpxchg.failure:
  //     %loaded.failure = phi [%loaded.nostore, %cmpxchg.nostore],
  //                           [%loaded.trystore, %cmpxchg.trystore] (weak only)
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.end:
  //     %loaded.exit = phi [%loaded.trystore, %cmpxchg.success],
  //                        [%loaded.failure, %cmpxchg.failure]
  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
  //     %loaded = extract value from %loaded.exit
  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
  //     [...]
  //
  // Each block below is created immediately before the previously created
  // one, so the textual order in the function matches the listing above.
  BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
  auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
  auto ReleasedLoadBB =
      BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
  auto TryStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
  auto ReleasingStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
  auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);

  // This grabs the DebugLoc from CI
  IRBuilder<> Builder(CI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, CI, SuccessOrder);

  // Compute the aligned address and mask values in the entry block, before
  // the loop, so they are available to every block of the expansion.
  PartwordMaskValues PMV =
      createMaskInstrs(Builder, CI, CI->getCompareOperand()->getType(), Addr,
                       CI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);
  Builder.CreateBr(StartBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(StartBB);
  Value *UnreleasedLoad =
      TLI->emitLoadLinked(Builder, PMV.AlignedAddr, MemOpOrder);
  Value *UnreleasedLoadExtract =
      extractMaskedValue(Builder, UnreleasedLoad, PMV);
  Value *ShouldStore = Builder.CreateICmpEQ(
      UnreleasedLoadExtract, CI->getCompareOperand(), "should_store");

  // If the cmpxchg doesn't actually need any ordering when it fails, we can
  // jump straight past that fence instruction (if it exists).
  Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);

  // cmpxchg.fencedstore: emit the leading fence here unless it was already
  // emitted unconditionally in the entry block.
  Builder.SetInsertPoint(ReleasingStoreBB);
  if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, CI, SuccessOrder);
  Builder.CreateBr(TryStoreBB);

  // cmpxchg.trystore: merge the new value into the loaded word and attempt
  // the store-conditional; a result of zero is treated as success.
  Builder.SetInsertPoint(TryStoreBB);
  PHINode *LoadedTryStore =
      Builder.CreatePHI(PMV.WordType, 2, "loaded.trystore");
  LoadedTryStore->addIncoming(UnreleasedLoad, ReleasingStoreBB);
  Value *NewValueInsert =
      insertMaskedValue(Builder, LoadedTryStore, CI->getNewValOperand(), PMV);
  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, NewValueInsert, PMV.AlignedAddr,
                                MemOpOrder);
  StoreSuccess = Builder.CreateICmpEQ(
      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
  // A strong cmpxchg retries after a failed store (via the released-load
  // block when it exists, otherwise from the start); a weak one simply fails.
  BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
  Builder.CreateCondBr(StoreSuccess, SuccessBB,
                       CI->isWeak() ? FailureBB : RetryBB);

  // cmpxchg.releasedload: second copy of the load-linked and comparison, used
  // for retries. When it is not needed the block is still created but only
  // contains an unreachable instruction.
  Builder.SetInsertPoint(ReleasedLoadBB);
  Value *SecondLoad;
  if (HasReleasedLoadBB) {
    SecondLoad = TLI->emitLoadLinked(Builder, PMV.AlignedAddr, MemOpOrder);
    Value *SecondLoadExtract = extractMaskedValue(Builder, SecondLoad, PMV);
    ShouldStore = Builder.CreateICmpEQ(SecondLoadExtract,
                                       CI->getCompareOperand(), "should_store");

    // If the cmpxchg doesn't actually need any ordering when it fails, we can
    // jump straight past that fence instruction (if it exists).
    Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
    // Update PHI node in TryStoreBB.
    LoadedTryStore->addIncoming(SecondLoad, ReleasedLoadBB);
  } else
    Builder.CreateUnreachable();

  // Make sure later instructions don't get reordered with a fence if
  // necessary.
  Builder.SetInsertPoint(SuccessBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, CI, SuccessOrder);
  Builder.CreateBr(ExitBB);

  // cmpxchg.nostore: reached when a loaded value did not match the expected
  // one, so no store was attempted.
  Builder.SetInsertPoint(NoStoreBB);
  PHINode *LoadedNoStore =
      Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.nostore");
  LoadedNoStore->addIncoming(UnreleasedLoad, StartBB);
  if (HasReleasedLoadBB)
    LoadedNoStore->addIncoming(SecondLoad, ReleasedLoadBB);

  // In the failing case, where we don't execute the store-conditional, the
  // target might want to balance out the load-linked with a dedicated
  // instruction (e.g., on ARM, clearing the exclusive monitor).
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
  Builder.CreateBr(FailureBB);

  // cmpxchg.failure: common failure exit. A weak cmpxchg also arrives here
  // directly from the try-store block when the store-conditional failed.
  Builder.SetInsertPoint(FailureBB);
  PHINode *LoadedFailure =
      Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.failure");
  LoadedFailure->addIncoming(LoadedNoStore, NoStoreBB);
  if (CI->isWeak())
    LoadedFailure->addIncoming(LoadedTryStore, TryStoreBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, CI, FailureOrder);
  Builder.CreateBr(ExitBB);

  // Finally, we have control-flow based knowledge of whether the cmpxchg
  // succeeded or not. We expose this to later passes by converting any
  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
  // PHI.
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  PHINode *LoadedExit =
      Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.exit");
  LoadedExit->addIncoming(LoadedTryStore, SuccessBB);
  LoadedExit->addIncoming(LoadedFailure, FailureBB);
  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2, "success");
  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);

  // This is the "exit value" from the cmpxchg expansion. It may be of
  // a type wider than the one in the cmpxchg instruction.
  Value *LoadedFull = LoadedExit;

  // Emit the extraction right after the PHIs of the exit block.
  Builder.SetInsertPoint(ExitBB, std::next(Success->getIterator()));
  Value *Loaded = extractMaskedValue(Builder, LoadedFull, PMV);

  // Look for any users of the cmpxchg that are just comparing the loaded value
  // against the desired one, and replace them with the CFG-derived version.
  SmallVector<ExtractValueInst *, 2> PrunedInsts;
  for (auto User : CI->users()) {
    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
    if (!EV)
      continue;

    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
           "weird extraction from { iN, i1 }");

    // Index 0 is the loaded value, index 1 the success flag.
    if (EV->getIndices()[0] == 0)
      EV->replaceAllUsesWith(Loaded);
    else
      EV->replaceAllUsesWith(Success);

    PrunedInsts.push_back(EV);
  }

  // We can remove the instructions now we're no longer iterating through them.
  for (auto EV : PrunedInsts)
    EV->eraseFromParent();

  if (!CI->use_empty()) {
    // Some use of the full struct return that we don't understand has happened,
    // so we've got to reconstruct it properly.
    Value *Res;
    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
    Res = Builder.CreateInsertValue(Res, Success, 1);

    CI->replaceAllUsesWith(Res);
  }

  CI->eraseFromParent();
  return true;
}
   1366 
   1367 bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) {
   1368   auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
   1369   if(!C)
   1370     return false;
   1371 
   1372   AtomicRMWInst::BinOp Op = RMWI->getOperation();
   1373   switch(Op) {
   1374     case AtomicRMWInst::Add:
   1375     case AtomicRMWInst::Sub:
   1376     case AtomicRMWInst::Or:
   1377     case AtomicRMWInst::Xor:
   1378       return C->isZero();
   1379     case AtomicRMWInst::And:
   1380       return C->isMinusOne();
   1381     // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
   1382     default:
   1383       return false;
   1384   }
   1385 }
   1386 
   1387 bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) {
   1388   if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
   1389     tryExpandAtomicLoad(ResultingLoad);
   1390     return true;
   1391   }
   1392   return false;
   1393 }
   1394 
   1395 Value *AtomicExpand::insertRMWCmpXchgLoop(
   1396     IRBuilder<> &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
   1397     AtomicOrdering MemOpOrder, SyncScope::ID SSID,
   1398     function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
   1399     CreateCmpXchgInstFun CreateCmpXchg) {
   1400   LLVMContext &Ctx = Builder.getContext();
   1401   BasicBlock *BB = Builder.GetInsertBlock();
   1402   Function *F = BB->getParent();
   1403 
   1404   // Given: atomicrmw some_op iN* %addr, iN %incr ordering
   1405   //
   1406   // The standard expansion we produce is:
   1407   //     [...]
   1408   //     %init_loaded = load atomic iN* %addr
   1409   //     br label %loop
   1410   // loop:
   1411   //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
   1412   //     %new = some_op iN %loaded, %incr
   1413   //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
   1414   //     %new_loaded = extractvalue { iN, i1 } %pair, 0
   1415   //     %success = extractvalue { iN, i1 } %pair, 1
   1416   //     br i1 %success, label %atomicrmw.end, label %loop
   1417   // atomicrmw.end:
   1418   //     [...]
   1419   BasicBlock *ExitBB =
   1420       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
   1421   BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
   1422 
   1423   // The split call above "helpfully" added a branch at the end of BB (to the
   1424   // wrong place), but we want a load. It's easiest to just remove
   1425   // the branch entirely.
   1426   std::prev(BB->end())->eraseFromParent();
   1427   Builder.SetInsertPoint(BB);
   1428   LoadInst *InitLoaded = Builder.CreateAlignedLoad(ResultTy, Addr, AddrAlign);
   1429   Builder.CreateBr(LoopBB);
   1430 
   1431   // Start the main loop block now that we've taken care of the preliminaries.
   1432   Builder.SetInsertPoint(LoopBB);
   1433   PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
   1434   Loaded->addIncoming(InitLoaded, BB);
   1435 
   1436   Value *NewVal = PerformOp(Builder, Loaded);
   1437 
   1438   Value *NewLoaded = nullptr;
   1439   Value *Success = nullptr;
   1440 
   1441   CreateCmpXchg(Builder, Addr, Loaded, NewVal, AddrAlign,
   1442                 MemOpOrder == AtomicOrdering::Unordered
   1443                     ? AtomicOrdering::Monotonic
   1444                     : MemOpOrder,
   1445                 SSID, Success, NewLoaded);
   1446   assert(Success && NewLoaded);
   1447 
   1448   Loaded->addIncoming(NewLoaded, LoopBB);
   1449 
   1450   Builder.CreateCondBr(Success, ExitBB, LoopBB);
   1451 
   1452   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
   1453   return NewLoaded;
   1454 }
   1455 
   1456 bool AtomicExpand::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
   1457   unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
   1458   unsigned ValueSize = getAtomicOpSize(CI);
   1459 
   1460   switch (TLI->shouldExpandAtomicCmpXchgInIR(CI)) {
   1461   default:
   1462     llvm_unreachable("Unhandled case in tryExpandAtomicCmpXchg");
   1463   case TargetLoweringBase::AtomicExpansionKind::None:
   1464     if (ValueSize < MinCASSize)
   1465       return expandPartwordCmpXchg(CI);
   1466     return false;
   1467   case TargetLoweringBase::AtomicExpansionKind::LLSC: {
   1468     return expandAtomicCmpXchg(CI);
   1469   }
   1470   case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic:
   1471     expandAtomicCmpXchgToMaskedIntrinsic(CI);
   1472     return true;
   1473   }
   1474 }
   1475 
   1476 // Note: This function is exposed externally by AtomicExpandUtils.h
   1477 bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
   1478                                     CreateCmpXchgInstFun CreateCmpXchg) {
   1479   IRBuilder<> Builder(AI);
   1480   Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
   1481       Builder, AI->getType(), AI->getPointerOperand(), AI->getAlign(),
   1482       AI->getOrdering(), AI->getSyncScopeID(),
   1483       [&](IRBuilder<> &Builder, Value *Loaded) {
   1484         return performAtomicOp(AI->getOperation(), Builder, Loaded,
   1485                                AI->getValOperand());
   1486       },
   1487       CreateCmpXchg);
   1488 
   1489   AI->replaceAllUsesWith(Loaded);
   1490   AI->eraseFromParent();
   1491   return true;
   1492 }
   1493 
   1494 // In order to use one of the sized library calls such as
   1495 // __atomic_fetch_add_4, the alignment must be sufficient, the size
   1496 // must be one of the potentially-specialized sizes, and the value
   1497 // type must actually exist in C on the target (otherwise, the
   1498 // function wouldn't actually be defined.)
   1499 static bool canUseSizedAtomicCall(unsigned Size, Align Alignment,
   1500                                   const DataLayout &DL) {
   1501   // TODO: "LargestSize" is an approximation for "largest type that
   1502   // you can express in C". It seems to be the case that int128 is
   1503   // supported on all 64-bit platforms, otherwise only up to 64-bit
   1504   // integers are supported. If we get this wrong, then we'll try to
   1505   // call a sized libcall that doesn't actually exist. There should
   1506   // really be some more reliable way in LLVM of determining integer
   1507   // sizes which are valid in the target's C ABI...
   1508   unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
   1509   return Alignment >= Size &&
   1510          (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
   1511          Size <= LargestSize;
   1512 }
   1513 
   1514 void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
   1515   static const RTLIB::Libcall Libcalls[6] = {
   1516       RTLIB::ATOMIC_LOAD,   RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
   1517       RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
   1518   unsigned Size = getAtomicOpSize(I);
   1519 
   1520   bool expanded = expandAtomicOpToLibcall(
   1521       I, Size, I->getAlign(), I->getPointerOperand(), nullptr, nullptr,
   1522       I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
   1523   if (!expanded)
   1524     report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Load");
   1525 }
   1526 
   1527 void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
   1528   static const RTLIB::Libcall Libcalls[6] = {
   1529       RTLIB::ATOMIC_STORE,   RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
   1530       RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
   1531   unsigned Size = getAtomicOpSize(I);
   1532 
   1533   bool expanded = expandAtomicOpToLibcall(
   1534       I, Size, I->getAlign(), I->getPointerOperand(), I->getValueOperand(),
   1535       nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
   1536   if (!expanded)
   1537     report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Store");
   1538 }
   1539 
   1540 void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
   1541   static const RTLIB::Libcall Libcalls[6] = {
   1542       RTLIB::ATOMIC_COMPARE_EXCHANGE,   RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
   1543       RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
   1544       RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
   1545   unsigned Size = getAtomicOpSize(I);
   1546 
   1547   bool expanded = expandAtomicOpToLibcall(
   1548       I, Size, I->getAlign(), I->getPointerOperand(), I->getNewValOperand(),
   1549       I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
   1550       Libcalls);
   1551   if (!expanded)
   1552     report_fatal_error("expandAtomicOpToLibcall shouldn't fail for CAS");
   1553 }
   1554 
   1555 static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
   1556   static const RTLIB::Libcall LibcallsXchg[6] = {
   1557       RTLIB::ATOMIC_EXCHANGE,   RTLIB::ATOMIC_EXCHANGE_1,
   1558       RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
   1559       RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
   1560   static const RTLIB::Libcall LibcallsAdd[6] = {
   1561       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_ADD_1,
   1562       RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
   1563       RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
   1564   static const RTLIB::Libcall LibcallsSub[6] = {
   1565       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_SUB_1,
   1566       RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
   1567       RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
   1568   static const RTLIB::Libcall LibcallsAnd[6] = {
   1569       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_AND_1,
   1570       RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
   1571       RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
   1572   static const RTLIB::Libcall LibcallsOr[6] = {
   1573       RTLIB::UNKNOWN_LIBCALL,   RTLIB::ATOMIC_FETCH_OR_1,
   1574       RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
   1575       RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
   1576   static const RTLIB::Libcall LibcallsXor[6] = {
   1577       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_XOR_1,
   1578       RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
   1579       RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
   1580   static const RTLIB::Libcall LibcallsNand[6] = {
   1581       RTLIB::UNKNOWN_LIBCALL,     RTLIB::ATOMIC_FETCH_NAND_1,
   1582       RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
   1583       RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};
   1584 
   1585   switch (Op) {
   1586   case AtomicRMWInst::BAD_BINOP:
   1587     llvm_unreachable("Should not have BAD_BINOP.");
   1588   case AtomicRMWInst::Xchg:
   1589     return makeArrayRef(LibcallsXchg);
   1590   case AtomicRMWInst::Add:
   1591     return makeArrayRef(LibcallsAdd);
   1592   case AtomicRMWInst::Sub:
   1593     return makeArrayRef(LibcallsSub);
   1594   case AtomicRMWInst::And:
   1595     return makeArrayRef(LibcallsAnd);
   1596   case AtomicRMWInst::Or:
   1597     return makeArrayRef(LibcallsOr);
   1598   case AtomicRMWInst::Xor:
   1599     return makeArrayRef(LibcallsXor);
   1600   case AtomicRMWInst::Nand:
   1601     return makeArrayRef(LibcallsNand);
   1602   case AtomicRMWInst::Max:
   1603   case AtomicRMWInst::Min:
   1604   case AtomicRMWInst::UMax:
   1605   case AtomicRMWInst::UMin:
   1606   case AtomicRMWInst::FAdd:
   1607   case AtomicRMWInst::FSub:
   1608     // No atomic libcalls are available for max/min/umax/umin.
   1609     return {};
   1610   }
   1611   llvm_unreachable("Unexpected AtomicRMW operation.");
   1612 }
   1613 
   1614 void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
   1615   ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());
   1616 
   1617   unsigned Size = getAtomicOpSize(I);
   1618 
   1619   bool Success = false;
   1620   if (!Libcalls.empty())
   1621     Success = expandAtomicOpToLibcall(
   1622         I, Size, I->getAlign(), I->getPointerOperand(), I->getValOperand(),
   1623         nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
   1624 
   1625   // The expansion failed: either there were no libcalls at all for
   1626   // the operation (min/max), or there were only size-specialized
   1627   // libcalls (add/sub/etc) and we needed a generic. So, expand to a
   1628   // CAS libcall, via a CAS loop, instead.
   1629   if (!Success) {
   1630     expandAtomicRMWToCmpXchg(
   1631         I, [this](IRBuilder<> &Builder, Value *Addr, Value *Loaded,
   1632                   Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder,
   1633                   SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) {
   1634           // Create the CAS instruction normally...
   1635           AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
   1636               Addr, Loaded, NewVal, Alignment, MemOpOrder,
   1637               AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
   1638           Success = Builder.CreateExtractValue(Pair, 1, "success");
   1639           NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
   1640 
   1641           // ...and then expand the CAS into a libcall.
   1642           expandAtomicCASToLibcall(Pair);
   1643         });
   1644   }
   1645 }
   1646 
   1647 // A helper routine for the above expandAtomic*ToLibcall functions.
   1648 //
   1649 // 'Libcalls' contains an array of enum values for the particular
   1650 // ATOMIC libcalls to be emitted. All of the other arguments besides
   1651 // 'I' are extracted from the Instruction subclass by the
   1652 // caller. Depending on the particular call, some will be null.
   1653 bool AtomicExpand::expandAtomicOpToLibcall(
   1654     Instruction *I, unsigned Size, Align Alignment, Value *PointerOperand,
   1655     Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
   1656     AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
   1657   assert(Libcalls.size() == 6);
   1658 
   1659   LLVMContext &Ctx = I->getContext();
   1660   Module *M = I->getModule();
   1661   const DataLayout &DL = M->getDataLayout();
   1662   IRBuilder<> Builder(I);
   1663   IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());
   1664 
   1665   bool UseSizedLibcall = canUseSizedAtomicCall(Size, Alignment, DL);
   1666   Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);
   1667 
   1668   const Align AllocaAlignment = DL.getPrefTypeAlign(SizedIntTy);
   1669 
   1670   // TODO: the "order" argument type is "int", not int32. So
   1671   // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
   1672   ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
   1673   assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
   1674   Constant *OrderingVal =
   1675       ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
   1676   Constant *Ordering2Val = nullptr;
   1677   if (CASExpected) {
   1678     assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
   1679     Ordering2Val =
   1680         ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
   1681   }
   1682   bool HasResult = I->getType() != Type::getVoidTy(Ctx);
   1683 
   1684   RTLIB::Libcall RTLibType;
   1685   if (UseSizedLibcall) {
   1686     switch (Size) {
   1687     case 1: RTLibType = Libcalls[1]; break;
   1688     case 2: RTLibType = Libcalls[2]; break;
   1689     case 4: RTLibType = Libcalls[3]; break;
   1690     case 8: RTLibType = Libcalls[4]; break;
   1691     case 16: RTLibType = Libcalls[5]; break;
   1692     }
   1693   } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
   1694     RTLibType = Libcalls[0];
   1695   } else {
   1696     // Can't use sized function, and there's no generic for this
   1697     // operation, so give up.
   1698     return false;
   1699   }
   1700 
   1701   if (!TLI->getLibcallName(RTLibType)) {
   1702     // This target does not implement the requested atomic libcall so give up.
   1703     return false;
   1704   }
   1705 
   1706   // Build up the function call. There's two kinds. First, the sized
   1707   // variants.  These calls are going to be one of the following (with
   1708   // N=1,2,4,8,16):
   1709   //  iN    __atomic_load_N(iN *ptr, int ordering)
   1710   //  void  __atomic_store_N(iN *ptr, iN val, int ordering)
   1711   //  iN    __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
   1712   //  bool  __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
   1713   //                                    int success_order, int failure_order)
   1714   //
   1715   // Note that these functions can be used for non-integer atomic
   1716   // operations, the values just need to be bitcast to integers on the
   1717   // way in and out.
   1718   //
   1719   // And, then, the generic variants. They look like the following:
   1720   //  void  __atomic_load(size_t size, void *ptr, void *ret, int ordering)
   1721   //  void  __atomic_store(size_t size, void *ptr, void *val, int ordering)
   1722   //  void  __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
   1723   //                          int ordering)
   1724   //  bool  __atomic_compare_exchange(size_t size, void *ptr, void *expected,
   1725   //                                  void *desired, int success_order,
   1726   //                                  int failure_order)
   1727   //
   1728   // The different signatures are built up depending on the
   1729   // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
   1730   // variables.
   1731 
   1732   AllocaInst *AllocaCASExpected = nullptr;
   1733   Value *AllocaCASExpected_i8 = nullptr;
   1734   AllocaInst *AllocaValue = nullptr;
   1735   Value *AllocaValue_i8 = nullptr;
   1736   AllocaInst *AllocaResult = nullptr;
   1737   Value *AllocaResult_i8 = nullptr;
   1738 
   1739   Type *ResultTy;
   1740   SmallVector<Value *, 6> Args;
   1741   AttributeList Attr;
   1742 
   1743   // 'size' argument.
   1744   if (!UseSizedLibcall) {
   1745     // Note, getIntPtrType is assumed equivalent to size_t.
   1746     Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
   1747   }
   1748 
   1749   // 'ptr' argument.
   1750   // note: This assumes all address spaces share a common libfunc
   1751   // implementation and that addresses are convertable.  For systems without
   1752   // that property, we'd need to extend this mechanism to support AS-specific
   1753   // families of atomic intrinsics.
   1754   auto PtrTypeAS = PointerOperand->getType()->getPointerAddressSpace();
   1755   Value *PtrVal = Builder.CreateBitCast(PointerOperand,
   1756                                         Type::getInt8PtrTy(Ctx, PtrTypeAS));
   1757   PtrVal = Builder.CreateAddrSpaceCast(PtrVal, Type::getInt8PtrTy(Ctx));
   1758   Args.push_back(PtrVal);
   1759 
   1760   // 'expected' argument, if present.
   1761   if (CASExpected) {
   1762     AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
   1763     AllocaCASExpected->setAlignment(AllocaAlignment);
   1764     unsigned AllocaAS =  AllocaCASExpected->getType()->getPointerAddressSpace();
   1765 
   1766     AllocaCASExpected_i8 =
   1767       Builder.CreateBitCast(AllocaCASExpected,
   1768                             Type::getInt8PtrTy(Ctx, AllocaAS));
   1769     Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
   1770     Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
   1771     Args.push_back(AllocaCASExpected_i8);
   1772   }
   1773 
   1774   // 'val' argument ('desired' for cas), if present.
   1775   if (ValueOperand) {
   1776     if (UseSizedLibcall) {
   1777       Value *IntValue =
   1778           Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
   1779       Args.push_back(IntValue);
   1780     } else {
   1781       AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
   1782       AllocaValue->setAlignment(AllocaAlignment);
   1783       AllocaValue_i8 =
   1784           Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
   1785       Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
   1786       Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
   1787       Args.push_back(AllocaValue_i8);
   1788     }
   1789   }
   1790 
   1791   // 'ret' argument.
   1792   if (!CASExpected && HasResult && !UseSizedLibcall) {
   1793     AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
   1794     AllocaResult->setAlignment(AllocaAlignment);
   1795     unsigned AllocaAS =  AllocaResult->getType()->getPointerAddressSpace();
   1796     AllocaResult_i8 =
   1797       Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx, AllocaAS));
   1798     Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
   1799     Args.push_back(AllocaResult_i8);
   1800   }
   1801 
   1802   // 'ordering' ('success_order' for cas) argument.
   1803   Args.push_back(OrderingVal);
   1804 
   1805   // 'failure_order' argument, if present.
   1806   if (Ordering2Val)
   1807     Args.push_back(Ordering2Val);
   1808 
   1809   // Now, the return type.
   1810   if (CASExpected) {
   1811     ResultTy = Type::getInt1Ty(Ctx);
   1812     Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt);
   1813   } else if (HasResult && UseSizedLibcall)
   1814     ResultTy = SizedIntTy;
   1815   else
   1816     ResultTy = Type::getVoidTy(Ctx);
   1817 
   1818   // Done with setting up arguments and return types, create the call:
   1819   SmallVector<Type *, 6> ArgTys;
   1820   for (Value *Arg : Args)
   1821     ArgTys.push_back(Arg->getType());
   1822   FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
   1823   FunctionCallee LibcallFn =
   1824       M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
   1825   CallInst *Call = Builder.CreateCall(LibcallFn, Args);
   1826   Call->setAttributes(Attr);
   1827   Value *Result = Call;
   1828 
   1829   // And then, extract the results...
   1830   if (ValueOperand && !UseSizedLibcall)
   1831     Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);
   1832 
   1833   if (CASExpected) {
   1834     // The final result from the CAS is {load of 'expected' alloca, bool result
   1835     // from call}
   1836     Type *FinalResultTy = I->getType();
   1837     Value *V = UndefValue::get(FinalResultTy);
   1838     Value *ExpectedOut = Builder.CreateAlignedLoad(
   1839         CASExpected->getType(), AllocaCASExpected, AllocaAlignment);
   1840     Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
   1841     V = Builder.CreateInsertValue(V, ExpectedOut, 0);
   1842     V = Builder.CreateInsertValue(V, Result, 1);
   1843     I->replaceAllUsesWith(V);
   1844   } else if (HasResult) {
   1845     Value *V;
   1846     if (UseSizedLibcall)
   1847       V = Builder.CreateBitOrPointerCast(Result, I->getType());
   1848     else {
   1849       V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
   1850                                     AllocaAlignment);
   1851       Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
   1852     }
   1853     I->replaceAllUsesWith(V);
   1854   }
   1855   I->eraseFromParent();
   1856   return true;
   1857 }
   1858