Home | History | Annotate | Line # | Download | only in CodeGen
      1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA
     10 // runtime library.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "CGCUDARuntime.h"
     15 #include "CGCXXABI.h"
     16 #include "CodeGenFunction.h"
     17 #include "CodeGenModule.h"
     18 #include "clang/AST/Decl.h"
     19 #include "clang/Basic/Cuda.h"
     20 #include "clang/CodeGen/CodeGenABITypes.h"
     21 #include "clang/CodeGen/ConstantInitBuilder.h"
     22 #include "llvm/IR/BasicBlock.h"
     23 #include "llvm/IR/Constants.h"
     24 #include "llvm/IR/DerivedTypes.h"
     25 #include "llvm/IR/ReplaceConstant.h"
     26 #include "llvm/Support/Format.h"
     27 
     28 using namespace clang;
     29 using namespace CodeGen;
     30 
     31 namespace {
     32 constexpr unsigned CudaFatMagic = 0x466243b1;
     33 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
     34 
     35 class CGNVCUDARuntime : public CGCUDARuntime {
     36 
     37 private:
     38   llvm::IntegerType *IntTy, *SizeTy;
     39   llvm::Type *VoidTy;
     40   llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
     41 
     42   /// Convenience reference to LLVM Context
     43   llvm::LLVMContext &Context;
     44   /// Convenience reference to the current module
     45   llvm::Module &TheModule;
     46   /// Keeps track of kernel launch stubs and handles emitted in this module
     47   struct KernelInfo {
     48     llvm::Function *Kernel; // stub function to help launch kernel
     49     const Decl *D;
     50   };
     51   llvm::SmallVector<KernelInfo, 16> EmittedKernels;
     52   // Map a device stub function to a symbol for identifying kernel in host code.
     53   // For CUDA, the symbol for identifying the kernel is the same as the device
     54   // stub function. For HIP, they are different.
     55   llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
     56   // Map a kernel handle to the kernel stub.
     57   llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
     58   struct VarInfo {
     59     llvm::GlobalVariable *Var;
     60     const VarDecl *D;
     61     DeviceVarFlags Flags;
     62   };
     63   llvm::SmallVector<VarInfo, 16> DeviceVars;
     64   /// Keeps track of variable containing handle of GPU binary. Populated by
     65   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
     66   /// ModuleDtorFunction()
     67   llvm::GlobalVariable *GpuBinaryHandle = nullptr;
     68   /// Whether we generate relocatable device code.
     69   bool RelocatableDeviceCode;
     70   /// Mangle context for device.
     71   std::unique_ptr<MangleContext> DeviceMC;
     72 
     73   llvm::FunctionCallee getSetupArgumentFn() const;
     74   llvm::FunctionCallee getLaunchFn() const;
     75 
     76   llvm::FunctionType *getRegisterGlobalsFnTy() const;
     77   llvm::FunctionType *getCallbackFnTy() const;
     78   llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
     79   std::string addPrefixToName(StringRef FuncName) const;
     80   std::string addUnderscoredPrefixToName(StringRef FuncName) const;
     81 
     82   /// Creates a function to register all kernel stubs generated in this module.
     83   llvm::Function *makeRegisterGlobalsFn();
     84 
     85   /// Helper function that generates a constant string and returns a pointer to
     86   /// the start of the string.  The result of this function can be used anywhere
     87   /// where the C code specifies const char*.
     88   llvm::Constant *makeConstantString(const std::string &Str,
     89                                      const std::string &Name = "",
     90                                      const std::string &SectionName = "",
     91                                      unsigned Alignment = 0) {
     92     llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
     93                                llvm::ConstantInt::get(SizeTy, 0)};
     94     auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
     95     llvm::GlobalVariable *GV =
     96         cast<llvm::GlobalVariable>(ConstStr.getPointer());
     97     if (!SectionName.empty()) {
     98       GV->setSection(SectionName);
     99       // Mark the address as used which make sure that this section isn't
    100       // merged and we will really have it in the object file.
    101       GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    102     }
    103     if (Alignment)
    104       GV->setAlignment(llvm::Align(Alignment));
    105 
    106     return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
    107                                                 ConstStr.getPointer(), Zeros);
    108   }
    109 
    110   /// Helper function that generates an empty dummy function returning void.
    111   llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    112     assert(FnTy->getReturnType()->isVoidTy() &&
    113            "Can only generate dummy functions returning void!");
    114     llvm::Function *DummyFunc = llvm::Function::Create(
    115         FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
    116 
    117     llvm::BasicBlock *DummyBlock =
    118         llvm::BasicBlock::Create(Context, "", DummyFunc);
    119     CGBuilderTy FuncBuilder(CGM, Context);
    120     FuncBuilder.SetInsertPoint(DummyBlock);
    121     FuncBuilder.CreateRetVoid();
    122 
    123     return DummyFunc;
    124   }
    125 
    126   void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
    127   void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
    128   std::string getDeviceSideName(const NamedDecl *ND) override;
    129 
    130   void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
    131                          bool Extern, bool Constant) {
    132     DeviceVars.push_back({&Var,
    133                           VD,
    134                           {DeviceVarFlags::Variable, Extern, Constant,
    135                            VD->hasAttr<HIPManagedAttr>(),
    136                            /*Normalized*/ false, 0}});
    137   }
    138   void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
    139                           bool Extern, int Type) {
    140     DeviceVars.push_back({&Var,
    141                           VD,
    142                           {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
    143                            /*Managed*/ false,
    144                            /*Normalized*/ false, Type}});
    145   }
    146   void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
    147                          bool Extern, int Type, bool Normalized) {
    148     DeviceVars.push_back({&Var,
    149                           VD,
    150                           {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
    151                            /*Managed*/ false, Normalized, Type}});
    152   }
    153 
    154   /// Creates module constructor function
    155   llvm::Function *makeModuleCtorFunction();
    156   /// Creates module destructor function
    157   llvm::Function *makeModuleDtorFunction();
    158   /// Transform managed variables for device compilation.
    159   void transformManagedVars();
    160 
    161 public:
    162   CGNVCUDARuntime(CodeGenModule &CGM);
    163 
    164   llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
    165   llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    166     auto Loc = KernelStubs.find(Handle);
    167     assert(Loc != KernelStubs.end());
    168     return Loc->second;
    169   }
    170   void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
    171   void handleVarRegistration(const VarDecl *VD,
    172                              llvm::GlobalVariable &Var) override;
    173   void
    174   internalizeDeviceSideVar(const VarDecl *D,
    175                            llvm::GlobalValue::LinkageTypes &Linkage) override;
    176 
    177   llvm::Function *finalizeModule() override;
    178 };
    179 
    180 }
    181 
    182 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
    183   if (CGM.getLangOpts().HIP)
    184     return ((Twine("hip") + Twine(FuncName)).str());
    185   return ((Twine("cuda") + Twine(FuncName)).str());
    186 }
    187 std::string
    188 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
    189   if (CGM.getLangOpts().HIP)
    190     return ((Twine("__hip") + Twine(FuncName)).str());
    191   return ((Twine("__cuda") + Twine(FuncName)).str());
    192 }
    193 
    194 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    195     : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
    196       TheModule(CGM.getModule()),
    197       RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
    198       DeviceMC(CGM.getContext().createMangleContext(
    199           CGM.getContext().getAuxTargetInfo())) {
    200   CodeGen::CodeGenTypes &Types = CGM.getTypes();
    201   ASTContext &Ctx = CGM.getContext();
    202 
    203   IntTy = CGM.IntTy;
    204   SizeTy = CGM.SizeTy;
    205   VoidTy = CGM.VoidTy;
    206 
    207   CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
    208   VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
    209   VoidPtrPtrTy = VoidPtrTy->getPointerTo();
    210   if (CGM.getContext().getAuxTargetInfo()) {
    211     // If the host and device have different C++ ABIs, mark it as the device
    212     // mangle context so that the mangling needs to retrieve the additonal
    213     // device lambda mangling number instead of the regular host one.
    214     DeviceMC->setDeviceMangleContext(
    215         CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
    216         CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily());
    217   }
    218 }
    219 
    220 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
    221   // cudaError_t cudaSetupArgument(void *, size_t, size_t)
    222   llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
    223   return CGM.CreateRuntimeFunction(
    224       llvm::FunctionType::get(IntTy, Params, false),
    225       addPrefixToName("SetupArgument"));
    226 }
    227 
    228 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
    229   if (CGM.getLangOpts().HIP) {
    230     // hipError_t hipLaunchByPtr(char *);
    231     return CGM.CreateRuntimeFunction(
    232         llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
    233   } else {
    234     // cudaError_t cudaLaunch(char *);
    235     return CGM.CreateRuntimeFunction(
    236         llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
    237   }
    238 }
    239 
    240 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
    241   return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
    242 }
    243 
    244 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
    245   return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
    246 }
    247 
    248 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
    249   auto CallbackFnTy = getCallbackFnTy();
    250   auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
    251   llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
    252                           VoidPtrTy, CallbackFnTy->getPointerTo()};
    253   return llvm::FunctionType::get(VoidTy, Params, false);
    254 }
    255 
    256 std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
    257   GlobalDecl GD;
    258   // D could be either a kernel or a variable.
    259   if (auto *FD = dyn_cast<FunctionDecl>(ND))
    260     GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
    261   else
    262     GD = GlobalDecl(ND);
    263   std::string DeviceSideName;
    264   MangleContext *MC;
    265   if (CGM.getLangOpts().CUDAIsDevice)
    266     MC = &CGM.getCXXABI().getMangleContext();
    267   else
    268     MC = DeviceMC.get();
    269   if (MC->shouldMangleDeclName(ND)) {
    270     SmallString<256> Buffer;
    271     llvm::raw_svector_ostream Out(Buffer);
    272     MC->mangleName(GD, Out);
    273     DeviceSideName = std::string(Out.str());
    274   } else
    275     DeviceSideName = std::string(ND->getIdentifier()->getName());
    276 
    277   // Make unique name for device side static file-scope variable for HIP.
    278   if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
    279       CGM.getLangOpts().GPURelocatableDeviceCode &&
    280       !CGM.getLangOpts().CUID.empty()) {
    281     SmallString<256> Buffer;
    282     llvm::raw_svector_ostream Out(Buffer);
    283     Out << DeviceSideName;
    284     CGM.printPostfixForExternalizedStaticVar(Out);
    285     DeviceSideName = std::string(Out.str());
    286   }
    287   return DeviceSideName;
    288 }
    289 
    290 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
    291                                      FunctionArgList &Args) {
    292   EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
    293   if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
    294     GV->setLinkage(CGF.CurFn->getLinkage());
    295     GV->setInitializer(CGF.CurFn);
    296   }
    297   if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
    298                          CudaFeature::CUDA_USES_NEW_LAUNCH) ||
    299       (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    300     emitDeviceStubBodyNew(CGF, Args);
    301   else
    302     emitDeviceStubBodyLegacy(CGF, Args);
    303 }
    304 
    305 // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
    306 // array and kernels are launched using cudaLaunchKernel().
    307 void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
    308                                             FunctionArgList &Args) {
    309   // Build the shadow stack entry at the very start of the function.
    310 
    311   // Calculate amount of space we will need for all arguments.  If we have no
    312   // args, allocate a single pointer so we still have a valid pointer to the
    313   // argument array that we can pass to runtime, even if it will be unused.
    314   Address KernelArgs = CGF.CreateTempAlloca(
    315       VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
    316       llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
    317   // Store pointers to the arguments in a locally allocated launch_args.
    318   for (unsigned i = 0; i < Args.size(); ++i) {
    319     llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    320     llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    321     CGF.Builder.CreateDefaultAlignedStore(
    322         VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
    323   }
    324 
    325   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
    326 
    327   // Lookup cudaLaunchKernel/hipLaunchKernel function.
    328   // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
    329   //                              void **args, size_t sharedMem,
    330   //                              cudaStream_t stream);
    331   // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
    332   //                            void **args, size_t sharedMem,
    333   //                            hipStream_t stream);
    334   TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
    335   DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
    336   auto LaunchKernelName = addPrefixToName("LaunchKernel");
    337   IdentifierInfo &cudaLaunchKernelII =
    338       CGM.getContext().Idents.get(LaunchKernelName);
    339   FunctionDecl *cudaLaunchKernelFD = nullptr;
    340   for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    341     if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
    342       cudaLaunchKernelFD = FD;
    343   }
    344 
    345   if (cudaLaunchKernelFD == nullptr) {
    346     CGM.Error(CGF.CurFuncDecl->getLocation(),
    347               "Can't find declaration for " + LaunchKernelName);
    348     return;
    349   }
    350   // Create temporary dim3 grid_dim, block_dim.
    351   ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
    352   QualType Dim3Ty = GridDimParam->getType();
    353   Address GridDim =
    354       CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
    355   Address BlockDim =
    356       CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
    357   Address ShmemSize =
    358       CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
    359   Address Stream =
    360       CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
    361   llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
    362       llvm::FunctionType::get(IntTy,
    363                               {/*gridDim=*/GridDim.getType(),
    364                                /*blockDim=*/BlockDim.getType(),
    365                                /*ShmemSize=*/ShmemSize.getType(),
    366                                /*Stream=*/Stream.getType()},
    367                               /*isVarArg=*/false),
    368       addUnderscoredPrefixToName("PopCallConfiguration"));
    369 
    370   CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
    371                               {GridDim.getPointer(), BlockDim.getPointer(),
    372                                ShmemSize.getPointer(), Stream.getPointer()});
    373 
    374   // Emit the call to cudaLaunch
    375   llvm::Value *Kernel =
    376       CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
    377   CallArgList LaunchKernelArgs;
    378   LaunchKernelArgs.add(RValue::get(Kernel),
    379                        cudaLaunchKernelFD->getParamDecl(0)->getType());
    380   LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
    381   LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
    382   LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
    383                        cudaLaunchKernelFD->getParamDecl(3)->getType());
    384   LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
    385                        cudaLaunchKernelFD->getParamDecl(4)->getType());
    386   LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
    387                        cudaLaunchKernelFD->getParamDecl(5)->getType());
    388 
    389   QualType QT = cudaLaunchKernelFD->getType();
    390   QualType CQT = QT.getCanonicalType();
    391   llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
    392   llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
    393 
    394   const CGFunctionInfo &FI =
    395       CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
    396   llvm::FunctionCallee cudaLaunchKernelFn =
    397       CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
    398   CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
    399                LaunchKernelArgs);
    400   CGF.EmitBranch(EndBlock);
    401 
    402   CGF.EmitBlock(EndBlock);
    403 }
    404 
    405 void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
    406                                                FunctionArgList &Args) {
    407   // Emit a call to cudaSetupArgument for each arg in Args.
    408   llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
    409   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
    410   CharUnits Offset = CharUnits::Zero();
    411   for (const VarDecl *A : Args) {
    412     auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    413     Offset = Offset.alignTo(TInfo.Align);
    414     llvm::Value *Args[] = {
    415         CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
    416                                       VoidPtrTy),
    417         llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
    418         llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    419     };
    420     llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    421     llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    422     llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    423     llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    424     CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    425     CGF.EmitBlock(NextBlock);
    426     Offset += TInfo.Width;
    427   }
    428 
    429   // Emit the call to cudaLaunch
    430   llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
    431   llvm::Value *Arg =
    432       CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
    433   CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
    434   CGF.EmitBranch(EndBlock);
    435 
    436   CGF.EmitBlock(EndBlock);
    437 }
    438 
    439 // Replace the original variable Var with the address loaded from variable
    440 // ManagedVar populated by HIP runtime.
    441 static void replaceManagedVar(llvm::GlobalVariable *Var,
    442                               llvm::GlobalVariable *ManagedVar) {
    443   SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
    444   for (auto &&VarUse : Var->uses()) {
    445     WorkList.push_back({VarUse.getUser()});
    446   }
    447   while (!WorkList.empty()) {
    448     auto &&WorkItem = WorkList.pop_back_val();
    449     auto *U = WorkItem.back();
    450     if (isa<llvm::ConstantExpr>(U)) {
    451       for (auto &&UU : U->uses()) {
    452         WorkItem.push_back(UU.getUser());
    453         WorkList.push_back(WorkItem);
    454         WorkItem.pop_back();
    455       }
    456       continue;
    457     }
    458     if (auto *I = dyn_cast<llvm::Instruction>(U)) {
    459       llvm::Value *OldV = Var;
    460       llvm::Instruction *NewV =
    461           new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
    462                              llvm::Align(Var->getAlignment()), I);
    463       WorkItem.pop_back();
    464       // Replace constant expressions directly or indirectly using the managed
    465       // variable with instructions.
    466       for (auto &&Op : WorkItem) {
    467         auto *CE = cast<llvm::ConstantExpr>(Op);
    468         auto *NewInst = llvm::createReplacementInstr(CE, I);
    469         NewInst->replaceUsesOfWith(OldV, NewV);
    470         OldV = CE;
    471         NewV = NewInst;
    472       }
    473       I->replaceUsesOfWith(OldV, NewV);
    474     } else {
    475       llvm_unreachable("Invalid use of managed variable");
    476     }
    477   }
    478 }
    479 
    480 /// Creates a function that sets up state on the host side for CUDA objects that
    481 /// have a presence on both the host and device sides. Specifically, registers
    482 /// the host side of kernel functions and device global variables with the CUDA
    483 /// runtime.
    484 /// \code
    485 /// void __cuda_register_globals(void** GpuBinaryHandle) {
    486 ///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
    487 ///    ...
    488 ///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
    489 ///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
    490 ///    ...
    491 ///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
    492 /// }
    493 /// \endcode
    494 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
    495   // No need to register anything
    496   if (EmittedKernels.empty() && DeviceVars.empty())
    497     return nullptr;
    498 
    499   llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
    500       getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
    501       addUnderscoredPrefixToName("_register_globals"), &TheModule);
    502   llvm::BasicBlock *EntryBB =
    503       llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
    504   CGBuilderTy Builder(CGM, Context);
    505   Builder.SetInsertPoint(EntryBB);
    506 
    507   // void __cudaRegisterFunction(void **, const char *, char *, const char *,
    508   //                             int, uint3*, uint3*, dim3*, dim3*, int*)
    509   llvm::Type *RegisterFuncParams[] = {
    510       VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
    511       VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
    512   llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
    513       llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
    514       addUnderscoredPrefixToName("RegisterFunction"));
    515 
    516   // Extract GpuBinaryHandle passed as the first argument passed to
    517   // __cuda_register_globals() and generate __cudaRegisterFunction() call for
    518   // each emitted kernel.
    519   llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
    520   for (auto &&I : EmittedKernels) {
    521     llvm::Constant *KernelName =
    522         makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    523     llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    524     llvm::Value *Args[] = {
    525         &GpuBinaryHandlePtr,
    526         Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
    527         KernelName,
    528         KernelName,
    529         llvm::ConstantInt::get(IntTy, -1),
    530         NullPtr,
    531         NullPtr,
    532         NullPtr,
    533         NullPtr,
    534         llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    535     Builder.CreateCall(RegisterFunc, Args);
    536   }
    537 
    538   llvm::Type *VarSizeTy = IntTy;
    539   // For HIP or CUDA 9.0+, device variable size is type of `size_t`.
    540   if (CGM.getLangOpts().HIP ||
    541       ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    542     VarSizeTy = SizeTy;
    543 
    544   // void __cudaRegisterVar(void **, char *, char *, const char *,
    545   //                        int, int, int, int)
    546   llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
    547                                      CharPtrTy,    IntTy,     VarSizeTy,
    548                                      IntTy,        IntTy};
    549   llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
    550       llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
    551       addUnderscoredPrefixToName("RegisterVar"));
    552   // void __hipRegisterManagedVar(void **, char *, char *, const char *,
    553   //                              size_t, unsigned)
    554   llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
    555                                             CharPtrTy,    VarSizeTy, IntTy};
    556   llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
    557       llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
    558       addUnderscoredPrefixToName("RegisterManagedVar"));
    559   // void __cudaRegisterSurface(void **, const struct surfaceReference *,
    560   //                            const void **, const char *, int, int);
    561   llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
    562       llvm::FunctionType::get(
    563           VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
    564           false),
    565       addUnderscoredPrefixToName("RegisterSurface"));
    566   // void __cudaRegisterTexture(void **, const struct textureReference *,
    567   //                            const void **, const char *, int, int, int)
    568   llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
    569       llvm::FunctionType::get(
    570           VoidTy,
    571           {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
    572           false),
    573       addUnderscoredPrefixToName("RegisterTexture"));
    574   for (auto &&Info : DeviceVars) {
    575     llvm::GlobalVariable *Var = Info.Var;
    576     assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
    577            "External variables should not show up here, except HIP managed "
    578            "variables");
    579     llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    580     switch (Info.Flags.getKind()) {
    581     case DeviceVarFlags::Variable: {
    582       uint64_t VarSize =
    583           CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
    584       if (Info.Flags.isManaged()) {
    585         auto ManagedVar = new llvm::GlobalVariable(
    586             CGM.getModule(), Var->getType(),
    587             /*isConstant=*/false, Var->getLinkage(),
    588             /*Init=*/Var->isDeclaration()
    589                 ? nullptr
    590                 : llvm::ConstantPointerNull::get(Var->getType()),
    591             /*Name=*/"", /*InsertBefore=*/nullptr,
    592             llvm::GlobalVariable::NotThreadLocal);
    593         ManagedVar->setDSOLocal(Var->isDSOLocal());
    594         ManagedVar->setVisibility(Var->getVisibility());
    595         ManagedVar->setExternallyInitialized(true);
    596         ManagedVar->takeName(Var);
    597         Var->setName(Twine(ManagedVar->getName() + ".managed"));
    598         replaceManagedVar(Var, ManagedVar);
    599         llvm::Value *Args[] = {
    600             &GpuBinaryHandlePtr,
    601             Builder.CreateBitCast(ManagedVar, VoidPtrTy),
    602             Builder.CreateBitCast(Var, VoidPtrTy),
    603             VarName,
    604             llvm::ConstantInt::get(VarSizeTy, VarSize),
    605             llvm::ConstantInt::get(IntTy, Var->getAlignment())};
    606         if (!Var->isDeclaration())
    607           Builder.CreateCall(RegisterManagedVar, Args);
    608       } else {
    609         llvm::Value *Args[] = {
    610             &GpuBinaryHandlePtr,
    611             Builder.CreateBitCast(Var, VoidPtrTy),
    612             VarName,
    613             VarName,
    614             llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
    615             llvm::ConstantInt::get(VarSizeTy, VarSize),
    616             llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
    617             llvm::ConstantInt::get(IntTy, 0)};
    618         Builder.CreateCall(RegisterVar, Args);
    619       }
    620       break;
    621     }
    622     case DeviceVarFlags::Surface:
    623       Builder.CreateCall(
    624           RegisterSurf,
    625           {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
    626            VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
    627            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
    628       break;
    629     case DeviceVarFlags::Texture:
    630       Builder.CreateCall(
    631           RegisterTex,
    632           {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
    633            VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
    634            llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
    635            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
    636       break;
    637     }
    638   }
    639 
    640   Builder.CreateRetVoid();
    641   return RegisterKernelsFunc;
    642 }
    643 
    644 /// Creates a global constructor function for the module:
    645 ///
    646 /// For CUDA:
    647 /// \code
    648 /// void __cuda_module_ctor(void*) {
    649 ///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
    650 ///     __cuda_register_globals(Handle);
    651 /// }
    652 /// \endcode
    653 ///
    654 /// For HIP:
    655 /// \code
    656 /// void __hip_module_ctor(void*) {
    657 ///     if (__hip_gpubin_handle == 0) {
    658 ///         __hip_gpubin_handle  = __hipRegisterFatBinary(GpuBinaryBlob);
    659 ///         __hip_register_globals(__hip_gpubin_handle);
    660 ///     }
    661 /// }
    662 /// \endcode
    663 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
    664   bool IsHIP = CGM.getLangOpts().HIP;
    665   bool IsCUDA = CGM.getLangOpts().CUDA;
    666   // No need to generate ctors/dtors if there is no GPU binary.
    667   StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
    668   if (CudaGpuBinaryFileName.empty() && !IsHIP)
    669     return nullptr;
    670   if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
    671       DeviceVars.empty())
    672     return nullptr;
    673 
    674   // void __{cuda|hip}_register_globals(void* handle);
    675   llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
    676   // We always need a function to pass in as callback. Create a dummy
    677   // implementation if we don't need to register anything.
    678   if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    679     RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
    680 
    681   // void ** __{cuda|hip}RegisterFatBinary(void *);
    682   llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
    683       llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
    684       addUnderscoredPrefixToName("RegisterFatBinary"));
    685   // struct { int magic, int version, void * gpu_binary, void * dont_care };
    686   llvm::StructType *FatbinWrapperTy =
    687       llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
    688 
    689   // Register GPU binary with the CUDA runtime, store returned handle in a
    690   // global variable and save a reference in GpuBinaryHandle to be cleaned up
    691   // in destructor on exit. Then associate all known kernels with the GPU binary
    692   // handle so CUDA runtime can figure out what to call on the GPU side.
    693   std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
    694   if (!CudaGpuBinaryFileName.empty()) {
    695     llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
    696         llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    697     if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
    698       CGM.getDiags().Report(diag::err_cannot_open_file)
    699           << CudaGpuBinaryFileName << EC.message();
    700       return nullptr;
    701     }
    702     CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
    703   }
    704 
    705   llvm::Function *ModuleCtorFunc = llvm::Function::Create(
    706       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
    707       llvm::GlobalValue::InternalLinkage,
    708       addUnderscoredPrefixToName("_module_ctor"), &TheModule);
    709   llvm::BasicBlock *CtorEntryBB =
    710       llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
    711   CGBuilderTy CtorBuilder(CGM, Context);
    712 
    713   CtorBuilder.SetInsertPoint(CtorEntryBB);
    714 
    715   const char *FatbinConstantName;
    716   const char *FatbinSectionName;
    717   const char *ModuleIDSectionName;
    718   StringRef ModuleIDPrefix;
    719   llvm::Constant *FatBinStr;
    720   unsigned FatMagic;
    721   if (IsHIP) {
    722     FatbinConstantName = ".hip_fatbin";
    723     FatbinSectionName = ".hipFatBinSegment";
    724 
    725     ModuleIDSectionName = "__hip_module_id";
    726     ModuleIDPrefix = "__hip_";
    727 
    728     if (CudaGpuBinary) {
    729       // If fatbin is available from early finalization, create a string
    730       // literal containing the fat binary loaded from the given file.
    731       const unsigned HIPCodeObjectAlign = 4096;
    732       FatBinStr =
    733           makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
    734                              FatbinConstantName, HIPCodeObjectAlign);
    735     } else {
    736       // If fatbin is not available, create an external symbol
    737       // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
    738       // to contain the fat binary but will be populated somewhere else,
    739       // e.g. by lld through link script.
    740       FatBinStr = new llvm::GlobalVariable(
    741         CGM.getModule(), CGM.Int8Ty,
    742         /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
    743         "__hip_fatbin", nullptr,
    744         llvm::GlobalVariable::NotThreadLocal);
    745       cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    746     }
    747 
    748     FatMagic = HIPFatMagic;
    749   } else {
    750     if (RelocatableDeviceCode)
    751       FatbinConstantName = CGM.getTriple().isMacOSX()
    752                                ? "__NV_CUDA,__nv_relfatbin"
    753                                : "__nv_relfatbin";
    754     else
    755       FatbinConstantName =
    756           CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    757     // NVIDIA's cuobjdump looks for fatbins in this section.
    758     FatbinSectionName =
    759         CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
    760 
    761     ModuleIDSectionName = CGM.getTriple().isMacOSX()
    762                               ? "__NV_CUDA,__nv_module_id"
    763                               : "__nv_module_id";
    764     ModuleIDPrefix = "__nv_";
    765 
    766     // For CUDA, create a string literal containing the fat binary loaded from
    767     // the given file.
    768     FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
    769                                    FatbinConstantName, 8);
    770     FatMagic = CudaFatMagic;
    771   }
    772 
    773   // Create initialized wrapper structure that points to the loaded GPU binary
    774   ConstantInitBuilder Builder(CGM);
    775   auto Values = Builder.beginStruct(FatbinWrapperTy);
    776   // Fatbin wrapper magic.
    777   Values.addInt(IntTy, FatMagic);
    778   // Fatbin version.
    779   Values.addInt(IntTy, 1);
    780   // Data.
    781   Values.add(FatBinStr);
    782   // Unused in fatbin v1.
    783   Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
    784   llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
    785       addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
    786       /*constant*/ true);
    787   FatbinWrapper->setSection(FatbinSectionName);
    788 
    789   // There is only one HIP fat binary per linked module, however there are
    790   // multiple constructor functions. Make sure the fat binary is registered
    791   // only once. The constructor functions are executed by the dynamic loader
    792   // before the program gains control. The dynamic loader cannot execute the
    793   // constructor functions concurrently since doing that would not guarantee
    794   // thread safety of the loaded program. Therefore we can assume sequential
    795   // execution of constructor functions here.
    796   if (IsHIP) {
    797     auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
    798         llvm::GlobalValue::LinkOnceAnyLinkage;
    799     llvm::BasicBlock *IfBlock =
    800         llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    801     llvm::BasicBlock *ExitBlock =
    802         llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    803     // The name, size, and initialization pattern of this variable is part
    804     // of HIP ABI.
    805     GpuBinaryHandle = new llvm::GlobalVariable(
    806         TheModule, VoidPtrPtrTy, /*isConstant=*/false,
    807         Linkage,
    808         /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
    809         "__hip_gpubin_handle");
    810     GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    811     // Prevent the weak symbol in different shared libraries being merged.
    812     if (Linkage != llvm::GlobalValue::InternalLinkage)
    813       GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    814     Address GpuBinaryAddr(
    815         GpuBinaryHandle,
    816         CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    817     {
    818       auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
    819       llvm::Constant *Zero =
    820           llvm::Constant::getNullValue(HandleValue->getType());
    821       llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
    822       CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    823     }
    824     {
    825       CtorBuilder.SetInsertPoint(IfBlock);
    826       // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
    827       llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
    828           RegisterFatbinFunc,
    829           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    830       CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
    831       CtorBuilder.CreateBr(ExitBlock);
    832     }
    833     {
    834       CtorBuilder.SetInsertPoint(ExitBlock);
    835       // Call __hip_register_globals(GpuBinaryHandle);
    836       if (RegisterGlobalsFunc) {
    837         auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
    838         CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
    839       }
    840     }
    841   } else if (!RelocatableDeviceCode) {
    842     // Register binary with CUDA runtime. This is substantially different in
    843     // default mode vs. separate compilation!
    844     // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    845     llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
    846         RegisterFatbinFunc,
    847         CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    848     GpuBinaryHandle = new llvm::GlobalVariable(
    849         TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
    850         llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    851     GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    852     CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
    853                                    CGM.getPointerAlign());
    854 
    855     // Call __cuda_register_globals(GpuBinaryHandle);
    856     if (RegisterGlobalsFunc)
    857       CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
    858 
    859     // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    860     if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
    861                            CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
    862       // void __cudaRegisterFatBinaryEnd(void **);
    863       llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
    864           llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
    865           "__cudaRegisterFatBinaryEnd");
    866       CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    867     }
    868   } else {
    869     // Generate a unique module ID.
    870     SmallString<64> ModuleID;
    871     llvm::raw_svector_ostream OS(ModuleID);
    872     OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    873     llvm::Constant *ModuleIDConstant = makeConstantString(
    874         std::string(ModuleID.str()), "", ModuleIDSectionName, 32);
    875 
    876     // Create an alias for the FatbinWrapper that nvcc will look for.
    877     llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
    878                               Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
    879 
    880     // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    881     // void *, void (*)(void **))
    882     SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    883     RegisterLinkedBinaryName += ModuleID;
    884     llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
    885         getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
    886 
    887     assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    888     llvm::Value *Args[] = {RegisterGlobalsFunc,
    889                            CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
    890                            ModuleIDConstant,
    891                            makeDummyFunction(getCallbackFnTy())};
    892     CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
    893   }
    894 
    895   // Create destructor and register it with atexit() the way NVCC does it. Doing
    896   // it during regular destructor phase worked in CUDA before 9.2 but results in
    897   // double-free in 9.2.
    898   if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    899     // extern "C" int atexit(void (*f)(void));
    900     llvm::FunctionType *AtExitTy =
    901         llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    902     llvm::FunctionCallee AtExitFunc =
    903         CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
    904                                   /*Local=*/true);
    905     CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
    906   }
    907 
    908   CtorBuilder.CreateRetVoid();
    909   return ModuleCtorFunc;
    910 }
    911 
    912 /// Creates a global destructor function that unregisters the GPU code blob
    913 /// registered by constructor.
    914 ///
    915 /// For CUDA:
    916 /// \code
    917 /// void __cuda_module_dtor(void*) {
    918 ///     __cudaUnregisterFatBinary(Handle);
    919 /// }
    920 /// \endcode
    921 ///
    922 /// For HIP:
    923 /// \code
    924 /// void __hip_module_dtor(void*) {
    925 ///     if (__hip_gpubin_handle) {
    926 ///         __hipUnregisterFatBinary(__hip_gpubin_handle);
    927 ///         __hip_gpubin_handle = 0;
    928 ///     }
    929 /// }
    930 /// \endcode
    931 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
    932   // No need for destructor if we don't have a handle to unregister.
    933   if (!GpuBinaryHandle)
    934     return nullptr;
    935 
    936   // void __cudaUnregisterFatBinary(void ** handle);
    937   llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
    938       llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
    939       addUnderscoredPrefixToName("UnregisterFatBinary"));
    940 
    941   llvm::Function *ModuleDtorFunc = llvm::Function::Create(
    942       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
    943       llvm::GlobalValue::InternalLinkage,
    944       addUnderscoredPrefixToName("_module_dtor"), &TheModule);
    945 
    946   llvm::BasicBlock *DtorEntryBB =
    947       llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
    948   CGBuilderTy DtorBuilder(CGM, Context);
    949   DtorBuilder.SetInsertPoint(DtorEntryBB);
    950 
    951   Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
    952                                              GpuBinaryHandle->getAlignment()));
    953   auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
    954   // There is only one HIP fat binary per linked module, however there are
    955   // multiple destructor functions. Make sure the fat binary is unregistered
    956   // only once.
    957   if (CGM.getLangOpts().HIP) {
    958     llvm::BasicBlock *IfBlock =
    959         llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    960     llvm::BasicBlock *ExitBlock =
    961         llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    962     llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    963     llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    964     DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
    965 
    966     DtorBuilder.SetInsertPoint(IfBlock);
    967     DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    968     DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    969     DtorBuilder.CreateBr(ExitBlock);
    970 
    971     DtorBuilder.SetInsertPoint(ExitBlock);
    972   } else {
    973     DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    974   }
    975   DtorBuilder.CreateRetVoid();
    976   return ModuleDtorFunc;
    977 }
    978 
    979 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
    980   return new CGNVCUDARuntime(CGM);
    981 }
    982 
    983 void CGNVCUDARuntime::internalizeDeviceSideVar(
    984     const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
    985   // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
    986   // global variables become internal definitions. These have to be internal in
    987   // order to prevent name conflicts with global host variables with the same
    988   // name in a different TUs.
    989   //
    990   // For -fgpu-rdc, the shadow variables should not be internalized because
    991   // they may be accessed by different TU.
    992   if (CGM.getLangOpts().GPURelocatableDeviceCode)
    993     return;
    994 
    995   // __shared__ variables are odd. Shadows do get created, but
    996   // they are not registered with the CUDA runtime, so they
    997   // can't really be used to access their device-side
    998   // counterparts. It's not clear yet whether it's nvcc's bug or
    999   // a feature, but we've got to do the same for compatibility.
   1000   if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
   1001       D->hasAttr<CUDASharedAttr>() ||
   1002       D->getType()->isCUDADeviceBuiltinSurfaceType() ||
   1003       D->getType()->isCUDADeviceBuiltinTextureType()) {
   1004     Linkage = llvm::GlobalValue::InternalLinkage;
   1005   }
   1006 }
   1007 
   1008 void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
   1009                                             llvm::GlobalVariable &GV) {
   1010   if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
   1011     // Shadow variables and their properties must be registered with CUDA
   1012     // runtime. Skip Extern global variables, which will be registered in
   1013     // the TU where they are defined.
   1014     //
   1015     // Don't register a C++17 inline variable. The local symbol can be
   1016     // discarded and referencing a discarded local symbol from outside the
   1017     // comdat (__cuda_register_globals) is disallowed by the ELF spec.
   1018     //
   1019     // HIP managed variables need to be always recorded in device and host
   1020     // compilations for transformation.
   1021     //
   1022     // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
   1023     // added to llvm.compiler-used, therefore they are safe to be registered.
   1024     if ((!D->hasExternalStorage() && !D->isInline()) ||
   1025         CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
   1026         D->hasAttr<HIPManagedAttr>()) {
   1027       registerDeviceVar(D, GV, !D->hasDefinition(),
   1028                         D->hasAttr<CUDAConstantAttr>());
   1029     }
   1030   } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
   1031              D->getType()->isCUDADeviceBuiltinTextureType()) {
   1032     // Builtin surfaces and textures and their template arguments are
   1033     // also registered with CUDA runtime.
   1034     const auto *TD = cast<ClassTemplateSpecializationDecl>(
   1035         D->getType()->castAs<RecordType>()->getDecl());
   1036     const TemplateArgumentList &Args = TD->getTemplateArgs();
   1037     if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
   1038       assert(Args.size() == 2 &&
   1039              "Unexpected number of template arguments of CUDA device "
   1040              "builtin surface type.");
   1041       auto SurfType = Args[1].getAsIntegral();
   1042       if (!D->hasExternalStorage())
   1043         registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
   1044     } else {
   1045       assert(Args.size() == 3 &&
   1046              "Unexpected number of template arguments of CUDA device "
   1047              "builtin texture type.");
   1048       auto TexType = Args[1].getAsIntegral();
   1049       auto Normalized = Args[2].getAsIntegral();
   1050       if (!D->hasExternalStorage())
   1051         registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
   1052                           Normalized.getZExtValue());
   1053     }
   1054   }
   1055 }
   1056 
   1057 // Transform managed variables to pointers to managed variables in device code.
   1058 // Each use of the original managed variable is replaced by a load from the
   1059 // transformed managed variable. The transformed managed variable contains
   1060 // the address of managed memory which will be allocated by the runtime.
   1061 void CGNVCUDARuntime::transformManagedVars() {
   1062   for (auto &&Info : DeviceVars) {
   1063     llvm::GlobalVariable *Var = Info.Var;
   1064     if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
   1065         Info.Flags.isManaged()) {
   1066       auto ManagedVar = new llvm::GlobalVariable(
   1067           CGM.getModule(), Var->getType(),
   1068           /*isConstant=*/false, Var->getLinkage(),
   1069           /*Init=*/Var->isDeclaration()
   1070               ? nullptr
   1071               : llvm::ConstantPointerNull::get(Var->getType()),
   1072           /*Name=*/"", /*InsertBefore=*/nullptr,
   1073           llvm::GlobalVariable::NotThreadLocal,
   1074           CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
   1075       ManagedVar->setDSOLocal(Var->isDSOLocal());
   1076       ManagedVar->setVisibility(Var->getVisibility());
   1077       ManagedVar->setExternallyInitialized(true);
   1078       replaceManagedVar(Var, ManagedVar);
   1079       ManagedVar->takeName(Var);
   1080       Var->setName(Twine(ManagedVar->getName()) + ".managed");
   1081       // Keep managed variables even if they are not used in device code since
   1082       // they need to be allocated by the runtime.
   1083       if (!Var->isDeclaration()) {
   1084         assert(!ManagedVar->isDeclaration());
   1085         CGM.addCompilerUsedGlobal(Var);
   1086         CGM.addCompilerUsedGlobal(ManagedVar);
   1087       }
   1088     }
   1089   }
   1090 }
   1091 
   1092 // Returns module constructor to be added.
   1093 llvm::Function *CGNVCUDARuntime::finalizeModule() {
   1094   if (CGM.getLangOpts().CUDAIsDevice) {
   1095     transformManagedVars();
   1096 
   1097     // Mark ODR-used device variables as compiler used to prevent it from being
   1098     // eliminated by optimization. This is necessary for device variables
   1099     // ODR-used by host functions. Sema correctly marks them as ODR-used no
   1100     // matter whether they are ODR-used by device or host functions.
   1101     //
   1102     // We do not need to do this if the variable has used attribute since it
   1103     // has already been added.
   1104     //
   1105     // Static device variables have been externalized at this point, therefore
   1106     // variables with LLVM private or internal linkage need not be added.
   1107     for (auto &&Info : DeviceVars) {
   1108       auto Kind = Info.Flags.getKind();
   1109       if (!Info.Var->isDeclaration() &&
   1110           !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
   1111           (Kind == DeviceVarFlags::Variable ||
   1112            Kind == DeviceVarFlags::Surface ||
   1113            Kind == DeviceVarFlags::Texture) &&
   1114           Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
   1115         CGM.addCompilerUsedGlobal(Info.Var);
   1116       }
   1117     }
   1118     return nullptr;
   1119   }
   1120   return makeModuleCtorFunction();
   1121 }
   1122 
   1123 llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
   1124                                                     GlobalDecl GD) {
   1125   auto Loc = KernelHandles.find(F);
   1126   if (Loc != KernelHandles.end())
   1127     return Loc->second;
   1128 
   1129   if (!CGM.getLangOpts().HIP) {
   1130     KernelHandles[F] = F;
   1131     KernelStubs[F] = F;
   1132     return F;
   1133   }
   1134 
   1135   auto *Var = new llvm::GlobalVariable(
   1136       TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
   1137       /*Initializer=*/nullptr,
   1138       CGM.getMangledName(
   1139           GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
   1140   Var->setAlignment(CGM.getPointerAlign().getAsAlign());
   1141   Var->setDSOLocal(F->isDSOLocal());
   1142   Var->setVisibility(F->getVisibility());
   1143   KernelHandles[F] = Var;
   1144   KernelStubs[Var] = F;
   1145   return Var;
   1146 }
   1147