Home | History | Annotate | Line # | Download | only in CodeGen
CGOpenMPRuntimeGPU.cpp revision 1.1.1.1
      1 //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This provides a generalized class for OpenMP runtime code generation
     10 // specialized by GPU targets NVPTX and AMDGCN.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "CGOpenMPRuntimeGPU.h"
     15 #include "CGOpenMPRuntimeNVPTX.h"
     16 #include "CodeGenFunction.h"
     17 #include "clang/AST/Attr.h"
     18 #include "clang/AST/DeclOpenMP.h"
     19 #include "clang/AST/StmtOpenMP.h"
     20 #include "clang/AST/StmtVisitor.h"
     21 #include "clang/Basic/Cuda.h"
     22 #include "llvm/ADT/SmallPtrSet.h"
     23 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
     24 #include "llvm/IR/IntrinsicsNVPTX.h"
     25 
     26 using namespace clang;
     27 using namespace CodeGen;
     28 using namespace llvm::omp;
     29 
     30 namespace {
     31 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
     32 class NVPTXActionTy final : public PrePostActionTy {
     33   llvm::FunctionCallee EnterCallee = nullptr;
     34   ArrayRef<llvm::Value *> EnterArgs;
     35   llvm::FunctionCallee ExitCallee = nullptr;
     36   ArrayRef<llvm::Value *> ExitArgs;
     37   bool Conditional = false;
     38   llvm::BasicBlock *ContBlock = nullptr;
     39 
     40 public:
     41   NVPTXActionTy(llvm::FunctionCallee EnterCallee,
     42                 ArrayRef<llvm::Value *> EnterArgs,
     43                 llvm::FunctionCallee ExitCallee,
     44                 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
     45       : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
     46         ExitArgs(ExitArgs), Conditional(Conditional) {}
     47   void Enter(CodeGenFunction &CGF) override {
     48     llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
     49     if (Conditional) {
     50       llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
     51       auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
     52       ContBlock = CGF.createBasicBlock("omp_if.end");
     53       // Generate the branch (If-stmt)
     54       CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
     55       CGF.EmitBlock(ThenBlock);
     56     }
     57   }
     58   void Done(CodeGenFunction &CGF) {
     59     // Emit the rest of blocks/branches
     60     CGF.EmitBranch(ContBlock);
     61     CGF.EmitBlock(ContBlock, true);
     62   }
     63   void Exit(CodeGenFunction &CGF) override {
     64     CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
     65   }
     66 };
     67 
     68 /// A class to track the execution mode when codegening directives within
     69 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
     70 /// to the target region and used by containing directives such as 'parallel'
     71 /// to emit optimized code.
     72 class ExecutionRuntimeModesRAII {
     73 private:
     74   CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
     75       CGOpenMPRuntimeGPU::EM_Unknown;
     76   CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
     77   bool SavedRuntimeMode = false;
     78   bool *RuntimeMode = nullptr;
     79 
     80 public:
     81   /// Constructor for Non-SPMD mode.
     82   ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
     83       : ExecMode(ExecMode) {
     84     SavedExecMode = ExecMode;
     85     ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
     86   }
     87   /// Constructor for SPMD mode.
     88   ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
     89                             bool &RuntimeMode, bool FullRuntimeMode)
     90       : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
     91     SavedExecMode = ExecMode;
     92     SavedRuntimeMode = RuntimeMode;
     93     ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
     94     RuntimeMode = FullRuntimeMode;
     95   }
     96   ~ExecutionRuntimeModesRAII() {
     97     ExecMode = SavedExecMode;
     98     if (RuntimeMode)
     99       *RuntimeMode = SavedRuntimeMode;
    100   }
    101 };
    102 
/// GPU Configuration:  This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code.  For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
  /// specific Grid Values like GV_Warp_Size, GV_Warp_Size_Log2,
  /// and GV_Warp_Size_Log2_Mask.

  /// Global memory alignment for performance.
  /// Used as the minimum alignment (presumably in bytes — matches the
  /// CharUnits quantities it is compared against) for globalized variables.
  GlobalMemoryAlignment = 128,

  /// Maximal size of the shared memory buffer.
  SharedMemorySize = 128,
};
    118 
    119 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
    120   RefExpr = RefExpr->IgnoreParens();
    121   if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    122     const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    123     while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
    124       Base = TempASE->getBase()->IgnoreParenImpCasts();
    125     RefExpr = Base;
    126   } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    127     const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    128     while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
    129       Base = TempOASE->getBase()->IgnoreParenImpCasts();
    130     while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
    131       Base = TempASE->getBase()->IgnoreParenImpCasts();
    132     RefExpr = Base;
    133   }
    134   RefExpr = RefExpr->IgnoreParenImpCasts();
    135   if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    136     return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
    137   const auto *ME = cast<MemberExpr>(RefExpr);
    138   return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
    139 }
    140 
    141 
/// Build an implicit record type holding storage for all globalized
/// variables (variables that escape their declaration context and must be
/// moved out of per-thread stack storage).
///
/// \param EscapedDecls          decls escaping into a parallel region; each
///                              field gets at least GlobalMemoryAlignment and
///                              is replicated \p BufSize times (array field).
/// \param EscapedDeclsForTeams  decls escaping into a teams region; a single
///                              field per decl, with the decl's own alignment.
/// \param MappedDeclsFields     out-map from each decl to its created field.
/// \param BufSize               replication factor for the parallel decls
///                              (the warp size at the call sites in view).
/// \returns the completed record, or nullptr if there is nothing to globalize.
static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields, int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  // Parallel-escaped decls: pad the alignment up to GlobalMemoryAlignment.
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  // Teams-escaped decls keep their natural alignment.
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  // Lay out the most-aligned fields first (stable, to keep decl order
  // deterministic among equal alignments).
  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
    return L.first > R.first;
  });

  // Build struct _globalized_locals_ty {
  //         /*  globalized vars  */[WarSize] align (max(decl_align,
  //         GlobalMemoryAlignment))
  //         /*  globalized vars  */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    // References are globalized as pointers to the referenced type.
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      // Teams decl: one plain field, carrying over any explicit alignment
      // attributes from the original declaration.
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      // Parallel decl: replicate the storage BufSize times and force the
      // padded alignment with an implicit aligned attribute.
      llvm::APInt ArraySize(32, BufSize);
      Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
                                    0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}
    220 
/// Get the list of variables that can escape their declaration context.
///
/// Statement visitor that walks a target region's body and collects every
/// local variable/parameter whose address may outlive (escape) its declaring
/// scope — via by-reference captures, address-of, array decay, or lvalue
/// call arguments. The collected decls are later laid out in a globalized
/// record (see buildRecordForGlobalizedVars).
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  // Escaped decls with statically known size.
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  // Escaped decls with variably modified (VLA-like) types.
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  // Escaped decls that are actually parameters captured by value.
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  // Lazily built record of all globalized variables (+ decl->field map).
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  // When true, any DeclRefExpr visited is treated as escaping (set while
  // under an address-of, array decay, or lvalue call argument).
  bool AllEscaped = false;
  // True while analyzing the inner parallel of a combined construct.
  bool IsForCombinedParallelRegion = false;

  /// Record \p VD as escaped, classifying it into one of the sets above.
  /// Filters out non-variables, declare-target decls, user-allocated decls,
  /// and (outside combined regions) captures that need no globalization.
  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Use user-specified allocation.
    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
      return;
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if need to capture the variable that was already captured by
        // value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          // Only private captures and map-of-pointer captures escape here.
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(Attr->getCaptureKind())) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  /// Analyze a declaration: reference-typed decls escape immediately, and
  /// initializers of non-parameter variables are visited (with AllEscaped
  /// reflecting whether the init is bound to a reference).
  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  /// Analyze the captures of an OpenMP captured statement. For combined
  /// parallel regions, a by-reference capture only escapes if the variable
  /// is firstprivate/lastprivate on the combined directive (\p Clauses).
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  /// Build the globalized-variables record from the collected decls. In a
  /// target-teams-distribute (TTD) region decls go into the teams set;
  /// otherwise into the parallel set (replicated WarpSize times).
  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  /// \p TeamsReductions are pre-seeded as escaped decls.
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    // Only by-reference captures can escape.
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    // Lambda by-reference captures escape their enclosing scope.
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    // Blocks: __block (by-ref) captures escape.
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    // Lvalue arguments may be bound to reference parameters, so anything
    // referenced inside them is treated as escaping.
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    // Taking the address of a variable makes it escape.
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    // Array-to-pointer decay exposes the array's address.
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    // An rvalue context cannot propagate "escaping" into its children.
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and used
  /// instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
    531 } // anonymous namespace
    532 
    533 /// Get the id of the warp in the block.
    534 /// We assume that the warp size is 32, which is always the case
    535 /// on the NVPTX device, to generate more efficient code.
    536 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
    537   CGBuilderTy &Bld = CGF.Builder;
    538   unsigned LaneIDBits =
    539       CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
    540   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
    541   return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
    542 }
    543 
    544 /// Get the id of the current lane in the Warp.
    545 /// We assume that the warp size is 32, which is always the case
    546 /// on the NVPTX device, to generate more efficient code.
    547 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
    548   CGBuilderTy &Bld = CGF.Builder;
    549   unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
    550       llvm::omp::GV_Warp_Size_Log2_Mask);
    551   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
    552   return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
    553                        "nvptx_lane_id");
    554 }
    555 
    556 /// Get the value of the thread_limit clause in the teams directive.
    557 /// For the 'generic' execution mode, the runtime encodes thread_limit in
    558 /// the launch parameters, always starting thread_limit+warpSize threads per
    559 /// CTA. The threads in the last warp are reserved for master execution.
    560 /// For the 'spmd' execution mode, all threads in a CTA are part of the team.
    561 static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
    562                                    bool IsInSPMDExecutionMode = false) {
    563   CGBuilderTy &Bld = CGF.Builder;
    564   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
    565   llvm::Value *ThreadLimit = nullptr;
    566   if (IsInSPMDExecutionMode)
    567     ThreadLimit = RT.getGPUNumThreads(CGF);
    568   else {
    569     llvm::Value *GPUNumThreads = RT.getGPUNumThreads(CGF);
    570     llvm::Value *GPUWarpSize = RT.getGPUWarpSize(CGF);
    571     ThreadLimit = Bld.CreateNUWSub(GPUNumThreads, GPUWarpSize, "thread_limit");
    572   }
    573   assert(ThreadLimit != nullptr && "Expected non-null ThreadLimit");
    574   return ThreadLimit;
    575 }
    576 
    577 /// Get the thread id of the OMP master thread.
    578 /// The master thread id is the first thread (lane) of the last warp in the
    579 /// GPU block.  Warp size is assumed to be some power of 2.
    580 /// Thread id is 0 indexed.
    581 /// E.g: If NumThreads is 33, master id is 32.
    582 ///      If NumThreads is 64, master id is 32.
    583 ///      If NumThreads is 1024, master id is 992.
    584 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
    585   CGBuilderTy &Bld = CGF.Builder;
    586   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
    587   llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
    588   // We assume that the warp size is a power of 2.
    589   llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));
    590 
    591   llvm::Value *NumThreadsSubOne = Bld.CreateNUWSub(NumThreads, Bld.getInt32(1));
    592   return Bld.CreateAnd(NumThreadsSubOne, Bld.CreateNot(Mask), "master_tid");
    593 }
    594 
// Records the state needed to emit a worker function for a non-SPMD target
// region: arranges a nullary function signature and immediately creates the
// (placeholder-named) worker function in the module.
CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
      Loc(Loc) {
  createWorkerFunction(CGM);
}
    601 
// Create the worker function: internal linkage, no arguments, marked
// non-recursive. The "_worker" name is a placeholder (see comment below).
void CGOpenMPRuntimeGPU::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
  // Workers are driven by a state machine, not by recursion; telling LLVM
  // enables better optimization.
  WorkerFn->setDoesNotRecurse();
}
    612 
/// Return the execution mode (SPMD / non-SPMD / unknown) currently in effect
/// for the target region being emitted (maintained by
/// ExecutionRuntimeModesRAII).
CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
  return CurrentExecutionMode;
}
    617 
    618 static CGOpenMPRuntimeGPU::DataSharingMode
    619 getDataSharingMode(CodeGenModule &CGM) {
    620   return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
    621                                           : CGOpenMPRuntimeGPU::Generic;
    622 }
    623 
/// Check for inner (nested) SPMD construct, if any.
///
/// Looks through the innermost captured statement of \p D for a nested
/// directive that makes the region SPMD-executable: a parallel directive
/// directly under 'target', a parallel directive under a 'teams' nested in
/// 'target', or a parallel directive under 'target teams'. Only OMPD_target
/// and OMPD_target_teams are expected here; any other directive kind is a
/// caller bug (llvm_unreachable).
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  // The interesting case is a body that is a single nested directive
  // (possibly wrapped in a compound statement).
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      // target { parallel ... } is SPMD.
      if (isOpenMPParallelDirective(DKind))
        return true;
      if (DKind == OMPD_teams) {
        // target { teams { parallel ... } } is SPMD as well: peel the teams
        // level and look for a single nested parallel directive.
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind);
    // All remaining kinds are not valid outer directives for this query.
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
    726 
    727 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
    728                                       const OMPExecutableDirective &D) {
    729   OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
    730   switch (DirectiveKind) {
    731   case OMPD_target:
    732   case OMPD_target_teams:
    733     return hasNestedSPMDDirective(Ctx, D);
    734   case OMPD_target_parallel:
    735   case OMPD_target_parallel_for:
    736   case OMPD_target_parallel_for_simd:
    737   case OMPD_target_teams_distribute_parallel_for:
    738   case OMPD_target_teams_distribute_parallel_for_simd:
    739   case OMPD_target_simd:
    740   case OMPD_target_teams_distribute_simd:
    741     return true;
    742   case OMPD_target_teams_distribute:
    743     return false;
    744   case OMPD_parallel:
    745   case OMPD_for:
    746   case OMPD_parallel_for:
    747   case OMPD_parallel_master:
    748   case OMPD_parallel_sections:
    749   case OMPD_for_simd:
    750   case OMPD_parallel_for_simd:
    751   case OMPD_cancel:
    752   case OMPD_cancellation_point:
    753   case OMPD_ordered:
    754   case OMPD_threadprivate:
    755   case OMPD_allocate:
    756   case OMPD_task:
    757   case OMPD_simd:
    758   case OMPD_sections:
    759   case OMPD_section:
    760   case OMPD_single:
    761   case OMPD_master:
    762   case OMPD_critical:
    763   case OMPD_taskyield:
    764   case OMPD_barrier:
    765   case OMPD_taskwait:
    766   case OMPD_taskgroup:
    767   case OMPD_atomic:
    768   case OMPD_flush:
    769   case OMPD_depobj:
    770   case OMPD_scan:
    771   case OMPD_teams:
    772   case OMPD_target_data:
    773   case OMPD_target_exit_data:
    774   case OMPD_target_enter_data:
    775   case OMPD_distribute:
    776   case OMPD_distribute_simd:
    777   case OMPD_distribute_parallel_for:
    778   case OMPD_distribute_parallel_for_simd:
    779   case OMPD_teams_distribute:
    780   case OMPD_teams_distribute_simd:
    781   case OMPD_teams_distribute_parallel_for:
    782   case OMPD_teams_distribute_parallel_for_simd:
    783   case OMPD_target_update:
    784   case OMPD_declare_simd:
    785   case OMPD_declare_variant:
    786   case OMPD_begin_declare_variant:
    787   case OMPD_end_declare_variant:
    788   case OMPD_declare_target:
    789   case OMPD_end_declare_target:
    790   case OMPD_declare_reduction:
    791   case OMPD_declare_mapper:
    792   case OMPD_taskloop:
    793   case OMPD_taskloop_simd:
    794   case OMPD_master_taskloop:
    795   case OMPD_master_taskloop_simd:
    796   case OMPD_parallel_master_taskloop:
    797   case OMPD_parallel_master_taskloop_simd:
    798   case OMPD_requires:
    799   case OMPD_unknown:
    800   default:
    801     break;
    802   }
    803   llvm_unreachable(
    804       "Unknown programming model for OpenMP directive on NVPTX target.");
    805 }
    806 
    807 /// Check if the directive is loops based and has schedule clause at all or has
    808 /// static scheduling.
    809 static bool hasStaticScheduling(const OMPExecutableDirective &D) {
    810   assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
    811          isOpenMPLoopDirective(D.getDirectiveKind()) &&
    812          "Expected loop-based directive.");
    813   return !D.hasClausesOfKind<OMPOrderedClause>() &&
    814          (!D.hasClausesOfKind<OMPScheduleClause>() ||
    815           llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
    816                        [](const OMPScheduleClause *C) {
    817                          return C->getScheduleKind() == OMPC_SCHEDULE_static;
    818                        }));
    819 }
    820 
    821 /// Check for inner (nested) lightweight runtime construct, if any
    822 static bool hasNestedLightweightDirective(ASTContext &Ctx,
    823                                           const OMPExecutableDirective &D) {
    824   assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
    825   const auto *CS = D.getInnermostCapturedStmt();
    826   const auto *Body =
    827       CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
    828   const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
    829 
    830   if (const auto *NestedDir =
    831           dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    832     OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    833     switch (D.getDirectiveKind()) {
    834     case OMPD_target:
    835       if (isOpenMPParallelDirective(DKind) &&
    836           isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
    837           hasStaticScheduling(*NestedDir))
    838         return true;
    839       if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
    840         return true;
    841       if (DKind == OMPD_parallel) {
    842         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
    843             /*IgnoreCaptured=*/true);
    844         if (!Body)
    845           return false;
    846         ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
    847         if (const auto *NND =
    848                 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    849           DKind = NND->getDirectiveKind();
    850           if (isOpenMPWorksharingDirective(DKind) &&
    851               isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
    852             return true;
    853         }
    854       } else if (DKind == OMPD_teams) {
    855         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
    856             /*IgnoreCaptured=*/true);
    857         if (!Body)
    858           return false;
    859         ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
    860         if (const auto *NND =
    861                 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    862           DKind = NND->getDirectiveKind();
    863           if (isOpenMPParallelDirective(DKind) &&
    864               isOpenMPWorksharingDirective(DKind) &&
    865               isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
    866             return true;
    867           if (DKind == OMPD_parallel) {
    868             Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
    869                 /*IgnoreCaptured=*/true);
    870             if (!Body)
    871               return false;
    872             ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
    873             if (const auto *NND =
    874                     dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    875               DKind = NND->getDirectiveKind();
    876               if (isOpenMPWorksharingDirective(DKind) &&
    877                   isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
    878                 return true;
    879             }
    880           }
    881         }
    882       }
    883       return false;
    884     case OMPD_target_teams:
    885       if (isOpenMPParallelDirective(DKind) &&
    886           isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
    887           hasStaticScheduling(*NestedDir))
    888         return true;
    889       if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
    890         return true;
    891       if (DKind == OMPD_parallel) {
    892         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
    893             /*IgnoreCaptured=*/true);
    894         if (!Body)
    895           return false;
    896         ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
    897         if (const auto *NND =
    898                 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    899           DKind = NND->getDirectiveKind();
    900           if (isOpenMPWorksharingDirective(DKind) &&
    901               isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
    902             return true;
    903         }
    904       }
    905       return false;
    906     case OMPD_target_parallel:
    907       if (DKind == OMPD_simd)
    908         return true;
    909       return isOpenMPWorksharingDirective(DKind) &&
    910              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    911     case OMPD_target_teams_distribute:
    912     case OMPD_target_simd:
    913     case OMPD_target_parallel_for:
    914     case OMPD_target_parallel_for_simd:
    915     case OMPD_target_teams_distribute_simd:
    916     case OMPD_target_teams_distribute_parallel_for:
    917     case OMPD_target_teams_distribute_parallel_for_simd:
    918     case OMPD_parallel:
    919     case OMPD_for:
    920     case OMPD_parallel_for:
    921     case OMPD_parallel_master:
    922     case OMPD_parallel_sections:
    923     case OMPD_for_simd:
    924     case OMPD_parallel_for_simd:
    925     case OMPD_cancel:
    926     case OMPD_cancellation_point:
    927     case OMPD_ordered:
    928     case OMPD_threadprivate:
    929     case OMPD_allocate:
    930     case OMPD_task:
    931     case OMPD_simd:
    932     case OMPD_sections:
    933     case OMPD_section:
    934     case OMPD_single:
    935     case OMPD_master:
    936     case OMPD_critical:
    937     case OMPD_taskyield:
    938     case OMPD_barrier:
    939     case OMPD_taskwait:
    940     case OMPD_taskgroup:
    941     case OMPD_atomic:
    942     case OMPD_flush:
    943     case OMPD_depobj:
    944     case OMPD_scan:
    945     case OMPD_teams:
    946     case OMPD_target_data:
    947     case OMPD_target_exit_data:
    948     case OMPD_target_enter_data:
    949     case OMPD_distribute:
    950     case OMPD_distribute_simd:
    951     case OMPD_distribute_parallel_for:
    952     case OMPD_distribute_parallel_for_simd:
    953     case OMPD_teams_distribute:
    954     case OMPD_teams_distribute_simd:
    955     case OMPD_teams_distribute_parallel_for:
    956     case OMPD_teams_distribute_parallel_for_simd:
    957     case OMPD_target_update:
    958     case OMPD_declare_simd:
    959     case OMPD_declare_variant:
    960     case OMPD_begin_declare_variant:
    961     case OMPD_end_declare_variant:
    962     case OMPD_declare_target:
    963     case OMPD_end_declare_target:
    964     case OMPD_declare_reduction:
    965     case OMPD_declare_mapper:
    966     case OMPD_taskloop:
    967     case OMPD_taskloop_simd:
    968     case OMPD_master_taskloop:
    969     case OMPD_master_taskloop_simd:
    970     case OMPD_parallel_master_taskloop:
    971     case OMPD_parallel_master_taskloop_simd:
    972     case OMPD_requires:
    973     case OMPD_unknown:
    974     default:
    975       llvm_unreachable("Unexpected directive.");
    976     }
    977   }
    978 
    979   return false;
    980 }
    981 
    982 /// Checks if the construct supports lightweight runtime. It must be SPMD
    983 /// construct + inner loop-based construct with static scheduling.
    984 static bool supportsLightweightRuntime(ASTContext &Ctx,
    985                                        const OMPExecutableDirective &D) {
    986   if (!supportsSPMDExecutionMode(Ctx, D))
    987     return false;
    988   OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
    989   switch (DirectiveKind) {
    990   case OMPD_target:
    991   case OMPD_target_teams:
    992   case OMPD_target_parallel:
    993     return hasNestedLightweightDirective(Ctx, D);
    994   case OMPD_target_parallel_for:
    995   case OMPD_target_parallel_for_simd:
    996   case OMPD_target_teams_distribute_parallel_for:
    997   case OMPD_target_teams_distribute_parallel_for_simd:
    998     // (Last|First)-privates must be shared in parallel region.
    999     return hasStaticScheduling(D);
   1000   case OMPD_target_simd:
   1001   case OMPD_target_teams_distribute_simd:
   1002     return true;
   1003   case OMPD_target_teams_distribute:
   1004     return false;
   1005   case OMPD_parallel:
   1006   case OMPD_for:
   1007   case OMPD_parallel_for:
   1008   case OMPD_parallel_master:
   1009   case OMPD_parallel_sections:
   1010   case OMPD_for_simd:
   1011   case OMPD_parallel_for_simd:
   1012   case OMPD_cancel:
   1013   case OMPD_cancellation_point:
   1014   case OMPD_ordered:
   1015   case OMPD_threadprivate:
   1016   case OMPD_allocate:
   1017   case OMPD_task:
   1018   case OMPD_simd:
   1019   case OMPD_sections:
   1020   case OMPD_section:
   1021   case OMPD_single:
   1022   case OMPD_master:
   1023   case OMPD_critical:
   1024   case OMPD_taskyield:
   1025   case OMPD_barrier:
   1026   case OMPD_taskwait:
   1027   case OMPD_taskgroup:
   1028   case OMPD_atomic:
   1029   case OMPD_flush:
   1030   case OMPD_depobj:
   1031   case OMPD_scan:
   1032   case OMPD_teams:
   1033   case OMPD_target_data:
   1034   case OMPD_target_exit_data:
   1035   case OMPD_target_enter_data:
   1036   case OMPD_distribute:
   1037   case OMPD_distribute_simd:
   1038   case OMPD_distribute_parallel_for:
   1039   case OMPD_distribute_parallel_for_simd:
   1040   case OMPD_teams_distribute:
   1041   case OMPD_teams_distribute_simd:
   1042   case OMPD_teams_distribute_parallel_for:
   1043   case OMPD_teams_distribute_parallel_for_simd:
   1044   case OMPD_target_update:
   1045   case OMPD_declare_simd:
   1046   case OMPD_declare_variant:
   1047   case OMPD_begin_declare_variant:
   1048   case OMPD_end_declare_variant:
   1049   case OMPD_declare_target:
   1050   case OMPD_end_declare_target:
   1051   case OMPD_declare_reduction:
   1052   case OMPD_declare_mapper:
   1053   case OMPD_taskloop:
   1054   case OMPD_taskloop_simd:
   1055   case OMPD_master_taskloop:
   1056   case OMPD_master_taskloop_simd:
   1057   case OMPD_parallel_master_taskloop:
   1058   case OMPD_parallel_master_taskloop_simd:
   1059   case OMPD_requires:
   1060   case OMPD_unknown:
   1061   default:
   1062     break;
   1063   }
   1064   llvm_unreachable(
   1065       "Unknown programming model for OpenMP directive on NVPTX target.");
   1066 }
   1067 
// Emit a target region entry point in non-SPMD (generic) mode: the entry
// header/footer split threads into one master and a pool of workers, and a
// separate worker function (emitted at the end) dispatches parallel work.
void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getBeginLoc());
  // Per-kernel state: the list of outlined parallel functions the worker loop
  // dispatches on (see emitWorkerLoop) must start empty for this kernel.
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    CGOpenMPRuntimeGPU::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         CGOpenMPRuntimeGPU::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    // Runs before the region body: emit the master/worker entry header.
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    // Runs after the region body: emit termination and the exit block.
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  // Lazily create (once) the shared-address-space pointer used for the
  // statically globalized records.
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::UndefValue::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function
  emitWorkerFunction(WST);
}
   1127 
// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  // Entry CFG: threads first split into workers vs. master candidates; a
  // second check then selects the single master thread.
  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Threads whose ID is below the thread limit are workers; the rest proceed
  // to the master check.
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *GPUThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *ThreadLimit = getThreadLimit(CGF);
  llvm::Value *IsWorker = Bld.CreateICmpULT(GPUThreadID, ThreadLimit);
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  // Workers run the worker function, then jump to the exit block.
  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  // Only the thread whose ID equals the master thread ID continues into the
  // master block; all other non-worker threads exit.
  CGF.EmitBlock(MasterCheckBB);
  GPUThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *MasterThreadID = getMasterThreadID(CGF);
  llvm::Value *IsMaster = Bld.CreateICmpEQ(GPUThreadID, MasterThreadID);
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_init),
                      Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack));

  // Emit the generic-mode prolog for globalized variables of this region.
  emitGenericVarsProlog(CGF, WST.Loc);
}
   1173 
// Close the master (sequential) region of a non-SPMD kernel: emit the
// runtime de-initialization, release the workers, and emit the exit block.
void CGOpenMPRuntimeGPU::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  // Make sure there is an exit block to branch to.
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_deinit),
                      Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  // Reset so a stale block is not reused for a later region.
  EST.ExitBB = nullptr;
}
   1203 
// Emit a target region entry point in SPMD mode. The full runtime is
// required when forced by -fopenmp-cuda-force-full-runtime or when the
// region does not support the lightweight runtime.
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(
      CurrentExecutionMode, RequiresFullRuntime,
      CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
          !supportsLightweightRuntime(CGM.getContext(), D));
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU &RT;
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
                         CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    // Runs before the region body: SPMD kernel initialization.
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryHeader(CGF, EST, D);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    // Runs after the region body: SPMD kernel de-initialization.
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitSPMDEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  // Lazily create (once) the shared-address-space pointer used for the
  // statically globalized records.
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::UndefValue::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}
   1254 
// Emit the SPMD-mode kernel prologue: runtime initialization and the branch
// into the execute block where the region body is emitted.
void CGOpenMPRuntimeGPU::emitSPMDEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the SPMD kernel, passing the thread limit and whether the
  // full runtime is required.
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/
                         Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init),
                      Args);

  if (RequiresFullRuntime) {
    // For data sharing, we need to initialize the stack.
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
        CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd));
  }

  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);

  IsInTargetMasterThreadRegion = true;
}
   1283 
// Emit the SPMD-mode kernel epilogue: runtime de-initialization and the
// exit block.
void CGOpenMPRuntimeGPU::emitSPMDEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  // Make sure there is an exit block to branch to.
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
                         CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_deinit_v2),
                      Args);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  // Reset so a stale block is not reused for a later region.
  EST.ExitBB = nullptr;
}
   1308 
   1309 // Create a unique global variable to indicate the execution mode of this target
   1310 // region. The execution mode is either 'generic', or 'spmd' depending on the
   1311 // target directive. This variable is picked up by the offload library to setup
   1312 // the device appropriately before kernel launch. If the execution mode is
   1313 // 'generic', the runtime reserves one warp for the master, otherwise, all
   1314 // warps participate in parallel work.
   1315 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
   1316                                      bool Mode) {
   1317   auto *GVMode =
   1318       new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
   1319                                llvm::GlobalValue::WeakAnyLinkage,
   1320                                llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
   1321                                Twine(Name, "_exec_mode"));
   1322   CGM.addCompilerUsedGlobal(GVMode);
   1323 }
   1324 
   1325 void CGOpenMPRuntimeGPU::emitWorkerFunction(WorkerFunctionState &WST) {
   1326   ASTContext &Ctx = CGM.getContext();
   1327 
   1328   CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
   1329   CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
   1330                     WST.Loc, WST.Loc);
   1331   emitWorkerLoop(CGF, WST);
   1332   CGF.FinishFunction();
   1333 }
   1334 
// Emit the worker state machine: wait for work at a barrier, dispatch the
// requested outlined parallel function, barrier again, repeat until the
// runtime signals termination.
void CGOpenMPRuntimeGPU::emitWorkerLoop(CodeGenFunction &CGF,
                                        WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers.  The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads.  The activated workers load the variable arguments and
  // execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work
  syncCTAThreads(CGF);

  // Local slots: work_fn receives the outlined function pointer from
  // __kmpc_kernel_parallel (its address is passed to the call below);
  // exec_status receives the call's return value and gates participation.
  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer()};
  llvm::Value *Ret =
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_kernel_parallel),
                          Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);
  // Skip initialization.
  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);

  // Process work items: outlined parallel functions.
  // Compare the received function pointer against every outlined parallel
  // function known for this kernel and call the match directly.
  for (llvm::Function *W : Work) {
    // Try to match this outlined function.
    llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes two arguments:
    //   - the parallelism level;
    //   - the thread ID;
    emitCall(CGF, WST.Loc, W,
             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }
  // Default case: call to outlined function through pointer if the target
  // region makes a declare target call that may contain an orphaned parallel
  // directive.
  auto *ParallelFnTy =
      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
                              /*isVarArg=*/false);
  llvm::Value *WorkFnCast =
      Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
  // Insert call to work function via shared wrapper. The shared
  // wrapper takes two arguments:
  //   - the parallelism level;
  //   - the thread ID;
  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
  // Go to end of parallel region.
  CGF.EmitBranch(TerminateBB);

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_end_parallel),
                      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
  // Skip initialization.
  clearLocThreadIdInsertPt(CGF);
}
   1455 
   1456 void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
   1457                                               llvm::Constant *Addr,
   1458                                               uint64_t Size, int32_t,
   1459                                               llvm::GlobalValue::LinkageTypes) {
   1460   // TODO: Add support for global variables on the device after declare target
   1461   // support.
   1462   if (!isa<llvm::Function>(Addr))
   1463     return;
   1464   llvm::Module &M = CGM.getModule();
   1465   llvm::LLVMContext &Ctx = CGM.getLLVMContext();
   1466 
   1467   // Get "nvvm.annotations" metadata node
   1468   llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
   1469 
   1470   llvm::Metadata *MDVals[] = {
   1471       llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
   1472       llvm::ConstantAsMetadata::get(
   1473           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
   1474   // Append metadata to nvvm.annotations
   1475   MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
   1476 }
   1477 
   1478 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
   1479     const OMPExecutableDirective &D, StringRef ParentName,
   1480     llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
   1481     bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
   1482   if (!IsOffloadEntry) // Nothing to do.
   1483     return;
   1484 
   1485   assert(!ParentName.empty() && "Invalid target region parent name!");
   1486 
   1487   bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
   1488   if (Mode)
   1489     emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
   1490                    CodeGen);
   1491   else
   1492     emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
   1493                       CodeGen);
   1494 
   1495   setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
   1496 }
   1497 
namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Enum for accessing the reserved_2 field of the ident_t struct.
enum ModeFlagsTy : unsigned {
  /// Bit set to 1 when in SPMD mode.
  KMP_IDENT_SPMD_MODE = 0x01,
  /// Bit set to 1 when a simplified runtime is used.
  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
};

/// Special mode Undefined. Is the combination of Non-SPMD mode (SPMD bit
/// cleared) + SimpleRuntime (simple-runtime bit set), i.e. value 0x02.
static const ModeFlagsTy UndefinedMode =
    (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
} // anonymous namespace
   1513 
   1514 unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
   1515   switch (getExecutionMode()) {
   1516   case EM_SPMD:
   1517     if (requiresFullRuntime())
   1518       return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
   1519     return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
   1520   case EM_NonSPMD:
   1521     assert(requiresFullRuntime() && "Expected full runtime.");
   1522     return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
   1523   case EM_Unknown:
   1524     return UndefinedMode;
   1525   }
   1526   llvm_unreachable("Unknown flags are requested.");
   1527 }
   1528 
// Constructor: the "_" / "$" arguments are the name separators passed to the
// base CGOpenMPRuntime. This specialization is valid only when compiling the
// device side of an OpenMP offload program.
CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM, "_", "$") {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}
   1534 
   1535 void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
   1536                                               ProcBindKind ProcBind,
   1537                                               SourceLocation Loc) {
   1538   // Do nothing in case of SPMD mode and L0 parallel.
   1539   if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
   1540     return;
   1541 
   1542   CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
   1543 }
   1544 
   1545 void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
   1546                                                 llvm::Value *NumThreads,
   1547                                                 SourceLocation Loc) {
   1548   // Do nothing in case of SPMD mode and L0 parallel.
   1549   if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
   1550     return;
   1551 
   1552   CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
   1553 }
   1554 
// Intentionally a no-op on the device: no code is emitted here for the
// num_teams/thread_limit clauses (presumably these values are consumed when
// the kernel is launched from the host side — confirm against the host
// runtime emission).
void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}
   1559 
   1560 llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
   1561     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
   1562     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
   1563   // Emit target region as a standalone region.
   1564   class NVPTXPrePostActionTy : public PrePostActionTy {
   1565     bool &IsInParallelRegion;
   1566     bool PrevIsInParallelRegion;
   1567 
   1568   public:
   1569     NVPTXPrePostActionTy(bool &IsInParallelRegion)
   1570         : IsInParallelRegion(IsInParallelRegion) {}
   1571     void Enter(CodeGenFunction &CGF) override {
   1572       PrevIsInParallelRegion = IsInParallelRegion;
   1573       IsInParallelRegion = true;
   1574     }
   1575     void Exit(CodeGenFunction &CGF) override {
   1576       IsInParallelRegion = PrevIsInParallelRegion;
   1577     }
   1578   } Action(IsInParallelRegion);
   1579   CodeGen.setAction(Action);
   1580   bool PrevIsInTTDRegion = IsInTTDRegion;
   1581   IsInTTDRegion = false;
   1582   bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
   1583   IsInTargetMasterThreadRegion = false;
   1584   auto *OutlinedFun =
   1585       cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
   1586           D, ThreadIDVar, InnermostKind, CodeGen));
   1587   IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
   1588   IsInTTDRegion = PrevIsInTTDRegion;
   1589   if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
   1590       !IsInParallelRegion) {
   1591     llvm::Function *WrapperFun =
   1592         createParallelDataSharingWrapper(OutlinedFun, D);
   1593     WrapperFunctionsMap[OutlinedFun] = WrapperFun;
   1594   }
   1595 
   1596   return OutlinedFun;
   1597 }
   1598 
   1599 /// Get list of lastprivate variables from the teams distribute ... or
   1600 /// teams {distribute ...} directives.
   1601 static void
   1602 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
   1603                              llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
   1604   assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
   1605          "expected teams directive.");
   1606   const OMPExecutableDirective *Dir = &D;
   1607   if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
   1608     if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
   1609             Ctx,
   1610             D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
   1611                 /*IgnoreCaptured=*/true))) {
   1612       Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
   1613       if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
   1614         Dir = nullptr;
   1615     }
   1616   }
   1617   if (!Dir)
   1618     return;
   1619   for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
   1620     for (const Expr *E : C->getVarRefs())
   1621       Vars.push_back(getPrivateItem(E));
   1622   }
   1623 }
   1624 
   1625 /// Get list of reduction variables from the teams ... directives.
   1626 static void
   1627 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
   1628                       llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
   1629   assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
   1630          "expected teams directive.");
   1631   for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
   1632     for (const Expr *E : C->privates())
   1633       Vars.push_back(getPrivateItem(E));
   1634   }
   1635 }
   1636 
// Emit the outlined function for a teams region. Depending on the execution
// mode, variables that must be shared across the team (lastprivates of a
// nested distribute in SPMD mode, reduction privates in generic mode) are
// collected and either globalized here or recorded for later handling.
llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
  // Globalize team reductions variable unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    // SPMD mode: build a record type holding the distribute lastprivates so
    // they can live in globalized storage shared by the team.
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), llvm::None, LastPrivatesReductions,
          MappedDeclsFields, WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    // Generic mode: remember the captured teams decl and its reduction
    // variables; the globalization happens later.
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
    std::swap(TeamAndReductions.second, LastPrivatesReductions);
  }

  // Emit target region as a standalone region. The pre-action registers the
  // globalized record (if any) for the emitted function and emits the
  // generic-vars prolog; the post-action emits the matching epilog.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        // Record the globalized variables for CGF.CurFn so the prolog below
        // can allocate them; each mapped decl is one-per-team.
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().GlobalRecord = GlobalizedRD;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(),
                                     MappedVarData(Pair.getSecond(),
                                                   /*IsOnePerTeam=*/true)));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}
   1707 
// Emit the prolog that moves this function's globalized (escaped) variables
// into team-visible storage. Three allocation strategies are used depending
// on context: a runtime SPMD-vs-generic check with a PHI over the result, a
// statically laid-out per-team buffer obtained from the runtime, or a plain
// runtime stack push. WithSPMDCheck forces the runtime-check path even when
// the execution mode is known.
void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                                 SourceLocation Loc,
                                                 bool WithSPMDCheck) {
  // Proceed only in generic data-sharing mode or SPMD execution mode.
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  // Nothing to do for functions without registered globalized decls.
  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;
  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
    QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
    QualType SecGlobalRecTy;

    // Recover pointer to this function's global record. The runtime will
    // handle the specifics of the allocation of the memory.
    // Use actual memory size of the record including the padding
    // for alignment purposes.
    unsigned Alignment =
        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
    unsigned GlobalRecordSize =
        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);

    llvm::PointerType *GlobalRecPtrTy =
        CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
    llvm::Value *GlobalRecCastAddr;
    llvm::Value *IsTTD = nullptr;
    if (!IsInTTDRegion &&
        (WithSPMDCheck ||
         getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
      // Strategy 1: the execution mode is not known at compile time (or a
      // check was requested). Branch on __kmpc_is_spmd_exec_mode: in SPMD
      // mode no allocation is needed (null record pointer); in generic mode
      // push a record on the runtime's data-sharing stack. The two results
      // are merged with a PHI.
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
      llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
      llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
      if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
        // A secondary record exists: compute IsTTD (parallel level == 0) so
        // the right record size/layout can be selected at runtime.
        llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
        llvm::Value *ThreadID = getThreadID(CGF, Loc);
        llvm::Value *PL = CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
                                                  OMPRTL___kmpc_parallel_level),
            {RTLoc, ThreadID});
        IsTTD = Bld.CreateIsNull(PL);
      }
      llvm::Value *IsSPMD = Bld.CreateIsNotNull(
          CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode)));
      Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(SPMDBB);
      Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
                               CharUnits::fromQuantity(Alignment));
      CGF.EmitBranch(ExitBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(NonSPMDBB);
      llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
      if (const RecordDecl *SecGlobalizedVarsRecord =
              I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
        SecGlobalRecTy =
            CGM.getContext().getRecordType(SecGlobalizedVarsRecord);

        // Recover pointer to this function's global record. The runtime will
        // handle the specifics of the allocation of the memory.
        // Use actual memory size of the record including the padding
        // for alignment purposes.
        unsigned Alignment =
            CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
        unsigned GlobalRecordSize =
            CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
        GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
        // Pick the secondary record's size when at parallel level 0.
        Size = Bld.CreateSelect(
            IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
      }
      // TODO: allow the usage of shared memory to be controlled by
      // the user, for now, default to global.
      llvm::Value *GlobalRecordSizeArg[] = {
          Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
          OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
          GlobalRecordSizeArg);
      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
          GlobalRecValue, GlobalRecPtrTy);
      CGF.EmitBlock(ExitBB);
      // Merge the SPMD (null) and generic (pushed record) pointers.
      auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
                                /*NumReservedValues=*/2, "_select_stack");
      Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
      Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
      GlobalRecCastAddr = Phi;
      I->getSecond().GlobalRecordAddr = Phi;
      I->getSecond().IsInSPMDModeFlag = IsSPMD;
    } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
      // Strategy 2: inside a target/teams/distribute region — place the
      // record at a statically computed offset inside the per-team static
      // memory buffer obtained from the runtime.
      assert(GlobalizedRecords.back().Records.size() < 2 &&
             "Expected less than 2 globalized records: one for target and one "
             "for teams.");
      // Compute the aligned offset of this record past the ones already
      // registered for the current kernel.
      unsigned Offset = 0;
      for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
        QualType RDTy = CGM.getContext().getRecordType(RD);
        unsigned Alignment =
            CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
        unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
        Offset =
            llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
      }
      unsigned Alignment =
          CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
      Offset = llvm::alignTo(Offset, Alignment);
      GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
      ++GlobalizedRecords.back().RegionCounter;
      if (GlobalizedRecords.back().Records.size() == 1) {
        // First record for this kernel: emit the globals describing the
        // static buffer and call the runtime to obtain it.
        assert(KernelStaticGlobalized &&
               "Kernel static pointer must be initialized already.");
        auto *UseSharedMemory = new llvm::GlobalVariable(
            CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
            llvm::GlobalValue::InternalLinkage, nullptr,
            "_openmp_static_kernel$is_shared");
        UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
        QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
            /*DestWidth=*/16, /*Signed=*/0);
        llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
            Address(UseSharedMemory,
                    CGM.getContext().getTypeAlignInChars(Int16Ty)),
            /*Volatile=*/false, Int16Ty, Loc);
        auto *StaticGlobalized = new llvm::GlobalVariable(
            CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
            llvm::GlobalValue::CommonLinkage, nullptr);
        auto *RecSize = new llvm::GlobalVariable(
            CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
            llvm::GlobalValue::InternalLinkage, nullptr,
            "_openmp_static_kernel$size");
        RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
        llvm::Value *Ld = CGF.EmitLoadOfScalar(
            Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
            CGM.getContext().getSizeType(), Loc);
        llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
            KernelStaticGlobalized, CGM.VoidPtrPtrTy);
        llvm::Value *GlobalRecordSizeArg[] = {
            llvm::ConstantInt::get(
                CGM.Int16Ty,
                getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
            StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
        CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(
                CGM.getModule(), OMPRTL___kmpc_get_team_static_memory),
            GlobalRecordSizeArg);
        GlobalizedRecords.back().Buffer = StaticGlobalized;
        GlobalizedRecords.back().RecSize = RecSize;
        GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
        GlobalizedRecords.back().Loc = Loc;
      }
      assert(KernelStaticGlobalized && "Global address must be set already.");
      // GEP into the static buffer at the computed offset.
      Address FrameAddr = CGF.EmitLoadOfPointer(
          Address(KernelStaticGlobalized, CGM.getPointerAlign()),
          CGM.getContext()
              .getPointerType(CGM.getContext().VoidPtrTy)
              .castAs<PointerType>());
      llvm::Value *GlobalRecValue =
          Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
      I->getSecond().GlobalRecordAddr = GlobalRecValue;
      I->getSecond().IsInSPMDModeFlag = nullptr;
      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
          GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
    } else {
      // Strategy 3: the mode is known — push the record on the runtime's
      // data-sharing stack directly.
      // TODO: allow the usage of shared memory to be controlled by
      // the user, for now, default to global.
      bool UseSharedMemory =
          IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
      llvm::Value *GlobalRecordSizeArg[] = {
          llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
          CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
          OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(),
              IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack
                            : OMPRTL___kmpc_data_sharing_coalesced_push_stack),
          GlobalRecordSizeArg);
      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
          GlobalRecValue, GlobalRecPtrTy);
      I->getSecond().GlobalRecordAddr = GlobalRecValue;
      I->getSecond().IsInSPMDModeFlag = nullptr;
    }
    LValue Base =
        CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);

    // Emit the "global alloca" which is a GEP from the global declaration
    // record using the pointer returned by the runtime.
    LValue SecBase;
    decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
    if (IsTTD) {
      // The secondary record's fields are walked in lock-step with the
      // primary ones below.
      SecIt = I->getSecond().SecondaryLocalVarData->begin();
      llvm::PointerType *SecGlobalRecPtrTy =
          CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
      SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
          Bld.CreatePointerBitCastOrAddrSpaceCast(
              I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
          SecGlobalRecTy);
    }
    // Bind every globalized variable to its field in the record; escaped
    // parameters additionally copy their incoming value into the record.
    for (auto &Rec : I->getSecond().LocalVarData) {
      bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
      llvm::Value *ParValue;
      if (EscapedParam) {
        const auto *VD = cast<VarDecl>(Rec.first);
        LValue ParLVal =
            CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
        ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
      }
      LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
      // Emit VarAddr basing on lane-id if required.
      QualType VarTy;
      if (Rec.second.IsOnePerTeam) {
        VarTy = Rec.second.FD->getType();
      } else {
        // Per-lane variable: the field is an array indexed by the lane id.
        Address Addr = VarAddr.getAddress(CGF);
        llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
            Addr.getElementType(), Addr.getPointer(),
            {Bld.getInt32(0), getNVPTXLaneID(CGF)});
        VarTy =
            Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
        VarAddr = CGF.MakeAddrLValue(
            Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
            AlignmentSource::Decl);
      }
      Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
      if (!IsInTTDRegion &&
          (WithSPMDCheck ||
           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
        // Runtime-check path: select between the secondary/primary record
        // field and, in SPMD mode, a plain local temporary.
        assert(I->getSecond().IsInSPMDModeFlag &&
               "Expected unknown execution mode or required SPMD check.");
        if (IsTTD) {
          assert(SecIt->second.IsOnePerTeam &&
                 "Secondary glob data must be one per team.");
          LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
          VarAddr.setAddress(
              Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
                                       VarAddr.getPointer(CGF)),
                      VarAddr.getAlignment()));
          Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
        }
        Address GlobalPtr = Rec.second.PrivateAddr;
        Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
        Rec.second.PrivateAddr = Address(
            Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
                             LocalAddr.getPointer(), GlobalPtr.getPointer()),
            LocalAddr.getAlignment());
      }
      if (EscapedParam) {
        // Copy the parameter's incoming value into its globalized slot and
        // remap the parameter to that slot.
        const auto *VD = cast<VarDecl>(Rec.first);
        CGF.EmitStoreOfScalar(ParValue, VarAddr);
        I->getSecond().MappedParams->setVarAddr(CGF, VD,
                                                VarAddr.getAddress(CGF));
      }
      if (IsTTD)
        ++SecIt;
    }
  }
  // Variable-length escaped decls cannot live in the fixed-size record;
  // each gets its own aligned runtime stack allocation.
  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
    // Recover pointer to this function's global record. The runtime will
    // handle the specifics of the allocation of the memory.
    // Use actual memory size of the record including the padding
    // for alignment purposes.
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *Size = CGF.getTypeSize(VD->getType());
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    // Round the dynamic size up to the declaration's alignment.
    Size = Bld.CreateNUWAdd(
        Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
    llvm::Value *AlignVal =
        llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
    Size = Bld.CreateUDiv(Size, AlignVal);
    Size = Bld.CreateNUWMul(Size, AlignVal);
    // TODO: allow the usage of shared memory to be controlled by
    // the user, for now, default to global.
    llvm::Value *GlobalRecordSizeArg[] = {
        Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
    llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
        OMPBuilder.getOrCreateRuntimeFunction(
            CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
        GlobalRecordSizeArg);
    llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
    LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
                                            Base.getAddress(CGF));
    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
  }
  // Activate all the parameter remappings recorded above.
  I->getSecond().MappedParams->apply(CGF);
}
   1999 
// Emit the epilog that tears down variable globalization: restore the
// remapped parameter addresses and pop any global records that were pushed
// onto the data-sharing stack by the matching prolog.
void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
                                                 bool WithSPMDCheck) {
  // Cleanup is only needed in generic data-sharing mode or when executing
  // in SPMD mode; in all other combinations nothing was globalized.
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    // Restore the original (pre-globalization) addresses of mapped params.
    I->getSecond().MappedParams->restore(CGF);
    if (!CGF.HaveInsertPoint())
      return;
    // Pop variable-length globalized declarations in the reverse order of
    // their pushes, matching stack discipline.
    for (llvm::Value *Addr :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(
          OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
          Addr);
    }
    if (I->getSecond().GlobalRecordAddr) {
      if (!IsInTTDRegion &&
          (WithSPMDCheck ||
           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
        // Execution mode is not known statically: branch on the runtime
        // IsInSPMDModeFlag and pop the stack only on the non-SPMD path.
        CGBuilderTy &Bld = CGF.Builder;
        llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
        llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
        Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
        // There is no need to emit line number for unconditional branch.
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(NonSPMDBB);
        CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(
                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
        CGF.EmitBlock(ExitBB);
      } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
        // Inside a TTD region the record is reference-counted; release the
        // team-static memory only when the outermost region finishes.
        assert(GlobalizedRecords.back().RegionCounter > 0 &&
               "region counter must be > 0.");
        --GlobalizedRecords.back().RegionCounter;
        // Emit the restore function only in the target region.
        if (GlobalizedRecords.back().RegionCounter == 0) {
          QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
              /*DestWidth=*/16, /*Signed=*/0);
          // Reload the runtime flag recording whether the record was placed
          // in shared memory, to tell the runtime what to restore.
          llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
              Address(GlobalizedRecords.back().UseSharedMemory,
                      CGM.getContext().getTypeAlignInChars(Int16Ty)),
              /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
          llvm::Value *Args[] = {
              llvm::ConstantInt::get(
                  CGM.Int16Ty,
                  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
              IsInSharedMemory};
          CGF.EmitRuntimeCall(
              OMPBuilder.getOrCreateRuntimeFunction(
                  CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory),
              Args);
        }
      } else {
        // Statically known non-SPMD case: unconditionally pop the record.
        CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(
                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
            I->getSecond().GlobalRecordAddr);
      }
    }
  }
}
   2065 
   2066 void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
   2067                                          const OMPExecutableDirective &D,
   2068                                          SourceLocation Loc,
   2069                                          llvm::Function *OutlinedFn,
   2070                                          ArrayRef<llvm::Value *> CapturedVars) {
   2071   if (!CGF.HaveInsertPoint())
   2072     return;
   2073 
   2074   Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
   2075                                                       /*Name=*/".zero.addr");
   2076   CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
   2077   llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
   2078   OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
   2079   OutlinedFnArgs.push_back(ZeroAddr.getPointer());
   2080   OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
   2081   emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
   2082 }
   2083 
// Emit a GPU parallel region: pack the captured variables into an array of
// void* and lower the region to a __kmpc_parallel_51 runtime call.
void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
                        IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    // If a wrapper function was generated for the outlined function, pass it
    // as the region identifier; otherwise pass a null pointer.
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn) {
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
      // Remember for post-processing in worker loop.
      Work.emplace_back(WFn);
    }
    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    // TODO: Is that needed?
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    // Local buffer holding one void* slot per captured variable; its address
    // is forwarded to the runtime below.
    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        // Integer captures are smuggled through the void* slot via inttoptr;
        // everything else is (address-space) cast to a generic pointer.
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    // The if-clause condition as an i32: evaluated expression, or constant 1
    // when no if-clause is present.
    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /* isSigned */ false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    // NOTE(review): the two -1 constants presumably mean "unspecified"
    // num_threads/proc_bind for __kmpc_parallel_51 — confirm against the
    // device runtime's declaration of that entry point.
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}
   2158 
   2159 void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
   2160   // Always emit simple barriers!
   2161   if (!CGF.HaveInsertPoint())
   2162     return;
   2163   // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
   2164   // This function does not use parameters, so we can emit just default values.
   2165   llvm::Value *Args[] = {
   2166       llvm::ConstantPointerNull::get(
   2167           cast<llvm::PointerType>(getIdentTyPointerTy())),
   2168       llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
   2169   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
   2170                           CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
   2171                       Args);
   2172 }
   2173 
   2174 void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
   2175                                            SourceLocation Loc,
   2176                                            OpenMPDirectiveKind Kind, bool,
   2177                                            bool) {
   2178   // Always emit simple barriers!
   2179   if (!CGF.HaveInsertPoint())
   2180     return;
   2181   // Build call __kmpc_cancel_barrier(loc, thread_id);
   2182   unsigned Flags = getDefaultFlagsForBarriers(Kind);
   2183   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
   2184                          getThreadID(CGF, Loc)};
   2185 
   2186   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
   2187                           CGM.getModule(), OMPRTL___kmpc_barrier),
   2188                       Args);
   2189 }
   2190 
/// Emit a critical region on the GPU: the threads of the team take turns
/// executing the region body, one per loop iteration, selected by comparing
/// each thread's id against a monotonically increasing counter; warp lanes
/// reconverge via __kmpc_syncwarp between turns.
void CGOpenMPRuntimeGPU::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Get the mask of active threads in the warp.
  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
  // Fetch team-local id of the thread.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // Get the width of the team.
  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Initialize the counter variable for the loop: it selects which thread
  // gets to run the body on each iteration, starting from thread 0.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
                        /*isInit=*/true);

  // Block checks if loop counter exceeds upper bound.
  CGF.EmitBlock(LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);

  // Block tests which single thread should execute region, and which threads
  // should go straight to synchronisation point.
  CGF.EmitBlock(TestBB);
  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);

  // Block emits the body of the critical region.
  CGF.EmitBlock(BodyBB);

  // Output the critical statement.
  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
                                      Hint);

  // After the body surrounded by the critical region, the single executing
  // thread will jump to the synchronisation point.
  // Block waits for all threads in current team to finish then increments the
  // counter variable and returns to the loop.
  CGF.EmitBlock(SyncBB);
  // Reconverge active threads in the warp.
  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_syncwarp),
                            Mask);

  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
  CGF.EmitBranch(LoopBB);

  // Block that is reached when  all threads in the team complete the region.
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
   2259 
   2260 /// Cast value to the specified type.
   2261 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
   2262                                     QualType ValTy, QualType CastTy,
   2263                                     SourceLocation Loc) {
   2264   assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
   2265          "Cast type must sized.");
   2266   assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
   2267          "Val type must sized.");
   2268   llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
   2269   if (ValTy == CastTy)
   2270     return Val;
   2271   if (CGF.getContext().getTypeSizeInChars(ValTy) ==
   2272       CGF.getContext().getTypeSizeInChars(CastTy))
   2273     return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
   2274   if (CastTy->isIntegerType() && ValTy->isIntegerType())
   2275     return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
   2276                                      CastTy->hasSignedIntegerRepresentation());
   2277   Address CastItem = CGF.CreateMemTemp(CastTy);
   2278   Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
   2279       CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
   2280   CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
   2281                         LValueBaseInfo(AlignmentSource::Type),
   2282                         TBAAAccessInfo());
   2283   return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
   2284                               LValueBaseInfo(AlignmentSource::Type),
   2285                               TBAAAccessInfo());
   2286 }
   2287 
   2288 /// This function creates calls to one of two shuffle functions to copy
   2289 /// variables between lanes in a warp.
   2290 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
   2291                                                  llvm::Value *Elem,
   2292                                                  QualType ElemType,
   2293                                                  llvm::Value *Offset,
   2294                                                  SourceLocation Loc) {
   2295   CodeGenModule &CGM = CGF.CGM;
   2296   CGBuilderTy &Bld = CGF.Builder;
   2297   CGOpenMPRuntimeGPU &RT =
   2298       *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
   2299   llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
   2300 
   2301   CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
   2302   assert(Size.getQuantity() <= 8 &&
   2303          "Unsupported bitwidth in shuffle instruction.");
   2304 
   2305   RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
   2306                                   ? OMPRTL___kmpc_shuffle_int32
   2307                                   : OMPRTL___kmpc_shuffle_int64;
   2308 
   2309   // Cast all types to 32- or 64-bit values before calling shuffle routines.
   2310   QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
   2311       Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
   2312   llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
   2313   llvm::Value *WarpSize =
   2314       Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
   2315 
   2316   llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
   2317       OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
   2318       {ElemCast, Offset, WarpSize});
   2319 
   2320   return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
   2321 }
   2322 
/// Shuffle an element of arbitrary size from the lane at \p Offset and store
/// it to DestAddr: the value at SrcAddr is decomposed into 8/4/2/1-byte
/// integer chunks, each chunk is moved with createRuntimeShuffleFunction,
/// and the results are written to DestAddr chunk by chunk.
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
                            Address DestAddr, QualType ElemType,
                            llvm::Value *Offset, SourceLocation Loc) {
  CGBuilderTy &Bld = CGF.Builder;

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  // Create the loop over the big sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*) Elem + 1;
  // Step = 8;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
  Address ElemPtr = DestAddr;
  Address Ptr = SrcAddr;
  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
  // Try chunk sizes 8, 4, 2, 1 bytes; for each, consume as many full chunks
  // of the remaining Size as fit.
  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
    if (Size < CharUnits::fromQuantity(IntSize))
      continue;
    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
        /*Signed=*/1);
    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
    ElemPtr =
        Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
    if (Size.getQuantity() / IntSize > 1) {
      // More than one chunk of this size: emit a loop with PHI nodes for the
      // advancing source and destination pointers.
      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
      CGF.EmitBlock(PreCondBB);
      llvm::PHINode *PhiSrc =
          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
      llvm::PHINode *PhiDest =
          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
      Ptr = Address(PhiSrc, Ptr.getAlignment());
      ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
      // Loop condition: continue while at least IntSize bytes remain before
      // PtrEnd.
      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
          PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
                                   Ptr.getPointer(), CGF.VoidPtrTy));
      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
                       ThenBB, ExitBB);
      CGF.EmitBlock(ThenBB);
      // Shuffle one chunk and store it to the destination.
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
      CGF.EmitBranch(PreCondBB);
      CGF.EmitBlock(ExitBB);
    } else {
      // Exactly one chunk of this size: shuffle and store it, then advance
      // both pointers past the chunk.
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Ptr = Bld.CreateConstGEP(Ptr, 1);
      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
    }
    // Bytes still left after consuming full IntSize chunks.
    Size = Size % IntSize;
  }
}
   2404 
namespace {
/// Direction of a Reduce-list copy performed by emitReductionListCopy.
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};
} // namespace
   2419 
/// Optional parameters for emitReductionListCopy; fields are null when the
/// selected CopyAction does not use them.
// NOTE(review): this struct has external linkage — it sits after the closing
// brace of the anonymous namespace above; presumably unintentional, confirm.
struct CopyOptionsTy {
  // Lane offset used for RemoteLaneToThread shuffles.
  llvm::Value *RemoteLaneOffset;
  // Element index into the scratchpad for the scratchpad copy actions.
  llvm::Value *ScratchpadIndex;
  // Scratchpad width, multiplied by the element size when advancing the
  // scratchpad base pointer.
  llvm::Value *ScratchpadWidth;
};
   2425 
/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
///
/// \param Action      Which of the four copy directions to perform.
/// \param Privates    The reduction variables; one Reduce-list entry each.
/// \param SrcBase     Base address of the source (Reduce list or scratchpad).
/// \param DestBase    Base address of the destination.
/// \param CopyOptions Remote lane offset / scratchpad index and width, as
///                    required by the chosen \p Action.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {

  CodeGenModule &CGM = CGF.CGM;
  ASTContext &C = CGM.getContext();
  CGBuilderTy &Bld = CGF.Builder;

  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;

  // Iterates, element-by-element, through the source Reduce list and
  // make a copy.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
  for (const Expr *Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;
    // Increment the src or dest pointer to the scratchpad, for each
    // new element.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;

    // Step 1: compute source and destination element addresses for this
    // Private, depending on the copy direction.
    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element.  The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr = CGF.EmitLoadOfPointer(
          DestElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());
      break;
    }
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element:
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                                C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadDest = true;
      break;
    }
    case ScratchpadToThread: {
      // Step 1.1: Get the address for the src element in the scratchpad.
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
      break;
    }
    }

    // Regardless of src and dest of copy, we emit the load of src
    // element as this is required in all directions
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
                                               SrcElementAddr.getElementType());

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
                      RemoteLaneOffset, Private->getExprLoc());
    } else {
      // Plain copy, dispatched on the evaluation kind of the private type.
      switch (CGF.getEvaluationKind(Private->getType())) {
      case TEK_Scalar: {
        llvm::Value *Elem = CGF.EmitLoadOfScalar(
            SrcElementAddr, /*Volatile=*/false, Private->getType(),
            Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
            TBAAAccessInfo());
        // Store the source element value to the dest element address.
        CGF.EmitStoreOfScalar(
            Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
            LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
        break;
      }
      case TEK_Complex: {
        CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getExprLoc());
        CGF.EmitStoreOfComplex(
            Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            /*isInit=*/false);
        break;
      }
      case TEK_Aggregate:
        CGF.EmitAggregateCopy(
            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getType(), AggValueSlot::DoesNotOverlap);
        break;
      }
    }

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element.  The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
    // address of the next element in scratchpad memory, unless we're currently
    // processing the last one.  Memory alignment is also taken care of here.
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr,
          Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));

      // Take care of global memory alignment for performance
      // (round the base pointer up to the next GlobalMemoryAlignment
      // boundary: subtract 1, divide, add 1, multiply back).
      ScratchpadBasePtr = Bld.CreateNUWSub(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateUDiv(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateNUWMul(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));

      if (IncrScratchpadDest)
        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
    }

    ++Idx;
  }
}
   2623 
/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
/// void inter_warp_copy_func(void* reduce_data, num_warps)
///   shared smem[warp_size];
///   For all data entries D in reduce_data:
///     sync
///     If (I am the first lane in each warp)
///       Copy my local D to smem[warp_id]
///     sync
///     if (I am the first warp)
///       Copy smem[thread_id] to my local D
///
/// \param Privates          The private copies of the reduction variables; one
///                          transfer round (or several, for large elements) is
///                          emitted per entry.
/// \param ReductionArrayTy  Type of the array of void* pointers that makes up
///                          a thread's Reduce list.
/// \param Loc               Source location used for debug info and for the
///                          runtime calls emitted inside the helper.
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy,
                                              SourceLocation Loc) {
  ASTContext &C = CGM.getContext();
  llvm::Module &M = CGM.getModule();

  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // NumWarps: number of warps active in the parallel region.  This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                C.getIntTypeForBitwidth(32, /* Signed */ true),
                                ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);

  // Build the helper as an internal function of the module:
  // void _omp_reduction_inter_warp_copy_func(void *reduce_data, int num_warps)
  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
                                    llvm::GlobalValue::InternalLinkage,
                                    "_omp_reduction_inter_warp_copy_func", &M);
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps).  The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions.  The array is declared with common linkage so
  // as to be shared across compilation units.
  StringRef TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
  if (!TransferMedium) {
    // One i32 slot per warp; created on first use and re-used by every
    // inter-warp copy helper emitted into this module.
    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
        llvm::UndefValue::get(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
    CGM.addCompilerUsedGlobal(TransferMedium);
  }

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  llvm::Value *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
  llvm::Value *WarpID = getNVPTXWarpID(CGF);

  // Reinterpret the incoming void* argument as a pointer to the Reduce list
  // (an array of void* element pointers).
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(
              AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
              LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    //
    // Warp master copies reduce element to transfer medium in __shared__
    // memory.
    //
    unsigned RealTySize =
        C.getTypeSizeInChars(Private->getType())
            .alignTo(C.getTypeAlignInChars(Private->getType()))
            .getQuantity();
    // The transfer medium slots are i32, so elements are moved in chunks of
    // at most 4 bytes: first as many 4-byte pieces as fit, then a 2-byte
    // piece, then a 1-byte piece, until RealTySize bytes have been copied.
    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
      unsigned NumIters = RealTySize / TySize;
      if (NumIters == 0)
        continue;
      QualType CType = C.getIntTypeForBitwidth(
          C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
      llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
      CharUnits Align = CharUnits::fromQuantity(TySize);
      llvm::Value *Cnt = nullptr;
      Address CntAddr = Address::invalid();
      llvm::BasicBlock *PrecondBB = nullptr;
      llvm::BasicBlock *ExitBB = nullptr;
      if (NumIters > 1) {
        // More than one chunk of this size: emit a counted loop
        // (precond/body/exit) with Cnt as the induction variable; the exit
        // block is emitted after the loop body below.
        CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
        CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
                              /*Volatile=*/false, C.IntTy);
        PrecondBB = CGF.createBasicBlock("precond");
        ExitBB = CGF.createBasicBlock("exit");
        llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
        // There is no need to emit line number for unconditional branch.
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(PrecondBB);
        Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
        llvm::Value *Cmp =
            Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
        Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
        CGF.EmitBlock(BodyBB);
      }
      // kmpc_barrier.
      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
                                             /*EmitChecks=*/false,
                                             /*ForceSimpleCall=*/true);
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

      // if (lane_id == 0)
      llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
      Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
      CGF.EmitBlock(ThenBB);

      // Reduce element = LocalReduceList[i]
      Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
      llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
          ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      // elemptr = ((CopyType*)(elemptrptr)) + I
      Address ElemPtr = Address(ElemPtrPtr, Align);
      ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
      if (NumIters > 1) {
        ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
                          ElemPtr.getAlignment());
      }

      // Get pointer to location in transfer medium.
      // MediumPtr = &medium[warp_id]
      llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
          TransferMedium->getValueType(), TransferMedium,
          {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
      Address MediumPtr(MediumPtrVal, Align);
      // Casting to actual data type.
      // MediumPtr = (CopyType*)MediumPtrAddr;
      MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);

      // elem = *elemptr
      //*MediumPtr = elem
      llvm::Value *Elem = CGF.EmitLoadOfScalar(
          ElemPtr, /*Volatile=*/false, CType, Loc,
          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
      // Store the source element value to the dest element address.  The
      // store into the shared medium is emitted as volatile (the matching
      // load below is volatile too).
      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());

      Bld.CreateBr(MergeBB);

      CGF.EmitBlock(ElseBB);
      Bld.CreateBr(MergeBB);

      CGF.EmitBlock(MergeBB);

      // kmpc_barrier.
      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
                                             /*EmitChecks=*/false,
                                             /*ForceSimpleCall=*/true);

      //
      // Warp 0 copies reduce element from transfer medium.
      //
      llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
      llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
      llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

      Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
      llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
          AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);

      // Up to 32 threads in warp 0 are active.
      // if (thread_id < num_warps): one lane of warp 0 per source warp.
      llvm::Value *IsActiveThread =
          Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
      Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

      CGF.EmitBlock(W0ThenBB);

      // SrcMediumPtr = &medium[tid]
      llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
          TransferMedium->getValueType(), TransferMedium,
          {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
      Address SrcMediumPtr(SrcMediumPtrVal, Align);
      // SrcMediumVal = *SrcMediumPtr;
      SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);

      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
      Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
      llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
          TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
      Address TargetElemPtr = Address(TargetElemPtrVal, Align);
      TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
      if (NumIters > 1) {
        TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
                                TargetElemPtr.getAlignment());
      }

      // *TargetElemPtr = SrcMediumVal;
      llvm::Value *SrcMediumValue =
          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                            CType);
      Bld.CreateBr(W0MergeBB);

      CGF.EmitBlock(W0ElseBB);
      Bld.CreateBr(W0MergeBB);

      CGF.EmitBlock(W0MergeBB);

      if (NumIters > 1) {
        // Increment the induction variable and close the counted loop opened
        // above; ExitBB becomes the insertion point for the next chunk size.
        Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
        CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
        CGF.EmitBranch(PrecondBB);
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(ExitBB);
      }
      // Remaining bytes to be handled by the smaller chunk sizes.
      RealTySize %= TySize;
    }
    ++Idx;
  }

  CGF.FinishFunction();
  return Fn;
}
   2868 
/// Emit a helper that reduces data across two OpenMP threads (lanes)
/// in the same warp.  It uses shuffle instructions to copy over data from
/// a remote lane's stack.  The reduction algorithm performed is specified
/// by the fourth parameter.
///
/// Algorithm Versions.
/// Full Warp Reduce (argument value 0):
///   This algorithm assumes that all 32 lanes are active and gathers
///   data from these 32 lanes, producing a single resultant value.
/// Contiguous Partial Warp Reduce (argument value 1):
///   This algorithm assumes that only a *contiguous* subset of lanes
///   are active.  This happens for the last warp in a parallel region
///   when the user specified num_threads is not an integer multiple of
///   32.  This contiguous subset always starts with the zeroth lane.
/// Partial Warp Reduce (argument value 2):
///   This algorithm gathers data from any number of lanes at any position.
/// All reduced values are stored in the lowest possible lane.  The set
/// of problems every algorithm addresses is a super set of those
/// addressable by algorithms with a lower version number.  Overhead
/// increases as algorithm version increases.
///
/// Terminology
/// Reduce element:
///   Reduce element refers to the individual data field with primitive
///   data types to be combined and reduced across threads.
/// Reduce list:
///   Reduce list refers to a collection of local, thread-private
///   reduce elements.
/// Remote Reduce list:
///   Remote Reduce list refers to a collection of remote (relative to
///   the current thread) reduce elements.
///
/// We distinguish between three states of threads that are important to
/// the implementation of this function.
/// Alive threads:
///   Threads in a warp executing the SIMT instruction, as distinguished from
///   threads that are inactive due to divergent control flow.
/// Active threads:
///   The minimal set of threads that has to be alive upon entry to this
///   function.  The computation is correct iff active threads are alive.
///   Some threads are alive but they are not active because they do not
///   contribute to the computation in any useful manner.  Turning them off
///   may introduce control flow overheads without any tangible benefits.
/// Effective threads:
///   In order to comply with the argument requirements of the shuffle
///   function, we must keep all lanes holding data alive.  But at most
///   half of them perform value aggregation; we refer to this half of
///   threads as effective. The other half is simply handing off their
///   data.
///
/// Procedure
/// Value shuffle:
///   In this step active threads transfer data from higher lane positions
///   in the warp to lower lane positions, creating Remote Reduce list.
/// Value aggregation:
///   In this step, effective threads combine their thread local Reduce list
///   with Remote Reduce list and store the result in the thread local
///   Reduce list.
/// Value copy:
///   In this step, we deal with the assumption made by algorithm 2
///   (i.e. contiguity assumption).  When we have an odd number of lanes
///   active, say 2k+1, only k threads will be effective and therefore k
///   new values will be produced.  However, the Reduce list owned by the
///   (2k+1)th thread is ignored in the value aggregation.  Therefore
///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
///   that the contiguity assumption still holds.
///
/// The emitted helper has the signature
///   void shuffle_and_reduce(void *reduce_list, short lane_id,
///                           short remote_lane_offset, short algo_version)
/// and \p ReduceFn is the outlined user reduction function that combines two
/// Reduce lists.
static llvm::Function *emitShuffleAndReduceFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
  ASTContext &C = CGM.getContext();

  // Thread local Reduce list used to host the values of data to be reduced.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // Current lane id; could be logical.
  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
                              ImplicitParamDecl::Other);
  // Offset of the remote source lane relative to the current lane.
  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                        C.ShortTy, ImplicitParamDecl::Other);
  // Algorithm version.  This is expected to be known at compile time.
  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                               C.ShortTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&LaneIDArg);
  Args.push_back(&RemoteLaneOffsetArg);
  Args.push_back(&AlgoVerArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // Reinterpret the incoming void* argument as a pointer to the Reduce list.
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  // Load the three scalar arguments used in the predicates below.
  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  // Create a local thread-private variable to host the Reduce list
  // from a remote lane.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");

  // This loop iterates through the list of reduce elements and copies,
  // element by element, from a remote lane in the warp to RemoteReduceList,
  // hosted on the thread's stack.
  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
                        LocalReduceList, RemoteReduceList,
                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
                         /*ScratchpadIndex=*/nullptr,
                         /*ScratchpadWidth=*/nullptr});

  // The actions to be performed on the Remote Reduce list is dependent
  // on the algorithm version.
  //
  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
  //  LaneId % 2 == 0 && Offset > 0):
  //    do the reduction value aggregation
  //
  //  The thread local variable Reduce list is mutated in place to host the
  //  reduced data, which is the aggregated value produced from local and
  //  remote lanes.
  //
  //  Note that AlgoVer is expected to be a constant integer known at compile
  //  time.
  //  When AlgoVer==0, the first conjunction evaluates to true, making
  //    the entire predicate true during compile time.
  //  When AlgoVer==1, the second conjunction has only the second part to be
  //    evaluated during runtime.  Other conjunctions evaluates to false
  //    during compile time.
  //  When AlgoVer==2, the third conjunction has only the second part to be
  //    evaluated during runtime.  Other conjunctions evaluates to false
  //    during compile time.
  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);

  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  llvm::Value *CondAlgo1 = Bld.CreateAnd(
      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));

  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
  // LaneId % 2 == 0 is computed as (LaneId & 1) == 0.
  llvm::Value *CondAlgo2 = Bld.CreateAnd(
      Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
  CondAlgo2 = Bld.CreateAnd(
      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));

  // Aggregate if any of the three algorithm-specific conditions holds.
  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      LocalReduceList.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
  // Reduce list.  This is the "Value copy" step that restores the contiguity
  // assumption for the next round of algorithm 1.
  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  llvm::Value *CondCopy = Bld.CreateAnd(
      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));

  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);

  CGF.EmitBlock(CpyThenBB);
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, LocalReduceList);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyElseBB);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyMergeBB);

  CGF.FinishFunction();
  return Fn;
}
   3085 
/// This function emits a helper that copies all the reduction variables from
/// the team into the provided global buffer for the reduction variables.
///
/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
///   For all data entries D in reduce_data:
///     Copy local D to buffer.D[Idx]
///
/// \param TeamReductionRec  Record whose fields describe the per-variable
///                          slots of the global reduction buffer.
/// \param VarFieldMap       Maps each reduction variable to its field in
///                          \p TeamReductionRec.
static llvm::Value *emitListToGlobalCopyFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, SourceLocation Loc,
    const RecordDecl *TeamReductionRec,
    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &VarFieldMap) {
  ASTContext &C = CGM.getContext();

  // Buffer: global reduction buffer.
  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                              C.VoidPtrTy, ImplicitParamDecl::Other);
  // Idx: index of the buffer.
  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
                           ImplicitParamDecl::Other);
  // ReduceList: thread local Reduce list.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&BufferArg);
  Args.push_back(&IdxArg);
  Args.push_back(&ReduceListArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // Reinterpret the incoming reduce_data argument as a pointer to the Reduce
  // list (an array of void* element pointers).
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, Loc),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());
  // View the void* buffer argument as a pointer to the team-reduction record.
  QualType StaticTy = C.getRecordType(TeamReductionRec);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
      LLVMReductionsBufferTy->getPointerTo());
  // GEP indices {0, Idx}: step through the field pointer, then select the
  // Idx-th slot within the field.
  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
                                              /*Volatile=*/false, C.IntTy,
                                              Loc)};
  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    // Reduce element = LocalReduceList[i]
    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    // elemptr = ((CopyType*)(elemptrptr)) + I
    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
    Address ElemPtr =
        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
    // Global = Buffer.VD[Idx];
    const FieldDecl *FD = VarFieldMap.lookup(VD);
    LValue GlobLVal = CGF.EmitLValueForField(
        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
    Address GlobAddr = GlobLVal.getAddress(CGF);
    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
        GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
    GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
    // Emit the copy with the strategy appropriate for the element's type.
    switch (CGF.getEvaluationKind(Private->getType())) {
    case TEK_Scalar: {
      // Plain load/store for scalar values.
      llvm::Value *V = CGF.EmitLoadOfScalar(
          ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
      CGF.EmitStoreOfScalar(V, GlobLVal);
      break;
    }
    case TEK_Complex: {
      // Copy real and imaginary parts as a pair.
      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
          CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
      CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
      break;
    }
    case TEK_Aggregate:
      // Structs/arrays: memberwise aggregate copy.
      CGF.EmitAggregateCopy(GlobLVal,
                            CGF.MakeAddrLValue(ElemPtr, Private->getType()),
                            Private->getType(), AggValueSlot::DoesNotOverlap);
      break;
    }
    ++Idx;
  }

  CGF.FinishFunction();
  return Fn;
}
   3190 
   3191 /// This function emits a helper that reduces all the reduction variables from
   3192 /// the team into the provided global buffer for the reduction variables.
   3193 ///
   3194 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
   3195 ///  void *GlobPtrs[];
   3196 ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
   3197 ///  ...
   3198 ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
   3199 ///  reduce_function(GlobPtrs, reduce_data);
static llvm::Value *emitListToGlobalReduceFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, SourceLocation Loc,
    const RecordDecl *TeamReductionRec,
    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &VarFieldMap,
    llvm::Function *ReduceFn) {
  ASTContext &C = CGM.getContext();

  // Build the three implicit parameters of the generated helper:
  // void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data).
  // Buffer: global reduction buffer.
  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                              C.VoidPtrTy, ImplicitParamDecl::Other);
  // Idx: index of the buffer.
  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
                           ImplicitParamDecl::Other);
  // ReduceList: thread local Reduce list.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&BufferArg);
  Args.push_back(&IdxArg);
  Args.push_back(&ReduceListArg);

  // Create the internal-linkage helper function and start emitting its body.
  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // Load the incoming void* buffer argument and reinterpret it as a pointer
  // to the record type that holds the globalized team-reduction variables.
  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
  QualType StaticTy = C.getRecordType(TeamReductionRec);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
      LLVMReductionsBufferTy->getPointerTo());

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  // GEP indices {0, Idx}: element Idx of each per-variable buffer array.
  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
                                              /*Volatile=*/false, C.IntTy,
                                              Loc)};
  unsigned Idx = 0;
  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
    // Global = Buffer.VD[Idx];
    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
    const FieldDecl *FD = VarFieldMap.lookup(VD);
    LValue GlobLVal = CGF.EmitLValueForField(
        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
    Address GlobAddr = GlobLVal.getAddress(CGF);
    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
        GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
    // Store the element address as void* into the RedList slot.
    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.  VLA reductions occupy two consecutive RedList
      // slots: the data pointer followed by the element count (as void*).
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // Call reduce_function(GlobalReduceList, ReduceList)
  // Note: the global list is the FIRST argument here (accumulate into the
  // global buffer); the mirror helper global_to_list_reduce swaps the order.
  llvm::Value *GlobalReduceList =
      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
  CGF.FinishFunction();
  return Fn;
}
   3290 
/// This function emits a helper that copies all the reduction variables from
/// the provided global buffer into the team's local reduction variables
/// (i.e. the reverse direction of list_to_global_copy_func).
///
/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
///   For all data entries D in reduce_data:
///     Copy buffer.D[Idx] to local D;
static llvm::Value *emitGlobalToListCopyFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, SourceLocation Loc,
    const RecordDecl *TeamReductionRec,
    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &VarFieldMap) {
  ASTContext &C = CGM.getContext();

  // Build the three implicit parameters of the generated helper:
  // void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data).
  // Buffer: global reduction buffer.
  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                              C.VoidPtrTy, ImplicitParamDecl::Other);
  // Idx: index of the buffer.
  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
                           ImplicitParamDecl::Other);
  // ReduceList: thread local Reduce list.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&BufferArg);
  Args.push_back(&IdxArg);
  Args.push_back(&ReduceListArg);

  // Create the internal-linkage helper function and start emitting its body.
  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // Reinterpret the incoming reduce_data void* as a pointer to the
  // void*[<n>] reduction array holding the thread-local element pointers.
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, Loc),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());
  // Reinterpret the void* buffer argument as a pointer to the record that
  // holds the globalized team-reduction variables.
  QualType StaticTy = C.getRecordType(TeamReductionRec);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
      LLVMReductionsBufferTy->getPointerTo());

  // GEP indices {0, Idx}: element Idx of each per-variable buffer array.
  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
                                              /*Volatile=*/false, C.IntTy,
                                              Loc)};
  // Copy buffer.D[Idx] into the local copy of each reduction variable D.
  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    // Reduce element = LocalReduceList[i]
    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    // elemptr = ((CopyType*)(elemptrptr)) + I
    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
    Address ElemPtr =
        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
    // Global = Buffer.VD[Idx];
    const FieldDecl *FD = VarFieldMap.lookup(VD);
    LValue GlobLVal = CGF.EmitLValueForField(
        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
    Address GlobAddr = GlobLVal.getAddress(CGF);
    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
        GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
    GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
    // Emit the load/store pair appropriate for the element's evaluation kind.
    switch (CGF.getEvaluationKind(Private->getType())) {
    case TEK_Scalar: {
      llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
      CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      break;
    }
    case TEK_Complex: {
      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
      CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
                             /*isInit=*/false);
      break;
    }
    case TEK_Aggregate:
      CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
                            GlobLVal, Private->getType(),
                            AggValueSlot::DoesNotOverlap);
      break;
    }
    ++Idx;
  }

  CGF.FinishFunction();
  return Fn;
}
   3396 
   3397 /// This function emits a helper that reduces all the reduction variables from
   3398 /// the team into the provided global buffer for the reduction variables.
   3399 ///
   3400 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
   3401 ///  void *GlobPtrs[];
   3402 ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
   3403 ///  ...
   3404 ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
   3405 ///  reduce_function(reduce_data, GlobPtrs);
static llvm::Value *emitGlobalToListReduceFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, SourceLocation Loc,
    const RecordDecl *TeamReductionRec,
    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &VarFieldMap,
    llvm::Function *ReduceFn) {
  ASTContext &C = CGM.getContext();

  // Build the three implicit parameters of the generated helper:
  // void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data).
  // Buffer: global reduction buffer.
  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                              C.VoidPtrTy, ImplicitParamDecl::Other);
  // Idx: index of the buffer.
  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
                           ImplicitParamDecl::Other);
  // ReduceList: thread local Reduce list.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&BufferArg);
  Args.push_back(&IdxArg);
  Args.push_back(&ReduceListArg);

  // Create the internal-linkage helper function and start emitting its body.
  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // Reinterpret the void* buffer argument as a pointer to the record that
  // holds the globalized team-reduction variables.
  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
  QualType StaticTy = C.getRecordType(TeamReductionRec);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
      LLVMReductionsBufferTy->getPointerTo());

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  // GEP indices {0, Idx}: element Idx of each per-variable buffer array.
  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
                                              /*Volatile=*/false, C.IntTy,
                                              Loc)};
  unsigned Idx = 0;
  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
    // Global = Buffer.VD[Idx];
    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
    const FieldDecl *FD = VarFieldMap.lookup(VD);
    LValue GlobLVal = CGF.EmitLValueForField(
        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
    Address GlobAddr = GlobLVal.getAddress(CGF);
    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
        GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
    // Store the element address as void* into the RedList slot.
    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.  VLA reductions occupy two consecutive RedList
      // slots: the data pointer followed by the element count (as void*).
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // Call reduce_function(ReduceList, GlobalReduceList)
  // Note: the thread-local list is the FIRST argument here (accumulate into
  // the local copy); the mirror helper list_to_global_reduce swaps the order.
  llvm::Value *GlobalReduceList =
      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
  CGF.FinishFunction();
  return Fn;
}
   3496 
   3497 ///
   3498 /// Design of OpenMP reductions on the GPU
   3499 ///
   3500 /// Consider a typical OpenMP program with one or more reduction
   3501 /// clauses:
   3502 ///
   3503 /// float foo;
   3504 /// double bar;
   3505 /// #pragma omp target teams distribute parallel for \
   3506 ///             reduction(+:foo) reduction(*:bar)
   3507 /// for (int i = 0; i < N; i++) {
   3508 ///   foo += A[i]; bar *= B[i];
   3509 /// }
   3510 ///
   3511 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
   3512 /// all teams.  In our OpenMP implementation on the NVPTX device an
   3513 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
   3514 /// within a team are mapped to CUDA threads within a threadblock.
   3515 /// Our goal is to efficiently aggregate values across all OpenMP
   3516 /// threads such that:
   3517 ///
   3518 ///   - the compiler and runtime are logically concise, and
   3519 ///   - the reduction is performed efficiently in a hierarchical
   3520 ///     manner as follows: within OpenMP threads in the same warp,
   3521 ///     across warps in a threadblock, and finally across teams on
   3522 ///     the NVPTX device.
   3523 ///
   3524 /// Introduction to Decoupling
   3525 ///
   3526 /// We would like to decouple the compiler and the runtime so that the
   3527 /// latter is ignorant of the reduction variables (number, data types)
   3528 /// and the reduction operators.  This allows a simpler interface
   3529 /// and implementation while still attaining good performance.
   3530 ///
   3531 /// Pseudocode for the aforementioned OpenMP program generated by the
   3532 /// compiler is as follows:
   3533 ///
   3534 /// 1. Create private copies of reduction variables on each OpenMP
   3535 ///    thread: 'foo_private', 'bar_private'
   3536 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
   3537 ///    to it and writes the result in 'foo_private' and 'bar_private'
   3538 ///    respectively.
   3539 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
   3540 ///    and store the result on the team master:
   3541 ///
   3542 ///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
   3543 ///        reduceData, shuffleReduceFn, interWarpCpyFn)
   3544 ///
   3545 ///     where:
   3546 ///       struct ReduceData {
   3547 ///         double *foo;
   3548 ///         double *bar;
   3549 ///       } reduceData
   3550 ///       reduceData.foo = &foo_private
   3551 ///       reduceData.bar = &bar_private
   3552 ///
   3553 ///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
   3554 ///     auxiliary functions generated by the compiler that operate on
   3555 ///     variables of type 'ReduceData'.  They aid the runtime perform
   3556 ///     algorithmic steps in a data agnostic manner.
   3557 ///
   3558 ///     'shuffleReduceFn' is a pointer to a function that reduces data
   3559 ///     of type 'ReduceData' across two OpenMP threads (lanes) in the
   3560 ///     same warp.  It takes the following arguments as input:
   3561 ///
   3562 ///     a. variable of type 'ReduceData' on the calling lane,
   3563 ///     b. its lane_id,
   3564 ///     c. an offset relative to the current lane_id to generate a
   3565 ///        remote_lane_id.  The remote lane contains the second
   3566 ///        variable of type 'ReduceData' that is to be reduced.
   3567 ///     d. an algorithm version parameter determining which reduction
   3568 ///        algorithm to use.
   3569 ///
   3570 ///     'shuffleReduceFn' retrieves data from the remote lane using
   3571 ///     efficient GPU shuffle intrinsics and reduces, using the
   3572 ///     algorithm specified by the 4th parameter, the two operands
   3573 ///     element-wise.  The result is written to the first operand.
   3574 ///
   3575 ///     Different reduction algorithms are implemented in different
   3576 ///     runtime functions, all calling 'shuffleReduceFn' to perform
   3577 ///     the essential reduction step.  Therefore, based on the 4th
   3578 ///     parameter, this function behaves slightly differently to
   3579 ///     cooperate with the runtime to ensure correctness under
   3580 ///     different circumstances.
   3581 ///
   3582 ///     'InterWarpCpyFn' is a pointer to a function that transfers
   3583 ///     reduced variables across warps.  It tunnels, through CUDA
   3584 ///     shared memory, the thread-private data of type 'ReduceData'
   3585 ///     from lane 0 of each warp to a lane in the first warp.
   3586 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
   3587 ///    The last team writes the global reduced value to memory.
   3588 ///
   3589 ///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
   3590 ///             reduceData, shuffleReduceFn, interWarpCpyFn,
   3591 ///             scratchpadCopyFn, loadAndReduceFn)
   3592 ///
   3593 ///     'scratchpadCopyFn' is a helper that stores reduced
   3594 ///     data from the team master to a scratchpad array in
   3595 ///     global memory.
   3596 ///
   3597 ///     'loadAndReduceFn' is a helper that loads data from
   3598 ///     the scratchpad array and reduces it with the input
   3599 ///     operand.
   3600 ///
   3601 ///     These compiler generated functions hide address
   3602 ///     calculation and alignment information from the runtime.
   3603 /// 5. if ret == 1:
   3604 ///     The team master of the last team stores the reduced
   3605 ///     result to the globals in memory.
   3606 ///     foo += reduceData.foo; bar *= reduceData.bar
   3607 ///
   3608 ///
   3609 /// Warp Reduction Algorithms
   3610 ///
   3611 /// On the warp level, we have three algorithms implemented in the
   3612 /// OpenMP runtime depending on the number of active lanes:
   3613 ///
   3614 /// Full Warp Reduction
   3615 ///
   3616 /// The reduce algorithm within a warp where all lanes are active
   3617 /// is implemented in the runtime as follows:
   3618 ///
   3619 /// full_warp_reduce(void *reduce_data,
   3620 ///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
   3621 ///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
   3622 ///     ShuffleReduceFn(reduce_data, 0, offset, 0);
   3623 /// }
   3624 ///
   3625 /// The algorithm completes in log(2, WARPSIZE) steps.
   3626 ///
   3627 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
   3628 /// not used therefore we save instructions by not retrieving lane_id
   3629 /// from the corresponding special registers.  The 4th parameter, which
   3630 /// represents the version of the algorithm being used, is set to 0 to
   3631 /// signify full warp reduction.
   3632 ///
   3633 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
   3634 ///
   3635 /// #reduce_elem refers to an element in the local lane's data structure
   3636 /// #remote_elem is retrieved from a remote lane
   3637 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
   3638 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
   3639 ///
   3640 /// Contiguous Partial Warp Reduction
   3641 ///
   3642 /// This reduce algorithm is used within a warp where only the first
   3643 /// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
   3644 /// number of OpenMP threads in a parallel region is not a multiple of
   3645 /// WARPSIZE.  The algorithm is implemented in the runtime as follows:
   3646 ///
   3647 /// void
   3648 /// contiguous_partial_reduce(void *reduce_data,
   3649 ///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
   3650 ///                           int size, int lane_id) {
   3651 ///   int curr_size;
   3652 ///   int offset;
   3653 ///   curr_size = size;
///   offset = curr_size/2;
///   while (offset > 0) {
   3656 ///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
   3657 ///     curr_size = (curr_size+1)/2;
   3658 ///     offset = curr_size/2;
   3659 ///   }
   3660 /// }
   3661 ///
   3662 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
   3663 ///
   3664 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
   3665 /// if (lane_id < offset)
   3666 ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
   3667 /// else
   3668 ///     reduce_elem = remote_elem
   3669 ///
   3670 /// This algorithm assumes that the data to be reduced are located in a
   3671 /// contiguous subset of lanes starting from the first.  When there is
   3672 /// an odd number of active lanes, the data in the last lane is not
   3673 /// aggregated with any other lane's dat but is instead copied over.
   3674 ///
   3675 /// Dispersed Partial Warp Reduction
   3676 ///
   3677 /// This algorithm is used within a warp when any discontiguous subset of
   3678 /// lanes are active.  It is used to implement the reduction operation
   3679 /// across lanes in an OpenMP simd region or in a nested parallel region.
   3680 ///
   3681 /// void
   3682 /// dispersed_partial_reduce(void *reduce_data,
   3683 ///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
   3684 ///   int size, remote_id;
   3685 ///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
   3686 ///   do {
   3687 ///       remote_id = next_active_lane_id_right_after_me();
///       # the above function returns 0 if no active lane
///       # is present right after the current lane.
   3690 ///       size = number_of_active_lanes_in_this_warp();
   3691 ///       logical_lane_id /= 2;
   3692 ///       ShuffleReduceFn(reduce_data, logical_lane_id,
   3693 ///                       remote_id-1-threadIdx.x, 2);
   3694 ///   } while (logical_lane_id % 2 == 0 && size > 1);
   3695 /// }
   3696 ///
   3697 /// There is no assumption made about the initial state of the reduction.
   3698 /// Any number of lanes (>=1) could be active at any position.  The reduction
   3699 /// result is returned in the first active lane.
   3700 ///
   3701 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
   3702 ///
   3703 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
   3704 /// if (lane_id % 2 == 0 && offset > 0)
   3705 ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
   3706 /// else
   3707 ///     reduce_elem = remote_elem
   3708 ///
   3709 ///
   3710 /// Intra-Team Reduction
   3711 ///
   3712 /// This function, as implemented in the runtime call
   3713 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
   3714 /// threads in a team.  It first reduces within a warp using the
   3715 /// aforementioned algorithms.  We then proceed to gather all such
   3716 /// reduced values at the first warp.
   3717 ///
   3718 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
   3719 /// data from each of the "warp master" (zeroth lane of each warp, where
   3720 /// warp-reduced data is held) to the zeroth warp.  This step reduces (in
   3721 /// a mathematical sense) the problem of reduction across warp masters in
   3722 /// a block to the problem of warp reduction.
   3723 ///
   3724 ///
   3725 /// Inter-Team Reduction
   3726 ///
   3727 /// Once a team has reduced its data to a single value, it is stored in
   3728 /// a global scratchpad array.  Since each team has a distinct slot, this
   3729 /// can be done without locking.
   3730 ///
   3731 /// The last team to write to the scratchpad array proceeds to reduce the
   3732 /// scratchpad array.  One or more workers in the last team use the helper
   3733 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
   3734 /// the k'th worker reduces every k'th element.
   3735 ///
   3736 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
   3737 /// reduce across workers and compute a globally reduced value.
   3738 ///
   3739 void CGOpenMPRuntimeGPU::emitReduction(
   3740     CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
   3741     ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
   3742     ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
   3743   if (!CGF.HaveInsertPoint())
   3744     return;
   3745 
   3746   bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
   3747 #ifndef NDEBUG
   3748   bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
   3749 #endif
   3750 
   3751   if (Options.SimpleReduction) {
   3752     assert(!TeamsReduction && !ParallelReduction &&
   3753            "Invalid reduction selection in emitReduction.");
   3754     CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
   3755                                    ReductionOps, Options);
   3756     return;
   3757   }
   3758 
   3759   assert((TeamsReduction || ParallelReduction) &&
   3760          "Invalid reduction selection in emitReduction.");
   3761 
   3762   // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
   3763   // RedList, shuffle_reduce_func, interwarp_copy_func);
   3764   // or
   3765   // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
   3766   llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
   3767   llvm::Value *ThreadId = getThreadID(CGF, Loc);
   3768 
   3769   llvm::Value *Res;
   3770   ASTContext &C = CGM.getContext();
   3771   // 1. Build a list of reduction variables.
   3772   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
   3773   auto Size = RHSExprs.size();
   3774   for (const Expr *E : Privates) {
   3775     if (E->getType()->isVariablyModifiedType())
   3776       // Reserve place for array size.
   3777       ++Size;
   3778   }
   3779   llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
   3780   QualType ReductionArrayTy =
   3781       C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
   3782                              /*IndexTypeQuals=*/0);
   3783   Address ReductionList =
   3784       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
   3785   auto IPriv = Privates.begin();
   3786   unsigned Idx = 0;
   3787   for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
   3788     Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
   3789     CGF.Builder.CreateStore(
   3790         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
   3791             CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
   3792         Elem);
   3793     if ((*IPriv)->getType()->isVariablyModifiedType()) {
   3794       // Store array size.
   3795       ++Idx;
   3796       Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
   3797       llvm::Value *Size = CGF.Builder.CreateIntCast(
   3798           CGF.getVLASize(
   3799                  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
   3800               .NumElts,
   3801           CGF.SizeTy, /*isSigned=*/false);
   3802       CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
   3803                               Elem);
   3804     }
   3805   }
   3806 
   3807   llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
   3808       ReductionList.getPointer(), CGF.VoidPtrTy);
   3809   llvm::Function *ReductionFn = emitReductionFunction(
   3810       Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
   3811       LHSExprs, RHSExprs, ReductionOps);
   3812   llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
   3813   llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
   3814       CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
   3815   llvm::Value *InterWarpCopyFn =
   3816       emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
   3817 
   3818   if (ParallelReduction) {
   3819     llvm::Value *Args[] = {RTLoc,
   3820                            ThreadId,
   3821                            CGF.Builder.getInt32(RHSExprs.size()),
   3822                            ReductionArrayTySize,
   3823                            RL,
   3824                            ShuffleAndReduceFn,
   3825                            InterWarpCopyFn};
   3826 
   3827     Res = CGF.EmitRuntimeCall(
   3828         OMPBuilder.getOrCreateRuntimeFunction(
   3829             CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
   3830         Args);
   3831   } else {
   3832     assert(TeamsReduction && "expected teams reduction.");
   3833     llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
   3834     llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
   3835     int Cnt = 0;
   3836     for (const Expr *DRE : Privates) {
   3837       PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
   3838       ++Cnt;
   3839     }
   3840     const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
   3841         CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
   3842         C.getLangOpts().OpenMPCUDAReductionBufNum);
   3843     TeamsReductions.push_back(TeamReductionRec);
   3844     if (!KernelTeamsReductionPtr) {
   3845       KernelTeamsReductionPtr = new llvm::GlobalVariable(
   3846           CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
   3847           llvm::GlobalValue::InternalLinkage, nullptr,
   3848           "_openmp_teams_reductions_buffer_$_$ptr");
   3849     }
   3850     llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
   3851         Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
   3852         /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
   3853     llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
   3854         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
   3855     llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
   3856         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
   3857         ReductionFn);
   3858     llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
   3859         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
   3860     llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
   3861         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
   3862         ReductionFn);
   3863 
   3864     llvm::Value *Args[] = {
   3865         RTLoc,
   3866         ThreadId,
   3867         GlobalBufferPtr,
   3868         CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
   3869         RL,
   3870         ShuffleAndReduceFn,
   3871         InterWarpCopyFn,
   3872         GlobalToBufferCpyFn,
   3873         GlobalToBufferRedFn,
   3874         BufferToGlobalCpyFn,
   3875         BufferToGlobalRedFn};
   3876 
   3877     Res = CGF.EmitRuntimeCall(
   3878         OMPBuilder.getOrCreateRuntimeFunction(
   3879             CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
   3880         Args);
   3881   }
   3882 
   3883   // 5. Build if (res == 1)
   3884   llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
   3885   llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
   3886   llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
   3887       Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
   3888   CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
   3889 
   3890   // 6. Build then branch: where we have reduced values in the master
   3891   //    thread in each team.
   3892   //    __kmpc_end_reduce{_nowait}(<gtid>);
   3893   //    break;
   3894   CGF.EmitBlock(ThenBB);
   3895 
   3896   // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
   3897   auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
   3898                     this](CodeGenFunction &CGF, PrePostActionTy &Action) {
   3899     auto IPriv = Privates.begin();
   3900     auto ILHS = LHSExprs.begin();
   3901     auto IRHS = RHSExprs.begin();
   3902     for (const Expr *E : ReductionOps) {
   3903       emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
   3904                                   cast<DeclRefExpr>(*IRHS));
   3905       ++IPriv;
   3906       ++ILHS;
   3907       ++IRHS;
   3908     }
   3909   };
   3910   llvm::Value *EndArgs[] = {ThreadId};
   3911   RegionCodeGenTy RCG(CodeGen);
   3912   NVPTXActionTy Action(
   3913       nullptr, llvm::None,
   3914       OMPBuilder.getOrCreateRuntimeFunction(
   3915           CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
   3916       EndArgs);
   3917   RCG.setAction(Action);
   3918   RCG(CGF);
   3919   // There is no need to emit line number for unconditional branch.
   3920   (void)ApplyDebugLocation::CreateEmpty(CGF);
   3921   CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
   3922 }
   3923 
   3924 const VarDecl *
   3925 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
   3926                                        const VarDecl *NativeParam) const {
   3927   if (!NativeParam->getType()->isReferenceType())
   3928     return NativeParam;
   3929   QualType ArgType = NativeParam->getType();
   3930   QualifierCollector QC;
   3931   const Type *NonQualTy = QC.strip(ArgType);
   3932   QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
   3933   if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
   3934     if (Attr->getCaptureKind() == OMPC_map) {
   3935       PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
   3936                                                         LangAS::opencl_global);
   3937     } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
   3938                PointeeTy.isConstant(CGM.getContext())) {
   3939       PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
   3940                                                         LangAS::opencl_generic);
   3941     }
   3942   }
   3943   ArgType = CGM.getContext().getPointerType(PointeeTy);
   3944   QC.addRestrict();
   3945   enum { NVPTX_local_addr = 5 };
   3946   QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
   3947   ArgType = QC.apply(CGM.getContext(), ArgType);
   3948   if (isa<ImplicitParamDecl>(NativeParam))
   3949     return ImplicitParamDecl::Create(
   3950         CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
   3951         NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
   3952   return ParmVarDecl::Create(
   3953       CGM.getContext(),
   3954       const_cast<DeclContext *>(NativeParam->getDeclContext()),
   3955       NativeParam->getBeginLoc(), NativeParam->getLocation(),
   3956       NativeParam->getIdentifier(), ArgType,
   3957       /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
   3958 }
   3959 
/// Materialize the address of a native (reference-typed) parameter from the
/// translated target parameter: loads the pointer stored in the target
/// parameter, casts it through the generic address space into the address
/// space of the native pointee, and stores the result into a fresh local
/// temporary whose address is returned.
Address
CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
                                          const VarDecl *NativeParam,
                                          const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  // The native parameter is a reference; its pointee determines the target
  // address space we must end up in.
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  // Load the pointer value held by the translated parameter.
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
  // First cast to generic.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      /*AddrSpace=*/0));
  // Cast from generic to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      NativePointeeAddrSpace));
  // Spill the adjusted pointer into a temporary typed as the native
  // parameter and hand back that temporary's address.
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}
   3990 
   3991 void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
   3992     CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
   3993     ArrayRef<llvm::Value *> Args) const {
   3994   SmallVector<llvm::Value *, 4> TargetArgs;
   3995   TargetArgs.reserve(Args.size());
   3996   auto *FnType = OutlinedFn.getFunctionType();
   3997   for (unsigned I = 0, E = Args.size(); I < E; ++I) {
   3998     if (FnType->isVarArg() && FnType->getNumParams() <= I) {
   3999       TargetArgs.append(std::next(Args.begin(), I), Args.end());
   4000       break;
   4001     }
   4002     llvm::Type *TargetType = FnType->getParamType(I);
   4003     llvm::Value *NativeArg = Args[I];
   4004     if (!TargetType->isPointerTy()) {
   4005       TargetArgs.emplace_back(NativeArg);
   4006       continue;
   4007     }
   4008     llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
   4009         NativeArg,
   4010         NativeArg->getType()->getPointerElementType()->getPointerTo());
   4011     TargetArgs.emplace_back(
   4012         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
   4013   }
   4014   CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
   4015 }
   4016 
/// Emit function which wraps the outline parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  // Signature: void wrapper(uint16 parallel_level, uint32 thread_id).
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamDecl::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamDecl::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());

  // Ensure we do not inline the function. This is trivially true for the ones
  // passed to __kmpc_fork_call but the ones called in serialized regions
  // could be inlined. This is not perfect but it is closer to the invariant
  // we want, namely, every data environment starts with a new function.
  // TODO: We should pass the if condition to the runtime function and do the
  //       handling there. Much cleaner code.
  Fn->addFnAttr(llvm::Attribute::NoInline);

  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getBeginLoc(), D.getBeginLoc());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  // Get the array of arguments.
  // The first two arguments of the outlined function are the thread id
  // (taken from the wrapper's second parameter) and a zeroed bound-tid slot.
  SmallVector<llvm::Value *, 8> Args;

  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
  Args.emplace_back(ZeroAddr.getPointer());

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  // __kmpc_get_shared_variables fills GlobalArgs with a pointer to the
  // list of shared-variable references set up by the parallel master.
  Address GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
                      DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  Address SharedArgListAddress = Address::invalid();
  if (CS.capture_size() > 0 ||
      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    SharedArgListAddress = CGF.EmitLoadOfPointer(
        GlobalArgs, CGF.getContext()
                        .getPointerType(CGF.getContext().getPointerType(
                            CGF.getContext().VoidPtrTy))
                        .castAs<PointerType>());
  }
  unsigned Idx = 0;
  // For loop-bound-sharing directives the first two shared slots hold the
  // lower and upper bounds; load and forward them before the captures.
  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *LB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
    Args.emplace_back(LB);
    ++Idx;
    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *UB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
    Args.emplace_back(UB);
    ++Idx;
  }
  // Forward each captured variable, loading it from the shared-args list.
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                              /*Volatile=*/false,
                                              CGFContext.getPointerType(ElemTy),
                                              CI->getLocation());
      // By-copy captures of non-pointer values are widened to uintptr for
      // the outlined function's parameter convention.
      if (CI->capturesVariableByCopy() &&
          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
                              CI->getLocation());
      }
      Args.emplace_back(Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}
   4147 
/// Register the variables of \p D that escape their declaration context so
/// they can be globalized, and (unless globalization must be delayed for an
/// enclosed OpenMP region) emit the globalization prolog now plus a cleanup
/// that emits the matching epilog. Only active in Generic data-sharing mode.
void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
                                              const Decl *D) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
         "Team is set but not processed.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    // For OpenMP captured regions the prolog/epilog are emitted by the
    // region's own codegen later; in SPMD mode nothing is needed at all.
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  // Collect escaping variables; team reductions recorded earlier are folded
  // into the analysis and consumed here.
  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord =
      VarChecker.getGlobalizedRecord(IsInTTDRegion);
  TeamAndReductions.first = nullptr;
  TeamAndReductions.second.clear();
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
    return;
  // Record per-function globalization data keyed by the emitted function.
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      std::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
  }
  // Build a secondary record with a fresh analysis pass (no team reductions)
  // for the case described by the flags below — NOTE(review): presumably the
  // SPMD-vs-generic dual emission path; confirm against callers.
  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
    CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
    VarChecker.Visit(Body);
    I->getSecond().SecondaryGlobalRecord =
        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
    I->getSecond().SecondaryLocalVarData.emplace();
    DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
    for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
      assert(VD->isCanonicalDecl() && "Expected canonical declaration");
      const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
      Data.insert(
          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
    }
  }
  if (!NeedToDelayGlobalization) {
    // Emit the prolog now and schedule the epilog via an EH-stack cleanup so
    // it runs on both normal and exceptional exits.
    emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}
   4225 
/// Return the device address for a local variable. Variables with an
/// `omp allocate` attribute naming a GPU memory allocator get a module-level
/// global in the corresponding address space; otherwise, in Generic mode,
/// globalized (escaped) variables resolve to their recorded private address.
/// Returns Address::invalid() when default (stack) allocation should be used.
Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                        const VarDecl *VD) {
  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
    auto AS = LangAS::Default;
    switch (A->getAllocatorType()) {
      // Use the default allocator here as by default local vars are
      // threadlocal.
    case OMPAllocateDeclAttr::OMPNullMemAlloc:
    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
      // Follow the user decision - use default allocation.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      // TODO: implement support for user-defined allocators.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPConstMemAlloc:
      AS = LangAS::cuda_constant;
      break;
    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
      AS = LangAS::cuda_shared;
      break;
    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
      break;
    }
    // Create a zero-initialized internal global in the selected address
    // space and hand back its address cast to the variable's own space.
    llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), VarTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
        VD->getName(),
        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(AS));
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    GV->setAlignment(Align.getAsAlign());
    return Address(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
                    VD->getType().getAddressSpace()))),
        Align);
  }

  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
    return Address::invalid();

  // Look up the globalized address recorded by emitFunctionProlog for this
  // function, first for the variable itself ...
  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  // ... then for any variable it references via OMPReferencedVarAttr.
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
         E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }

  return Address::invalid();
}
   4294 
/// Drop the per-function globalized-variable bookkeeping for the finished
/// function, then let the base runtime perform its own cleanup.
void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}
   4299 
   4300 void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
   4301     CodeGenFunction &CGF, const OMPLoopDirective &S,
   4302     OpenMPDistScheduleClauseKind &ScheduleKind,
   4303     llvm::Value *&Chunk) const {
   4304   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   4305   if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
   4306     ScheduleKind = OMPC_DIST_SCHEDULE_static;
   4307     Chunk = CGF.EmitScalarConversion(
   4308         RT.getGPUNumThreads(CGF),
   4309         CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
   4310         S.getIterationVariable()->getType(), S.getBeginLoc());
   4311     return;
   4312   }
   4313   CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
   4314       CGF, S, ScheduleKind, Chunk);
   4315 }
   4316 
   4317 void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
   4318     CodeGenFunction &CGF, const OMPLoopDirective &S,
   4319     OpenMPScheduleClauseKind &ScheduleKind,
   4320     const Expr *&ChunkExpr) const {
   4321   ScheduleKind = OMPC_SCHEDULE_static;
   4322   // Chunk size is 1 in this case.
   4323   llvm::APInt ChunkSize(32, 1);
   4324   ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
   4325       CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
   4326       SourceLocation());
   4327 }
   4328 
/// For a target-based directive, patch up lambda objects captured by the
/// region: store the current `this` pointer and the addresses of by-reference
/// lambda captures directly into the lambda's capture fields so the device
/// copy of the closure sees valid pointers.
void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         " Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    // Only lambda-typed captures (possibly behind a reference) need fixing.
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    if (!RD || !RD->isLambda())
      continue;
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
    llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    // Re-store the enclosing `this` into the lambda's `this` capture field.
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
    }
    // Re-store the address of every by-reference capture that the target
    // region also captures.
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const VarDecl *VD = LC.getCapturedVar();
      if (!CS->capturesVariable(VD))
        continue;
      auto It = Captures.find(VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(VD);
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
                                               VD->getType().getCanonicalType())
                     .getAddress(CGF);
      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
    }
  }
}
   4380 
   4381 unsigned CGOpenMPRuntimeGPU::getDefaultFirstprivateAddressSpace() const {
   4382   return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
   4383 }
   4384 
   4385 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
   4386                                                             LangAS &AS) {
   4387   if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
   4388     return false;
   4389   const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
   4390   switch(A->getAllocatorType()) {
   4391   case OMPAllocateDeclAttr::OMPNullMemAlloc:
   4392   case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
   4393   // Not supported, fallback to the default mem space.
   4394   case OMPAllocateDeclAttr::OMPThreadMemAlloc:
   4395   case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
   4396   case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
   4397   case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
   4398   case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
   4399     AS = LangAS::Default;
   4400     return true;
   4401   case OMPAllocateDeclAttr::OMPConstMemAlloc:
   4402     AS = LangAS::cuda_constant;
   4403     return true;
   4404   case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
   4405     AS = LangAS::cuda_shared;
   4406     return true;
   4407   case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
   4408     llvm_unreachable("Expected predefined allocator for the variables with the "
   4409                      "static storage.");
   4410   }
   4411   return false;
   4412 }
   4413 
   4414 // Get current CudaArch and ignore any unknown values
   4415 static CudaArch getCudaArch(CodeGenModule &CGM) {
   4416   if (!CGM.getTarget().hasFeature("ptx"))
   4417     return CudaArch::UNKNOWN;
   4418   for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
   4419     if (Feature.getValue()) {
   4420       CudaArch Arch = StringToCudaArch(Feature.getKey());
   4421       if (Arch != CudaArch::UNKNOWN)
   4422         return Arch;
   4423     }
   4424   }
   4425   return CudaArch::UNKNOWN;
   4426 }
   4427 
/// Check to see if target architecture supports unified addressing which is
/// a restriction for OpenMP requires clause "unified_shared_memory".
/// Emits a diagnostic for the listed pre-sm_60 NVPTX architectures and then
/// defers to the base runtime for common processing.
void CGOpenMPRuntimeGPU::processRequiresDirective(
    const OMPRequiresDecl *D) {
  for (const OMPClause *Clause : D->clauselists()) {
    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
      CudaArch Arch = getCudaArch(CGM);
      switch (Arch) {
      // Architectures rejected for unified_shared_memory.
      case CudaArch::SM_20:
      case CudaArch::SM_21:
      case CudaArch::SM_30:
      case CudaArch::SM_32:
      case CudaArch::SM_35:
      case CudaArch::SM_37:
      case CudaArch::SM_50:
      case CudaArch::SM_52:
      case CudaArch::SM_53: {
        SmallString<256> Buffer;
        llvm::raw_svector_ostream Out(Buffer);
        Out << "Target architecture " << CudaArchToString(Arch)
            << " does not support unified addressing";
        CGM.Error(Clause->getBeginLoc(), Out.str());
        return;
      }
      // All remaining known architectures (and UNKNOWN) are accepted.
      case CudaArch::SM_60:
      case CudaArch::SM_61:
      case CudaArch::SM_62:
      case CudaArch::SM_70:
      case CudaArch::SM_72:
      case CudaArch::SM_75:
      case CudaArch::SM_80:
      case CudaArch::SM_86:
      case CudaArch::GFX600:
      case CudaArch::GFX601:
      case CudaArch::GFX602:
      case CudaArch::GFX700:
      case CudaArch::GFX701:
      case CudaArch::GFX702:
      case CudaArch::GFX703:
      case CudaArch::GFX704:
      case CudaArch::GFX705:
      case CudaArch::GFX801:
      case CudaArch::GFX802:
      case CudaArch::GFX803:
      case CudaArch::GFX805:
      case CudaArch::GFX810:
      case CudaArch::GFX900:
      case CudaArch::GFX902:
      case CudaArch::GFX904:
      case CudaArch::GFX906:
      case CudaArch::GFX908:
      case CudaArch::GFX909:
      case CudaArch::GFX90a:
      case CudaArch::GFX90c:
      case CudaArch::GFX1010:
      case CudaArch::GFX1011:
      case CudaArch::GFX1012:
      case CudaArch::GFX1030:
      case CudaArch::GFX1031:
      case CudaArch::GFX1032:
      case CudaArch::GFX1033:
      case CudaArch::GFX1034:
      case CudaArch::UNUSED:
      case CudaArch::UNKNOWN:
        break;
      case CudaArch::LAST:
        llvm_unreachable("Unexpected Cuda arch.");
      }
    }
  }
  CGOpenMPRuntime::processRequiresDirective(D);
}
   4500 
   4501 /// Get number of SMs and number of blocks per SM.
   4502 static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
   4503   std::pair<unsigned, unsigned> Data;
   4504   if (CGM.getLangOpts().OpenMPCUDANumSMs)
   4505     Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
   4506   if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
   4507     Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
   4508   if (Data.first && Data.second)
   4509     return Data;
   4510   switch (getCudaArch(CGM)) {
   4511   case CudaArch::SM_20:
   4512   case CudaArch::SM_21:
   4513   case CudaArch::SM_30:
   4514   case CudaArch::SM_32:
   4515   case CudaArch::SM_35:
   4516   case CudaArch::SM_37:
   4517   case CudaArch::SM_50:
   4518   case CudaArch::SM_52:
   4519   case CudaArch::SM_53:
   4520     return {16, 16};
   4521   case CudaArch::SM_60:
   4522   case CudaArch::SM_61:
   4523   case CudaArch::SM_62:
   4524     return {56, 32};
   4525   case CudaArch::SM_70:
   4526   case CudaArch::SM_72:
   4527   case CudaArch::SM_75:
   4528   case CudaArch::SM_80:
   4529   case CudaArch::SM_86:
   4530     return {84, 32};
   4531   case CudaArch::GFX600:
   4532   case CudaArch::GFX601:
   4533   case CudaArch::GFX602:
   4534   case CudaArch::GFX700:
   4535   case CudaArch::GFX701:
   4536   case CudaArch::GFX702:
   4537   case CudaArch::GFX703:
   4538   case CudaArch::GFX704:
   4539   case CudaArch::GFX705:
   4540   case CudaArch::GFX801:
   4541   case CudaArch::GFX802:
   4542   case CudaArch::GFX803:
   4543   case CudaArch::GFX805:
   4544   case CudaArch::GFX810:
   4545   case CudaArch::GFX900:
   4546   case CudaArch::GFX902:
   4547   case CudaArch::GFX904:
   4548   case CudaArch::GFX906:
   4549   case CudaArch::GFX908:
   4550   case CudaArch::GFX909:
   4551   case CudaArch::GFX90a:
   4552   case CudaArch::GFX90c:
   4553   case CudaArch::GFX1010:
   4554   case CudaArch::GFX1011:
   4555   case CudaArch::GFX1012:
   4556   case CudaArch::GFX1030:
   4557   case CudaArch::GFX1031:
   4558   case CudaArch::GFX1032:
   4559   case CudaArch::GFX1033:
   4560   case CudaArch::GFX1034:
   4561   case CudaArch::UNUSED:
   4562   case CudaArch::UNKNOWN:
   4563     break;
   4564   case CudaArch::LAST:
   4565     llvm_unreachable("Unexpected Cuda arch.");
   4566   }
   4567   llvm_unreachable("Unexpected NVPTX target without ptx feature.");
   4568 }
   4569 
/// Finish emission for the GPU runtime: materialize the backing storage for
/// all globalized records (placing each group in shared or global device
/// memory) and for the teams-reduction scratch buffer, back-patch the
/// placeholder globals that reference them, then run the base-class cleanup.
void CGOpenMPRuntimeGPU::clear() {
  if (!GlobalizedRecords.empty() &&
      !CGM.getLangOpts().OpenMPCUDATargetParallel) {
    ASTContext &C = CGM.getContext();
    // Record groups are split by size: small ones are backed by shared
    // memory, the rest by a global-memory buffer.
    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
    // Both containers are built as unions, so the storage of the different
    // groups overlays in the same allocation.
    RecordDecl *StaticRD = C.buildImplicitRecord(
        "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
    StaticRD->startDefinition();
    RecordDecl *SharedStaticRD = C.buildImplicitRecord(
        "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
    SharedStaticRD->startDefinition();
    for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
      if (Records.Records.empty())
        continue;
      // Compute the combined size of this group's records when laid out
      // sequentially, each placed and padded at its natural alignment.
      unsigned Size = 0;
      unsigned RecAlignment = 0;
      for (const RecordDecl *RD : Records.Records) {
        QualType RDTy = C.getRecordType(RD);
        unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
        RecAlignment = std::max(RecAlignment, Alignment);
        unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
        Size =
            llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
      }
      Size = llvm::alignTo(Size, RecAlignment);
      // Model the group's storage as a char array of the computed size.
      llvm::APInt ArySize(/*numBits=*/64, Size);
      QualType SubTy = C.getConstantArrayType(
          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
      // Groups that fit within SharedMemorySize go into the shared-memory
      // union; larger ones into the global-memory union.
      const bool UseSharedMemory = Size <= SharedMemorySize;
      auto *Field =
          FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
                            SourceLocation(), SourceLocation(), nullptr, SubTy,
                            C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
                            /*BW=*/nullptr, /*Mutable=*/false,
                            /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (UseSharedMemory) {
        SharedStaticRD->addDecl(Field);
        SharedRecs.push_back(&Records);
      } else {
        StaticRD->addDecl(Field);
        GlobalRecs.push_back(&Records);
      }
      // Back-patch the group's size and placement globals with the final
      // values now that the layout decision is made.
      Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
      Records.UseSharedMemory->setInitializer(
          llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
    }
    // Allocate SharedMemorySize buffer for the shared memory.
    // FIXME: nvlink does not handle weak linkage correctly (objects with
    // different sizes are reported as erroneous).
    // Restore this code as soon as nvlink is fixed.
    if (!SharedStaticRD->field_empty()) {
      // Pad the shared-memory union with a full SharedMemorySize char array.
      // NOTE(review): presumably this keeps the weak object the same size in
      // every TU — see the nvlink FIXME above; confirm before changing.
      llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
      QualType SubTy = C.getConstantArrayType(
          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
      auto *Field = FieldDecl::Create(
          C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
          C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      SharedStaticRD->addDecl(Field);
    }
    SharedStaticRD->completeDefinition();
    if (!SharedStaticRD->field_empty()) {
      // Emit the shared-memory blob in the CUDA shared address space and
      // redirect every placeholder buffer to it.
      QualType StaticTy = C.getRecordType(SharedStaticRD);
      llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
      auto *GV = new llvm::GlobalVariable(
          CGM.getModule(), LLVMStaticTy,
          /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
          llvm::UndefValue::get(LLVMStaticTy),
          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
          llvm::GlobalValue::NotThreadLocal,
          C.getTargetAddressSpace(LangAS::cuda_shared));
      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
          GV, CGM.VoidPtrTy);
      for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
        Rec->Buffer->replaceAllUsesWith(Replacement);
        Rec->Buffer->eraseFromParent();
      }
    }
    StaticRD->completeDefinition();
    if (!StaticRD->field_empty()) {
      QualType StaticTy = C.getRecordType(StaticRD);
      // The global-memory buffer is a 2-D array of the union type, sized
      // [number of SMs][blocks per SM].
      std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
      llvm::APInt Size1(32, SMsBlockPerSM.second);
      QualType Arr1Ty =
          C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
                                 /*IndexTypeQuals=*/0);
      llvm::APInt Size2(32, SMsBlockPerSM.first);
      QualType Arr2Ty =
          C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
                                 /*IndexTypeQuals=*/0);
      llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
      // FIXME: nvlink does not handle weak linkage correctly (objects with
      // different sizes are reported as erroneous).
      // Restore CommonLinkage as soon as nvlink is fixed.
      auto *GV = new llvm::GlobalVariable(
          CGM.getModule(), LLVMArr2Ty,
          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
          llvm::Constant::getNullValue(LLVMArr2Ty),
          "_openmp_static_glob_rd_$_");
      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
          GV, CGM.VoidPtrTy);
      for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
        Rec->Buffer->replaceAllUsesWith(Replacement);
        Rec->Buffer->eraseFromParent();
      }
    }
  }
  if (!TeamsReductions.empty()) {
    ASTContext &C = CGM.getContext();
    // Overlay all team-reduction record layouts in a single union and emit
    // one buffer of that type for the whole module.
    RecordDecl *StaticRD = C.buildImplicitRecord(
        "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
    StaticRD->startDefinition();
    for (const RecordDecl *TeamReductionRec : TeamsReductions) {
      QualType RecTy = C.getRecordType(TeamReductionRec);
      auto *Field = FieldDecl::Create(
          C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
          C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      StaticRD->addDecl(Field);
    }
    StaticRD->completeDefinition();
    QualType StaticTy = C.getRecordType(StaticRD);
    llvm::Type *LLVMReductionsBufferTy =
        CGM.getTypes().ConvertTypeForMem(StaticTy);
    // FIXME: nvlink does not handle weak linkage correctly (objects with
    // different sizes are reported as erroneous).
    // Restore CommonLinkage as soon as nvlink is fixed.
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), LLVMReductionsBufferTy,
        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
        llvm::Constant::getNullValue(LLVMReductionsBufferTy),
        "_openmp_teams_reductions_buffer_$_");
    // Point the kernel-visible pointer global at the freshly created buffer.
    KernelTeamsReductionPtr->setInitializer(
        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
                                                             CGM.VoidPtrTy));
  }
  CGOpenMPRuntime::clear();
}
   4714