1 1.1 joerg //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===// 2 1.1 joerg // 3 1.1 joerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 1.1 joerg // See https://llvm.org/LICENSE.txt for license information. 5 1.1 joerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 1.1 joerg // 7 1.1 joerg //===----------------------------------------------------------------------===// 8 1.1 joerg // 9 1.1 joerg // This provides a generalized class for OpenMP runtime code generation 10 1.1 joerg // specialized by GPU targets NVPTX and AMDGCN. 11 1.1 joerg // 12 1.1 joerg //===----------------------------------------------------------------------===// 13 1.1 joerg 14 1.1 joerg #include "CGOpenMPRuntimeGPU.h" 15 1.1 joerg #include "CGOpenMPRuntimeNVPTX.h" 16 1.1 joerg #include "CodeGenFunction.h" 17 1.1 joerg #include "clang/AST/Attr.h" 18 1.1 joerg #include "clang/AST/DeclOpenMP.h" 19 1.1 joerg #include "clang/AST/StmtOpenMP.h" 20 1.1 joerg #include "clang/AST/StmtVisitor.h" 21 1.1 joerg #include "clang/Basic/Cuda.h" 22 1.1 joerg #include "llvm/ADT/SmallPtrSet.h" 23 1.1 joerg #include "llvm/Frontend/OpenMP/OMPGridValues.h" 24 1.1 joerg #include "llvm/IR/IntrinsicsNVPTX.h" 25 1.1 joerg 26 1.1 joerg using namespace clang; 27 1.1 joerg using namespace CodeGen; 28 1.1 joerg using namespace llvm::omp; 29 1.1 joerg 30 1.1 joerg namespace { 31 1.1 joerg /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. 
32 1.1 joerg class NVPTXActionTy final : public PrePostActionTy { 33 1.1 joerg llvm::FunctionCallee EnterCallee = nullptr; 34 1.1 joerg ArrayRef<llvm::Value *> EnterArgs; 35 1.1 joerg llvm::FunctionCallee ExitCallee = nullptr; 36 1.1 joerg ArrayRef<llvm::Value *> ExitArgs; 37 1.1 joerg bool Conditional = false; 38 1.1 joerg llvm::BasicBlock *ContBlock = nullptr; 39 1.1 joerg 40 1.1 joerg public: 41 1.1 joerg NVPTXActionTy(llvm::FunctionCallee EnterCallee, 42 1.1 joerg ArrayRef<llvm::Value *> EnterArgs, 43 1.1 joerg llvm::FunctionCallee ExitCallee, 44 1.1 joerg ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false) 45 1.1 joerg : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee), 46 1.1 joerg ExitArgs(ExitArgs), Conditional(Conditional) {} 47 1.1 joerg void Enter(CodeGenFunction &CGF) override { 48 1.1 joerg llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs); 49 1.1 joerg if (Conditional) { 50 1.1 joerg llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes); 51 1.1 joerg auto *ThenBlock = CGF.createBasicBlock("omp_if.then"); 52 1.1 joerg ContBlock = CGF.createBasicBlock("omp_if.end"); 53 1.1 joerg // Generate the branch (If-stmt) 54 1.1 joerg CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock); 55 1.1 joerg CGF.EmitBlock(ThenBlock); 56 1.1 joerg } 57 1.1 joerg } 58 1.1 joerg void Done(CodeGenFunction &CGF) { 59 1.1 joerg // Emit the rest of blocks/branches 60 1.1 joerg CGF.EmitBranch(ContBlock); 61 1.1 joerg CGF.EmitBlock(ContBlock, true); 62 1.1 joerg } 63 1.1 joerg void Exit(CodeGenFunction &CGF) override { 64 1.1 joerg CGF.EmitRuntimeCall(ExitCallee, ExitArgs); 65 1.1 joerg } 66 1.1 joerg }; 67 1.1 joerg 68 1.1 joerg /// A class to track the execution mode when codegening directives within 69 1.1 joerg /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry 70 1.1 joerg /// to the target region and used by containing directives such as 'parallel' 71 1.1 joerg /// to emit optimized code. 
72 1.1 joerg class ExecutionRuntimeModesRAII { 73 1.1 joerg private: 74 1.1 joerg CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode = 75 1.1 joerg CGOpenMPRuntimeGPU::EM_Unknown; 76 1.1 joerg CGOpenMPRuntimeGPU::ExecutionMode &ExecMode; 77 1.1 joerg bool SavedRuntimeMode = false; 78 1.1 joerg bool *RuntimeMode = nullptr; 79 1.1 joerg 80 1.1 joerg public: 81 1.1 joerg /// Constructor for Non-SPMD mode. 82 1.1 joerg ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode) 83 1.1 joerg : ExecMode(ExecMode) { 84 1.1 joerg SavedExecMode = ExecMode; 85 1.1 joerg ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD; 86 1.1 joerg } 87 1.1 joerg /// Constructor for SPMD mode. 88 1.1 joerg ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode, 89 1.1 joerg bool &RuntimeMode, bool FullRuntimeMode) 90 1.1 joerg : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) { 91 1.1 joerg SavedExecMode = ExecMode; 92 1.1 joerg SavedRuntimeMode = RuntimeMode; 93 1.1 joerg ExecMode = CGOpenMPRuntimeGPU::EM_SPMD; 94 1.1 joerg RuntimeMode = FullRuntimeMode; 95 1.1 joerg } 96 1.1 joerg ~ExecutionRuntimeModesRAII() { 97 1.1 joerg ExecMode = SavedExecMode; 98 1.1 joerg if (RuntimeMode) 99 1.1 joerg *RuntimeMode = SavedRuntimeMode; 100 1.1 joerg } 101 1.1 joerg }; 102 1.1 joerg 103 1.1 joerg /// GPU Configuration: This information can be derived from cuda registers, 104 1.1 joerg /// however, providing compile time constants helps generate more efficient 105 1.1 joerg /// code. For all practical purposes this is fine because the configuration 106 1.1 joerg /// is the same for all known NVPTX architectures. 107 1.1 joerg enum MachineConfiguration : unsigned { 108 1.1 joerg /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target 109 1.1 joerg /// specific Grid Values like GV_Warp_Size, GV_Warp_Size_Log2, 110 1.1 joerg /// and GV_Warp_Size_Log2_Mask. 111 1.1 joerg 112 1.1 joerg /// Global memory alignment for performance. 
113 1.1 joerg GlobalMemoryAlignment = 128, 114 1.1 joerg 115 1.1 joerg /// Maximal size of the shared memory buffer. 116 1.1 joerg SharedMemorySize = 128, 117 1.1 joerg }; 118 1.1 joerg 119 1.1 joerg static const ValueDecl *getPrivateItem(const Expr *RefExpr) { 120 1.1 joerg RefExpr = RefExpr->IgnoreParens(); 121 1.1 joerg if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) { 122 1.1 joerg const Expr *Base = ASE->getBase()->IgnoreParenImpCasts(); 123 1.1 joerg while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) 124 1.1 joerg Base = TempASE->getBase()->IgnoreParenImpCasts(); 125 1.1 joerg RefExpr = Base; 126 1.1 joerg } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) { 127 1.1 joerg const Expr *Base = OASE->getBase()->IgnoreParenImpCasts(); 128 1.1 joerg while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base)) 129 1.1 joerg Base = TempOASE->getBase()->IgnoreParenImpCasts(); 130 1.1 joerg while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) 131 1.1 joerg Base = TempASE->getBase()->IgnoreParenImpCasts(); 132 1.1 joerg RefExpr = Base; 133 1.1 joerg } 134 1.1 joerg RefExpr = RefExpr->IgnoreParenImpCasts(); 135 1.1 joerg if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr)) 136 1.1 joerg return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()); 137 1.1 joerg const auto *ME = cast<MemberExpr>(RefExpr); 138 1.1 joerg return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl()); 139 1.1 joerg } 140 1.1 joerg 141 1.1 joerg 142 1.1 joerg static RecordDecl *buildRecordForGlobalizedVars( 143 1.1 joerg ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls, 144 1.1 joerg ArrayRef<const ValueDecl *> EscapedDeclsForTeams, 145 1.1 joerg llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 146 1.1 joerg &MappedDeclsFields, int BufSize) { 147 1.1 joerg using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>; 148 1.1 joerg if (EscapedDecls.empty() && EscapedDeclsForTeams.empty()) 149 1.1 joerg return 
nullptr; 150 1.1 joerg SmallVector<VarsDataTy, 4> GlobalizedVars; 151 1.1 joerg for (const ValueDecl *D : EscapedDecls) 152 1.1 joerg GlobalizedVars.emplace_back( 153 1.1 joerg CharUnits::fromQuantity(std::max( 154 1.1 joerg C.getDeclAlign(D).getQuantity(), 155 1.1 joerg static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))), 156 1.1 joerg D); 157 1.1 joerg for (const ValueDecl *D : EscapedDeclsForTeams) 158 1.1 joerg GlobalizedVars.emplace_back(C.getDeclAlign(D), D); 159 1.1 joerg llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) { 160 1.1 joerg return L.first > R.first; 161 1.1 joerg }); 162 1.1 joerg 163 1.1 joerg // Build struct _globalized_locals_ty { 164 1.1 joerg // /* globalized vars */[WarSize] align (max(decl_align, 165 1.1 joerg // GlobalMemoryAlignment)) 166 1.1 joerg // /* globalized vars */ for EscapedDeclsForTeams 167 1.1 joerg // }; 168 1.1 joerg RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty"); 169 1.1 joerg GlobalizedRD->startDefinition(); 170 1.1 joerg llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped( 171 1.1 joerg EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end()); 172 1.1 joerg for (const auto &Pair : GlobalizedVars) { 173 1.1 joerg const ValueDecl *VD = Pair.second; 174 1.1 joerg QualType Type = VD->getType(); 175 1.1 joerg if (Type->isLValueReferenceType()) 176 1.1 joerg Type = C.getPointerType(Type.getNonReferenceType()); 177 1.1 joerg else 178 1.1 joerg Type = Type.getNonReferenceType(); 179 1.1 joerg SourceLocation Loc = VD->getLocation(); 180 1.1 joerg FieldDecl *Field; 181 1.1 joerg if (SingleEscaped.count(VD)) { 182 1.1 joerg Field = FieldDecl::Create( 183 1.1 joerg C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, 184 1.1 joerg C.getTrivialTypeSourceInfo(Type, SourceLocation()), 185 1.1 joerg /*BW=*/nullptr, /*Mutable=*/false, 186 1.1 joerg /*InitStyle=*/ICIS_NoInit); 187 1.1 joerg Field->setAccess(AS_public); 188 1.1 joerg if (VD->hasAttrs()) { 189 1.1 joerg for 
(specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()), 190 1.1 joerg E(VD->getAttrs().end()); 191 1.1 joerg I != E; ++I) 192 1.1 joerg Field->addAttr(*I); 193 1.1 joerg } 194 1.1 joerg } else { 195 1.1 joerg llvm::APInt ArraySize(32, BufSize); 196 1.1 joerg Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal, 197 1.1 joerg 0); 198 1.1 joerg Field = FieldDecl::Create( 199 1.1 joerg C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, 200 1.1 joerg C.getTrivialTypeSourceInfo(Type, SourceLocation()), 201 1.1 joerg /*BW=*/nullptr, /*Mutable=*/false, 202 1.1 joerg /*InitStyle=*/ICIS_NoInit); 203 1.1 joerg Field->setAccess(AS_public); 204 1.1 joerg llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(), 205 1.1 joerg static_cast<CharUnits::QuantityType>( 206 1.1 joerg GlobalMemoryAlignment))); 207 1.1 joerg Field->addAttr(AlignedAttr::CreateImplicit( 208 1.1 joerg C, /*IsAlignmentExpr=*/true, 209 1.1 joerg IntegerLiteral::Create(C, Align, 210 1.1 joerg C.getIntTypeForBitwidth(32, /*Signed=*/0), 211 1.1 joerg SourceLocation()), 212 1.1 joerg {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned)); 213 1.1 joerg } 214 1.1 joerg GlobalizedRD->addDecl(Field); 215 1.1 joerg MappedDeclsFields.try_emplace(VD, Field); 216 1.1 joerg } 217 1.1 joerg GlobalizedRD->completeDefinition(); 218 1.1 joerg return GlobalizedRD; 219 1.1 joerg } 220 1.1 joerg 221 1.1 joerg /// Get the list of variables that can escape their declaration context. 
222 1.1 joerg class CheckVarsEscapingDeclContext final 223 1.1 joerg : public ConstStmtVisitor<CheckVarsEscapingDeclContext> { 224 1.1 joerg CodeGenFunction &CGF; 225 1.1 joerg llvm::SetVector<const ValueDecl *> EscapedDecls; 226 1.1 joerg llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls; 227 1.1 joerg llvm::SmallPtrSet<const Decl *, 4> EscapedParameters; 228 1.1 joerg RecordDecl *GlobalizedRD = nullptr; 229 1.1 joerg llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; 230 1.1 joerg bool AllEscaped = false; 231 1.1 joerg bool IsForCombinedParallelRegion = false; 232 1.1 joerg 233 1.1 joerg void markAsEscaped(const ValueDecl *VD) { 234 1.1 joerg // Do not globalize declare target variables. 235 1.1 joerg if (!isa<VarDecl>(VD) || 236 1.1 joerg OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) 237 1.1 joerg return; 238 1.1 joerg VD = cast<ValueDecl>(VD->getCanonicalDecl()); 239 1.1 joerg // Use user-specified allocation. 240 1.1 joerg if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>()) 241 1.1 joerg return; 242 1.1 joerg // Variables captured by value must be globalized. 243 1.1 joerg if (auto *CSI = CGF.CapturedStmtInfo) { 244 1.1 joerg if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) { 245 1.1 joerg // Check if need to capture the variable that was already captured by 246 1.1 joerg // value in the outer region. 
247 1.1 joerg if (!IsForCombinedParallelRegion) { 248 1.1 joerg if (!FD->hasAttrs()) 249 1.1 joerg return; 250 1.1 joerg const auto *Attr = FD->getAttr<OMPCaptureKindAttr>(); 251 1.1 joerg if (!Attr) 252 1.1 joerg return; 253 1.1 joerg if (((Attr->getCaptureKind() != OMPC_map) && 254 1.1 joerg !isOpenMPPrivate(Attr->getCaptureKind())) || 255 1.1 joerg ((Attr->getCaptureKind() == OMPC_map) && 256 1.1 joerg !FD->getType()->isAnyPointerType())) 257 1.1 joerg return; 258 1.1 joerg } 259 1.1 joerg if (!FD->getType()->isReferenceType()) { 260 1.1 joerg assert(!VD->getType()->isVariablyModifiedType() && 261 1.1 joerg "Parameter captured by value with variably modified type"); 262 1.1 joerg EscapedParameters.insert(VD); 263 1.1 joerg } else if (!IsForCombinedParallelRegion) { 264 1.1 joerg return; 265 1.1 joerg } 266 1.1 joerg } 267 1.1 joerg } 268 1.1 joerg if ((!CGF.CapturedStmtInfo || 269 1.1 joerg (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) && 270 1.1 joerg VD->getType()->isReferenceType()) 271 1.1 joerg // Do not globalize variables with reference type. 
272 1.1 joerg return; 273 1.1 joerg if (VD->getType()->isVariablyModifiedType()) 274 1.1 joerg EscapedVariableLengthDecls.insert(VD); 275 1.1 joerg else 276 1.1 joerg EscapedDecls.insert(VD); 277 1.1 joerg } 278 1.1 joerg 279 1.1 joerg void VisitValueDecl(const ValueDecl *VD) { 280 1.1 joerg if (VD->getType()->isLValueReferenceType()) 281 1.1 joerg markAsEscaped(VD); 282 1.1 joerg if (const auto *VarD = dyn_cast<VarDecl>(VD)) { 283 1.1 joerg if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) { 284 1.1 joerg const bool SavedAllEscaped = AllEscaped; 285 1.1 joerg AllEscaped = VD->getType()->isLValueReferenceType(); 286 1.1 joerg Visit(VarD->getInit()); 287 1.1 joerg AllEscaped = SavedAllEscaped; 288 1.1 joerg } 289 1.1 joerg } 290 1.1 joerg } 291 1.1 joerg void VisitOpenMPCapturedStmt(const CapturedStmt *S, 292 1.1 joerg ArrayRef<OMPClause *> Clauses, 293 1.1 joerg bool IsCombinedParallelRegion) { 294 1.1 joerg if (!S) 295 1.1 joerg return; 296 1.1 joerg for (const CapturedStmt::Capture &C : S->captures()) { 297 1.1 joerg if (C.capturesVariable() && !C.capturesVariableByCopy()) { 298 1.1 joerg const ValueDecl *VD = C.getCapturedVar(); 299 1.1 joerg bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion; 300 1.1 joerg if (IsCombinedParallelRegion) { 301 1.1 joerg // Check if the variable is privatized in the combined construct and 302 1.1 joerg // those private copies must be shared in the inner parallel 303 1.1 joerg // directive. 
304 1.1 joerg IsForCombinedParallelRegion = false; 305 1.1 joerg for (const OMPClause *C : Clauses) { 306 1.1 joerg if (!isOpenMPPrivate(C->getClauseKind()) || 307 1.1 joerg C->getClauseKind() == OMPC_reduction || 308 1.1 joerg C->getClauseKind() == OMPC_linear || 309 1.1 joerg C->getClauseKind() == OMPC_private) 310 1.1 joerg continue; 311 1.1 joerg ArrayRef<const Expr *> Vars; 312 1.1 joerg if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C)) 313 1.1 joerg Vars = PC->getVarRefs(); 314 1.1 joerg else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C)) 315 1.1 joerg Vars = PC->getVarRefs(); 316 1.1 joerg else 317 1.1 joerg llvm_unreachable("Unexpected clause."); 318 1.1 joerg for (const auto *E : Vars) { 319 1.1 joerg const Decl *D = 320 1.1 joerg cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl(); 321 1.1 joerg if (D == VD->getCanonicalDecl()) { 322 1.1 joerg IsForCombinedParallelRegion = true; 323 1.1 joerg break; 324 1.1 joerg } 325 1.1 joerg } 326 1.1 joerg if (IsForCombinedParallelRegion) 327 1.1 joerg break; 328 1.1 joerg } 329 1.1 joerg } 330 1.1 joerg markAsEscaped(VD); 331 1.1 joerg if (isa<OMPCapturedExprDecl>(VD)) 332 1.1 joerg VisitValueDecl(VD); 333 1.1 joerg IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion; 334 1.1 joerg } 335 1.1 joerg } 336 1.1 joerg } 337 1.1 joerg 338 1.1 joerg void buildRecordForGlobalizedVars(bool IsInTTDRegion) { 339 1.1 joerg assert(!GlobalizedRD && 340 1.1 joerg "Record for globalized variables is built already."); 341 1.1 joerg ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams; 342 1.1 joerg unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); 343 1.1 joerg if (IsInTTDRegion) 344 1.1 joerg EscapedDeclsForTeams = EscapedDecls.getArrayRef(); 345 1.1 joerg else 346 1.1 joerg EscapedDeclsForParallel = EscapedDecls.getArrayRef(); 347 1.1 joerg GlobalizedRD = ::buildRecordForGlobalizedVars( 348 1.1 joerg CGF.getContext(), EscapedDeclsForParallel, 
EscapedDeclsForTeams, 349 1.1 joerg MappedDeclsFields, WarpSize); 350 1.1 joerg } 351 1.1 joerg 352 1.1 joerg public: 353 1.1 joerg CheckVarsEscapingDeclContext(CodeGenFunction &CGF, 354 1.1 joerg ArrayRef<const ValueDecl *> TeamsReductions) 355 1.1 joerg : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) { 356 1.1 joerg } 357 1.1 joerg virtual ~CheckVarsEscapingDeclContext() = default; 358 1.1 joerg void VisitDeclStmt(const DeclStmt *S) { 359 1.1 joerg if (!S) 360 1.1 joerg return; 361 1.1 joerg for (const Decl *D : S->decls()) 362 1.1 joerg if (const auto *VD = dyn_cast_or_null<ValueDecl>(D)) 363 1.1 joerg VisitValueDecl(VD); 364 1.1 joerg } 365 1.1 joerg void VisitOMPExecutableDirective(const OMPExecutableDirective *D) { 366 1.1 joerg if (!D) 367 1.1 joerg return; 368 1.1 joerg if (!D->hasAssociatedStmt()) 369 1.1 joerg return; 370 1.1 joerg if (const auto *S = 371 1.1 joerg dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) { 372 1.1 joerg // Do not analyze directives that do not actually require capturing, 373 1.1 joerg // like `omp for` or `omp simd` directives. 
374 1.1 joerg llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions; 375 1.1 joerg getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind()); 376 1.1 joerg if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) { 377 1.1 joerg VisitStmt(S->getCapturedStmt()); 378 1.1 joerg return; 379 1.1 joerg } 380 1.1 joerg VisitOpenMPCapturedStmt( 381 1.1 joerg S, D->clauses(), 382 1.1 joerg CaptureRegions.back() == OMPD_parallel && 383 1.1 joerg isOpenMPDistributeDirective(D->getDirectiveKind())); 384 1.1 joerg } 385 1.1 joerg } 386 1.1 joerg void VisitCapturedStmt(const CapturedStmt *S) { 387 1.1 joerg if (!S) 388 1.1 joerg return; 389 1.1 joerg for (const CapturedStmt::Capture &C : S->captures()) { 390 1.1 joerg if (C.capturesVariable() && !C.capturesVariableByCopy()) { 391 1.1 joerg const ValueDecl *VD = C.getCapturedVar(); 392 1.1 joerg markAsEscaped(VD); 393 1.1 joerg if (isa<OMPCapturedExprDecl>(VD)) 394 1.1 joerg VisitValueDecl(VD); 395 1.1 joerg } 396 1.1 joerg } 397 1.1 joerg } 398 1.1 joerg void VisitLambdaExpr(const LambdaExpr *E) { 399 1.1 joerg if (!E) 400 1.1 joerg return; 401 1.1 joerg for (const LambdaCapture &C : E->captures()) { 402 1.1 joerg if (C.capturesVariable()) { 403 1.1 joerg if (C.getCaptureKind() == LCK_ByRef) { 404 1.1 joerg const ValueDecl *VD = C.getCapturedVar(); 405 1.1 joerg markAsEscaped(VD); 406 1.1 joerg if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD)) 407 1.1 joerg VisitValueDecl(VD); 408 1.1 joerg } 409 1.1 joerg } 410 1.1 joerg } 411 1.1 joerg } 412 1.1 joerg void VisitBlockExpr(const BlockExpr *E) { 413 1.1 joerg if (!E) 414 1.1 joerg return; 415 1.1 joerg for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) { 416 1.1 joerg if (C.isByRef()) { 417 1.1 joerg const VarDecl *VD = C.getVariable(); 418 1.1 joerg markAsEscaped(VD); 419 1.1 joerg if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture()) 420 1.1 joerg VisitValueDecl(VD); 421 1.1 joerg } 422 1.1 joerg } 423 1.1 joerg } 424 1.1 joerg 
void VisitCallExpr(const CallExpr *E) { 425 1.1 joerg if (!E) 426 1.1 joerg return; 427 1.1 joerg for (const Expr *Arg : E->arguments()) { 428 1.1 joerg if (!Arg) 429 1.1 joerg continue; 430 1.1 joerg if (Arg->isLValue()) { 431 1.1 joerg const bool SavedAllEscaped = AllEscaped; 432 1.1 joerg AllEscaped = true; 433 1.1 joerg Visit(Arg); 434 1.1 joerg AllEscaped = SavedAllEscaped; 435 1.1 joerg } else { 436 1.1 joerg Visit(Arg); 437 1.1 joerg } 438 1.1 joerg } 439 1.1 joerg Visit(E->getCallee()); 440 1.1 joerg } 441 1.1 joerg void VisitDeclRefExpr(const DeclRefExpr *E) { 442 1.1 joerg if (!E) 443 1.1 joerg return; 444 1.1 joerg const ValueDecl *VD = E->getDecl(); 445 1.1 joerg if (AllEscaped) 446 1.1 joerg markAsEscaped(VD); 447 1.1 joerg if (isa<OMPCapturedExprDecl>(VD)) 448 1.1 joerg VisitValueDecl(VD); 449 1.1 joerg else if (const auto *VarD = dyn_cast<VarDecl>(VD)) 450 1.1 joerg if (VarD->isInitCapture()) 451 1.1 joerg VisitValueDecl(VD); 452 1.1 joerg } 453 1.1 joerg void VisitUnaryOperator(const UnaryOperator *E) { 454 1.1 joerg if (!E) 455 1.1 joerg return; 456 1.1 joerg if (E->getOpcode() == UO_AddrOf) { 457 1.1 joerg const bool SavedAllEscaped = AllEscaped; 458 1.1 joerg AllEscaped = true; 459 1.1 joerg Visit(E->getSubExpr()); 460 1.1 joerg AllEscaped = SavedAllEscaped; 461 1.1 joerg } else { 462 1.1 joerg Visit(E->getSubExpr()); 463 1.1 joerg } 464 1.1 joerg } 465 1.1 joerg void VisitImplicitCastExpr(const ImplicitCastExpr *E) { 466 1.1 joerg if (!E) 467 1.1 joerg return; 468 1.1 joerg if (E->getCastKind() == CK_ArrayToPointerDecay) { 469 1.1 joerg const bool SavedAllEscaped = AllEscaped; 470 1.1 joerg AllEscaped = true; 471 1.1 joerg Visit(E->getSubExpr()); 472 1.1 joerg AllEscaped = SavedAllEscaped; 473 1.1 joerg } else { 474 1.1 joerg Visit(E->getSubExpr()); 475 1.1 joerg } 476 1.1 joerg } 477 1.1 joerg void VisitExpr(const Expr *E) { 478 1.1 joerg if (!E) 479 1.1 joerg return; 480 1.1 joerg bool SavedAllEscaped = AllEscaped; 481 1.1 joerg if 
(!E->isLValue()) 482 1.1 joerg AllEscaped = false; 483 1.1 joerg for (const Stmt *Child : E->children()) 484 1.1 joerg if (Child) 485 1.1 joerg Visit(Child); 486 1.1 joerg AllEscaped = SavedAllEscaped; 487 1.1 joerg } 488 1.1 joerg void VisitStmt(const Stmt *S) { 489 1.1 joerg if (!S) 490 1.1 joerg return; 491 1.1 joerg for (const Stmt *Child : S->children()) 492 1.1 joerg if (Child) 493 1.1 joerg Visit(Child); 494 1.1 joerg } 495 1.1 joerg 496 1.1 joerg /// Returns the record that handles all the escaped local variables and used 497 1.1 joerg /// instead of their original storage. 498 1.1 joerg const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) { 499 1.1 joerg if (!GlobalizedRD) 500 1.1 joerg buildRecordForGlobalizedVars(IsInTTDRegion); 501 1.1 joerg return GlobalizedRD; 502 1.1 joerg } 503 1.1 joerg 504 1.1 joerg /// Returns the field in the globalized record for the escaped variable. 505 1.1 joerg const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const { 506 1.1 joerg assert(GlobalizedRD && 507 1.1 joerg "Record for globalized variables must be generated already."); 508 1.1 joerg auto I = MappedDeclsFields.find(VD); 509 1.1 joerg if (I == MappedDeclsFields.end()) 510 1.1 joerg return nullptr; 511 1.1 joerg return I->getSecond(); 512 1.1 joerg } 513 1.1 joerg 514 1.1 joerg /// Returns the list of the escaped local variables/parameters. 515 1.1 joerg ArrayRef<const ValueDecl *> getEscapedDecls() const { 516 1.1 joerg return EscapedDecls.getArrayRef(); 517 1.1 joerg } 518 1.1 joerg 519 1.1 joerg /// Checks if the escaped local variable is actually a parameter passed by 520 1.1 joerg /// value. 521 1.1 joerg const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const { 522 1.1 joerg return EscapedParameters; 523 1.1 joerg } 524 1.1 joerg 525 1.1 joerg /// Returns the list of the escaped variables with the variably modified 526 1.1 joerg /// types. 
527 1.1 joerg ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const { 528 1.1 joerg return EscapedVariableLengthDecls.getArrayRef(); 529 1.1 joerg } 530 1.1 joerg }; 531 1.1 joerg } // anonymous namespace 532 1.1 joerg 533 1.1 joerg /// Get the id of the warp in the block. 534 1.1 joerg /// We assume that the warp size is 32, which is always the case 535 1.1 joerg /// on the NVPTX device, to generate more efficient code. 536 1.1 joerg static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { 537 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 538 1.1 joerg unsigned LaneIDBits = 539 1.1 joerg CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2); 540 1.1 joerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 541 1.1 joerg return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id"); 542 1.1 joerg } 543 1.1 joerg 544 1.1 joerg /// Get the id of the current lane in the Warp. 545 1.1 joerg /// We assume that the warp size is 32, which is always the case 546 1.1 joerg /// on the NVPTX device, to generate more efficient code. 547 1.1 joerg static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { 548 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 549 1.1 joerg unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue( 550 1.1 joerg llvm::omp::GV_Warp_Size_Log2_Mask); 551 1.1 joerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 552 1.1 joerg return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask), 553 1.1 joerg "nvptx_lane_id"); 554 1.1 joerg } 555 1.1 joerg 556 1.1 joerg /// Get the value of the thread_limit clause in the teams directive. 557 1.1 joerg /// For the 'generic' execution mode, the runtime encodes thread_limit in 558 1.1 joerg /// the launch parameters, always starting thread_limit+warpSize threads per 559 1.1 joerg /// CTA. The threads in the last warp are reserved for master execution. 
560 1.1 joerg /// For the 'spmd' execution mode, all threads in a CTA are part of the team. 561 1.1 joerg static llvm::Value *getThreadLimit(CodeGenFunction &CGF, 562 1.1 joerg bool IsInSPMDExecutionMode = false) { 563 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 564 1.1 joerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 565 1.1 joerg llvm::Value *ThreadLimit = nullptr; 566 1.1 joerg if (IsInSPMDExecutionMode) 567 1.1 joerg ThreadLimit = RT.getGPUNumThreads(CGF); 568 1.1 joerg else { 569 1.1 joerg llvm::Value *GPUNumThreads = RT.getGPUNumThreads(CGF); 570 1.1 joerg llvm::Value *GPUWarpSize = RT.getGPUWarpSize(CGF); 571 1.1 joerg ThreadLimit = Bld.CreateNUWSub(GPUNumThreads, GPUWarpSize, "thread_limit"); 572 1.1 joerg } 573 1.1 joerg assert(ThreadLimit != nullptr && "Expected non-null ThreadLimit"); 574 1.1 joerg return ThreadLimit; 575 1.1 joerg } 576 1.1 joerg 577 1.1 joerg /// Get the thread id of the OMP master thread. 578 1.1 joerg /// The master thread id is the first thread (lane) of the last warp in the 579 1.1 joerg /// GPU block. Warp size is assumed to be some power of 2. 580 1.1 joerg /// Thread id is 0 indexed. 581 1.1 joerg /// E.g: If NumThreads is 33, master id is 32. 582 1.1 joerg /// If NumThreads is 64, master id is 32. 583 1.1 joerg /// If NumThreads is 1024, master id is 992. 584 1.1 joerg static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) { 585 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 586 1.1 joerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 587 1.1 joerg llvm::Value *NumThreads = RT.getGPUNumThreads(CGF); 588 1.1 joerg // We assume that the warp size is a power of 2. 
589 1.1 joerg llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1)); 590 1.1 joerg 591 1.1 joerg llvm::Value *NumThreadsSubOne = Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)); 592 1.1 joerg return Bld.CreateAnd(NumThreadsSubOne, Bld.CreateNot(Mask), "master_tid"); 593 1.1 joerg } 594 1.1 joerg 595 1.1 joerg CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState( 596 1.1 joerg CodeGenModule &CGM, SourceLocation Loc) 597 1.1 joerg : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()), 598 1.1 joerg Loc(Loc) { 599 1.1 joerg createWorkerFunction(CGM); 600 1.1 joerg } 601 1.1 joerg 602 1.1 joerg void CGOpenMPRuntimeGPU::WorkerFunctionState::createWorkerFunction( 603 1.1 joerg CodeGenModule &CGM) { 604 1.1 joerg // Create an worker function with no arguments. 605 1.1 joerg 606 1.1 joerg WorkerFn = llvm::Function::Create( 607 1.1 joerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 608 1.1 joerg /*placeholder=*/"_worker", &CGM.getModule()); 609 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI); 610 1.1 joerg WorkerFn->setDoesNotRecurse(); 611 1.1 joerg } 612 1.1 joerg 613 1.1 joerg CGOpenMPRuntimeGPU::ExecutionMode 614 1.1 joerg CGOpenMPRuntimeGPU::getExecutionMode() const { 615 1.1 joerg return CurrentExecutionMode; 616 1.1 joerg } 617 1.1 joerg 618 1.1 joerg static CGOpenMPRuntimeGPU::DataSharingMode 619 1.1 joerg getDataSharingMode(CodeGenModule &CGM) { 620 1.1 joerg return CGM.getLangOpts().OpenMPCUDAMode ? 
CGOpenMPRuntimeGPU::CUDA 621 1.1 joerg : CGOpenMPRuntimeGPU::Generic; 622 1.1 joerg } 623 1.1 joerg 624 1.1 joerg /// Check for inner (nested) SPMD construct, if any 625 1.1 joerg static bool hasNestedSPMDDirective(ASTContext &Ctx, 626 1.1 joerg const OMPExecutableDirective &D) { 627 1.1 joerg const auto *CS = D.getInnermostCapturedStmt(); 628 1.1 joerg const auto *Body = 629 1.1 joerg CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true); 630 1.1 joerg const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); 631 1.1 joerg 632 1.1 joerg if (const auto *NestedDir = 633 1.1 joerg dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { 634 1.1 joerg OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind(); 635 1.1 joerg switch (D.getDirectiveKind()) { 636 1.1 joerg case OMPD_target: 637 1.1 joerg if (isOpenMPParallelDirective(DKind)) 638 1.1 joerg return true; 639 1.1 joerg if (DKind == OMPD_teams) { 640 1.1 joerg Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( 641 1.1 joerg /*IgnoreCaptured=*/true); 642 1.1 joerg if (!Body) 643 1.1 joerg return false; 644 1.1 joerg ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); 645 1.1 joerg if (const auto *NND = 646 1.1 joerg dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { 647 1.1 joerg DKind = NND->getDirectiveKind(); 648 1.1 joerg if (isOpenMPParallelDirective(DKind)) 649 1.1 joerg return true; 650 1.1 joerg } 651 1.1 joerg } 652 1.1 joerg return false; 653 1.1 joerg case OMPD_target_teams: 654 1.1 joerg return isOpenMPParallelDirective(DKind); 655 1.1 joerg case OMPD_target_simd: 656 1.1 joerg case OMPD_target_parallel: 657 1.1 joerg case OMPD_target_parallel_for: 658 1.1 joerg case OMPD_target_parallel_for_simd: 659 1.1 joerg case OMPD_target_teams_distribute: 660 1.1 joerg case OMPD_target_teams_distribute_simd: 661 1.1 joerg case OMPD_target_teams_distribute_parallel_for: 662 1.1 joerg case OMPD_target_teams_distribute_parallel_for_simd: 663 1.1 joerg case 
OMPD_parallel: 664 1.1 joerg case OMPD_for: 665 1.1 joerg case OMPD_parallel_for: 666 1.1 joerg case OMPD_parallel_master: 667 1.1 joerg case OMPD_parallel_sections: 668 1.1 joerg case OMPD_for_simd: 669 1.1 joerg case OMPD_parallel_for_simd: 670 1.1 joerg case OMPD_cancel: 671 1.1 joerg case OMPD_cancellation_point: 672 1.1 joerg case OMPD_ordered: 673 1.1 joerg case OMPD_threadprivate: 674 1.1 joerg case OMPD_allocate: 675 1.1 joerg case OMPD_task: 676 1.1 joerg case OMPD_simd: 677 1.1 joerg case OMPD_sections: 678 1.1 joerg case OMPD_section: 679 1.1 joerg case OMPD_single: 680 1.1 joerg case OMPD_master: 681 1.1 joerg case OMPD_critical: 682 1.1 joerg case OMPD_taskyield: 683 1.1 joerg case OMPD_barrier: 684 1.1 joerg case OMPD_taskwait: 685 1.1 joerg case OMPD_taskgroup: 686 1.1 joerg case OMPD_atomic: 687 1.1 joerg case OMPD_flush: 688 1.1 joerg case OMPD_depobj: 689 1.1 joerg case OMPD_scan: 690 1.1 joerg case OMPD_teams: 691 1.1 joerg case OMPD_target_data: 692 1.1 joerg case OMPD_target_exit_data: 693 1.1 joerg case OMPD_target_enter_data: 694 1.1 joerg case OMPD_distribute: 695 1.1 joerg case OMPD_distribute_simd: 696 1.1 joerg case OMPD_distribute_parallel_for: 697 1.1 joerg case OMPD_distribute_parallel_for_simd: 698 1.1 joerg case OMPD_teams_distribute: 699 1.1 joerg case OMPD_teams_distribute_simd: 700 1.1 joerg case OMPD_teams_distribute_parallel_for: 701 1.1 joerg case OMPD_teams_distribute_parallel_for_simd: 702 1.1 joerg case OMPD_target_update: 703 1.1 joerg case OMPD_declare_simd: 704 1.1 joerg case OMPD_declare_variant: 705 1.1 joerg case OMPD_begin_declare_variant: 706 1.1 joerg case OMPD_end_declare_variant: 707 1.1 joerg case OMPD_declare_target: 708 1.1 joerg case OMPD_end_declare_target: 709 1.1 joerg case OMPD_declare_reduction: 710 1.1 joerg case OMPD_declare_mapper: 711 1.1 joerg case OMPD_taskloop: 712 1.1 joerg case OMPD_taskloop_simd: 713 1.1 joerg case OMPD_master_taskloop: 714 1.1 joerg case OMPD_master_taskloop_simd: 715 1.1 joerg 
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

/// Check if the target directive (or a directive nested directly inside it,
/// for plain 'target' / 'target teams') can be emitted in SPMD mode rather
/// than the generic master/worker scheme.
static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    // Not combined with 'parallel'; decide based on the nested directive.
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    // Combined 'target' + 'parallel' (or simd-only) forms: always SPMD.
    return true;
  case OMPD_target_teams_distribute:
    return false;
  // Everything below is not a 'target' entry point; this function must only
  // be called on target directives, so these fall through to the
  // unreachable below.
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

/// Check if the directive is loops based and has schedule clause at all or has
/// static scheduling.
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
         isOpenMPLoopDirective(D.getDirectiveKind()) &&
         "Expected loop-based directive.");
  // "Static" here means: no 'ordered' clause, and either no 'schedule'
  // clause at all or at least one 'schedule' clause of kind 'static'.
  return !D.hasClausesOfKind<OMPOrderedClause>() &&
         (!D.hasClausesOfKind<OMPScheduleClause>() ||
          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
                       [](const OMPScheduleClause *C) {
                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
                       }));
}

/// Check for inner (nested) lightweight runtime construct, if any. Peels the
/// directive open level by level (target -> teams -> parallel) looking for a
/// statically-scheduled worksharing loop or a simd construct as the single
/// child at each level.
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  // The single statement of the region, ignoring captured/compound wrappers;
  // may be null if the body is not a single statement.
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      // Nested combined 'parallel for'-like loop with static scheduling.
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        // Split form: 'parallel' wrapping a worksharing loop one level down.
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        // Look through 'teams' for a nested 'parallel for' (possibly split
        // one more level into 'parallel' + worksharing loop).
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          if (DKind == OMPD_parallel) {
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
            if (const auto *NND =
                    dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      // Nested combined 'parallel for'-like loop with static scheduling.
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        // Split form: 'parallel' wrapping a worksharing loop one level down.
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      if (DKind == OMPD_simd)
        return true;
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    // Only OMPD_target / OMPD_target_teams / OMPD_target_parallel are valid
    // here (asserted SPMD-compatible above); everything else is a bug.
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

/// Checks if the construct supports lightweight runtime. It must be SPMD
/// construct + inner loop-based construct with static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  // Lightweight runtime is only possible in SPMD mode.
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    // Not a combined loop form: inspect the nested directive.
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  // Non-target directives: not valid here, diagnosed below.
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

/// Emit a target kernel in generic (non-SPMD) mode: the entry function routes
/// worker threads into a worker loop and the master thread into the outlined
/// target region; the worker function itself is emitted last, once the entry
/// function's final name is known.
void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
                                           bool IsOffloadEntry,
                                           const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getBeginLoc());
  // Per-kernel state: parallel work items and their wrappers accumulated
  // while emitting this target region.
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    CGOpenMPRuntimeGPU::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         CGOpenMPRuntimeGPU::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    // Shared-memory slot holding the pointer to the kernel's statically
    // globalized data; created once per module.
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::UndefValue::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function
  emitWorkerFunction(WST);
}

// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                EntryFunctionState &EST,
                                                WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  // Entry CFG: threads below the thread limit become workers; among the
  // rest, only the master thread executes the sequential region, everyone
  // else exits.
  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *GPUThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *ThreadLimit = getThreadLimit(CGF);
  llvm::Value *IsWorker = Bld.CreateICmpULT(GPUThreadID, ThreadLimit);
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  // Workers spin in the worker loop until the master terminates the kernel.
  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  GPUThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *MasterThreadID = getMasterThreadID(CGF);
  llvm::Value *IsMaster = Bld.CreateICmpEQ(GPUThreadID, MasterThreadID);
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_init),
                      Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}

// Tear down the generic-mode sequential region: deinit the runtime, release
// worker threads at a barrier, and fall through to the common exit block.
void CGOpenMPRuntimeGPU::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_deinit),
                      Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Emit a target kernel in SPMD mode: all threads execute the region; the
// full runtime is only required when forced by -fopenmp-cuda-force-full-runtime
// or when the lightweight runtime is not supported for this directive.
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
                                        StringRef ParentName,
                                        llvm::Function *&OutlinedFn,
                                        llvm::Constant *&OutlinedFnID,
                                        bool IsOffloadEntry,
                                        const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(
      CurrentExecutionMode, RequiresFullRuntime,
      CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
          !supportsLightweightRuntime(CGM.getContext(), D));
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU &RT;
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
                         CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryHeader(CGF, EST, D);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitSPMDEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    // Shared-memory slot holding the pointer to the kernel's statically
    // globalized data; created once per module.
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::UndefValue::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

// Emit the SPMD entry sequence: __kmpc_spmd_kernel_init (and the data-sharing
// stack init when the full runtime is required), then branch into the region.
void CGOpenMPRuntimeGPU::emitSPMDEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/
                         Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init),
                      Args);

  if (RequiresFullRuntime) {
    // For data sharing, we need to initialize the stack.
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
        CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd));
  }

  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);

  IsInTargetMasterThreadRegion = true;
}

// Emit the SPMD exit sequence: deinit the runtime and fall through to the
// common exit block.
void CGOpenMPRuntimeGPU::emitSPMDEntryFooter(CodeGenFunction &CGF,
                                             EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
                         CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_deinit_v2),
                      Args);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this target
// region. The execution mode is either 'generic', or 'spmd' depending on the
// target directive. This variable is picked up by the offload library to setup
// the device appropriately before kernel launch.
If the execution mode is 1313 1.1 joerg // 'generic', the runtime reserves one warp for the master, otherwise, all 1314 1.1 joerg // warps participate in parallel work. 1315 1.1 joerg static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, 1316 1.1 joerg bool Mode) { 1317 1.1 joerg auto *GVMode = 1318 1.1 joerg new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, 1319 1.1 joerg llvm::GlobalValue::WeakAnyLinkage, 1320 1.1 joerg llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1), 1321 1.1 joerg Twine(Name, "_exec_mode")); 1322 1.1 joerg CGM.addCompilerUsedGlobal(GVMode); 1323 1.1 joerg } 1324 1.1 joerg 1325 1.1 joerg void CGOpenMPRuntimeGPU::emitWorkerFunction(WorkerFunctionState &WST) { 1326 1.1 joerg ASTContext &Ctx = CGM.getContext(); 1327 1.1 joerg 1328 1.1 joerg CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); 1329 1.1 joerg CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {}, 1330 1.1 joerg WST.Loc, WST.Loc); 1331 1.1 joerg emitWorkerLoop(CGF, WST); 1332 1.1 joerg CGF.FinishFunction(); 1333 1.1 joerg } 1334 1.1 joerg 1335 1.1 joerg void CGOpenMPRuntimeGPU::emitWorkerLoop(CodeGenFunction &CGF, 1336 1.1 joerg WorkerFunctionState &WST) { 1337 1.1 joerg // 1338 1.1 joerg // The workers enter this loop and wait for parallel work from the master. 1339 1.1 joerg // When the master encounters a parallel region it sets up the work + variable 1340 1.1 joerg // arguments, and wakes up the workers. The workers first check to see if 1341 1.1 joerg // they are required for the parallel region, i.e., within the # of requested 1342 1.1 joerg // parallel threads. The activated workers load the variable arguments and 1343 1.1 joerg // execute the parallel work. 
1344 1.1 joerg // 1345 1.1 joerg 1346 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 1347 1.1 joerg 1348 1.1 joerg llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work"); 1349 1.1 joerg llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers"); 1350 1.1 joerg llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel"); 1351 1.1 joerg llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel"); 1352 1.1 joerg llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel"); 1353 1.1 joerg llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); 1354 1.1 joerg 1355 1.1 joerg CGF.EmitBranch(AwaitBB); 1356 1.1 joerg 1357 1.1 joerg // Workers wait for work from master. 1358 1.1 joerg CGF.EmitBlock(AwaitBB); 1359 1.1 joerg // Wait for parallel work 1360 1.1 joerg syncCTAThreads(CGF); 1361 1.1 joerg 1362 1.1 joerg Address WorkFn = 1363 1.1 joerg CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn"); 1364 1.1 joerg Address ExecStatus = 1365 1.1 joerg CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status"); 1366 1.1 joerg CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); 1367 1.1 joerg CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); 1368 1.1 joerg 1369 1.1 joerg // TODO: Optimize runtime initialization and pass in correct value. 1370 1.1 joerg llvm::Value *Args[] = {WorkFn.getPointer()}; 1371 1.1 joerg llvm::Value *Ret = 1372 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1373 1.1 joerg CGM.getModule(), OMPRTL___kmpc_kernel_parallel), 1374 1.1 joerg Args); 1375 1.1 joerg Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); 1376 1.1 joerg 1377 1.1 joerg // On termination condition (workid == 0), exit loop. 
1378 1.1 joerg llvm::Value *WorkID = Bld.CreateLoad(WorkFn); 1379 1.1 joerg llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate"); 1380 1.1 joerg Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); 1381 1.1 joerg 1382 1.1 joerg // Activate requested workers. 1383 1.1 joerg CGF.EmitBlock(SelectWorkersBB); 1384 1.1 joerg llvm::Value *IsActive = 1385 1.1 joerg Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active"); 1386 1.1 joerg Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB); 1387 1.1 joerg 1388 1.1 joerg // Signal start of parallel region. 1389 1.1 joerg CGF.EmitBlock(ExecuteBB); 1390 1.1 joerg // Skip initialization. 1391 1.1 joerg setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); 1392 1.1 joerg 1393 1.1 joerg // Process work items: outlined parallel functions. 1394 1.1 joerg for (llvm::Function *W : Work) { 1395 1.1 joerg // Try to match this outlined function. 1396 1.1 joerg llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy); 1397 1.1 joerg 1398 1.1 joerg llvm::Value *WorkFnMatch = 1399 1.1 joerg Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match"); 1400 1.1 joerg 1401 1.1 joerg llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn"); 1402 1.1 joerg llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next"); 1403 1.1 joerg Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB); 1404 1.1 joerg 1405 1.1 joerg // Execute this outlined function. 1406 1.1 joerg CGF.EmitBlock(ExecuteFNBB); 1407 1.1 joerg 1408 1.1 joerg // Insert call to work function via shared wrapper. The shared 1409 1.1 joerg // wrapper takes two arguments: 1410 1.1 joerg // - the parallelism level; 1411 1.1 joerg // - the thread ID; 1412 1.1 joerg emitCall(CGF, WST.Loc, W, 1413 1.1 joerg {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); 1414 1.1 joerg 1415 1.1 joerg // Go to end of parallel region. 
1416 1.1 joerg CGF.EmitBranch(TerminateBB); 1417 1.1 joerg 1418 1.1 joerg CGF.EmitBlock(CheckNextBB); 1419 1.1 joerg } 1420 1.1 joerg // Default case: call to outlined function through pointer if the target 1421 1.1 joerg // region makes a declare target call that may contain an orphaned parallel 1422 1.1 joerg // directive. 1423 1.1 joerg auto *ParallelFnTy = 1424 1.1 joerg llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty}, 1425 1.1 joerg /*isVarArg=*/false); 1426 1.1 joerg llvm::Value *WorkFnCast = 1427 1.1 joerg Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo()); 1428 1.1 joerg // Insert call to work function via shared wrapper. The shared 1429 1.1 joerg // wrapper takes two arguments: 1430 1.1 joerg // - the parallelism level; 1431 1.1 joerg // - the thread ID; 1432 1.1 joerg emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast}, 1433 1.1 joerg {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); 1434 1.1 joerg // Go to end of parallel region. 1435 1.1 joerg CGF.EmitBranch(TerminateBB); 1436 1.1 joerg 1437 1.1 joerg // Signal end of parallel region. 1438 1.1 joerg CGF.EmitBlock(TerminateBB); 1439 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1440 1.1 joerg CGM.getModule(), OMPRTL___kmpc_kernel_end_parallel), 1441 1.1 joerg llvm::None); 1442 1.1 joerg CGF.EmitBranch(BarrierBB); 1443 1.1 joerg 1444 1.1 joerg // All active and inactive workers wait at a barrier after parallel region. 1445 1.1 joerg CGF.EmitBlock(BarrierBB); 1446 1.1 joerg // Barrier after parallel region. 1447 1.1 joerg syncCTAThreads(CGF); 1448 1.1 joerg CGF.EmitBranch(AwaitBB); 1449 1.1 joerg 1450 1.1 joerg // Exit target region. 1451 1.1 joerg CGF.EmitBlock(ExitBB); 1452 1.1 joerg // Skip initialization. 
  // Restore the thread-ID insertion point that was overridden for the worker
  // loop (see setLocThreadIdInsertPt above in this function).
  clearLocThreadIdInsertPt(CGF);
}

/// Register an offload entry point with the NVPTX backend.
///
/// Kernel entry functions are announced to the NVPTX toolchain by appending a
/// {function, "kernel", 1} triple to the module-level "nvvm.annotations"
/// named metadata. Entries that are not functions (e.g. declare-target global
/// variables) are not supported yet and are silently skipped; the ID, Size
/// and linkage parameters are unused here for the same reason.
void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
                                            llvm::Constant *Addr,
                                            uint64_t Size, int32_t,
                                            llvm::GlobalValue::LinkageTypes) {
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!isa<llvm::Function>(Addr))
    return;
  llvm::Module &M = CGM.getModule();
  llvm::LLVMContext &Ctx = CGM.getLLVMContext();

  // Get "nvvm.annotations" metadata node.
  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations.
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

/// Emit the outlined function for a target region, selecting between the
/// SPMD and non-SPMD (generic) kernel emission schemes based on what the
/// directive supports, and record the chosen execution mode for the kernel
/// (via setPropertyExecutionMode) so the device runtime can read it.
void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
  if (Mode)
    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
  else
    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Enum for accessing the reserved_2 field of the ident_t struct.
enum ModeFlagsTy : unsigned {
  /// Bit set to 1 when in SPMD mode.
  KMP_IDENT_SPMD_MODE = 0x01,
  /// Bit set to 1 when a simplified runtime is used.
  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
};

/// Special mode Undefined. Is the combination of Non-SPMD mode + SimpleRuntime.
static const ModeFlagsTy UndefinedMode =
    (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
} // anonymous namespace

/// Compute the reserved_2 flags of the default source-location (ident_t)
/// structure from the current execution mode: SPMD vs. non-SPMD combined
/// with whether the full or the simplified device runtime is required.
unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
  switch (getExecutionMode()) {
  case EM_SPMD:
    if (requiresFullRuntime())
      return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
    return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
  case EM_NonSPMD:
    assert(requiresFullRuntime() && "Expected full runtime.");
    return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
  case EM_Unknown:
    return UndefinedMode;
  }
  llvm_unreachable("Unknown flags are requested.");
}

/// GPU runtime codegen is only valid when compiling for the device; the
/// "_"/"$" arguments are the separators used when mangling outlined-function
/// and variable names for this runtime.
CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM, "_", "$") {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
                                            ProcBindKind ProcBind,
                                            SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
                                              llvm::Value *NumThreads,
                                              SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}

// Intentionally a no-op on the device: no runtime call is emitted for
// num_teams/thread_limit here.
void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                            const Expr *NumTeams,
                                            const Expr *ThreadLimit,
                                            SourceLocation Loc) {}

/// Emit the outlined function for a 'parallel' directive. While the region
/// is being emitted, IsInParallelRegion is temporarily set (restored by the
/// pre/post action), and the TTD/target-master-thread flags are cleared.
/// In non-SPMD mode, a top-level (not nested) parallel region additionally
/// gets a data-sharing wrapper function, remembered in WrapperFunctionsMap,
/// which the worker loop later uses to invoke the outlined function.
llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    bool &IsInParallelRegion;
    bool PrevIsInParallelRegion;

  public:
    NVPTXPrePostActionTy(bool &IsInParallelRegion)
        : IsInParallelRegion(IsInParallelRegion) {}
    void Enter(CodeGenFunction &CGF) override {
      PrevIsInParallelRegion = IsInParallelRegion;
      IsInParallelRegion = true;
    }
    void Exit(CodeGenFunction &CGF) override {
      IsInParallelRegion = PrevIsInParallelRegion;
    }
  } Action(IsInParallelRegion);
  CodeGen.setAction(Action);
  bool PrevIsInTTDRegion = IsInTTDRegion;
  IsInTTDRegion = false;
  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
  IsInTargetMasterThreadRegion = false;
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
  IsInTTDRegion = PrevIsInTTDRegion;
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
      !IsInParallelRegion) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}

/// Get list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
static void
getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  const OMPExecutableDirective *Dir = &D;
  // If the teams directive is not itself combined with distribute, look for
  // a single nested distribute directive inside the captured statement.
  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
            Ctx,
            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true))) {
      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
        Dir = nullptr;
    }
  }
  if (!Dir)
    return;
  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
    for (const Expr *E : C->getVarRefs())
      Vars.push_back(getPrivateItem(E));
  }
}

/// Get list of reduction variables from the teams ... directives.
static void
getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  // Collect the private copies of all reduction clauses on the directive.
  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
    for (const Expr *E : C->privates())
      Vars.push_back(getPrivateItem(E));
  }
}

/// Emit the outlined function for a 'teams' directive.
///
/// Variables that must be globalized (team reductions in non-SPMD mode,
/// distribute lastprivates in SPMD mode) are collected first. In SPMD mode
/// they are packed into a record (one slot per warp lane — WarpSize) whose
/// globalization is set up by the pre-action before the region is emitted;
/// in non-SPMD mode they are stashed in TeamAndReductions for later use.
/// The pre/post action brackets the region with the generic variable
/// prolog/epilog so globalized variables get runtime-managed storage.
llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
  // Globalize team reductions variable unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), llvm::None, LastPrivatesReductions,
          MappedDeclsFields, WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
    std::swap(TeamAndReductions.second, LastPrivatesReductions);
  }

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        // Record the globalized-variable layout for the function being
        // emitted so the prolog below can allocate and map the storage.
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().GlobalRecord = GlobalizedRD;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(),
                                     MappedVarData(Pair.getSecond(),
                                                   /*IsOnePerTeam=*/true)));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}

/// Emit code that materializes runtime-managed ("globalized") storage for the
/// variables of the current function that escape into parallel regions, and
/// remap those variables to the new storage. The matching cleanup is
/// emitGenericVarsEpilog below.
void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc,
                                               bool WithSPMDCheck) {
  // Only needed in generic data-sharing mode or in SPMD execution mode.
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;
  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
    QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
    QualType SecGlobalRecTy;

    // Recover pointer to this function's global record.
    // The runtime will handle the specifics of the allocation of the memory.
    // Use actual memory size of the record including the padding
    // for alignment purposes.
    unsigned Alignment =
        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
    unsigned GlobalRecordSize =
        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);

    llvm::PointerType *GlobalRecPtrTy =
        CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
    llvm::Value *GlobalRecCastAddr;
    llvm::Value *IsTTD = nullptr;
    if (!IsInTTDRegion &&
        (WithSPMDCheck ||
         getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
      // Execution mode is not known statically: emit a dynamic check of
      // __kmpc_is_spmd_exec_mode and branch to either the SPMD path (no
      // allocation, null record pointer) or the non-SPMD path (runtime
      // stack push).
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
      llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
      llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
      if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
        // A secondary record exists: also query the parallel level so the
        // allocation size can be chosen at run time (IsTTD == level 0).
        llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
        llvm::Value *ThreadID = getThreadID(CGF, Loc);
        llvm::Value *PL = CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
                                                  OMPRTL___kmpc_parallel_level),
            {RTLoc, ThreadID});
        IsTTD = Bld.CreateIsNull(PL);
      }
      llvm::Value *IsSPMD = Bld.CreateIsNotNull(
          CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode)));
      Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(SPMDBB);
      // SPMD path: no record is allocated; a null pointer feeds the PHI.
      Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
                               CharUnits::fromQuantity(Alignment));
      CGF.EmitBranch(ExitBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(NonSPMDBB);
      llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
      if (const RecordDecl *SecGlobalizedVarsRecord =
              I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
        SecGlobalRecTy =
            CGM.getContext().getRecordType(SecGlobalizedVarsRecord);

        // Recover pointer to this function's global record. The runtime will
        // handle the specifics of the allocation of the memory.
        // Use actual memory size of the record including the padding
        // for alignment purposes.
        unsigned Alignment =
            CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
        unsigned GlobalRecordSize =
            CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
        GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
        // Pick the secondary record's size when at parallel level 0.
        Size = Bld.CreateSelect(
            IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
      }
      // TODO: allow the usage of shared memory to be controlled by
      // the user, for now, default to global.
      llvm::Value *GlobalRecordSizeArg[] = {
          Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
          OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
          GlobalRecordSizeArg);
      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
          GlobalRecValue, GlobalRecPtrTy);
      CGF.EmitBlock(ExitBB);
      // Merge the SPMD (null) and non-SPMD (allocated) record pointers.
      auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
                                /*NumReservedValues=*/2, "_select_stack");
      Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
      Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
      GlobalRecCastAddr = Phi;
      I->getSecond().GlobalRecordAddr = Phi;
      I->getSecond().IsInSPMDModeFlag = IsSPMD;
    } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
      // Target/teams-distribute region: the record lives in statically
      // allocated team memory obtained from __kmpc_get_team_static_memory;
      // compute this record's byte offset within that frame.
      assert(GlobalizedRecords.back().Records.size() < 2 &&
             "Expected less than 2 globalized records: one for target and one "
             "for teams.");
      unsigned Offset = 0;
      for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
        QualType RDTy = CGM.getContext().getRecordType(RD);
        unsigned Alignment =
            CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
        unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
        Offset =
            llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
      }
      unsigned Alignment =
          CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
      Offset = llvm::alignTo(Offset, Alignment);
      GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
      ++GlobalizedRecords.back().RegionCounter;
      if (GlobalizedRecords.back().Records.size() == 1) {
        // First record in the frame: emit the runtime call that acquires the
        // team static memory. Size and is-shared flag are read from
        // internal globals that are filled in later (at module close).
        assert(KernelStaticGlobalized &&
               "Kernel static pointer must be initialized already.");
        auto *UseSharedMemory = new llvm::GlobalVariable(
            CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
            llvm::GlobalValue::InternalLinkage, nullptr,
            "_openmp_static_kernel$is_shared");
        UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
        QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
            /*DestWidth=*/16, /*Signed=*/0);
        llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
            Address(UseSharedMemory,
                    CGM.getContext().getTypeAlignInChars(Int16Ty)),
            /*Volatile=*/false, Int16Ty, Loc);
        auto *StaticGlobalized = new llvm::GlobalVariable(
            CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
            llvm::GlobalValue::CommonLinkage, nullptr);
        auto *RecSize = new llvm::GlobalVariable(
            CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
            llvm::GlobalValue::InternalLinkage, nullptr,
            "_openmp_static_kernel$size");
        RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
        llvm::Value *Ld = CGF.EmitLoadOfScalar(
            Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
            CGM.getContext().getSizeType(), Loc);
        llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
            KernelStaticGlobalized, CGM.VoidPtrPtrTy);
        llvm::Value *GlobalRecordSizeArg[] = {
            llvm::ConstantInt::get(
                CGM.Int16Ty,
                getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
            StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
        CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(
                CGM.getModule(), OMPRTL___kmpc_get_team_static_memory),
            GlobalRecordSizeArg);
        GlobalizedRecords.back().Buffer = StaticGlobalized;
        GlobalizedRecords.back().RecSize = RecSize;
        GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
        GlobalizedRecords.back().Loc = Loc;
      }
      assert(KernelStaticGlobalized && "Global address must be set already.");
      // Address of this record inside the team static frame.
      Address FrameAddr = CGF.EmitLoadOfPointer(
          Address(KernelStaticGlobalized, CGM.getPointerAlign()),
          CGM.getContext()
              .getPointerType(CGM.getContext().VoidPtrTy)
              .castAs<PointerType>());
      llvm::Value *GlobalRecValue =
          Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
      I->getSecond().GlobalRecordAddr = GlobalRecValue;
      I->getSecond().IsInSPMDModeFlag = nullptr;
      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
          GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
    } else {
      // TODO: allow the usage of shared memory to be controlled by
      // the user, for now, default to global.
      bool UseSharedMemory =
          IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
      llvm::Value *GlobalRecordSizeArg[] = {
          llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
          CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
          OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(),
              IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack
                            : OMPRTL___kmpc_data_sharing_coalesced_push_stack),
          GlobalRecordSizeArg);
      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
          GlobalRecValue, GlobalRecPtrTy);
      I->getSecond().GlobalRecordAddr = GlobalRecValue;
      I->getSecond().IsInSPMDModeFlag = nullptr;
    }
    LValue Base =
        CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);

    // Emit the "global alloca" which is a GEP from the global declaration
    // record using the pointer returned by the runtime.
    LValue SecBase;
    decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
    if (IsTTD) {
      SecIt = I->getSecond().SecondaryLocalVarData->begin();
      llvm::PointerType *SecGlobalRecPtrTy =
          CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
      SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
          Bld.CreatePointerBitCastOrAddrSpaceCast(
              I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
          SecGlobalRecTy);
    }
    // Remap each globalized local variable to its slot in the record.
    for (auto &Rec : I->getSecond().LocalVarData) {
      bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
      llvm::Value *ParValue;
      if (EscapedParam) {
        // Load the incoming parameter value before remapping so it can be
        // stored into the globalized slot below.
        const auto *VD = cast<VarDecl>(Rec.first);
        LValue ParLVal =
            CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
        ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
      }
      LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
      // Emit VarAddr basing on lane-id if required.
      QualType VarTy;
      if (Rec.second.IsOnePerTeam) {
        VarTy = Rec.second.FD->getType();
      } else {
        // Per-lane variable: the record field is an array indexed by lane id.
        Address Addr = VarAddr.getAddress(CGF);
        llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
            Addr.getElementType(), Addr.getPointer(),
            {Bld.getInt32(0), getNVPTXLaneID(CGF)});
        VarTy =
            Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
        VarAddr = CGF.MakeAddrLValue(
            Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
            AlignmentSource::Decl);
      }
      Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
      if (!IsInTTDRegion &&
          (WithSPMDCheck ||
           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
        assert(I->getSecond().IsInSPMDModeFlag &&
               "Expected unknown execution mode or required SPMD check.");
        if (IsTTD) {
          assert(SecIt->second.IsOnePerTeam &&
                 "Secondary glob data must be one per team.");
          LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
          VarAddr.setAddress(
              Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
                                       VarAddr.getPointer(CGF)),
                      VarAddr.getAlignment()));
          Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
        }
        // In dynamic-SPMD mode select a plain local temporary instead of the
        // globalized slot, keyed on the runtime IsInSPMDModeFlag.
        Address GlobalPtr = Rec.second.PrivateAddr;
        Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
        Rec.second.PrivateAddr = Address(
            Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
                             LocalAddr.getPointer(), GlobalPtr.getPointer()),
            LocalAddr.getAlignment());
      }
      if (EscapedParam) {
        const auto *VD = cast<VarDecl>(Rec.first);
        CGF.EmitStoreOfScalar(ParValue, VarAddr);
        I->getSecond().MappedParams->setVarAddr(CGF, VD,
                                                VarAddr.getAddress(CGF));
      }
      if (IsTTD)
        ++SecIt;
    }
  }
  // Variably-sized escaped declarations get their own runtime allocation
  // each, rounded up to the declaration's alignment.
  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
    // Recover pointer to this function's global record. The runtime will
    // handle the specifics of the allocation of the memory.
    // Use actual memory size of the record including the padding
    // for alignment purposes.
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *Size = CGF.getTypeSize(VD->getType());
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    Size = Bld.CreateNUWAdd(
        Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
    llvm::Value *AlignVal =
        llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
    Size = Bld.CreateUDiv(Size, AlignVal);
    Size = Bld.CreateNUWMul(Size, AlignVal);
    // TODO: allow the usage of shared memory to be controlled by
    // the user, for now, default to global.
    llvm::Value *GlobalRecordSizeArg[] = {
        Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
    llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
        OMPBuilder.getOrCreateRuntimeFunction(
            CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
        GlobalRecordSizeArg);
    llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
    LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
                                            Base.getAddress(CGF));
    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
  }
  I->getSecond().MappedParams->apply(CGF);
}

/// Undo the variable globalization performed by emitGenericVarsProlog:
/// restore the original variable mappings and release the runtime-allocated
/// storage (in reverse order of allocation).
void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
                                               bool WithSPMDCheck) {
  // Mirror of the early-exit condition in the prolog.
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    I->getSecond().MappedParams->restore(CGF);
    if (!CGF.HaveInsertPoint())
      return;
    // Pop the per-VLA allocations in reverse order of their pushes.
    for (llvm::Value *Addr :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(
          OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
          Addr);
    }
    if (I->getSecond().GlobalRecordAddr) {
      if (!IsInTTDRegion &&
          (WithSPMDCheck ||
           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
        // Dynamic-SPMD case: only pop the record when the kernel is actually
        // running in non-SPMD mode (the prolog allocated nothing for SPMD).
        CGBuilderTy &Bld = CGF.Builder;
        llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
        llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
        Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
        // There is no need to emit line number for unconditional branch.
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(NonSPMDBB);
        CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(
                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
        CGF.EmitBlock(ExitBB);
      } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
        // Team static memory case: release it only when the outermost region
        // that used it exits (region counter drops to zero).
        assert(GlobalizedRecords.back().RegionCounter > 0 &&
               "region counter must be > 0.");
        --GlobalizedRecords.back().RegionCounter;
        // Emit the restore function only in the target region.
        if (GlobalizedRecords.back().RegionCounter == 0) {
          QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
              /*DestWidth=*/16, /*Signed=*/0);
          llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
              Address(GlobalizedRecords.back().UseSharedMemory,
                      CGM.getContext().getTypeAlignInChars(Int16Ty)),
              /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
          llvm::Value *Args[] = {
              llvm::ConstantInt::get(
                  CGM.Int16Ty,
                  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
              IsInSharedMemory};
          CGF.EmitRuntimeCall(
              OMPBuilder.getOrCreateRuntimeFunction(
                  CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory),
              Args);
        }
      } else {
        // Plain case: the prolog pushed one record; pop it.
        CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(
                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
            I->getSecond().GlobalRecordAddr);
      }
    }
  }
}

/// Emit a call to the outlined function of a 'teams' region. The outlined
/// function receives the thread-ID address, a zero-initialized bound-tid
/// slot, and the captured variables.
void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
                                       const OMPExecutableDirective &D,
                                       SourceLocation Loc,
                                       llvm::Function *OutlinedFn,
                                       ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

/// Emit a call to a 'parallel' region. The region generator below publishes
/// the data-sharing wrapper (when one was created for OutlinedFn) so the
/// non-SPMD worker loop can match and execute it.
void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
                        IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
CGBuilderTy &Bld = CGF.Builder; 2095 1.1 joerg llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn]; 2096 1.1 joerg llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy); 2097 1.1 joerg if (WFn) { 2098 1.1 joerg ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); 2099 1.1 joerg // Remember for post-processing in worker loop. 2100 1.1 joerg Work.emplace_back(WFn); 2101 1.1 joerg } 2102 1.1 joerg llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy); 2103 1.1 joerg 2104 1.1 joerg // Create a private scope that will globalize the arguments 2105 1.1 joerg // passed from the outside of the target region. 2106 1.1 joerg // TODO: Is that needed? 2107 1.1 joerg CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF); 2108 1.1 joerg 2109 1.1 joerg Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca( 2110 1.1 joerg llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()), 2111 1.1 joerg "captured_vars_addrs"); 2112 1.1 joerg // There's something to share. 2113 1.1 joerg if (!CapturedVars.empty()) { 2114 1.1 joerg // Prepare for parallel region. Indicate the outlined function. 
2115 1.1 joerg ASTContext &Ctx = CGF.getContext(); 2116 1.1 joerg unsigned Idx = 0; 2117 1.1 joerg for (llvm::Value *V : CapturedVars) { 2118 1.1 joerg Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx); 2119 1.1 joerg llvm::Value *PtrV; 2120 1.1 joerg if (V->getType()->isIntegerTy()) 2121 1.1 joerg PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy); 2122 1.1 joerg else 2123 1.1 joerg PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy); 2124 1.1 joerg CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false, 2125 1.1 joerg Ctx.getPointerType(Ctx.VoidPtrTy)); 2126 1.1 joerg ++Idx; 2127 1.1 joerg } 2128 1.1 joerg } 2129 1.1 joerg 2130 1.1 joerg llvm::Value *IfCondVal = nullptr; 2131 1.1 joerg if (IfCond) 2132 1.1 joerg IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty, 2133 1.1 joerg /* isSigned */ false); 2134 1.1 joerg else 2135 1.1 joerg IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1); 2136 1.1 joerg 2137 1.1 joerg assert(IfCondVal && "Expected a value"); 2138 1.1 joerg llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); 2139 1.1 joerg llvm::Value *Args[] = { 2140 1.1 joerg RTLoc, 2141 1.1 joerg getThreadID(CGF, Loc), 2142 1.1 joerg IfCondVal, 2143 1.1 joerg llvm::ConstantInt::get(CGF.Int32Ty, -1), 2144 1.1 joerg llvm::ConstantInt::get(CGF.Int32Ty, -1), 2145 1.1 joerg FnPtr, 2146 1.1 joerg ID, 2147 1.1 joerg Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(), 2148 1.1 joerg CGF.VoidPtrPtrTy), 2149 1.1 joerg llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())}; 2150 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 2151 1.1 joerg CGM.getModule(), OMPRTL___kmpc_parallel_51), 2152 1.1 joerg Args); 2153 1.1 joerg }; 2154 1.1 joerg 2155 1.1 joerg RegionCodeGenTy RCG(ParallelGen); 2156 1.1 joerg RCG(CGF); 2157 1.1 joerg } 2158 1.1 joerg 2159 1.1 joerg void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) { 2160 1.1 joerg // Always emit simple barriers! 
2161 1.1 joerg if (!CGF.HaveInsertPoint()) 2162 1.1 joerg return; 2163 1.1 joerg // Build call __kmpc_barrier_simple_spmd(nullptr, 0); 2164 1.1 joerg // This function does not use parameters, so we can emit just default values. 2165 1.1 joerg llvm::Value *Args[] = { 2166 1.1 joerg llvm::ConstantPointerNull::get( 2167 1.1 joerg cast<llvm::PointerType>(getIdentTyPointerTy())), 2168 1.1 joerg llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)}; 2169 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 2170 1.1 joerg CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd), 2171 1.1 joerg Args); 2172 1.1 joerg } 2173 1.1 joerg 2174 1.1 joerg void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF, 2175 1.1 joerg SourceLocation Loc, 2176 1.1 joerg OpenMPDirectiveKind Kind, bool, 2177 1.1 joerg bool) { 2178 1.1 joerg // Always emit simple barriers! 2179 1.1 joerg if (!CGF.HaveInsertPoint()) 2180 1.1 joerg return; 2181 1.1 joerg // Build call __kmpc_cancel_barrier(loc, thread_id); 2182 1.1 joerg unsigned Flags = getDefaultFlagsForBarriers(Kind); 2183 1.1 joerg llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags), 2184 1.1 joerg getThreadID(CGF, Loc)}; 2185 1.1 joerg 2186 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 2187 1.1 joerg CGM.getModule(), OMPRTL___kmpc_barrier), 2188 1.1 joerg Args); 2189 1.1 joerg } 2190 1.1 joerg 2191 1.1 joerg void CGOpenMPRuntimeGPU::emitCriticalRegion( 2192 1.1 joerg CodeGenFunction &CGF, StringRef CriticalName, 2193 1.1 joerg const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, 2194 1.1 joerg const Expr *Hint) { 2195 1.1 joerg llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop"); 2196 1.1 joerg llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test"); 2197 1.1 joerg llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync"); 2198 1.1 joerg llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body"); 2199 1.1 joerg llvm::BasicBlock 
*ExitBB = CGF.createBasicBlock("omp.critical.exit"); 2200 1.1 joerg 2201 1.1 joerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 2202 1.1 joerg 2203 1.1 joerg // Get the mask of active threads in the warp. 2204 1.1 joerg llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 2205 1.1 joerg CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask)); 2206 1.1 joerg // Fetch team-local id of the thread. 2207 1.1 joerg llvm::Value *ThreadID = RT.getGPUThreadID(CGF); 2208 1.1 joerg 2209 1.1 joerg // Get the width of the team. 2210 1.1 joerg llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF); 2211 1.1 joerg 2212 1.1 joerg // Initialize the counter variable for the loop. 2213 1.1 joerg QualType Int32Ty = 2214 1.1 joerg CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0); 2215 1.1 joerg Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter"); 2216 1.1 joerg LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty); 2217 1.1 joerg CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal, 2218 1.1 joerg /*isInit=*/true); 2219 1.1 joerg 2220 1.1 joerg // Block checks if loop counter exceeds upper bound. 2221 1.1 joerg CGF.EmitBlock(LoopBB); 2222 1.1 joerg llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); 2223 1.1 joerg llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth); 2224 1.1 joerg CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB); 2225 1.1 joerg 2226 1.1 joerg // Block tests which single thread should execute region, and which threads 2227 1.1 joerg // should go straight to synchronisation point. 
2228 1.1 joerg CGF.EmitBlock(TestBB); 2229 1.1 joerg CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); 2230 1.1 joerg llvm::Value *CmpThreadToCounter = 2231 1.1 joerg CGF.Builder.CreateICmpEQ(ThreadID, CounterVal); 2232 1.1 joerg CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB); 2233 1.1 joerg 2234 1.1 joerg // Block emits the body of the critical region. 2235 1.1 joerg CGF.EmitBlock(BodyBB); 2236 1.1 joerg 2237 1.1 joerg // Output the critical statement. 2238 1.1 joerg CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc, 2239 1.1 joerg Hint); 2240 1.1 joerg 2241 1.1 joerg // After the body surrounded by the critical region, the single executing 2242 1.1 joerg // thread will jump to the synchronisation point. 2243 1.1 joerg // Block waits for all threads in current team to finish then increments the 2244 1.1 joerg // counter variable and returns to the loop. 2245 1.1 joerg CGF.EmitBlock(SyncBB); 2246 1.1 joerg // Reconverge active threads in the warp. 2247 1.1 joerg (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 2248 1.1 joerg CGM.getModule(), OMPRTL___kmpc_syncwarp), 2249 1.1 joerg Mask); 2250 1.1 joerg 2251 1.1 joerg llvm::Value *IncCounterVal = 2252 1.1 joerg CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1)); 2253 1.1 joerg CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal); 2254 1.1 joerg CGF.EmitBranch(LoopBB); 2255 1.1 joerg 2256 1.1 joerg // Block that is reached when all threads in the team complete the region. 2257 1.1 joerg CGF.EmitBlock(ExitBB, /*IsFinished=*/true); 2258 1.1 joerg } 2259 1.1 joerg 2260 1.1 joerg /// Cast value to the specified type. 
2261 1.1 joerg static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val, 2262 1.1 joerg QualType ValTy, QualType CastTy, 2263 1.1 joerg SourceLocation Loc) { 2264 1.1 joerg assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() && 2265 1.1 joerg "Cast type must sized."); 2266 1.1 joerg assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() && 2267 1.1 joerg "Val type must sized."); 2268 1.1 joerg llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy); 2269 1.1 joerg if (ValTy == CastTy) 2270 1.1 joerg return Val; 2271 1.1 joerg if (CGF.getContext().getTypeSizeInChars(ValTy) == 2272 1.1 joerg CGF.getContext().getTypeSizeInChars(CastTy)) 2273 1.1 joerg return CGF.Builder.CreateBitCast(Val, LLVMCastTy); 2274 1.1 joerg if (CastTy->isIntegerType() && ValTy->isIntegerType()) 2275 1.1 joerg return CGF.Builder.CreateIntCast(Val, LLVMCastTy, 2276 1.1 joerg CastTy->hasSignedIntegerRepresentation()); 2277 1.1 joerg Address CastItem = CGF.CreateMemTemp(CastTy); 2278 1.1 joerg Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( 2279 1.1 joerg CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace())); 2280 1.1 joerg CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy, 2281 1.1 joerg LValueBaseInfo(AlignmentSource::Type), 2282 1.1 joerg TBAAAccessInfo()); 2283 1.1 joerg return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc, 2284 1.1 joerg LValueBaseInfo(AlignmentSource::Type), 2285 1.1 joerg TBAAAccessInfo()); 2286 1.1 joerg } 2287 1.1 joerg 2288 1.1 joerg /// This function creates calls to one of two shuffle functions to copy 2289 1.1 joerg /// variables between lanes in a warp. 
2290 1.1 joerg static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF, 2291 1.1 joerg llvm::Value *Elem, 2292 1.1 joerg QualType ElemType, 2293 1.1 joerg llvm::Value *Offset, 2294 1.1 joerg SourceLocation Loc) { 2295 1.1 joerg CodeGenModule &CGM = CGF.CGM; 2296 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 2297 1.1 joerg CGOpenMPRuntimeGPU &RT = 2298 1.1 joerg *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime())); 2299 1.1 joerg llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder(); 2300 1.1 joerg 2301 1.1 joerg CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType); 2302 1.1 joerg assert(Size.getQuantity() <= 8 && 2303 1.1 joerg "Unsupported bitwidth in shuffle instruction."); 2304 1.1 joerg 2305 1.1 joerg RuntimeFunction ShuffleFn = Size.getQuantity() <= 4 2306 1.1 joerg ? OMPRTL___kmpc_shuffle_int32 2307 1.1 joerg : OMPRTL___kmpc_shuffle_int64; 2308 1.1 joerg 2309 1.1 joerg // Cast all types to 32- or 64-bit values before calling shuffle routines. 2310 1.1 joerg QualType CastTy = CGF.getContext().getIntTypeForBitwidth( 2311 1.1 joerg Size.getQuantity() <= 4 ? 
32 : 64, /*Signed=*/1); 2312 1.1 joerg llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc); 2313 1.1 joerg llvm::Value *WarpSize = 2314 1.1 joerg Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true); 2315 1.1 joerg 2316 1.1 joerg llvm::Value *ShuffledVal = CGF.EmitRuntimeCall( 2317 1.1 joerg OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn), 2318 1.1 joerg {ElemCast, Offset, WarpSize}); 2319 1.1 joerg 2320 1.1 joerg return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc); 2321 1.1 joerg } 2322 1.1 joerg 2323 1.1 joerg static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, 2324 1.1 joerg Address DestAddr, QualType ElemType, 2325 1.1 joerg llvm::Value *Offset, SourceLocation Loc) { 2326 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 2327 1.1 joerg 2328 1.1 joerg CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType); 2329 1.1 joerg // Create the loop over the big sized data. 2330 1.1 joerg // ptr = (void*)Elem; 2331 1.1 joerg // ptrEnd = (void*) Elem + 1; 2332 1.1 joerg // Step = 8; 2333 1.1 joerg // while (ptr + Step < ptrEnd) 2334 1.1 joerg // shuffle((int64_t)*ptr); 2335 1.1 joerg // Step = 4; 2336 1.1 joerg // while (ptr + Step < ptrEnd) 2337 1.1 joerg // shuffle((int32_t)*ptr); 2338 1.1 joerg // ... 
2339 1.1 joerg Address ElemPtr = DestAddr; 2340 1.1 joerg Address Ptr = SrcAddr; 2341 1.1 joerg Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast( 2342 1.1 joerg Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy); 2343 1.1 joerg for (int IntSize = 8; IntSize >= 1; IntSize /= 2) { 2344 1.1 joerg if (Size < CharUnits::fromQuantity(IntSize)) 2345 1.1 joerg continue; 2346 1.1 joerg QualType IntType = CGF.getContext().getIntTypeForBitwidth( 2347 1.1 joerg CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)), 2348 1.1 joerg /*Signed=*/1); 2349 1.1 joerg llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType); 2350 1.1 joerg Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo()); 2351 1.1 joerg ElemPtr = 2352 1.1 joerg Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo()); 2353 1.1 joerg if (Size.getQuantity() / IntSize > 1) { 2354 1.1 joerg llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond"); 2355 1.1 joerg llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then"); 2356 1.1 joerg llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit"); 2357 1.1 joerg llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock(); 2358 1.1 joerg CGF.EmitBlock(PreCondBB); 2359 1.1 joerg llvm::PHINode *PhiSrc = 2360 1.1 joerg Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2); 2361 1.1 joerg PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB); 2362 1.1 joerg llvm::PHINode *PhiDest = 2363 1.1 joerg Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2); 2364 1.1 joerg PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB); 2365 1.1 joerg Ptr = Address(PhiSrc, Ptr.getAlignment()); 2366 1.1 joerg ElemPtr = Address(PhiDest, ElemPtr.getAlignment()); 2367 1.1 joerg llvm::Value *PtrDiff = Bld.CreatePtrDiff( 2368 1.1 joerg PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast( 2369 1.1 joerg Ptr.getPointer(), CGF.VoidPtrTy)); 2370 1.1 joerg Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)), 2371 1.1 joerg 
ThenBB, ExitBB); 2372 1.1 joerg CGF.EmitBlock(ThenBB); 2373 1.1 joerg llvm::Value *Res = createRuntimeShuffleFunction( 2374 1.1 joerg CGF, 2375 1.1 joerg CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc, 2376 1.1 joerg LValueBaseInfo(AlignmentSource::Type), 2377 1.1 joerg TBAAAccessInfo()), 2378 1.1 joerg IntType, Offset, Loc); 2379 1.1 joerg CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType, 2380 1.1 joerg LValueBaseInfo(AlignmentSource::Type), 2381 1.1 joerg TBAAAccessInfo()); 2382 1.1 joerg Address LocalPtr = Bld.CreateConstGEP(Ptr, 1); 2383 1.1 joerg Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1); 2384 1.1 joerg PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB); 2385 1.1 joerg PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB); 2386 1.1 joerg CGF.EmitBranch(PreCondBB); 2387 1.1 joerg CGF.EmitBlock(ExitBB); 2388 1.1 joerg } else { 2389 1.1 joerg llvm::Value *Res = createRuntimeShuffleFunction( 2390 1.1 joerg CGF, 2391 1.1 joerg CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc, 2392 1.1 joerg LValueBaseInfo(AlignmentSource::Type), 2393 1.1 joerg TBAAAccessInfo()), 2394 1.1 joerg IntType, Offset, Loc); 2395 1.1 joerg CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType, 2396 1.1 joerg LValueBaseInfo(AlignmentSource::Type), 2397 1.1 joerg TBAAAccessInfo()); 2398 1.1 joerg Ptr = Bld.CreateConstGEP(Ptr, 1); 2399 1.1 joerg ElemPtr = Bld.CreateConstGEP(ElemPtr, 1); 2400 1.1 joerg } 2401 1.1 joerg Size = Size % IntSize; 2402 1.1 joerg } 2403 1.1 joerg } 2404 1.1 joerg 2405 1.1 joerg namespace { 2406 1.1 joerg enum CopyAction : unsigned { 2407 1.1 joerg // RemoteLaneToThread: Copy over a Reduce list from a remote lane in 2408 1.1 joerg // the warp using shuffle instructions. 2409 1.1 joerg RemoteLaneToThread, 2410 1.1 joerg // ThreadCopy: Make a copy of a Reduce list on the thread's stack. 2411 1.1 joerg ThreadCopy, 2412 1.1 joerg // ThreadToScratchpad: Copy a team-reduced array to the scratchpad. 
2413 1.1 joerg ThreadToScratchpad, 2414 1.1 joerg // ScratchpadToThread: Copy from a scratchpad array in global memory 2415 1.1 joerg // containing team-reduced data to a thread's stack. 2416 1.1 joerg ScratchpadToThread, 2417 1.1 joerg }; 2418 1.1 joerg } // namespace 2419 1.1 joerg 2420 1.1 joerg struct CopyOptionsTy { 2421 1.1 joerg llvm::Value *RemoteLaneOffset; 2422 1.1 joerg llvm::Value *ScratchpadIndex; 2423 1.1 joerg llvm::Value *ScratchpadWidth; 2424 1.1 joerg }; 2425 1.1 joerg 2426 1.1 joerg /// Emit instructions to copy a Reduce list, which contains partially 2427 1.1 joerg /// aggregated values, in the specified direction. 2428 1.1 joerg static void emitReductionListCopy( 2429 1.1 joerg CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, 2430 1.1 joerg ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase, 2431 1.1 joerg CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) { 2432 1.1 joerg 2433 1.1 joerg CodeGenModule &CGM = CGF.CGM; 2434 1.1 joerg ASTContext &C = CGM.getContext(); 2435 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 2436 1.1 joerg 2437 1.1 joerg llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; 2438 1.1 joerg llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex; 2439 1.1 joerg llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth; 2440 1.1 joerg 2441 1.1 joerg // Iterates, element-by-element, through the source Reduce list and 2442 1.1 joerg // make a copy. 2443 1.1 joerg unsigned Idx = 0; 2444 1.1 joerg unsigned Size = Privates.size(); 2445 1.1 joerg for (const Expr *Private : Privates) { 2446 1.1 joerg Address SrcElementAddr = Address::invalid(); 2447 1.1 joerg Address DestElementAddr = Address::invalid(); 2448 1.1 joerg Address DestElementPtrAddr = Address::invalid(); 2449 1.1 joerg // Should we shuffle in an element from a remote lane? 
2450 1.1 joerg bool ShuffleInElement = false; 2451 1.1 joerg // Set to true to update the pointer in the dest Reduce list to a 2452 1.1 joerg // newly created element. 2453 1.1 joerg bool UpdateDestListPtr = false; 2454 1.1 joerg // Increment the src or dest pointer to the scratchpad, for each 2455 1.1 joerg // new element. 2456 1.1 joerg bool IncrScratchpadSrc = false; 2457 1.1 joerg bool IncrScratchpadDest = false; 2458 1.1 joerg 2459 1.1 joerg switch (Action) { 2460 1.1 joerg case RemoteLaneToThread: { 2461 1.1 joerg // Step 1.1: Get the address for the src element in the Reduce list. 2462 1.1 joerg Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); 2463 1.1 joerg SrcElementAddr = CGF.EmitLoadOfPointer( 2464 1.1 joerg SrcElementPtrAddr, 2465 1.1 joerg C.getPointerType(Private->getType())->castAs<PointerType>()); 2466 1.1 joerg 2467 1.1 joerg // Step 1.2: Create a temporary to store the element in the destination 2468 1.1 joerg // Reduce list. 2469 1.1 joerg DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); 2470 1.1 joerg DestElementAddr = 2471 1.1 joerg CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); 2472 1.1 joerg ShuffleInElement = true; 2473 1.1 joerg UpdateDestListPtr = true; 2474 1.1 joerg break; 2475 1.1 joerg } 2476 1.1 joerg case ThreadCopy: { 2477 1.1 joerg // Step 1.1: Get the address for the src element in the Reduce list. 2478 1.1 joerg Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); 2479 1.1 joerg SrcElementAddr = CGF.EmitLoadOfPointer( 2480 1.1 joerg SrcElementPtrAddr, 2481 1.1 joerg C.getPointerType(Private->getType())->castAs<PointerType>()); 2482 1.1 joerg 2483 1.1 joerg // Step 1.2: Get the address for dest element. The destination 2484 1.1 joerg // element has already been created on the thread's stack. 
2485 1.1 joerg DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); 2486 1.1 joerg DestElementAddr = CGF.EmitLoadOfPointer( 2487 1.1 joerg DestElementPtrAddr, 2488 1.1 joerg C.getPointerType(Private->getType())->castAs<PointerType>()); 2489 1.1 joerg break; 2490 1.1 joerg } 2491 1.1 joerg case ThreadToScratchpad: { 2492 1.1 joerg // Step 1.1: Get the address for the src element in the Reduce list. 2493 1.1 joerg Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); 2494 1.1 joerg SrcElementAddr = CGF.EmitLoadOfPointer( 2495 1.1 joerg SrcElementPtrAddr, 2496 1.1 joerg C.getPointerType(Private->getType())->castAs<PointerType>()); 2497 1.1 joerg 2498 1.1 joerg // Step 1.2: Get the address for dest element: 2499 1.1 joerg // address = base + index * ElementSizeInChars. 2500 1.1 joerg llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); 2501 1.1 joerg llvm::Value *CurrentOffset = 2502 1.1 joerg Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex); 2503 1.1 joerg llvm::Value *ScratchPadElemAbsolutePtrVal = 2504 1.1 joerg Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset); 2505 1.1 joerg ScratchPadElemAbsolutePtrVal = 2506 1.1 joerg Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); 2507 1.1 joerg DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, 2508 1.1 joerg C.getTypeAlignInChars(Private->getType())); 2509 1.1 joerg IncrScratchpadDest = true; 2510 1.1 joerg break; 2511 1.1 joerg } 2512 1.1 joerg case ScratchpadToThread: { 2513 1.1 joerg // Step 1.1: Get the address for the src element in the scratchpad. 2514 1.1 joerg // address = base + index * ElementSizeInChars. 
2515 1.1 joerg llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); 2516 1.1 joerg llvm::Value *CurrentOffset = 2517 1.1 joerg Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex); 2518 1.1 joerg llvm::Value *ScratchPadElemAbsolutePtrVal = 2519 1.1 joerg Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset); 2520 1.1 joerg ScratchPadElemAbsolutePtrVal = 2521 1.1 joerg Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); 2522 1.1 joerg SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, 2523 1.1 joerg C.getTypeAlignInChars(Private->getType())); 2524 1.1 joerg IncrScratchpadSrc = true; 2525 1.1 joerg 2526 1.1 joerg // Step 1.2: Create a temporary to store the element in the destination 2527 1.1 joerg // Reduce list. 2528 1.1 joerg DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); 2529 1.1 joerg DestElementAddr = 2530 1.1 joerg CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); 2531 1.1 joerg UpdateDestListPtr = true; 2532 1.1 joerg break; 2533 1.1 joerg } 2534 1.1 joerg } 2535 1.1 joerg 2536 1.1 joerg // Regardless of src and dest of copy, we emit the load of src 2537 1.1 joerg // element as this is required in all directions 2538 1.1 joerg SrcElementAddr = Bld.CreateElementBitCast( 2539 1.1 joerg SrcElementAddr, CGF.ConvertTypeForMem(Private->getType())); 2540 1.1 joerg DestElementAddr = Bld.CreateElementBitCast(DestElementAddr, 2541 1.1 joerg SrcElementAddr.getElementType()); 2542 1.1 joerg 2543 1.1 joerg // Now that all active lanes have read the element in the 2544 1.1 joerg // Reduce list, shuffle over the value from the remote lane. 
2545 1.1 joerg if (ShuffleInElement) { 2546 1.1 joerg shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(), 2547 1.1 joerg RemoteLaneOffset, Private->getExprLoc()); 2548 1.1 joerg } else { 2549 1.1 joerg switch (CGF.getEvaluationKind(Private->getType())) { 2550 1.1 joerg case TEK_Scalar: { 2551 1.1 joerg llvm::Value *Elem = CGF.EmitLoadOfScalar( 2552 1.1 joerg SrcElementAddr, /*Volatile=*/false, Private->getType(), 2553 1.1 joerg Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type), 2554 1.1 joerg TBAAAccessInfo()); 2555 1.1 joerg // Store the source element value to the dest element address. 2556 1.1 joerg CGF.EmitStoreOfScalar( 2557 1.1 joerg Elem, DestElementAddr, /*Volatile=*/false, Private->getType(), 2558 1.1 joerg LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); 2559 1.1 joerg break; 2560 1.1 joerg } 2561 1.1 joerg case TEK_Complex: { 2562 1.1 joerg CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex( 2563 1.1 joerg CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), 2564 1.1 joerg Private->getExprLoc()); 2565 1.1 joerg CGF.EmitStoreOfComplex( 2566 1.1 joerg Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()), 2567 1.1 joerg /*isInit=*/false); 2568 1.1 joerg break; 2569 1.1 joerg } 2570 1.1 joerg case TEK_Aggregate: 2571 1.1 joerg CGF.EmitAggregateCopy( 2572 1.1 joerg CGF.MakeAddrLValue(DestElementAddr, Private->getType()), 2573 1.1 joerg CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), 2574 1.1 joerg Private->getType(), AggValueSlot::DoesNotOverlap); 2575 1.1 joerg break; 2576 1.1 joerg } 2577 1.1 joerg } 2578 1.1 joerg 2579 1.1 joerg // Step 3.1: Modify reference in dest Reduce list as needed. 2580 1.1 joerg // Modifying the reference in Reduce list to point to the newly 2581 1.1 joerg // created element. The element is live in the current function 2582 1.1 joerg // scope and that of functions it invokes (i.e., reduce_function). 
2583 1.1 joerg // RemoteReduceData[i] = (void*)&RemoteElem 2584 1.1 joerg if (UpdateDestListPtr) { 2585 1.1 joerg CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast( 2586 1.1 joerg DestElementAddr.getPointer(), CGF.VoidPtrTy), 2587 1.1 joerg DestElementPtrAddr, /*Volatile=*/false, 2588 1.1 joerg C.VoidPtrTy); 2589 1.1 joerg } 2590 1.1 joerg 2591 1.1 joerg // Step 4.1: Increment SrcBase/DestBase so that it points to the starting 2592 1.1 joerg // address of the next element in scratchpad memory, unless we're currently 2593 1.1 joerg // processing the last one. Memory alignment is also taken care of here. 2594 1.1 joerg if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) { 2595 1.1 joerg llvm::Value *ScratchpadBasePtr = 2596 1.1 joerg IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer(); 2597 1.1 joerg llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); 2598 1.1 joerg ScratchpadBasePtr = Bld.CreateNUWAdd( 2599 1.1 joerg ScratchpadBasePtr, 2600 1.1 joerg Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars)); 2601 1.1 joerg 2602 1.1 joerg // Take care of global memory alignment for performance 2603 1.1 joerg ScratchpadBasePtr = Bld.CreateNUWSub( 2604 1.1 joerg ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1)); 2605 1.1 joerg ScratchpadBasePtr = Bld.CreateUDiv( 2606 1.1 joerg ScratchpadBasePtr, 2607 1.1 joerg llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment)); 2608 1.1 joerg ScratchpadBasePtr = Bld.CreateNUWAdd( 2609 1.1 joerg ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1)); 2610 1.1 joerg ScratchpadBasePtr = Bld.CreateNUWMul( 2611 1.1 joerg ScratchpadBasePtr, 2612 1.1 joerg llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment)); 2613 1.1 joerg 2614 1.1 joerg if (IncrScratchpadDest) 2615 1.1 joerg DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign()); 2616 1.1 joerg else /* IncrScratchpadSrc = true */ 2617 1.1 joerg SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign()); 
2618 1.1 joerg } 2619 1.1 joerg 2620 1.1 joerg ++Idx; 2621 1.1 joerg } 2622 1.1 joerg } 2623 1.1 joerg 2624 1.1 joerg /// This function emits a helper that gathers Reduce lists from the first 2625 1.1 joerg /// lane of every active warp to lanes in the first warp. 2626 1.1 joerg /// 2627 1.1 joerg /// void inter_warp_copy_func(void* reduce_data, num_warps) 2628 1.1 joerg /// shared smem[warp_size]; 2629 1.1 joerg /// For all data entries D in reduce_data: 2630 1.1 joerg /// sync 2631 1.1 joerg /// If (I am the first lane in each warp) 2632 1.1 joerg /// Copy my local D to smem[warp_id] 2633 1.1 joerg /// sync 2634 1.1 joerg /// if (I am the first warp) 2635 1.1 joerg /// Copy smem[thread_id] to my local D 2636 1.1 joerg static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, 2637 1.1 joerg ArrayRef<const Expr *> Privates, 2638 1.1 joerg QualType ReductionArrayTy, 2639 1.1 joerg SourceLocation Loc) { 2640 1.1 joerg ASTContext &C = CGM.getContext(); 2641 1.1 joerg llvm::Module &M = CGM.getModule(); 2642 1.1 joerg 2643 1.1 joerg // ReduceList: thread local Reduce list. 2644 1.1 joerg // At the stage of the computation when this function is called, partially 2645 1.1 joerg // aggregated values reside in the first lane of every active warp. 2646 1.1 joerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 2647 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other); 2648 1.1 joerg // NumWarps: number of warps active in the parallel region. This could 2649 1.1 joerg // be smaller than 32 (max warps in a CTA) for partial block reduction. 
2650 1.1 joerg ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 2651 1.1 joerg C.getIntTypeForBitwidth(32, /* Signed */ true), 2652 1.1 joerg ImplicitParamDecl::Other); 2653 1.1 joerg FunctionArgList Args; 2654 1.1 joerg Args.push_back(&ReduceListArg); 2655 1.1 joerg Args.push_back(&NumWarpsArg); 2656 1.1 joerg 2657 1.1 joerg const CGFunctionInfo &CGFI = 2658 1.1 joerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 2659 1.1 joerg auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI), 2660 1.1 joerg llvm::GlobalValue::InternalLinkage, 2661 1.1 joerg "_omp_reduction_inter_warp_copy_func", &M); 2662 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 2663 1.1 joerg Fn->setDoesNotRecurse(); 2664 1.1 joerg CodeGenFunction CGF(CGM); 2665 1.1 joerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 2666 1.1 joerg 2667 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 2668 1.1 joerg 2669 1.1 joerg // This array is used as a medium to transfer, one reduce element at a time, 2670 1.1 joerg // the data from the first lane of every warp to lanes in the first warp 2671 1.1 joerg // in order to perform the final step of a reduction in a parallel region 2672 1.1 joerg // (reduction across warps). The array is placed in NVPTX __shared__ memory 2673 1.1 joerg // for reduced latency, as well as to have a distinct copy for concurrently 2674 1.1 joerg // executing target regions. The array is declared with common linkage so 2675 1.1 joerg // as to be shared across compilation units. 
2676 1.1 joerg StringRef TransferMediumName = 2677 1.1 joerg "__openmp_nvptx_data_transfer_temporary_storage"; 2678 1.1 joerg llvm::GlobalVariable *TransferMedium = 2679 1.1 joerg M.getGlobalVariable(TransferMediumName); 2680 1.1 joerg unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); 2681 1.1 joerg if (!TransferMedium) { 2682 1.1 joerg auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize); 2683 1.1 joerg unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); 2684 1.1 joerg TransferMedium = new llvm::GlobalVariable( 2685 1.1 joerg M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage, 2686 1.1 joerg llvm::UndefValue::get(Ty), TransferMediumName, 2687 1.1 joerg /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, 2688 1.1 joerg SharedAddressSpace); 2689 1.1 joerg CGM.addCompilerUsedGlobal(TransferMedium); 2690 1.1 joerg } 2691 1.1 joerg 2692 1.1 joerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 2693 1.1 joerg // Get the CUDA thread id of the current OpenMP thread on the GPU. 
2694 1.1 joerg llvm::Value *ThreadID = RT.getGPUThreadID(CGF); 2695 1.1 joerg // nvptx_lane_id = nvptx_id % warpsize 2696 1.1 joerg llvm::Value *LaneID = getNVPTXLaneID(CGF); 2697 1.1 joerg // nvptx_warp_id = nvptx_id / warpsize 2698 1.1 joerg llvm::Value *WarpID = getNVPTXWarpID(CGF); 2699 1.1 joerg 2700 1.1 joerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 2701 1.1 joerg Address LocalReduceList( 2702 1.1 joerg Bld.CreatePointerBitCastOrAddrSpaceCast( 2703 1.1 joerg CGF.EmitLoadOfScalar( 2704 1.1 joerg AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc, 2705 1.1 joerg LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()), 2706 1.1 joerg CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), 2707 1.1 joerg CGF.getPointerAlign()); 2708 1.1 joerg 2709 1.1 joerg unsigned Idx = 0; 2710 1.1 joerg for (const Expr *Private : Privates) { 2711 1.1 joerg // 2712 1.1 joerg // Warp master copies reduce element to transfer medium in __shared__ 2713 1.1 joerg // memory. 
2714 1.1 joerg // 2715 1.1 joerg unsigned RealTySize = 2716 1.1 joerg C.getTypeSizeInChars(Private->getType()) 2717 1.1 joerg .alignTo(C.getTypeAlignInChars(Private->getType())) 2718 1.1 joerg .getQuantity(); 2719 1.1 joerg for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) { 2720 1.1 joerg unsigned NumIters = RealTySize / TySize; 2721 1.1 joerg if (NumIters == 0) 2722 1.1 joerg continue; 2723 1.1 joerg QualType CType = C.getIntTypeForBitwidth( 2724 1.1 joerg C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1); 2725 1.1 joerg llvm::Type *CopyType = CGF.ConvertTypeForMem(CType); 2726 1.1 joerg CharUnits Align = CharUnits::fromQuantity(TySize); 2727 1.1 joerg llvm::Value *Cnt = nullptr; 2728 1.1 joerg Address CntAddr = Address::invalid(); 2729 1.1 joerg llvm::BasicBlock *PrecondBB = nullptr; 2730 1.1 joerg llvm::BasicBlock *ExitBB = nullptr; 2731 1.1 joerg if (NumIters > 1) { 2732 1.1 joerg CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr"); 2733 1.1 joerg CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr, 2734 1.1 joerg /*Volatile=*/false, C.IntTy); 2735 1.1 joerg PrecondBB = CGF.createBasicBlock("precond"); 2736 1.1 joerg ExitBB = CGF.createBasicBlock("exit"); 2737 1.1 joerg llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body"); 2738 1.1 joerg // There is no need to emit line number for unconditional branch. 2739 1.1 joerg (void)ApplyDebugLocation::CreateEmpty(CGF); 2740 1.1 joerg CGF.EmitBlock(PrecondBB); 2741 1.1 joerg Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc); 2742 1.1 joerg llvm::Value *Cmp = 2743 1.1 joerg Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters)); 2744 1.1 joerg Bld.CreateCondBr(Cmp, BodyBB, ExitBB); 2745 1.1 joerg CGF.EmitBlock(BodyBB); 2746 1.1 joerg } 2747 1.1 joerg // kmpc_barrier. 
2748 1.1 joerg CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, 2749 1.1 joerg /*EmitChecks=*/false, 2750 1.1 joerg /*ForceSimpleCall=*/true); 2751 1.1 joerg llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); 2752 1.1 joerg llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); 2753 1.1 joerg llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); 2754 1.1 joerg 2755 1.1 joerg // if (lane_id == 0) 2756 1.1 joerg llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master"); 2757 1.1 joerg Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); 2758 1.1 joerg CGF.EmitBlock(ThenBB); 2759 1.1 joerg 2760 1.1 joerg // Reduce element = LocalReduceList[i] 2761 1.1 joerg Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); 2762 1.1 joerg llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( 2763 1.1 joerg ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); 2764 1.1 joerg // elemptr = ((CopyType*)(elemptrptr)) + I 2765 1.1 joerg Address ElemPtr = Address(ElemPtrPtr, Align); 2766 1.1 joerg ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType); 2767 1.1 joerg if (NumIters > 1) { 2768 1.1 joerg ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt), 2769 1.1 joerg ElemPtr.getAlignment()); 2770 1.1 joerg } 2771 1.1 joerg 2772 1.1 joerg // Get pointer to location in transfer medium. 2773 1.1 joerg // MediumPtr = &medium[warp_id] 2774 1.1 joerg llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( 2775 1.1 joerg TransferMedium->getValueType(), TransferMedium, 2776 1.1 joerg {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); 2777 1.1 joerg Address MediumPtr(MediumPtrVal, Align); 2778 1.1 joerg // Casting to actual data type. 
2779 1.1 joerg // MediumPtr = (CopyType*)MediumPtrAddr; 2780 1.1 joerg MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType); 2781 1.1 joerg 2782 1.1 joerg // elem = *elemptr 2783 1.1 joerg //*MediumPtr = elem 2784 1.1 joerg llvm::Value *Elem = CGF.EmitLoadOfScalar( 2785 1.1 joerg ElemPtr, /*Volatile=*/false, CType, Loc, 2786 1.1 joerg LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); 2787 1.1 joerg // Store the source element value to the dest element address. 2788 1.1 joerg CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType, 2789 1.1 joerg LValueBaseInfo(AlignmentSource::Type), 2790 1.1 joerg TBAAAccessInfo()); 2791 1.1 joerg 2792 1.1 joerg Bld.CreateBr(MergeBB); 2793 1.1 joerg 2794 1.1 joerg CGF.EmitBlock(ElseBB); 2795 1.1 joerg Bld.CreateBr(MergeBB); 2796 1.1 joerg 2797 1.1 joerg CGF.EmitBlock(MergeBB); 2798 1.1 joerg 2799 1.1 joerg // kmpc_barrier. 2800 1.1 joerg CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, 2801 1.1 joerg /*EmitChecks=*/false, 2802 1.1 joerg /*ForceSimpleCall=*/true); 2803 1.1 joerg 2804 1.1 joerg // 2805 1.1 joerg // Warp 0 copies reduce element from transfer medium. 2806 1.1 joerg // 2807 1.1 joerg llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then"); 2808 1.1 joerg llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else"); 2809 1.1 joerg llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont"); 2810 1.1 joerg 2811 1.1 joerg Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); 2812 1.1 joerg llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( 2813 1.1 joerg AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc); 2814 1.1 joerg 2815 1.1 joerg // Up to 32 threads in warp 0 are active. 
2816 1.1 joerg llvm::Value *IsActiveThread = 2817 1.1 joerg Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread"); 2818 1.1 joerg Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB); 2819 1.1 joerg 2820 1.1 joerg CGF.EmitBlock(W0ThenBB); 2821 1.1 joerg 2822 1.1 joerg // SrcMediumPtr = &medium[tid] 2823 1.1 joerg llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( 2824 1.1 joerg TransferMedium->getValueType(), TransferMedium, 2825 1.1 joerg {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); 2826 1.1 joerg Address SrcMediumPtr(SrcMediumPtrVal, Align); 2827 1.1 joerg // SrcMediumVal = *SrcMediumPtr; 2828 1.1 joerg SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType); 2829 1.1 joerg 2830 1.1 joerg // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I 2831 1.1 joerg Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); 2832 1.1 joerg llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( 2833 1.1 joerg TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc); 2834 1.1 joerg Address TargetElemPtr = Address(TargetElemPtrVal, Align); 2835 1.1 joerg TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType); 2836 1.1 joerg if (NumIters > 1) { 2837 1.1 joerg TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt), 2838 1.1 joerg TargetElemPtr.getAlignment()); 2839 1.1 joerg } 2840 1.1 joerg 2841 1.1 joerg // *TargetElemPtr = SrcMediumVal; 2842 1.1 joerg llvm::Value *SrcMediumValue = 2843 1.1 joerg CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc); 2844 1.1 joerg CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false, 2845 1.1 joerg CType); 2846 1.1 joerg Bld.CreateBr(W0MergeBB); 2847 1.1 joerg 2848 1.1 joerg CGF.EmitBlock(W0ElseBB); 2849 1.1 joerg Bld.CreateBr(W0MergeBB); 2850 1.1 joerg 2851 1.1 joerg CGF.EmitBlock(W0MergeBB); 2852 1.1 joerg 2853 1.1 joerg if (NumIters > 1) { 2854 1.1 joerg Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1)); 2855 1.1 joerg 
CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy); 2856 1.1 joerg CGF.EmitBranch(PrecondBB); 2857 1.1 joerg (void)ApplyDebugLocation::CreateEmpty(CGF); 2858 1.1 joerg CGF.EmitBlock(ExitBB); 2859 1.1 joerg } 2860 1.1 joerg RealTySize %= TySize; 2861 1.1 joerg } 2862 1.1 joerg ++Idx; 2863 1.1 joerg } 2864 1.1 joerg 2865 1.1 joerg CGF.FinishFunction(); 2866 1.1 joerg return Fn; 2867 1.1 joerg } 2868 1.1 joerg 2869 1.1 joerg /// Emit a helper that reduces data across two OpenMP threads (lanes) 2870 1.1 joerg /// in the same warp. It uses shuffle instructions to copy over data from 2871 1.1 joerg /// a remote lane's stack. The reduction algorithm performed is specified 2872 1.1 joerg /// by the fourth parameter. 2873 1.1 joerg /// 2874 1.1 joerg /// Algorithm Versions. 2875 1.1 joerg /// Full Warp Reduce (argument value 0): 2876 1.1 joerg /// This algorithm assumes that all 32 lanes are active and gathers 2877 1.1 joerg /// data from these 32 lanes, producing a single resultant value. 2878 1.1 joerg /// Contiguous Partial Warp Reduce (argument value 1): 2879 1.1 joerg /// This algorithm assumes that only a *contiguous* subset of lanes 2880 1.1 joerg /// are active. This happens for the last warp in a parallel region 2881 1.1 joerg /// when the user specified num_threads is not an integer multiple of 2882 1.1 joerg /// 32. This contiguous subset always starts with the zeroth lane. 2883 1.1 joerg /// Partial Warp Reduce (argument value 2): 2884 1.1 joerg /// This algorithm gathers data from any number of lanes at any position. 2885 1.1 joerg /// All reduced values are stored in the lowest possible lane. The set 2886 1.1 joerg /// of problems every algorithm addresses is a super set of those 2887 1.1 joerg /// addressable by algorithms with a lower version number. Overhead 2888 1.1 joerg /// increases as algorithm version increases. 
2889 1.1 joerg /// 2890 1.1 joerg /// Terminology 2891 1.1 joerg /// Reduce element: 2892 1.1 joerg /// Reduce element refers to the individual data field with primitive 2893 1.1 joerg /// data types to be combined and reduced across threads. 2894 1.1 joerg /// Reduce list: 2895 1.1 joerg /// Reduce list refers to a collection of local, thread-private 2896 1.1 joerg /// reduce elements. 2897 1.1 joerg /// Remote Reduce list: 2898 1.1 joerg /// Remote Reduce list refers to a collection of remote (relative to 2899 1.1 joerg /// the current thread) reduce elements. 2900 1.1 joerg /// 2901 1.1 joerg /// We distinguish between three states of threads that are important to 2902 1.1 joerg /// the implementation of this function. 2903 1.1 joerg /// Alive threads: 2904 1.1 joerg /// Threads in a warp executing the SIMT instruction, as distinguished from 2905 1.1 joerg /// threads that are inactive due to divergent control flow. 2906 1.1 joerg /// Active threads: 2907 1.1 joerg /// The minimal set of threads that has to be alive upon entry to this 2908 1.1 joerg /// function. The computation is correct iff active threads are alive. 2909 1.1 joerg /// Some threads are alive but they are not active because they do not 2910 1.1 joerg /// contribute to the computation in any useful manner. Turning them off 2911 1.1 joerg /// may introduce control flow overheads without any tangible benefits. 2912 1.1 joerg /// Effective threads: 2913 1.1 joerg /// In order to comply with the argument requirements of the shuffle 2914 1.1 joerg /// function, we must keep all lanes holding data alive. But at most 2915 1.1 joerg /// half of them perform value aggregation; we refer to this half of 2916 1.1 joerg /// threads as effective. The other half is simply handing off their 2917 1.1 joerg /// data. 
2918 1.1 joerg /// 2919 1.1 joerg /// Procedure 2920 1.1 joerg /// Value shuffle: 2921 1.1 joerg /// In this step active threads transfer data from higher lane positions 2922 1.1 joerg /// in the warp to lower lane positions, creating Remote Reduce list. 2923 1.1 joerg /// Value aggregation: 2924 1.1 joerg /// In this step, effective threads combine their thread local Reduce list 2925 1.1 joerg /// with Remote Reduce list and store the result in the thread local 2926 1.1 joerg /// Reduce list. 2927 1.1 joerg /// Value copy: 2928 1.1 joerg /// In this step, we deal with the assumption made by algorithm 2 2929 1.1 joerg /// (i.e. contiguity assumption). When we have an odd number of lanes 2930 1.1 joerg /// active, say 2k+1, only k threads will be effective and therefore k 2931 1.1 joerg /// new values will be produced. However, the Reduce list owned by the 2932 1.1 joerg /// (2k+1)th thread is ignored in the value aggregation. Therefore 2933 1.1 joerg /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so 2934 1.1 joerg /// that the contiguity assumption still holds. 2935 1.1 joerg static llvm::Function *emitShuffleAndReduceFunction( 2936 1.1 joerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates, 2937 1.1 joerg QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) { 2938 1.1 joerg ASTContext &C = CGM.getContext(); 2939 1.1 joerg 2940 1.1 joerg // Thread local Reduce list used to host the values of data to be reduced. 2941 1.1 joerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 2942 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other); 2943 1.1 joerg // Current lane id; could be logical. 2944 1.1 joerg ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy, 2945 1.1 joerg ImplicitParamDecl::Other); 2946 1.1 joerg // Offset of the remote source lane relative to the current lane. 
2947 1.1 joerg ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 2948 1.1 joerg C.ShortTy, ImplicitParamDecl::Other); 2949 1.1 joerg // Algorithm version. This is expected to be known at compile time. 2950 1.1 joerg ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 2951 1.1 joerg C.ShortTy, ImplicitParamDecl::Other); 2952 1.1 joerg FunctionArgList Args; 2953 1.1 joerg Args.push_back(&ReduceListArg); 2954 1.1 joerg Args.push_back(&LaneIDArg); 2955 1.1 joerg Args.push_back(&RemoteLaneOffsetArg); 2956 1.1 joerg Args.push_back(&AlgoVerArg); 2957 1.1 joerg 2958 1.1 joerg const CGFunctionInfo &CGFI = 2959 1.1 joerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 2960 1.1 joerg auto *Fn = llvm::Function::Create( 2961 1.1 joerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 2962 1.1 joerg "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule()); 2963 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 2964 1.1 joerg Fn->setDoesNotRecurse(); 2965 1.1 joerg 2966 1.1 joerg CodeGenFunction CGF(CGM); 2967 1.1 joerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 2968 1.1 joerg 2969 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 2970 1.1 joerg 2971 1.1 joerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 2972 1.1 joerg Address LocalReduceList( 2973 1.1 joerg Bld.CreatePointerBitCastOrAddrSpaceCast( 2974 1.1 joerg CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, 2975 1.1 joerg C.VoidPtrTy, SourceLocation()), 2976 1.1 joerg CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), 2977 1.1 joerg CGF.getPointerAlign()); 2978 1.1 joerg 2979 1.1 joerg Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg); 2980 1.1 joerg llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar( 2981 1.1 joerg AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); 2982 1.1 joerg 2983 1.1 joerg Address AddrRemoteLaneOffsetArg = 
CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg); 2984 1.1 joerg llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar( 2985 1.1 joerg AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); 2986 1.1 joerg 2987 1.1 joerg Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg); 2988 1.1 joerg llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar( 2989 1.1 joerg AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); 2990 1.1 joerg 2991 1.1 joerg // Create a local thread-private variable to host the Reduce list 2992 1.1 joerg // from a remote lane. 2993 1.1 joerg Address RemoteReduceList = 2994 1.1 joerg CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list"); 2995 1.1 joerg 2996 1.1 joerg // This loop iterates through the list of reduce elements and copies, 2997 1.1 joerg // element by element, from a remote lane in the warp to RemoteReduceList, 2998 1.1 joerg // hosted on the thread's stack. 2999 1.1 joerg emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates, 3000 1.1 joerg LocalReduceList, RemoteReduceList, 3001 1.1 joerg {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal, 3002 1.1 joerg /*ScratchpadIndex=*/nullptr, 3003 1.1 joerg /*ScratchpadWidth=*/nullptr}); 3004 1.1 joerg 3005 1.1 joerg // The actions to be performed on the Remote Reduce list is dependent 3006 1.1 joerg // on the algorithm version. 3007 1.1 joerg // 3008 1.1 joerg // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 && 3009 1.1 joerg // LaneId % 2 == 0 && Offset > 0): 3010 1.1 joerg // do the reduction value aggregation 3011 1.1 joerg // 3012 1.1 joerg // The thread local variable Reduce list is mutated in place to host the 3013 1.1 joerg // reduced data, which is the aggregated value produced from local and 3014 1.1 joerg // remote lanes. 3015 1.1 joerg // 3016 1.1 joerg // Note that AlgoVer is expected to be a constant integer known at compile 3017 1.1 joerg // time. 
3018 1.1 joerg // When AlgoVer==0, the first conjunction evaluates to true, making 3019 1.1 joerg // the entire predicate true during compile time. 3020 1.1 joerg // When AlgoVer==1, the second conjunction has only the second part to be 3021 1.1 joerg // evaluated during runtime. Other conjunctions evaluates to false 3022 1.1 joerg // during compile time. 3023 1.1 joerg // When AlgoVer==2, the third conjunction has only the second part to be 3024 1.1 joerg // evaluated during runtime. Other conjunctions evaluates to false 3025 1.1 joerg // during compile time. 3026 1.1 joerg llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal); 3027 1.1 joerg 3028 1.1 joerg llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)); 3029 1.1 joerg llvm::Value *CondAlgo1 = Bld.CreateAnd( 3030 1.1 joerg Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal)); 3031 1.1 joerg 3032 1.1 joerg llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2)); 3033 1.1 joerg llvm::Value *CondAlgo2 = Bld.CreateAnd( 3034 1.1 joerg Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)))); 3035 1.1 joerg CondAlgo2 = Bld.CreateAnd( 3036 1.1 joerg CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0))); 3037 1.1 joerg 3038 1.1 joerg llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1); 3039 1.1 joerg CondReduce = Bld.CreateOr(CondReduce, CondAlgo2); 3040 1.1 joerg 3041 1.1 joerg llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); 3042 1.1 joerg llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); 3043 1.1 joerg llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); 3044 1.1 joerg Bld.CreateCondBr(CondReduce, ThenBB, ElseBB); 3045 1.1 joerg 3046 1.1 joerg CGF.EmitBlock(ThenBB); 3047 1.1 joerg // reduce_function(LocalReduceList, RemoteReduceList) 3048 1.1 joerg llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3049 1.1 joerg LocalReduceList.getPointer(), CGF.VoidPtrTy); 3050 1.1 joerg llvm::Value 
*RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3051 1.1 joerg RemoteReduceList.getPointer(), CGF.VoidPtrTy); 3052 1.1 joerg CGM.getOpenMPRuntime().emitOutlinedFunctionCall( 3053 1.1 joerg CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr}); 3054 1.1 joerg Bld.CreateBr(MergeBB); 3055 1.1 joerg 3056 1.1 joerg CGF.EmitBlock(ElseBB); 3057 1.1 joerg Bld.CreateBr(MergeBB); 3058 1.1 joerg 3059 1.1 joerg CGF.EmitBlock(MergeBB); 3060 1.1 joerg 3061 1.1 joerg // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local 3062 1.1 joerg // Reduce list. 3063 1.1 joerg Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)); 3064 1.1 joerg llvm::Value *CondCopy = Bld.CreateAnd( 3065 1.1 joerg Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal)); 3066 1.1 joerg 3067 1.1 joerg llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then"); 3068 1.1 joerg llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else"); 3069 1.1 joerg llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont"); 3070 1.1 joerg Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB); 3071 1.1 joerg 3072 1.1 joerg CGF.EmitBlock(CpyThenBB); 3073 1.1 joerg emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates, 3074 1.1 joerg RemoteReduceList, LocalReduceList); 3075 1.1 joerg Bld.CreateBr(CpyMergeBB); 3076 1.1 joerg 3077 1.1 joerg CGF.EmitBlock(CpyElseBB); 3078 1.1 joerg Bld.CreateBr(CpyMergeBB); 3079 1.1 joerg 3080 1.1 joerg CGF.EmitBlock(CpyMergeBB); 3081 1.1 joerg 3082 1.1 joerg CGF.FinishFunction(); 3083 1.1 joerg return Fn; 3084 1.1 joerg } 3085 1.1 joerg 3086 1.1 joerg /// This function emits a helper that copies all the reduction variables from 3087 1.1 joerg /// the team into the provided global buffer for the reduction variables. 
3088 1.1 joerg /// 3089 1.1 joerg /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) 3090 1.1 joerg /// For all data entries D in reduce_data: 3091 1.1 joerg /// Copy local D to buffer.D[Idx] 3092 1.1 joerg static llvm::Value *emitListToGlobalCopyFunction( 3093 1.1 joerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates, 3094 1.1 joerg QualType ReductionArrayTy, SourceLocation Loc, 3095 1.1 joerg const RecordDecl *TeamReductionRec, 3096 1.1 joerg const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 3097 1.1 joerg &VarFieldMap) { 3098 1.1 joerg ASTContext &C = CGM.getContext(); 3099 1.1 joerg 3100 1.1 joerg // Buffer: global reduction buffer. 3101 1.1 joerg ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 3102 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other); 3103 1.1 joerg // Idx: index of the buffer. 3104 1.1 joerg ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, 3105 1.1 joerg ImplicitParamDecl::Other); 3106 1.1 joerg // ReduceList: thread local Reduce list. 
3107 1.1 joerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 3108 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other); 3109 1.1 joerg FunctionArgList Args; 3110 1.1 joerg Args.push_back(&BufferArg); 3111 1.1 joerg Args.push_back(&IdxArg); 3112 1.1 joerg Args.push_back(&ReduceListArg); 3113 1.1 joerg 3114 1.1 joerg const CGFunctionInfo &CGFI = 3115 1.1 joerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 3116 1.1 joerg auto *Fn = llvm::Function::Create( 3117 1.1 joerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 3118 1.1 joerg "_omp_reduction_list_to_global_copy_func", &CGM.getModule()); 3119 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 3120 1.1 joerg Fn->setDoesNotRecurse(); 3121 1.1 joerg CodeGenFunction CGF(CGM); 3122 1.1 joerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 3123 1.1 joerg 3124 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 3125 1.1 joerg 3126 1.1 joerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 3127 1.1 joerg Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); 3128 1.1 joerg Address LocalReduceList( 3129 1.1 joerg Bld.CreatePointerBitCastOrAddrSpaceCast( 3130 1.1 joerg CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, 3131 1.1 joerg C.VoidPtrTy, Loc), 3132 1.1 joerg CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), 3133 1.1 joerg CGF.getPointerAlign()); 3134 1.1 joerg QualType StaticTy = C.getRecordType(TeamReductionRec); 3135 1.1 joerg llvm::Type *LLVMReductionsBufferTy = 3136 1.1 joerg CGM.getTypes().ConvertTypeForMem(StaticTy); 3137 1.1 joerg llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3138 1.1 joerg CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), 3139 1.1 joerg LLVMReductionsBufferTy->getPointerTo()); 3140 1.1 joerg llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), 3141 1.1 joerg 
CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), 3142 1.1 joerg /*Volatile=*/false, C.IntTy, 3143 1.1 joerg Loc)}; 3144 1.1 joerg unsigned Idx = 0; 3145 1.1 joerg for (const Expr *Private : Privates) { 3146 1.1 joerg // Reduce element = LocalReduceList[i] 3147 1.1 joerg Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); 3148 1.1 joerg llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( 3149 1.1 joerg ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); 3150 1.1 joerg // elemptr = ((CopyType*)(elemptrptr)) + I 3151 1.1 joerg ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3152 1.1 joerg ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo()); 3153 1.1 joerg Address ElemPtr = 3154 1.1 joerg Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); 3155 1.1 joerg const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl(); 3156 1.1 joerg // Global = Buffer.VD[Idx]; 3157 1.1 joerg const FieldDecl *FD = VarFieldMap.lookup(VD); 3158 1.1 joerg LValue GlobLVal = CGF.EmitLValueForField( 3159 1.1 joerg CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); 3160 1.1 joerg Address GlobAddr = GlobLVal.getAddress(CGF); 3161 1.1 joerg llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( 3162 1.1 joerg GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); 3163 1.1 joerg GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment())); 3164 1.1 joerg switch (CGF.getEvaluationKind(Private->getType())) { 3165 1.1 joerg case TEK_Scalar: { 3166 1.1 joerg llvm::Value *V = CGF.EmitLoadOfScalar( 3167 1.1 joerg ElemPtr, /*Volatile=*/false, Private->getType(), Loc, 3168 1.1 joerg LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); 3169 1.1 joerg CGF.EmitStoreOfScalar(V, GlobLVal); 3170 1.1 joerg break; 3171 1.1 joerg } 3172 1.1 joerg case TEK_Complex: { 3173 1.1 joerg CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex( 3174 1.1 joerg CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc); 3175 1.1 joerg 
CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false); 3176 1.1 joerg break; 3177 1.1 joerg } 3178 1.1 joerg case TEK_Aggregate: 3179 1.1 joerg CGF.EmitAggregateCopy(GlobLVal, 3180 1.1 joerg CGF.MakeAddrLValue(ElemPtr, Private->getType()), 3181 1.1 joerg Private->getType(), AggValueSlot::DoesNotOverlap); 3182 1.1 joerg break; 3183 1.1 joerg } 3184 1.1 joerg ++Idx; 3185 1.1 joerg } 3186 1.1 joerg 3187 1.1 joerg CGF.FinishFunction(); 3188 1.1 joerg return Fn; 3189 1.1 joerg } 3190 1.1 joerg 3191 1.1 joerg /// This function emits a helper that reduces all the reduction variables from 3192 1.1 joerg /// the team into the provided global buffer for the reduction variables. 3193 1.1 joerg /// 3194 1.1 joerg /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data) 3195 1.1 joerg /// void *GlobPtrs[]; 3196 1.1 joerg /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; 3197 1.1 joerg /// ... 3198 1.1 joerg /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; 3199 1.1 joerg /// reduce_function(GlobPtrs, reduce_data); 3200 1.1 joerg static llvm::Value *emitListToGlobalReduceFunction( 3201 1.1 joerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates, 3202 1.1 joerg QualType ReductionArrayTy, SourceLocation Loc, 3203 1.1 joerg const RecordDecl *TeamReductionRec, 3204 1.1 joerg const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 3205 1.1 joerg &VarFieldMap, 3206 1.1 joerg llvm::Function *ReduceFn) { 3207 1.1 joerg ASTContext &C = CGM.getContext(); 3208 1.1 joerg 3209 1.1 joerg // Buffer: global reduction buffer. 3210 1.1 joerg ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 3211 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other); 3212 1.1 joerg // Idx: index of the buffer. 3213 1.1 joerg ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, 3214 1.1 joerg ImplicitParamDecl::Other); 3215 1.1 joerg // ReduceList: thread local Reduce list. 
3216 1.1 joerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 3217 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other); 3218 1.1 joerg FunctionArgList Args; 3219 1.1 joerg Args.push_back(&BufferArg); 3220 1.1 joerg Args.push_back(&IdxArg); 3221 1.1 joerg Args.push_back(&ReduceListArg); 3222 1.1 joerg 3223 1.1 joerg const CGFunctionInfo &CGFI = 3224 1.1 joerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 3225 1.1 joerg auto *Fn = llvm::Function::Create( 3226 1.1 joerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 3227 1.1 joerg "_omp_reduction_list_to_global_reduce_func", &CGM.getModule()); 3228 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 3229 1.1 joerg Fn->setDoesNotRecurse(); 3230 1.1 joerg CodeGenFunction CGF(CGM); 3231 1.1 joerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 3232 1.1 joerg 3233 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 3234 1.1 joerg 3235 1.1 joerg Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); 3236 1.1 joerg QualType StaticTy = C.getRecordType(TeamReductionRec); 3237 1.1 joerg llvm::Type *LLVMReductionsBufferTy = 3238 1.1 joerg CGM.getTypes().ConvertTypeForMem(StaticTy); 3239 1.1 joerg llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3240 1.1 joerg CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), 3241 1.1 joerg LLVMReductionsBufferTy->getPointerTo()); 3242 1.1 joerg 3243 1.1 joerg // 1. Build a list of reduction variables. 
3244 1.1 joerg // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; 3245 1.1 joerg Address ReductionList = 3246 1.1 joerg CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); 3247 1.1 joerg auto IPriv = Privates.begin(); 3248 1.1 joerg llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), 3249 1.1 joerg CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), 3250 1.1 joerg /*Volatile=*/false, C.IntTy, 3251 1.1 joerg Loc)}; 3252 1.1 joerg unsigned Idx = 0; 3253 1.1 joerg for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { 3254 1.1 joerg Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); 3255 1.1 joerg // Global = Buffer.VD[Idx]; 3256 1.1 joerg const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl(); 3257 1.1 joerg const FieldDecl *FD = VarFieldMap.lookup(VD); 3258 1.1 joerg LValue GlobLVal = CGF.EmitLValueForField( 3259 1.1 joerg CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); 3260 1.1 joerg Address GlobAddr = GlobLVal.getAddress(CGF); 3261 1.1 joerg llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( 3262 1.1 joerg GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); 3263 1.1 joerg llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr); 3264 1.1 joerg CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy); 3265 1.1 joerg if ((*IPriv)->getType()->isVariablyModifiedType()) { 3266 1.1 joerg // Store array size. 
3267 1.1 joerg ++Idx; 3268 1.1 joerg Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); 3269 1.1 joerg llvm::Value *Size = CGF.Builder.CreateIntCast( 3270 1.1 joerg CGF.getVLASize( 3271 1.1 joerg CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) 3272 1.1 joerg .NumElts, 3273 1.1 joerg CGF.SizeTy, /*isSigned=*/false); 3274 1.1 joerg CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), 3275 1.1 joerg Elem); 3276 1.1 joerg } 3277 1.1 joerg } 3278 1.1 joerg 3279 1.1 joerg // Call reduce_function(GlobalReduceList, ReduceList) 3280 1.1 joerg llvm::Value *GlobalReduceList = 3281 1.1 joerg CGF.EmitCastToVoidPtr(ReductionList.getPointer()); 3282 1.1 joerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 3283 1.1 joerg llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( 3284 1.1 joerg AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); 3285 1.1 joerg CGM.getOpenMPRuntime().emitOutlinedFunctionCall( 3286 1.1 joerg CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr}); 3287 1.1 joerg CGF.FinishFunction(); 3288 1.1 joerg return Fn; 3289 1.1 joerg } 3290 1.1 joerg 3291 1.1 joerg /// This function emits a helper that copies all the reduction variables from 3292 1.1 joerg /// the team into the provided global buffer for the reduction variables. 3293 1.1 joerg /// 3294 1.1 joerg /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) 3295 1.1 joerg /// For all data entries D in reduce_data: 3296 1.1 joerg /// Copy buffer.D[Idx] to local D; 3297 1.1 joerg static llvm::Value *emitGlobalToListCopyFunction( 3298 1.1 joerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates, 3299 1.1 joerg QualType ReductionArrayTy, SourceLocation Loc, 3300 1.1 joerg const RecordDecl *TeamReductionRec, 3301 1.1 joerg const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 3302 1.1 joerg &VarFieldMap) { 3303 1.1 joerg ASTContext &C = CGM.getContext(); 3304 1.1 joerg 3305 1.1 joerg // Buffer: global reduction buffer. 
3306 1.1 joerg ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 3307 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other); 3308 1.1 joerg // Idx: index of the buffer. 3309 1.1 joerg ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, 3310 1.1 joerg ImplicitParamDecl::Other); 3311 1.1 joerg // ReduceList: thread local Reduce list. 3312 1.1 joerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 3313 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other); 3314 1.1 joerg FunctionArgList Args; 3315 1.1 joerg Args.push_back(&BufferArg); 3316 1.1 joerg Args.push_back(&IdxArg); 3317 1.1 joerg Args.push_back(&ReduceListArg); 3318 1.1 joerg 3319 1.1 joerg const CGFunctionInfo &CGFI = 3320 1.1 joerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 3321 1.1 joerg auto *Fn = llvm::Function::Create( 3322 1.1 joerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 3323 1.1 joerg "_omp_reduction_global_to_list_copy_func", &CGM.getModule()); 3324 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 3325 1.1 joerg Fn->setDoesNotRecurse(); 3326 1.1 joerg CodeGenFunction CGF(CGM); 3327 1.1 joerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 3328 1.1 joerg 3329 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 3330 1.1 joerg 3331 1.1 joerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 3332 1.1 joerg Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); 3333 1.1 joerg Address LocalReduceList( 3334 1.1 joerg Bld.CreatePointerBitCastOrAddrSpaceCast( 3335 1.1 joerg CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, 3336 1.1 joerg C.VoidPtrTy, Loc), 3337 1.1 joerg CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), 3338 1.1 joerg CGF.getPointerAlign()); 3339 1.1 joerg QualType StaticTy = C.getRecordType(TeamReductionRec); 3340 1.1 joerg llvm::Type *LLVMReductionsBufferTy = 3341 1.1 joerg 
CGM.getTypes().ConvertTypeForMem(StaticTy); 3342 1.1 joerg llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3343 1.1 joerg CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), 3344 1.1 joerg LLVMReductionsBufferTy->getPointerTo()); 3345 1.1 joerg 3346 1.1 joerg llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), 3347 1.1 joerg CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), 3348 1.1 joerg /*Volatile=*/false, C.IntTy, 3349 1.1 joerg Loc)}; 3350 1.1 joerg unsigned Idx = 0; 3351 1.1 joerg for (const Expr *Private : Privates) { 3352 1.1 joerg // Reduce element = LocalReduceList[i] 3353 1.1 joerg Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); 3354 1.1 joerg llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( 3355 1.1 joerg ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); 3356 1.1 joerg // elemptr = ((CopyType*)(elemptrptr)) + I 3357 1.1 joerg ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3358 1.1 joerg ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo()); 3359 1.1 joerg Address ElemPtr = 3360 1.1 joerg Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); 3361 1.1 joerg const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl(); 3362 1.1 joerg // Global = Buffer.VD[Idx]; 3363 1.1 joerg const FieldDecl *FD = VarFieldMap.lookup(VD); 3364 1.1 joerg LValue GlobLVal = CGF.EmitLValueForField( 3365 1.1 joerg CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); 3366 1.1 joerg Address GlobAddr = GlobLVal.getAddress(CGF); 3367 1.1 joerg llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( 3368 1.1 joerg GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); 3369 1.1 joerg GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment())); 3370 1.1 joerg switch (CGF.getEvaluationKind(Private->getType())) { 3371 1.1 joerg case TEK_Scalar: { 3372 1.1 joerg llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc); 3373 1.1 joerg 
CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(), 3374 1.1 joerg LValueBaseInfo(AlignmentSource::Type), 3375 1.1 joerg TBAAAccessInfo()); 3376 1.1 joerg break; 3377 1.1 joerg } 3378 1.1 joerg case TEK_Complex: { 3379 1.1 joerg CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc); 3380 1.1 joerg CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()), 3381 1.1 joerg /*isInit=*/false); 3382 1.1 joerg break; 3383 1.1 joerg } 3384 1.1 joerg case TEK_Aggregate: 3385 1.1 joerg CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()), 3386 1.1 joerg GlobLVal, Private->getType(), 3387 1.1 joerg AggValueSlot::DoesNotOverlap); 3388 1.1 joerg break; 3389 1.1 joerg } 3390 1.1 joerg ++Idx; 3391 1.1 joerg } 3392 1.1 joerg 3393 1.1 joerg CGF.FinishFunction(); 3394 1.1 joerg return Fn; 3395 1.1 joerg } 3396 1.1 joerg 3397 1.1 joerg /// This function emits a helper that reduces all the reduction variables from 3398 1.1 joerg /// the team into the provided global buffer for the reduction variables. 3399 1.1 joerg /// 3400 1.1 joerg /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data) 3401 1.1 joerg /// void *GlobPtrs[]; 3402 1.1 joerg /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; 3403 1.1 joerg /// ... 3404 1.1 joerg /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; 3405 1.1 joerg /// reduce_function(reduce_data, GlobPtrs); 3406 1.1 joerg static llvm::Value *emitGlobalToListReduceFunction( 3407 1.1 joerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates, 3408 1.1 joerg QualType ReductionArrayTy, SourceLocation Loc, 3409 1.1 joerg const RecordDecl *TeamReductionRec, 3410 1.1 joerg const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 3411 1.1 joerg &VarFieldMap, 3412 1.1 joerg llvm::Function *ReduceFn) { 3413 1.1 joerg ASTContext &C = CGM.getContext(); 3414 1.1 joerg 3415 1.1 joerg // Buffer: global reduction buffer. 
3416 1.1 joerg ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 3417 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other); 3418 1.1 joerg // Idx: index of the buffer. 3419 1.1 joerg ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, 3420 1.1 joerg ImplicitParamDecl::Other); 3421 1.1 joerg // ReduceList: thread local Reduce list. 3422 1.1 joerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 3423 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other); 3424 1.1 joerg FunctionArgList Args; 3425 1.1 joerg Args.push_back(&BufferArg); 3426 1.1 joerg Args.push_back(&IdxArg); 3427 1.1 joerg Args.push_back(&ReduceListArg); 3428 1.1 joerg 3429 1.1 joerg const CGFunctionInfo &CGFI = 3430 1.1 joerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 3431 1.1 joerg auto *Fn = llvm::Function::Create( 3432 1.1 joerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 3433 1.1 joerg "_omp_reduction_global_to_list_reduce_func", &CGM.getModule()); 3434 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 3435 1.1 joerg Fn->setDoesNotRecurse(); 3436 1.1 joerg CodeGenFunction CGF(CGM); 3437 1.1 joerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 3438 1.1 joerg 3439 1.1 joerg CGBuilderTy &Bld = CGF.Builder; 3440 1.1 joerg 3441 1.1 joerg Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); 3442 1.1 joerg QualType StaticTy = C.getRecordType(TeamReductionRec); 3443 1.1 joerg llvm::Type *LLVMReductionsBufferTy = 3444 1.1 joerg CGM.getTypes().ConvertTypeForMem(StaticTy); 3445 1.1 joerg llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3446 1.1 joerg CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), 3447 1.1 joerg LLVMReductionsBufferTy->getPointerTo()); 3448 1.1 joerg 3449 1.1 joerg // 1. Build a list of reduction variables. 
3450 1.1 joerg // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; 3451 1.1 joerg Address ReductionList = 3452 1.1 joerg CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); 3453 1.1 joerg auto IPriv = Privates.begin(); 3454 1.1 joerg llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), 3455 1.1 joerg CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), 3456 1.1 joerg /*Volatile=*/false, C.IntTy, 3457 1.1 joerg Loc)}; 3458 1.1 joerg unsigned Idx = 0; 3459 1.1 joerg for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { 3460 1.1 joerg Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); 3461 1.1 joerg // Global = Buffer.VD[Idx]; 3462 1.1 joerg const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl(); 3463 1.1 joerg const FieldDecl *FD = VarFieldMap.lookup(VD); 3464 1.1 joerg LValue GlobLVal = CGF.EmitLValueForField( 3465 1.1 joerg CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); 3466 1.1 joerg Address GlobAddr = GlobLVal.getAddress(CGF); 3467 1.1 joerg llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( 3468 1.1 joerg GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); 3469 1.1 joerg llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr); 3470 1.1 joerg CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy); 3471 1.1 joerg if ((*IPriv)->getType()->isVariablyModifiedType()) { 3472 1.1 joerg // Store array size. 
3473 1.1 joerg ++Idx; 3474 1.1 joerg Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); 3475 1.1 joerg llvm::Value *Size = CGF.Builder.CreateIntCast( 3476 1.1 joerg CGF.getVLASize( 3477 1.1 joerg CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) 3478 1.1 joerg .NumElts, 3479 1.1 joerg CGF.SizeTy, /*isSigned=*/false); 3480 1.1 joerg CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), 3481 1.1 joerg Elem); 3482 1.1 joerg } 3483 1.1 joerg } 3484 1.1 joerg 3485 1.1 joerg // Call reduce_function(ReduceList, GlobalReduceList) 3486 1.1 joerg llvm::Value *GlobalReduceList = 3487 1.1 joerg CGF.EmitCastToVoidPtr(ReductionList.getPointer()); 3488 1.1 joerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 3489 1.1 joerg llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( 3490 1.1 joerg AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); 3491 1.1 joerg CGM.getOpenMPRuntime().emitOutlinedFunctionCall( 3492 1.1 joerg CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList}); 3493 1.1 joerg CGF.FinishFunction(); 3494 1.1 joerg return Fn; 3495 1.1 joerg } 3496 1.1 joerg 3497 1.1 joerg /// 3498 1.1 joerg /// Design of OpenMP reductions on the GPU 3499 1.1 joerg /// 3500 1.1 joerg /// Consider a typical OpenMP program with one or more reduction 3501 1.1 joerg /// clauses: 3502 1.1 joerg /// 3503 1.1 joerg /// float foo; 3504 1.1 joerg /// double bar; 3505 1.1 joerg /// #pragma omp target teams distribute parallel for \ 3506 1.1 joerg /// reduction(+:foo) reduction(*:bar) 3507 1.1 joerg /// for (int i = 0; i < N; i++) { 3508 1.1 joerg /// foo += A[i]; bar *= B[i]; 3509 1.1 joerg /// } 3510 1.1 joerg /// 3511 1.1 joerg /// where 'foo' and 'bar' are reduced across all OpenMP threads in 3512 1.1 joerg /// all teams. In our OpenMP implementation on the NVPTX device an 3513 1.1 joerg /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads 3514 1.1 joerg /// within a team are mapped to CUDA threads within a threadblock. 
3515 1.1 joerg /// Our goal is to efficiently aggregate values across all OpenMP 3516 1.1 joerg /// threads such that: 3517 1.1 joerg /// 3518 1.1 joerg /// - the compiler and runtime are logically concise, and 3519 1.1 joerg /// - the reduction is performed efficiently in a hierarchical 3520 1.1 joerg /// manner as follows: within OpenMP threads in the same warp, 3521 1.1 joerg /// across warps in a threadblock, and finally across teams on 3522 1.1 joerg /// the NVPTX device. 3523 1.1 joerg /// 3524 1.1 joerg /// Introduction to Decoupling 3525 1.1 joerg /// 3526 1.1 joerg /// We would like to decouple the compiler and the runtime so that the 3527 1.1 joerg /// latter is ignorant of the reduction variables (number, data types) 3528 1.1 joerg /// and the reduction operators. This allows a simpler interface 3529 1.1 joerg /// and implementation while still attaining good performance. 3530 1.1 joerg /// 3531 1.1 joerg /// Pseudocode for the aforementioned OpenMP program generated by the 3532 1.1 joerg /// compiler is as follows: 3533 1.1 joerg /// 3534 1.1 joerg /// 1. Create private copies of reduction variables on each OpenMP 3535 1.1 joerg /// thread: 'foo_private', 'bar_private' 3536 1.1 joerg /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned 3537 1.1 joerg /// to it and writes the result in 'foo_private' and 'bar_private' 3538 1.1 joerg /// respectively. 3539 1.1 joerg /// 3. 
Call the OpenMP runtime on the GPU to reduce within a team 3540 1.1 joerg /// and store the result on the team master: 3541 1.1 joerg /// 3542 1.1 joerg /// __kmpc_nvptx_parallel_reduce_nowait_v2(..., 3543 1.1 joerg /// reduceData, shuffleReduceFn, interWarpCpyFn) 3544 1.1 joerg /// 3545 1.1 joerg /// where: 3546 1.1 joerg /// struct ReduceData { 3547 1.1 joerg /// double *foo; 3548 1.1 joerg /// double *bar; 3549 1.1 joerg /// } reduceData 3550 1.1 joerg /// reduceData.foo = &foo_private 3551 1.1 joerg /// reduceData.bar = &bar_private 3552 1.1 joerg /// 3553 1.1 joerg /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two 3554 1.1 joerg /// auxiliary functions generated by the compiler that operate on 3555 1.1 joerg /// variables of type 'ReduceData'. They aid the runtime perform 3556 1.1 joerg /// algorithmic steps in a data agnostic manner. 3557 1.1 joerg /// 3558 1.1 joerg /// 'shuffleReduceFn' is a pointer to a function that reduces data 3559 1.1 joerg /// of type 'ReduceData' across two OpenMP threads (lanes) in the 3560 1.1 joerg /// same warp. It takes the following arguments as input: 3561 1.1 joerg /// 3562 1.1 joerg /// a. variable of type 'ReduceData' on the calling lane, 3563 1.1 joerg /// b. its lane_id, 3564 1.1 joerg /// c. an offset relative to the current lane_id to generate a 3565 1.1 joerg /// remote_lane_id. The remote lane contains the second 3566 1.1 joerg /// variable of type 'ReduceData' that is to be reduced. 3567 1.1 joerg /// d. an algorithm version parameter determining which reduction 3568 1.1 joerg /// algorithm to use. 3569 1.1 joerg /// 3570 1.1 joerg /// 'shuffleReduceFn' retrieves data from the remote lane using 3571 1.1 joerg /// efficient GPU shuffle intrinsics and reduces, using the 3572 1.1 joerg /// algorithm specified by the 4th parameter, the two operands 3573 1.1 joerg /// element-wise. The result is written to the first operand. 
3574 1.1 joerg /// 3575 1.1 joerg /// Different reduction algorithms are implemented in different 3576 1.1 joerg /// runtime functions, all calling 'shuffleReduceFn' to perform 3577 1.1 joerg /// the essential reduction step. Therefore, based on the 4th 3578 1.1 joerg /// parameter, this function behaves slightly differently to 3579 1.1 joerg /// cooperate with the runtime to ensure correctness under 3580 1.1 joerg /// different circumstances. 3581 1.1 joerg /// 3582 1.1 joerg /// 'InterWarpCpyFn' is a pointer to a function that transfers 3583 1.1 joerg /// reduced variables across warps. It tunnels, through CUDA 3584 1.1 joerg /// shared memory, the thread-private data of type 'ReduceData' 3585 1.1 joerg /// from lane 0 of each warp to a lane in the first warp. 3586 1.1 joerg /// 4. Call the OpenMP runtime on the GPU to reduce across teams. 3587 1.1 joerg /// The last team writes the global reduced value to memory. 3588 1.1 joerg /// 3589 1.1 joerg /// ret = __kmpc_nvptx_teams_reduce_nowait(..., 3590 1.1 joerg /// reduceData, shuffleReduceFn, interWarpCpyFn, 3591 1.1 joerg /// scratchpadCopyFn, loadAndReduceFn) 3592 1.1 joerg /// 3593 1.1 joerg /// 'scratchpadCopyFn' is a helper that stores reduced 3594 1.1 joerg /// data from the team master to a scratchpad array in 3595 1.1 joerg /// global memory. 3596 1.1 joerg /// 3597 1.1 joerg /// 'loadAndReduceFn' is a helper that loads data from 3598 1.1 joerg /// the scratchpad array and reduces it with the input 3599 1.1 joerg /// operand. 3600 1.1 joerg /// 3601 1.1 joerg /// These compiler generated functions hide address 3602 1.1 joerg /// calculation and alignment information from the runtime. 3603 1.1 joerg /// 5. if ret == 1: 3604 1.1 joerg /// The team master of the last team stores the reduced 3605 1.1 joerg /// result to the globals in memory. 
3606 1.1 joerg /// foo += reduceData.foo; bar *= reduceData.bar 3607 1.1 joerg /// 3608 1.1 joerg /// 3609 1.1 joerg /// Warp Reduction Algorithms 3610 1.1 joerg /// 3611 1.1 joerg /// On the warp level, we have three algorithms implemented in the 3612 1.1 joerg /// OpenMP runtime depending on the number of active lanes: 3613 1.1 joerg /// 3614 1.1 joerg /// Full Warp Reduction 3615 1.1 joerg /// 3616 1.1 joerg /// The reduce algorithm within a warp where all lanes are active 3617 1.1 joerg /// is implemented in the runtime as follows: 3618 1.1 joerg /// 3619 1.1 joerg /// full_warp_reduce(void *reduce_data, 3620 1.1 joerg /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { 3621 1.1 joerg /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) 3622 1.1 joerg /// ShuffleReduceFn(reduce_data, 0, offset, 0); 3623 1.1 joerg /// } 3624 1.1 joerg /// 3625 1.1 joerg /// The algorithm completes in log(2, WARPSIZE) steps. 3626 1.1 joerg /// 3627 1.1 joerg /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is 3628 1.1 joerg /// not used therefore we save instructions by not retrieving lane_id 3629 1.1 joerg /// from the corresponding special registers. The 4th parameter, which 3630 1.1 joerg /// represents the version of the algorithm being used, is set to 0 to 3631 1.1 joerg /// signify full warp reduction. 3632 1.1 joerg /// 3633 1.1 joerg /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 3634 1.1 joerg /// 3635 1.1 joerg /// #reduce_elem refers to an element in the local lane's data structure 3636 1.1 joerg /// #remote_elem is retrieved from a remote lane 3637 1.1 joerg /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 3638 1.1 joerg /// reduce_elem = reduce_elem REDUCE_OP remote_elem; 3639 1.1 joerg /// 3640 1.1 joerg /// Contiguous Partial Warp Reduction 3641 1.1 joerg /// 3642 1.1 joerg /// This reduce algorithm is used within a warp where only the first 3643 1.1 joerg /// 'n' (n <= WARPSIZE) lanes are active. 
It is typically used when the
/// number of OpenMP threads in a parallel region is not a multiple of
/// WARPSIZE. The algorithm is implemented in the runtime as follows:
///
/// void
/// contiguous_partial_reduce(void *reduce_data,
///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
///                           int size, int lane_id) {
///   int curr_size;
///   int offset;
///   curr_size = size;
///   offset = curr_size/2;
///   while (offset>0) {
///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
///     curr_size = (curr_size+1)/2;
///     offset = curr_size/2;
///   }
/// }
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id < offset)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
/// This algorithm assumes that the data to be reduced are located in a
/// contiguous subset of lanes starting from the first. When there is
/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
///
/// Dispersed Partial Warp Reduction
///
/// This algorithm is used within a warp when any discontiguous subset of
/// lanes are active. It is used to implement the reduction operation
/// across lanes in an OpenMP simd region or in a nested parallel region.
///
/// void
/// dispersed_partial_reduce(void *reduce_data,
///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   int size, remote_id;
///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
///   do {
///     remote_id = next_active_lane_id_right_after_me();
///     # the above function returns 0 if no active lane
///     # is present right after the current lane.
///     size = number_of_active_lanes_in_this_warp();
///     logical_lane_id /= 2;
///     ShuffleReduceFn(reduce_data, logical_lane_id,
///                     remote_id-1-threadIdx.x, 2);
///   } while (logical_lane_id % 2 == 0 && size > 1);
/// }
///
/// There is no assumption made about the initial state of the reduction.
/// Any number of lanes (>=1) could be active at any position. The reduction
/// result is returned in the first active lane.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id % 2 == 0 && offset > 0)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
///
/// Intra-Team Reduction
///
/// This function, as implemented in the runtime call
/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
/// threads in a team. It first reduces within a warp using the
/// aforementioned algorithms. We then proceed to gather all such
/// reduced values at the first warp.
3717 1.1 joerg /// 3718 1.1 joerg /// The runtime makes use of the function 'InterWarpCpyFn', which copies 3719 1.1 joerg /// data from each of the "warp master" (zeroth lane of each warp, where 3720 1.1 joerg /// warp-reduced data is held) to the zeroth warp. This step reduces (in 3721 1.1 joerg /// a mathematical sense) the problem of reduction across warp masters in 3722 1.1 joerg /// a block to the problem of warp reduction. 3723 1.1 joerg /// 3724 1.1 joerg /// 3725 1.1 joerg /// Inter-Team Reduction 3726 1.1 joerg /// 3727 1.1 joerg /// Once a team has reduced its data to a single value, it is stored in 3728 1.1 joerg /// a global scratchpad array. Since each team has a distinct slot, this 3729 1.1 joerg /// can be done without locking. 3730 1.1 joerg /// 3731 1.1 joerg /// The last team to write to the scratchpad array proceeds to reduce the 3732 1.1 joerg /// scratchpad array. One or more workers in the last team use the helper 3733 1.1 joerg /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., 3734 1.1 joerg /// the k'th worker reduces every k'th element. 3735 1.1 joerg /// 3736 1.1 joerg /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to 3737 1.1 joerg /// reduce across workers and compute a globally reduced value. 
///
/// Emits code for an OpenMP reduction clause inside a target region,
/// dispatching to the parallel (intra-team) or teams (inter-team) GPU
/// reduction runtime entry points described in the design comment above.
void CGOpenMPRuntimeGPU::emitReduction(
    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  if (!CGF.HaveInsertPoint())
    return;

  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
  // TeamsReduction is only consumed by asserts below, so it is compiled out
  // in release (NDEBUG) builds to avoid an unused-variable warning.
#ifndef NDEBUG
  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
#endif

  if (Options.SimpleReduction) {
    // Simple (serialized) reductions are handled by the generic host-style
    // codegen in the base class.
    assert(!TeamsReduction && !ParallelReduction &&
           "Invalid reduction selection in emitReduction.");
    CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
                                   ReductionOps, Options);
    return;
  }

  assert((TeamsReduction || ParallelReduction) &&
         "Invalid reduction selection in emitReduction.");

  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  // RedList, shuffle_reduce_func, interwarp_copy_func);
  // or
  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
  llvm::Value *ThreadId = getThreadID(CGF, Loc);

  llvm::Value *Res;
  ASTContext &C = CGM.getContext();
  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = RHSExprs.size();
  for (const Expr *E : Privates) {
    if (E->getType()->isVariablyModifiedType())
      // Reserve place for array size.
      ++Size;
  }
  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
  QualType ReductionArrayTy =
      C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
                             /*IndexTypeQuals=*/0);
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  unsigned Idx = 0;
  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
    CGF.Builder.CreateStore(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
        Elem);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // 2. Emit the data-agnostic helper functions the runtime calls back into
  // (see the design comment above for their roles).
  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionList.getPointer(), CGF.VoidPtrTy);
  llvm::Function *ReductionFn = emitReductionFunction(
      Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
      LHSExprs, RHSExprs, ReductionOps);
  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
  llvm::Value *InterWarpCopyFn =
      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);

  if (ParallelReduction) {
    llvm::Value *Args[] = {RTLoc,
                           ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn};

    Res = CGF.EmitRuntimeCall(
        OMPBuilder.getOrCreateRuntimeFunction(
            CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
        Args);
  } else {
    assert(TeamsReduction && "expected teams reduction.");
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
    llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
    int Cnt = 0;
    for (const Expr *DRE : Privates) {
      PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
      ++Cnt;
    }
    // Build the record type describing one slot of the global teams-reduction
    // buffer (one field per reduction variable).
    const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
        CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
        C.getLangOpts().OpenMPCUDAReductionBufNum);
    TeamsReductions.push_back(TeamReductionRec);
    if (!KernelTeamsReductionPtr) {
      KernelTeamsReductionPtr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
          llvm::GlobalValue::InternalLinkage, nullptr,
          "_openmp_teams_reductions_buffer_$_$ptr");
    }
    llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
        Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
        /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
    // NOTE(review): the variable names below appear swapped relative to the
    // helper names (GlobalToBufferCpyFn holds the list-to-global helper);
    // what matters is the argument order of the runtime call, which matches
    // __kmpc_nvptx_teams_reduce_nowait_v2's expectations — confirm against
    // the device runtime's declaration before renaming.
    llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
    llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
        ReductionFn);
    llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
    llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
        ReductionFn);

    llvm::Value *Args[] = {
        RTLoc,
        ThreadId,
        GlobalBufferPtr,
        CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
        RL,
        ShuffleAndReduceFn,
        InterWarpCopyFn,
        GlobalToBufferCpyFn,
        GlobalToBufferRedFn,
        BufferToGlobalCpyFn,
        BufferToGlobalRedFn};

    Res = CGF.EmitRuntimeCall(
        OMPBuilder.getOrCreateRuntimeFunction(
            CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
        Args);
  }

  // 5. Build if (res == 1)
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
      Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);

  // 6. Build then branch: where we have reduced values in the master
  //    thread in each team.
  //    __kmpc_end_reduce{_nowait}(<gtid>);
  //    break;
  CGF.EmitBlock(ThenBB);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  // The combiner below folds each thread-local private copy into the
  // original reduction variable via the user's reduction operation.
  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
    auto IPriv = Privates.begin();
    auto ILHS = LHSExprs.begin();
    auto IRHS = RHSExprs.begin();
    for (const Expr *E : ReductionOps) {
      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                  cast<DeclRefExpr>(*IRHS));
      ++IPriv;
      ++ILHS;
      ++IRHS;
    }
  };
  llvm::Value *EndArgs[] = {ThreadId};
  RegionCodeGenTy RCG(CodeGen);
  NVPTXActionTy Action(
      nullptr, llvm::None,
      OMPBuilder.getOrCreateRuntimeFunction(
          CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
      EndArgs);
  RCG.setAction(Action);
  RCG(CGF);
  // There is no need to emit line number for unconditional branch.
  // Tail of the reduction emission: the branch to the exit block is
  // unconditional, so suppress the debug location for it.
  (void)ApplyDebugLocation::CreateEmpty(CGF);
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}

/// Translate a reference-typed native parameter of an outlined GPU function
/// into the parameter actually used on the device: the reference becomes a
/// restrict-qualified pointer in the NVPTX "local" address space, and the
/// pointee's address space is adjusted from the OpenMP capture kind recorded
/// on \p FD (map -> opencl_global; const firstprivate -> opencl_generic).
/// Parameters that are not references are returned unchanged.
const VarDecl *
CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
                                       const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(ArgType);
  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                        LangAS::opencl_global);
    } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
               PointeeTy.isConstant(CGM.getContext())) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                        LangAS::opencl_generic);
    }
  }
  ArgType = CGM.getContext().getPointerType(PointeeTy);
  QC.addRestrict();
  // 5 is the NVPTX target's numeric "local" address space.
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  ArgType = QC.apply(CGM.getContext(), ArgType);
  // Preserve the flavor of the original declaration (implicit vs. regular
  // parameter) so downstream code sees the expected node kind.
  if (isa<ImplicitParamDecl>(NativeParam))
    return ImplicitParamDecl::Create(
        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
        NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
  return ParmVarDecl::Create(
      CGM.getContext(),
      const_cast<DeclContext *>(NativeParam->getDeclContext()),
      NativeParam->getBeginLoc(), NativeParam->getLocation(),
      NativeParam->getIdentifier(), ArgType,
      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
}

/// Return an address usable as the reference-typed parameter \p NativeParam,
/// given the translated device parameter \p TargetParam: the stored pointer
/// is loaded, cast (via the default address space) into the native pointee's
/// address space, and stashed in a fresh local temporary whose address is
/// returned.
Address
CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
                                        const VarDecl *NativeParam,
                                        const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
  // First cast to generic.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      /*AddrSpace=*/0));
  // Cast from generic to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      NativePointeeAddrSpace));
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}

/// Emit a call to \p OutlinedFn, casting every pointer argument to the exact
/// parameter type the callee expects (first to the default address space,
/// then to the target type). For a vararg callee, arguments past the fixed
/// parameter list are forwarded unchanged.
void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> TargetArgs;
  TargetArgs.reserve(Args.size());
  auto *FnType = OutlinedFn.getFunctionType();
  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
      // Variadic tail: pass the remaining args through untouched.
      TargetArgs.append(std::next(Args.begin(), I), Args.end());
      break;
    }
    llvm::Type *TargetType = FnType->getParamType(I);
    llvm::Value *NativeArg = Args[I];
    if (!TargetType->isPointerTy()) {
      TargetArgs.emplace_back(NativeArg);
      continue;
    }
    llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
        NativeArg,
        NativeArg->getType()->getPointerElementType()->getPointerTo());
    TargetArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}

/// Emit function which wraps the outline parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  // Signature: void wrapper(uint16 parallel_level, uint32 thread_id).
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamDecl::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamDecl::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());

  // Ensure we do not inline the function. This is trivially true for the ones
  // passed to __kmpc_fork_call but the ones called in serialized regions
  // could be inlined. This is not perfect but it is closer to the invariant
  // we want, namely, every data environment starts with a new function.
  // TODO: We should pass the if condition to the runtime function and do the
  // handling there. Much cleaner code.
  Fn->addFnAttr(llvm::Attribute::NoInline);

  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getBeginLoc(), D.getBeginLoc());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  // The outlined function's first two args: &thread_id and &zero (bound tid).
  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
  Args.emplace_back(ZeroAddr.getPointer());

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  Address GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
                      DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  Address SharedArgListAddress = Address::invalid();
  if (CS.capture_size() > 0 ||
      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    SharedArgListAddress = CGF.EmitLoadOfPointer(
        GlobalArgs, CGF.getContext()
                        .getPointerType(CGF.getContext().getPointerType(
                            CGF.getContext().VoidPtrTy))
                        .castAs<PointerType>());
  }
  unsigned Idx = 0;
  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    // The first two shared slots hold the loop lower/upper bounds.
    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *LB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
    Args.emplace_back(LB);
    ++Idx;
    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *UB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
    Args.emplace_back(UB);
    ++Idx;
  }
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    // Remaining slots: one pointer per captured variable, in capture order.
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                              /*Volatile=*/false,
                                              CGFContext.getPointerType(ElemTy),
                                              CI->getLocation());
      // By-copy captures of non-pointer values are passed as uintptr_t.
      if (CI->capturesVariableByCopy() &&
          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
                              CI->getLocation());
      }
      Args.emplace_back(Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}

/// Record and (unless delayed) emit globalization for the variables of \p D
/// that escape their declaration context, so they can be shared on the GPU.
void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
                                            const Decl *D) {
  // Globalization is only performed in generic data-sharing mode.
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  assert((!TeamAndReductions.first || TeamAndReductions.first ==
              D) &&
         "Team is set but not processed.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    // For OpenMP captured regions, globalization is delayed; in SPMD mode
    // it is not needed at all, so bail out early.
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  // Collect the variables escaping this context; they need globalized storage.
  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord =
      VarChecker.getGlobalizedRecord(IsInTTDRegion);
  // The pending team reductions have now been consumed.
  TeamAndReductions.first = nullptr;
  TeamAndReductions.second.clear();
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
    return;
  // Register the per-function globalization info.
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      std::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
  }
  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
    // Also build a secondary record/mapping computed as if IsInTTDRegion
    // were true.
    CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
    VarChecker.Visit(Body);
    I->getSecond().SecondaryGlobalRecord =
        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
    I->getSecond().SecondaryLocalVarData.emplace();
    DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
    for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
      assert(VD->isCanonicalDecl() && "Expected canonical declaration");
      const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
      Data.insert(
          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
    }
  }
  if (!NeedToDelayGlobalization) {
    // Emit the globalization prolog now, and push a cleanup so the matching
    // epilog runs on both normal and EH exits from the function.
    emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}

/// Return the address to use for local variable \p VD, honoring both
/// 'omp allocate' allocators and globalized (escaped) variables; returns
/// Address::invalid() when default local allocation should be used.
Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                      const VarDecl *VD) {
  if (VD &&
      VD->hasAttr<OMPAllocateDeclAttr>()) {
    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
    // Map the OpenMP allocator to a CUDA language address space.
    auto AS = LangAS::Default;
    switch (A->getAllocatorType()) {
    // Use the default allocator here as by default local vars are
    // threadlocal.
    case OMPAllocateDeclAttr::OMPNullMemAlloc:
    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
      // Follow the user decision - use default allocation.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      // TODO: implement support for user-defined allocators.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPConstMemAlloc:
      AS = LangAS::cuda_constant;
      break;
    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
      AS = LangAS::cuda_shared;
      break;
    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
      break;
    }
    // Materialize the variable as a zero-initialized internal global in the
    // chosen address space, then cast the pointer back to the variable's
    // declared address space.
    llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), VarTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
        VD->getName(),
        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(AS));
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    GV->setAlignment(Align.getAsAlign());
    return Address(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
                    VD->getType().getAddressSpace()))),
        Align);
  }

  // Globalized storage is only used in generic data-sharing mode.
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
    return Address::invalid();

  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  // Direct hit: the variable itself was globalized.
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  // Otherwise check OMPReferencedVar attributes: the variable may alias a
  // globalized variable it references.
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
         E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }

  return Address::invalid();
}

/// Drop the globalization bookkeeping for \p CGF's function, then defer to
/// the base runtime.
void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}

/// Default 'dist_schedule' for GPU loops: in SPMD mode, static with a chunk
/// equal to the GPU thread count; otherwise fall back to the host default.
void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPDistScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    ScheduleKind = OMPC_DIST_SCHEDULE_static;
    // Chunk = number of GPU threads, converted to the iteration type.
    Chunk = CGF.EmitScalarConversion(
        RT.getGPUNumThreads(CGF),
        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
        S.getIterationVariable()->getType(), S.getBeginLoc());
    return;
  }
  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
      CGF, S, ScheduleKind, Chunk);
}

/// Default 'schedule' for GPU worksharing loops: static with chunk size 1.
void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPScheduleClauseKind &ScheduleKind,
    const Expr *&ChunkExpr) const {
  ScheduleKind = OMPC_SCHEDULE_static;
  // Chunk size is 1 in this case.
  llvm::APInt ChunkSize(32, 1);
  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
      CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
      SourceLocation());
}

/// For lambdas captured in a target region, rewrite the lambda object's
/// 'this' field and each by-reference capture field to point at the
/// device-side data, so calls to the lambda on the device see mapped values.
void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         " Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    // Only lambda objects are of interest here.
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    if (!RD || !RD->isLambda())
      continue;
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
    llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      // Overwrite the captured 'this' with the current CXX 'this'.
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
    }
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const VarDecl *VD = LC.getCapturedVar();
      if (!CS->capturesVariable(VD))
        continue;
      auto It = Captures.find(VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      // Store the local (device) address of the captured variable into the
      // corresponding lambda field.
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(VD);
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
                                               VD->getType().getCanonicalType())
                     .getAddress(CGF);
      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
    }
  }
}

/// Firstprivates default to CUDA constant memory on the GPU.
unsigned CGOpenMPRuntimeGPU::getDefaultFirstprivateAddressSpace() const {
  return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
}

/// If \p VD carries an 'omp allocate' allocator, report the language address
/// space the variable's static storage should use via \p AS and return true.
bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
                                                          LangAS &AS) {
  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
    return false;
  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
  switch(A->getAllocatorType()) {
  case OMPAllocateDeclAttr::OMPNullMemAlloc:
  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
  // Not supported, fallback to the default mem space.
  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
    AS = LangAS::Default;
    return true;
  case OMPAllocateDeclAttr::OMPConstMemAlloc:
    AS = LangAS::cuda_constant;
    return true;
  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
    AS = LangAS::cuda_shared;
    return true;
  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
    llvm_unreachable("Expected predefined allocator for the variables with the "
                     "static storage.");
  }
  return false;
}

// Get current CudaArch and ignore any unknown values
static CudaArch getCudaArch(CodeGenModule &CGM) {
  // Only meaningful when targeting PTX.
  if (!CGM.getTarget().hasFeature("ptx"))
    return CudaArch::UNKNOWN;
  for (const
auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) { 4419 1.1 joerg if (Feature.getValue()) { 4420 1.1 joerg CudaArch Arch = StringToCudaArch(Feature.getKey()); 4421 1.1 joerg if (Arch != CudaArch::UNKNOWN) 4422 1.1 joerg return Arch; 4423 1.1 joerg } 4424 1.1 joerg } 4425 1.1 joerg return CudaArch::UNKNOWN; 4426 1.1 joerg } 4427 1.1 joerg 4428 1.1 joerg /// Check to see if target architecture supports unified addressing which is 4429 1.1 joerg /// a restriction for OpenMP requires clause "unified_shared_memory". 4430 1.1 joerg void CGOpenMPRuntimeGPU::processRequiresDirective( 4431 1.1 joerg const OMPRequiresDecl *D) { 4432 1.1 joerg for (const OMPClause *Clause : D->clauselists()) { 4433 1.1 joerg if (Clause->getClauseKind() == OMPC_unified_shared_memory) { 4434 1.1 joerg CudaArch Arch = getCudaArch(CGM); 4435 1.1 joerg switch (Arch) { 4436 1.1 joerg case CudaArch::SM_20: 4437 1.1 joerg case CudaArch::SM_21: 4438 1.1 joerg case CudaArch::SM_30: 4439 1.1 joerg case CudaArch::SM_32: 4440 1.1 joerg case CudaArch::SM_35: 4441 1.1 joerg case CudaArch::SM_37: 4442 1.1 joerg case CudaArch::SM_50: 4443 1.1 joerg case CudaArch::SM_52: 4444 1.1 joerg case CudaArch::SM_53: { 4445 1.1 joerg SmallString<256> Buffer; 4446 1.1 joerg llvm::raw_svector_ostream Out(Buffer); 4447 1.1 joerg Out << "Target architecture " << CudaArchToString(Arch) 4448 1.1 joerg << " does not support unified addressing"; 4449 1.1 joerg CGM.Error(Clause->getBeginLoc(), Out.str()); 4450 1.1 joerg return; 4451 1.1 joerg } 4452 1.1 joerg case CudaArch::SM_60: 4453 1.1 joerg case CudaArch::SM_61: 4454 1.1 joerg case CudaArch::SM_62: 4455 1.1 joerg case CudaArch::SM_70: 4456 1.1 joerg case CudaArch::SM_72: 4457 1.1 joerg case CudaArch::SM_75: 4458 1.1 joerg case CudaArch::SM_80: 4459 1.1 joerg case CudaArch::SM_86: 4460 1.1 joerg case CudaArch::GFX600: 4461 1.1 joerg case CudaArch::GFX601: 4462 1.1 joerg case CudaArch::GFX602: 4463 1.1 joerg case CudaArch::GFX700: 4464 1.1 joerg case 
CudaArch::GFX701: 4465 1.1 joerg case CudaArch::GFX702: 4466 1.1 joerg case CudaArch::GFX703: 4467 1.1 joerg case CudaArch::GFX704: 4468 1.1 joerg case CudaArch::GFX705: 4469 1.1 joerg case CudaArch::GFX801: 4470 1.1 joerg case CudaArch::GFX802: 4471 1.1 joerg case CudaArch::GFX803: 4472 1.1 joerg case CudaArch::GFX805: 4473 1.1 joerg case CudaArch::GFX810: 4474 1.1 joerg case CudaArch::GFX900: 4475 1.1 joerg case CudaArch::GFX902: 4476 1.1 joerg case CudaArch::GFX904: 4477 1.1 joerg case CudaArch::GFX906: 4478 1.1 joerg case CudaArch::GFX908: 4479 1.1 joerg case CudaArch::GFX909: 4480 1.1 joerg case CudaArch::GFX90a: 4481 1.1 joerg case CudaArch::GFX90c: 4482 1.1 joerg case CudaArch::GFX1010: 4483 1.1 joerg case CudaArch::GFX1011: 4484 1.1 joerg case CudaArch::GFX1012: 4485 1.1 joerg case CudaArch::GFX1030: 4486 1.1 joerg case CudaArch::GFX1031: 4487 1.1 joerg case CudaArch::GFX1032: 4488 1.1 joerg case CudaArch::GFX1033: 4489 1.1 joerg case CudaArch::GFX1034: 4490 1.1 joerg case CudaArch::UNUSED: 4491 1.1 joerg case CudaArch::UNKNOWN: 4492 1.1 joerg break; 4493 1.1 joerg case CudaArch::LAST: 4494 1.1 joerg llvm_unreachable("Unexpected Cuda arch."); 4495 1.1 joerg } 4496 1.1 joerg } 4497 1.1 joerg } 4498 1.1 joerg CGOpenMPRuntime::processRequiresDirective(D); 4499 1.1 joerg } 4500 1.1 joerg 4501 1.1 joerg /// Get number of SMs and number of blocks per SM. 
4502 1.1 joerg static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) { 4503 1.1 joerg std::pair<unsigned, unsigned> Data; 4504 1.1 joerg if (CGM.getLangOpts().OpenMPCUDANumSMs) 4505 1.1 joerg Data.first = CGM.getLangOpts().OpenMPCUDANumSMs; 4506 1.1 joerg if (CGM.getLangOpts().OpenMPCUDABlocksPerSM) 4507 1.1 joerg Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM; 4508 1.1 joerg if (Data.first && Data.second) 4509 1.1 joerg return Data; 4510 1.1 joerg switch (getCudaArch(CGM)) { 4511 1.1 joerg case CudaArch::SM_20: 4512 1.1 joerg case CudaArch::SM_21: 4513 1.1 joerg case CudaArch::SM_30: 4514 1.1 joerg case CudaArch::SM_32: 4515 1.1 joerg case CudaArch::SM_35: 4516 1.1 joerg case CudaArch::SM_37: 4517 1.1 joerg case CudaArch::SM_50: 4518 1.1 joerg case CudaArch::SM_52: 4519 1.1 joerg case CudaArch::SM_53: 4520 1.1 joerg return {16, 16}; 4521 1.1 joerg case CudaArch::SM_60: 4522 1.1 joerg case CudaArch::SM_61: 4523 1.1 joerg case CudaArch::SM_62: 4524 1.1 joerg return {56, 32}; 4525 1.1 joerg case CudaArch::SM_70: 4526 1.1 joerg case CudaArch::SM_72: 4527 1.1 joerg case CudaArch::SM_75: 4528 1.1 joerg case CudaArch::SM_80: 4529 1.1 joerg case CudaArch::SM_86: 4530 1.1 joerg return {84, 32}; 4531 1.1 joerg case CudaArch::GFX600: 4532 1.1 joerg case CudaArch::GFX601: 4533 1.1 joerg case CudaArch::GFX602: 4534 1.1 joerg case CudaArch::GFX700: 4535 1.1 joerg case CudaArch::GFX701: 4536 1.1 joerg case CudaArch::GFX702: 4537 1.1 joerg case CudaArch::GFX703: 4538 1.1 joerg case CudaArch::GFX704: 4539 1.1 joerg case CudaArch::GFX705: 4540 1.1 joerg case CudaArch::GFX801: 4541 1.1 joerg case CudaArch::GFX802: 4542 1.1 joerg case CudaArch::GFX803: 4543 1.1 joerg case CudaArch::GFX805: 4544 1.1 joerg case CudaArch::GFX810: 4545 1.1 joerg case CudaArch::GFX900: 4546 1.1 joerg case CudaArch::GFX902: 4547 1.1 joerg case CudaArch::GFX904: 4548 1.1 joerg case CudaArch::GFX906: 4549 1.1 joerg case CudaArch::GFX908: 4550 1.1 joerg case CudaArch::GFX909: 4551 
1.1 joerg case CudaArch::GFX90a: 4552 1.1 joerg case CudaArch::GFX90c: 4553 1.1 joerg case CudaArch::GFX1010: 4554 1.1 joerg case CudaArch::GFX1011: 4555 1.1 joerg case CudaArch::GFX1012: 4556 1.1 joerg case CudaArch::GFX1030: 4557 1.1 joerg case CudaArch::GFX1031: 4558 1.1 joerg case CudaArch::GFX1032: 4559 1.1 joerg case CudaArch::GFX1033: 4560 1.1 joerg case CudaArch::GFX1034: 4561 1.1 joerg case CudaArch::UNUSED: 4562 1.1 joerg case CudaArch::UNKNOWN: 4563 1.1 joerg break; 4564 1.1 joerg case CudaArch::LAST: 4565 1.1 joerg llvm_unreachable("Unexpected Cuda arch."); 4566 1.1 joerg } 4567 1.1 joerg llvm_unreachable("Unexpected NVPTX target without ptx feature."); 4568 1.1 joerg } 4569 1.1 joerg 4570 1.1 joerg void CGOpenMPRuntimeGPU::clear() { 4571 1.1 joerg if (!GlobalizedRecords.empty() && 4572 1.1 joerg !CGM.getLangOpts().OpenMPCUDATargetParallel) { 4573 1.1 joerg ASTContext &C = CGM.getContext(); 4574 1.1 joerg llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs; 4575 1.1 joerg llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs; 4576 1.1 joerg RecordDecl *StaticRD = C.buildImplicitRecord( 4577 1.1 joerg "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); 4578 1.1 joerg StaticRD->startDefinition(); 4579 1.1 joerg RecordDecl *SharedStaticRD = C.buildImplicitRecord( 4580 1.1 joerg "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); 4581 1.1 joerg SharedStaticRD->startDefinition(); 4582 1.1 joerg for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) { 4583 1.1 joerg if (Records.Records.empty()) 4584 1.1 joerg continue; 4585 1.1 joerg unsigned Size = 0; 4586 1.1 joerg unsigned RecAlignment = 0; 4587 1.1 joerg for (const RecordDecl *RD : Records.Records) { 4588 1.1 joerg QualType RDTy = C.getRecordType(RD); 4589 1.1 joerg unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity(); 4590 1.1 joerg RecAlignment = std::max(RecAlignment, Alignment); 4591 1.1 joerg unsigned RecSize = 
C.getTypeSizeInChars(RDTy).getQuantity(); 4592 1.1 joerg Size = 4593 1.1 joerg llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment); 4594 1.1 joerg } 4595 1.1 joerg Size = llvm::alignTo(Size, RecAlignment); 4596 1.1 joerg llvm::APInt ArySize(/*numBits=*/64, Size); 4597 1.1 joerg QualType SubTy = C.getConstantArrayType( 4598 1.1 joerg C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); 4599 1.1 joerg const bool UseSharedMemory = Size <= SharedMemorySize; 4600 1.1 joerg auto *Field = 4601 1.1 joerg FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD, 4602 1.1 joerg SourceLocation(), SourceLocation(), nullptr, SubTy, 4603 1.1 joerg C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), 4604 1.1 joerg /*BW=*/nullptr, /*Mutable=*/false, 4605 1.1 joerg /*InitStyle=*/ICIS_NoInit); 4606 1.1 joerg Field->setAccess(AS_public); 4607 1.1 joerg if (UseSharedMemory) { 4608 1.1 joerg SharedStaticRD->addDecl(Field); 4609 1.1 joerg SharedRecs.push_back(&Records); 4610 1.1 joerg } else { 4611 1.1 joerg StaticRD->addDecl(Field); 4612 1.1 joerg GlobalRecs.push_back(&Records); 4613 1.1 joerg } 4614 1.1 joerg Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size)); 4615 1.1 joerg Records.UseSharedMemory->setInitializer( 4616 1.1 joerg llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0)); 4617 1.1 joerg } 4618 1.1 joerg // Allocate SharedMemorySize buffer for the shared memory. 4619 1.1 joerg // FIXME: nvlink does not handle weak linkage correctly (object with the 4620 1.1 joerg // different size are reported as erroneous). 4621 1.1 joerg // Restore this code as sson as nvlink is fixed. 
4622 1.1 joerg if (!SharedStaticRD->field_empty()) { 4623 1.1 joerg llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize); 4624 1.1 joerg QualType SubTy = C.getConstantArrayType( 4625 1.1 joerg C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); 4626 1.1 joerg auto *Field = FieldDecl::Create( 4627 1.1 joerg C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy, 4628 1.1 joerg C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), 4629 1.1 joerg /*BW=*/nullptr, /*Mutable=*/false, 4630 1.1 joerg /*InitStyle=*/ICIS_NoInit); 4631 1.1 joerg Field->setAccess(AS_public); 4632 1.1 joerg SharedStaticRD->addDecl(Field); 4633 1.1 joerg } 4634 1.1 joerg SharedStaticRD->completeDefinition(); 4635 1.1 joerg if (!SharedStaticRD->field_empty()) { 4636 1.1 joerg QualType StaticTy = C.getRecordType(SharedStaticRD); 4637 1.1 joerg llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy); 4638 1.1 joerg auto *GV = new llvm::GlobalVariable( 4639 1.1 joerg CGM.getModule(), LLVMStaticTy, 4640 1.1 joerg /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, 4641 1.1 joerg llvm::UndefValue::get(LLVMStaticTy), 4642 1.1 joerg "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr, 4643 1.1 joerg llvm::GlobalValue::NotThreadLocal, 4644 1.1 joerg C.getTargetAddressSpace(LangAS::cuda_shared)); 4645 1.1 joerg auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( 4646 1.1 joerg GV, CGM.VoidPtrTy); 4647 1.1 joerg for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) { 4648 1.1 joerg Rec->Buffer->replaceAllUsesWith(Replacement); 4649 1.1 joerg Rec->Buffer->eraseFromParent(); 4650 1.1 joerg } 4651 1.1 joerg } 4652 1.1 joerg StaticRD->completeDefinition(); 4653 1.1 joerg if (!StaticRD->field_empty()) { 4654 1.1 joerg QualType StaticTy = C.getRecordType(StaticRD); 4655 1.1 joerg std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM); 4656 1.1 joerg llvm::APInt Size1(32, SMsBlockPerSM.second); 4657 1.1 joerg QualType 
Arr1Ty = 4658 1.1 joerg C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal, 4659 1.1 joerg /*IndexTypeQuals=*/0); 4660 1.1 joerg llvm::APInt Size2(32, SMsBlockPerSM.first); 4661 1.1 joerg QualType Arr2Ty = 4662 1.1 joerg C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal, 4663 1.1 joerg /*IndexTypeQuals=*/0); 4664 1.1 joerg llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty); 4665 1.1 joerg // FIXME: nvlink does not handle weak linkage correctly (object with the 4666 1.1 joerg // different size are reported as erroneous). 4667 1.1 joerg // Restore CommonLinkage as soon as nvlink is fixed. 4668 1.1 joerg auto *GV = new llvm::GlobalVariable( 4669 1.1 joerg CGM.getModule(), LLVMArr2Ty, 4670 1.1 joerg /*isConstant=*/false, llvm::GlobalValue::InternalLinkage, 4671 1.1 joerg llvm::Constant::getNullValue(LLVMArr2Ty), 4672 1.1 joerg "_openmp_static_glob_rd_$_"); 4673 1.1 joerg auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( 4674 1.1 joerg GV, CGM.VoidPtrTy); 4675 1.1 joerg for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) { 4676 1.1 joerg Rec->Buffer->replaceAllUsesWith(Replacement); 4677 1.1 joerg Rec->Buffer->eraseFromParent(); 4678 1.1 joerg } 4679 1.1 joerg } 4680 1.1 joerg } 4681 1.1 joerg if (!TeamsReductions.empty()) { 4682 1.1 joerg ASTContext &C = CGM.getContext(); 4683 1.1 joerg RecordDecl *StaticRD = C.buildImplicitRecord( 4684 1.1 joerg "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union); 4685 1.1 joerg StaticRD->startDefinition(); 4686 1.1 joerg for (const RecordDecl *TeamReductionRec : TeamsReductions) { 4687 1.1 joerg QualType RecTy = C.getRecordType(TeamReductionRec); 4688 1.1 joerg auto *Field = FieldDecl::Create( 4689 1.1 joerg C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy, 4690 1.1 joerg C.getTrivialTypeSourceInfo(RecTy, SourceLocation()), 4691 1.1 joerg /*BW=*/nullptr, /*Mutable=*/false, 4692 1.1 joerg /*InitStyle=*/ICIS_NoInit); 4693 1.1 joerg 
Field->setAccess(AS_public); 4694 1.1 joerg StaticRD->addDecl(Field); 4695 1.1 joerg } 4696 1.1 joerg StaticRD->completeDefinition(); 4697 1.1 joerg QualType StaticTy = C.getRecordType(StaticRD); 4698 1.1 joerg llvm::Type *LLVMReductionsBufferTy = 4699 1.1 joerg CGM.getTypes().ConvertTypeForMem(StaticTy); 4700 1.1 joerg // FIXME: nvlink does not handle weak linkage correctly (object with the 4701 1.1 joerg // different size are reported as erroneous). 4702 1.1 joerg // Restore CommonLinkage as soon as nvlink is fixed. 4703 1.1 joerg auto *GV = new llvm::GlobalVariable( 4704 1.1 joerg CGM.getModule(), LLVMReductionsBufferTy, 4705 1.1 joerg /*isConstant=*/false, llvm::GlobalValue::InternalLinkage, 4706 1.1 joerg llvm::Constant::getNullValue(LLVMReductionsBufferTy), 4707 1.1 joerg "_openmp_teams_reductions_buffer_$_"); 4708 1.1 joerg KernelTeamsReductionPtr->setInitializer( 4709 1.1 joerg llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, 4710 1.1 joerg CGM.VoidPtrTy)); 4711 1.1 joerg } 4712 1.1 joerg CGOpenMPRuntime::clear(); 4713 1.1 joerg } 4714