CGOpenMPRuntimeGPU.cpp revision 1.1 1 1.1 joerg //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2 1.1 joerg //
3 1.1 joerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 1.1 joerg // See https://llvm.org/LICENSE.txt for license information.
5 1.1 joerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 1.1 joerg //
7 1.1 joerg //===----------------------------------------------------------------------===//
8 1.1 joerg //
9 1.1 joerg // This provides a generalized class for OpenMP runtime code generation
10 1.1 joerg // specialized by GPU targets NVPTX and AMDGCN.
11 1.1 joerg //
12 1.1 joerg //===----------------------------------------------------------------------===//
13 1.1 joerg
14 1.1 joerg #include "CGOpenMPRuntimeGPU.h"
15 1.1 joerg #include "CGOpenMPRuntimeNVPTX.h"
16 1.1 joerg #include "CodeGenFunction.h"
17 1.1 joerg #include "clang/AST/Attr.h"
18 1.1 joerg #include "clang/AST/DeclOpenMP.h"
19 1.1 joerg #include "clang/AST/StmtOpenMP.h"
20 1.1 joerg #include "clang/AST/StmtVisitor.h"
21 1.1 joerg #include "clang/Basic/Cuda.h"
22 1.1 joerg #include "llvm/ADT/SmallPtrSet.h"
23 1.1 joerg #include "llvm/Frontend/OpenMP/OMPGridValues.h"
24 1.1 joerg #include "llvm/IR/IntrinsicsNVPTX.h"
25 1.1 joerg
26 1.1 joerg using namespace clang;
27 1.1 joerg using namespace CodeGen;
28 1.1 joerg using namespace llvm::omp;
29 1.1 joerg
30 1.1 joerg namespace {
31 1.1 joerg /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32 1.1 joerg class NVPTXActionTy final : public PrePostActionTy {
33 1.1 joerg llvm::FunctionCallee EnterCallee = nullptr;
34 1.1 joerg ArrayRef<llvm::Value *> EnterArgs;
35 1.1 joerg llvm::FunctionCallee ExitCallee = nullptr;
36 1.1 joerg ArrayRef<llvm::Value *> ExitArgs;
37 1.1 joerg bool Conditional = false;
38 1.1 joerg llvm::BasicBlock *ContBlock = nullptr;
39 1.1 joerg
40 1.1 joerg public:
41 1.1 joerg NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42 1.1 joerg ArrayRef<llvm::Value *> EnterArgs,
43 1.1 joerg llvm::FunctionCallee ExitCallee,
44 1.1 joerg ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45 1.1 joerg : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46 1.1 joerg ExitArgs(ExitArgs), Conditional(Conditional) {}
47 1.1 joerg void Enter(CodeGenFunction &CGF) override {
48 1.1 joerg llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49 1.1 joerg if (Conditional) {
50 1.1 joerg llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51 1.1 joerg auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52 1.1 joerg ContBlock = CGF.createBasicBlock("omp_if.end");
53 1.1 joerg // Generate the branch (If-stmt)
54 1.1 joerg CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55 1.1 joerg CGF.EmitBlock(ThenBlock);
56 1.1 joerg }
57 1.1 joerg }
58 1.1 joerg void Done(CodeGenFunction &CGF) {
59 1.1 joerg // Emit the rest of blocks/branches
60 1.1 joerg CGF.EmitBranch(ContBlock);
61 1.1 joerg CGF.EmitBlock(ContBlock, true);
62 1.1 joerg }
63 1.1 joerg void Exit(CodeGenFunction &CGF) override {
64 1.1 joerg CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65 1.1 joerg }
66 1.1 joerg };
67 1.1 joerg
68 1.1 joerg /// A class to track the execution mode when codegening directives within
69 1.1 joerg /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70 1.1 joerg /// to the target region and used by containing directives such as 'parallel'
71 1.1 joerg /// to emit optimized code.
72 1.1 joerg class ExecutionRuntimeModesRAII {
73 1.1 joerg private:
74 1.1 joerg CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75 1.1 joerg CGOpenMPRuntimeGPU::EM_Unknown;
76 1.1 joerg CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77 1.1 joerg bool SavedRuntimeMode = false;
78 1.1 joerg bool *RuntimeMode = nullptr;
79 1.1 joerg
80 1.1 joerg public:
81 1.1 joerg /// Constructor for Non-SPMD mode.
82 1.1 joerg ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
83 1.1 joerg : ExecMode(ExecMode) {
84 1.1 joerg SavedExecMode = ExecMode;
85 1.1 joerg ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
86 1.1 joerg }
87 1.1 joerg /// Constructor for SPMD mode.
88 1.1 joerg ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
89 1.1 joerg bool &RuntimeMode, bool FullRuntimeMode)
90 1.1 joerg : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
91 1.1 joerg SavedExecMode = ExecMode;
92 1.1 joerg SavedRuntimeMode = RuntimeMode;
93 1.1 joerg ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
94 1.1 joerg RuntimeMode = FullRuntimeMode;
95 1.1 joerg }
96 1.1 joerg ~ExecutionRuntimeModesRAII() {
97 1.1 joerg ExecMode = SavedExecMode;
98 1.1 joerg if (RuntimeMode)
99 1.1 joerg *RuntimeMode = SavedRuntimeMode;
100 1.1 joerg }
101 1.1 joerg };
102 1.1 joerg
103 1.1 joerg /// GPU Configuration: This information can be derived from cuda registers,
104 1.1 joerg /// however, providing compile time constants helps generate more efficient
105 1.1 joerg /// code. For all practical purposes this is fine because the configuration
106 1.1 joerg /// is the same for all known NVPTX architectures.
107 1.1 joerg enum MachineConfiguration : unsigned {
108 1.1 joerg /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
109 1.1 joerg /// specific Grid Values like GV_Warp_Size, GV_Warp_Size_Log2,
110 1.1 joerg /// and GV_Warp_Size_Log2_Mask.
111 1.1 joerg
112 1.1 joerg /// Global memory alignment for performance.
113 1.1 joerg GlobalMemoryAlignment = 128,
114 1.1 joerg
115 1.1 joerg /// Maximal size of the shared memory buffer.
116 1.1 joerg SharedMemorySize = 128,
117 1.1 joerg };
118 1.1 joerg
119 1.1 joerg static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
120 1.1 joerg RefExpr = RefExpr->IgnoreParens();
121 1.1 joerg if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
122 1.1 joerg const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
123 1.1 joerg while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
124 1.1 joerg Base = TempASE->getBase()->IgnoreParenImpCasts();
125 1.1 joerg RefExpr = Base;
126 1.1 joerg } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
127 1.1 joerg const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
128 1.1 joerg while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
129 1.1 joerg Base = TempOASE->getBase()->IgnoreParenImpCasts();
130 1.1 joerg while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
131 1.1 joerg Base = TempASE->getBase()->IgnoreParenImpCasts();
132 1.1 joerg RefExpr = Base;
133 1.1 joerg }
134 1.1 joerg RefExpr = RefExpr->IgnoreParenImpCasts();
135 1.1 joerg if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
136 1.1 joerg return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
137 1.1 joerg const auto *ME = cast<MemberExpr>(RefExpr);
138 1.1 joerg return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
139 1.1 joerg }
140 1.1 joerg
141 1.1 joerg
142 1.1 joerg static RecordDecl *buildRecordForGlobalizedVars(
143 1.1 joerg ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
144 1.1 joerg ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
145 1.1 joerg llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
146 1.1 joerg &MappedDeclsFields, int BufSize) {
147 1.1 joerg using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
148 1.1 joerg if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
149 1.1 joerg return nullptr;
150 1.1 joerg SmallVector<VarsDataTy, 4> GlobalizedVars;
151 1.1 joerg for (const ValueDecl *D : EscapedDecls)
152 1.1 joerg GlobalizedVars.emplace_back(
153 1.1 joerg CharUnits::fromQuantity(std::max(
154 1.1 joerg C.getDeclAlign(D).getQuantity(),
155 1.1 joerg static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
156 1.1 joerg D);
157 1.1 joerg for (const ValueDecl *D : EscapedDeclsForTeams)
158 1.1 joerg GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
159 1.1 joerg llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
160 1.1 joerg return L.first > R.first;
161 1.1 joerg });
162 1.1 joerg
163 1.1 joerg // Build struct _globalized_locals_ty {
164 1.1 joerg // /* globalized vars */[WarSize] align (max(decl_align,
165 1.1 joerg // GlobalMemoryAlignment))
166 1.1 joerg // /* globalized vars */ for EscapedDeclsForTeams
167 1.1 joerg // };
168 1.1 joerg RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
169 1.1 joerg GlobalizedRD->startDefinition();
170 1.1 joerg llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
171 1.1 joerg EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
172 1.1 joerg for (const auto &Pair : GlobalizedVars) {
173 1.1 joerg const ValueDecl *VD = Pair.second;
174 1.1 joerg QualType Type = VD->getType();
175 1.1 joerg if (Type->isLValueReferenceType())
176 1.1 joerg Type = C.getPointerType(Type.getNonReferenceType());
177 1.1 joerg else
178 1.1 joerg Type = Type.getNonReferenceType();
179 1.1 joerg SourceLocation Loc = VD->getLocation();
180 1.1 joerg FieldDecl *Field;
181 1.1 joerg if (SingleEscaped.count(VD)) {
182 1.1 joerg Field = FieldDecl::Create(
183 1.1 joerg C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
184 1.1 joerg C.getTrivialTypeSourceInfo(Type, SourceLocation()),
185 1.1 joerg /*BW=*/nullptr, /*Mutable=*/false,
186 1.1 joerg /*InitStyle=*/ICIS_NoInit);
187 1.1 joerg Field->setAccess(AS_public);
188 1.1 joerg if (VD->hasAttrs()) {
189 1.1 joerg for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
190 1.1 joerg E(VD->getAttrs().end());
191 1.1 joerg I != E; ++I)
192 1.1 joerg Field->addAttr(*I);
193 1.1 joerg }
194 1.1 joerg } else {
195 1.1 joerg llvm::APInt ArraySize(32, BufSize);
196 1.1 joerg Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
197 1.1 joerg 0);
198 1.1 joerg Field = FieldDecl::Create(
199 1.1 joerg C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
200 1.1 joerg C.getTrivialTypeSourceInfo(Type, SourceLocation()),
201 1.1 joerg /*BW=*/nullptr, /*Mutable=*/false,
202 1.1 joerg /*InitStyle=*/ICIS_NoInit);
203 1.1 joerg Field->setAccess(AS_public);
204 1.1 joerg llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
205 1.1 joerg static_cast<CharUnits::QuantityType>(
206 1.1 joerg GlobalMemoryAlignment)));
207 1.1 joerg Field->addAttr(AlignedAttr::CreateImplicit(
208 1.1 joerg C, /*IsAlignmentExpr=*/true,
209 1.1 joerg IntegerLiteral::Create(C, Align,
210 1.1 joerg C.getIntTypeForBitwidth(32, /*Signed=*/0),
211 1.1 joerg SourceLocation()),
212 1.1 joerg {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
213 1.1 joerg }
214 1.1 joerg GlobalizedRD->addDecl(Field);
215 1.1 joerg MappedDeclsFields.try_emplace(VD, Field);
216 1.1 joerg }
217 1.1 joerg GlobalizedRD->completeDefinition();
218 1.1 joerg return GlobalizedRD;
219 1.1 joerg }
220 1.1 joerg
221 1.1 joerg /// Get the list of variables that can escape their declaration context.
222 1.1 joerg class CheckVarsEscapingDeclContext final
223 1.1 joerg : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
224 1.1 joerg CodeGenFunction &CGF;
225 1.1 joerg llvm::SetVector<const ValueDecl *> EscapedDecls;
226 1.1 joerg llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
227 1.1 joerg llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
228 1.1 joerg RecordDecl *GlobalizedRD = nullptr;
229 1.1 joerg llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
230 1.1 joerg bool AllEscaped = false;
231 1.1 joerg bool IsForCombinedParallelRegion = false;
232 1.1 joerg
233 1.1 joerg void markAsEscaped(const ValueDecl *VD) {
234 1.1 joerg // Do not globalize declare target variables.
235 1.1 joerg if (!isa<VarDecl>(VD) ||
236 1.1 joerg OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
237 1.1 joerg return;
238 1.1 joerg VD = cast<ValueDecl>(VD->getCanonicalDecl());
239 1.1 joerg // Use user-specified allocation.
240 1.1 joerg if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
241 1.1 joerg return;
242 1.1 joerg // Variables captured by value must be globalized.
243 1.1 joerg if (auto *CSI = CGF.CapturedStmtInfo) {
244 1.1 joerg if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
245 1.1 joerg // Check if need to capture the variable that was already captured by
246 1.1 joerg // value in the outer region.
247 1.1 joerg if (!IsForCombinedParallelRegion) {
248 1.1 joerg if (!FD->hasAttrs())
249 1.1 joerg return;
250 1.1 joerg const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
251 1.1 joerg if (!Attr)
252 1.1 joerg return;
253 1.1 joerg if (((Attr->getCaptureKind() != OMPC_map) &&
254 1.1 joerg !isOpenMPPrivate(Attr->getCaptureKind())) ||
255 1.1 joerg ((Attr->getCaptureKind() == OMPC_map) &&
256 1.1 joerg !FD->getType()->isAnyPointerType()))
257 1.1 joerg return;
258 1.1 joerg }
259 1.1 joerg if (!FD->getType()->isReferenceType()) {
260 1.1 joerg assert(!VD->getType()->isVariablyModifiedType() &&
261 1.1 joerg "Parameter captured by value with variably modified type");
262 1.1 joerg EscapedParameters.insert(VD);
263 1.1 joerg } else if (!IsForCombinedParallelRegion) {
264 1.1 joerg return;
265 1.1 joerg }
266 1.1 joerg }
267 1.1 joerg }
268 1.1 joerg if ((!CGF.CapturedStmtInfo ||
269 1.1 joerg (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
270 1.1 joerg VD->getType()->isReferenceType())
271 1.1 joerg // Do not globalize variables with reference type.
272 1.1 joerg return;
273 1.1 joerg if (VD->getType()->isVariablyModifiedType())
274 1.1 joerg EscapedVariableLengthDecls.insert(VD);
275 1.1 joerg else
276 1.1 joerg EscapedDecls.insert(VD);
277 1.1 joerg }
278 1.1 joerg
279 1.1 joerg void VisitValueDecl(const ValueDecl *VD) {
280 1.1 joerg if (VD->getType()->isLValueReferenceType())
281 1.1 joerg markAsEscaped(VD);
282 1.1 joerg if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
283 1.1 joerg if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
284 1.1 joerg const bool SavedAllEscaped = AllEscaped;
285 1.1 joerg AllEscaped = VD->getType()->isLValueReferenceType();
286 1.1 joerg Visit(VarD->getInit());
287 1.1 joerg AllEscaped = SavedAllEscaped;
288 1.1 joerg }
289 1.1 joerg }
290 1.1 joerg }
291 1.1 joerg void VisitOpenMPCapturedStmt(const CapturedStmt *S,
292 1.1 joerg ArrayRef<OMPClause *> Clauses,
293 1.1 joerg bool IsCombinedParallelRegion) {
294 1.1 joerg if (!S)
295 1.1 joerg return;
296 1.1 joerg for (const CapturedStmt::Capture &C : S->captures()) {
297 1.1 joerg if (C.capturesVariable() && !C.capturesVariableByCopy()) {
298 1.1 joerg const ValueDecl *VD = C.getCapturedVar();
299 1.1 joerg bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
300 1.1 joerg if (IsCombinedParallelRegion) {
301 1.1 joerg // Check if the variable is privatized in the combined construct and
302 1.1 joerg // those private copies must be shared in the inner parallel
303 1.1 joerg // directive.
304 1.1 joerg IsForCombinedParallelRegion = false;
305 1.1 joerg for (const OMPClause *C : Clauses) {
306 1.1 joerg if (!isOpenMPPrivate(C->getClauseKind()) ||
307 1.1 joerg C->getClauseKind() == OMPC_reduction ||
308 1.1 joerg C->getClauseKind() == OMPC_linear ||
309 1.1 joerg C->getClauseKind() == OMPC_private)
310 1.1 joerg continue;
311 1.1 joerg ArrayRef<const Expr *> Vars;
312 1.1 joerg if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
313 1.1 joerg Vars = PC->getVarRefs();
314 1.1 joerg else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
315 1.1 joerg Vars = PC->getVarRefs();
316 1.1 joerg else
317 1.1 joerg llvm_unreachable("Unexpected clause.");
318 1.1 joerg for (const auto *E : Vars) {
319 1.1 joerg const Decl *D =
320 1.1 joerg cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
321 1.1 joerg if (D == VD->getCanonicalDecl()) {
322 1.1 joerg IsForCombinedParallelRegion = true;
323 1.1 joerg break;
324 1.1 joerg }
325 1.1 joerg }
326 1.1 joerg if (IsForCombinedParallelRegion)
327 1.1 joerg break;
328 1.1 joerg }
329 1.1 joerg }
330 1.1 joerg markAsEscaped(VD);
331 1.1 joerg if (isa<OMPCapturedExprDecl>(VD))
332 1.1 joerg VisitValueDecl(VD);
333 1.1 joerg IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
334 1.1 joerg }
335 1.1 joerg }
336 1.1 joerg }
337 1.1 joerg
338 1.1 joerg void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
339 1.1 joerg assert(!GlobalizedRD &&
340 1.1 joerg "Record for globalized variables is built already.");
341 1.1 joerg ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
342 1.1 joerg unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
343 1.1 joerg if (IsInTTDRegion)
344 1.1 joerg EscapedDeclsForTeams = EscapedDecls.getArrayRef();
345 1.1 joerg else
346 1.1 joerg EscapedDeclsForParallel = EscapedDecls.getArrayRef();
347 1.1 joerg GlobalizedRD = ::buildRecordForGlobalizedVars(
348 1.1 joerg CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
349 1.1 joerg MappedDeclsFields, WarpSize);
350 1.1 joerg }
351 1.1 joerg
352 1.1 joerg public:
353 1.1 joerg CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
354 1.1 joerg ArrayRef<const ValueDecl *> TeamsReductions)
355 1.1 joerg : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
356 1.1 joerg }
357 1.1 joerg virtual ~CheckVarsEscapingDeclContext() = default;
358 1.1 joerg void VisitDeclStmt(const DeclStmt *S) {
359 1.1 joerg if (!S)
360 1.1 joerg return;
361 1.1 joerg for (const Decl *D : S->decls())
362 1.1 joerg if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
363 1.1 joerg VisitValueDecl(VD);
364 1.1 joerg }
365 1.1 joerg void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
366 1.1 joerg if (!D)
367 1.1 joerg return;
368 1.1 joerg if (!D->hasAssociatedStmt())
369 1.1 joerg return;
370 1.1 joerg if (const auto *S =
371 1.1 joerg dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
372 1.1 joerg // Do not analyze directives that do not actually require capturing,
373 1.1 joerg // like `omp for` or `omp simd` directives.
374 1.1 joerg llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
375 1.1 joerg getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
376 1.1 joerg if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
377 1.1 joerg VisitStmt(S->getCapturedStmt());
378 1.1 joerg return;
379 1.1 joerg }
380 1.1 joerg VisitOpenMPCapturedStmt(
381 1.1 joerg S, D->clauses(),
382 1.1 joerg CaptureRegions.back() == OMPD_parallel &&
383 1.1 joerg isOpenMPDistributeDirective(D->getDirectiveKind()));
384 1.1 joerg }
385 1.1 joerg }
386 1.1 joerg void VisitCapturedStmt(const CapturedStmt *S) {
387 1.1 joerg if (!S)
388 1.1 joerg return;
389 1.1 joerg for (const CapturedStmt::Capture &C : S->captures()) {
390 1.1 joerg if (C.capturesVariable() && !C.capturesVariableByCopy()) {
391 1.1 joerg const ValueDecl *VD = C.getCapturedVar();
392 1.1 joerg markAsEscaped(VD);
393 1.1 joerg if (isa<OMPCapturedExprDecl>(VD))
394 1.1 joerg VisitValueDecl(VD);
395 1.1 joerg }
396 1.1 joerg }
397 1.1 joerg }
398 1.1 joerg void VisitLambdaExpr(const LambdaExpr *E) {
399 1.1 joerg if (!E)
400 1.1 joerg return;
401 1.1 joerg for (const LambdaCapture &C : E->captures()) {
402 1.1 joerg if (C.capturesVariable()) {
403 1.1 joerg if (C.getCaptureKind() == LCK_ByRef) {
404 1.1 joerg const ValueDecl *VD = C.getCapturedVar();
405 1.1 joerg markAsEscaped(VD);
406 1.1 joerg if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
407 1.1 joerg VisitValueDecl(VD);
408 1.1 joerg }
409 1.1 joerg }
410 1.1 joerg }
411 1.1 joerg }
412 1.1 joerg void VisitBlockExpr(const BlockExpr *E) {
413 1.1 joerg if (!E)
414 1.1 joerg return;
415 1.1 joerg for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
416 1.1 joerg if (C.isByRef()) {
417 1.1 joerg const VarDecl *VD = C.getVariable();
418 1.1 joerg markAsEscaped(VD);
419 1.1 joerg if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
420 1.1 joerg VisitValueDecl(VD);
421 1.1 joerg }
422 1.1 joerg }
423 1.1 joerg }
424 1.1 joerg void VisitCallExpr(const CallExpr *E) {
425 1.1 joerg if (!E)
426 1.1 joerg return;
427 1.1 joerg for (const Expr *Arg : E->arguments()) {
428 1.1 joerg if (!Arg)
429 1.1 joerg continue;
430 1.1 joerg if (Arg->isLValue()) {
431 1.1 joerg const bool SavedAllEscaped = AllEscaped;
432 1.1 joerg AllEscaped = true;
433 1.1 joerg Visit(Arg);
434 1.1 joerg AllEscaped = SavedAllEscaped;
435 1.1 joerg } else {
436 1.1 joerg Visit(Arg);
437 1.1 joerg }
438 1.1 joerg }
439 1.1 joerg Visit(E->getCallee());
440 1.1 joerg }
441 1.1 joerg void VisitDeclRefExpr(const DeclRefExpr *E) {
442 1.1 joerg if (!E)
443 1.1 joerg return;
444 1.1 joerg const ValueDecl *VD = E->getDecl();
445 1.1 joerg if (AllEscaped)
446 1.1 joerg markAsEscaped(VD);
447 1.1 joerg if (isa<OMPCapturedExprDecl>(VD))
448 1.1 joerg VisitValueDecl(VD);
449 1.1 joerg else if (const auto *VarD = dyn_cast<VarDecl>(VD))
450 1.1 joerg if (VarD->isInitCapture())
451 1.1 joerg VisitValueDecl(VD);
452 1.1 joerg }
453 1.1 joerg void VisitUnaryOperator(const UnaryOperator *E) {
454 1.1 joerg if (!E)
455 1.1 joerg return;
456 1.1 joerg if (E->getOpcode() == UO_AddrOf) {
457 1.1 joerg const bool SavedAllEscaped = AllEscaped;
458 1.1 joerg AllEscaped = true;
459 1.1 joerg Visit(E->getSubExpr());
460 1.1 joerg AllEscaped = SavedAllEscaped;
461 1.1 joerg } else {
462 1.1 joerg Visit(E->getSubExpr());
463 1.1 joerg }
464 1.1 joerg }
465 1.1 joerg void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
466 1.1 joerg if (!E)
467 1.1 joerg return;
468 1.1 joerg if (E->getCastKind() == CK_ArrayToPointerDecay) {
469 1.1 joerg const bool SavedAllEscaped = AllEscaped;
470 1.1 joerg AllEscaped = true;
471 1.1 joerg Visit(E->getSubExpr());
472 1.1 joerg AllEscaped = SavedAllEscaped;
473 1.1 joerg } else {
474 1.1 joerg Visit(E->getSubExpr());
475 1.1 joerg }
476 1.1 joerg }
477 1.1 joerg void VisitExpr(const Expr *E) {
478 1.1 joerg if (!E)
479 1.1 joerg return;
480 1.1 joerg bool SavedAllEscaped = AllEscaped;
481 1.1 joerg if (!E->isLValue())
482 1.1 joerg AllEscaped = false;
483 1.1 joerg for (const Stmt *Child : E->children())
484 1.1 joerg if (Child)
485 1.1 joerg Visit(Child);
486 1.1 joerg AllEscaped = SavedAllEscaped;
487 1.1 joerg }
488 1.1 joerg void VisitStmt(const Stmt *S) {
489 1.1 joerg if (!S)
490 1.1 joerg return;
491 1.1 joerg for (const Stmt *Child : S->children())
492 1.1 joerg if (Child)
493 1.1 joerg Visit(Child);
494 1.1 joerg }
495 1.1 joerg
496 1.1 joerg /// Returns the record that handles all the escaped local variables and used
497 1.1 joerg /// instead of their original storage.
498 1.1 joerg const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
499 1.1 joerg if (!GlobalizedRD)
500 1.1 joerg buildRecordForGlobalizedVars(IsInTTDRegion);
501 1.1 joerg return GlobalizedRD;
502 1.1 joerg }
503 1.1 joerg
504 1.1 joerg /// Returns the field in the globalized record for the escaped variable.
505 1.1 joerg const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
506 1.1 joerg assert(GlobalizedRD &&
507 1.1 joerg "Record for globalized variables must be generated already.");
508 1.1 joerg auto I = MappedDeclsFields.find(VD);
509 1.1 joerg if (I == MappedDeclsFields.end())
510 1.1 joerg return nullptr;
511 1.1 joerg return I->getSecond();
512 1.1 joerg }
513 1.1 joerg
514 1.1 joerg /// Returns the list of the escaped local variables/parameters.
515 1.1 joerg ArrayRef<const ValueDecl *> getEscapedDecls() const {
516 1.1 joerg return EscapedDecls.getArrayRef();
517 1.1 joerg }
518 1.1 joerg
519 1.1 joerg /// Checks if the escaped local variable is actually a parameter passed by
520 1.1 joerg /// value.
521 1.1 joerg const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
522 1.1 joerg return EscapedParameters;
523 1.1 joerg }
524 1.1 joerg
525 1.1 joerg /// Returns the list of the escaped variables with the variably modified
526 1.1 joerg /// types.
527 1.1 joerg ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
528 1.1 joerg return EscapedVariableLengthDecls.getArrayRef();
529 1.1 joerg }
530 1.1 joerg };
531 1.1 joerg } // anonymous namespace
532 1.1 joerg
533 1.1 joerg /// Get the id of the warp in the block.
534 1.1 joerg /// We assume that the warp size is 32, which is always the case
535 1.1 joerg /// on the NVPTX device, to generate more efficient code.
536 1.1 joerg static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
537 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
538 1.1 joerg unsigned LaneIDBits =
539 1.1 joerg CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
540 1.1 joerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
541 1.1 joerg return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
542 1.1 joerg }
543 1.1 joerg
544 1.1 joerg /// Get the id of the current lane in the Warp.
545 1.1 joerg /// We assume that the warp size is 32, which is always the case
546 1.1 joerg /// on the NVPTX device, to generate more efficient code.
547 1.1 joerg static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
548 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
549 1.1 joerg unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
550 1.1 joerg llvm::omp::GV_Warp_Size_Log2_Mask);
551 1.1 joerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
552 1.1 joerg return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
553 1.1 joerg "nvptx_lane_id");
554 1.1 joerg }
555 1.1 joerg
556 1.1 joerg /// Get the value of the thread_limit clause in the teams directive.
557 1.1 joerg /// For the 'generic' execution mode, the runtime encodes thread_limit in
558 1.1 joerg /// the launch parameters, always starting thread_limit+warpSize threads per
559 1.1 joerg /// CTA. The threads in the last warp are reserved for master execution.
560 1.1 joerg /// For the 'spmd' execution mode, all threads in a CTA are part of the team.
561 1.1 joerg static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
562 1.1 joerg bool IsInSPMDExecutionMode = false) {
563 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
564 1.1 joerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
565 1.1 joerg llvm::Value *ThreadLimit = nullptr;
566 1.1 joerg if (IsInSPMDExecutionMode)
567 1.1 joerg ThreadLimit = RT.getGPUNumThreads(CGF);
568 1.1 joerg else {
569 1.1 joerg llvm::Value *GPUNumThreads = RT.getGPUNumThreads(CGF);
570 1.1 joerg llvm::Value *GPUWarpSize = RT.getGPUWarpSize(CGF);
571 1.1 joerg ThreadLimit = Bld.CreateNUWSub(GPUNumThreads, GPUWarpSize, "thread_limit");
572 1.1 joerg }
573 1.1 joerg assert(ThreadLimit != nullptr && "Expected non-null ThreadLimit");
574 1.1 joerg return ThreadLimit;
575 1.1 joerg }
576 1.1 joerg
577 1.1 joerg /// Get the thread id of the OMP master thread.
578 1.1 joerg /// The master thread id is the first thread (lane) of the last warp in the
579 1.1 joerg /// GPU block. Warp size is assumed to be some power of 2.
580 1.1 joerg /// Thread id is 0 indexed.
581 1.1 joerg /// E.g: If NumThreads is 33, master id is 32.
582 1.1 joerg /// If NumThreads is 64, master id is 32.
583 1.1 joerg /// If NumThreads is 1024, master id is 992.
584 1.1 joerg static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
585 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
586 1.1 joerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
587 1.1 joerg llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
588 1.1 joerg // We assume that the warp size is a power of 2.
589 1.1 joerg llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));
590 1.1 joerg
591 1.1 joerg llvm::Value *NumThreadsSubOne = Bld.CreateNUWSub(NumThreads, Bld.getInt32(1));
592 1.1 joerg return Bld.CreateAnd(NumThreadsSubOne, Bld.CreateNot(Mask), "master_tid");
593 1.1 joerg }
594 1.1 joerg
595 1.1 joerg CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState(
596 1.1 joerg CodeGenModule &CGM, SourceLocation Loc)
597 1.1 joerg : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
598 1.1 joerg Loc(Loc) {
599 1.1 joerg createWorkerFunction(CGM);
600 1.1 joerg }
601 1.1 joerg
602 1.1 joerg void CGOpenMPRuntimeGPU::WorkerFunctionState::createWorkerFunction(
603 1.1 joerg CodeGenModule &CGM) {
604 1.1 joerg // Create an worker function with no arguments.
605 1.1 joerg
606 1.1 joerg WorkerFn = llvm::Function::Create(
607 1.1 joerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
608 1.1 joerg /*placeholder=*/"_worker", &CGM.getModule());
609 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
610 1.1 joerg WorkerFn->setDoesNotRecurse();
611 1.1 joerg }
612 1.1 joerg
613 1.1 joerg CGOpenMPRuntimeGPU::ExecutionMode
614 1.1 joerg CGOpenMPRuntimeGPU::getExecutionMode() const {
615 1.1 joerg return CurrentExecutionMode;
616 1.1 joerg }
617 1.1 joerg
618 1.1 joerg static CGOpenMPRuntimeGPU::DataSharingMode
619 1.1 joerg getDataSharingMode(CodeGenModule &CGM) {
620 1.1 joerg return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
621 1.1 joerg : CGOpenMPRuntimeGPU::Generic;
622 1.1 joerg }
623 1.1 joerg
624 1.1 joerg /// Check for inner (nested) SPMD construct, if any
625 1.1 joerg static bool hasNestedSPMDDirective(ASTContext &Ctx,
626 1.1 joerg const OMPExecutableDirective &D) {
627 1.1 joerg const auto *CS = D.getInnermostCapturedStmt();
628 1.1 joerg const auto *Body =
629 1.1 joerg CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
630 1.1 joerg const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
631 1.1 joerg
632 1.1 joerg if (const auto *NestedDir =
633 1.1 joerg dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
634 1.1 joerg OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
635 1.1 joerg switch (D.getDirectiveKind()) {
636 1.1 joerg case OMPD_target:
637 1.1 joerg if (isOpenMPParallelDirective(DKind))
638 1.1 joerg return true;
639 1.1 joerg if (DKind == OMPD_teams) {
640 1.1 joerg Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
641 1.1 joerg /*IgnoreCaptured=*/true);
642 1.1 joerg if (!Body)
643 1.1 joerg return false;
644 1.1 joerg ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
645 1.1 joerg if (const auto *NND =
646 1.1 joerg dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
647 1.1 joerg DKind = NND->getDirectiveKind();
648 1.1 joerg if (isOpenMPParallelDirective(DKind))
649 1.1 joerg return true;
650 1.1 joerg }
651 1.1 joerg }
652 1.1 joerg return false;
653 1.1 joerg case OMPD_target_teams:
654 1.1 joerg return isOpenMPParallelDirective(DKind);
655 1.1 joerg case OMPD_target_simd:
656 1.1 joerg case OMPD_target_parallel:
657 1.1 joerg case OMPD_target_parallel_for:
658 1.1 joerg case OMPD_target_parallel_for_simd:
659 1.1 joerg case OMPD_target_teams_distribute:
660 1.1 joerg case OMPD_target_teams_distribute_simd:
661 1.1 joerg case OMPD_target_teams_distribute_parallel_for:
662 1.1 joerg case OMPD_target_teams_distribute_parallel_for_simd:
663 1.1 joerg case OMPD_parallel:
664 1.1 joerg case OMPD_for:
665 1.1 joerg case OMPD_parallel_for:
666 1.1 joerg case OMPD_parallel_master:
667 1.1 joerg case OMPD_parallel_sections:
668 1.1 joerg case OMPD_for_simd:
669 1.1 joerg case OMPD_parallel_for_simd:
670 1.1 joerg case OMPD_cancel:
671 1.1 joerg case OMPD_cancellation_point:
672 1.1 joerg case OMPD_ordered:
673 1.1 joerg case OMPD_threadprivate:
674 1.1 joerg case OMPD_allocate:
675 1.1 joerg case OMPD_task:
676 1.1 joerg case OMPD_simd:
677 1.1 joerg case OMPD_sections:
678 1.1 joerg case OMPD_section:
679 1.1 joerg case OMPD_single:
680 1.1 joerg case OMPD_master:
681 1.1 joerg case OMPD_critical:
682 1.1 joerg case OMPD_taskyield:
683 1.1 joerg case OMPD_barrier:
684 1.1 joerg case OMPD_taskwait:
685 1.1 joerg case OMPD_taskgroup:
686 1.1 joerg case OMPD_atomic:
687 1.1 joerg case OMPD_flush:
688 1.1 joerg case OMPD_depobj:
689 1.1 joerg case OMPD_scan:
690 1.1 joerg case OMPD_teams:
691 1.1 joerg case OMPD_target_data:
692 1.1 joerg case OMPD_target_exit_data:
693 1.1 joerg case OMPD_target_enter_data:
694 1.1 joerg case OMPD_distribute:
695 1.1 joerg case OMPD_distribute_simd:
696 1.1 joerg case OMPD_distribute_parallel_for:
697 1.1 joerg case OMPD_distribute_parallel_for_simd:
698 1.1 joerg case OMPD_teams_distribute:
699 1.1 joerg case OMPD_teams_distribute_simd:
700 1.1 joerg case OMPD_teams_distribute_parallel_for:
701 1.1 joerg case OMPD_teams_distribute_parallel_for_simd:
702 1.1 joerg case OMPD_target_update:
703 1.1 joerg case OMPD_declare_simd:
704 1.1 joerg case OMPD_declare_variant:
705 1.1 joerg case OMPD_begin_declare_variant:
706 1.1 joerg case OMPD_end_declare_variant:
707 1.1 joerg case OMPD_declare_target:
708 1.1 joerg case OMPD_end_declare_target:
709 1.1 joerg case OMPD_declare_reduction:
710 1.1 joerg case OMPD_declare_mapper:
711 1.1 joerg case OMPD_taskloop:
712 1.1 joerg case OMPD_taskloop_simd:
713 1.1 joerg case OMPD_master_taskloop:
714 1.1 joerg case OMPD_master_taskloop_simd:
715 1.1 joerg case OMPD_parallel_master_taskloop:
716 1.1 joerg case OMPD_parallel_master_taskloop_simd:
717 1.1 joerg case OMPD_requires:
718 1.1 joerg case OMPD_unknown:
719 1.1 joerg default:
720 1.1 joerg llvm_unreachable("Unexpected directive.");
721 1.1 joerg }
722 1.1 joerg }
723 1.1 joerg
724 1.1 joerg return false;
725 1.1 joerg }
726 1.1 joerg
727 1.1 joerg static bool supportsSPMDExecutionMode(ASTContext &Ctx,
728 1.1 joerg const OMPExecutableDirective &D) {
729 1.1 joerg OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
730 1.1 joerg switch (DirectiveKind) {
731 1.1 joerg case OMPD_target:
732 1.1 joerg case OMPD_target_teams:
733 1.1 joerg return hasNestedSPMDDirective(Ctx, D);
734 1.1 joerg case OMPD_target_parallel:
735 1.1 joerg case OMPD_target_parallel_for:
736 1.1 joerg case OMPD_target_parallel_for_simd:
737 1.1 joerg case OMPD_target_teams_distribute_parallel_for:
738 1.1 joerg case OMPD_target_teams_distribute_parallel_for_simd:
739 1.1 joerg case OMPD_target_simd:
740 1.1 joerg case OMPD_target_teams_distribute_simd:
741 1.1 joerg return true;
742 1.1 joerg case OMPD_target_teams_distribute:
743 1.1 joerg return false;
744 1.1 joerg case OMPD_parallel:
745 1.1 joerg case OMPD_for:
746 1.1 joerg case OMPD_parallel_for:
747 1.1 joerg case OMPD_parallel_master:
748 1.1 joerg case OMPD_parallel_sections:
749 1.1 joerg case OMPD_for_simd:
750 1.1 joerg case OMPD_parallel_for_simd:
751 1.1 joerg case OMPD_cancel:
752 1.1 joerg case OMPD_cancellation_point:
753 1.1 joerg case OMPD_ordered:
754 1.1 joerg case OMPD_threadprivate:
755 1.1 joerg case OMPD_allocate:
756 1.1 joerg case OMPD_task:
757 1.1 joerg case OMPD_simd:
758 1.1 joerg case OMPD_sections:
759 1.1 joerg case OMPD_section:
760 1.1 joerg case OMPD_single:
761 1.1 joerg case OMPD_master:
762 1.1 joerg case OMPD_critical:
763 1.1 joerg case OMPD_taskyield:
764 1.1 joerg case OMPD_barrier:
765 1.1 joerg case OMPD_taskwait:
766 1.1 joerg case OMPD_taskgroup:
767 1.1 joerg case OMPD_atomic:
768 1.1 joerg case OMPD_flush:
769 1.1 joerg case OMPD_depobj:
770 1.1 joerg case OMPD_scan:
771 1.1 joerg case OMPD_teams:
772 1.1 joerg case OMPD_target_data:
773 1.1 joerg case OMPD_target_exit_data:
774 1.1 joerg case OMPD_target_enter_data:
775 1.1 joerg case OMPD_distribute:
776 1.1 joerg case OMPD_distribute_simd:
777 1.1 joerg case OMPD_distribute_parallel_for:
778 1.1 joerg case OMPD_distribute_parallel_for_simd:
779 1.1 joerg case OMPD_teams_distribute:
780 1.1 joerg case OMPD_teams_distribute_simd:
781 1.1 joerg case OMPD_teams_distribute_parallel_for:
782 1.1 joerg case OMPD_teams_distribute_parallel_for_simd:
783 1.1 joerg case OMPD_target_update:
784 1.1 joerg case OMPD_declare_simd:
785 1.1 joerg case OMPD_declare_variant:
786 1.1 joerg case OMPD_begin_declare_variant:
787 1.1 joerg case OMPD_end_declare_variant:
788 1.1 joerg case OMPD_declare_target:
789 1.1 joerg case OMPD_end_declare_target:
790 1.1 joerg case OMPD_declare_reduction:
791 1.1 joerg case OMPD_declare_mapper:
792 1.1 joerg case OMPD_taskloop:
793 1.1 joerg case OMPD_taskloop_simd:
794 1.1 joerg case OMPD_master_taskloop:
795 1.1 joerg case OMPD_master_taskloop_simd:
796 1.1 joerg case OMPD_parallel_master_taskloop:
797 1.1 joerg case OMPD_parallel_master_taskloop_simd:
798 1.1 joerg case OMPD_requires:
799 1.1 joerg case OMPD_unknown:
800 1.1 joerg default:
801 1.1 joerg break;
802 1.1 joerg }
803 1.1 joerg llvm_unreachable(
804 1.1 joerg "Unknown programming model for OpenMP directive on NVPTX target.");
805 1.1 joerg }
806 1.1 joerg
807 1.1 joerg /// Check if the directive is loops based and has schedule clause at all or has
808 1.1 joerg /// static scheduling.
809 1.1 joerg static bool hasStaticScheduling(const OMPExecutableDirective &D) {
810 1.1 joerg assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
811 1.1 joerg isOpenMPLoopDirective(D.getDirectiveKind()) &&
812 1.1 joerg "Expected loop-based directive.");
813 1.1 joerg return !D.hasClausesOfKind<OMPOrderedClause>() &&
814 1.1 joerg (!D.hasClausesOfKind<OMPScheduleClause>() ||
815 1.1 joerg llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
816 1.1 joerg [](const OMPScheduleClause *C) {
817 1.1 joerg return C->getScheduleKind() == OMPC_SCHEDULE_static;
818 1.1 joerg }));
819 1.1 joerg }
820 1.1 joerg
821 1.1 joerg /// Check for inner (nested) lightweight runtime construct, if any
822 1.1 joerg static bool hasNestedLightweightDirective(ASTContext &Ctx,
823 1.1 joerg const OMPExecutableDirective &D) {
824 1.1 joerg assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
825 1.1 joerg const auto *CS = D.getInnermostCapturedStmt();
826 1.1 joerg const auto *Body =
827 1.1 joerg CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
828 1.1 joerg const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
829 1.1 joerg
830 1.1 joerg if (const auto *NestedDir =
831 1.1 joerg dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
832 1.1 joerg OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
833 1.1 joerg switch (D.getDirectiveKind()) {
834 1.1 joerg case OMPD_target:
835 1.1 joerg if (isOpenMPParallelDirective(DKind) &&
836 1.1 joerg isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
837 1.1 joerg hasStaticScheduling(*NestedDir))
838 1.1 joerg return true;
839 1.1 joerg if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
840 1.1 joerg return true;
841 1.1 joerg if (DKind == OMPD_parallel) {
842 1.1 joerg Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
843 1.1 joerg /*IgnoreCaptured=*/true);
844 1.1 joerg if (!Body)
845 1.1 joerg return false;
846 1.1 joerg ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
847 1.1 joerg if (const auto *NND =
848 1.1 joerg dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
849 1.1 joerg DKind = NND->getDirectiveKind();
850 1.1 joerg if (isOpenMPWorksharingDirective(DKind) &&
851 1.1 joerg isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
852 1.1 joerg return true;
853 1.1 joerg }
854 1.1 joerg } else if (DKind == OMPD_teams) {
855 1.1 joerg Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
856 1.1 joerg /*IgnoreCaptured=*/true);
857 1.1 joerg if (!Body)
858 1.1 joerg return false;
859 1.1 joerg ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
860 1.1 joerg if (const auto *NND =
861 1.1 joerg dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
862 1.1 joerg DKind = NND->getDirectiveKind();
863 1.1 joerg if (isOpenMPParallelDirective(DKind) &&
864 1.1 joerg isOpenMPWorksharingDirective(DKind) &&
865 1.1 joerg isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
866 1.1 joerg return true;
867 1.1 joerg if (DKind == OMPD_parallel) {
868 1.1 joerg Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
869 1.1 joerg /*IgnoreCaptured=*/true);
870 1.1 joerg if (!Body)
871 1.1 joerg return false;
872 1.1 joerg ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
873 1.1 joerg if (const auto *NND =
874 1.1 joerg dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
875 1.1 joerg DKind = NND->getDirectiveKind();
876 1.1 joerg if (isOpenMPWorksharingDirective(DKind) &&
877 1.1 joerg isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
878 1.1 joerg return true;
879 1.1 joerg }
880 1.1 joerg }
881 1.1 joerg }
882 1.1 joerg }
883 1.1 joerg return false;
884 1.1 joerg case OMPD_target_teams:
885 1.1 joerg if (isOpenMPParallelDirective(DKind) &&
886 1.1 joerg isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
887 1.1 joerg hasStaticScheduling(*NestedDir))
888 1.1 joerg return true;
889 1.1 joerg if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
890 1.1 joerg return true;
891 1.1 joerg if (DKind == OMPD_parallel) {
892 1.1 joerg Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
893 1.1 joerg /*IgnoreCaptured=*/true);
894 1.1 joerg if (!Body)
895 1.1 joerg return false;
896 1.1 joerg ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
897 1.1 joerg if (const auto *NND =
898 1.1 joerg dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
899 1.1 joerg DKind = NND->getDirectiveKind();
900 1.1 joerg if (isOpenMPWorksharingDirective(DKind) &&
901 1.1 joerg isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
902 1.1 joerg return true;
903 1.1 joerg }
904 1.1 joerg }
905 1.1 joerg return false;
906 1.1 joerg case OMPD_target_parallel:
907 1.1 joerg if (DKind == OMPD_simd)
908 1.1 joerg return true;
909 1.1 joerg return isOpenMPWorksharingDirective(DKind) &&
910 1.1 joerg isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
911 1.1 joerg case OMPD_target_teams_distribute:
912 1.1 joerg case OMPD_target_simd:
913 1.1 joerg case OMPD_target_parallel_for:
914 1.1 joerg case OMPD_target_parallel_for_simd:
915 1.1 joerg case OMPD_target_teams_distribute_simd:
916 1.1 joerg case OMPD_target_teams_distribute_parallel_for:
917 1.1 joerg case OMPD_target_teams_distribute_parallel_for_simd:
918 1.1 joerg case OMPD_parallel:
919 1.1 joerg case OMPD_for:
920 1.1 joerg case OMPD_parallel_for:
921 1.1 joerg case OMPD_parallel_master:
922 1.1 joerg case OMPD_parallel_sections:
923 1.1 joerg case OMPD_for_simd:
924 1.1 joerg case OMPD_parallel_for_simd:
925 1.1 joerg case OMPD_cancel:
926 1.1 joerg case OMPD_cancellation_point:
927 1.1 joerg case OMPD_ordered:
928 1.1 joerg case OMPD_threadprivate:
929 1.1 joerg case OMPD_allocate:
930 1.1 joerg case OMPD_task:
931 1.1 joerg case OMPD_simd:
932 1.1 joerg case OMPD_sections:
933 1.1 joerg case OMPD_section:
934 1.1 joerg case OMPD_single:
935 1.1 joerg case OMPD_master:
936 1.1 joerg case OMPD_critical:
937 1.1 joerg case OMPD_taskyield:
938 1.1 joerg case OMPD_barrier:
939 1.1 joerg case OMPD_taskwait:
940 1.1 joerg case OMPD_taskgroup:
941 1.1 joerg case OMPD_atomic:
942 1.1 joerg case OMPD_flush:
943 1.1 joerg case OMPD_depobj:
944 1.1 joerg case OMPD_scan:
945 1.1 joerg case OMPD_teams:
946 1.1 joerg case OMPD_target_data:
947 1.1 joerg case OMPD_target_exit_data:
948 1.1 joerg case OMPD_target_enter_data:
949 1.1 joerg case OMPD_distribute:
950 1.1 joerg case OMPD_distribute_simd:
951 1.1 joerg case OMPD_distribute_parallel_for:
952 1.1 joerg case OMPD_distribute_parallel_for_simd:
953 1.1 joerg case OMPD_teams_distribute:
954 1.1 joerg case OMPD_teams_distribute_simd:
955 1.1 joerg case OMPD_teams_distribute_parallel_for:
956 1.1 joerg case OMPD_teams_distribute_parallel_for_simd:
957 1.1 joerg case OMPD_target_update:
958 1.1 joerg case OMPD_declare_simd:
959 1.1 joerg case OMPD_declare_variant:
960 1.1 joerg case OMPD_begin_declare_variant:
961 1.1 joerg case OMPD_end_declare_variant:
962 1.1 joerg case OMPD_declare_target:
963 1.1 joerg case OMPD_end_declare_target:
964 1.1 joerg case OMPD_declare_reduction:
965 1.1 joerg case OMPD_declare_mapper:
966 1.1 joerg case OMPD_taskloop:
967 1.1 joerg case OMPD_taskloop_simd:
968 1.1 joerg case OMPD_master_taskloop:
969 1.1 joerg case OMPD_master_taskloop_simd:
970 1.1 joerg case OMPD_parallel_master_taskloop:
971 1.1 joerg case OMPD_parallel_master_taskloop_simd:
972 1.1 joerg case OMPD_requires:
973 1.1 joerg case OMPD_unknown:
974 1.1 joerg default:
975 1.1 joerg llvm_unreachable("Unexpected directive.");
976 1.1 joerg }
977 1.1 joerg }
978 1.1 joerg
979 1.1 joerg return false;
980 1.1 joerg }
981 1.1 joerg
982 1.1 joerg /// Checks if the construct supports lightweight runtime. It must be SPMD
983 1.1 joerg /// construct + inner loop-based construct with static scheduling.
984 1.1 joerg static bool supportsLightweightRuntime(ASTContext &Ctx,
985 1.1 joerg const OMPExecutableDirective &D) {
986 1.1 joerg if (!supportsSPMDExecutionMode(Ctx, D))
987 1.1 joerg return false;
988 1.1 joerg OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
989 1.1 joerg switch (DirectiveKind) {
990 1.1 joerg case OMPD_target:
991 1.1 joerg case OMPD_target_teams:
992 1.1 joerg case OMPD_target_parallel:
993 1.1 joerg return hasNestedLightweightDirective(Ctx, D);
994 1.1 joerg case OMPD_target_parallel_for:
995 1.1 joerg case OMPD_target_parallel_for_simd:
996 1.1 joerg case OMPD_target_teams_distribute_parallel_for:
997 1.1 joerg case OMPD_target_teams_distribute_parallel_for_simd:
998 1.1 joerg // (Last|First)-privates must be shared in parallel region.
999 1.1 joerg return hasStaticScheduling(D);
1000 1.1 joerg case OMPD_target_simd:
1001 1.1 joerg case OMPD_target_teams_distribute_simd:
1002 1.1 joerg return true;
1003 1.1 joerg case OMPD_target_teams_distribute:
1004 1.1 joerg return false;
1005 1.1 joerg case OMPD_parallel:
1006 1.1 joerg case OMPD_for:
1007 1.1 joerg case OMPD_parallel_for:
1008 1.1 joerg case OMPD_parallel_master:
1009 1.1 joerg case OMPD_parallel_sections:
1010 1.1 joerg case OMPD_for_simd:
1011 1.1 joerg case OMPD_parallel_for_simd:
1012 1.1 joerg case OMPD_cancel:
1013 1.1 joerg case OMPD_cancellation_point:
1014 1.1 joerg case OMPD_ordered:
1015 1.1 joerg case OMPD_threadprivate:
1016 1.1 joerg case OMPD_allocate:
1017 1.1 joerg case OMPD_task:
1018 1.1 joerg case OMPD_simd:
1019 1.1 joerg case OMPD_sections:
1020 1.1 joerg case OMPD_section:
1021 1.1 joerg case OMPD_single:
1022 1.1 joerg case OMPD_master:
1023 1.1 joerg case OMPD_critical:
1024 1.1 joerg case OMPD_taskyield:
1025 1.1 joerg case OMPD_barrier:
1026 1.1 joerg case OMPD_taskwait:
1027 1.1 joerg case OMPD_taskgroup:
1028 1.1 joerg case OMPD_atomic:
1029 1.1 joerg case OMPD_flush:
1030 1.1 joerg case OMPD_depobj:
1031 1.1 joerg case OMPD_scan:
1032 1.1 joerg case OMPD_teams:
1033 1.1 joerg case OMPD_target_data:
1034 1.1 joerg case OMPD_target_exit_data:
1035 1.1 joerg case OMPD_target_enter_data:
1036 1.1 joerg case OMPD_distribute:
1037 1.1 joerg case OMPD_distribute_simd:
1038 1.1 joerg case OMPD_distribute_parallel_for:
1039 1.1 joerg case OMPD_distribute_parallel_for_simd:
1040 1.1 joerg case OMPD_teams_distribute:
1041 1.1 joerg case OMPD_teams_distribute_simd:
1042 1.1 joerg case OMPD_teams_distribute_parallel_for:
1043 1.1 joerg case OMPD_teams_distribute_parallel_for_simd:
1044 1.1 joerg case OMPD_target_update:
1045 1.1 joerg case OMPD_declare_simd:
1046 1.1 joerg case OMPD_declare_variant:
1047 1.1 joerg case OMPD_begin_declare_variant:
1048 1.1 joerg case OMPD_end_declare_variant:
1049 1.1 joerg case OMPD_declare_target:
1050 1.1 joerg case OMPD_end_declare_target:
1051 1.1 joerg case OMPD_declare_reduction:
1052 1.1 joerg case OMPD_declare_mapper:
1053 1.1 joerg case OMPD_taskloop:
1054 1.1 joerg case OMPD_taskloop_simd:
1055 1.1 joerg case OMPD_master_taskloop:
1056 1.1 joerg case OMPD_master_taskloop_simd:
1057 1.1 joerg case OMPD_parallel_master_taskloop:
1058 1.1 joerg case OMPD_parallel_master_taskloop_simd:
1059 1.1 joerg case OMPD_requires:
1060 1.1 joerg case OMPD_unknown:
1061 1.1 joerg default:
1062 1.1 joerg break;
1063 1.1 joerg }
1064 1.1 joerg llvm_unreachable(
1065 1.1 joerg "Unknown programming model for OpenMP directive on NVPTX target.");
1066 1.1 joerg }
1067 1.1 joerg
1068 1.1 joerg void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
1069 1.1 joerg StringRef ParentName,
1070 1.1 joerg llvm::Function *&OutlinedFn,
1071 1.1 joerg llvm::Constant *&OutlinedFnID,
1072 1.1 joerg bool IsOffloadEntry,
1073 1.1 joerg const RegionCodeGenTy &CodeGen) {
1074 1.1 joerg ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1075 1.1 joerg EntryFunctionState EST;
1076 1.1 joerg WorkerFunctionState WST(CGM, D.getBeginLoc());
1077 1.1 joerg Work.clear();
1078 1.1 joerg WrapperFunctionsMap.clear();
1079 1.1 joerg
1080 1.1 joerg // Emit target region as a standalone region.
1081 1.1 joerg class NVPTXPrePostActionTy : public PrePostActionTy {
1082 1.1 joerg CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1083 1.1 joerg CGOpenMPRuntimeGPU::WorkerFunctionState &WST;
1084 1.1 joerg
1085 1.1 joerg public:
1086 1.1 joerg NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
1087 1.1 joerg CGOpenMPRuntimeGPU::WorkerFunctionState &WST)
1088 1.1 joerg : EST(EST), WST(WST) {}
1089 1.1 joerg void Enter(CodeGenFunction &CGF) override {
1090 1.1 joerg auto &RT =
1091 1.1 joerg static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1092 1.1 joerg RT.emitNonSPMDEntryHeader(CGF, EST, WST);
1093 1.1 joerg // Skip target region initialization.
1094 1.1 joerg RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1095 1.1 joerg }
1096 1.1 joerg void Exit(CodeGenFunction &CGF) override {
1097 1.1 joerg auto &RT =
1098 1.1 joerg static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1099 1.1 joerg RT.clearLocThreadIdInsertPt(CGF);
1100 1.1 joerg RT.emitNonSPMDEntryFooter(CGF, EST);
1101 1.1 joerg }
1102 1.1 joerg } Action(EST, WST);
1103 1.1 joerg CodeGen.setAction(Action);
1104 1.1 joerg IsInTTDRegion = true;
1105 1.1 joerg // Reserve place for the globalized memory.
1106 1.1 joerg GlobalizedRecords.emplace_back();
1107 1.1 joerg if (!KernelStaticGlobalized) {
1108 1.1 joerg KernelStaticGlobalized = new llvm::GlobalVariable(
1109 1.1 joerg CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1110 1.1 joerg llvm::GlobalValue::InternalLinkage,
1111 1.1 joerg llvm::UndefValue::get(CGM.VoidPtrTy),
1112 1.1 joerg "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1113 1.1 joerg llvm::GlobalValue::NotThreadLocal,
1114 1.1 joerg CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1115 1.1 joerg }
1116 1.1 joerg emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1117 1.1 joerg IsOffloadEntry, CodeGen);
1118 1.1 joerg IsInTTDRegion = false;
1119 1.1 joerg
1120 1.1 joerg // Now change the name of the worker function to correspond to this target
1121 1.1 joerg // region's entry function.
1122 1.1 joerg WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));
1123 1.1 joerg
1124 1.1 joerg // Create the worker function
1125 1.1 joerg emitWorkerFunction(WST);
1126 1.1 joerg }
1127 1.1 joerg
1128 1.1 joerg // Setup NVPTX threads for master-worker OpenMP scheme.
1129 1.1 joerg void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
1130 1.1 joerg EntryFunctionState &EST,
1131 1.1 joerg WorkerFunctionState &WST) {
1132 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
1133 1.1 joerg
1134 1.1 joerg llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
1135 1.1 joerg llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
1136 1.1 joerg llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
1137 1.1 joerg EST.ExitBB = CGF.createBasicBlock(".exit");
1138 1.1 joerg
1139 1.1 joerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1140 1.1 joerg llvm::Value *GPUThreadID = RT.getGPUThreadID(CGF);
1141 1.1 joerg llvm::Value *ThreadLimit = getThreadLimit(CGF);
1142 1.1 joerg llvm::Value *IsWorker = Bld.CreateICmpULT(GPUThreadID, ThreadLimit);
1143 1.1 joerg Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
1144 1.1 joerg
1145 1.1 joerg CGF.EmitBlock(WorkerBB);
1146 1.1 joerg emitCall(CGF, WST.Loc, WST.WorkerFn);
1147 1.1 joerg CGF.EmitBranch(EST.ExitBB);
1148 1.1 joerg
1149 1.1 joerg CGF.EmitBlock(MasterCheckBB);
1150 1.1 joerg GPUThreadID = RT.getGPUThreadID(CGF);
1151 1.1 joerg llvm::Value *MasterThreadID = getMasterThreadID(CGF);
1152 1.1 joerg llvm::Value *IsMaster = Bld.CreateICmpEQ(GPUThreadID, MasterThreadID);
1153 1.1 joerg Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
1154 1.1 joerg
1155 1.1 joerg CGF.EmitBlock(MasterBB);
1156 1.1 joerg IsInTargetMasterThreadRegion = true;
1157 1.1 joerg // SEQUENTIAL (MASTER) REGION START
1158 1.1 joerg // First action in sequential region:
1159 1.1 joerg // Initialize the state of the OpenMP runtime library on the GPU.
1160 1.1 joerg // TODO: Optimize runtime initialization and pass in correct value.
1161 1.1 joerg llvm::Value *Args[] = {getThreadLimit(CGF),
1162 1.1 joerg Bld.getInt16(/*RequiresOMPRuntime=*/1)};
1163 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1164 1.1 joerg CGM.getModule(), OMPRTL___kmpc_kernel_init),
1165 1.1 joerg Args);
1166 1.1 joerg
1167 1.1 joerg // For data sharing, we need to initialize the stack.
1168 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1169 1.1 joerg CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack));
1170 1.1 joerg
1171 1.1 joerg emitGenericVarsProlog(CGF, WST.Loc);
1172 1.1 joerg }
1173 1.1 joerg
1174 1.1 joerg void CGOpenMPRuntimeGPU::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
1175 1.1 joerg EntryFunctionState &EST) {
1176 1.1 joerg IsInTargetMasterThreadRegion = false;
1177 1.1 joerg if (!CGF.HaveInsertPoint())
1178 1.1 joerg return;
1179 1.1 joerg
1180 1.1 joerg emitGenericVarsEpilog(CGF);
1181 1.1 joerg
1182 1.1 joerg if (!EST.ExitBB)
1183 1.1 joerg EST.ExitBB = CGF.createBasicBlock(".exit");
1184 1.1 joerg
1185 1.1 joerg llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
1186 1.1 joerg CGF.EmitBranch(TerminateBB);
1187 1.1 joerg
1188 1.1 joerg CGF.EmitBlock(TerminateBB);
1189 1.1 joerg // Signal termination condition.
1190 1.1 joerg // TODO: Optimize runtime initialization and pass in correct value.
1191 1.1 joerg llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
1192 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1193 1.1 joerg CGM.getModule(), OMPRTL___kmpc_kernel_deinit),
1194 1.1 joerg Args);
1195 1.1 joerg // Barrier to terminate worker threads.
1196 1.1 joerg syncCTAThreads(CGF);
1197 1.1 joerg // Master thread jumps to exit point.
1198 1.1 joerg CGF.EmitBranch(EST.ExitBB);
1199 1.1 joerg
1200 1.1 joerg CGF.EmitBlock(EST.ExitBB);
1201 1.1 joerg EST.ExitBB = nullptr;
1202 1.1 joerg }
1203 1.1 joerg
1204 1.1 joerg void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
1205 1.1 joerg StringRef ParentName,
1206 1.1 joerg llvm::Function *&OutlinedFn,
1207 1.1 joerg llvm::Constant *&OutlinedFnID,
1208 1.1 joerg bool IsOffloadEntry,
1209 1.1 joerg const RegionCodeGenTy &CodeGen) {
1210 1.1 joerg ExecutionRuntimeModesRAII ModeRAII(
1211 1.1 joerg CurrentExecutionMode, RequiresFullRuntime,
1212 1.1 joerg CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1213 1.1 joerg !supportsLightweightRuntime(CGM.getContext(), D));
1214 1.1 joerg EntryFunctionState EST;
1215 1.1 joerg
1216 1.1 joerg // Emit target region as a standalone region.
1217 1.1 joerg class NVPTXPrePostActionTy : public PrePostActionTy {
1218 1.1 joerg CGOpenMPRuntimeGPU &RT;
1219 1.1 joerg CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1220 1.1 joerg const OMPExecutableDirective &D;
1221 1.1 joerg
1222 1.1 joerg public:
1223 1.1 joerg NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
1224 1.1 joerg CGOpenMPRuntimeGPU::EntryFunctionState &EST,
1225 1.1 joerg const OMPExecutableDirective &D)
1226 1.1 joerg : RT(RT), EST(EST), D(D) {}
1227 1.1 joerg void Enter(CodeGenFunction &CGF) override {
1228 1.1 joerg RT.emitSPMDEntryHeader(CGF, EST, D);
1229 1.1 joerg // Skip target region initialization.
1230 1.1 joerg RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1231 1.1 joerg }
1232 1.1 joerg void Exit(CodeGenFunction &CGF) override {
1233 1.1 joerg RT.clearLocThreadIdInsertPt(CGF);
1234 1.1 joerg RT.emitSPMDEntryFooter(CGF, EST);
1235 1.1 joerg }
1236 1.1 joerg } Action(*this, EST, D);
1237 1.1 joerg CodeGen.setAction(Action);
1238 1.1 joerg IsInTTDRegion = true;
1239 1.1 joerg // Reserve place for the globalized memory.
1240 1.1 joerg GlobalizedRecords.emplace_back();
1241 1.1 joerg if (!KernelStaticGlobalized) {
1242 1.1 joerg KernelStaticGlobalized = new llvm::GlobalVariable(
1243 1.1 joerg CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1244 1.1 joerg llvm::GlobalValue::InternalLinkage,
1245 1.1 joerg llvm::UndefValue::get(CGM.VoidPtrTy),
1246 1.1 joerg "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1247 1.1 joerg llvm::GlobalValue::NotThreadLocal,
1248 1.1 joerg CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1249 1.1 joerg }
1250 1.1 joerg emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1251 1.1 joerg IsOffloadEntry, CodeGen);
1252 1.1 joerg IsInTTDRegion = false;
1253 1.1 joerg }
1254 1.1 joerg
1255 1.1 joerg void CGOpenMPRuntimeGPU::emitSPMDEntryHeader(
1256 1.1 joerg CodeGenFunction &CGF, EntryFunctionState &EST,
1257 1.1 joerg const OMPExecutableDirective &D) {
1258 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
1259 1.1 joerg
1260 1.1 joerg // Setup BBs in entry function.
1261 1.1 joerg llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
1262 1.1 joerg EST.ExitBB = CGF.createBasicBlock(".exit");
1263 1.1 joerg
1264 1.1 joerg llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
1265 1.1 joerg /*RequiresOMPRuntime=*/
1266 1.1 joerg Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
1267 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1268 1.1 joerg CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init),
1269 1.1 joerg Args);
1270 1.1 joerg
1271 1.1 joerg if (RequiresFullRuntime) {
1272 1.1 joerg // For data sharing, we need to initialize the stack.
1273 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1274 1.1 joerg CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd));
1275 1.1 joerg }
1276 1.1 joerg
1277 1.1 joerg CGF.EmitBranch(ExecuteBB);
1278 1.1 joerg
1279 1.1 joerg CGF.EmitBlock(ExecuteBB);
1280 1.1 joerg
1281 1.1 joerg IsInTargetMasterThreadRegion = true;
1282 1.1 joerg }
1283 1.1 joerg
1284 1.1 joerg void CGOpenMPRuntimeGPU::emitSPMDEntryFooter(CodeGenFunction &CGF,
1285 1.1 joerg EntryFunctionState &EST) {
1286 1.1 joerg IsInTargetMasterThreadRegion = false;
1287 1.1 joerg if (!CGF.HaveInsertPoint())
1288 1.1 joerg return;
1289 1.1 joerg
1290 1.1 joerg if (!EST.ExitBB)
1291 1.1 joerg EST.ExitBB = CGF.createBasicBlock(".exit");
1292 1.1 joerg
1293 1.1 joerg llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
1294 1.1 joerg CGF.EmitBranch(OMPDeInitBB);
1295 1.1 joerg
1296 1.1 joerg CGF.EmitBlock(OMPDeInitBB);
1297 1.1 joerg // DeInitialize the OMP state in the runtime; called by all active threads.
1298 1.1 joerg llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
1299 1.1 joerg CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
1300 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1301 1.1 joerg CGM.getModule(), OMPRTL___kmpc_spmd_kernel_deinit_v2),
1302 1.1 joerg Args);
1303 1.1 joerg CGF.EmitBranch(EST.ExitBB);
1304 1.1 joerg
1305 1.1 joerg CGF.EmitBlock(EST.ExitBB);
1306 1.1 joerg EST.ExitBB = nullptr;
1307 1.1 joerg }
1308 1.1 joerg
1309 1.1 joerg // Create a unique global variable to indicate the execution mode of this target
1310 1.1 joerg // region. The execution mode is either 'generic', or 'spmd' depending on the
1311 1.1 joerg // target directive. This variable is picked up by the offload library to setup
1312 1.1 joerg // the device appropriately before kernel launch. If the execution mode is
1313 1.1 joerg // 'generic', the runtime reserves one warp for the master, otherwise, all
1314 1.1 joerg // warps participate in parallel work.
1315 1.1 joerg static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1316 1.1 joerg bool Mode) {
1317 1.1 joerg auto *GVMode =
1318 1.1 joerg new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1319 1.1 joerg llvm::GlobalValue::WeakAnyLinkage,
1320 1.1 joerg llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1321 1.1 joerg Twine(Name, "_exec_mode"));
1322 1.1 joerg CGM.addCompilerUsedGlobal(GVMode);
1323 1.1 joerg }
1324 1.1 joerg
1325 1.1 joerg void CGOpenMPRuntimeGPU::emitWorkerFunction(WorkerFunctionState &WST) {
1326 1.1 joerg ASTContext &Ctx = CGM.getContext();
1327 1.1 joerg
1328 1.1 joerg CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1329 1.1 joerg CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
1330 1.1 joerg WST.Loc, WST.Loc);
1331 1.1 joerg emitWorkerLoop(CGF, WST);
1332 1.1 joerg CGF.FinishFunction();
1333 1.1 joerg }
1334 1.1 joerg
1335 1.1 joerg void CGOpenMPRuntimeGPU::emitWorkerLoop(CodeGenFunction &CGF,
1336 1.1 joerg WorkerFunctionState &WST) {
1337 1.1 joerg //
1338 1.1 joerg // The workers enter this loop and wait for parallel work from the master.
1339 1.1 joerg // When the master encounters a parallel region it sets up the work + variable
1340 1.1 joerg // arguments, and wakes up the workers. The workers first check to see if
1341 1.1 joerg // they are required for the parallel region, i.e., within the # of requested
1342 1.1 joerg // parallel threads. The activated workers load the variable arguments and
1343 1.1 joerg // execute the parallel work.
1344 1.1 joerg //
1345 1.1 joerg
1346 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
1347 1.1 joerg
1348 1.1 joerg llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
1349 1.1 joerg llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
1350 1.1 joerg llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
1351 1.1 joerg llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
1352 1.1 joerg llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
1353 1.1 joerg llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1354 1.1 joerg
1355 1.1 joerg CGF.EmitBranch(AwaitBB);
1356 1.1 joerg
1357 1.1 joerg // Workers wait for work from master.
1358 1.1 joerg CGF.EmitBlock(AwaitBB);
1359 1.1 joerg // Wait for parallel work
1360 1.1 joerg syncCTAThreads(CGF);
1361 1.1 joerg
1362 1.1 joerg Address WorkFn =
1363 1.1 joerg CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
1364 1.1 joerg Address ExecStatus =
1365 1.1 joerg CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
1366 1.1 joerg CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
1367 1.1 joerg CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
1368 1.1 joerg
1369 1.1 joerg // TODO: Optimize runtime initialization and pass in correct value.
1370 1.1 joerg llvm::Value *Args[] = {WorkFn.getPointer()};
1371 1.1 joerg llvm::Value *Ret =
1372 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1373 1.1 joerg CGM.getModule(), OMPRTL___kmpc_kernel_parallel),
1374 1.1 joerg Args);
1375 1.1 joerg Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
1376 1.1 joerg
1377 1.1 joerg // On termination condition (workid == 0), exit loop.
1378 1.1 joerg llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1379 1.1 joerg llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
1380 1.1 joerg Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1381 1.1 joerg
1382 1.1 joerg // Activate requested workers.
1383 1.1 joerg CGF.EmitBlock(SelectWorkersBB);
1384 1.1 joerg llvm::Value *IsActive =
1385 1.1 joerg Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1386 1.1 joerg Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
1387 1.1 joerg
1388 1.1 joerg // Signal start of parallel region.
1389 1.1 joerg CGF.EmitBlock(ExecuteBB);
1390 1.1 joerg // Skip initialization.
1391 1.1 joerg setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1392 1.1 joerg
1393 1.1 joerg // Process work items: outlined parallel functions.
1394 1.1 joerg for (llvm::Function *W : Work) {
1395 1.1 joerg // Try to match this outlined function.
1396 1.1 joerg llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
1397 1.1 joerg
1398 1.1 joerg llvm::Value *WorkFnMatch =
1399 1.1 joerg Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1400 1.1 joerg
1401 1.1 joerg llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1402 1.1 joerg llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1403 1.1 joerg Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1404 1.1 joerg
1405 1.1 joerg // Execute this outlined function.
1406 1.1 joerg CGF.EmitBlock(ExecuteFNBB);
1407 1.1 joerg
1408 1.1 joerg // Insert call to work function via shared wrapper. The shared
1409 1.1 joerg // wrapper takes two arguments:
1410 1.1 joerg // - the parallelism level;
1411 1.1 joerg // - the thread ID;
1412 1.1 joerg emitCall(CGF, WST.Loc, W,
1413 1.1 joerg {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1414 1.1 joerg
1415 1.1 joerg // Go to end of parallel region.
1416 1.1 joerg CGF.EmitBranch(TerminateBB);
1417 1.1 joerg
1418 1.1 joerg CGF.EmitBlock(CheckNextBB);
1419 1.1 joerg }
1420 1.1 joerg // Default case: call to outlined function through pointer if the target
1421 1.1 joerg // region makes a declare target call that may contain an orphaned parallel
1422 1.1 joerg // directive.
1423 1.1 joerg auto *ParallelFnTy =
1424 1.1 joerg llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1425 1.1 joerg /*isVarArg=*/false);
1426 1.1 joerg llvm::Value *WorkFnCast =
1427 1.1 joerg Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
1428 1.1 joerg // Insert call to work function via shared wrapper. The shared
1429 1.1 joerg // wrapper takes two arguments:
1430 1.1 joerg // - the parallelism level;
1431 1.1 joerg // - the thread ID;
1432 1.1 joerg emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
1433 1.1 joerg {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1434 1.1 joerg // Go to end of parallel region.
1435 1.1 joerg CGF.EmitBranch(TerminateBB);
1436 1.1 joerg
1437 1.1 joerg // Signal end of parallel region.
1438 1.1 joerg CGF.EmitBlock(TerminateBB);
1439 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1440 1.1 joerg CGM.getModule(), OMPRTL___kmpc_kernel_end_parallel),
1441 1.1 joerg llvm::None);
1442 1.1 joerg CGF.EmitBranch(BarrierBB);
1443 1.1 joerg
1444 1.1 joerg // All active and inactive workers wait at a barrier after parallel region.
1445 1.1 joerg CGF.EmitBlock(BarrierBB);
1446 1.1 joerg // Barrier after parallel region.
1447 1.1 joerg syncCTAThreads(CGF);
1448 1.1 joerg CGF.EmitBranch(AwaitBB);
1449 1.1 joerg
1450 1.1 joerg // Exit target region.
1451 1.1 joerg CGF.EmitBlock(ExitBB);
1452 1.1 joerg // Skip initialization.
1453 1.1 joerg clearLocThreadIdInsertPt(CGF);
1454 1.1 joerg }
1455 1.1 joerg
1456 1.1 joerg void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
1457 1.1 joerg llvm::Constant *Addr,
1458 1.1 joerg uint64_t Size, int32_t,
1459 1.1 joerg llvm::GlobalValue::LinkageTypes) {
1460 1.1 joerg // TODO: Add support for global variables on the device after declare target
1461 1.1 joerg // support.
1462 1.1 joerg if (!isa<llvm::Function>(Addr))
1463 1.1 joerg return;
1464 1.1 joerg llvm::Module &M = CGM.getModule();
1465 1.1 joerg llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1466 1.1 joerg
1467 1.1 joerg // Get "nvvm.annotations" metadata node
1468 1.1 joerg llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1469 1.1 joerg
1470 1.1 joerg llvm::Metadata *MDVals[] = {
1471 1.1 joerg llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1472 1.1 joerg llvm::ConstantAsMetadata::get(
1473 1.1 joerg llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1474 1.1 joerg // Append metadata to nvvm.annotations
1475 1.1 joerg MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1476 1.1 joerg }
1477 1.1 joerg
1478 1.1 joerg void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
1479 1.1 joerg const OMPExecutableDirective &D, StringRef ParentName,
1480 1.1 joerg llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1481 1.1 joerg bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1482 1.1 joerg if (!IsOffloadEntry) // Nothing to do.
1483 1.1 joerg return;
1484 1.1 joerg
1485 1.1 joerg assert(!ParentName.empty() && "Invalid target region parent name!");
1486 1.1 joerg
1487 1.1 joerg bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1488 1.1 joerg if (Mode)
1489 1.1 joerg emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1490 1.1 joerg CodeGen);
1491 1.1 joerg else
1492 1.1 joerg emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1493 1.1 joerg CodeGen);
1494 1.1 joerg
1495 1.1 joerg setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1496 1.1 joerg }
1497 1.1 joerg
1498 1.1 joerg namespace {
1499 1.1 joerg LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1500 1.1 joerg /// Enum for accesseing the reserved_2 field of the ident_t struct.
1501 1.1 joerg enum ModeFlagsTy : unsigned {
1502 1.1 joerg /// Bit set to 1 when in SPMD mode.
1503 1.1 joerg KMP_IDENT_SPMD_MODE = 0x01,
1504 1.1 joerg /// Bit set to 1 when a simplified runtime is used.
1505 1.1 joerg KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1506 1.1 joerg LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1507 1.1 joerg };
1508 1.1 joerg
1509 1.1 joerg /// Special mode Undefined. Is the combination of Non-SPMD mode + SimpleRuntime.
1510 1.1 joerg static const ModeFlagsTy UndefinedMode =
1511 1.1 joerg (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1512 1.1 joerg } // anonymous namespace
1513 1.1 joerg
1514 1.1 joerg unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
1515 1.1 joerg switch (getExecutionMode()) {
1516 1.1 joerg case EM_SPMD:
1517 1.1 joerg if (requiresFullRuntime())
1518 1.1 joerg return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1519 1.1 joerg return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1520 1.1 joerg case EM_NonSPMD:
1521 1.1 joerg assert(requiresFullRuntime() && "Expected full runtime.");
1522 1.1 joerg return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1523 1.1 joerg case EM_Unknown:
1524 1.1 joerg return UndefinedMode;
1525 1.1 joerg }
1526 1.1 joerg llvm_unreachable("Unknown flags are requested.");
1527 1.1 joerg }
1528 1.1 joerg
1529 1.1 joerg CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
1530 1.1 joerg : CGOpenMPRuntime(CGM, "_", "$") {
1531 1.1 joerg if (!CGM.getLangOpts().OpenMPIsDevice)
1532 1.1 joerg llvm_unreachable("OpenMP NVPTX can only handle device code.");
1533 1.1 joerg }
1534 1.1 joerg
1535 1.1 joerg void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
1536 1.1 joerg ProcBindKind ProcBind,
1537 1.1 joerg SourceLocation Loc) {
1538 1.1 joerg // Do nothing in case of SPMD mode and L0 parallel.
1539 1.1 joerg if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1540 1.1 joerg return;
1541 1.1 joerg
1542 1.1 joerg CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1543 1.1 joerg }
1544 1.1 joerg
1545 1.1 joerg void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
1546 1.1 joerg llvm::Value *NumThreads,
1547 1.1 joerg SourceLocation Loc) {
1548 1.1 joerg // Do nothing in case of SPMD mode and L0 parallel.
1549 1.1 joerg if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1550 1.1 joerg return;
1551 1.1 joerg
1552 1.1 joerg CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1553 1.1 joerg }
1554 1.1 joerg
1555 1.1 joerg void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
1556 1.1 joerg const Expr *NumTeams,
1557 1.1 joerg const Expr *ThreadLimit,
1558 1.1 joerg SourceLocation Loc) {}
1559 1.1 joerg
1560 1.1 joerg llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
1561 1.1 joerg const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1562 1.1 joerg OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1563 1.1 joerg // Emit target region as a standalone region.
1564 1.1 joerg class NVPTXPrePostActionTy : public PrePostActionTy {
1565 1.1 joerg bool &IsInParallelRegion;
1566 1.1 joerg bool PrevIsInParallelRegion;
1567 1.1 joerg
1568 1.1 joerg public:
1569 1.1 joerg NVPTXPrePostActionTy(bool &IsInParallelRegion)
1570 1.1 joerg : IsInParallelRegion(IsInParallelRegion) {}
1571 1.1 joerg void Enter(CodeGenFunction &CGF) override {
1572 1.1 joerg PrevIsInParallelRegion = IsInParallelRegion;
1573 1.1 joerg IsInParallelRegion = true;
1574 1.1 joerg }
1575 1.1 joerg void Exit(CodeGenFunction &CGF) override {
1576 1.1 joerg IsInParallelRegion = PrevIsInParallelRegion;
1577 1.1 joerg }
1578 1.1 joerg } Action(IsInParallelRegion);
1579 1.1 joerg CodeGen.setAction(Action);
1580 1.1 joerg bool PrevIsInTTDRegion = IsInTTDRegion;
1581 1.1 joerg IsInTTDRegion = false;
1582 1.1 joerg bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1583 1.1 joerg IsInTargetMasterThreadRegion = false;
1584 1.1 joerg auto *OutlinedFun =
1585 1.1 joerg cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1586 1.1 joerg D, ThreadIDVar, InnermostKind, CodeGen));
1587 1.1 joerg IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1588 1.1 joerg IsInTTDRegion = PrevIsInTTDRegion;
1589 1.1 joerg if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
1590 1.1 joerg !IsInParallelRegion) {
1591 1.1 joerg llvm::Function *WrapperFun =
1592 1.1 joerg createParallelDataSharingWrapper(OutlinedFun, D);
1593 1.1 joerg WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1594 1.1 joerg }
1595 1.1 joerg
1596 1.1 joerg return OutlinedFun;
1597 1.1 joerg }
1598 1.1 joerg
1599 1.1 joerg /// Get list of lastprivate variables from the teams distribute ... or
1600 1.1 joerg /// teams {distribute ...} directives.
1601 1.1 joerg static void
1602 1.1 joerg getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1603 1.1 joerg llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1604 1.1 joerg assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1605 1.1 joerg "expected teams directive.");
1606 1.1 joerg const OMPExecutableDirective *Dir = &D;
1607 1.1 joerg if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1608 1.1 joerg if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
1609 1.1 joerg Ctx,
1610 1.1 joerg D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1611 1.1 joerg /*IgnoreCaptured=*/true))) {
1612 1.1 joerg Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
1613 1.1 joerg if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1614 1.1 joerg Dir = nullptr;
1615 1.1 joerg }
1616 1.1 joerg }
1617 1.1 joerg if (!Dir)
1618 1.1 joerg return;
1619 1.1 joerg for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
1620 1.1 joerg for (const Expr *E : C->getVarRefs())
1621 1.1 joerg Vars.push_back(getPrivateItem(E));
1622 1.1 joerg }
1623 1.1 joerg }
1624 1.1 joerg
1625 1.1 joerg /// Get list of reduction variables from the teams ... directives.
1626 1.1 joerg static void
1627 1.1 joerg getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1628 1.1 joerg llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1629 1.1 joerg assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1630 1.1 joerg "expected teams directive.");
1631 1.1 joerg for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
1632 1.1 joerg for (const Expr *E : C->privates())
1633 1.1 joerg Vars.push_back(getPrivateItem(E));
1634 1.1 joerg }
1635 1.1 joerg }
1636 1.1 joerg
1637 1.1 joerg llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
1638 1.1 joerg const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1639 1.1 joerg OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1640 1.1 joerg SourceLocation Loc = D.getBeginLoc();
1641 1.1 joerg
1642 1.1 joerg const RecordDecl *GlobalizedRD = nullptr;
1643 1.1 joerg llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1644 1.1 joerg llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1645 1.1 joerg unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
1646 1.1 joerg // Globalize team reductions variable unconditionally in all modes.
1647 1.1 joerg if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1648 1.1 joerg getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1649 1.1 joerg if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1650 1.1 joerg getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1651 1.1 joerg if (!LastPrivatesReductions.empty()) {
1652 1.1 joerg GlobalizedRD = ::buildRecordForGlobalizedVars(
1653 1.1 joerg CGM.getContext(), llvm::None, LastPrivatesReductions,
1654 1.1 joerg MappedDeclsFields, WarpSize);
1655 1.1 joerg }
1656 1.1 joerg } else if (!LastPrivatesReductions.empty()) {
1657 1.1 joerg assert(!TeamAndReductions.first &&
1658 1.1 joerg "Previous team declaration is not expected.");
1659 1.1 joerg TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1660 1.1 joerg std::swap(TeamAndReductions.second, LastPrivatesReductions);
1661 1.1 joerg }
1662 1.1 joerg
1663 1.1 joerg // Emit target region as a standalone region.
1664 1.1 joerg class NVPTXPrePostActionTy : public PrePostActionTy {
1665 1.1 joerg SourceLocation &Loc;
1666 1.1 joerg const RecordDecl *GlobalizedRD;
1667 1.1 joerg llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1668 1.1 joerg &MappedDeclsFields;
1669 1.1 joerg
1670 1.1 joerg public:
1671 1.1 joerg NVPTXPrePostActionTy(
1672 1.1 joerg SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1673 1.1 joerg llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1674 1.1 joerg &MappedDeclsFields)
1675 1.1 joerg : Loc(Loc), GlobalizedRD(GlobalizedRD),
1676 1.1 joerg MappedDeclsFields(MappedDeclsFields) {}
1677 1.1 joerg void Enter(CodeGenFunction &CGF) override {
1678 1.1 joerg auto &Rt =
1679 1.1 joerg static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1680 1.1 joerg if (GlobalizedRD) {
1681 1.1 joerg auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1682 1.1 joerg I->getSecond().GlobalRecord = GlobalizedRD;
1683 1.1 joerg I->getSecond().MappedParams =
1684 1.1 joerg std::make_unique<CodeGenFunction::OMPMapVars>();
1685 1.1 joerg DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1686 1.1 joerg for (const auto &Pair : MappedDeclsFields) {
1687 1.1 joerg assert(Pair.getFirst()->isCanonicalDecl() &&
1688 1.1 joerg "Expected canonical declaration");
1689 1.1 joerg Data.insert(std::make_pair(Pair.getFirst(),
1690 1.1 joerg MappedVarData(Pair.getSecond(),
1691 1.1 joerg /*IsOnePerTeam=*/true)));
1692 1.1 joerg }
1693 1.1 joerg }
1694 1.1 joerg Rt.emitGenericVarsProlog(CGF, Loc);
1695 1.1 joerg }
1696 1.1 joerg void Exit(CodeGenFunction &CGF) override {
1697 1.1 joerg static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1698 1.1 joerg .emitGenericVarsEpilog(CGF);
1699 1.1 joerg }
1700 1.1 joerg } Action(Loc, GlobalizedRD, MappedDeclsFields);
1701 1.1 joerg CodeGen.setAction(Action);
1702 1.1 joerg llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1703 1.1 joerg D, ThreadIDVar, InnermostKind, CodeGen);
1704 1.1 joerg
1705 1.1 joerg return OutlinedFun;
1706 1.1 joerg }
1707 1.1 joerg
1708 1.1 joerg void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1709 1.1 joerg SourceLocation Loc,
1710 1.1 joerg bool WithSPMDCheck) {
1711 1.1 joerg if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1712 1.1 joerg getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1713 1.1 joerg return;
1714 1.1 joerg
1715 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
1716 1.1 joerg
1717 1.1 joerg const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1718 1.1 joerg if (I == FunctionGlobalizedDecls.end())
1719 1.1 joerg return;
1720 1.1 joerg if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
1721 1.1 joerg QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
1722 1.1 joerg QualType SecGlobalRecTy;
1723 1.1 joerg
1724 1.1 joerg // Recover pointer to this function's global record. The runtime will
1725 1.1 joerg // handle the specifics of the allocation of the memory.
1726 1.1 joerg // Use actual memory size of the record including the padding
1727 1.1 joerg // for alignment purposes.
1728 1.1 joerg unsigned Alignment =
1729 1.1 joerg CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
1730 1.1 joerg unsigned GlobalRecordSize =
1731 1.1 joerg CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
1732 1.1 joerg GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1733 1.1 joerg
1734 1.1 joerg llvm::PointerType *GlobalRecPtrTy =
1735 1.1 joerg CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
1736 1.1 joerg llvm::Value *GlobalRecCastAddr;
1737 1.1 joerg llvm::Value *IsTTD = nullptr;
1738 1.1 joerg if (!IsInTTDRegion &&
1739 1.1 joerg (WithSPMDCheck ||
1740 1.1 joerg getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
1741 1.1 joerg llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1742 1.1 joerg llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
1743 1.1 joerg llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
1744 1.1 joerg if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
1745 1.1 joerg llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1746 1.1 joerg llvm::Value *ThreadID = getThreadID(CGF, Loc);
1747 1.1 joerg llvm::Value *PL = CGF.EmitRuntimeCall(
1748 1.1 joerg OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
1749 1.1 joerg OMPRTL___kmpc_parallel_level),
1750 1.1 joerg {RTLoc, ThreadID});
1751 1.1 joerg IsTTD = Bld.CreateIsNull(PL);
1752 1.1 joerg }
1753 1.1 joerg llvm::Value *IsSPMD = Bld.CreateIsNotNull(
1754 1.1 joerg CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1755 1.1 joerg CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode)));
1756 1.1 joerg Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
1757 1.1 joerg // There is no need to emit line number for unconditional branch.
1758 1.1 joerg (void)ApplyDebugLocation::CreateEmpty(CGF);
1759 1.1 joerg CGF.EmitBlock(SPMDBB);
1760 1.1 joerg Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
1761 1.1 joerg CharUnits::fromQuantity(Alignment));
1762 1.1 joerg CGF.EmitBranch(ExitBB);
1763 1.1 joerg // There is no need to emit line number for unconditional branch.
1764 1.1 joerg (void)ApplyDebugLocation::CreateEmpty(CGF);
1765 1.1 joerg CGF.EmitBlock(NonSPMDBB);
1766 1.1 joerg llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
1767 1.1 joerg if (const RecordDecl *SecGlobalizedVarsRecord =
1768 1.1 joerg I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
1769 1.1 joerg SecGlobalRecTy =
1770 1.1 joerg CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
1771 1.1 joerg
1772 1.1 joerg // Recover pointer to this function's global record. The runtime will
1773 1.1 joerg // handle the specifics of the allocation of the memory.
1774 1.1 joerg // Use actual memory size of the record including the padding
1775 1.1 joerg // for alignment purposes.
1776 1.1 joerg unsigned Alignment =
1777 1.1 joerg CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
1778 1.1 joerg unsigned GlobalRecordSize =
1779 1.1 joerg CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
1780 1.1 joerg GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1781 1.1 joerg Size = Bld.CreateSelect(
1782 1.1 joerg IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
1783 1.1 joerg }
1784 1.1 joerg // TODO: allow the usage of shared memory to be controlled by
1785 1.1 joerg // the user, for now, default to global.
1786 1.1 joerg llvm::Value *GlobalRecordSizeArg[] = {
1787 1.1 joerg Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1788 1.1 joerg llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1789 1.1 joerg OMPBuilder.getOrCreateRuntimeFunction(
1790 1.1 joerg CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1791 1.1 joerg GlobalRecordSizeArg);
1792 1.1 joerg GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1793 1.1 joerg GlobalRecValue, GlobalRecPtrTy);
1794 1.1 joerg CGF.EmitBlock(ExitBB);
1795 1.1 joerg auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
1796 1.1 joerg /*NumReservedValues=*/2, "_select_stack");
1797 1.1 joerg Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
1798 1.1 joerg Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
1799 1.1 joerg GlobalRecCastAddr = Phi;
1800 1.1 joerg I->getSecond().GlobalRecordAddr = Phi;
1801 1.1 joerg I->getSecond().IsInSPMDModeFlag = IsSPMD;
1802 1.1 joerg } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
1803 1.1 joerg assert(GlobalizedRecords.back().Records.size() < 2 &&
1804 1.1 joerg "Expected less than 2 globalized records: one for target and one "
1805 1.1 joerg "for teams.");
1806 1.1 joerg unsigned Offset = 0;
1807 1.1 joerg for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
1808 1.1 joerg QualType RDTy = CGM.getContext().getRecordType(RD);
1809 1.1 joerg unsigned Alignment =
1810 1.1 joerg CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
1811 1.1 joerg unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
1812 1.1 joerg Offset =
1813 1.1 joerg llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
1814 1.1 joerg }
1815 1.1 joerg unsigned Alignment =
1816 1.1 joerg CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
1817 1.1 joerg Offset = llvm::alignTo(Offset, Alignment);
1818 1.1 joerg GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
1819 1.1 joerg ++GlobalizedRecords.back().RegionCounter;
1820 1.1 joerg if (GlobalizedRecords.back().Records.size() == 1) {
1821 1.1 joerg assert(KernelStaticGlobalized &&
1822 1.1 joerg "Kernel static pointer must be initialized already.");
1823 1.1 joerg auto *UseSharedMemory = new llvm::GlobalVariable(
1824 1.1 joerg CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
1825 1.1 joerg llvm::GlobalValue::InternalLinkage, nullptr,
1826 1.1 joerg "_openmp_static_kernel$is_shared");
1827 1.1 joerg UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
1828 1.1 joerg QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
1829 1.1 joerg /*DestWidth=*/16, /*Signed=*/0);
1830 1.1 joerg llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
1831 1.1 joerg Address(UseSharedMemory,
1832 1.1 joerg CGM.getContext().getTypeAlignInChars(Int16Ty)),
1833 1.1 joerg /*Volatile=*/false, Int16Ty, Loc);
1834 1.1 joerg auto *StaticGlobalized = new llvm::GlobalVariable(
1835 1.1 joerg CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
1836 1.1 joerg llvm::GlobalValue::CommonLinkage, nullptr);
1837 1.1 joerg auto *RecSize = new llvm::GlobalVariable(
1838 1.1 joerg CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
1839 1.1 joerg llvm::GlobalValue::InternalLinkage, nullptr,
1840 1.1 joerg "_openmp_static_kernel$size");
1841 1.1 joerg RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
1842 1.1 joerg llvm::Value *Ld = CGF.EmitLoadOfScalar(
1843 1.1 joerg Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
1844 1.1 joerg CGM.getContext().getSizeType(), Loc);
1845 1.1 joerg llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1846 1.1 joerg KernelStaticGlobalized, CGM.VoidPtrPtrTy);
1847 1.1 joerg llvm::Value *GlobalRecordSizeArg[] = {
1848 1.1 joerg llvm::ConstantInt::get(
1849 1.1 joerg CGM.Int16Ty,
1850 1.1 joerg getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
1851 1.1 joerg StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
1852 1.1 joerg CGF.EmitRuntimeCall(
1853 1.1 joerg OMPBuilder.getOrCreateRuntimeFunction(
1854 1.1 joerg CGM.getModule(), OMPRTL___kmpc_get_team_static_memory),
1855 1.1 joerg GlobalRecordSizeArg);
1856 1.1 joerg GlobalizedRecords.back().Buffer = StaticGlobalized;
1857 1.1 joerg GlobalizedRecords.back().RecSize = RecSize;
1858 1.1 joerg GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
1859 1.1 joerg GlobalizedRecords.back().Loc = Loc;
1860 1.1 joerg }
1861 1.1 joerg assert(KernelStaticGlobalized && "Global address must be set already.");
1862 1.1 joerg Address FrameAddr = CGF.EmitLoadOfPointer(
1863 1.1 joerg Address(KernelStaticGlobalized, CGM.getPointerAlign()),
1864 1.1 joerg CGM.getContext()
1865 1.1 joerg .getPointerType(CGM.getContext().VoidPtrTy)
1866 1.1 joerg .castAs<PointerType>());
1867 1.1 joerg llvm::Value *GlobalRecValue =
1868 1.1 joerg Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
1869 1.1 joerg I->getSecond().GlobalRecordAddr = GlobalRecValue;
1870 1.1 joerg I->getSecond().IsInSPMDModeFlag = nullptr;
1871 1.1 joerg GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1872 1.1 joerg GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
1873 1.1 joerg } else {
1874 1.1 joerg // TODO: allow the usage of shared memory to be controlled by
1875 1.1 joerg // the user, for now, default to global.
1876 1.1 joerg bool UseSharedMemory =
1877 1.1 joerg IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
1878 1.1 joerg llvm::Value *GlobalRecordSizeArg[] = {
1879 1.1 joerg llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1880 1.1 joerg CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
1881 1.1 joerg llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1882 1.1 joerg OMPBuilder.getOrCreateRuntimeFunction(
1883 1.1 joerg CGM.getModule(),
1884 1.1 joerg IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack
1885 1.1 joerg : OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1886 1.1 joerg GlobalRecordSizeArg);
1887 1.1 joerg GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1888 1.1 joerg GlobalRecValue, GlobalRecPtrTy);
1889 1.1 joerg I->getSecond().GlobalRecordAddr = GlobalRecValue;
1890 1.1 joerg I->getSecond().IsInSPMDModeFlag = nullptr;
1891 1.1 joerg }
1892 1.1 joerg LValue Base =
1893 1.1 joerg CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
1894 1.1 joerg
1895 1.1 joerg // Emit the "global alloca" which is a GEP from the global declaration
1896 1.1 joerg // record using the pointer returned by the runtime.
1897 1.1 joerg LValue SecBase;
1898 1.1 joerg decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
1899 1.1 joerg if (IsTTD) {
1900 1.1 joerg SecIt = I->getSecond().SecondaryLocalVarData->begin();
1901 1.1 joerg llvm::PointerType *SecGlobalRecPtrTy =
1902 1.1 joerg CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
1903 1.1 joerg SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
1904 1.1 joerg Bld.CreatePointerBitCastOrAddrSpaceCast(
1905 1.1 joerg I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
1906 1.1 joerg SecGlobalRecTy);
1907 1.1 joerg }
1908 1.1 joerg for (auto &Rec : I->getSecond().LocalVarData) {
1909 1.1 joerg bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1910 1.1 joerg llvm::Value *ParValue;
1911 1.1 joerg if (EscapedParam) {
1912 1.1 joerg const auto *VD = cast<VarDecl>(Rec.first);
1913 1.1 joerg LValue ParLVal =
1914 1.1 joerg CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1915 1.1 joerg ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1916 1.1 joerg }
1917 1.1 joerg LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
1918 1.1 joerg // Emit VarAddr basing on lane-id if required.
1919 1.1 joerg QualType VarTy;
1920 1.1 joerg if (Rec.second.IsOnePerTeam) {
1921 1.1 joerg VarTy = Rec.second.FD->getType();
1922 1.1 joerg } else {
1923 1.1 joerg Address Addr = VarAddr.getAddress(CGF);
1924 1.1 joerg llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
1925 1.1 joerg Addr.getElementType(), Addr.getPointer(),
1926 1.1 joerg {Bld.getInt32(0), getNVPTXLaneID(CGF)});
1927 1.1 joerg VarTy =
1928 1.1 joerg Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
1929 1.1 joerg VarAddr = CGF.MakeAddrLValue(
1930 1.1 joerg Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
1931 1.1 joerg AlignmentSource::Decl);
1932 1.1 joerg }
1933 1.1 joerg Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1934 1.1 joerg if (!IsInTTDRegion &&
1935 1.1 joerg (WithSPMDCheck ||
1936 1.1 joerg getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
1937 1.1 joerg assert(I->getSecond().IsInSPMDModeFlag &&
1938 1.1 joerg "Expected unknown execution mode or required SPMD check.");
1939 1.1 joerg if (IsTTD) {
1940 1.1 joerg assert(SecIt->second.IsOnePerTeam &&
1941 1.1 joerg "Secondary glob data must be one per team.");
1942 1.1 joerg LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
1943 1.1 joerg VarAddr.setAddress(
1944 1.1 joerg Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
1945 1.1 joerg VarAddr.getPointer(CGF)),
1946 1.1 joerg VarAddr.getAlignment()));
1947 1.1 joerg Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1948 1.1 joerg }
1949 1.1 joerg Address GlobalPtr = Rec.second.PrivateAddr;
1950 1.1 joerg Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
1951 1.1 joerg Rec.second.PrivateAddr = Address(
1952 1.1 joerg Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
1953 1.1 joerg LocalAddr.getPointer(), GlobalPtr.getPointer()),
1954 1.1 joerg LocalAddr.getAlignment());
1955 1.1 joerg }
1956 1.1 joerg if (EscapedParam) {
1957 1.1 joerg const auto *VD = cast<VarDecl>(Rec.first);
1958 1.1 joerg CGF.EmitStoreOfScalar(ParValue, VarAddr);
1959 1.1 joerg I->getSecond().MappedParams->setVarAddr(CGF, VD,
1960 1.1 joerg VarAddr.getAddress(CGF));
1961 1.1 joerg }
1962 1.1 joerg if (IsTTD)
1963 1.1 joerg ++SecIt;
1964 1.1 joerg }
1965 1.1 joerg }
1966 1.1 joerg for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
1967 1.1 joerg // Recover pointer to this function's global record. The runtime will
1968 1.1 joerg // handle the specifics of the allocation of the memory.
1969 1.1 joerg // Use actual memory size of the record including the padding
1970 1.1 joerg // for alignment purposes.
1971 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
1972 1.1 joerg llvm::Value *Size = CGF.getTypeSize(VD->getType());
1973 1.1 joerg CharUnits Align = CGM.getContext().getDeclAlign(VD);
1974 1.1 joerg Size = Bld.CreateNUWAdd(
1975 1.1 joerg Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1976 1.1 joerg llvm::Value *AlignVal =
1977 1.1 joerg llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1978 1.1 joerg Size = Bld.CreateUDiv(Size, AlignVal);
1979 1.1 joerg Size = Bld.CreateNUWMul(Size, AlignVal);
1980 1.1 joerg // TODO: allow the usage of shared memory to be controlled by
1981 1.1 joerg // the user, for now, default to global.
1982 1.1 joerg llvm::Value *GlobalRecordSizeArg[] = {
1983 1.1 joerg Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1984 1.1 joerg llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1985 1.1 joerg OMPBuilder.getOrCreateRuntimeFunction(
1986 1.1 joerg CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1987 1.1 joerg GlobalRecordSizeArg);
1988 1.1 joerg llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1989 1.1 joerg GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
1990 1.1 joerg LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
1991 1.1 joerg CGM.getContext().getDeclAlign(VD),
1992 1.1 joerg AlignmentSource::Decl);
1993 1.1 joerg I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1994 1.1 joerg Base.getAddress(CGF));
1995 1.1 joerg I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
1996 1.1 joerg }
1997 1.1 joerg I->getSecond().MappedParams->apply(CGF);
1998 1.1 joerg }
1999 1.1 joerg
/// Tear down the globalized-variable state built by the matching prolog:
/// restore remapped parameter addresses and pop any data-sharing stack
/// allocations, in reverse order of their creation.
///
/// \param WithSPMDCheck If true, the execution mode is not statically known
///        and a runtime branch on the recorded SPMD-mode flag is emitted.
void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
                                               bool WithSPMDCheck) {
  // Nothing to clean up unless we are in generic data-sharing mode or
  // running in SPMD execution mode.
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    // Restore the original parameter addresses before releasing memory.
    I->getSecond().MappedParams->restore(CGF);
    if (!CGF.HaveInsertPoint())
      return;
    // Pop the variable-length allocations in reverse (stack) order.
    for (llvm::Value *Addr :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(
          OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
          Addr);
    }
    if (I->getSecond().GlobalRecordAddr) {
      if (!IsInTTDRegion &&
          (WithSPMDCheck ||
           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
        // Execution mode is only known at runtime: branch on the recorded
        // SPMD-mode flag and pop the stack only on the non-SPMD path.
        CGBuilderTy &Bld = CGF.Builder;
        llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
        llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
        Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
        // There is no need to emit line number for unconditional branch.
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(NonSPMDBB);
        CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(
                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
        CGF.EmitBlock(ExitBB);
      } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
        assert(GlobalizedRecords.back().RegionCounter > 0 &&
               "region counter must be > 0.");
        --GlobalizedRecords.back().RegionCounter;
        // Emit the restore function only in the target region.
        if (GlobalizedRecords.back().RegionCounter == 0) {
          QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
              /*DestWidth=*/16, /*Signed=*/0);
          // Reload the flag recorded in UseSharedMemory that tells the
          // runtime where the team-static memory was placed.
          llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
              Address(GlobalizedRecords.back().UseSharedMemory,
                      CGM.getContext().getTypeAlignInChars(Int16Ty)),
              /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
          llvm::Value *Args[] = {
              llvm::ConstantInt::get(
                  CGM.Int16Ty,
                  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
              IsInSharedMemory};
          CGF.EmitRuntimeCall(
              OMPBuilder.getOrCreateRuntimeFunction(
                  CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory),
              Args);
        }
      } else {
        // Mode is statically known: unconditionally pop the global record.
        CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(
                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
            I->getSecond().GlobalRecordAddr);
      }
    }
  }
}
2065 1.1 joerg
2066 1.1 joerg void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
2067 1.1 joerg const OMPExecutableDirective &D,
2068 1.1 joerg SourceLocation Loc,
2069 1.1 joerg llvm::Function *OutlinedFn,
2070 1.1 joerg ArrayRef<llvm::Value *> CapturedVars) {
2071 1.1 joerg if (!CGF.HaveInsertPoint())
2072 1.1 joerg return;
2073 1.1 joerg
2074 1.1 joerg Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2075 1.1 joerg /*Name=*/".zero.addr");
2076 1.1 joerg CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2077 1.1 joerg llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2078 1.1 joerg OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2079 1.1 joerg OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2080 1.1 joerg OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2081 1.1 joerg emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2082 1.1 joerg }
2083 1.1 joerg
/// Emit a call to the __kmpc_parallel_51 runtime entry point to execute the
/// parallel region described by \p OutlinedFn.
///
/// Captured variables are spilled into a local array of void* that is passed
/// to the runtime; \p IfCond (if present) is evaluated and forwarded as the
/// if-condition argument.
void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
                        IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    // If a wrapper was generated for this outlined function, pass it to the
    // runtime as the function identifier; otherwise pass a null id.
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn) {
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
      // Remember for post-processing in worker loop.
      Work.emplace_back(WFn);
    }
    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    // TODO: Is that needed?
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    // Stack array of void* that carries the addresses of all captures.
    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        // Integer captures are smuggled through the void* slot via inttoptr;
        // pointer captures are simply cast.
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    // Without an if-clause the condition defaults to constant true (1).
    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /* isSigned */ false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    // Arguments for __kmpc_parallel_51: location, thread id, if-condition,
    // two i32 -1 sentinels (NOTE(review): presumably "unspecified"
    // num_threads and proc_bind — confirm against the runtime declaration),
    // outlined function, wrapper id, captures array, and capture count.
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}
2158 1.1 joerg
2159 1.1 joerg void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
2160 1.1 joerg // Always emit simple barriers!
2161 1.1 joerg if (!CGF.HaveInsertPoint())
2162 1.1 joerg return;
2163 1.1 joerg // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2164 1.1 joerg // This function does not use parameters, so we can emit just default values.
2165 1.1 joerg llvm::Value *Args[] = {
2166 1.1 joerg llvm::ConstantPointerNull::get(
2167 1.1 joerg cast<llvm::PointerType>(getIdentTyPointerTy())),
2168 1.1 joerg llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2169 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2170 1.1 joerg CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
2171 1.1 joerg Args);
2172 1.1 joerg }
2173 1.1 joerg
2174 1.1 joerg void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
2175 1.1 joerg SourceLocation Loc,
2176 1.1 joerg OpenMPDirectiveKind Kind, bool,
2177 1.1 joerg bool) {
2178 1.1 joerg // Always emit simple barriers!
2179 1.1 joerg if (!CGF.HaveInsertPoint())
2180 1.1 joerg return;
2181 1.1 joerg // Build call __kmpc_cancel_barrier(loc, thread_id);
2182 1.1 joerg unsigned Flags = getDefaultFlagsForBarriers(Kind);
2183 1.1 joerg llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2184 1.1 joerg getThreadID(CGF, Loc)};
2185 1.1 joerg
2186 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2187 1.1 joerg CGM.getModule(), OMPRTL___kmpc_barrier),
2188 1.1 joerg Args);
2189 1.1 joerg }
2190 1.1 joerg
/// Emit a critical region in which only one thread of the team executes
/// \p CriticalOpGen at a time.
///
/// Lowered as a loop whose counter runs from 0 to the team width: on each
/// iteration the single thread whose team-local id equals the counter runs
/// the body, while all other threads fall through to the synchronisation
/// point, so every thread eventually executes the region exactly once.
void CGOpenMPRuntimeGPU::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Get the mask of active threads in the warp.
  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
  // Fetch team-local id of the thread.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // Get the width of the team.
  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Initialize the counter variable for the loop.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
                        /*isInit=*/true);

  // Block checks if loop counter exceeds upper bound.
  CGF.EmitBlock(LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);

  // Block tests which single thread should execute region, and which threads
  // should go straight to synchronisation point.
  CGF.EmitBlock(TestBB);
  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);

  // Block emits the body of the critical region.
  CGF.EmitBlock(BodyBB);

  // Output the critical statement.
  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
                                      Hint);

  // After the body surrounded by the critical region, the single executing
  // thread will jump to the synchronisation point.
  // Block waits for all threads in current team to finish then increments the
  // counter variable and returns to the loop.
  CGF.EmitBlock(SyncBB);
  // Reconverge active threads in the warp.
  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_syncwarp),
                            Mask);

  // Increment the counter. Note: CounterVal here is the load emitted in
  // TestBB, which dominates SyncBB, so reusing it is valid.
  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
  CGF.EmitBranch(LoopBB);

  // Block that is reached when all threads in the team complete the region.
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
2259 1.1 joerg
2260 1.1 joerg /// Cast value to the specified type.
2261 1.1 joerg static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2262 1.1 joerg QualType ValTy, QualType CastTy,
2263 1.1 joerg SourceLocation Loc) {
2264 1.1 joerg assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2265 1.1 joerg "Cast type must sized.");
2266 1.1 joerg assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2267 1.1 joerg "Val type must sized.");
2268 1.1 joerg llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2269 1.1 joerg if (ValTy == CastTy)
2270 1.1 joerg return Val;
2271 1.1 joerg if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2272 1.1 joerg CGF.getContext().getTypeSizeInChars(CastTy))
2273 1.1 joerg return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2274 1.1 joerg if (CastTy->isIntegerType() && ValTy->isIntegerType())
2275 1.1 joerg return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2276 1.1 joerg CastTy->hasSignedIntegerRepresentation());
2277 1.1 joerg Address CastItem = CGF.CreateMemTemp(CastTy);
2278 1.1 joerg Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2279 1.1 joerg CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2280 1.1 joerg CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
2281 1.1 joerg LValueBaseInfo(AlignmentSource::Type),
2282 1.1 joerg TBAAAccessInfo());
2283 1.1 joerg return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
2284 1.1 joerg LValueBaseInfo(AlignmentSource::Type),
2285 1.1 joerg TBAAAccessInfo());
2286 1.1 joerg }
2287 1.1 joerg
2288 1.1 joerg /// This function creates calls to one of two shuffle functions to copy
2289 1.1 joerg /// variables between lanes in a warp.
2290 1.1 joerg static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2291 1.1 joerg llvm::Value *Elem,
2292 1.1 joerg QualType ElemType,
2293 1.1 joerg llvm::Value *Offset,
2294 1.1 joerg SourceLocation Loc) {
2295 1.1 joerg CodeGenModule &CGM = CGF.CGM;
2296 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
2297 1.1 joerg CGOpenMPRuntimeGPU &RT =
2298 1.1 joerg *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
2299 1.1 joerg llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
2300 1.1 joerg
2301 1.1 joerg CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2302 1.1 joerg assert(Size.getQuantity() <= 8 &&
2303 1.1 joerg "Unsupported bitwidth in shuffle instruction.");
2304 1.1 joerg
2305 1.1 joerg RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
2306 1.1 joerg ? OMPRTL___kmpc_shuffle_int32
2307 1.1 joerg : OMPRTL___kmpc_shuffle_int64;
2308 1.1 joerg
2309 1.1 joerg // Cast all types to 32- or 64-bit values before calling shuffle routines.
2310 1.1 joerg QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2311 1.1 joerg Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2312 1.1 joerg llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2313 1.1 joerg llvm::Value *WarpSize =
2314 1.1 joerg Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2315 1.1 joerg
2316 1.1 joerg llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2317 1.1 joerg OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
2318 1.1 joerg {ElemCast, Offset, WarpSize});
2319 1.1 joerg
2320 1.1 joerg return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2321 1.1 joerg }
2322 1.1 joerg
/// Shuffle the element at \p SrcAddr in from a remote lane (at \p Offset
/// lanes away) and store the result at \p DestAddr.
///
/// Elements larger than 8 bytes are split into progressively smaller
/// integer chunks (8, 4, 2, 1 bytes), each shuffled individually; chunk runs
/// longer than one iteration are emitted as a pointer-walking loop.
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
                            Address DestAddr, QualType ElemType,
                            llvm::Value *Offset, SourceLocation Loc) {
  CGBuilderTy &Bld = CGF.Builder;

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  // Create the loop over the big sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*) Elem + 1;
  // Step = 8;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
  Address ElemPtr = DestAddr;
  Address Ptr = SrcAddr;
  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
    // Skip chunk sizes larger than the bytes that remain.
    if (Size < CharUnits::fromQuantity(IntSize))
      continue;
    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
        /*Signed=*/1);
    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
    ElemPtr =
        Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
    if (Size.getQuantity() / IntSize > 1) {
      // Multiple chunks of this size: emit a loop with PHI-carried source
      // and destination pointers that advance one chunk per iteration and
      // terminate when fewer than IntSize bytes remain before PtrEnd.
      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
      CGF.EmitBlock(PreCondBB);
      llvm::PHINode *PhiSrc =
          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
      llvm::PHINode *PhiDest =
          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
      Ptr = Address(PhiSrc, Ptr.getAlignment());
      ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
          PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
                                   Ptr.getPointer(), CGF.VoidPtrTy));
      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
                       ThenBB, ExitBB);
      CGF.EmitBlock(ThenBB);
      // Shuffle one chunk in from the remote lane and store it.
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
      CGF.EmitBranch(PreCondBB);
      CGF.EmitBlock(ExitBB);
    } else {
      // Exactly one chunk of this size: shuffle and store it straight-line,
      // then advance both pointers past the chunk.
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Ptr = Bld.CreateConstGEP(Ptr, 1);
      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
    }
    // Remaining bytes after handling all whole chunks of this size.
    Size = Size % IntSize;
  }
}
2404 1.1 joerg
namespace {
/// Direction of the element-wise copy performed by emitReductionListCopy.
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};
} // namespace
2419 1.1 joerg
/// Optional parameters for emitReductionListCopy; fields not relevant to
/// the selected CopyAction may be null.
struct CopyOptionsTy {
  // Lane offset passed to the shuffle call for RemoteLaneToThread copies.
  llvm::Value *RemoteLaneOffset;
  // Index multiplied by the element size to locate an element within the
  // scratchpad for ThreadToScratchpad/ScratchpadToThread copies.
  llvm::Value *ScratchpadIndex;
  // Multiplier (with the element size) used to advance the scratchpad base
  // pointer between consecutive elements.
  llvm::Value *ScratchpadWidth;
};
2425 1.1 joerg
/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
///
/// For each private in \p Privates the element is copied from \p SrcBase to
/// \p DestBase according to \p Action: shuffled in from a remote lane,
/// copied on-thread, or moved to/from the scratchpad using the offsets in
/// \p CopyOptions.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {

  CodeGenModule &CGM = CGF.CGM;
  ASTContext &C = CGM.getContext();
  CGBuilderTy &Bld = CGF.Builder;

  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;

  // Iterates, element-by-element, through the source Reduce list and
  // make a copy.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
  for (const Expr *Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;
    // Increment the src or dest pointer to the scratchpad, for each
    // new element.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;

    // Step 1: resolve source and destination addresses (and per-action
    // flags) for this element.
    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr = CGF.EmitLoadOfPointer(
          DestElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());
      break;
    }
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element:
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                                C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadDest = true;
      break;
    }
    case ScratchpadToThread: {
      // Step 1.1: Get the address for the src element in the scratchpad.
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
      break;
    }
    }

    // Regardless of src and dest of copy, we emit the load of src
    // element as this is required in all directions
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
                                               SrcElementAddr.getElementType());

    // Step 2: perform the copy — either via a warp shuffle or a direct
    // load/store keyed on the element's evaluation kind.
    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
                      RemoteLaneOffset, Private->getExprLoc());
    } else {
      switch (CGF.getEvaluationKind(Private->getType())) {
      case TEK_Scalar: {
        llvm::Value *Elem = CGF.EmitLoadOfScalar(
            SrcElementAddr, /*Volatile=*/false, Private->getType(),
            Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
            TBAAAccessInfo());
        // Store the source element value to the dest element address.
        CGF.EmitStoreOfScalar(
            Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
            LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
        break;
      }
      case TEK_Complex: {
        CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getExprLoc());
        CGF.EmitStoreOfComplex(
            Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            /*isInit=*/false);
        break;
      }
      case TEK_Aggregate:
        CGF.EmitAggregateCopy(
            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getType(), AggValueSlot::DoesNotOverlap);
        break;
      }
    }

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
    // address of the next element in scratchpad memory, unless we're currently
    // processing the last one. Memory alignment is also taken care of here.
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr,
          Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));

      // Take care of global memory alignment for performance
      // (round the base pointer up to a GlobalMemoryAlignment multiple).
      ScratchpadBasePtr = Bld.CreateNUWSub(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateUDiv(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateNUWMul(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));

      if (IncrScratchpadDest)
        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
    }

    ++Idx;
  }
}
2623 1.1 joerg
2624 1.1 joerg /// This function emits a helper that gathers Reduce lists from the first
2625 1.1 joerg /// lane of every active warp to lanes in the first warp.
2626 1.1 joerg ///
2627 1.1 joerg /// void inter_warp_copy_func(void* reduce_data, num_warps)
2628 1.1 joerg /// shared smem[warp_size];
2629 1.1 joerg /// For all data entries D in reduce_data:
2630 1.1 joerg /// sync
2631 1.1 joerg /// If (I am the first lane in each warp)
2632 1.1 joerg /// Copy my local D to smem[warp_id]
2633 1.1 joerg /// sync
2634 1.1 joerg /// if (I am the first warp)
2635 1.1 joerg /// Copy smem[thread_id] to my local D
2636 1.1 joerg static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2637 1.1 joerg ArrayRef<const Expr *> Privates,
2638 1.1 joerg QualType ReductionArrayTy,
2639 1.1 joerg SourceLocation Loc) {
2640 1.1 joerg ASTContext &C = CGM.getContext();
2641 1.1 joerg llvm::Module &M = CGM.getModule();
2642 1.1 joerg
2643 1.1 joerg // ReduceList: thread local Reduce list.
2644 1.1 joerg // At the stage of the computation when this function is called, partially
2645 1.1 joerg // aggregated values reside in the first lane of every active warp.
2646 1.1 joerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2647 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other);
2648 1.1 joerg // NumWarps: number of warps active in the parallel region. This could
2649 1.1 joerg // be smaller than 32 (max warps in a CTA) for partial block reduction.
2650 1.1 joerg ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2651 1.1 joerg C.getIntTypeForBitwidth(32, /* Signed */ true),
2652 1.1 joerg ImplicitParamDecl::Other);
2653 1.1 joerg FunctionArgList Args;
2654 1.1 joerg Args.push_back(&ReduceListArg);
2655 1.1 joerg Args.push_back(&NumWarpsArg);
2656 1.1 joerg
2657 1.1 joerg const CGFunctionInfo &CGFI =
2658 1.1 joerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2659 1.1 joerg auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
2660 1.1 joerg llvm::GlobalValue::InternalLinkage,
2661 1.1 joerg "_omp_reduction_inter_warp_copy_func", &M);
2662 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2663 1.1 joerg Fn->setDoesNotRecurse();
2664 1.1 joerg CodeGenFunction CGF(CGM);
2665 1.1 joerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2666 1.1 joerg
2667 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
2668 1.1 joerg
2669 1.1 joerg // This array is used as a medium to transfer, one reduce element at a time,
2670 1.1 joerg // the data from the first lane of every warp to lanes in the first warp
2671 1.1 joerg // in order to perform the final step of a reduction in a parallel region
2672 1.1 joerg // (reduction across warps). The array is placed in NVPTX __shared__ memory
2673 1.1 joerg // for reduced latency, as well as to have a distinct copy for concurrently
2674 1.1 joerg // executing target regions. The array is declared with common linkage so
2675 1.1 joerg // as to be shared across compilation units.
2676 1.1 joerg StringRef TransferMediumName =
2677 1.1 joerg "__openmp_nvptx_data_transfer_temporary_storage";
2678 1.1 joerg llvm::GlobalVariable *TransferMedium =
2679 1.1 joerg M.getGlobalVariable(TransferMediumName);
2680 1.1 joerg unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
2681 1.1 joerg if (!TransferMedium) {
2682 1.1 joerg auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
2683 1.1 joerg unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2684 1.1 joerg TransferMedium = new llvm::GlobalVariable(
2685 1.1 joerg M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
2686 1.1 joerg llvm::UndefValue::get(Ty), TransferMediumName,
2687 1.1 joerg /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2688 1.1 joerg SharedAddressSpace);
2689 1.1 joerg CGM.addCompilerUsedGlobal(TransferMedium);
2690 1.1 joerg }
2691 1.1 joerg
2692 1.1 joerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2693 1.1 joerg // Get the CUDA thread id of the current OpenMP thread on the GPU.
2694 1.1 joerg llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
2695 1.1 joerg // nvptx_lane_id = nvptx_id % warpsize
2696 1.1 joerg llvm::Value *LaneID = getNVPTXLaneID(CGF);
2697 1.1 joerg // nvptx_warp_id = nvptx_id / warpsize
2698 1.1 joerg llvm::Value *WarpID = getNVPTXWarpID(CGF);
2699 1.1 joerg
2700 1.1 joerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2701 1.1 joerg Address LocalReduceList(
2702 1.1 joerg Bld.CreatePointerBitCastOrAddrSpaceCast(
2703 1.1 joerg CGF.EmitLoadOfScalar(
2704 1.1 joerg AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
2705 1.1 joerg LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
2706 1.1 joerg CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2707 1.1 joerg CGF.getPointerAlign());
2708 1.1 joerg
2709 1.1 joerg unsigned Idx = 0;
2710 1.1 joerg for (const Expr *Private : Privates) {
2711 1.1 joerg //
2712 1.1 joerg // Warp master copies reduce element to transfer medium in __shared__
2713 1.1 joerg // memory.
2714 1.1 joerg //
2715 1.1 joerg unsigned RealTySize =
2716 1.1 joerg C.getTypeSizeInChars(Private->getType())
2717 1.1 joerg .alignTo(C.getTypeAlignInChars(Private->getType()))
2718 1.1 joerg .getQuantity();
2719 1.1 joerg for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
2720 1.1 joerg unsigned NumIters = RealTySize / TySize;
2721 1.1 joerg if (NumIters == 0)
2722 1.1 joerg continue;
2723 1.1 joerg QualType CType = C.getIntTypeForBitwidth(
2724 1.1 joerg C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
2725 1.1 joerg llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
2726 1.1 joerg CharUnits Align = CharUnits::fromQuantity(TySize);
2727 1.1 joerg llvm::Value *Cnt = nullptr;
2728 1.1 joerg Address CntAddr = Address::invalid();
2729 1.1 joerg llvm::BasicBlock *PrecondBB = nullptr;
2730 1.1 joerg llvm::BasicBlock *ExitBB = nullptr;
2731 1.1 joerg if (NumIters > 1) {
2732 1.1 joerg CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
2733 1.1 joerg CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
2734 1.1 joerg /*Volatile=*/false, C.IntTy);
2735 1.1 joerg PrecondBB = CGF.createBasicBlock("precond");
2736 1.1 joerg ExitBB = CGF.createBasicBlock("exit");
2737 1.1 joerg llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
2738 1.1 joerg // There is no need to emit line number for unconditional branch.
2739 1.1 joerg (void)ApplyDebugLocation::CreateEmpty(CGF);
2740 1.1 joerg CGF.EmitBlock(PrecondBB);
2741 1.1 joerg Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
2742 1.1 joerg llvm::Value *Cmp =
2743 1.1 joerg Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
2744 1.1 joerg Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
2745 1.1 joerg CGF.EmitBlock(BodyBB);
2746 1.1 joerg }
2747 1.1 joerg // kmpc_barrier.
2748 1.1 joerg CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2749 1.1 joerg /*EmitChecks=*/false,
2750 1.1 joerg /*ForceSimpleCall=*/true);
2751 1.1 joerg llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2752 1.1 joerg llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2753 1.1 joerg llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2754 1.1 joerg
2755 1.1 joerg // if (lane_id == 0)
2756 1.1 joerg llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
2757 1.1 joerg Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2758 1.1 joerg CGF.EmitBlock(ThenBB);
2759 1.1 joerg
2760 1.1 joerg // Reduce element = LocalReduceList[i]
2761 1.1 joerg Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2762 1.1 joerg llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2763 1.1 joerg ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2764 1.1 joerg // elemptr = ((CopyType*)(elemptrptr)) + I
2765 1.1 joerg Address ElemPtr = Address(ElemPtrPtr, Align);
2766 1.1 joerg ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
2767 1.1 joerg if (NumIters > 1) {
2768 1.1 joerg ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
2769 1.1 joerg ElemPtr.getAlignment());
2770 1.1 joerg }
2771 1.1 joerg
2772 1.1 joerg // Get pointer to location in transfer medium.
2773 1.1 joerg // MediumPtr = &medium[warp_id]
2774 1.1 joerg llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2775 1.1 joerg TransferMedium->getValueType(), TransferMedium,
2776 1.1 joerg {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2777 1.1 joerg Address MediumPtr(MediumPtrVal, Align);
2778 1.1 joerg // Casting to actual data type.
2779 1.1 joerg // MediumPtr = (CopyType*)MediumPtrAddr;
2780 1.1 joerg MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
2781 1.1 joerg
2782 1.1 joerg // elem = *elemptr
2783 1.1 joerg //*MediumPtr = elem
2784 1.1 joerg llvm::Value *Elem = CGF.EmitLoadOfScalar(
2785 1.1 joerg ElemPtr, /*Volatile=*/false, CType, Loc,
2786 1.1 joerg LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2787 1.1 joerg // Store the source element value to the dest element address.
2788 1.1 joerg CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
2789 1.1 joerg LValueBaseInfo(AlignmentSource::Type),
2790 1.1 joerg TBAAAccessInfo());
2791 1.1 joerg
2792 1.1 joerg Bld.CreateBr(MergeBB);
2793 1.1 joerg
2794 1.1 joerg CGF.EmitBlock(ElseBB);
2795 1.1 joerg Bld.CreateBr(MergeBB);
2796 1.1 joerg
2797 1.1 joerg CGF.EmitBlock(MergeBB);
2798 1.1 joerg
2799 1.1 joerg // kmpc_barrier.
2800 1.1 joerg CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2801 1.1 joerg /*EmitChecks=*/false,
2802 1.1 joerg /*ForceSimpleCall=*/true);
2803 1.1 joerg
2804 1.1 joerg //
2805 1.1 joerg // Warp 0 copies reduce element from transfer medium.
2806 1.1 joerg //
2807 1.1 joerg llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2808 1.1 joerg llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2809 1.1 joerg llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2810 1.1 joerg
2811 1.1 joerg Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2812 1.1 joerg llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2813 1.1 joerg AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
2814 1.1 joerg
2815 1.1 joerg // Up to 32 threads in warp 0 are active.
2816 1.1 joerg llvm::Value *IsActiveThread =
2817 1.1 joerg Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2818 1.1 joerg Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2819 1.1 joerg
2820 1.1 joerg CGF.EmitBlock(W0ThenBB);
2821 1.1 joerg
2822 1.1 joerg // SrcMediumPtr = &medium[tid]
2823 1.1 joerg llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2824 1.1 joerg TransferMedium->getValueType(), TransferMedium,
2825 1.1 joerg {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2826 1.1 joerg Address SrcMediumPtr(SrcMediumPtrVal, Align);
2827 1.1 joerg // SrcMediumVal = *SrcMediumPtr;
2828 1.1 joerg SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
2829 1.1 joerg
2830 1.1 joerg // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2831 1.1 joerg Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2832 1.1 joerg llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2833 1.1 joerg TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
2834 1.1 joerg Address TargetElemPtr = Address(TargetElemPtrVal, Align);
2835 1.1 joerg TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
2836 1.1 joerg if (NumIters > 1) {
2837 1.1 joerg TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
2838 1.1 joerg TargetElemPtr.getAlignment());
2839 1.1 joerg }
2840 1.1 joerg
2841 1.1 joerg // *TargetElemPtr = SrcMediumVal;
2842 1.1 joerg llvm::Value *SrcMediumValue =
2843 1.1 joerg CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
2844 1.1 joerg CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2845 1.1 joerg CType);
2846 1.1 joerg Bld.CreateBr(W0MergeBB);
2847 1.1 joerg
2848 1.1 joerg CGF.EmitBlock(W0ElseBB);
2849 1.1 joerg Bld.CreateBr(W0MergeBB);
2850 1.1 joerg
2851 1.1 joerg CGF.EmitBlock(W0MergeBB);
2852 1.1 joerg
2853 1.1 joerg if (NumIters > 1) {
2854 1.1 joerg Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
2855 1.1 joerg CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
2856 1.1 joerg CGF.EmitBranch(PrecondBB);
2857 1.1 joerg (void)ApplyDebugLocation::CreateEmpty(CGF);
2858 1.1 joerg CGF.EmitBlock(ExitBB);
2859 1.1 joerg }
2860 1.1 joerg RealTySize %= TySize;
2861 1.1 joerg }
2862 1.1 joerg ++Idx;
2863 1.1 joerg }
2864 1.1 joerg
2865 1.1 joerg CGF.FinishFunction();
2866 1.1 joerg return Fn;
2867 1.1 joerg }
2868 1.1 joerg
2869 1.1 joerg /// Emit a helper that reduces data across two OpenMP threads (lanes)
2870 1.1 joerg /// in the same warp. It uses shuffle instructions to copy over data from
2871 1.1 joerg /// a remote lane's stack. The reduction algorithm performed is specified
2872 1.1 joerg /// by the fourth parameter.
2873 1.1 joerg ///
2874 1.1 joerg /// Algorithm Versions.
2875 1.1 joerg /// Full Warp Reduce (argument value 0):
2876 1.1 joerg /// This algorithm assumes that all 32 lanes are active and gathers
2877 1.1 joerg /// data from these 32 lanes, producing a single resultant value.
2878 1.1 joerg /// Contiguous Partial Warp Reduce (argument value 1):
2879 1.1 joerg /// This algorithm assumes that only a *contiguous* subset of lanes
2880 1.1 joerg /// are active. This happens for the last warp in a parallel region
2881 1.1 joerg /// when the user specified num_threads is not an integer multiple of
2882 1.1 joerg /// 32. This contiguous subset always starts with the zeroth lane.
2883 1.1 joerg /// Partial Warp Reduce (argument value 2):
2884 1.1 joerg /// This algorithm gathers data from any number of lanes at any position.
2885 1.1 joerg /// All reduced values are stored in the lowest possible lane. The set
2886 1.1 joerg /// of problems every algorithm addresses is a super set of those
2887 1.1 joerg /// addressable by algorithms with a lower version number. Overhead
2888 1.1 joerg /// increases as algorithm version increases.
2889 1.1 joerg ///
2890 1.1 joerg /// Terminology
2891 1.1 joerg /// Reduce element:
2892 1.1 joerg /// Reduce element refers to the individual data field with primitive
2893 1.1 joerg /// data types to be combined and reduced across threads.
2894 1.1 joerg /// Reduce list:
2895 1.1 joerg /// Reduce list refers to a collection of local, thread-private
2896 1.1 joerg /// reduce elements.
2897 1.1 joerg /// Remote Reduce list:
2898 1.1 joerg /// Remote Reduce list refers to a collection of remote (relative to
2899 1.1 joerg /// the current thread) reduce elements.
2900 1.1 joerg ///
2901 1.1 joerg /// We distinguish between three states of threads that are important to
2902 1.1 joerg /// the implementation of this function.
2903 1.1 joerg /// Alive threads:
2904 1.1 joerg /// Threads in a warp executing the SIMT instruction, as distinguished from
2905 1.1 joerg /// threads that are inactive due to divergent control flow.
2906 1.1 joerg /// Active threads:
2907 1.1 joerg /// The minimal set of threads that has to be alive upon entry to this
2908 1.1 joerg /// function. The computation is correct iff active threads are alive.
2909 1.1 joerg /// Some threads are alive but they are not active because they do not
2910 1.1 joerg /// contribute to the computation in any useful manner. Turning them off
2911 1.1 joerg /// may introduce control flow overheads without any tangible benefits.
2912 1.1 joerg /// Effective threads:
2913 1.1 joerg /// In order to comply with the argument requirements of the shuffle
2914 1.1 joerg /// function, we must keep all lanes holding data alive. But at most
2915 1.1 joerg /// half of them perform value aggregation; we refer to this half of
2916 1.1 joerg /// threads as effective. The other half is simply handing off their
2917 1.1 joerg /// data.
2918 1.1 joerg ///
2919 1.1 joerg /// Procedure
2920 1.1 joerg /// Value shuffle:
2921 1.1 joerg /// In this step active threads transfer data from higher lane positions
2922 1.1 joerg /// in the warp to lower lane positions, creating Remote Reduce list.
2923 1.1 joerg /// Value aggregation:
2924 1.1 joerg /// In this step, effective threads combine their thread local Reduce list
2925 1.1 joerg /// with Remote Reduce list and store the result in the thread local
2926 1.1 joerg /// Reduce list.
2927 1.1 joerg /// Value copy:
2928 1.1 joerg /// In this step, we deal with the assumption made by algorithm 2
2929 1.1 joerg /// (i.e. contiguity assumption). When we have an odd number of lanes
2930 1.1 joerg /// active, say 2k+1, only k threads will be effective and therefore k
2931 1.1 joerg /// new values will be produced. However, the Reduce list owned by the
2932 1.1 joerg /// (2k+1)th thread is ignored in the value aggregation. Therefore
2933 1.1 joerg /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2934 1.1 joerg /// that the contiguity assumption still holds.
2935 1.1 joerg static llvm::Function *emitShuffleAndReduceFunction(
2936 1.1 joerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2937 1.1 joerg QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2938 1.1 joerg ASTContext &C = CGM.getContext();
2939 1.1 joerg
2940 1.1 joerg // Thread local Reduce list used to host the values of data to be reduced.
2941 1.1 joerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2942 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other);
2943 1.1 joerg // Current lane id; could be logical.
2944 1.1 joerg ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2945 1.1 joerg ImplicitParamDecl::Other);
2946 1.1 joerg // Offset of the remote source lane relative to the current lane.
2947 1.1 joerg ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2948 1.1 joerg C.ShortTy, ImplicitParamDecl::Other);
2949 1.1 joerg // Algorithm version. This is expected to be known at compile time.
2950 1.1 joerg ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2951 1.1 joerg C.ShortTy, ImplicitParamDecl::Other);
2952 1.1 joerg FunctionArgList Args;
2953 1.1 joerg Args.push_back(&ReduceListArg);
2954 1.1 joerg Args.push_back(&LaneIDArg);
2955 1.1 joerg Args.push_back(&RemoteLaneOffsetArg);
2956 1.1 joerg Args.push_back(&AlgoVerArg);
2957 1.1 joerg
2958 1.1 joerg const CGFunctionInfo &CGFI =
2959 1.1 joerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2960 1.1 joerg auto *Fn = llvm::Function::Create(
2961 1.1 joerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2962 1.1 joerg "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2963 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2964 1.1 joerg Fn->setDoesNotRecurse();
2965 1.1 joerg
2966 1.1 joerg CodeGenFunction CGF(CGM);
2967 1.1 joerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2968 1.1 joerg
2969 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
2970 1.1 joerg
2971 1.1 joerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2972 1.1 joerg Address LocalReduceList(
2973 1.1 joerg Bld.CreatePointerBitCastOrAddrSpaceCast(
2974 1.1 joerg CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2975 1.1 joerg C.VoidPtrTy, SourceLocation()),
2976 1.1 joerg CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2977 1.1 joerg CGF.getPointerAlign());
2978 1.1 joerg
2979 1.1 joerg Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2980 1.1 joerg llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2981 1.1 joerg AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2982 1.1 joerg
2983 1.1 joerg Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2984 1.1 joerg llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2985 1.1 joerg AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2986 1.1 joerg
2987 1.1 joerg Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2988 1.1 joerg llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2989 1.1 joerg AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2990 1.1 joerg
2991 1.1 joerg // Create a local thread-private variable to host the Reduce list
2992 1.1 joerg // from a remote lane.
2993 1.1 joerg Address RemoteReduceList =
2994 1.1 joerg CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2995 1.1 joerg
2996 1.1 joerg // This loop iterates through the list of reduce elements and copies,
2997 1.1 joerg // element by element, from a remote lane in the warp to RemoteReduceList,
2998 1.1 joerg // hosted on the thread's stack.
2999 1.1 joerg emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3000 1.1 joerg LocalReduceList, RemoteReduceList,
3001 1.1 joerg {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3002 1.1 joerg /*ScratchpadIndex=*/nullptr,
3003 1.1 joerg /*ScratchpadWidth=*/nullptr});
3004 1.1 joerg
3005 1.1 joerg // The actions to be performed on the Remote Reduce list is dependent
3006 1.1 joerg // on the algorithm version.
3007 1.1 joerg //
3008 1.1 joerg // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3009 1.1 joerg // LaneId % 2 == 0 && Offset > 0):
3010 1.1 joerg // do the reduction value aggregation
3011 1.1 joerg //
3012 1.1 joerg // The thread local variable Reduce list is mutated in place to host the
3013 1.1 joerg // reduced data, which is the aggregated value produced from local and
3014 1.1 joerg // remote lanes.
3015 1.1 joerg //
3016 1.1 joerg // Note that AlgoVer is expected to be a constant integer known at compile
3017 1.1 joerg // time.
3018 1.1 joerg // When AlgoVer==0, the first conjunction evaluates to true, making
3019 1.1 joerg // the entire predicate true during compile time.
3020 1.1 joerg // When AlgoVer==1, the second conjunction has only the second part to be
3021 1.1 joerg // evaluated during runtime. Other conjunctions evaluates to false
3022 1.1 joerg // during compile time.
3023 1.1 joerg // When AlgoVer==2, the third conjunction has only the second part to be
3024 1.1 joerg // evaluated during runtime. Other conjunctions evaluates to false
3025 1.1 joerg // during compile time.
3026 1.1 joerg llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3027 1.1 joerg
3028 1.1 joerg llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3029 1.1 joerg llvm::Value *CondAlgo1 = Bld.CreateAnd(
3030 1.1 joerg Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3031 1.1 joerg
3032 1.1 joerg llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3033 1.1 joerg llvm::Value *CondAlgo2 = Bld.CreateAnd(
3034 1.1 joerg Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3035 1.1 joerg CondAlgo2 = Bld.CreateAnd(
3036 1.1 joerg CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3037 1.1 joerg
3038 1.1 joerg llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3039 1.1 joerg CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3040 1.1 joerg
3041 1.1 joerg llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3042 1.1 joerg llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3043 1.1 joerg llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3044 1.1 joerg Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3045 1.1 joerg
3046 1.1 joerg CGF.EmitBlock(ThenBB);
3047 1.1 joerg // reduce_function(LocalReduceList, RemoteReduceList)
3048 1.1 joerg llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3049 1.1 joerg LocalReduceList.getPointer(), CGF.VoidPtrTy);
3050 1.1 joerg llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3051 1.1 joerg RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3052 1.1 joerg CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3053 1.1 joerg CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3054 1.1 joerg Bld.CreateBr(MergeBB);
3055 1.1 joerg
3056 1.1 joerg CGF.EmitBlock(ElseBB);
3057 1.1 joerg Bld.CreateBr(MergeBB);
3058 1.1 joerg
3059 1.1 joerg CGF.EmitBlock(MergeBB);
3060 1.1 joerg
3061 1.1 joerg // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3062 1.1 joerg // Reduce list.
3063 1.1 joerg Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3064 1.1 joerg llvm::Value *CondCopy = Bld.CreateAnd(
3065 1.1 joerg Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3066 1.1 joerg
3067 1.1 joerg llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3068 1.1 joerg llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3069 1.1 joerg llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3070 1.1 joerg Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3071 1.1 joerg
3072 1.1 joerg CGF.EmitBlock(CpyThenBB);
3073 1.1 joerg emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3074 1.1 joerg RemoteReduceList, LocalReduceList);
3075 1.1 joerg Bld.CreateBr(CpyMergeBB);
3076 1.1 joerg
3077 1.1 joerg CGF.EmitBlock(CpyElseBB);
3078 1.1 joerg Bld.CreateBr(CpyMergeBB);
3079 1.1 joerg
3080 1.1 joerg CGF.EmitBlock(CpyMergeBB);
3081 1.1 joerg
3082 1.1 joerg CGF.FinishFunction();
3083 1.1 joerg return Fn;
3084 1.1 joerg }
3085 1.1 joerg
3086 1.1 joerg /// This function emits a helper that copies all the reduction variables from
3087 1.1 joerg /// the team into the provided global buffer for the reduction variables.
3088 1.1 joerg ///
3089 1.1 joerg /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3090 1.1 joerg /// For all data entries D in reduce_data:
3091 1.1 joerg /// Copy local D to buffer.D[Idx]
3092 1.1 joerg static llvm::Value *emitListToGlobalCopyFunction(
3093 1.1 joerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3094 1.1 joerg QualType ReductionArrayTy, SourceLocation Loc,
3095 1.1 joerg const RecordDecl *TeamReductionRec,
3096 1.1 joerg const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3097 1.1 joerg &VarFieldMap) {
3098 1.1 joerg ASTContext &C = CGM.getContext();
3099 1.1 joerg
3100 1.1 joerg // Buffer: global reduction buffer.
3101 1.1 joerg ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3102 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other);
3103 1.1 joerg // Idx: index of the buffer.
3104 1.1 joerg ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3105 1.1 joerg ImplicitParamDecl::Other);
3106 1.1 joerg // ReduceList: thread local Reduce list.
3107 1.1 joerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3108 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other);
3109 1.1 joerg FunctionArgList Args;
3110 1.1 joerg Args.push_back(&BufferArg);
3111 1.1 joerg Args.push_back(&IdxArg);
3112 1.1 joerg Args.push_back(&ReduceListArg);
3113 1.1 joerg
3114 1.1 joerg const CGFunctionInfo &CGFI =
3115 1.1 joerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3116 1.1 joerg auto *Fn = llvm::Function::Create(
3117 1.1 joerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3118 1.1 joerg "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
3119 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3120 1.1 joerg Fn->setDoesNotRecurse();
3121 1.1 joerg CodeGenFunction CGF(CGM);
3122 1.1 joerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3123 1.1 joerg
3124 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
3125 1.1 joerg
3126 1.1 joerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3127 1.1 joerg Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3128 1.1 joerg Address LocalReduceList(
3129 1.1 joerg Bld.CreatePointerBitCastOrAddrSpaceCast(
3130 1.1 joerg CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3131 1.1 joerg C.VoidPtrTy, Loc),
3132 1.1 joerg CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3133 1.1 joerg CGF.getPointerAlign());
3134 1.1 joerg QualType StaticTy = C.getRecordType(TeamReductionRec);
3135 1.1 joerg llvm::Type *LLVMReductionsBufferTy =
3136 1.1 joerg CGM.getTypes().ConvertTypeForMem(StaticTy);
3137 1.1 joerg llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3138 1.1 joerg CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3139 1.1 joerg LLVMReductionsBufferTy->getPointerTo());
3140 1.1 joerg llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3141 1.1 joerg CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3142 1.1 joerg /*Volatile=*/false, C.IntTy,
3143 1.1 joerg Loc)};
3144 1.1 joerg unsigned Idx = 0;
3145 1.1 joerg for (const Expr *Private : Privates) {
3146 1.1 joerg // Reduce element = LocalReduceList[i]
3147 1.1 joerg Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3148 1.1 joerg llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3149 1.1 joerg ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3150 1.1 joerg // elemptr = ((CopyType*)(elemptrptr)) + I
3151 1.1 joerg ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3152 1.1 joerg ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3153 1.1 joerg Address ElemPtr =
3154 1.1 joerg Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3155 1.1 joerg const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3156 1.1 joerg // Global = Buffer.VD[Idx];
3157 1.1 joerg const FieldDecl *FD = VarFieldMap.lookup(VD);
3158 1.1 joerg LValue GlobLVal = CGF.EmitLValueForField(
3159 1.1 joerg CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3160 1.1 joerg Address GlobAddr = GlobLVal.getAddress(CGF);
3161 1.1 joerg llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3162 1.1 joerg GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3163 1.1 joerg GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
3164 1.1 joerg switch (CGF.getEvaluationKind(Private->getType())) {
3165 1.1 joerg case TEK_Scalar: {
3166 1.1 joerg llvm::Value *V = CGF.EmitLoadOfScalar(
3167 1.1 joerg ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
3168 1.1 joerg LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
3169 1.1 joerg CGF.EmitStoreOfScalar(V, GlobLVal);
3170 1.1 joerg break;
3171 1.1 joerg }
3172 1.1 joerg case TEK_Complex: {
3173 1.1 joerg CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
3174 1.1 joerg CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
3175 1.1 joerg CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
3176 1.1 joerg break;
3177 1.1 joerg }
3178 1.1 joerg case TEK_Aggregate:
3179 1.1 joerg CGF.EmitAggregateCopy(GlobLVal,
3180 1.1 joerg CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3181 1.1 joerg Private->getType(), AggValueSlot::DoesNotOverlap);
3182 1.1 joerg break;
3183 1.1 joerg }
3184 1.1 joerg ++Idx;
3185 1.1 joerg }
3186 1.1 joerg
3187 1.1 joerg CGF.FinishFunction();
3188 1.1 joerg return Fn;
3189 1.1 joerg }
3190 1.1 joerg
3191 1.1 joerg /// This function emits a helper that reduces all the reduction variables from
3192 1.1 joerg /// the team into the provided global buffer for the reduction variables.
3193 1.1 joerg ///
3194 1.1 joerg /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
3195 1.1 joerg /// void *GlobPtrs[];
3196 1.1 joerg /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3197 1.1 joerg /// ...
3198 1.1 joerg /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3199 1.1 joerg /// reduce_function(GlobPtrs, reduce_data);
3200 1.1 joerg static llvm::Value *emitListToGlobalReduceFunction(
3201 1.1 joerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3202 1.1 joerg QualType ReductionArrayTy, SourceLocation Loc,
3203 1.1 joerg const RecordDecl *TeamReductionRec,
3204 1.1 joerg const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3205 1.1 joerg &VarFieldMap,
3206 1.1 joerg llvm::Function *ReduceFn) {
3207 1.1 joerg ASTContext &C = CGM.getContext();
3208 1.1 joerg
3209 1.1 joerg // Buffer: global reduction buffer.
3210 1.1 joerg ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3211 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other);
3212 1.1 joerg // Idx: index of the buffer.
3213 1.1 joerg ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3214 1.1 joerg ImplicitParamDecl::Other);
3215 1.1 joerg // ReduceList: thread local Reduce list.
3216 1.1 joerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3217 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other);
3218 1.1 joerg FunctionArgList Args;
3219 1.1 joerg Args.push_back(&BufferArg);
3220 1.1 joerg Args.push_back(&IdxArg);
3221 1.1 joerg Args.push_back(&ReduceListArg);
3222 1.1 joerg
3223 1.1 joerg const CGFunctionInfo &CGFI =
3224 1.1 joerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3225 1.1 joerg auto *Fn = llvm::Function::Create(
3226 1.1 joerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3227 1.1 joerg "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
3228 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3229 1.1 joerg Fn->setDoesNotRecurse();
3230 1.1 joerg CodeGenFunction CGF(CGM);
3231 1.1 joerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3232 1.1 joerg
3233 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
3234 1.1 joerg
3235 1.1 joerg Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3236 1.1 joerg QualType StaticTy = C.getRecordType(TeamReductionRec);
3237 1.1 joerg llvm::Type *LLVMReductionsBufferTy =
3238 1.1 joerg CGM.getTypes().ConvertTypeForMem(StaticTy);
3239 1.1 joerg llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3240 1.1 joerg CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3241 1.1 joerg LLVMReductionsBufferTy->getPointerTo());
3242 1.1 joerg
3243 1.1 joerg // 1. Build a list of reduction variables.
3244 1.1 joerg // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3245 1.1 joerg Address ReductionList =
3246 1.1 joerg CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3247 1.1 joerg auto IPriv = Privates.begin();
3248 1.1 joerg llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3249 1.1 joerg CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3250 1.1 joerg /*Volatile=*/false, C.IntTy,
3251 1.1 joerg Loc)};
3252 1.1 joerg unsigned Idx = 0;
3253 1.1 joerg for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3254 1.1 joerg Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3255 1.1 joerg // Global = Buffer.VD[Idx];
3256 1.1 joerg const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3257 1.1 joerg const FieldDecl *FD = VarFieldMap.lookup(VD);
3258 1.1 joerg LValue GlobLVal = CGF.EmitLValueForField(
3259 1.1 joerg CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3260 1.1 joerg Address GlobAddr = GlobLVal.getAddress(CGF);
3261 1.1 joerg llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3262 1.1 joerg GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3263 1.1 joerg llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3264 1.1 joerg CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3265 1.1 joerg if ((*IPriv)->getType()->isVariablyModifiedType()) {
3266 1.1 joerg // Store array size.
3267 1.1 joerg ++Idx;
3268 1.1 joerg Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3269 1.1 joerg llvm::Value *Size = CGF.Builder.CreateIntCast(
3270 1.1 joerg CGF.getVLASize(
3271 1.1 joerg CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3272 1.1 joerg .NumElts,
3273 1.1 joerg CGF.SizeTy, /*isSigned=*/false);
3274 1.1 joerg CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3275 1.1 joerg Elem);
3276 1.1 joerg }
3277 1.1 joerg }
3278 1.1 joerg
3279 1.1 joerg // Call reduce_function(GlobalReduceList, ReduceList)
3280 1.1 joerg llvm::Value *GlobalReduceList =
3281 1.1 joerg CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3282 1.1 joerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3283 1.1 joerg llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3284 1.1 joerg AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3285 1.1 joerg CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3286 1.1 joerg CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3287 1.1 joerg CGF.FinishFunction();
3288 1.1 joerg return Fn;
3289 1.1 joerg }
3290 1.1 joerg
3291 1.1 joerg /// This function emits a helper that copies all the reduction variables from
3292 1.1 joerg /// the team into the provided global buffer for the reduction variables.
3293 1.1 joerg ///
3294 1.1 joerg /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3295 1.1 joerg /// For all data entries D in reduce_data:
3296 1.1 joerg /// Copy buffer.D[Idx] to local D;
3297 1.1 joerg static llvm::Value *emitGlobalToListCopyFunction(
3298 1.1 joerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3299 1.1 joerg QualType ReductionArrayTy, SourceLocation Loc,
3300 1.1 joerg const RecordDecl *TeamReductionRec,
3301 1.1 joerg const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3302 1.1 joerg &VarFieldMap) {
3303 1.1 joerg ASTContext &C = CGM.getContext();
3304 1.1 joerg
3305 1.1 joerg // Buffer: global reduction buffer.
3306 1.1 joerg ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3307 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other);
3308 1.1 joerg // Idx: index of the buffer.
3309 1.1 joerg ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3310 1.1 joerg ImplicitParamDecl::Other);
3311 1.1 joerg // ReduceList: thread local Reduce list.
3312 1.1 joerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3313 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other);
3314 1.1 joerg FunctionArgList Args;
3315 1.1 joerg Args.push_back(&BufferArg);
3316 1.1 joerg Args.push_back(&IdxArg);
3317 1.1 joerg Args.push_back(&ReduceListArg);
3318 1.1 joerg
3319 1.1 joerg const CGFunctionInfo &CGFI =
3320 1.1 joerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3321 1.1 joerg auto *Fn = llvm::Function::Create(
3322 1.1 joerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3323 1.1 joerg "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
3324 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3325 1.1 joerg Fn->setDoesNotRecurse();
3326 1.1 joerg CodeGenFunction CGF(CGM);
3327 1.1 joerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3328 1.1 joerg
3329 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
3330 1.1 joerg
3331 1.1 joerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3332 1.1 joerg Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3333 1.1 joerg Address LocalReduceList(
3334 1.1 joerg Bld.CreatePointerBitCastOrAddrSpaceCast(
3335 1.1 joerg CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3336 1.1 joerg C.VoidPtrTy, Loc),
3337 1.1 joerg CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3338 1.1 joerg CGF.getPointerAlign());
3339 1.1 joerg QualType StaticTy = C.getRecordType(TeamReductionRec);
3340 1.1 joerg llvm::Type *LLVMReductionsBufferTy =
3341 1.1 joerg CGM.getTypes().ConvertTypeForMem(StaticTy);
3342 1.1 joerg llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3343 1.1 joerg CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3344 1.1 joerg LLVMReductionsBufferTy->getPointerTo());
3345 1.1 joerg
3346 1.1 joerg llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3347 1.1 joerg CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3348 1.1 joerg /*Volatile=*/false, C.IntTy,
3349 1.1 joerg Loc)};
3350 1.1 joerg unsigned Idx = 0;
3351 1.1 joerg for (const Expr *Private : Privates) {
3352 1.1 joerg // Reduce element = LocalReduceList[i]
3353 1.1 joerg Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3354 1.1 joerg llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3355 1.1 joerg ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3356 1.1 joerg // elemptr = ((CopyType*)(elemptrptr)) + I
3357 1.1 joerg ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3358 1.1 joerg ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3359 1.1 joerg Address ElemPtr =
3360 1.1 joerg Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3361 1.1 joerg const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3362 1.1 joerg // Global = Buffer.VD[Idx];
3363 1.1 joerg const FieldDecl *FD = VarFieldMap.lookup(VD);
3364 1.1 joerg LValue GlobLVal = CGF.EmitLValueForField(
3365 1.1 joerg CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3366 1.1 joerg Address GlobAddr = GlobLVal.getAddress(CGF);
3367 1.1 joerg llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3368 1.1 joerg GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3369 1.1 joerg GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
3370 1.1 joerg switch (CGF.getEvaluationKind(Private->getType())) {
3371 1.1 joerg case TEK_Scalar: {
3372 1.1 joerg llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
3373 1.1 joerg CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
3374 1.1 joerg LValueBaseInfo(AlignmentSource::Type),
3375 1.1 joerg TBAAAccessInfo());
3376 1.1 joerg break;
3377 1.1 joerg }
3378 1.1 joerg case TEK_Complex: {
3379 1.1 joerg CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
3380 1.1 joerg CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3381 1.1 joerg /*isInit=*/false);
3382 1.1 joerg break;
3383 1.1 joerg }
3384 1.1 joerg case TEK_Aggregate:
3385 1.1 joerg CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3386 1.1 joerg GlobLVal, Private->getType(),
3387 1.1 joerg AggValueSlot::DoesNotOverlap);
3388 1.1 joerg break;
3389 1.1 joerg }
3390 1.1 joerg ++Idx;
3391 1.1 joerg }
3392 1.1 joerg
3393 1.1 joerg CGF.FinishFunction();
3394 1.1 joerg return Fn;
3395 1.1 joerg }
3396 1.1 joerg
3397 1.1 joerg /// This function emits a helper that reduces all the reduction variables from
3398 1.1 joerg /// the team into the provided global buffer for the reduction variables.
3399 1.1 joerg ///
3400 1.1 joerg /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
3401 1.1 joerg /// void *GlobPtrs[];
3402 1.1 joerg /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3403 1.1 joerg /// ...
3404 1.1 joerg /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3405 1.1 joerg /// reduce_function(reduce_data, GlobPtrs);
3406 1.1 joerg static llvm::Value *emitGlobalToListReduceFunction(
3407 1.1 joerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3408 1.1 joerg QualType ReductionArrayTy, SourceLocation Loc,
3409 1.1 joerg const RecordDecl *TeamReductionRec,
3410 1.1 joerg const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3411 1.1 joerg &VarFieldMap,
3412 1.1 joerg llvm::Function *ReduceFn) {
3413 1.1 joerg ASTContext &C = CGM.getContext();
3414 1.1 joerg
3415 1.1 joerg // Buffer: global reduction buffer.
3416 1.1 joerg ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3417 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other);
3418 1.1 joerg // Idx: index of the buffer.
3419 1.1 joerg ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3420 1.1 joerg ImplicitParamDecl::Other);
3421 1.1 joerg // ReduceList: thread local Reduce list.
3422 1.1 joerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3423 1.1 joerg C.VoidPtrTy, ImplicitParamDecl::Other);
3424 1.1 joerg FunctionArgList Args;
3425 1.1 joerg Args.push_back(&BufferArg);
3426 1.1 joerg Args.push_back(&IdxArg);
3427 1.1 joerg Args.push_back(&ReduceListArg);
3428 1.1 joerg
3429 1.1 joerg const CGFunctionInfo &CGFI =
3430 1.1 joerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3431 1.1 joerg auto *Fn = llvm::Function::Create(
3432 1.1 joerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3433 1.1 joerg "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
3434 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3435 1.1 joerg Fn->setDoesNotRecurse();
3436 1.1 joerg CodeGenFunction CGF(CGM);
3437 1.1 joerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3438 1.1 joerg
3439 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
3440 1.1 joerg
3441 1.1 joerg Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3442 1.1 joerg QualType StaticTy = C.getRecordType(TeamReductionRec);
3443 1.1 joerg llvm::Type *LLVMReductionsBufferTy =
3444 1.1 joerg CGM.getTypes().ConvertTypeForMem(StaticTy);
3445 1.1 joerg llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3446 1.1 joerg CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3447 1.1 joerg LLVMReductionsBufferTy->getPointerTo());
3448 1.1 joerg
3449 1.1 joerg // 1. Build a list of reduction variables.
3450 1.1 joerg // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3451 1.1 joerg Address ReductionList =
3452 1.1 joerg CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3453 1.1 joerg auto IPriv = Privates.begin();
3454 1.1 joerg llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3455 1.1 joerg CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3456 1.1 joerg /*Volatile=*/false, C.IntTy,
3457 1.1 joerg Loc)};
3458 1.1 joerg unsigned Idx = 0;
3459 1.1 joerg for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3460 1.1 joerg Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3461 1.1 joerg // Global = Buffer.VD[Idx];
3462 1.1 joerg const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3463 1.1 joerg const FieldDecl *FD = VarFieldMap.lookup(VD);
3464 1.1 joerg LValue GlobLVal = CGF.EmitLValueForField(
3465 1.1 joerg CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3466 1.1 joerg Address GlobAddr = GlobLVal.getAddress(CGF);
3467 1.1 joerg llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3468 1.1 joerg GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3469 1.1 joerg llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3470 1.1 joerg CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3471 1.1 joerg if ((*IPriv)->getType()->isVariablyModifiedType()) {
3472 1.1 joerg // Store array size.
3473 1.1 joerg ++Idx;
3474 1.1 joerg Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3475 1.1 joerg llvm::Value *Size = CGF.Builder.CreateIntCast(
3476 1.1 joerg CGF.getVLASize(
3477 1.1 joerg CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3478 1.1 joerg .NumElts,
3479 1.1 joerg CGF.SizeTy, /*isSigned=*/false);
3480 1.1 joerg CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3481 1.1 joerg Elem);
3482 1.1 joerg }
3483 1.1 joerg }
3484 1.1 joerg
3485 1.1 joerg // Call reduce_function(ReduceList, GlobalReduceList)
3486 1.1 joerg llvm::Value *GlobalReduceList =
3487 1.1 joerg CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3488 1.1 joerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3489 1.1 joerg llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3490 1.1 joerg AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3491 1.1 joerg CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3492 1.1 joerg CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
3493 1.1 joerg CGF.FinishFunction();
3494 1.1 joerg return Fn;
3495 1.1 joerg }
3496 1.1 joerg
3497 1.1 joerg ///
3498 1.1 joerg /// Design of OpenMP reductions on the GPU
3499 1.1 joerg ///
3500 1.1 joerg /// Consider a typical OpenMP program with one or more reduction
3501 1.1 joerg /// clauses:
3502 1.1 joerg ///
3503 1.1 joerg /// float foo;
3504 1.1 joerg /// double bar;
3505 1.1 joerg /// #pragma omp target teams distribute parallel for \
3506 1.1 joerg /// reduction(+:foo) reduction(*:bar)
3507 1.1 joerg /// for (int i = 0; i < N; i++) {
3508 1.1 joerg /// foo += A[i]; bar *= B[i];
3509 1.1 joerg /// }
3510 1.1 joerg ///
3511 1.1 joerg /// where 'foo' and 'bar' are reduced across all OpenMP threads in
3512 1.1 joerg /// all teams. In our OpenMP implementation on the NVPTX device an
3513 1.1 joerg /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3514 1.1 joerg /// within a team are mapped to CUDA threads within a threadblock.
3515 1.1 joerg /// Our goal is to efficiently aggregate values across all OpenMP
3516 1.1 joerg /// threads such that:
3517 1.1 joerg ///
3518 1.1 joerg /// - the compiler and runtime are logically concise, and
3519 1.1 joerg /// - the reduction is performed efficiently in a hierarchical
3520 1.1 joerg /// manner as follows: within OpenMP threads in the same warp,
3521 1.1 joerg /// across warps in a threadblock, and finally across teams on
3522 1.1 joerg /// the NVPTX device.
3523 1.1 joerg ///
3524 1.1 joerg /// Introduction to Decoupling
3525 1.1 joerg ///
3526 1.1 joerg /// We would like to decouple the compiler and the runtime so that the
3527 1.1 joerg /// latter is ignorant of the reduction variables (number, data types)
3528 1.1 joerg /// and the reduction operators. This allows a simpler interface
3529 1.1 joerg /// and implementation while still attaining good performance.
3530 1.1 joerg ///
3531 1.1 joerg /// Pseudocode for the aforementioned OpenMP program generated by the
3532 1.1 joerg /// compiler is as follows:
3533 1.1 joerg ///
3534 1.1 joerg /// 1. Create private copies of reduction variables on each OpenMP
3535 1.1 joerg /// thread: 'foo_private', 'bar_private'
3536 1.1 joerg /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3537 1.1 joerg /// to it and writes the result in 'foo_private' and 'bar_private'
3538 1.1 joerg /// respectively.
3539 1.1 joerg /// 3. Call the OpenMP runtime on the GPU to reduce within a team
3540 1.1 joerg /// and store the result on the team master:
3541 1.1 joerg ///
3542 1.1 joerg /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
3543 1.1 joerg /// reduceData, shuffleReduceFn, interWarpCpyFn)
3544 1.1 joerg ///
3545 1.1 joerg /// where:
3546 1.1 joerg /// struct ReduceData {
3547 1.1 joerg /// double *foo;
3548 1.1 joerg /// double *bar;
3549 1.1 joerg /// } reduceData
3550 1.1 joerg /// reduceData.foo = &foo_private
3551 1.1 joerg /// reduceData.bar = &bar_private
3552 1.1 joerg ///
3553 1.1 joerg /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3554 1.1 joerg /// auxiliary functions generated by the compiler that operate on
3555 1.1 joerg /// variables of type 'ReduceData'. They aid the runtime perform
3556 1.1 joerg /// algorithmic steps in a data agnostic manner.
3557 1.1 joerg ///
3558 1.1 joerg /// 'shuffleReduceFn' is a pointer to a function that reduces data
3559 1.1 joerg /// of type 'ReduceData' across two OpenMP threads (lanes) in the
3560 1.1 joerg /// same warp. It takes the following arguments as input:
3561 1.1 joerg ///
3562 1.1 joerg /// a. variable of type 'ReduceData' on the calling lane,
3563 1.1 joerg /// b. its lane_id,
3564 1.1 joerg /// c. an offset relative to the current lane_id to generate a
3565 1.1 joerg /// remote_lane_id. The remote lane contains the second
3566 1.1 joerg /// variable of type 'ReduceData' that is to be reduced.
3567 1.1 joerg /// d. an algorithm version parameter determining which reduction
3568 1.1 joerg /// algorithm to use.
3569 1.1 joerg ///
3570 1.1 joerg /// 'shuffleReduceFn' retrieves data from the remote lane using
3571 1.1 joerg /// efficient GPU shuffle intrinsics and reduces, using the
3572 1.1 joerg /// algorithm specified by the 4th parameter, the two operands
3573 1.1 joerg /// element-wise. The result is written to the first operand.
3574 1.1 joerg ///
3575 1.1 joerg /// Different reduction algorithms are implemented in different
3576 1.1 joerg /// runtime functions, all calling 'shuffleReduceFn' to perform
3577 1.1 joerg /// the essential reduction step. Therefore, based on the 4th
3578 1.1 joerg /// parameter, this function behaves slightly differently to
3579 1.1 joerg /// cooperate with the runtime to ensure correctness under
3580 1.1 joerg /// different circumstances.
3581 1.1 joerg ///
3582 1.1 joerg /// 'InterWarpCpyFn' is a pointer to a function that transfers
3583 1.1 joerg /// reduced variables across warps. It tunnels, through CUDA
3584 1.1 joerg /// shared memory, the thread-private data of type 'ReduceData'
3585 1.1 joerg /// from lane 0 of each warp to a lane in the first warp.
3586 1.1 joerg /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3587 1.1 joerg /// The last team writes the global reduced value to memory.
3588 1.1 joerg ///
3589 1.1 joerg /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3590 1.1 joerg /// reduceData, shuffleReduceFn, interWarpCpyFn,
3591 1.1 joerg /// scratchpadCopyFn, loadAndReduceFn)
3592 1.1 joerg ///
3593 1.1 joerg /// 'scratchpadCopyFn' is a helper that stores reduced
3594 1.1 joerg /// data from the team master to a scratchpad array in
3595 1.1 joerg /// global memory.
3596 1.1 joerg ///
3597 1.1 joerg /// 'loadAndReduceFn' is a helper that loads data from
3598 1.1 joerg /// the scratchpad array and reduces it with the input
3599 1.1 joerg /// operand.
3600 1.1 joerg ///
3601 1.1 joerg /// These compiler generated functions hide address
3602 1.1 joerg /// calculation and alignment information from the runtime.
3603 1.1 joerg /// 5. if ret == 1:
3604 1.1 joerg /// The team master of the last team stores the reduced
3605 1.1 joerg /// result to the globals in memory.
3606 1.1 joerg /// foo += reduceData.foo; bar *= reduceData.bar
3607 1.1 joerg ///
3608 1.1 joerg ///
3609 1.1 joerg /// Warp Reduction Algorithms
3610 1.1 joerg ///
3611 1.1 joerg /// On the warp level, we have three algorithms implemented in the
3612 1.1 joerg /// OpenMP runtime depending on the number of active lanes:
3613 1.1 joerg ///
3614 1.1 joerg /// Full Warp Reduction
3615 1.1 joerg ///
3616 1.1 joerg /// The reduce algorithm within a warp where all lanes are active
3617 1.1 joerg /// is implemented in the runtime as follows:
3618 1.1 joerg ///
3619 1.1 joerg /// full_warp_reduce(void *reduce_data,
3620 1.1 joerg /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3621 1.1 joerg /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3622 1.1 joerg /// ShuffleReduceFn(reduce_data, 0, offset, 0);
3623 1.1 joerg /// }
3624 1.1 joerg ///
3625 1.1 joerg /// The algorithm completes in log(2, WARPSIZE) steps.
3626 1.1 joerg ///
3627 1.1 joerg /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
3628 1.1 joerg /// not used therefore we save instructions by not retrieving lane_id
3629 1.1 joerg /// from the corresponding special registers. The 4th parameter, which
3630 1.1 joerg /// represents the version of the algorithm being used, is set to 0 to
3631 1.1 joerg /// signify full warp reduction.
3632 1.1 joerg ///
3633 1.1 joerg /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3634 1.1 joerg ///
3635 1.1 joerg /// #reduce_elem refers to an element in the local lane's data structure
3636 1.1 joerg /// #remote_elem is retrieved from a remote lane
3637 1.1 joerg /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3638 1.1 joerg /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3639 1.1 joerg ///
3640 1.1 joerg /// Contiguous Partial Warp Reduction
3641 1.1 joerg ///
3642 1.1 joerg /// This reduce algorithm is used within a warp where only the first
3643 1.1 joerg /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3644 1.1 joerg /// number of OpenMP threads in a parallel region is not a multiple of
3645 1.1 joerg /// WARPSIZE. The algorithm is implemented in the runtime as follows:
3646 1.1 joerg ///
3647 1.1 joerg /// void
3648 1.1 joerg /// contiguous_partial_reduce(void *reduce_data,
3649 1.1 joerg /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3650 1.1 joerg /// int size, int lane_id) {
3651 1.1 joerg /// int curr_size;
3652 1.1 joerg /// int offset;
3653 1.1 joerg /// curr_size = size;
3654 1.1 joerg /// mask = curr_size/2;
3655 1.1 joerg /// while (offset>0) {
3656 1.1 joerg /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3657 1.1 joerg /// curr_size = (curr_size+1)/2;
3658 1.1 joerg /// offset = curr_size/2;
3659 1.1 joerg /// }
3660 1.1 joerg /// }
3661 1.1 joerg ///
3662 1.1 joerg /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3663 1.1 joerg ///
3664 1.1 joerg /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3665 1.1 joerg /// if (lane_id < offset)
3666 1.1 joerg /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3667 1.1 joerg /// else
3668 1.1 joerg /// reduce_elem = remote_elem
3669 1.1 joerg ///
3670 1.1 joerg /// This algorithm assumes that the data to be reduced are located in a
3671 1.1 joerg /// contiguous subset of lanes starting from the first. When there is
3672 1.1 joerg /// an odd number of active lanes, the data in the last lane is not
3673 1.1 joerg /// aggregated with any other lane's dat but is instead copied over.
3674 1.1 joerg ///
3675 1.1 joerg /// Dispersed Partial Warp Reduction
3676 1.1 joerg ///
3677 1.1 joerg /// This algorithm is used within a warp when any discontiguous subset of
3678 1.1 joerg /// lanes are active. It is used to implement the reduction operation
3679 1.1 joerg /// across lanes in an OpenMP simd region or in a nested parallel region.
3680 1.1 joerg ///
3681 1.1 joerg /// void
3682 1.1 joerg /// dispersed_partial_reduce(void *reduce_data,
3683 1.1 joerg /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3684 1.1 joerg /// int size, remote_id;
3685 1.1 joerg /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3686 1.1 joerg /// do {
3687 1.1 joerg /// remote_id = next_active_lane_id_right_after_me();
3688 1.1 joerg /// # the above function returns 0 of no active lane
3689 1.1 joerg /// # is present right after the current lane.
3690 1.1 joerg /// size = number_of_active_lanes_in_this_warp();
3691 1.1 joerg /// logical_lane_id /= 2;
3692 1.1 joerg /// ShuffleReduceFn(reduce_data, logical_lane_id,
3693 1.1 joerg /// remote_id-1-threadIdx.x, 2);
3694 1.1 joerg /// } while (logical_lane_id % 2 == 0 && size > 1);
3695 1.1 joerg /// }
3696 1.1 joerg ///
3697 1.1 joerg /// There is no assumption made about the initial state of the reduction.
3698 1.1 joerg /// Any number of lanes (>=1) could be active at any position. The reduction
3699 1.1 joerg /// result is returned in the first active lane.
3700 1.1 joerg ///
3701 1.1 joerg /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3702 1.1 joerg ///
3703 1.1 joerg /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3704 1.1 joerg /// if (lane_id % 2 == 0 && offset > 0)
3705 1.1 joerg /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3706 1.1 joerg /// else
3707 1.1 joerg /// reduce_elem = remote_elem
3708 1.1 joerg ///
3709 1.1 joerg ///
3710 1.1 joerg /// Intra-Team Reduction
3711 1.1 joerg ///
3712 1.1 joerg /// This function, as implemented in the runtime call
3713 1.1 joerg /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
3714 1.1 joerg /// threads in a team. It first reduces within a warp using the
3715 1.1 joerg /// aforementioned algorithms. We then proceed to gather all such
3716 1.1 joerg /// reduced values at the first warp.
3717 1.1 joerg ///
3718 1.1 joerg /// The runtime makes use of the function 'InterWarpCpyFn', which copies
3719 1.1 joerg /// data from each of the "warp master" (zeroth lane of each warp, where
3720 1.1 joerg /// warp-reduced data is held) to the zeroth warp. This step reduces (in
3721 1.1 joerg /// a mathematical sense) the problem of reduction across warp masters in
3722 1.1 joerg /// a block to the problem of warp reduction.
3723 1.1 joerg ///
3724 1.1 joerg ///
3725 1.1 joerg /// Inter-Team Reduction
3726 1.1 joerg ///
3727 1.1 joerg /// Once a team has reduced its data to a single value, it is stored in
3728 1.1 joerg /// a global scratchpad array. Since each team has a distinct slot, this
3729 1.1 joerg /// can be done without locking.
3730 1.1 joerg ///
3731 1.1 joerg /// The last team to write to the scratchpad array proceeds to reduce the
3732 1.1 joerg /// scratchpad array. One or more workers in the last team use the helper
3733 1.1 joerg /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
3734 1.1 joerg /// the k'th worker reduces every k'th element.
3735 1.1 joerg ///
3736 1.1 joerg /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
3737 1.1 joerg /// reduce across workers and compute a globally reduced value.
3738 1.1 joerg ///
3739 1.1 joerg void CGOpenMPRuntimeGPU::emitReduction(
3740 1.1 joerg CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3741 1.1 joerg ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3742 1.1 joerg ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3743 1.1 joerg if (!CGF.HaveInsertPoint())
3744 1.1 joerg return;
3745 1.1 joerg
3746 1.1 joerg bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3747 1.1 joerg #ifndef NDEBUG
3748 1.1 joerg bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3749 1.1 joerg #endif
3750 1.1 joerg
3751 1.1 joerg if (Options.SimpleReduction) {
3752 1.1 joerg assert(!TeamsReduction && !ParallelReduction &&
3753 1.1 joerg "Invalid reduction selection in emitReduction.");
3754 1.1 joerg CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3755 1.1 joerg ReductionOps, Options);
3756 1.1 joerg return;
3757 1.1 joerg }
3758 1.1 joerg
3759 1.1 joerg assert((TeamsReduction || ParallelReduction) &&
3760 1.1 joerg "Invalid reduction selection in emitReduction.");
3761 1.1 joerg
3762 1.1 joerg // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3763 1.1 joerg // RedList, shuffle_reduce_func, interwarp_copy_func);
3764 1.1 joerg // or
3765 1.1 joerg // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3766 1.1 joerg llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3767 1.1 joerg llvm::Value *ThreadId = getThreadID(CGF, Loc);
3768 1.1 joerg
3769 1.1 joerg llvm::Value *Res;
3770 1.1 joerg ASTContext &C = CGM.getContext();
3771 1.1 joerg // 1. Build a list of reduction variables.
3772 1.1 joerg // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3773 1.1 joerg auto Size = RHSExprs.size();
3774 1.1 joerg for (const Expr *E : Privates) {
3775 1.1 joerg if (E->getType()->isVariablyModifiedType())
3776 1.1 joerg // Reserve place for array size.
3777 1.1 joerg ++Size;
3778 1.1 joerg }
3779 1.1 joerg llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3780 1.1 joerg QualType ReductionArrayTy =
3781 1.1 joerg C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
3782 1.1 joerg /*IndexTypeQuals=*/0);
3783 1.1 joerg Address ReductionList =
3784 1.1 joerg CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3785 1.1 joerg auto IPriv = Privates.begin();
3786 1.1 joerg unsigned Idx = 0;
3787 1.1 joerg for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
3788 1.1 joerg Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3789 1.1 joerg CGF.Builder.CreateStore(
3790 1.1 joerg CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3791 1.1 joerg CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
3792 1.1 joerg Elem);
3793 1.1 joerg if ((*IPriv)->getType()->isVariablyModifiedType()) {
3794 1.1 joerg // Store array size.
3795 1.1 joerg ++Idx;
3796 1.1 joerg Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3797 1.1 joerg llvm::Value *Size = CGF.Builder.CreateIntCast(
3798 1.1 joerg CGF.getVLASize(
3799 1.1 joerg CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3800 1.1 joerg .NumElts,
3801 1.1 joerg CGF.SizeTy, /*isSigned=*/false);
3802 1.1 joerg CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3803 1.1 joerg Elem);
3804 1.1 joerg }
3805 1.1 joerg }
3806 1.1 joerg
3807 1.1 joerg llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3808 1.1 joerg ReductionList.getPointer(), CGF.VoidPtrTy);
3809 1.1 joerg llvm::Function *ReductionFn = emitReductionFunction(
3810 1.1 joerg Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
3811 1.1 joerg LHSExprs, RHSExprs, ReductionOps);
3812 1.1 joerg llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3813 1.1 joerg llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3814 1.1 joerg CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3815 1.1 joerg llvm::Value *InterWarpCopyFn =
3816 1.1 joerg emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
3817 1.1 joerg
3818 1.1 joerg if (ParallelReduction) {
3819 1.1 joerg llvm::Value *Args[] = {RTLoc,
3820 1.1 joerg ThreadId,
3821 1.1 joerg CGF.Builder.getInt32(RHSExprs.size()),
3822 1.1 joerg ReductionArrayTySize,
3823 1.1 joerg RL,
3824 1.1 joerg ShuffleAndReduceFn,
3825 1.1 joerg InterWarpCopyFn};
3826 1.1 joerg
3827 1.1 joerg Res = CGF.EmitRuntimeCall(
3828 1.1 joerg OMPBuilder.getOrCreateRuntimeFunction(
3829 1.1 joerg CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
3830 1.1 joerg Args);
3831 1.1 joerg } else {
3832 1.1 joerg assert(TeamsReduction && "expected teams reduction.");
3833 1.1 joerg llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3834 1.1 joerg llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
3835 1.1 joerg int Cnt = 0;
3836 1.1 joerg for (const Expr *DRE : Privates) {
3837 1.1 joerg PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
3838 1.1 joerg ++Cnt;
3839 1.1 joerg }
3840 1.1 joerg const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
3841 1.1 joerg CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
3842 1.1 joerg C.getLangOpts().OpenMPCUDAReductionBufNum);
3843 1.1 joerg TeamsReductions.push_back(TeamReductionRec);
3844 1.1 joerg if (!KernelTeamsReductionPtr) {
3845 1.1 joerg KernelTeamsReductionPtr = new llvm::GlobalVariable(
3846 1.1 joerg CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
3847 1.1 joerg llvm::GlobalValue::InternalLinkage, nullptr,
3848 1.1 joerg "_openmp_teams_reductions_buffer_$_$ptr");
3849 1.1 joerg }
3850 1.1 joerg llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
3851 1.1 joerg Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
3852 1.1 joerg /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
3853 1.1 joerg llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
3854 1.1 joerg CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3855 1.1 joerg llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
3856 1.1 joerg CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3857 1.1 joerg ReductionFn);
3858 1.1 joerg llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
3859 1.1 joerg CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3860 1.1 joerg llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
3861 1.1 joerg CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3862 1.1 joerg ReductionFn);
3863 1.1 joerg
3864 1.1 joerg llvm::Value *Args[] = {
3865 1.1 joerg RTLoc,
3866 1.1 joerg ThreadId,
3867 1.1 joerg GlobalBufferPtr,
3868 1.1 joerg CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
3869 1.1 joerg RL,
3870 1.1 joerg ShuffleAndReduceFn,
3871 1.1 joerg InterWarpCopyFn,
3872 1.1 joerg GlobalToBufferCpyFn,
3873 1.1 joerg GlobalToBufferRedFn,
3874 1.1 joerg BufferToGlobalCpyFn,
3875 1.1 joerg BufferToGlobalRedFn};
3876 1.1 joerg
3877 1.1 joerg Res = CGF.EmitRuntimeCall(
3878 1.1 joerg OMPBuilder.getOrCreateRuntimeFunction(
3879 1.1 joerg CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
3880 1.1 joerg Args);
3881 1.1 joerg }
3882 1.1 joerg
3883 1.1 joerg // 5. Build if (res == 1)
3884 1.1 joerg llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3885 1.1 joerg llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3886 1.1 joerg llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3887 1.1 joerg Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3888 1.1 joerg CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3889 1.1 joerg
3890 1.1 joerg // 6. Build then branch: where we have reduced values in the master
3891 1.1 joerg // thread in each team.
3892 1.1 joerg // __kmpc_end_reduce{_nowait}(<gtid>);
3893 1.1 joerg // break;
3894 1.1 joerg CGF.EmitBlock(ThenBB);
3895 1.1 joerg
3896 1.1 joerg // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3897 1.1 joerg auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3898 1.1 joerg this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3899 1.1 joerg auto IPriv = Privates.begin();
3900 1.1 joerg auto ILHS = LHSExprs.begin();
3901 1.1 joerg auto IRHS = RHSExprs.begin();
3902 1.1 joerg for (const Expr *E : ReductionOps) {
3903 1.1 joerg emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3904 1.1 joerg cast<DeclRefExpr>(*IRHS));
3905 1.1 joerg ++IPriv;
3906 1.1 joerg ++ILHS;
3907 1.1 joerg ++IRHS;
3908 1.1 joerg }
3909 1.1 joerg };
3910 1.1 joerg llvm::Value *EndArgs[] = {ThreadId};
3911 1.1 joerg RegionCodeGenTy RCG(CodeGen);
3912 1.1 joerg NVPTXActionTy Action(
3913 1.1 joerg nullptr, llvm::None,
3914 1.1 joerg OMPBuilder.getOrCreateRuntimeFunction(
3915 1.1 joerg CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3916 1.1 joerg EndArgs);
3917 1.1 joerg RCG.setAction(Action);
3918 1.1 joerg RCG(CGF);
3919 1.1 joerg // There is no need to emit line number for unconditional branch.
3920 1.1 joerg (void)ApplyDebugLocation::CreateEmpty(CGF);
3921 1.1 joerg CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3922 1.1 joerg }
3923 1.1 joerg
3924 1.1 joerg const VarDecl *
3925 1.1 joerg CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3926 1.1 joerg const VarDecl *NativeParam) const {
3927 1.1 joerg if (!NativeParam->getType()->isReferenceType())
3928 1.1 joerg return NativeParam;
3929 1.1 joerg QualType ArgType = NativeParam->getType();
3930 1.1 joerg QualifierCollector QC;
3931 1.1 joerg const Type *NonQualTy = QC.strip(ArgType);
3932 1.1 joerg QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3933 1.1 joerg if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3934 1.1 joerg if (Attr->getCaptureKind() == OMPC_map) {
3935 1.1 joerg PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3936 1.1 joerg LangAS::opencl_global);
3937 1.1 joerg } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
3938 1.1 joerg PointeeTy.isConstant(CGM.getContext())) {
3939 1.1 joerg PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3940 1.1 joerg LangAS::opencl_generic);
3941 1.1 joerg }
3942 1.1 joerg }
3943 1.1 joerg ArgType = CGM.getContext().getPointerType(PointeeTy);
3944 1.1 joerg QC.addRestrict();
3945 1.1 joerg enum { NVPTX_local_addr = 5 };
3946 1.1 joerg QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3947 1.1 joerg ArgType = QC.apply(CGM.getContext(), ArgType);
3948 1.1 joerg if (isa<ImplicitParamDecl>(NativeParam))
3949 1.1 joerg return ImplicitParamDecl::Create(
3950 1.1 joerg CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3951 1.1 joerg NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3952 1.1 joerg return ParmVarDecl::Create(
3953 1.1 joerg CGM.getContext(),
3954 1.1 joerg const_cast<DeclContext *>(NativeParam->getDeclContext()),
3955 1.1 joerg NativeParam->getBeginLoc(), NativeParam->getLocation(),
3956 1.1 joerg NativeParam->getIdentifier(), ArgType,
3957 1.1 joerg /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3958 1.1 joerg }
3959 1.1 joerg
3960 1.1 joerg Address
3961 1.1 joerg CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3962 1.1 joerg const VarDecl *NativeParam,
3963 1.1 joerg const VarDecl *TargetParam) const {
3964 1.1 joerg assert(NativeParam != TargetParam &&
3965 1.1 joerg NativeParam->getType()->isReferenceType() &&
3966 1.1 joerg "Native arg must not be the same as target arg.");
3967 1.1 joerg Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3968 1.1 joerg QualType NativeParamType = NativeParam->getType();
3969 1.1 joerg QualifierCollector QC;
3970 1.1 joerg const Type *NonQualTy = QC.strip(NativeParamType);
3971 1.1 joerg QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3972 1.1 joerg unsigned NativePointeeAddrSpace =
3973 1.1 joerg CGF.getContext().getTargetAddressSpace(NativePointeeTy);
3974 1.1 joerg QualType TargetTy = TargetParam->getType();
3975 1.1 joerg llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3976 1.1 joerg LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3977 1.1 joerg // First cast to generic.
3978 1.1 joerg TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3979 1.1 joerg TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3980 1.1 joerg /*AddrSpace=*/0));
3981 1.1 joerg // Cast from generic to native address space.
3982 1.1 joerg TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3983 1.1 joerg TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3984 1.1 joerg NativePointeeAddrSpace));
3985 1.1 joerg Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3986 1.1 joerg CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3987 1.1 joerg NativeParamType);
3988 1.1 joerg return NativeParamAddr;
3989 1.1 joerg }
3990 1.1 joerg
3991 1.1 joerg void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3992 1.1 joerg CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3993 1.1 joerg ArrayRef<llvm::Value *> Args) const {
3994 1.1 joerg SmallVector<llvm::Value *, 4> TargetArgs;
3995 1.1 joerg TargetArgs.reserve(Args.size());
3996 1.1 joerg auto *FnType = OutlinedFn.getFunctionType();
3997 1.1 joerg for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3998 1.1 joerg if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3999 1.1 joerg TargetArgs.append(std::next(Args.begin(), I), Args.end());
4000 1.1 joerg break;
4001 1.1 joerg }
4002 1.1 joerg llvm::Type *TargetType = FnType->getParamType(I);
4003 1.1 joerg llvm::Value *NativeArg = Args[I];
4004 1.1 joerg if (!TargetType->isPointerTy()) {
4005 1.1 joerg TargetArgs.emplace_back(NativeArg);
4006 1.1 joerg continue;
4007 1.1 joerg }
4008 1.1 joerg llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4009 1.1 joerg NativeArg,
4010 1.1 joerg NativeArg->getType()->getPointerElementType()->getPointerTo());
4011 1.1 joerg TargetArgs.emplace_back(
4012 1.1 joerg CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4013 1.1 joerg }
4014 1.1 joerg CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4015 1.1 joerg }
4016 1.1 joerg
4017 1.1 joerg /// Emit function which wraps the outline parallel region
4018 1.1 joerg /// and controls the arguments which are passed to this function.
4019 1.1 joerg /// The wrapper ensures that the outlined function is called
4020 1.1 joerg /// with the correct arguments when data is shared.
4021 1.1 joerg llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
4022 1.1 joerg llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4023 1.1 joerg ASTContext &Ctx = CGM.getContext();
4024 1.1 joerg const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4025 1.1 joerg
4026 1.1 joerg // Create a function that takes as argument the source thread.
4027 1.1 joerg FunctionArgList WrapperArgs;
4028 1.1 joerg QualType Int16QTy =
4029 1.1 joerg Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4030 1.1 joerg QualType Int32QTy =
4031 1.1 joerg Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4032 1.1 joerg ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4033 1.1 joerg /*Id=*/nullptr, Int16QTy,
4034 1.1 joerg ImplicitParamDecl::Other);
4035 1.1 joerg ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4036 1.1 joerg /*Id=*/nullptr, Int32QTy,
4037 1.1 joerg ImplicitParamDecl::Other);
4038 1.1 joerg WrapperArgs.emplace_back(&ParallelLevelArg);
4039 1.1 joerg WrapperArgs.emplace_back(&WrapperArg);
4040 1.1 joerg
4041 1.1 joerg const CGFunctionInfo &CGFI =
4042 1.1 joerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4043 1.1 joerg
4044 1.1 joerg auto *Fn = llvm::Function::Create(
4045 1.1 joerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4046 1.1 joerg Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4047 1.1 joerg
4048 1.1 joerg // Ensure we do not inline the function. This is trivially true for the ones
4049 1.1 joerg // passed to __kmpc_fork_call but the ones calles in serialized regions
4050 1.1 joerg // could be inlined. This is not a perfect but it is closer to the invariant
4051 1.1 joerg // we want, namely, every data environment starts with a new function.
4052 1.1 joerg // TODO: We should pass the if condition to the runtime function and do the
4053 1.1 joerg // handling there. Much cleaner code.
4054 1.1 joerg Fn->addFnAttr(llvm::Attribute::NoInline);
4055 1.1 joerg
4056 1.1 joerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4057 1.1 joerg Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4058 1.1 joerg Fn->setDoesNotRecurse();
4059 1.1 joerg
4060 1.1 joerg CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4061 1.1 joerg CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4062 1.1 joerg D.getBeginLoc(), D.getBeginLoc());
4063 1.1 joerg
4064 1.1 joerg const auto *RD = CS.getCapturedRecordDecl();
4065 1.1 joerg auto CurField = RD->field_begin();
4066 1.1 joerg
4067 1.1 joerg Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
4068 1.1 joerg /*Name=*/".zero.addr");
4069 1.1 joerg CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4070 1.1 joerg // Get the array of arguments.
4071 1.1 joerg SmallVector<llvm::Value *, 8> Args;
4072 1.1 joerg
4073 1.1 joerg Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4074 1.1 joerg Args.emplace_back(ZeroAddr.getPointer());
4075 1.1 joerg
4076 1.1 joerg CGBuilderTy &Bld = CGF.Builder;
4077 1.1 joerg auto CI = CS.capture_begin();
4078 1.1 joerg
4079 1.1 joerg // Use global memory for data sharing.
4080 1.1 joerg // Handle passing of global args to workers.
4081 1.1 joerg Address GlobalArgs =
4082 1.1 joerg CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4083 1.1 joerg llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4084 1.1 joerg llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4085 1.1 joerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
4086 1.1 joerg CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
4087 1.1 joerg DataSharingArgs);
4088 1.1 joerg
4089 1.1 joerg // Retrieve the shared variables from the list of references returned
4090 1.1 joerg // by the runtime. Pass the variables to the outlined function.
4091 1.1 joerg Address SharedArgListAddress = Address::invalid();
4092 1.1 joerg if (CS.capture_size() > 0 ||
4093 1.1 joerg isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4094 1.1 joerg SharedArgListAddress = CGF.EmitLoadOfPointer(
4095 1.1 joerg GlobalArgs, CGF.getContext()
4096 1.1 joerg .getPointerType(CGF.getContext().getPointerType(
4097 1.1 joerg CGF.getContext().VoidPtrTy))
4098 1.1 joerg .castAs<PointerType>());
4099 1.1 joerg }
4100 1.1 joerg unsigned Idx = 0;
4101 1.1 joerg if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4102 1.1 joerg Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4103 1.1 joerg Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4104 1.1 joerg Src, CGF.SizeTy->getPointerTo());
4105 1.1 joerg llvm::Value *LB = CGF.EmitLoadOfScalar(
4106 1.1 joerg TypedAddress,
4107 1.1 joerg /*Volatile=*/false,
4108 1.1 joerg CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4109 1.1 joerg cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4110 1.1 joerg Args.emplace_back(LB);
4111 1.1 joerg ++Idx;
4112 1.1 joerg Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4113 1.1 joerg TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4114 1.1 joerg Src, CGF.SizeTy->getPointerTo());
4115 1.1 joerg llvm::Value *UB = CGF.EmitLoadOfScalar(
4116 1.1 joerg TypedAddress,
4117 1.1 joerg /*Volatile=*/false,
4118 1.1 joerg CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4119 1.1 joerg cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4120 1.1 joerg Args.emplace_back(UB);
4121 1.1 joerg ++Idx;
4122 1.1 joerg }
4123 1.1 joerg if (CS.capture_size() > 0) {
4124 1.1 joerg ASTContext &CGFContext = CGF.getContext();
4125 1.1 joerg for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4126 1.1 joerg QualType ElemTy = CurField->getType();
4127 1.1 joerg Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
4128 1.1 joerg Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4129 1.1 joerg Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4130 1.1 joerg llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4131 1.1 joerg /*Volatile=*/false,
4132 1.1 joerg CGFContext.getPointerType(ElemTy),
4133 1.1 joerg CI->getLocation());
4134 1.1 joerg if (CI->capturesVariableByCopy() &&
4135 1.1 joerg !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4136 1.1 joerg Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4137 1.1 joerg CI->getLocation());
4138 1.1 joerg }
4139 1.1 joerg Args.emplace_back(Arg);
4140 1.1 joerg }
4141 1.1 joerg }
4142 1.1 joerg
4143 1.1 joerg emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4144 1.1 joerg CGF.FinishFunction();
4145 1.1 joerg return Fn;
4146 1.1 joerg }
4147 1.1 joerg
4148 1.1 joerg void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
4149 1.1 joerg const Decl *D) {
4150 1.1 joerg if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
4151 1.1 joerg return;
4152 1.1 joerg
4153 1.1 joerg assert(D && "Expected function or captured|block decl.");
4154 1.1 joerg assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4155 1.1 joerg "Function is registered already.");
4156 1.1 joerg assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4157 1.1 joerg "Team is set but not processed.");
4158 1.1 joerg const Stmt *Body = nullptr;
4159 1.1 joerg bool NeedToDelayGlobalization = false;
4160 1.1 joerg if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4161 1.1 joerg Body = FD->getBody();
4162 1.1 joerg } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4163 1.1 joerg Body = BD->getBody();
4164 1.1 joerg } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4165 1.1 joerg Body = CD->getBody();
4166 1.1 joerg NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4167 1.1 joerg if (NeedToDelayGlobalization &&
4168 1.1 joerg getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
4169 1.1 joerg return;
4170 1.1 joerg }
4171 1.1 joerg if (!Body)
4172 1.1 joerg return;
4173 1.1 joerg CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4174 1.1 joerg VarChecker.Visit(Body);
4175 1.1 joerg const RecordDecl *GlobalizedVarsRecord =
4176 1.1 joerg VarChecker.getGlobalizedRecord(IsInTTDRegion);
4177 1.1 joerg TeamAndReductions.first = nullptr;
4178 1.1 joerg TeamAndReductions.second.clear();
4179 1.1 joerg ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4180 1.1 joerg VarChecker.getEscapedVariableLengthDecls();
4181 1.1 joerg if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4182 1.1 joerg return;
4183 1.1 joerg auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4184 1.1 joerg I->getSecond().MappedParams =
4185 1.1 joerg std::make_unique<CodeGenFunction::OMPMapVars>();
4186 1.1 joerg I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4187 1.1 joerg I->getSecond().EscapedParameters.insert(
4188 1.1 joerg VarChecker.getEscapedParameters().begin(),
4189 1.1 joerg VarChecker.getEscapedParameters().end());
4190 1.1 joerg I->getSecond().EscapedVariableLengthDecls.append(
4191 1.1 joerg EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4192 1.1 joerg DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4193 1.1 joerg for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4194 1.1 joerg assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4195 1.1 joerg const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4196 1.1 joerg Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4197 1.1 joerg }
4198 1.1 joerg if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4199 1.1 joerg CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4200 1.1 joerg VarChecker.Visit(Body);
4201 1.1 joerg I->getSecond().SecondaryGlobalRecord =
4202 1.1 joerg VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4203 1.1 joerg I->getSecond().SecondaryLocalVarData.emplace();
4204 1.1 joerg DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4205 1.1 joerg for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4206 1.1 joerg assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4207 1.1 joerg const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4208 1.1 joerg Data.insert(
4209 1.1 joerg std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4210 1.1 joerg }
4211 1.1 joerg }
4212 1.1 joerg if (!NeedToDelayGlobalization) {
4213 1.1 joerg emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4214 1.1 joerg struct GlobalizationScope final : EHScopeStack::Cleanup {
4215 1.1 joerg GlobalizationScope() = default;
4216 1.1 joerg
4217 1.1 joerg void Emit(CodeGenFunction &CGF, Flags flags) override {
4218 1.1 joerg static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
4219 1.1 joerg .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4220 1.1 joerg }
4221 1.1 joerg };
4222 1.1 joerg CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4223 1.1 joerg }
4224 1.1 joerg }
4225 1.1 joerg
4226 1.1 joerg Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
4227 1.1 joerg const VarDecl *VD) {
4228 1.1 joerg if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
4229 1.1 joerg const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4230 1.1 joerg auto AS = LangAS::Default;
4231 1.1 joerg switch (A->getAllocatorType()) {
4232 1.1 joerg // Use the default allocator here as by default local vars are
4233 1.1 joerg // threadlocal.
4234 1.1 joerg case OMPAllocateDeclAttr::OMPNullMemAlloc:
4235 1.1 joerg case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4236 1.1 joerg case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4237 1.1 joerg case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4238 1.1 joerg case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4239 1.1 joerg // Follow the user decision - use default allocation.
4240 1.1 joerg return Address::invalid();
4241 1.1 joerg case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4242 1.1 joerg // TODO: implement aupport for user-defined allocators.
4243 1.1 joerg return Address::invalid();
4244 1.1 joerg case OMPAllocateDeclAttr::OMPConstMemAlloc:
4245 1.1 joerg AS = LangAS::cuda_constant;
4246 1.1 joerg break;
4247 1.1 joerg case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4248 1.1 joerg AS = LangAS::cuda_shared;
4249 1.1 joerg break;
4250 1.1 joerg case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4251 1.1 joerg case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4252 1.1 joerg break;
4253 1.1 joerg }
4254 1.1 joerg llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4255 1.1 joerg auto *GV = new llvm::GlobalVariable(
4256 1.1 joerg CGM.getModule(), VarTy, /*isConstant=*/false,
4257 1.1 joerg llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
4258 1.1 joerg VD->getName(),
4259 1.1 joerg /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4260 1.1 joerg CGM.getContext().getTargetAddressSpace(AS));
4261 1.1 joerg CharUnits Align = CGM.getContext().getDeclAlign(VD);
4262 1.1 joerg GV->setAlignment(Align.getAsAlign());
4263 1.1 joerg return Address(
4264 1.1 joerg CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4265 1.1 joerg GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
4266 1.1 joerg VD->getType().getAddressSpace()))),
4267 1.1 joerg Align);
4268 1.1 joerg }
4269 1.1 joerg
4270 1.1 joerg if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
4271 1.1 joerg return Address::invalid();
4272 1.1 joerg
4273 1.1 joerg VD = VD->getCanonicalDecl();
4274 1.1 joerg auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4275 1.1 joerg if (I == FunctionGlobalizedDecls.end())
4276 1.1 joerg return Address::invalid();
4277 1.1 joerg auto VDI = I->getSecond().LocalVarData.find(VD);
4278 1.1 joerg if (VDI != I->getSecond().LocalVarData.end())
4279 1.1 joerg return VDI->second.PrivateAddr;
4280 1.1 joerg if (VD->hasAttrs()) {
4281 1.1 joerg for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4282 1.1 joerg E(VD->attr_end());
4283 1.1 joerg IT != E; ++IT) {
4284 1.1 joerg auto VDI = I->getSecond().LocalVarData.find(
4285 1.1 joerg cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4286 1.1 joerg ->getCanonicalDecl());
4287 1.1 joerg if (VDI != I->getSecond().LocalVarData.end())
4288 1.1 joerg return VDI->second.PrivateAddr;
4289 1.1 joerg }
4290 1.1 joerg }
4291 1.1 joerg
4292 1.1 joerg return Address::invalid();
4293 1.1 joerg }
4294 1.1 joerg
4295 1.1 joerg void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
4296 1.1 joerg FunctionGlobalizedDecls.erase(CGF.CurFn);
4297 1.1 joerg CGOpenMPRuntime::functionFinished(CGF);
4298 1.1 joerg }
4299 1.1 joerg
4300 1.1 joerg void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
4301 1.1 joerg CodeGenFunction &CGF, const OMPLoopDirective &S,
4302 1.1 joerg OpenMPDistScheduleClauseKind &ScheduleKind,
4303 1.1 joerg llvm::Value *&Chunk) const {
4304 1.1 joerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
4305 1.1 joerg if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
4306 1.1 joerg ScheduleKind = OMPC_DIST_SCHEDULE_static;
4307 1.1 joerg Chunk = CGF.EmitScalarConversion(
4308 1.1 joerg RT.getGPUNumThreads(CGF),
4309 1.1 joerg CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4310 1.1 joerg S.getIterationVariable()->getType(), S.getBeginLoc());
4311 1.1 joerg return;
4312 1.1 joerg }
4313 1.1 joerg CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4314 1.1 joerg CGF, S, ScheduleKind, Chunk);
4315 1.1 joerg }
4316 1.1 joerg
4317 1.1 joerg void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
4318 1.1 joerg CodeGenFunction &CGF, const OMPLoopDirective &S,
4319 1.1 joerg OpenMPScheduleClauseKind &ScheduleKind,
4320 1.1 joerg const Expr *&ChunkExpr) const {
4321 1.1 joerg ScheduleKind = OMPC_SCHEDULE_static;
4322 1.1 joerg // Chunk size is 1 in this case.
4323 1.1 joerg llvm::APInt ChunkSize(32, 1);
4324 1.1 joerg ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4325 1.1 joerg CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4326 1.1 joerg SourceLocation());
4327 1.1 joerg }
4328 1.1 joerg
4329 1.1 joerg void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
4330 1.1 joerg CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4331 1.1 joerg assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4332 1.1 joerg " Expected target-based directive.");
4333 1.1 joerg const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4334 1.1 joerg for (const CapturedStmt::Capture &C : CS->captures()) {
4335 1.1 joerg // Capture variables captured by reference in lambdas for target-based
4336 1.1 joerg // directives.
4337 1.1 joerg if (!C.capturesVariable())
4338 1.1 joerg continue;
4339 1.1 joerg const VarDecl *VD = C.getCapturedVar();
4340 1.1 joerg const auto *RD = VD->getType()
4341 1.1 joerg .getCanonicalType()
4342 1.1 joerg .getNonReferenceType()
4343 1.1 joerg ->getAsCXXRecordDecl();
4344 1.1 joerg if (!RD || !RD->isLambda())
4345 1.1 joerg continue;
4346 1.1 joerg Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4347 1.1 joerg LValue VDLVal;
4348 1.1 joerg if (VD->getType().getCanonicalType()->isReferenceType())
4349 1.1 joerg VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4350 1.1 joerg else
4351 1.1 joerg VDLVal = CGF.MakeAddrLValue(
4352 1.1 joerg VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4353 1.1 joerg llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4354 1.1 joerg FieldDecl *ThisCapture = nullptr;
4355 1.1 joerg RD->getCaptureFields(Captures, ThisCapture);
4356 1.1 joerg if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4357 1.1 joerg LValue ThisLVal =
4358 1.1 joerg CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4359 1.1 joerg llvm::Value *CXXThis = CGF.LoadCXXThis();
4360 1.1 joerg CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4361 1.1 joerg }
4362 1.1 joerg for (const LambdaCapture &LC : RD->captures()) {
4363 1.1 joerg if (LC.getCaptureKind() != LCK_ByRef)
4364 1.1 joerg continue;
4365 1.1 joerg const VarDecl *VD = LC.getCapturedVar();
4366 1.1 joerg if (!CS->capturesVariable(VD))
4367 1.1 joerg continue;
4368 1.1 joerg auto It = Captures.find(VD);
4369 1.1 joerg assert(It != Captures.end() && "Found lambda capture without field.");
4370 1.1 joerg LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4371 1.1 joerg Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4372 1.1 joerg if (VD->getType().getCanonicalType()->isReferenceType())
4373 1.1 joerg VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4374 1.1 joerg VD->getType().getCanonicalType())
4375 1.1 joerg .getAddress(CGF);
4376 1.1 joerg CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4377 1.1 joerg }
4378 1.1 joerg }
4379 1.1 joerg }
4380 1.1 joerg
4381 1.1 joerg unsigned CGOpenMPRuntimeGPU::getDefaultFirstprivateAddressSpace() const {
4382 1.1 joerg return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
4383 1.1 joerg }
4384 1.1 joerg
4385 1.1 joerg bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
4386 1.1 joerg LangAS &AS) {
4387 1.1 joerg if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
4388 1.1 joerg return false;
4389 1.1 joerg const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4390 1.1 joerg switch(A->getAllocatorType()) {
4391 1.1 joerg case OMPAllocateDeclAttr::OMPNullMemAlloc:
4392 1.1 joerg case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4393 1.1 joerg // Not supported, fallback to the default mem space.
4394 1.1 joerg case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4395 1.1 joerg case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4396 1.1 joerg case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4397 1.1 joerg case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4398 1.1 joerg case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4399 1.1 joerg AS = LangAS::Default;
4400 1.1 joerg return true;
4401 1.1 joerg case OMPAllocateDeclAttr::OMPConstMemAlloc:
4402 1.1 joerg AS = LangAS::cuda_constant;
4403 1.1 joerg return true;
4404 1.1 joerg case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4405 1.1 joerg AS = LangAS::cuda_shared;
4406 1.1 joerg return true;
4407 1.1 joerg case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4408 1.1 joerg llvm_unreachable("Expected predefined allocator for the variables with the "
4409 1.1 joerg "static storage.");
4410 1.1 joerg }
4411 1.1 joerg return false;
4412 1.1 joerg }
4413 1.1 joerg
4414 1.1 joerg // Get current CudaArch and ignore any unknown values
4415 1.1 joerg static CudaArch getCudaArch(CodeGenModule &CGM) {
4416 1.1 joerg if (!CGM.getTarget().hasFeature("ptx"))
4417 1.1 joerg return CudaArch::UNKNOWN;
4418 1.1 joerg for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
4419 1.1 joerg if (Feature.getValue()) {
4420 1.1 joerg CudaArch Arch = StringToCudaArch(Feature.getKey());
4421 1.1 joerg if (Arch != CudaArch::UNKNOWN)
4422 1.1 joerg return Arch;
4423 1.1 joerg }
4424 1.1 joerg }
4425 1.1 joerg return CudaArch::UNKNOWN;
4426 1.1 joerg }
4427 1.1 joerg
4428 1.1 joerg /// Check to see if target architecture supports unified addressing which is
4429 1.1 joerg /// a restriction for OpenMP requires clause "unified_shared_memory".
4430 1.1 joerg void CGOpenMPRuntimeGPU::processRequiresDirective(
4431 1.1 joerg const OMPRequiresDecl *D) {
4432 1.1 joerg for (const OMPClause *Clause : D->clauselists()) {
4433 1.1 joerg if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4434 1.1 joerg CudaArch Arch = getCudaArch(CGM);
4435 1.1 joerg switch (Arch) {
4436 1.1 joerg case CudaArch::SM_20:
4437 1.1 joerg case CudaArch::SM_21:
4438 1.1 joerg case CudaArch::SM_30:
4439 1.1 joerg case CudaArch::SM_32:
4440 1.1 joerg case CudaArch::SM_35:
4441 1.1 joerg case CudaArch::SM_37:
4442 1.1 joerg case CudaArch::SM_50:
4443 1.1 joerg case CudaArch::SM_52:
4444 1.1 joerg case CudaArch::SM_53: {
4445 1.1 joerg SmallString<256> Buffer;
4446 1.1 joerg llvm::raw_svector_ostream Out(Buffer);
4447 1.1 joerg Out << "Target architecture " << CudaArchToString(Arch)
4448 1.1 joerg << " does not support unified addressing";
4449 1.1 joerg CGM.Error(Clause->getBeginLoc(), Out.str());
4450 1.1 joerg return;
4451 1.1 joerg }
4452 1.1 joerg case CudaArch::SM_60:
4453 1.1 joerg case CudaArch::SM_61:
4454 1.1 joerg case CudaArch::SM_62:
4455 1.1 joerg case CudaArch::SM_70:
4456 1.1 joerg case CudaArch::SM_72:
4457 1.1 joerg case CudaArch::SM_75:
4458 1.1 joerg case CudaArch::SM_80:
4459 1.1 joerg case CudaArch::SM_86:
4460 1.1 joerg case CudaArch::GFX600:
4461 1.1 joerg case CudaArch::GFX601:
4462 1.1 joerg case CudaArch::GFX602:
4463 1.1 joerg case CudaArch::GFX700:
4464 1.1 joerg case CudaArch::GFX701:
4465 1.1 joerg case CudaArch::GFX702:
4466 1.1 joerg case CudaArch::GFX703:
4467 1.1 joerg case CudaArch::GFX704:
4468 1.1 joerg case CudaArch::GFX705:
4469 1.1 joerg case CudaArch::GFX801:
4470 1.1 joerg case CudaArch::GFX802:
4471 1.1 joerg case CudaArch::GFX803:
4472 1.1 joerg case CudaArch::GFX805:
4473 1.1 joerg case CudaArch::GFX810:
4474 1.1 joerg case CudaArch::GFX900:
4475 1.1 joerg case CudaArch::GFX902:
4476 1.1 joerg case CudaArch::GFX904:
4477 1.1 joerg case CudaArch::GFX906:
4478 1.1 joerg case CudaArch::GFX908:
4479 1.1 joerg case CudaArch::GFX909:
4480 1.1 joerg case CudaArch::GFX90a:
4481 1.1 joerg case CudaArch::GFX90c:
4482 1.1 joerg case CudaArch::GFX1010:
4483 1.1 joerg case CudaArch::GFX1011:
4484 1.1 joerg case CudaArch::GFX1012:
4485 1.1 joerg case CudaArch::GFX1030:
4486 1.1 joerg case CudaArch::GFX1031:
4487 1.1 joerg case CudaArch::GFX1032:
4488 1.1 joerg case CudaArch::GFX1033:
4489 1.1 joerg case CudaArch::GFX1034:
4490 1.1 joerg case CudaArch::UNUSED:
4491 1.1 joerg case CudaArch::UNKNOWN:
4492 1.1 joerg break;
4493 1.1 joerg case CudaArch::LAST:
4494 1.1 joerg llvm_unreachable("Unexpected Cuda arch.");
4495 1.1 joerg }
4496 1.1 joerg }
4497 1.1 joerg }
4498 1.1 joerg CGOpenMPRuntime::processRequiresDirective(D);
4499 1.1 joerg }
4500 1.1 joerg
4501 1.1 joerg /// Get number of SMs and number of blocks per SM.
4502 1.1 joerg static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
4503 1.1 joerg std::pair<unsigned, unsigned> Data;
4504 1.1 joerg if (CGM.getLangOpts().OpenMPCUDANumSMs)
4505 1.1 joerg Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
4506 1.1 joerg if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
4507 1.1 joerg Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
4508 1.1 joerg if (Data.first && Data.second)
4509 1.1 joerg return Data;
4510 1.1 joerg switch (getCudaArch(CGM)) {
4511 1.1 joerg case CudaArch::SM_20:
4512 1.1 joerg case CudaArch::SM_21:
4513 1.1 joerg case CudaArch::SM_30:
4514 1.1 joerg case CudaArch::SM_32:
4515 1.1 joerg case CudaArch::SM_35:
4516 1.1 joerg case CudaArch::SM_37:
4517 1.1 joerg case CudaArch::SM_50:
4518 1.1 joerg case CudaArch::SM_52:
4519 1.1 joerg case CudaArch::SM_53:
4520 1.1 joerg return {16, 16};
4521 1.1 joerg case CudaArch::SM_60:
4522 1.1 joerg case CudaArch::SM_61:
4523 1.1 joerg case CudaArch::SM_62:
4524 1.1 joerg return {56, 32};
4525 1.1 joerg case CudaArch::SM_70:
4526 1.1 joerg case CudaArch::SM_72:
4527 1.1 joerg case CudaArch::SM_75:
4528 1.1 joerg case CudaArch::SM_80:
4529 1.1 joerg case CudaArch::SM_86:
4530 1.1 joerg return {84, 32};
4531 1.1 joerg case CudaArch::GFX600:
4532 1.1 joerg case CudaArch::GFX601:
4533 1.1 joerg case CudaArch::GFX602:
4534 1.1 joerg case CudaArch::GFX700:
4535 1.1 joerg case CudaArch::GFX701:
4536 1.1 joerg case CudaArch::GFX702:
4537 1.1 joerg case CudaArch::GFX703:
4538 1.1 joerg case CudaArch::GFX704:
4539 1.1 joerg case CudaArch::GFX705:
4540 1.1 joerg case CudaArch::GFX801:
4541 1.1 joerg case CudaArch::GFX802:
4542 1.1 joerg case CudaArch::GFX803:
4543 1.1 joerg case CudaArch::GFX805:
4544 1.1 joerg case CudaArch::GFX810:
4545 1.1 joerg case CudaArch::GFX900:
4546 1.1 joerg case CudaArch::GFX902:
4547 1.1 joerg case CudaArch::GFX904:
4548 1.1 joerg case CudaArch::GFX906:
4549 1.1 joerg case CudaArch::GFX908:
4550 1.1 joerg case CudaArch::GFX909:
4551 1.1 joerg case CudaArch::GFX90a:
4552 1.1 joerg case CudaArch::GFX90c:
4553 1.1 joerg case CudaArch::GFX1010:
4554 1.1 joerg case CudaArch::GFX1011:
4555 1.1 joerg case CudaArch::GFX1012:
4556 1.1 joerg case CudaArch::GFX1030:
4557 1.1 joerg case CudaArch::GFX1031:
4558 1.1 joerg case CudaArch::GFX1032:
4559 1.1 joerg case CudaArch::GFX1033:
4560 1.1 joerg case CudaArch::GFX1034:
4561 1.1 joerg case CudaArch::UNUSED:
4562 1.1 joerg case CudaArch::UNKNOWN:
4563 1.1 joerg break;
4564 1.1 joerg case CudaArch::LAST:
4565 1.1 joerg llvm_unreachable("Unexpected Cuda arch.");
4566 1.1 joerg }
4567 1.1 joerg llvm_unreachable("Unexpected NVPTX target without ptx feature.");
4568 1.1 joerg }
4569 1.1 joerg
4570 1.1 joerg void CGOpenMPRuntimeGPU::clear() {
4571 1.1 joerg if (!GlobalizedRecords.empty() &&
4572 1.1 joerg !CGM.getLangOpts().OpenMPCUDATargetParallel) {
4573 1.1 joerg ASTContext &C = CGM.getContext();
4574 1.1 joerg llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
4575 1.1 joerg llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
4576 1.1 joerg RecordDecl *StaticRD = C.buildImplicitRecord(
4577 1.1 joerg "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4578 1.1 joerg StaticRD->startDefinition();
4579 1.1 joerg RecordDecl *SharedStaticRD = C.buildImplicitRecord(
4580 1.1 joerg "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4581 1.1 joerg SharedStaticRD->startDefinition();
4582 1.1 joerg for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
4583 1.1 joerg if (Records.Records.empty())
4584 1.1 joerg continue;
4585 1.1 joerg unsigned Size = 0;
4586 1.1 joerg unsigned RecAlignment = 0;
4587 1.1 joerg for (const RecordDecl *RD : Records.Records) {
4588 1.1 joerg QualType RDTy = C.getRecordType(RD);
4589 1.1 joerg unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
4590 1.1 joerg RecAlignment = std::max(RecAlignment, Alignment);
4591 1.1 joerg unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
4592 1.1 joerg Size =
4593 1.1 joerg llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
4594 1.1 joerg }
4595 1.1 joerg Size = llvm::alignTo(Size, RecAlignment);
4596 1.1 joerg llvm::APInt ArySize(/*numBits=*/64, Size);
4597 1.1 joerg QualType SubTy = C.getConstantArrayType(
4598 1.1 joerg C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
4599 1.1 joerg const bool UseSharedMemory = Size <= SharedMemorySize;
4600 1.1 joerg auto *Field =
4601 1.1 joerg FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
4602 1.1 joerg SourceLocation(), SourceLocation(), nullptr, SubTy,
4603 1.1 joerg C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
4604 1.1 joerg /*BW=*/nullptr, /*Mutable=*/false,
4605 1.1 joerg /*InitStyle=*/ICIS_NoInit);
4606 1.1 joerg Field->setAccess(AS_public);
4607 1.1 joerg if (UseSharedMemory) {
4608 1.1 joerg SharedStaticRD->addDecl(Field);
4609 1.1 joerg SharedRecs.push_back(&Records);
4610 1.1 joerg } else {
4611 1.1 joerg StaticRD->addDecl(Field);
4612 1.1 joerg GlobalRecs.push_back(&Records);
4613 1.1 joerg }
4614 1.1 joerg Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
4615 1.1 joerg Records.UseSharedMemory->setInitializer(
4616 1.1 joerg llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
4617 1.1 joerg }
4618 1.1 joerg // Allocate SharedMemorySize buffer for the shared memory.
4619 1.1 joerg // FIXME: nvlink does not handle weak linkage correctly (object with the
4620 1.1 joerg // different size are reported as erroneous).
4621 1.1 joerg // Restore this code as sson as nvlink is fixed.
4622 1.1 joerg if (!SharedStaticRD->field_empty()) {
4623 1.1 joerg llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
4624 1.1 joerg QualType SubTy = C.getConstantArrayType(
4625 1.1 joerg C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
4626 1.1 joerg auto *Field = FieldDecl::Create(
4627 1.1 joerg C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
4628 1.1 joerg C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
4629 1.1 joerg /*BW=*/nullptr, /*Mutable=*/false,
4630 1.1 joerg /*InitStyle=*/ICIS_NoInit);
4631 1.1 joerg Field->setAccess(AS_public);
4632 1.1 joerg SharedStaticRD->addDecl(Field);
4633 1.1 joerg }
4634 1.1 joerg SharedStaticRD->completeDefinition();
4635 1.1 joerg if (!SharedStaticRD->field_empty()) {
4636 1.1 joerg QualType StaticTy = C.getRecordType(SharedStaticRD);
4637 1.1 joerg llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
4638 1.1 joerg auto *GV = new llvm::GlobalVariable(
4639 1.1 joerg CGM.getModule(), LLVMStaticTy,
4640 1.1 joerg /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
4641 1.1 joerg llvm::UndefValue::get(LLVMStaticTy),
4642 1.1 joerg "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
4643 1.1 joerg llvm::GlobalValue::NotThreadLocal,
4644 1.1 joerg C.getTargetAddressSpace(LangAS::cuda_shared));
4645 1.1 joerg auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
4646 1.1 joerg GV, CGM.VoidPtrTy);
4647 1.1 joerg for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
4648 1.1 joerg Rec->Buffer->replaceAllUsesWith(Replacement);
4649 1.1 joerg Rec->Buffer->eraseFromParent();
4650 1.1 joerg }
4651 1.1 joerg }
4652 1.1 joerg StaticRD->completeDefinition();
4653 1.1 joerg if (!StaticRD->field_empty()) {
4654 1.1 joerg QualType StaticTy = C.getRecordType(StaticRD);
4655 1.1 joerg std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
4656 1.1 joerg llvm::APInt Size1(32, SMsBlockPerSM.second);
4657 1.1 joerg QualType Arr1Ty =
4658 1.1 joerg C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
4659 1.1 joerg /*IndexTypeQuals=*/0);
4660 1.1 joerg llvm::APInt Size2(32, SMsBlockPerSM.first);
4661 1.1 joerg QualType Arr2Ty =
4662 1.1 joerg C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
4663 1.1 joerg /*IndexTypeQuals=*/0);
4664 1.1 joerg llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
4665 1.1 joerg // FIXME: nvlink does not handle weak linkage correctly (object with the
4666 1.1 joerg // different size are reported as erroneous).
4667 1.1 joerg // Restore CommonLinkage as soon as nvlink is fixed.
4668 1.1 joerg auto *GV = new llvm::GlobalVariable(
4669 1.1 joerg CGM.getModule(), LLVMArr2Ty,
4670 1.1 joerg /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
4671 1.1 joerg llvm::Constant::getNullValue(LLVMArr2Ty),
4672 1.1 joerg "_openmp_static_glob_rd_$_");
4673 1.1 joerg auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
4674 1.1 joerg GV, CGM.VoidPtrTy);
4675 1.1 joerg for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
4676 1.1 joerg Rec->Buffer->replaceAllUsesWith(Replacement);
4677 1.1 joerg Rec->Buffer->eraseFromParent();
4678 1.1 joerg }
4679 1.1 joerg }
4680 1.1 joerg }
4681 1.1 joerg if (!TeamsReductions.empty()) {
4682 1.1 joerg ASTContext &C = CGM.getContext();
4683 1.1 joerg RecordDecl *StaticRD = C.buildImplicitRecord(
4684 1.1 joerg "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
4685 1.1 joerg StaticRD->startDefinition();
4686 1.1 joerg for (const RecordDecl *TeamReductionRec : TeamsReductions) {
4687 1.1 joerg QualType RecTy = C.getRecordType(TeamReductionRec);
4688 1.1 joerg auto *Field = FieldDecl::Create(
4689 1.1 joerg C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
4690 1.1 joerg C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
4691 1.1 joerg /*BW=*/nullptr, /*Mutable=*/false,
4692 1.1 joerg /*InitStyle=*/ICIS_NoInit);
4693 1.1 joerg Field->setAccess(AS_public);
4694 1.1 joerg StaticRD->addDecl(Field);
4695 1.1 joerg }
4696 1.1 joerg StaticRD->completeDefinition();
4697 1.1 joerg QualType StaticTy = C.getRecordType(StaticRD);
4698 1.1 joerg llvm::Type *LLVMReductionsBufferTy =
4699 1.1 joerg CGM.getTypes().ConvertTypeForMem(StaticTy);
4700 1.1 joerg // FIXME: nvlink does not handle weak linkage correctly (object with the
4701 1.1 joerg // different size are reported as erroneous).
4702 1.1 joerg // Restore CommonLinkage as soon as nvlink is fixed.
4703 1.1 joerg auto *GV = new llvm::GlobalVariable(
4704 1.1 joerg CGM.getModule(), LLVMReductionsBufferTy,
4705 1.1 joerg /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
4706 1.1 joerg llvm::Constant::getNullValue(LLVMReductionsBufferTy),
4707 1.1 joerg "_openmp_teams_reductions_buffer_$_");
4708 1.1 joerg KernelTeamsReductionPtr->setInitializer(
4709 1.1 joerg llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
4710 1.1 joerg CGM.VoidPtrTy));
4711 1.1 joerg }
4712 1.1 joerg CGOpenMPRuntime::clear();
4713 1.1 joerg }
4714