//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//
13
#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MathExtras.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;
29
30namespace {
31/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32class NVPTXActionTy final : public PrePostActionTy {
33 llvm::FunctionCallee EnterCallee = nullptr;
34 ArrayRef<llvm::Value *> EnterArgs;
35 llvm::FunctionCallee ExitCallee = nullptr;
36 ArrayRef<llvm::Value *> ExitArgs;
37 bool Conditional = false;
38 llvm::BasicBlock *ContBlock = nullptr;
39
40public:
41 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42 ArrayRef<llvm::Value *> EnterArgs,
43 llvm::FunctionCallee ExitCallee,
44 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46 ExitArgs(ExitArgs), Conditional(Conditional) {}
47 void Enter(CodeGenFunction &CGF) override {
48 llvm::Value *EnterRes = CGF.EmitRuntimeCall(callee: EnterCallee, args: EnterArgs);
49 if (Conditional) {
50 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(Arg: EnterRes);
51 auto *ThenBlock = CGF.createBasicBlock(name: "omp_if.then");
52 ContBlock = CGF.createBasicBlock(name: "omp_if.end");
53 // Generate the branch (If-stmt)
54 CGF.Builder.CreateCondBr(Cond: CallBool, True: ThenBlock, False: ContBlock);
55 CGF.EmitBlock(BB: ThenBlock);
56 }
57 }
58 void Done(CodeGenFunction &CGF) {
59 // Emit the rest of blocks/branches
60 CGF.EmitBranch(Block: ContBlock);
61 CGF.EmitBlock(BB: ContBlock, IsFinished: true);
62 }
63 void Exit(CodeGenFunction &CGF) override {
64 CGF.EmitRuntimeCall(callee: ExitCallee, args: ExitArgs);
65 }
66};
67
68/// A class to track the execution mode when codegening directives within
69/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70/// to the target region and used by containing directives such as 'parallel'
71/// to emit optimized code.
72class ExecutionRuntimeModesRAII {
73private:
74 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75 CGOpenMPRuntimeGPU::EM_Unknown;
76 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77
78public:
79 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
80 CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
81 : ExecMode(ExecMode) {
82 SavedExecMode = ExecMode;
83 ExecMode = EntryMode;
84 }
85 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86};
87
88static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
89 RefExpr = RefExpr->IgnoreParens();
90 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: RefExpr)) {
91 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
92 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base))
93 Base = TempASE->getBase()->IgnoreParenImpCasts();
94 RefExpr = Base;
95 } else if (auto *OASE = dyn_cast<ArraySectionExpr>(Val: RefExpr)) {
96 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
97 while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Val: Base))
98 Base = TempOASE->getBase()->IgnoreParenImpCasts();
99 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base))
100 Base = TempASE->getBase()->IgnoreParenImpCasts();
101 RefExpr = Base;
102 }
103 RefExpr = RefExpr->IgnoreParenImpCasts();
104 if (const auto *DE = dyn_cast<DeclRefExpr>(Val: RefExpr))
105 return cast<ValueDecl>(Val: DE->getDecl()->getCanonicalDecl());
106 const auto *ME = cast<MemberExpr>(Val: RefExpr);
107 return cast<ValueDecl>(Val: ME->getMemberDecl()->getCanonicalDecl());
108}
109
110static RecordDecl *buildRecordForGlobalizedVars(
111 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
112 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
113 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
114 &MappedDeclsFields,
115 int BufSize) {
116 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
117 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
118 return nullptr;
119 SmallVector<VarsDataTy, 4> GlobalizedVars;
120 for (const ValueDecl *D : EscapedDecls)
121 GlobalizedVars.emplace_back(Args: C.getDeclAlign(D), Args&: D);
122 for (const ValueDecl *D : EscapedDeclsForTeams)
123 GlobalizedVars.emplace_back(Args: C.getDeclAlign(D), Args&: D);
124
125 // Build struct _globalized_locals_ty {
126 // /* globalized vars */[WarSize] align (decl_align)
127 // /* globalized vars */ for EscapedDeclsForTeams
128 // };
129 RecordDecl *GlobalizedRD = C.buildImplicitRecord(Name: "_globalized_locals_ty");
130 GlobalizedRD->startDefinition();
131 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
132 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
133 for (const auto &Pair : GlobalizedVars) {
134 const ValueDecl *VD = Pair.second;
135 QualType Type = VD->getType();
136 if (Type->isLValueReferenceType())
137 Type = C.getPointerType(T: Type.getNonReferenceType());
138 else
139 Type = Type.getNonReferenceType();
140 SourceLocation Loc = VD->getLocation();
141 FieldDecl *Field;
142 if (SingleEscaped.count(Ptr: VD)) {
143 Field = FieldDecl::Create(
144 C, DC: GlobalizedRD, StartLoc: Loc, IdLoc: Loc, Id: VD->getIdentifier(), T: Type,
145 TInfo: C.getTrivialTypeSourceInfo(T: Type, Loc: SourceLocation()),
146 /*BW=*/nullptr, /*Mutable=*/false,
147 /*InitStyle=*/ICIS_NoInit);
148 Field->setAccess(AS_public);
149 if (VD->hasAttrs()) {
150 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
151 E(VD->getAttrs().end());
152 I != E; ++I)
153 Field->addAttr(A: *I);
154 }
155 } else {
156 if (BufSize > 1) {
157 llvm::APInt ArraySize(32, BufSize);
158 Type = C.getConstantArrayType(EltTy: Type, ArySize: ArraySize, SizeExpr: nullptr,
159 ASM: ArraySizeModifier::Normal, IndexTypeQuals: 0);
160 }
161 Field = FieldDecl::Create(
162 C, DC: GlobalizedRD, StartLoc: Loc, IdLoc: Loc, Id: VD->getIdentifier(), T: Type,
163 TInfo: C.getTrivialTypeSourceInfo(T: Type, Loc: SourceLocation()),
164 /*BW=*/nullptr, /*Mutable=*/false,
165 /*InitStyle=*/ICIS_NoInit);
166 Field->setAccess(AS_public);
167 llvm::APInt Align(32, Pair.first.getQuantity());
168 Field->addAttr(A: AlignedAttr::CreateImplicit(
169 Ctx&: C, /*IsAlignmentExpr=*/true,
170 Alignment: IntegerLiteral::Create(C, V: Align,
171 type: C.getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
172 l: SourceLocation()),
173 Range: {}, S: AlignedAttr::GNU_aligned));
174 }
175 GlobalizedRD->addDecl(D: Field);
176 MappedDeclsFields.try_emplace(Key: VD, Args&: Field);
177 }
178 GlobalizedRD->completeDefinition();
179 return GlobalizedRD;
180}
181
182/// Get the list of variables that can escape their declaration context.
183class CheckVarsEscapingDeclContext final
184 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185 CodeGenFunction &CGF;
186 llvm::SetVector<const ValueDecl *> EscapedDecls;
187 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
188 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
189 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
190 RecordDecl *GlobalizedRD = nullptr;
191 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
192 bool AllEscaped = false;
193 bool IsForCombinedParallelRegion = false;
194
195 void markAsEscaped(const ValueDecl *VD) {
196 // Do not globalize declare target variables.
197 if (!isa<VarDecl>(Val: VD) ||
198 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
199 return;
200 VD = cast<ValueDecl>(Val: VD->getCanonicalDecl());
201 // Use user-specified allocation.
202 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
203 return;
204 // Variables captured by value must be globalized.
205 bool IsCaptured = false;
206 if (auto *CSI = CGF.CapturedStmtInfo) {
207 if (const FieldDecl *FD = CSI->lookup(VD: cast<VarDecl>(Val: VD))) {
208 // Check if need to capture the variable that was already captured by
209 // value in the outer region.
210 IsCaptured = true;
211 if (!IsForCombinedParallelRegion) {
212 if (!FD->hasAttrs())
213 return;
214 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
215 if (!Attr)
216 return;
217 if (((Attr->getCaptureKind() != OMPC_map) &&
218 !isOpenMPPrivate(Kind: Attr->getCaptureKind())) ||
219 ((Attr->getCaptureKind() == OMPC_map) &&
220 !FD->getType()->isAnyPointerType()))
221 return;
222 }
223 if (!FD->getType()->isReferenceType()) {
224 assert(!VD->getType()->isVariablyModifiedType() &&
225 "Parameter captured by value with variably modified type");
226 EscapedParameters.insert(Ptr: VD);
227 } else if (!IsForCombinedParallelRegion) {
228 return;
229 }
230 }
231 }
232 if ((!CGF.CapturedStmtInfo ||
233 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
234 VD->getType()->isReferenceType())
235 // Do not globalize variables with reference type.
236 return;
237 if (VD->getType()->isVariablyModifiedType()) {
238 // If not captured at the target region level then mark the escaped
239 // variable as delayed.
240 if (IsCaptured)
241 EscapedVariableLengthDecls.insert(X: VD);
242 else
243 DelayedVariableLengthDecls.insert(X: VD);
244 } else
245 EscapedDecls.insert(X: VD);
246 }
247
248 void VisitValueDecl(const ValueDecl *VD) {
249 if (VD->getType()->isLValueReferenceType())
250 markAsEscaped(VD);
251 if (const auto *VarD = dyn_cast<VarDecl>(Val: VD)) {
252 if (!isa<ParmVarDecl>(Val: VarD) && VarD->hasInit()) {
253 const bool SavedAllEscaped = AllEscaped;
254 AllEscaped = VD->getType()->isLValueReferenceType();
255 Visit(S: VarD->getInit());
256 AllEscaped = SavedAllEscaped;
257 }
258 }
259 }
260 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
261 ArrayRef<OMPClause *> Clauses,
262 bool IsCombinedParallelRegion) {
263 if (!S)
264 return;
265 for (const CapturedStmt::Capture &C : S->captures()) {
266 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
267 const ValueDecl *VD = C.getCapturedVar();
268 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
269 if (IsCombinedParallelRegion) {
270 // Check if the variable is privatized in the combined construct and
271 // those private copies must be shared in the inner parallel
272 // directive.
273 IsForCombinedParallelRegion = false;
274 for (const OMPClause *C : Clauses) {
275 if (!isOpenMPPrivate(Kind: C->getClauseKind()) ||
276 C->getClauseKind() == OMPC_reduction ||
277 C->getClauseKind() == OMPC_linear ||
278 C->getClauseKind() == OMPC_private)
279 continue;
280 ArrayRef<const Expr *> Vars;
281 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(Val: C))
282 Vars = PC->getVarRefs();
283 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(Val: C))
284 Vars = PC->getVarRefs();
285 else
286 llvm_unreachable("Unexpected clause.");
287 for (const auto *E : Vars) {
288 const Decl *D =
289 cast<DeclRefExpr>(Val: E)->getDecl()->getCanonicalDecl();
290 if (D == VD->getCanonicalDecl()) {
291 IsForCombinedParallelRegion = true;
292 break;
293 }
294 }
295 if (IsForCombinedParallelRegion)
296 break;
297 }
298 }
299 markAsEscaped(VD);
300 if (isa<OMPCapturedExprDecl>(Val: VD))
301 VisitValueDecl(VD);
302 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
303 }
304 }
305 }
306
307 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
308 assert(!GlobalizedRD &&
309 "Record for globalized variables is built already.");
310 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
311 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
312 if (IsInTTDRegion)
313 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
314 else
315 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
316 GlobalizedRD = ::buildRecordForGlobalizedVars(
317 C&: CGF.getContext(), EscapedDecls: EscapedDeclsForParallel, EscapedDeclsForTeams,
318 MappedDeclsFields, BufSize: WarpSize);
319 }
320
321public:
322 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
323 ArrayRef<const ValueDecl *> TeamsReductions)
324 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
325 }
326 virtual ~CheckVarsEscapingDeclContext() = default;
327 void VisitDeclStmt(const DeclStmt *S) {
328 if (!S)
329 return;
330 for (const Decl *D : S->decls())
331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(Val: D))
332 VisitValueDecl(VD);
333 }
334 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335 if (!D)
336 return;
337 if (!D->hasAssociatedStmt())
338 return;
339 if (const auto *S =
340 dyn_cast_or_null<CapturedStmt>(Val: D->getAssociatedStmt())) {
341 // Do not analyze directives that do not actually require capturing,
342 // like `omp for` or `omp simd` directives.
343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
344 getOpenMPCaptureRegions(CaptureRegions, DKind: D->getDirectiveKind());
345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346 VisitStmt(S: S->getCapturedStmt());
347 return;
348 }
349 VisitOpenMPCapturedStmt(
350 S, Clauses: D->clauses(),
351 IsCombinedParallelRegion: CaptureRegions.back() == OMPD_parallel &&
352 isOpenMPDistributeDirective(DKind: D->getDirectiveKind()));
353 }
354 }
355 void VisitCapturedStmt(const CapturedStmt *S) {
356 if (!S)
357 return;
358 for (const CapturedStmt::Capture &C : S->captures()) {
359 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360 const ValueDecl *VD = C.getCapturedVar();
361 markAsEscaped(VD);
362 if (isa<OMPCapturedExprDecl>(Val: VD))
363 VisitValueDecl(VD);
364 }
365 }
366 }
367 void VisitLambdaExpr(const LambdaExpr *E) {
368 if (!E)
369 return;
370 for (const LambdaCapture &C : E->captures()) {
371 if (C.capturesVariable()) {
372 if (C.getCaptureKind() == LCK_ByRef) {
373 const ValueDecl *VD = C.getCapturedVar();
374 markAsEscaped(VD);
375 if (E->isInitCapture(Capture: &C) || isa<OMPCapturedExprDecl>(Val: VD))
376 VisitValueDecl(VD);
377 }
378 }
379 }
380 }
381 void VisitBlockExpr(const BlockExpr *E) {
382 if (!E)
383 return;
384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385 if (C.isByRef()) {
386 const VarDecl *VD = C.getVariable();
387 markAsEscaped(VD);
388 if (isa<OMPCapturedExprDecl>(Val: VD) || VD->isInitCapture())
389 VisitValueDecl(VD);
390 }
391 }
392 }
393 void VisitCallExpr(const CallExpr *E) {
394 if (!E)
395 return;
396 for (const Expr *Arg : E->arguments()) {
397 if (!Arg)
398 continue;
399 if (Arg->isLValue()) {
400 const bool SavedAllEscaped = AllEscaped;
401 AllEscaped = true;
402 Visit(S: Arg);
403 AllEscaped = SavedAllEscaped;
404 } else {
405 Visit(S: Arg);
406 }
407 }
408 Visit(S: E->getCallee());
409 }
410 void VisitDeclRefExpr(const DeclRefExpr *E) {
411 if (!E)
412 return;
413 const ValueDecl *VD = E->getDecl();
414 if (AllEscaped)
415 markAsEscaped(VD);
416 if (isa<OMPCapturedExprDecl>(Val: VD))
417 VisitValueDecl(VD);
418 else if (VD->isInitCapture())
419 VisitValueDecl(VD);
420 }
421 void VisitUnaryOperator(const UnaryOperator *E) {
422 if (!E)
423 return;
424 if (E->getOpcode() == UO_AddrOf) {
425 const bool SavedAllEscaped = AllEscaped;
426 AllEscaped = true;
427 Visit(S: E->getSubExpr());
428 AllEscaped = SavedAllEscaped;
429 } else {
430 Visit(S: E->getSubExpr());
431 }
432 }
433 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434 if (!E)
435 return;
436 if (E->getCastKind() == CK_ArrayToPointerDecay) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(S: E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(S: E->getSubExpr());
443 }
444 }
445 void VisitExpr(const Expr *E) {
446 if (!E)
447 return;
448 bool SavedAllEscaped = AllEscaped;
449 if (!E->isLValue())
450 AllEscaped = false;
451 for (const Stmt *Child : E->children())
452 if (Child)
453 Visit(S: Child);
454 AllEscaped = SavedAllEscaped;
455 }
456 void VisitStmt(const Stmt *S) {
457 if (!S)
458 return;
459 for (const Stmt *Child : S->children())
460 if (Child)
461 Visit(S: Child);
462 }
463
464 /// Returns the record that handles all the escaped local variables and used
465 /// instead of their original storage.
466 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467 if (!GlobalizedRD)
468 buildRecordForGlobalizedVars(IsInTTDRegion);
469 return GlobalizedRD;
470 }
471
472 /// Returns the field in the globalized record for the escaped variable.
473 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474 assert(GlobalizedRD &&
475 "Record for globalized variables must be generated already.");
476 return MappedDeclsFields.lookup(Val: VD);
477 }
478
479 /// Returns the list of the escaped local variables/parameters.
480 ArrayRef<const ValueDecl *> getEscapedDecls() const {
481 return EscapedDecls.getArrayRef();
482 }
483
484 /// Checks if the escaped local variable is actually a parameter passed by
485 /// value.
486 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487 return EscapedParameters;
488 }
489
490 /// Returns the list of the escaped variables with the variably modified
491 /// types.
492 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493 return EscapedVariableLengthDecls.getArrayRef();
494 }
495
496 /// Returns the list of the delayed variables with the variably modified
497 /// types.
498 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499 return DelayedVariableLengthDecls.getArrayRef();
500 }
501};
502} // anonymous namespace
503
504CGOpenMPRuntimeGPU::ExecutionMode
505CGOpenMPRuntimeGPU::getExecutionMode() const {
506 return CurrentExecutionMode;
507}
508
509CGOpenMPRuntimeGPU::DataSharingMode
510CGOpenMPRuntimeGPU::getDataSharingMode() const {
511 return CurrentDataSharingMode;
512}
513
514/// Check for inner (nested) SPMD construct, if any
515static bool hasNestedSPMDDirective(ASTContext &Ctx,
516 const OMPExecutableDirective &D) {
517 const auto *CS = D.getInnermostCapturedStmt();
518 const auto *Body =
519 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
520 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
521
522 if (const auto *NestedDir =
523 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
524 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
525 switch (D.getDirectiveKind()) {
526 case OMPD_target:
527 if (isOpenMPParallelDirective(DKind))
528 return true;
529 if (DKind == OMPD_teams) {
530 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
531 /*IgnoreCaptured=*/true);
532 if (!Body)
533 return false;
534 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
535 if (const auto *NND =
536 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
537 DKind = NND->getDirectiveKind();
538 if (isOpenMPParallelDirective(DKind))
539 return true;
540 }
541 }
542 return false;
543 case OMPD_target_teams:
544 return isOpenMPParallelDirective(DKind);
545 case OMPD_target_simd:
546 case OMPD_target_parallel:
547 case OMPD_target_parallel_for:
548 case OMPD_target_parallel_for_simd:
549 case OMPD_target_teams_distribute:
550 case OMPD_target_teams_distribute_simd:
551 case OMPD_target_teams_distribute_parallel_for:
552 case OMPD_target_teams_distribute_parallel_for_simd:
553 case OMPD_parallel:
554 case OMPD_for:
555 case OMPD_parallel_for:
556 case OMPD_parallel_master:
557 case OMPD_parallel_sections:
558 case OMPD_for_simd:
559 case OMPD_parallel_for_simd:
560 case OMPD_cancel:
561 case OMPD_cancellation_point:
562 case OMPD_ordered:
563 case OMPD_threadprivate:
564 case OMPD_allocate:
565 case OMPD_task:
566 case OMPD_simd:
567 case OMPD_sections:
568 case OMPD_section:
569 case OMPD_single:
570 case OMPD_master:
571 case OMPD_critical:
572 case OMPD_taskyield:
573 case OMPD_barrier:
574 case OMPD_taskwait:
575 case OMPD_taskgroup:
576 case OMPD_atomic:
577 case OMPD_flush:
578 case OMPD_depobj:
579 case OMPD_scan:
580 case OMPD_teams:
581 case OMPD_target_data:
582 case OMPD_target_exit_data:
583 case OMPD_target_enter_data:
584 case OMPD_distribute:
585 case OMPD_distribute_simd:
586 case OMPD_distribute_parallel_for:
587 case OMPD_distribute_parallel_for_simd:
588 case OMPD_teams_distribute:
589 case OMPD_teams_distribute_simd:
590 case OMPD_teams_distribute_parallel_for:
591 case OMPD_teams_distribute_parallel_for_simd:
592 case OMPD_target_update:
593 case OMPD_declare_simd:
594 case OMPD_declare_variant:
595 case OMPD_begin_declare_variant:
596 case OMPD_end_declare_variant:
597 case OMPD_declare_target:
598 case OMPD_end_declare_target:
599 case OMPD_declare_reduction:
600 case OMPD_declare_mapper:
601 case OMPD_taskloop:
602 case OMPD_taskloop_simd:
603 case OMPD_master_taskloop:
604 case OMPD_master_taskloop_simd:
605 case OMPD_parallel_master_taskloop:
606 case OMPD_parallel_master_taskloop_simd:
607 case OMPD_requires:
608 case OMPD_unknown:
609 default:
610 llvm_unreachable("Unexpected directive.");
611 }
612 }
613
614 return false;
615}
616
617static bool supportsSPMDExecutionMode(ASTContext &Ctx,
618 const OMPExecutableDirective &D) {
619 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
620 switch (DirectiveKind) {
621 case OMPD_target:
622 case OMPD_target_teams:
623 return hasNestedSPMDDirective(Ctx, D);
624 case OMPD_target_parallel_loop:
625 case OMPD_target_parallel:
626 case OMPD_target_parallel_for:
627 case OMPD_target_parallel_for_simd:
628 case OMPD_target_teams_distribute_parallel_for:
629 case OMPD_target_teams_distribute_parallel_for_simd:
630 case OMPD_target_simd:
631 case OMPD_target_teams_distribute_simd:
632 return true;
633 case OMPD_target_teams_distribute:
634 return false;
635 case OMPD_target_teams_loop:
636 // Whether this is true or not depends on how the directive will
637 // eventually be emitted.
638 if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(Val: &D))
639 return TTLD->canBeParallelFor();
640 return false;
641 case OMPD_parallel:
642 case OMPD_for:
643 case OMPD_parallel_for:
644 case OMPD_parallel_master:
645 case OMPD_parallel_sections:
646 case OMPD_for_simd:
647 case OMPD_parallel_for_simd:
648 case OMPD_cancel:
649 case OMPD_cancellation_point:
650 case OMPD_ordered:
651 case OMPD_threadprivate:
652 case OMPD_allocate:
653 case OMPD_task:
654 case OMPD_simd:
655 case OMPD_sections:
656 case OMPD_section:
657 case OMPD_single:
658 case OMPD_master:
659 case OMPD_critical:
660 case OMPD_taskyield:
661 case OMPD_barrier:
662 case OMPD_taskwait:
663 case OMPD_taskgroup:
664 case OMPD_atomic:
665 case OMPD_flush:
666 case OMPD_depobj:
667 case OMPD_scan:
668 case OMPD_teams:
669 case OMPD_target_data:
670 case OMPD_target_exit_data:
671 case OMPD_target_enter_data:
672 case OMPD_distribute:
673 case OMPD_distribute_simd:
674 case OMPD_distribute_parallel_for:
675 case OMPD_distribute_parallel_for_simd:
676 case OMPD_teams_distribute:
677 case OMPD_teams_distribute_simd:
678 case OMPD_teams_distribute_parallel_for:
679 case OMPD_teams_distribute_parallel_for_simd:
680 case OMPD_target_update:
681 case OMPD_declare_simd:
682 case OMPD_declare_variant:
683 case OMPD_begin_declare_variant:
684 case OMPD_end_declare_variant:
685 case OMPD_declare_target:
686 case OMPD_end_declare_target:
687 case OMPD_declare_reduction:
688 case OMPD_declare_mapper:
689 case OMPD_taskloop:
690 case OMPD_taskloop_simd:
691 case OMPD_master_taskloop:
692 case OMPD_master_taskloop_simd:
693 case OMPD_parallel_master_taskloop:
694 case OMPD_parallel_master_taskloop_simd:
695 case OMPD_requires:
696 case OMPD_unknown:
697 default:
698 break;
699 }
700 llvm_unreachable(
701 "Unknown programming model for OpenMP directive on NVPTX target.");
702}
703
704void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
705 StringRef ParentName,
706 llvm::Function *&OutlinedFn,
707 llvm::Constant *&OutlinedFnID,
708 bool IsOffloadEntry,
709 const RegionCodeGenTy &CodeGen) {
710 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
711 EntryFunctionState EST;
712 WrapperFunctionsMap.clear();
713
714 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
715 assert(!IsBareKernel && "bare kernel should not be at generic mode");
716
717 // Emit target region as a standalone region.
718 class NVPTXPrePostActionTy : public PrePostActionTy {
719 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
720 const OMPExecutableDirective &D;
721
722 public:
723 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
724 const OMPExecutableDirective &D)
725 : EST(EST), D(D) {}
726 void Enter(CodeGenFunction &CGF) override {
727 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
728 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
729 // Skip target region initialization.
730 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
731 }
732 void Exit(CodeGenFunction &CGF) override {
733 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
734 RT.clearLocThreadIdInsertPt(CGF);
735 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
736 }
737 } Action(EST, D);
738 CodeGen.setAction(Action);
739 IsInTTDRegion = true;
740 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
741 IsOffloadEntry, CodeGen);
742 IsInTTDRegion = false;
743}
744
745void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
746 CodeGenFunction &CGF,
747 EntryFunctionState &EST, bool IsSPMD) {
748 int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
749 MaxTeamsVal = -1;
750 computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
751 MinTeamsVal, MaxTeamsVal);
752
753 CGBuilderTy &Bld = CGF.Builder;
754 Bld.restoreIP(IP: OMPBuilder.createTargetInit(
755 Loc: Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
756 if (!IsSPMD)
757 emitGenericVarsProlog(CGF, Loc: EST.Loc);
758}
759
760void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
761 EntryFunctionState &EST,
762 bool IsSPMD) {
763 if (!IsSPMD)
764 emitGenericVarsEpilog(CGF);
765
766 // This is temporary until we remove the fixed sized buffer.
767 ASTContext &C = CGM.getContext();
768 RecordDecl *StaticRD = C.buildImplicitRecord(
769 Name: "_openmp_teams_reduction_type_$_", TK: RecordDecl::TagKind::Union);
770 StaticRD->startDefinition();
771 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
772 QualType RecTy = C.getRecordType(Decl: TeamReductionRec);
773 auto *Field = FieldDecl::Create(
774 C, DC: StaticRD, StartLoc: SourceLocation(), IdLoc: SourceLocation(), Id: nullptr, T: RecTy,
775 TInfo: C.getTrivialTypeSourceInfo(T: RecTy, Loc: SourceLocation()),
776 /*BW=*/nullptr, /*Mutable=*/false,
777 /*InitStyle=*/ICIS_NoInit);
778 Field->setAccess(AS_public);
779 StaticRD->addDecl(D: Field);
780 }
781 StaticRD->completeDefinition();
782 QualType StaticTy = C.getRecordType(Decl: StaticRD);
783 llvm::Type *LLVMReductionsBufferTy =
784 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
785 const auto &DL = CGM.getModule().getDataLayout();
786 uint64_t ReductionDataSize =
787 TeamsReductions.empty()
788 ? 0
789 : DL.getTypeAllocSize(Ty: LLVMReductionsBufferTy).getFixedValue();
790 CGBuilderTy &Bld = CGF.Builder;
791 OMPBuilder.createTargetDeinit(Loc: Bld, TeamsReductionDataSize: ReductionDataSize,
792 TeamsReductionBufferLength: C.getLangOpts().OpenMPCUDAReductionBufNum);
793 TeamsReductions.clear();
794}
795
796void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
797 StringRef ParentName,
798 llvm::Function *&OutlinedFn,
799 llvm::Constant *&OutlinedFnID,
800 bool IsOffloadEntry,
801 const RegionCodeGenTy &CodeGen) {
802 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
803 EntryFunctionState EST;
804
805 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
806
807 // Emit target region as a standalone region.
808 class NVPTXPrePostActionTy : public PrePostActionTy {
809 CGOpenMPRuntimeGPU &RT;
810 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
811 bool IsBareKernel;
812 DataSharingMode Mode;
813 const OMPExecutableDirective &D;
814
815 public:
816 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
817 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
818 bool IsBareKernel, const OMPExecutableDirective &D)
819 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
820 Mode(RT.CurrentDataSharingMode), D(D) {}
821 void Enter(CodeGenFunction &CGF) override {
822 if (IsBareKernel) {
823 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
824 return;
825 }
826 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
827 // Skip target region initialization.
828 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
829 }
830 void Exit(CodeGenFunction &CGF) override {
831 if (IsBareKernel) {
832 RT.CurrentDataSharingMode = Mode;
833 return;
834 }
835 RT.clearLocThreadIdInsertPt(CGF);
836 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
837 }
838 } Action(*this, EST, IsBareKernel, D);
839 CodeGen.setAction(Action);
840 IsInTTDRegion = true;
841 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
842 IsOffloadEntry, CodeGen);
843 IsInTTDRegion = false;
844}
845
846void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
847 const OMPExecutableDirective &D, StringRef ParentName,
848 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
849 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
850 if (!IsOffloadEntry) // Nothing to do.
851 return;
852
853 assert(!ParentName.empty() && "Invalid target region parent name!");
854
855 bool Mode = supportsSPMDExecutionMode(Ctx&: CGM.getContext(), D);
856 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
857 if (Mode || IsBareKernel)
858 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
859 CodeGen);
860 else
861 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
862 CodeGen);
863}
864
865CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
866 : CGOpenMPRuntime(CGM) {
867 llvm::OpenMPIRBuilderConfig Config(
868 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
869 CGM.getLangOpts().OpenMPOffloadMandatory,
870 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
871 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
872 OMPBuilder.setConfig(Config);
873
874 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
875 llvm_unreachable("OpenMP can only handle device code.");
876
877 if (CGM.getLangOpts().OpenMPCUDAMode)
878 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
879
880 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
881 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
882 return;
883
884 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTargetDebug,
885 Name: "__omp_rtl_debug_kind");
886 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTeamSubscription,
887 Name: "__omp_rtl_assume_teams_oversubscription");
888 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPThreadSubscription,
889 Name: "__omp_rtl_assume_threads_oversubscription");
890 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoThreadState,
891 Name: "__omp_rtl_assume_no_thread_state");
892 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoNestedParallelism,
893 Name: "__omp_rtl_assume_no_nested_parallelism");
894}
895
void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
                                            ProcBindKind ProcBind,
                                            SourceLocation Loc) {
  // Nothing to do: no code is emitted for proc_bind on the GPU device.
}
901
void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
                                              llvm::Value *NumThreads,
                                              SourceLocation Loc) {
  // Nothing to do here: on the GPU the num_threads value is passed straight
  // into the parallel region emission (see emitParallelCall's NumThreads
  // parameter) instead of a separate runtime call.
}
907
void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                            const Expr *NumTeams,
                                            const Expr *ThreadLimit,
                                            SourceLocation Loc) {
  // Nothing to do: no code is emitted for num_teams/thread_limit here.
}
912
913llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
914 CodeGenFunction &CGF, const OMPExecutableDirective &D,
915 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
916 const RegionCodeGenTy &CodeGen) {
917 // Emit target region as a standalone region.
918 bool PrevIsInTTDRegion = IsInTTDRegion;
919 IsInTTDRegion = false;
920 auto *OutlinedFun =
921 cast<llvm::Function>(Val: CGOpenMPRuntime::emitParallelOutlinedFunction(
922 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
923 IsInTTDRegion = PrevIsInTTDRegion;
924 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
925 llvm::Function *WrapperFun =
926 createParallelDataSharingWrapper(OutlinedParallelFn: OutlinedFun, D);
927 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
928 }
929
930 return OutlinedFun;
931}
932
933/// Get list of lastprivate variables from the teams distribute ... or
934/// teams {distribute ...} directives.
935static void
936getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
937 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
938 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
939 "expected teams directive.");
940 const OMPExecutableDirective *Dir = &D;
941 if (!isOpenMPDistributeDirective(DKind: D.getDirectiveKind())) {
942 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
943 Ctx,
944 Body: D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
945 /*IgnoreCaptured=*/true))) {
946 Dir = dyn_cast_or_null<OMPExecutableDirective>(Val: S);
947 if (Dir && !isOpenMPDistributeDirective(DKind: Dir->getDirectiveKind()))
948 Dir = nullptr;
949 }
950 }
951 if (!Dir)
952 return;
953 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
954 for (const Expr *E : C->getVarRefs())
955 Vars.push_back(Elt: getPrivateItem(RefExpr: E));
956 }
957}
958
959/// Get list of reduction variables from the teams ... directives.
960static void
961getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
962 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
963 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
964 "expected teams directive.");
965 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
966 for (const Expr *E : C->privates())
967 Vars.push_back(Elt: getPrivateItem(RefExpr: E));
968 }
969}
970
/// Outline a 'teams' region for the GPU.
///
/// In SPMD mode, lastprivate variables of the enclosed distribute construct
/// are gathered and a record type is built for their globalization. In
/// generic mode, the teams reduction variables are stashed in
/// TeamAndReductions for later processing. A pre/post action wraps the
/// outlined body with the generic-mode variable prolog/epilog.
llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    CodeGenFunction &CGF, const OMPExecutableDirective &D,
    const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
    const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
  // Globalize team reductions variable unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    getDistributeLastprivateVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      // Build a record with one field per globalized variable; the mapping
      // from each variable to its field is recorded in MappedDeclsFields.
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          C&: CGM.getContext(), EscapedDecls: std::nullopt, EscapedDeclsForTeams: LastPrivatesReductions,
          MappedDeclsFields, BufSize: WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    // Generic mode: remember the captured teams declaration and its reduction
    // variables for later globalization.
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(RegionKind: OMPD_teams)->getCapturedDecl();
    std::swap(LHS&: TeamAndReductions.second, RHS&: LastPrivatesReductions);
  }

  // Emit target region as a standalone region.
  // The Enter hook registers the globalized variables (if any) and emits the
  // generic-mode prolog; the Exit hook emits the matching epilog.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        // Seed the per-function globalized-variable table with an entry for
        // every mapped declaration.
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(KV: std::make_pair(x: Pair.getFirst(), y: MappedVarData()));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      CGF, D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}
1039
/// Emit the generic-mode variable prolog: for every declaration recorded in
/// FunctionGlobalizedDecls for the current function, allocate device-shared
/// storage through __kmpc_alloc_shared and redirect the variable to it.
/// Escaped parameters have their original value copied into the new storage.
void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc) {
  // Globalization only applies in generic data-sharing mode.
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;

  for (auto &Rec : I->getSecond().LocalVarData) {
    const auto *VD = cast<VarDecl>(Val: Rec.first);
    bool EscapedParam = I->getSecond().EscapedParameters.count(Ptr: Rec.first);
    QualType VarTy = VD->getType();

    // Get the local allocation of a firstprivate variable before sharing
    llvm::Value *ParValue;
    if (EscapedParam) {
      LValue ParLVal =
          CGF.MakeAddrLValue(Addr: CGF.GetAddrOfLocalVar(VD), T: VD->getType());
      ParValue = CGF.EmitLoadOfScalar(lvalue: ParLVal, Loc);
    }

    // Allocate space for the variable to be globalized
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(Ty: VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
                                M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared),
                            args: AllocArgs, name: VD->getName());
    // FIXME: We should use the variables actual alignment as an argument.
    VoidPtr->addRetAttr(Attr: llvm::Attribute::get(
        Context&: CGM.getLLVMContext(), Kind: llvm::Attribute::Alignment,
        Val: CGM.getContext().getTargetInfo().getNewAlign() / 8));

    // Cast the void pointer and get the address of the globalized variable.
    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(T: VarTy)->getPointerTo();
    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        V: VoidPtr, DestTy: VarPtrTy, Name: VD->getName() + "_on_stack");
    LValue VarAddr =
        CGF.MakeNaturalAlignPointeeRawAddrLValue(V: CastedVoidPtr, T: VarTy);
    // Record both addresses: the typed one for subsequent uses of the
    // variable and the raw one for the matching __kmpc_free_shared.
    Rec.second.PrivateAddr = VarAddr.getAddress();
    Rec.second.GlobalizedVal = VoidPtr;

    // Assign the local allocation to the newly globalized location.
    if (EscapedParam) {
      CGF.EmitStoreOfScalar(value: ParValue, lvalue: VarAddr);
      I->getSecond().MappedParams->setVarAddr(CGF, LocalVD: VD, TempAddr: VarAddr.getAddress());
    }
    if (auto *DI = CGF.getDebugInfo())
      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(Loc: VD->getLocation()));
  }

  // Globalize escaped variable-length declarations; their size is only known
  // at runtime, so the allocation goes through getKmpcAllocShared.
  for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
    const auto *VD = cast<VarDecl>(Val: ValueD);
    std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
        getKmpcAllocShared(CGF, VD);
    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(Args&: AddrSizePair);
    LValue Base = CGF.MakeAddrLValue(V: AddrSizePair.first, T: VD->getType(),
                                     Alignment: CGM.getContext().getDeclAlign(D: VD),
                                     Source: AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, LocalVD: VD, TempAddr: Base.getAddress());
  }
  // Activate all the address remappings recorded above.
  I->getSecond().MappedParams->apply(CGF);
}
1105
1106bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
1107 const VarDecl *VD) const {
1108 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1109 if (I == FunctionGlobalizedDecls.end())
1110 return false;
1111
1112 // Check variable declaration is delayed:
1113 return llvm::is_contained(Range: I->getSecond().DelayedVariableLengthDecls, Element: VD);
1114}
1115
1116std::pair<llvm::Value *, llvm::Value *>
1117CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
1118 const VarDecl *VD) {
1119 CGBuilderTy &Bld = CGF.Builder;
1120
1121 // Compute size and alignment.
1122 llvm::Value *Size = CGF.getTypeSize(Ty: VD->getType());
1123 CharUnits Align = CGM.getContext().getDeclAlign(D: VD);
1124 Size = Bld.CreateNUWAdd(
1125 LHS: Size, RHS: llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity() - 1));
1126 llvm::Value *AlignVal =
1127 llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity());
1128 Size = Bld.CreateUDiv(LHS: Size, RHS: AlignVal);
1129 Size = Bld.CreateNUWMul(LHS: Size, RHS: AlignVal);
1130
1131 // Allocate space for this VLA object to be globalized.
1132 llvm::Value *AllocArgs[] = {Size};
1133 llvm::CallBase *VoidPtr =
1134 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1135 M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared),
1136 args: AllocArgs, name: VD->getName());
1137 VoidPtr->addRetAttr(Attr: llvm::Attribute::get(
1138 Context&: CGM.getLLVMContext(), Kind: llvm::Attribute::Alignment, Val: Align.getQuantity()));
1139
1140 return std::make_pair(x&: VoidPtr, y&: Size);
1141}
1142
1143void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1144 CodeGenFunction &CGF,
1145 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1146 // Deallocate the memory for each globalized VLA object
1147 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1148 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1149 args: {AddrSizePair.first, AddrSizePair.second});
1150}
1151
1152void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1153 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1154 return;
1155
1156 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1157 if (I != FunctionGlobalizedDecls.end()) {
1158 // Deallocate the memory for each globalized VLA object that was
1159 // globalized in the prolog (i.e. emitGenericVarsProlog).
1160 for (const auto &AddrSizePair :
1161 llvm::reverse(C&: I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1162 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1163 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1164 args: {AddrSizePair.first, AddrSizePair.second});
1165 }
1166 // Deallocate the memory for each globalized value
1167 for (auto &Rec : llvm::reverse(C&: I->getSecond().LocalVarData)) {
1168 const auto *VD = cast<VarDecl>(Val: Rec.first);
1169 I->getSecond().MappedParams->restore(CGF);
1170
1171 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1172 CGF.getTypeSize(Ty: VD->getType())};
1173 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1174 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1175 args: FreeArgs);
1176 }
1177 }
1178}
1179
1180void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1181 const OMPExecutableDirective &D,
1182 SourceLocation Loc,
1183 llvm::Function *OutlinedFn,
1184 ArrayRef<llvm::Value *> CapturedVars) {
1185 if (!CGF.HaveInsertPoint())
1186 return;
1187
1188 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1189
1190 RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
1191 /*Name=*/".zero.addr");
1192 CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
1193 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1194 // We don't emit any thread id function call in bare kernel, but because the
1195 // outlined function has a pointer argument, we emit a nullptr here.
1196 if (IsBareKernel)
1197 OutlinedFnArgs.push_back(Elt: llvm::ConstantPointerNull::get(T: CGM.VoidPtrTy));
1198 else
1199 OutlinedFnArgs.push_back(Elt: emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1200 OutlinedFnArgs.push_back(Elt: ZeroAddr.getPointer());
1201 OutlinedFnArgs.append(in_start: CapturedVars.begin(), in_end: CapturedVars.end());
1202 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: OutlinedFnArgs);
1203}
1204
1205void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1206 SourceLocation Loc,
1207 llvm::Function *OutlinedFn,
1208 ArrayRef<llvm::Value *> CapturedVars,
1209 const Expr *IfCond,
1210 llvm::Value *NumThreads) {
1211 if (!CGF.HaveInsertPoint())
1212 return;
1213
1214 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1215 NumThreads](CodeGenFunction &CGF,
1216 PrePostActionTy &Action) {
1217 CGBuilderTy &Bld = CGF.Builder;
1218 llvm::Value *NumThreadsVal = NumThreads;
1219 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1220 llvm::Value *ID = llvm::ConstantPointerNull::get(T: CGM.Int8PtrTy);
1221 if (WFn)
1222 ID = Bld.CreateBitOrPointerCast(V: WFn, DestTy: CGM.Int8PtrTy);
1223 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(V: OutlinedFn, DestTy: CGM.Int8PtrTy);
1224
1225 // Create a private scope that will globalize the arguments
1226 // passed from the outside of the target region.
1227 // TODO: Is that needed?
1228 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1229
1230 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1231 Ty: llvm::ArrayType::get(ElementType: CGM.VoidPtrTy, NumElements: CapturedVars.size()),
1232 Name: "captured_vars_addrs");
1233 // There's something to share.
1234 if (!CapturedVars.empty()) {
1235 // Prepare for parallel region. Indicate the outlined function.
1236 ASTContext &Ctx = CGF.getContext();
1237 unsigned Idx = 0;
1238 for (llvm::Value *V : CapturedVars) {
1239 Address Dst = Bld.CreateConstArrayGEP(Addr: CapturedVarsAddrs, Index: Idx);
1240 llvm::Value *PtrV;
1241 if (V->getType()->isIntegerTy())
1242 PtrV = Bld.CreateIntToPtr(V, DestTy: CGF.VoidPtrTy);
1243 else
1244 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, DestTy: CGF.VoidPtrTy);
1245 CGF.EmitStoreOfScalar(Value: PtrV, Addr: Dst, /*Volatile=*/false,
1246 Ty: Ctx.getPointerType(T: Ctx.VoidPtrTy));
1247 ++Idx;
1248 }
1249 }
1250
1251 llvm::Value *IfCondVal = nullptr;
1252 if (IfCond)
1253 IfCondVal = Bld.CreateIntCast(V: CGF.EvaluateExprAsBool(E: IfCond), DestTy: CGF.Int32Ty,
1254 /* isSigned */ false);
1255 else
1256 IfCondVal = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: 1);
1257
1258 if (!NumThreadsVal)
1259 NumThreadsVal = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: -1);
1260 else
1261 NumThreadsVal = Bld.CreateZExtOrTrunc(V: NumThreadsVal, DestTy: CGF.Int32Ty),
1262
1263 assert(IfCondVal && "Expected a value");
1264 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1265 llvm::Value *Args[] = {
1266 RTLoc,
1267 getThreadID(CGF, Loc),
1268 IfCondVal,
1269 NumThreadsVal,
1270 llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: -1),
1271 FnPtr,
1272 ID,
1273 Bld.CreateBitOrPointerCast(V: CapturedVarsAddrs.emitRawPointer(CGF),
1274 DestTy: CGF.VoidPtrPtrTy),
1275 llvm::ConstantInt::get(Ty: CGM.SizeTy, V: CapturedVars.size())};
1276 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1277 M&: CGM.getModule(), FnID: OMPRTL___kmpc_parallel_51),
1278 args: Args);
1279 };
1280
1281 RegionCodeGenTy RCG(ParallelGen);
1282 RCG(CGF);
1283}
1284
1285void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1286 // Always emit simple barriers!
1287 if (!CGF.HaveInsertPoint())
1288 return;
1289 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1290 // This function does not use parameters, so we can emit just default values.
1291 llvm::Value *Args[] = {
1292 llvm::ConstantPointerNull::get(
1293 T: cast<llvm::PointerType>(Val: getIdentTyPointerTy())),
1294 llvm::ConstantInt::get(Ty: CGF.Int32Ty, /*V=*/0, /*isSigned=*/IsSigned: true)};
1295 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1296 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier_simple_spmd),
1297 args: Args);
1298}
1299
1300void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1301 SourceLocation Loc,
1302 OpenMPDirectiveKind Kind, bool,
1303 bool) {
1304 // Always emit simple barriers!
1305 if (!CGF.HaveInsertPoint())
1306 return;
1307 // Build call __kmpc_cancel_barrier(loc, thread_id);
1308 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1309 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1310 getThreadID(CGF, Loc)};
1311
1312 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1313 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier),
1314 args: Args);
1315}
1316
/// Emit a critical region on the GPU by serializing the threads of the team:
/// every thread iterates a shared-shape counter loop from 0 to the team
/// width, and only the thread whose id equals the current counter value
/// executes the region body on that iteration; the rest proceed straight to
/// the synchronization point.
void CGOpenMPRuntimeGPU::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock(name: "omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock(name: "omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock(name: "omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock(name: "omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: "omp.critical.exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Get the mask of active threads in the warp.
  llvm::Value *Mask = CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
      M&: CGM.getModule(), FnID: OMPRTL___kmpc_warp_active_thread_mask));
  // Fetch team-local id of the thread.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // Get the width of the team.
  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Initialize the counter variable for the loop.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(T: Int32Ty, Name: "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Addr: Counter, T: Int32Ty);
  CGF.EmitStoreOfScalar(value: llvm::Constant::getNullValue(Ty: CGM.Int32Ty), lvalue: CounterLVal,
                        /*isInit=*/true);

  // Block checks if loop counter exceeds upper bound.
  CGF.EmitBlock(BB: LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(LHS: CounterVal, RHS: TeamWidth);
  CGF.Builder.CreateCondBr(Cond: CmpLoopBound, True: TestBB, False: ExitBB);

  // Block tests which single thread should execute region, and which threads
  // should go straight to synchronisation point.
  CGF.EmitBlock(BB: TestBB);
  CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(LHS: ThreadID, RHS: CounterVal);
  CGF.Builder.CreateCondBr(Cond: CmpThreadToCounter, True: BodyBB, False: SyncBB);

  // Block emits the body of the critical region.
  CGF.EmitBlock(BB: BodyBB);

  // Output the critical statement.
  // The base-class implementation emits the actual named critical region.
  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
                                      Hint);

  // After the body surrounded by the critical region, the single executing
  // thread will jump to the synchronisation point.
  // Block waits for all threads in current team to finish then increments the
  // counter variable and returns to the loop.
  CGF.EmitBlock(BB: SyncBB);
  // Reconverge active threads in the warp.
  (void)CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
                                M&: CGM.getModule(), FnID: OMPRTL___kmpc_syncwarp),
                            args: Mask);

  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(LHS: CounterVal, RHS: CGF.Builder.getInt32(C: 1));
  CGF.EmitStoreOfScalar(value: IncCounterVal, lvalue: CounterLVal);
  CGF.EmitBranch(Block: LoopBB);

  // Block that is reached when all threads in the team complete the region.
  CGF.EmitBlock(BB: ExitBB, /*IsFinished=*/true);
}
1385
1386/// Cast value to the specified type.
1387static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1388 QualType ValTy, QualType CastTy,
1389 SourceLocation Loc) {
1390 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1391 "Cast type must sized.");
1392 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1393 "Val type must sized.");
1394 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(T: CastTy);
1395 if (ValTy == CastTy)
1396 return Val;
1397 if (CGF.getContext().getTypeSizeInChars(T: ValTy) ==
1398 CGF.getContext().getTypeSizeInChars(T: CastTy))
1399 return CGF.Builder.CreateBitCast(V: Val, DestTy: LLVMCastTy);
1400 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1401 return CGF.Builder.CreateIntCast(V: Val, DestTy: LLVMCastTy,
1402 isSigned: CastTy->hasSignedIntegerRepresentation());
1403 Address CastItem = CGF.CreateMemTemp(T: CastTy);
1404 Address ValCastItem = CastItem.withElementType(ElemTy: Val->getType());
1405 CGF.EmitStoreOfScalar(Value: Val, Addr: ValCastItem, /*Volatile=*/false, Ty: ValTy,
1406 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1407 TBAAInfo: TBAAAccessInfo());
1408 return CGF.EmitLoadOfScalar(Addr: CastItem, /*Volatile=*/false, Ty: CastTy, Loc,
1409 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1410 TBAAInfo: TBAAAccessInfo());
1411}
1412
1413///
1414/// Design of OpenMP reductions on the GPU
1415///
1416/// Consider a typical OpenMP program with one or more reduction
1417/// clauses:
1418///
1419/// float foo;
1420/// double bar;
1421/// #pragma omp target teams distribute parallel for \
1422/// reduction(+:foo) reduction(*:bar)
1423/// for (int i = 0; i < N; i++) {
1424/// foo += A[i]; bar *= B[i];
1425/// }
1426///
1427/// where 'foo' and 'bar' are reduced across all OpenMP threads in
1428/// all teams. In our OpenMP implementation on the NVPTX device an
1429/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1430/// within a team are mapped to CUDA threads within a threadblock.
1431/// Our goal is to efficiently aggregate values across all OpenMP
1432/// threads such that:
1433///
1434/// - the compiler and runtime are logically concise, and
1435/// - the reduction is performed efficiently in a hierarchical
1436/// manner as follows: within OpenMP threads in the same warp,
1437/// across warps in a threadblock, and finally across teams on
1438/// the NVPTX device.
1439///
1440/// Introduction to Decoupling
1441///
1442/// We would like to decouple the compiler and the runtime so that the
1443/// latter is ignorant of the reduction variables (number, data types)
1444/// and the reduction operators. This allows a simpler interface
1445/// and implementation while still attaining good performance.
1446///
1447/// Pseudocode for the aforementioned OpenMP program generated by the
1448/// compiler is as follows:
1449///
1450/// 1. Create private copies of reduction variables on each OpenMP
1451/// thread: 'foo_private', 'bar_private'
1452/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1453/// to it and writes the result in 'foo_private' and 'bar_private'
1454/// respectively.
1455/// 3. Call the OpenMP runtime on the GPU to reduce within a team
1456/// and store the result on the team master:
1457///
1458/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1459/// reduceData, shuffleReduceFn, interWarpCpyFn)
1460///
1461/// where:
1462/// struct ReduceData {
1463/// double *foo;
1464/// double *bar;
1465/// } reduceData
1466/// reduceData.foo = &foo_private
1467/// reduceData.bar = &bar_private
1468///
1469/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1470/// auxiliary functions generated by the compiler that operate on
/// variables of type 'ReduceData'. They aid the runtime in performing
/// algorithmic steps in a data-agnostic manner.
1473///
1474/// 'shuffleReduceFn' is a pointer to a function that reduces data
1475/// of type 'ReduceData' across two OpenMP threads (lanes) in the
1476/// same warp. It takes the following arguments as input:
1477///
1478/// a. variable of type 'ReduceData' on the calling lane,
1479/// b. its lane_id,
1480/// c. an offset relative to the current lane_id to generate a
1481/// remote_lane_id. The remote lane contains the second
1482/// variable of type 'ReduceData' that is to be reduced.
1483/// d. an algorithm version parameter determining which reduction
1484/// algorithm to use.
1485///
1486/// 'shuffleReduceFn' retrieves data from the remote lane using
1487/// efficient GPU shuffle intrinsics and reduces, using the
1488/// algorithm specified by the 4th parameter, the two operands
1489/// element-wise. The result is written to the first operand.
1490///
1491/// Different reduction algorithms are implemented in different
1492/// runtime functions, all calling 'shuffleReduceFn' to perform
1493/// the essential reduction step. Therefore, based on the 4th
1494/// parameter, this function behaves slightly differently to
1495/// cooperate with the runtime to ensure correctness under
1496/// different circumstances.
1497///
1498/// 'InterWarpCpyFn' is a pointer to a function that transfers
1499/// reduced variables across warps. It tunnels, through CUDA
1500/// shared memory, the thread-private data of type 'ReduceData'
1501/// from lane 0 of each warp to a lane in the first warp.
1502/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1503/// The last team writes the global reduced value to memory.
1504///
1505/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1506/// reduceData, shuffleReduceFn, interWarpCpyFn,
1507/// scratchpadCopyFn, loadAndReduceFn)
1508///
1509/// 'scratchpadCopyFn' is a helper that stores reduced
1510/// data from the team master to a scratchpad array in
1511/// global memory.
1512///
1513/// 'loadAndReduceFn' is a helper that loads data from
1514/// the scratchpad array and reduces it with the input
1515/// operand.
1516///
1517/// These compiler generated functions hide address
1518/// calculation and alignment information from the runtime.
1519/// 5. if ret == 1:
1520/// The team master of the last team stores the reduced
1521/// result to the globals in memory.
1522/// foo += reduceData.foo; bar *= reduceData.bar
1523///
1524///
1525/// Warp Reduction Algorithms
1526///
1527/// On the warp level, we have three algorithms implemented in the
1528/// OpenMP runtime depending on the number of active lanes:
1529///
1530/// Full Warp Reduction
1531///
1532/// The reduce algorithm within a warp where all lanes are active
1533/// is implemented in the runtime as follows:
1534///
1535/// full_warp_reduce(void *reduce_data,
1536/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1537/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1538/// ShuffleReduceFn(reduce_data, 0, offset, 0);
1539/// }
1540///
1541/// The algorithm completes in log(2, WARPSIZE) steps.
1542///
1543/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1544/// not used therefore we save instructions by not retrieving lane_id
1545/// from the corresponding special registers. The 4th parameter, which
1546/// represents the version of the algorithm being used, is set to 0 to
1547/// signify full warp reduction.
1548///
1549/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1550///
1551/// #reduce_elem refers to an element in the local lane's data structure
1552/// #remote_elem is retrieved from a remote lane
1553/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1554/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1555///
1556/// Contiguous Partial Warp Reduction
1557///
1558/// This reduce algorithm is used within a warp where only the first
1559/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1560/// number of OpenMP threads in a parallel region is not a multiple of
1561/// WARPSIZE. The algorithm is implemented in the runtime as follows:
1562///
1563/// void
1564/// contiguous_partial_reduce(void *reduce_data,
1565/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1566/// int size, int lane_id) {
1567/// int curr_size;
1568/// int offset;
1569/// curr_size = size;
1570/// mask = curr_size/2;
1571/// while (offset>0) {
1572/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1573/// curr_size = (curr_size+1)/2;
1574/// offset = curr_size/2;
1575/// }
1576/// }
1577///
1578/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1579///
1580/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1581/// if (lane_id < offset)
1582/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1583/// else
1584/// reduce_elem = remote_elem
1585///
1586/// This algorithm assumes that the data to be reduced are located in a
1587/// contiguous subset of lanes starting from the first. When there is
1588/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
1590///
1591/// Dispersed Partial Warp Reduction
1592///
1593/// This algorithm is used within a warp when any discontiguous subset of
1594/// lanes are active. It is used to implement the reduction operation
1595/// across lanes in an OpenMP simd region or in a nested parallel region.
1596///
1597/// void
1598/// dispersed_partial_reduce(void *reduce_data,
1599/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1600/// int size, remote_id;
1601/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1602/// do {
1603/// remote_id = next_active_lane_id_right_after_me();
/// # the above function returns 0 if no active lane
1605/// # is present right after the current lane.
1606/// size = number_of_active_lanes_in_this_warp();
1607/// logical_lane_id /= 2;
1608/// ShuffleReduceFn(reduce_data, logical_lane_id,
1609/// remote_id-1-threadIdx.x, 2);
1610/// } while (logical_lane_id % 2 == 0 && size > 1);
1611/// }
1612///
1613/// There is no assumption made about the initial state of the reduction.
1614/// Any number of lanes (>=1) could be active at any position. The reduction
1615/// result is returned in the first active lane.
1616///
1617/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1618///
1619/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1620/// if (lane_id % 2 == 0 && offset > 0)
1621/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1622/// else
1623/// reduce_elem = remote_elem
1624///
1625///
1626/// Intra-Team Reduction
1627///
1628/// This function, as implemented in the runtime call
1629/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1630/// threads in a team. It first reduces within a warp using the
1631/// aforementioned algorithms. We then proceed to gather all such
1632/// reduced values at the first warp.
1633///
1634/// The runtime makes use of the function 'InterWarpCpyFn', which copies
1635/// data from each of the "warp master" (zeroth lane of each warp, where
1636/// warp-reduced data is held) to the zeroth warp. This step reduces (in
1637/// a mathematical sense) the problem of reduction across warp masters in
1638/// a block to the problem of warp reduction.
1639///
1640///
1641/// Inter-Team Reduction
1642///
1643/// Once a team has reduced its data to a single value, it is stored in
1644/// a global scratchpad array. Since each team has a distinct slot, this
1645/// can be done without locking.
1646///
1647/// The last team to write to the scratchpad array proceeds to reduce the
1648/// scratchpad array. One or more workers in the last team use the helper
1649/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
1650/// the k'th worker reduces every k'th element.
1651///
1652/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1653/// reduce across workers and compute a globally reduced value.
1654///
1655void CGOpenMPRuntimeGPU::emitReduction(
1656 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
1657 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
1658 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
1659 if (!CGF.HaveInsertPoint())
1660 return;
1661
1662 bool ParallelReduction = isOpenMPParallelDirective(DKind: Options.ReductionKind);
1663 bool DistributeReduction = isOpenMPDistributeDirective(DKind: Options.ReductionKind);
1664 bool TeamsReduction = isOpenMPTeamsDirective(DKind: Options.ReductionKind);
1665
1666 ASTContext &C = CGM.getContext();
1667
1668 if (Options.SimpleReduction) {
1669 assert(!TeamsReduction && !ParallelReduction &&
1670 "Invalid reduction selection in emitReduction.");
1671 (void)ParallelReduction;
1672 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
1673 ReductionOps, Options);
1674 return;
1675 }
1676
1677 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
1678 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
1679 int Cnt = 0;
1680 for (const Expr *DRE : Privates) {
1681 PrivatesReductions[Cnt] = cast<DeclRefExpr>(Val: DRE)->getDecl();
1682 ++Cnt;
1683 }
1684 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
1685 C&: CGM.getContext(), EscapedDecls: PrivatesReductions, EscapedDeclsForTeams: std::nullopt, MappedDeclsFields&: VarFieldMap, BufSize: 1);
1686
1687 if (TeamsReduction)
1688 TeamsReductions.push_back(Elt: ReductionRec);
1689
1690 // Source location for the ident struct
1691 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1692
1693 using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
1694 InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
1695 CGF.AllocaInsertPt->getIterator());
1696 InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
1697 CGF.Builder.GetInsertPoint());
1698 llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
1699 CodeGenIP, CGF.SourceLocToDebugLoc(Location: Loc));
1700 llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
1701
1702 CodeGenFunction::OMPPrivateScope Scope(CGF);
1703 unsigned Idx = 0;
1704 for (const Expr *Private : Privates) {
1705 llvm::Type *ElementType;
1706 llvm::Value *Variable;
1707 llvm::Value *PrivateVariable;
1708 llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
1709 ElementType = CGF.ConvertTypeForMem(T: Private->getType());
1710 const auto *RHSVar =
1711 cast<VarDecl>(Val: cast<DeclRefExpr>(Val: RHSExprs[Idx])->getDecl());
1712 PrivateVariable = CGF.GetAddrOfLocalVar(VD: RHSVar).emitRawPointer(CGF);
1713 const auto *LHSVar =
1714 cast<VarDecl>(Val: cast<DeclRefExpr>(Val: LHSExprs[Idx])->getDecl());
1715 Variable = CGF.GetAddrOfLocalVar(VD: LHSVar).emitRawPointer(CGF);
1716 llvm::OpenMPIRBuilder::EvalKind EvalKind;
1717 switch (CGF.getEvaluationKind(T: Private->getType())) {
1718 case TEK_Scalar:
1719 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
1720 break;
1721 case TEK_Complex:
1722 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
1723 break;
1724 case TEK_Aggregate:
1725 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
1726 break;
1727 }
1728 auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
1729 llvm::Value **LHSPtr, llvm::Value **RHSPtr,
1730 llvm::Function *NewFunc) {
1731 CGF.Builder.restoreIP(IP: CodeGenIP);
1732 auto *CurFn = CGF.CurFn;
1733 CGF.CurFn = NewFunc;
1734
1735 *LHSPtr = CGF.GetAddrOfLocalVar(
1736 VD: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: LHSExprs[I])->getDecl()))
1737 .emitRawPointer(CGF);
1738 *RHSPtr = CGF.GetAddrOfLocalVar(
1739 VD: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: RHSExprs[I])->getDecl()))
1740 .emitRawPointer(CGF);
1741
1742 emitSingleReductionCombiner(CGF, ReductionOp: ReductionOps[I], PrivateRef: Privates[I],
1743 LHS: cast<DeclRefExpr>(Val: LHSExprs[I]),
1744 RHS: cast<DeclRefExpr>(Val: RHSExprs[I]));
1745
1746 CGF.CurFn = CurFn;
1747
1748 return InsertPointTy(CGF.Builder.GetInsertBlock(),
1749 CGF.Builder.GetInsertPoint());
1750 };
1751 ReductionInfos.emplace_back(Args: llvm::OpenMPIRBuilder::ReductionInfo(
1752 ElementType, Variable, PrivateVariable, EvalKind,
1753 /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
1754 Idx++;
1755 }
1756
1757 CGF.Builder.restoreIP(IP: OMPBuilder.createReductionsGPU(
1758 Loc: OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, IsNoWait: false, IsTeamsReduction: TeamsReduction,
1759 HasDistribute: DistributeReduction, ReductionGenCBKind: llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
1760 GridValue: CGF.getTarget().getGridValue(), ReductionBufNum: C.getLangOpts().OpenMPCUDAReductionBufNum,
1761 SrcLocInfo: RTLoc));
1762 return;
1763}
1764
/// Translate a reference-typed native parameter into the parameter actually
/// used on the device: a restrict-qualified pointer placed in the NVPTX local
/// address space. For 'map'-captured data the pointee additionally gets the
/// global address space. Non-reference parameters are returned unchanged.
const VarDecl *
CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
                                       const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(type: ArgType);
  QualType PointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
  // Data captured via a 'map' clause lives in global memory on the device.
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(T: PointeeTy,
                                                        AddressSpace: LangAS::opencl_global);
    }
  }
  // Rebuild the parameter type: restrict pointer in the local address space.
  ArgType = CGM.getContext().getPointerType(T: PointeeTy);
  QC.addRestrict();
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(space: getLangASFromTargetAS(TargetAS: NVPTX_local_addr));
  ArgType = QC.apply(Context: CGM.getContext(), QT: ArgType);
  // Preserve the declaration kind of the original parameter.
  if (isa<ImplicitParamDecl>(Val: NativeParam))
    return ImplicitParamDecl::Create(
        C&: CGM.getContext(), /*DC=*/nullptr, IdLoc: NativeParam->getLocation(),
        Id: NativeParam->getIdentifier(), T: ArgType, ParamKind: ImplicitParamKind::Other);
  return ParmVarDecl::Create(
      C&: CGM.getContext(),
      DC: const_cast<DeclContext *>(NativeParam->getDeclContext()),
      StartLoc: NativeParam->getBeginLoc(), IdLoc: NativeParam->getLocation(),
      Id: NativeParam->getIdentifier(), T: ArgType,
      /*TInfo=*/nullptr, S: SC_None, /*DefArg=*/nullptr);
}
1796
/// Materialize an address for the native (reference-typed) parameter from the
/// translated target parameter: load the pointer argument, cast it into the
/// native pointee's address space, and store it into a fresh local temporary
/// whose address is returned.
Address
CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
                                        const VarDecl *NativeParam,
                                        const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(VD: TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(type: NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getTypes().getTargetAddressSpace(T: NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  // Load the incoming pointer value of the translated parameter.
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(Addr: LocalAddr, /*Volatile=*/false,
                                                 Ty: TargetTy, Loc: SourceLocation());
  // Cast to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      V: TargetAddr,
      DestTy: llvm::PointerType::get(C&: CGF.getLLVMContext(), AddressSpace: NativePointeeAddrSpace));
  // Spill the casted pointer into a temporary that stands in for the native
  // reference parameter.
  Address NativeParamAddr = CGF.CreateMemTemp(T: NativeParamType);
  CGF.EmitStoreOfScalar(Value: TargetAddr, Addr: NativeParamAddr, /*Volatile=*/false,
                        Ty: NativeParamType);
  return NativeParamAddr;
}
1823
1824void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
1825 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
1826 ArrayRef<llvm::Value *> Args) const {
1827 SmallVector<llvm::Value *, 4> TargetArgs;
1828 TargetArgs.reserve(N: Args.size());
1829 auto *FnType = OutlinedFn.getFunctionType();
1830 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
1831 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
1832 TargetArgs.append(in_start: std::next(x: Args.begin(), n: I), in_end: Args.end());
1833 break;
1834 }
1835 llvm::Type *TargetType = FnType->getParamType(i: I);
1836 llvm::Value *NativeArg = Args[I];
1837 if (!TargetType->isPointerTy()) {
1838 TargetArgs.emplace_back(Args&: NativeArg);
1839 continue;
1840 }
1841 TargetArgs.emplace_back(
1842 Args: CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(V: NativeArg, DestTy: TargetType));
1843 }
1844 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: TargetArgs);
1845}
1846
/// Emit function which wraps the outline parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(RegionKind: OMPD_parallel);

  // Create a function that takes as argument the source thread.
  // Signature: void <name>_wrapper(uint16 ParallelLevel, uint32 ThreadID).
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamKind::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamKind::Other);
  WrapperArgs.emplace_back(Args: &ParallelLevelArg);
  WrapperArgs.emplace_back(Args: &WrapperArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(resultType: Ctx.VoidTy, args: WrapperArgs);

  auto *Fn = llvm::Function::Create(
      Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
      N: Twine(OutlinedParallelFn->getName(), "_wrapper"), M: &CGM.getModule());

  // Ensure we do not inline the function. This is trivially true for the ones
  // passed to __kmpc_fork_call but the ones called in serialized regions
  // could be inlined. This is not perfect but it is closer to the invariant
  // we want, namely, every data environment starts with a new function.
  // TODO: We should pass the if condition to the runtime function and do the
  // handling there. Much cleaner code.
  Fn->addFnAttr(Kind: llvm::Attribute::NoInline);

  CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GD: GlobalDecl(), RetTy: Ctx.VoidTy, Fn, FnInfo: CGFI, Args: WrapperArgs,
                    Loc: D.getBeginLoc(), StartLoc: D.getBeginLoc());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  // The outlined function's bound-thread-id argument is always zero here.
  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  Args.emplace_back(Args: CGF.GetAddrOfLocalVar(VD: &WrapperArg).emitRawPointer(CGF));
  Args.emplace_back(Args: ZeroAddr.emitRawPointer(CGF));

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  RawAddress GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(Ty: CGF.VoidPtrPtrTy, Name: "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
                          M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_shared_variables),
                      args: DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  Address SharedArgListAddress = Address::invalid();
  if (CS.capture_size() > 0 ||
      isOpenMPLoopBoundSharingDirective(Kind: D.getDirectiveKind())) {
    SharedArgListAddress = CGF.EmitLoadOfPointer(
        Ptr: GlobalArgs, PtrTy: CGF.getContext()
                        .getPointerType(T: CGF.getContext().VoidPtrTy)
                        .castAs<PointerType>());
  }
  unsigned Idx = 0;
  // Loop-bound-sharing directives pass the lower and upper bound pointers as
  // the first two shared arguments.
  if (isOpenMPLoopBoundSharingDirective(Kind: D.getDirectiveKind())) {
    Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Addr: Src, Ty: CGF.SizeTy->getPointerTo(), ElementTy: CGF.SizeTy);
    llvm::Value *LB = CGF.EmitLoadOfScalar(
        Addr: TypedAddress,
        /*Volatile=*/false,
        Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
        Loc: cast<OMPLoopDirective>(Val: D).getLowerBoundVariable()->getExprLoc());
    Args.emplace_back(Args&: LB);
    ++Idx;
    Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Addr: Src, Ty: CGF.SizeTy->getPointerTo(), ElementTy: CGF.SizeTy);
    llvm::Value *UB = CGF.EmitLoadOfScalar(
        Addr: TypedAddress,
        /*Volatile=*/false,
        Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
        Loc: cast<OMPLoopDirective>(Val: D).getUpperBoundVariable()->getExprLoc());
    Args.emplace_back(Args&: UB);
    ++Idx;
  }
  // The remaining shared arguments correspond to the statement's captures.
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: I + Idx);
      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
          Addr: Src, Ty: CGF.ConvertTypeForMem(T: CGFContext.getPointerType(T: ElemTy)),
          ElementTy: CGF.ConvertTypeForMem(T: ElemTy));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(Addr: TypedAddress,
                                              /*Volatile=*/false,
                                              Ty: CGFContext.getPointerType(T: ElemTy),
                                              Loc: CI->getLocation());
      // By-copy captures of non-pointer values are passed as uintptr values.
      if (CI->capturesVariableByCopy() &&
          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
        Arg = castValueToType(CGF, Val: Arg, ValTy: ElemTy, CastTy: CGFContext.getUIntPtrType(),
                              Loc: CI->getLocation());
      }
      Args.emplace_back(Args&: Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, Loc: D.getBeginLoc(), OutlinedFn: OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}
1977
/// In generic data-sharing mode, detect the variables that escape this
/// function's data environment and register them for globalization. Unless
/// globalization must be delayed, emit the globalization prolog immediately
/// and push a cleanup that emits the matching epilog on function exit.
void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
                                            const Decl *D) {
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
         "Team is set but not processed.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(Val: D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(Val: D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(Val: D)) {
    Body = CD->getBody();
    // OpenMP captured regions delay globalization; in SPMD mode it is not
    // needed at all, so bail out early.
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  // Scan the body for declarations escaping the current context.
  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
  VarChecker.Visit(S: Body);
  const RecordDecl *GlobalizedVarsRecord =
      VarChecker.getGlobalizedRecord(IsInTTDRegion);
  TeamAndReductions.first = nullptr;
  TeamAndReductions.second.clear();
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
      VarChecker.getDelayedVariableLengthDecls();
  // Nothing escaped: no globalization bookkeeping required.
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
      DelayedVariableLengthDecls.empty())
    return;
  // Record the escaped parameters and declarations for this function.
  auto I = FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
  I->getSecond().MappedParams =
      std::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().EscapedParameters.insert(
      I: VarChecker.getEscapedParameters().begin(),
      E: VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      in_start: EscapedVariableLengthDecls.begin(), in_end: EscapedVariableLengthDecls.end());
  I->getSecond().DelayedVariableLengthDecls.append(
      in_start: DelayedVariableLengthDecls.begin(), in_end: DelayedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    Data.insert(KV: std::make_pair(x&: VD, y: MappedVarData()));
  }
  if (!NeedToDelayGlobalization) {
    // Emit the prolog now and arrange for the epilog to run on both normal
    // and exceptional exits.
    emitGenericVarsProlog(CGF, Loc: D->getBeginLoc());
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(Kind: NormalAndEHCleanup);
  }
}
2044
/// Return the device address to use for local variable \p VD: a dedicated
/// module-level global in the address space selected by an 'omp allocate'
/// attribute, or the globalized private address recorded for this function.
/// Returns an invalid Address when normal (stack) allocation should be used.
Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                      const VarDecl *VD) {
  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
    auto AS = LangAS::Default;
    switch (A->getAllocatorType()) {
      // Use the default allocator here as by default local vars are
      // threadlocal.
    case OMPAllocateDeclAttr::OMPNullMemAlloc:
    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
      // Follow the user decision - use default allocation.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      // TODO: implement support for user-defined allocators.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPConstMemAlloc:
      AS = LangAS::cuda_constant;
      break;
    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
      AS = LangAS::cuda_shared;
      break;
    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
      break;
    }
    // Materialize the variable as an internal global in the chosen address
    // space and hand back a pointer cast into the variable's own AS.
    llvm::Type *VarTy = CGF.ConvertTypeForMem(T: VD->getType());
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), VarTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(T: VarTy),
        VD->getName(),
        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(AS));
    CharUnits Align = CGM.getContext().getDeclAlign(D: VD);
    GV->setAlignment(Align.getAsAlign());
    return Address(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            V: GV, DestTy: VarTy->getPointerTo(AddrSpace: CGM.getContext().getTargetAddressSpace(
                    AS: VD->getType().getAddressSpace()))),
        VarTy, Align);
  }

  // Globalized addresses are only tracked in generic data-sharing mode.
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return Address::invalid();

  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(Key: VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  // Fall back to the globalized address of a variable this one references
  // (via OMPReferencedVarAttr), if any.
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
         E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          Key: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }

  return Address::invalid();
}
2113
/// Drop the per-function globalization bookkeeping for the finished function,
/// then run the base-class cleanup.
void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(Val: CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}
2118
2119void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
2120 CodeGenFunction &CGF, const OMPLoopDirective &S,
2121 OpenMPDistScheduleClauseKind &ScheduleKind,
2122 llvm::Value *&Chunk) const {
2123 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2124 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
2125 ScheduleKind = OMPC_DIST_SCHEDULE_static;
2126 Chunk = CGF.EmitScalarConversion(
2127 Src: RT.getGPUNumThreads(CGF),
2128 SrcTy: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
2129 DstTy: S.getIterationVariable()->getType(), Loc: S.getBeginLoc());
2130 return;
2131 }
2132 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
2133 CGF, S, ScheduleKind, Chunk);
2134}
2135
2136void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
2137 CodeGenFunction &CGF, const OMPLoopDirective &S,
2138 OpenMPScheduleClauseKind &ScheduleKind,
2139 const Expr *&ChunkExpr) const {
2140 ScheduleKind = OMPC_SCHEDULE_static;
2141 // Chunk size is 1 in this case.
2142 llvm::APInt ChunkSize(32, 1);
2143 ChunkExpr = IntegerLiteral::Create(C: CGF.getContext(), V: ChunkSize,
2144 type: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
2145 l: SourceLocation());
2146}
2147
/// For each lambda object captured by a target-based directive, rewrite its
/// by-reference capture fields (and a captured 'this', if any) to hold the
/// device-side addresses of the corresponding captured entities.
void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         " Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(RegionKind: OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    // Only lambda objects need their capture fields adjusted.
    if (!RD || !RD->isLambda())
      continue;
    // Form an lvalue for the lambda object, dereferencing a reference
    // capture if necessary.
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(RefAddr: VDAddr, RefTy: VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          Addr: VDAddr, T: VD->getType().getCanonicalType().getNonReferenceType());
    llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    // Store the current 'this' pointer into the lambda's 'this' capture field.
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(value: CXXThis, lvalue: ThisLVal);
    }
    // Store the local address of each by-reference captured variable into the
    // matching lambda capture field.
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const ValueDecl *VD = LC.getCapturedVar();
      // FIXME: For now VD is always a VarDecl because OpenMP does not support
      // capturing structured bindings in lambdas yet.
      if (!CS->capturesVariable(Var: cast<VarDecl>(Val: VD)))
        continue;
      auto It = Captures.find(Val: VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(VD: cast<VarDecl>(Val: VD));
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(RefAddr: VDAddr,
                                               RefTy: VD->getType().getCanonicalType())
                     .getAddress();
      CGF.EmitStoreOfScalar(value: VDAddr.emitRawPointer(CGF), lvalue: VarLVal);
    }
  }
}
2201
2202bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
2203 LangAS &AS) {
2204 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
2205 return false;
2206 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2207 switch(A->getAllocatorType()) {
2208 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2209 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2210 // Not supported, fallback to the default mem space.
2211 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2212 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2213 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2214 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2215 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2216 AS = LangAS::Default;
2217 return true;
2218 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2219 AS = LangAS::cuda_constant;
2220 return true;
2221 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2222 AS = LangAS::cuda_shared;
2223 return true;
2224 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2225 llvm_unreachable("Expected predefined allocator for the variables with the "
2226 "static storage.");
2227 }
2228 return false;
2229}
2230
2231// Get current OffloadArch and ignore any unknown values
2232static OffloadArch getOffloadArch(CodeGenModule &CGM) {
2233 if (!CGM.getTarget().hasFeature(Feature: "ptx"))
2234 return OffloadArch::UNKNOWN;
2235 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
2236 if (Feature.getValue()) {
2237 OffloadArch Arch = StringToOffloadArch(S: Feature.getKey());
2238 if (Arch != OffloadArch::UNKNOWN)
2239 return Arch;
2240 }
2241 }
2242 return OffloadArch::UNKNOWN;
2243}
2244
/// Check to see if target architecture supports unified addressing which is
/// a restriction for OpenMP requires clause "unified_shared_memory".
void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
  for (const OMPClause *Clause : D->clauselists()) {
    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
      OffloadArch Arch = getOffloadArch(CGM);
      // Diagnose architectures known to lack unified addressing; all newer
      // NVPTX and all listed AMDGCN architectures are accepted.
      switch (Arch) {
      case OffloadArch::SM_20:
      case OffloadArch::SM_21:
      case OffloadArch::SM_30:
      case OffloadArch::SM_32_:
      case OffloadArch::SM_35:
      case OffloadArch::SM_37:
      case OffloadArch::SM_50:
      case OffloadArch::SM_52:
      case OffloadArch::SM_53: {
        SmallString<256> Buffer;
        llvm::raw_svector_ostream Out(Buffer);
        Out << "Target architecture " << OffloadArchToString(A: Arch)
            << " does not support unified addressing";
        CGM.Error(loc: Clause->getBeginLoc(), error: Out.str());
        return;
      }
      case OffloadArch::SM_60:
      case OffloadArch::SM_61:
      case OffloadArch::SM_62:
      case OffloadArch::SM_70:
      case OffloadArch::SM_72:
      case OffloadArch::SM_75:
      case OffloadArch::SM_80:
      case OffloadArch::SM_86:
      case OffloadArch::SM_87:
      case OffloadArch::SM_89:
      case OffloadArch::SM_90:
      case OffloadArch::SM_90a:
      case OffloadArch::GFX600:
      case OffloadArch::GFX601:
      case OffloadArch::GFX602:
      case OffloadArch::GFX700:
      case OffloadArch::GFX701:
      case OffloadArch::GFX702:
      case OffloadArch::GFX703:
      case OffloadArch::GFX704:
      case OffloadArch::GFX705:
      case OffloadArch::GFX801:
      case OffloadArch::GFX802:
      case OffloadArch::GFX803:
      case OffloadArch::GFX805:
      case OffloadArch::GFX810:
      case OffloadArch::GFX9_GENERIC:
      case OffloadArch::GFX900:
      case OffloadArch::GFX902:
      case OffloadArch::GFX904:
      case OffloadArch::GFX906:
      case OffloadArch::GFX908:
      case OffloadArch::GFX909:
      case OffloadArch::GFX90a:
      case OffloadArch::GFX90c:
      case OffloadArch::GFX940:
      case OffloadArch::GFX941:
      case OffloadArch::GFX942:
      case OffloadArch::GFX10_1_GENERIC:
      case OffloadArch::GFX1010:
      case OffloadArch::GFX1011:
      case OffloadArch::GFX1012:
      case OffloadArch::GFX1013:
      case OffloadArch::GFX10_3_GENERIC:
      case OffloadArch::GFX1030:
      case OffloadArch::GFX1031:
      case OffloadArch::GFX1032:
      case OffloadArch::GFX1033:
      case OffloadArch::GFX1034:
      case OffloadArch::GFX1035:
      case OffloadArch::GFX1036:
      case OffloadArch::GFX11_GENERIC:
      case OffloadArch::GFX1100:
      case OffloadArch::GFX1101:
      case OffloadArch::GFX1102:
      case OffloadArch::GFX1103:
      case OffloadArch::GFX1150:
      case OffloadArch::GFX1151:
      case OffloadArch::GFX1152:
      case OffloadArch::GFX12_GENERIC:
      case OffloadArch::GFX1200:
      case OffloadArch::GFX1201:
      case OffloadArch::AMDGCNSPIRV:
      case OffloadArch::Generic:
      case OffloadArch::UNUSED:
      case OffloadArch::UNKNOWN:
        break;
      case OffloadArch::LAST:
        llvm_unreachable("Unexpected GPU arch.");
      }
    }
  }
  CGOpenMPRuntime::processRequiresDirective(D);
}
2342
2343llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
2344 CGBuilderTy &Bld = CGF.Builder;
2345 llvm::Module *M = &CGF.CGM.getModule();
2346 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
2347 llvm::Function *F = M->getFunction(Name: LocSize);
2348 if (!F) {
2349 F = llvm::Function::Create(
2350 Ty: llvm::FunctionType::get(Result: CGF.Int32Ty, Params: std::nullopt, isVarArg: false),
2351 Linkage: llvm::GlobalVariable::ExternalLinkage, N: LocSize, M: &CGF.CGM.getModule());
2352 }
2353 return Bld.CreateCall(Callee: F, Args: std::nullopt, Name: "nvptx_num_threads");
2354}
2355
2356llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
2357 ArrayRef<llvm::Value *> Args{};
2358 return CGF.EmitRuntimeCall(
2359 callee: OMPBuilder.getOrCreateRuntimeFunction(
2360 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_hardware_thread_id_in_block),
2361 args: Args);
2362}
2363