1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX and AMDGCN.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CGDebugInfo.h"
16#include "CodeGenFunction.h"
17#include "clang/AST/Attr.h"
18#include "clang/AST/DeclOpenMP.h"
19#include "clang/AST/OpenMPClause.h"
20#include "clang/AST/StmtOpenMP.h"
21#include "clang/AST/StmtVisitor.h"
22#include "clang/Basic/Cuda.h"
23#include "llvm/ADT/SmallPtrSet.h"
24#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
25#include "llvm/Frontend/OpenMP/OMPGridValues.h"
26
27using namespace clang;
28using namespace CodeGen;
29using namespace llvm::omp;
30
31namespace {
32/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
33class NVPTXActionTy final : public PrePostActionTy {
34 llvm::FunctionCallee EnterCallee = nullptr;
35 ArrayRef<llvm::Value *> EnterArgs;
36 llvm::FunctionCallee ExitCallee = nullptr;
37 ArrayRef<llvm::Value *> ExitArgs;
38 bool Conditional = false;
39 llvm::BasicBlock *ContBlock = nullptr;
40
41public:
42 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
43 ArrayRef<llvm::Value *> EnterArgs,
44 llvm::FunctionCallee ExitCallee,
45 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
46 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
47 ExitArgs(ExitArgs), Conditional(Conditional) {}
48 void Enter(CodeGenFunction &CGF) override {
49 llvm::Value *EnterRes = CGF.EmitRuntimeCall(callee: EnterCallee, args: EnterArgs);
50 if (Conditional) {
51 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(Arg: EnterRes);
52 auto *ThenBlock = CGF.createBasicBlock(name: "omp_if.then");
53 ContBlock = CGF.createBasicBlock(name: "omp_if.end");
54 // Generate the branch (If-stmt)
55 CGF.Builder.CreateCondBr(Cond: CallBool, True: ThenBlock, False: ContBlock);
56 CGF.EmitBlock(BB: ThenBlock);
57 }
58 }
59 void Done(CodeGenFunction &CGF) {
60 // Emit the rest of blocks/branches
61 CGF.EmitBranch(Block: ContBlock);
62 CGF.EmitBlock(BB: ContBlock, IsFinished: true);
63 }
64 void Exit(CodeGenFunction &CGF) override {
65 CGF.EmitRuntimeCall(callee: ExitCallee, args: ExitArgs);
66 }
67};
68
69/// A class to track the execution mode when codegening directives within
70/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
71/// to the target region and used by containing directives such as 'parallel'
72/// to emit optimized code.
73class ExecutionRuntimeModesRAII {
74private:
75 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
76 CGOpenMPRuntimeGPU::EM_Unknown;
77 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
78
79public:
80 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
81 CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
82 : ExecMode(ExecMode) {
83 SavedExecMode = ExecMode;
84 ExecMode = EntryMode;
85 }
86 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
87};
88
89static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
90 RefExpr = RefExpr->IgnoreParens();
91 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: RefExpr)) {
92 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
93 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base))
94 Base = TempASE->getBase()->IgnoreParenImpCasts();
95 RefExpr = Base;
96 } else if (auto *OASE = dyn_cast<ArraySectionExpr>(Val: RefExpr)) {
97 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
98 while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Val: Base))
99 Base = TempOASE->getBase()->IgnoreParenImpCasts();
100 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base))
101 Base = TempASE->getBase()->IgnoreParenImpCasts();
102 RefExpr = Base;
103 }
104 RefExpr = RefExpr->IgnoreParenImpCasts();
105 if (const auto *DE = dyn_cast<DeclRefExpr>(Val: RefExpr))
106 return cast<ValueDecl>(Val: DE->getDecl()->getCanonicalDecl());
107 const auto *ME = cast<MemberExpr>(Val: RefExpr);
108 return cast<ValueDecl>(Val: ME->getMemberDecl()->getCanonicalDecl());
109}
110
111static RecordDecl *buildRecordForGlobalizedVars(
112 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
113 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
114 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
115 &MappedDeclsFields,
116 int BufSize) {
117 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
118 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
119 return nullptr;
120 SmallVector<VarsDataTy, 4> GlobalizedVars;
121 for (const ValueDecl *D : EscapedDecls)
122 GlobalizedVars.emplace_back(Args: C.getDeclAlign(D), Args&: D);
123 for (const ValueDecl *D : EscapedDeclsForTeams)
124 GlobalizedVars.emplace_back(Args: C.getDeclAlign(D), Args&: D);
125
126 // Build struct _globalized_locals_ty {
127 // /* globalized vars */[WarSize] align (decl_align)
128 // /* globalized vars */ for EscapedDeclsForTeams
129 // };
130 RecordDecl *GlobalizedRD = C.buildImplicitRecord(Name: "_globalized_locals_ty");
131 GlobalizedRD->startDefinition();
132 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(llvm::from_range,
133 EscapedDeclsForTeams);
134 for (const auto &Pair : GlobalizedVars) {
135 const ValueDecl *VD = Pair.second;
136 QualType Type = VD->getType();
137 if (Type->isLValueReferenceType())
138 Type = C.getPointerType(T: Type.getNonReferenceType());
139 else
140 Type = Type.getNonReferenceType();
141 SourceLocation Loc = VD->getLocation();
142 FieldDecl *Field;
143 if (SingleEscaped.count(Ptr: VD)) {
144 Field = FieldDecl::Create(
145 C, DC: GlobalizedRD, StartLoc: Loc, IdLoc: Loc, Id: VD->getIdentifier(), T: Type,
146 TInfo: C.getTrivialTypeSourceInfo(T: Type, Loc: SourceLocation()),
147 /*BW=*/nullptr, /*Mutable=*/false,
148 /*InitStyle=*/ICIS_NoInit);
149 Field->setAccess(AS_public);
150 if (VD->hasAttrs()) {
151 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
152 E(VD->getAttrs().end());
153 I != E; ++I)
154 Field->addAttr(A: *I);
155 }
156 } else {
157 if (BufSize > 1) {
158 llvm::APInt ArraySize(32, BufSize);
159 Type = C.getConstantArrayType(EltTy: Type, ArySize: ArraySize, SizeExpr: nullptr,
160 ASM: ArraySizeModifier::Normal, IndexTypeQuals: 0);
161 }
162 Field = FieldDecl::Create(
163 C, DC: GlobalizedRD, StartLoc: Loc, IdLoc: Loc, Id: VD->getIdentifier(), T: Type,
164 TInfo: C.getTrivialTypeSourceInfo(T: Type, Loc: SourceLocation()),
165 /*BW=*/nullptr, /*Mutable=*/false,
166 /*InitStyle=*/ICIS_NoInit);
167 Field->setAccess(AS_public);
168 llvm::APInt Align(32, Pair.first.getQuantity());
169 Field->addAttr(A: AlignedAttr::CreateImplicit(
170 Ctx&: C, /*IsAlignmentExpr=*/true,
171 Alignment: IntegerLiteral::Create(C, V: Align,
172 type: C.getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
173 l: SourceLocation()),
174 Range: {}, S: AlignedAttr::GNU_aligned));
175 }
176 GlobalizedRD->addDecl(D: Field);
177 MappedDeclsFields.try_emplace(Key: VD, Args&: Field);
178 }
179 GlobalizedRD->completeDefinition();
180 return GlobalizedRD;
181}
182
183/// Get the list of variables that can escape their declaration context.
184class CheckVarsEscapingDeclContext final
185 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
186 CodeGenFunction &CGF;
187 llvm::SetVector<const ValueDecl *> EscapedDecls;
188 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
189 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
190 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
191 RecordDecl *GlobalizedRD = nullptr;
192 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
193 bool AllEscaped = false;
194 bool IsForCombinedParallelRegion = false;
195
196 void markAsEscaped(const ValueDecl *VD) {
197 // Do not globalize declare target variables.
198 if (!isa<VarDecl>(Val: VD) ||
199 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
200 return;
201 VD = cast<ValueDecl>(Val: VD->getCanonicalDecl());
202 // Use user-specified allocation.
203 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
204 return;
205 // Variables captured by value must be globalized.
206 bool IsCaptured = false;
207 if (auto *CSI = CGF.CapturedStmtInfo) {
208 if (const FieldDecl *FD = CSI->lookup(VD: cast<VarDecl>(Val: VD))) {
209 // Check if need to capture the variable that was already captured by
210 // value in the outer region.
211 IsCaptured = true;
212 if (!IsForCombinedParallelRegion) {
213 if (!FD->hasAttrs())
214 return;
215 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
216 if (!Attr)
217 return;
218 if (((Attr->getCaptureKind() != OMPC_map) &&
219 !isOpenMPPrivate(Kind: Attr->getCaptureKind())) ||
220 ((Attr->getCaptureKind() == OMPC_map) &&
221 !FD->getType()->isAnyPointerType()))
222 return;
223 }
224 if (!FD->getType()->isReferenceType()) {
225 assert(!VD->getType()->isVariablyModifiedType() &&
226 "Parameter captured by value with variably modified type");
227 EscapedParameters.insert(Ptr: VD);
228 } else if (!IsForCombinedParallelRegion) {
229 return;
230 }
231 }
232 }
233 if ((!CGF.CapturedStmtInfo ||
234 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
235 VD->getType()->isReferenceType())
236 // Do not globalize variables with reference type.
237 return;
238 if (VD->getType()->isVariablyModifiedType()) {
239 // If not captured at the target region level then mark the escaped
240 // variable as delayed.
241 if (IsCaptured)
242 EscapedVariableLengthDecls.insert(X: VD);
243 else
244 DelayedVariableLengthDecls.insert(X: VD);
245 } else
246 EscapedDecls.insert(X: VD);
247 }
248
249 void VisitValueDecl(const ValueDecl *VD) {
250 if (VD->getType()->isLValueReferenceType())
251 markAsEscaped(VD);
252 if (const auto *VarD = dyn_cast<VarDecl>(Val: VD)) {
253 if (!isa<ParmVarDecl>(Val: VarD) && VarD->hasInit()) {
254 const bool SavedAllEscaped = AllEscaped;
255 AllEscaped = VD->getType()->isLValueReferenceType();
256 Visit(S: VarD->getInit());
257 AllEscaped = SavedAllEscaped;
258 }
259 }
260 }
261 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
262 ArrayRef<OMPClause *> Clauses,
263 bool IsCombinedParallelRegion) {
264 if (!S)
265 return;
266 for (const CapturedStmt::Capture &C : S->captures()) {
267 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
268 const ValueDecl *VD = C.getCapturedVar();
269 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
270 if (IsCombinedParallelRegion) {
271 // Check if the variable is privatized in the combined construct and
272 // those private copies must be shared in the inner parallel
273 // directive.
274 IsForCombinedParallelRegion = false;
275 for (const OMPClause *C : Clauses) {
276 if (!isOpenMPPrivate(Kind: C->getClauseKind()) ||
277 C->getClauseKind() == OMPC_reduction ||
278 C->getClauseKind() == OMPC_linear ||
279 C->getClauseKind() == OMPC_private)
280 continue;
281 ArrayRef<const Expr *> Vars;
282 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(Val: C))
283 Vars = PC->getVarRefs();
284 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(Val: C))
285 Vars = PC->getVarRefs();
286 else
287 llvm_unreachable("Unexpected clause.");
288 for (const auto *E : Vars) {
289 const Decl *D =
290 cast<DeclRefExpr>(Val: E)->getDecl()->getCanonicalDecl();
291 if (D == VD->getCanonicalDecl()) {
292 IsForCombinedParallelRegion = true;
293 break;
294 }
295 }
296 if (IsForCombinedParallelRegion)
297 break;
298 }
299 }
300 markAsEscaped(VD);
301 if (isa<OMPCapturedExprDecl>(Val: VD))
302 VisitValueDecl(VD);
303 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
304 }
305 }
306 }
307
308 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
309 assert(!GlobalizedRD &&
310 "Record for globalized variables is built already.");
311 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
312 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
313 if (IsInTTDRegion)
314 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
315 else
316 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
317 GlobalizedRD = ::buildRecordForGlobalizedVars(
318 C&: CGF.getContext(), EscapedDecls: EscapedDeclsForParallel, EscapedDeclsForTeams,
319 MappedDeclsFields, BufSize: WarpSize);
320 }
321
322public:
323 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
324 ArrayRef<const ValueDecl *> TeamsReductions)
325 : CGF(CGF), EscapedDecls(llvm::from_range, TeamsReductions) {}
326 ~CheckVarsEscapingDeclContext() = default;
327 void VisitDeclStmt(const DeclStmt *S) {
328 if (!S)
329 return;
330 for (const Decl *D : S->decls())
331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(Val: D))
332 VisitValueDecl(VD);
333 }
334 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335 if (!D)
336 return;
337 if (!D->hasAssociatedStmt())
338 return;
339 if (const auto *S =
340 dyn_cast_or_null<CapturedStmt>(Val: D->getAssociatedStmt())) {
341 // Do not analyze directives that do not actually require capturing,
342 // like `omp for` or `omp simd` directives.
343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
344 getOpenMPCaptureRegions(CaptureRegions, DKind: D->getDirectiveKind());
345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346 VisitStmt(S: S->getCapturedStmt());
347 return;
348 }
349 VisitOpenMPCapturedStmt(
350 S, Clauses: D->clauses(),
351 IsCombinedParallelRegion: CaptureRegions.back() == OMPD_parallel &&
352 isOpenMPDistributeDirective(DKind: D->getDirectiveKind()));
353 }
354 }
355 void VisitCapturedStmt(const CapturedStmt *S) {
356 if (!S)
357 return;
358 for (const CapturedStmt::Capture &C : S->captures()) {
359 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360 const ValueDecl *VD = C.getCapturedVar();
361 markAsEscaped(VD);
362 if (isa<OMPCapturedExprDecl>(Val: VD))
363 VisitValueDecl(VD);
364 }
365 }
366 }
367 void VisitLambdaExpr(const LambdaExpr *E) {
368 if (!E)
369 return;
370 for (const LambdaCapture &C : E->captures()) {
371 if (C.capturesVariable()) {
372 if (C.getCaptureKind() == LCK_ByRef) {
373 const ValueDecl *VD = C.getCapturedVar();
374 markAsEscaped(VD);
375 if (E->isInitCapture(Capture: &C) || isa<OMPCapturedExprDecl>(Val: VD))
376 VisitValueDecl(VD);
377 }
378 }
379 }
380 }
381 void VisitBlockExpr(const BlockExpr *E) {
382 if (!E)
383 return;
384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385 if (C.isByRef()) {
386 const VarDecl *VD = C.getVariable();
387 markAsEscaped(VD);
388 if (isa<OMPCapturedExprDecl>(Val: VD) || VD->isInitCapture())
389 VisitValueDecl(VD);
390 }
391 }
392 }
393 void VisitCallExpr(const CallExpr *E) {
394 if (!E)
395 return;
396 for (const Expr *Arg : E->arguments()) {
397 if (!Arg)
398 continue;
399 if (Arg->isLValue()) {
400 const bool SavedAllEscaped = AllEscaped;
401 AllEscaped = true;
402 Visit(S: Arg);
403 AllEscaped = SavedAllEscaped;
404 } else {
405 Visit(S: Arg);
406 }
407 }
408 Visit(S: E->getCallee());
409 }
410 void VisitDeclRefExpr(const DeclRefExpr *E) {
411 if (!E)
412 return;
413 const ValueDecl *VD = E->getDecl();
414 if (AllEscaped)
415 markAsEscaped(VD);
416 if (isa<OMPCapturedExprDecl>(Val: VD))
417 VisitValueDecl(VD);
418 else if (VD->isInitCapture())
419 VisitValueDecl(VD);
420 }
421 void VisitUnaryOperator(const UnaryOperator *E) {
422 if (!E)
423 return;
424 if (E->getOpcode() == UO_AddrOf) {
425 const bool SavedAllEscaped = AllEscaped;
426 AllEscaped = true;
427 Visit(S: E->getSubExpr());
428 AllEscaped = SavedAllEscaped;
429 } else {
430 Visit(S: E->getSubExpr());
431 }
432 }
433 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434 if (!E)
435 return;
436 if (E->getCastKind() == CK_ArrayToPointerDecay) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(S: E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(S: E->getSubExpr());
443 }
444 }
445 void VisitExpr(const Expr *E) {
446 if (!E)
447 return;
448 bool SavedAllEscaped = AllEscaped;
449 if (!E->isLValue())
450 AllEscaped = false;
451 for (const Stmt *Child : E->children())
452 if (Child)
453 Visit(S: Child);
454 AllEscaped = SavedAllEscaped;
455 }
456 void VisitStmt(const Stmt *S) {
457 if (!S)
458 return;
459 for (const Stmt *Child : S->children())
460 if (Child)
461 Visit(S: Child);
462 }
463
464 /// Returns the record that handles all the escaped local variables and used
465 /// instead of their original storage.
466 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467 if (!GlobalizedRD)
468 buildRecordForGlobalizedVars(IsInTTDRegion);
469 return GlobalizedRD;
470 }
471
472 /// Returns the field in the globalized record for the escaped variable.
473 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474 assert(GlobalizedRD &&
475 "Record for globalized variables must be generated already.");
476 return MappedDeclsFields.lookup(Val: VD);
477 }
478
479 /// Returns the list of the escaped local variables/parameters.
480 ArrayRef<const ValueDecl *> getEscapedDecls() const {
481 return EscapedDecls.getArrayRef();
482 }
483
484 /// Checks if the escaped local variable is actually a parameter passed by
485 /// value.
486 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487 return EscapedParameters;
488 }
489
490 /// Returns the list of the escaped variables with the variably modified
491 /// types.
492 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493 return EscapedVariableLengthDecls.getArrayRef();
494 }
495
496 /// Returns the list of the delayed variables with the variably modified
497 /// types.
498 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499 return DelayedVariableLengthDecls.getArrayRef();
500 }
501};
502} // anonymous namespace
503
504CGOpenMPRuntimeGPU::ExecutionMode
505CGOpenMPRuntimeGPU::getExecutionMode() const {
506 return CurrentExecutionMode;
507}
508
509CGOpenMPRuntimeGPU::DataSharingMode
510CGOpenMPRuntimeGPU::getDataSharingMode() const {
511 return CurrentDataSharingMode;
512}
513
514/// Check for inner (nested) SPMD construct, if any
515static bool hasNestedSPMDDirective(ASTContext &Ctx,
516 const OMPExecutableDirective &D) {
517 const auto *CS = D.getInnermostCapturedStmt();
518 const auto *Body =
519 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
520 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
521
522 if (const auto *NestedDir =
523 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
524 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
525 switch (D.getDirectiveKind()) {
526 case OMPD_target:
527 if (isOpenMPParallelDirective(DKind))
528 return true;
529 if (DKind == OMPD_teams) {
530 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
531 /*IgnoreCaptured=*/true);
532 if (!Body)
533 return false;
534 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
535 if (const auto *NND =
536 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
537 DKind = NND->getDirectiveKind();
538 if (isOpenMPParallelDirective(DKind))
539 return true;
540 }
541 }
542 return false;
543 case OMPD_target_teams:
544 return isOpenMPParallelDirective(DKind);
545 case OMPD_target_simd:
546 case OMPD_target_parallel:
547 case OMPD_target_parallel_for:
548 case OMPD_target_parallel_for_simd:
549 case OMPD_target_teams_distribute:
550 case OMPD_target_teams_distribute_simd:
551 case OMPD_target_teams_distribute_parallel_for:
552 case OMPD_target_teams_distribute_parallel_for_simd:
553 case OMPD_parallel:
554 case OMPD_for:
555 case OMPD_parallel_for:
556 case OMPD_parallel_master:
557 case OMPD_parallel_sections:
558 case OMPD_for_simd:
559 case OMPD_parallel_for_simd:
560 case OMPD_cancel:
561 case OMPD_cancellation_point:
562 case OMPD_ordered:
563 case OMPD_threadprivate:
564 case OMPD_allocate:
565 case OMPD_task:
566 case OMPD_simd:
567 case OMPD_sections:
568 case OMPD_section:
569 case OMPD_single:
570 case OMPD_master:
571 case OMPD_critical:
572 case OMPD_taskyield:
573 case OMPD_barrier:
574 case OMPD_taskwait:
575 case OMPD_taskgroup:
576 case OMPD_atomic:
577 case OMPD_flush:
578 case OMPD_depobj:
579 case OMPD_scan:
580 case OMPD_teams:
581 case OMPD_target_data:
582 case OMPD_target_exit_data:
583 case OMPD_target_enter_data:
584 case OMPD_distribute:
585 case OMPD_distribute_simd:
586 case OMPD_distribute_parallel_for:
587 case OMPD_distribute_parallel_for_simd:
588 case OMPD_teams_distribute:
589 case OMPD_teams_distribute_simd:
590 case OMPD_teams_distribute_parallel_for:
591 case OMPD_teams_distribute_parallel_for_simd:
592 case OMPD_target_update:
593 case OMPD_declare_simd:
594 case OMPD_declare_variant:
595 case OMPD_begin_declare_variant:
596 case OMPD_end_declare_variant:
597 case OMPD_declare_target:
598 case OMPD_end_declare_target:
599 case OMPD_declare_reduction:
600 case OMPD_declare_mapper:
601 case OMPD_taskloop:
602 case OMPD_taskloop_simd:
603 case OMPD_master_taskloop:
604 case OMPD_master_taskloop_simd:
605 case OMPD_parallel_master_taskloop:
606 case OMPD_parallel_master_taskloop_simd:
607 case OMPD_requires:
608 case OMPD_unknown:
609 default:
610 llvm_unreachable("Unexpected directive.");
611 }
612 }
613
614 return false;
615}
616
617static bool supportsSPMDExecutionMode(ASTContext &Ctx,
618 const OMPExecutableDirective &D) {
619 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
620 switch (DirectiveKind) {
621 case OMPD_target:
622 case OMPD_target_teams:
623 return hasNestedSPMDDirective(Ctx, D);
624 case OMPD_target_parallel_loop:
625 case OMPD_target_parallel:
626 case OMPD_target_parallel_for:
627 case OMPD_target_parallel_for_simd:
628 case OMPD_target_teams_distribute_parallel_for:
629 case OMPD_target_teams_distribute_parallel_for_simd:
630 case OMPD_target_simd:
631 case OMPD_target_teams_distribute_simd:
632 return true;
633 case OMPD_target_teams_distribute:
634 return false;
635 case OMPD_target_teams_loop:
636 // Whether this is true or not depends on how the directive will
637 // eventually be emitted.
638 if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(Val: &D))
639 return TTLD->canBeParallelFor();
640 return false;
641 case OMPD_parallel:
642 case OMPD_for:
643 case OMPD_parallel_for:
644 case OMPD_parallel_master:
645 case OMPD_parallel_sections:
646 case OMPD_for_simd:
647 case OMPD_parallel_for_simd:
648 case OMPD_cancel:
649 case OMPD_cancellation_point:
650 case OMPD_ordered:
651 case OMPD_threadprivate:
652 case OMPD_allocate:
653 case OMPD_task:
654 case OMPD_simd:
655 case OMPD_sections:
656 case OMPD_section:
657 case OMPD_single:
658 case OMPD_master:
659 case OMPD_critical:
660 case OMPD_taskyield:
661 case OMPD_barrier:
662 case OMPD_taskwait:
663 case OMPD_taskgroup:
664 case OMPD_atomic:
665 case OMPD_flush:
666 case OMPD_depobj:
667 case OMPD_scan:
668 case OMPD_teams:
669 case OMPD_target_data:
670 case OMPD_target_exit_data:
671 case OMPD_target_enter_data:
672 case OMPD_distribute:
673 case OMPD_distribute_simd:
674 case OMPD_distribute_parallel_for:
675 case OMPD_distribute_parallel_for_simd:
676 case OMPD_teams_distribute:
677 case OMPD_teams_distribute_simd:
678 case OMPD_teams_distribute_parallel_for:
679 case OMPD_teams_distribute_parallel_for_simd:
680 case OMPD_target_update:
681 case OMPD_declare_simd:
682 case OMPD_declare_variant:
683 case OMPD_begin_declare_variant:
684 case OMPD_end_declare_variant:
685 case OMPD_declare_target:
686 case OMPD_end_declare_target:
687 case OMPD_declare_reduction:
688 case OMPD_declare_mapper:
689 case OMPD_taskloop:
690 case OMPD_taskloop_simd:
691 case OMPD_master_taskloop:
692 case OMPD_master_taskloop_simd:
693 case OMPD_parallel_master_taskloop:
694 case OMPD_parallel_master_taskloop_simd:
695 case OMPD_requires:
696 case OMPD_unknown:
697 default:
698 break;
699 }
700 llvm_unreachable(
701 "Unknown programming model for OpenMP directive on NVPTX target.");
702}
703
704void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
705 StringRef ParentName,
706 llvm::Function *&OutlinedFn,
707 llvm::Constant *&OutlinedFnID,
708 bool IsOffloadEntry,
709 const RegionCodeGenTy &CodeGen) {
710 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
711 EntryFunctionState EST;
712 WrapperFunctionsMap.clear();
713
714 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
715 assert(!IsBareKernel && "bare kernel should not be at generic mode");
716
717 // Emit target region as a standalone region.
718 class NVPTXPrePostActionTy : public PrePostActionTy {
719 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
720 const OMPExecutableDirective &D;
721
722 public:
723 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
724 const OMPExecutableDirective &D)
725 : EST(EST), D(D) {}
726 void Enter(CodeGenFunction &CGF) override {
727 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
728 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
729 // Skip target region initialization.
730 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
731 }
732 void Exit(CodeGenFunction &CGF) override {
733 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
734 RT.clearLocThreadIdInsertPt(CGF);
735 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
736 }
737 } Action(EST, D);
738 CodeGen.setAction(Action);
739 IsInTTDRegion = true;
740 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
741 IsOffloadEntry, CodeGen);
742 IsInTTDRegion = false;
743}
744
745void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
746 CodeGenFunction &CGF,
747 EntryFunctionState &EST, bool IsSPMD) {
748 llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs;
749 Attrs.ExecFlags =
750 IsSPMD ? llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD
751 : llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
752 computeMinAndMaxThreadsAndTeams(D, CGF, Attrs);
753
754 CGBuilderTy &Bld = CGF.Builder;
755 Bld.restoreIP(IP: OMPBuilder.createTargetInit(Loc: Bld, Attrs));
756 if (!IsSPMD)
757 emitGenericVarsProlog(CGF, Loc: EST.Loc);
758}
759
760void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
761 EntryFunctionState &EST,
762 bool IsSPMD) {
763 if (!IsSPMD)
764 emitGenericVarsEpilog(CGF);
765
766 // This is temporary until we remove the fixed sized buffer.
767 ASTContext &C = CGM.getContext();
768 RecordDecl *StaticRD = C.buildImplicitRecord(
769 Name: "_openmp_teams_reduction_type_$_", TK: RecordDecl::TagKind::Union);
770 StaticRD->startDefinition();
771 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
772 QualType RecTy = C.getRecordType(Decl: TeamReductionRec);
773 auto *Field = FieldDecl::Create(
774 C, DC: StaticRD, StartLoc: SourceLocation(), IdLoc: SourceLocation(), Id: nullptr, T: RecTy,
775 TInfo: C.getTrivialTypeSourceInfo(T: RecTy, Loc: SourceLocation()),
776 /*BW=*/nullptr, /*Mutable=*/false,
777 /*InitStyle=*/ICIS_NoInit);
778 Field->setAccess(AS_public);
779 StaticRD->addDecl(D: Field);
780 }
781 StaticRD->completeDefinition();
782 QualType StaticTy = C.getRecordType(Decl: StaticRD);
783 llvm::Type *LLVMReductionsBufferTy =
784 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
785 const auto &DL = CGM.getModule().getDataLayout();
786 uint64_t ReductionDataSize =
787 TeamsReductions.empty()
788 ? 0
789 : DL.getTypeAllocSize(Ty: LLVMReductionsBufferTy).getFixedValue();
790 CGBuilderTy &Bld = CGF.Builder;
791 OMPBuilder.createTargetDeinit(Loc: Bld, TeamsReductionDataSize: ReductionDataSize,
792 TeamsReductionBufferLength: C.getLangOpts().OpenMPCUDAReductionBufNum);
793 TeamsReductions.clear();
794}
795
796void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
797 StringRef ParentName,
798 llvm::Function *&OutlinedFn,
799 llvm::Constant *&OutlinedFnID,
800 bool IsOffloadEntry,
801 const RegionCodeGenTy &CodeGen) {
802 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
803 EntryFunctionState EST;
804
805 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
806
807 // Emit target region as a standalone region.
808 class NVPTXPrePostActionTy : public PrePostActionTy {
809 CGOpenMPRuntimeGPU &RT;
810 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
811 bool IsBareKernel;
812 DataSharingMode Mode;
813 const OMPExecutableDirective &D;
814
815 public:
816 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
817 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
818 bool IsBareKernel, const OMPExecutableDirective &D)
819 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
820 Mode(RT.CurrentDataSharingMode), D(D) {}
821 void Enter(CodeGenFunction &CGF) override {
822 if (IsBareKernel) {
823 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
824 return;
825 }
826 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
827 // Skip target region initialization.
828 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
829 }
830 void Exit(CodeGenFunction &CGF) override {
831 if (IsBareKernel) {
832 RT.CurrentDataSharingMode = Mode;
833 return;
834 }
835 RT.clearLocThreadIdInsertPt(CGF);
836 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
837 }
838 } Action(*this, EST, IsBareKernel, D);
839 CodeGen.setAction(Action);
840 IsInTTDRegion = true;
841 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
842 IsOffloadEntry, CodeGen);
843 IsInTTDRegion = false;
844}
845
846void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
847 const OMPExecutableDirective &D, StringRef ParentName,
848 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
849 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
850 if (!IsOffloadEntry) // Nothing to do.
851 return;
852
853 assert(!ParentName.empty() && "Invalid target region parent name!");
854
855 bool Mode = supportsSPMDExecutionMode(Ctx&: CGM.getContext(), D);
856 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
857 if (Mode || IsBareKernel)
858 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
859 CodeGen);
860 else
861 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
862 CodeGen);
863}
864
865CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
866 : CGOpenMPRuntime(CGM) {
867 llvm::OpenMPIRBuilderConfig Config(
868 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
869 CGM.getLangOpts().OpenMPOffloadMandatory,
870 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
871 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
872 OMPBuilder.setConfig(Config);
873
874 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
875 llvm_unreachable("OpenMP can only handle device code.");
876
877 if (CGM.getLangOpts().OpenMPCUDAMode)
878 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
879
880 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
881 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
882 return;
883
884 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTargetDebug,
885 Name: "__omp_rtl_debug_kind");
886 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTeamSubscription,
887 Name: "__omp_rtl_assume_teams_oversubscription");
888 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPThreadSubscription,
889 Name: "__omp_rtl_assume_threads_oversubscription");
890 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoThreadState,
891 Name: "__omp_rtl_assume_no_thread_state");
892 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoNestedParallelism,
893 Name: "__omp_rtl_assume_no_nested_parallelism");
894}
895
896void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
897 ProcBindKind ProcBind,
898 SourceLocation Loc) {
899 // Nothing to do.
900}
901
902void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
903 llvm::Value *NumThreads,
904 SourceLocation Loc) {
905 // Nothing to do.
906}
907
908void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
909 const Expr *NumTeams,
910 const Expr *ThreadLimit,
911 SourceLocation Loc) {}
912
913llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
914 CodeGenFunction &CGF, const OMPExecutableDirective &D,
915 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
916 const RegionCodeGenTy &CodeGen) {
917 // Emit target region as a standalone region.
918 bool PrevIsInTTDRegion = IsInTTDRegion;
919 IsInTTDRegion = false;
920 auto *OutlinedFun =
921 cast<llvm::Function>(Val: CGOpenMPRuntime::emitParallelOutlinedFunction(
922 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
923 IsInTTDRegion = PrevIsInTTDRegion;
924 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
925 llvm::Function *WrapperFun =
926 createParallelDataSharingWrapper(OutlinedParallelFn: OutlinedFun, D);
927 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
928 }
929
930 return OutlinedFun;
931}
932
933/// Get list of lastprivate variables from the teams distribute ... or
934/// teams {distribute ...} directives.
935static void
936getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
937 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
938 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
939 "expected teams directive.");
940 const OMPExecutableDirective *Dir = &D;
941 if (!isOpenMPDistributeDirective(DKind: D.getDirectiveKind())) {
942 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
943 Ctx,
944 Body: D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
945 /*IgnoreCaptured=*/true))) {
946 Dir = dyn_cast_or_null<OMPExecutableDirective>(Val: S);
947 if (Dir && !isOpenMPDistributeDirective(DKind: Dir->getDirectiveKind()))
948 Dir = nullptr;
949 }
950 }
951 if (!Dir)
952 return;
953 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
954 for (const Expr *E : C->getVarRefs())
955 Vars.push_back(Elt: getPrivateItem(RefExpr: E));
956 }
957}
958
959/// Get list of reduction variables from the teams ... directives.
960static void
961getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
962 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
963 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
964 "expected teams directive.");
965 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
966 for (const Expr *E : C->privates())
967 Vars.push_back(Elt: getPrivateItem(RefExpr: E));
968 }
969}
970
971llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
972 CodeGenFunction &CGF, const OMPExecutableDirective &D,
973 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
974 const RegionCodeGenTy &CodeGen) {
975 SourceLocation Loc = D.getBeginLoc();
976
977 const RecordDecl *GlobalizedRD = nullptr;
978 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
979 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
980 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
981 // Globalize team reductions variable unconditionally in all modes.
982 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
983 getTeamsReductionVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
984 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
985 getDistributeLastprivateVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
986 if (!LastPrivatesReductions.empty()) {
987 GlobalizedRD = ::buildRecordForGlobalizedVars(
988 C&: CGM.getContext(), EscapedDecls: {}, EscapedDeclsForTeams: LastPrivatesReductions, MappedDeclsFields,
989 BufSize: WarpSize);
990 }
991 } else if (!LastPrivatesReductions.empty()) {
992 assert(!TeamAndReductions.first &&
993 "Previous team declaration is not expected.");
994 TeamAndReductions.first = D.getCapturedStmt(RegionKind: OMPD_teams)->getCapturedDecl();
995 std::swap(LHS&: TeamAndReductions.second, RHS&: LastPrivatesReductions);
996 }
997
998 // Emit target region as a standalone region.
999 class NVPTXPrePostActionTy : public PrePostActionTy {
1000 SourceLocation &Loc;
1001 const RecordDecl *GlobalizedRD;
1002 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1003 &MappedDeclsFields;
1004
1005 public:
1006 NVPTXPrePostActionTy(
1007 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1008 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1009 &MappedDeclsFields)
1010 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1011 MappedDeclsFields(MappedDeclsFields) {}
1012 void Enter(CodeGenFunction &CGF) override {
1013 auto &Rt =
1014 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1015 if (GlobalizedRD) {
1016 auto I = Rt.FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
1017 I->getSecond().MappedParams =
1018 std::make_unique<CodeGenFunction::OMPMapVars>();
1019 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1020 for (const auto &Pair : MappedDeclsFields) {
1021 assert(Pair.getFirst()->isCanonicalDecl() &&
1022 "Expected canonical declaration");
1023 Data.try_emplace(Key: Pair.getFirst());
1024 }
1025 }
1026 Rt.emitGenericVarsProlog(CGF, Loc);
1027 }
1028 void Exit(CodeGenFunction &CGF) override {
1029 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1030 .emitGenericVarsEpilog(CGF);
1031 }
1032 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1033 CodeGen.setAction(Action);
1034 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1035 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1036
1037 return OutlinedFun;
1038}
1039
1040void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1041 SourceLocation Loc) {
1042 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1043 return;
1044
1045 CGBuilderTy &Bld = CGF.Builder;
1046
1047 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1048 if (I == FunctionGlobalizedDecls.end())
1049 return;
1050
1051 for (auto &Rec : I->getSecond().LocalVarData) {
1052 const auto *VD = cast<VarDecl>(Val: Rec.first);
1053 bool EscapedParam = I->getSecond().EscapedParameters.count(Ptr: Rec.first);
1054 QualType VarTy = VD->getType();
1055
1056 // Get the local allocation of a firstprivate variable before sharing
1057 llvm::Value *ParValue;
1058 if (EscapedParam) {
1059 LValue ParLVal =
1060 CGF.MakeAddrLValue(Addr: CGF.GetAddrOfLocalVar(VD), T: VD->getType());
1061 ParValue = CGF.EmitLoadOfScalar(lvalue: ParLVal, Loc);
1062 }
1063
1064 // Allocate space for the variable to be globalized
1065 llvm::Value *AllocArgs[] = {CGF.getTypeSize(Ty: VD->getType())};
1066 llvm::CallBase *VoidPtr =
1067 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1068 M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared),
1069 args: AllocArgs, name: VD->getName());
1070 // FIXME: We should use the variables actual alignment as an argument.
1071 VoidPtr->addRetAttr(Attr: llvm::Attribute::get(
1072 Context&: CGM.getLLVMContext(), Kind: llvm::Attribute::Alignment,
1073 Val: CGM.getContext().getTargetInfo().getNewAlign() / 8));
1074
1075 // Cast the void pointer and get the address of the globalized variable.
1076 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1077 V: VoidPtr, DestTy: Bld.getPtrTy(AddrSpace: 0), Name: VD->getName() + "_on_stack");
1078 LValue VarAddr =
1079 CGF.MakeNaturalAlignPointeeRawAddrLValue(V: CastedVoidPtr, T: VarTy);
1080 Rec.second.PrivateAddr = VarAddr.getAddress();
1081 Rec.second.GlobalizedVal = VoidPtr;
1082
1083 // Assign the local allocation to the newly globalized location.
1084 if (EscapedParam) {
1085 CGF.EmitStoreOfScalar(value: ParValue, lvalue: VarAddr);
1086 I->getSecond().MappedParams->setVarAddr(CGF, LocalVD: VD, TempAddr: VarAddr.getAddress());
1087 }
1088 if (auto *DI = CGF.getDebugInfo())
1089 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(Loc: VD->getLocation()));
1090 }
1091
1092 for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
1093 const auto *VD = cast<VarDecl>(Val: ValueD);
1094 std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
1095 getKmpcAllocShared(CGF, VD);
1096 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(Args&: AddrSizePair);
1097 LValue Base = CGF.MakeAddrLValue(V: AddrSizePair.first, T: VD->getType(),
1098 Alignment: CGM.getContext().getDeclAlign(D: VD),
1099 Source: AlignmentSource::Decl);
1100 I->getSecond().MappedParams->setVarAddr(CGF, LocalVD: VD, TempAddr: Base.getAddress());
1101 }
1102 I->getSecond().MappedParams->apply(CGF);
1103}
1104
1105bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
1106 const VarDecl *VD) const {
1107 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1108 if (I == FunctionGlobalizedDecls.end())
1109 return false;
1110
1111 // Check variable declaration is delayed:
1112 return llvm::is_contained(Range: I->getSecond().DelayedVariableLengthDecls, Element: VD);
1113}
1114
1115std::pair<llvm::Value *, llvm::Value *>
1116CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
1117 const VarDecl *VD) {
1118 CGBuilderTy &Bld = CGF.Builder;
1119
1120 // Compute size and alignment.
1121 llvm::Value *Size = CGF.getTypeSize(Ty: VD->getType());
1122 CharUnits Align = CGM.getContext().getDeclAlign(D: VD);
1123 Size = Bld.CreateNUWAdd(
1124 LHS: Size, RHS: llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity() - 1));
1125 llvm::Value *AlignVal =
1126 llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity());
1127 Size = Bld.CreateUDiv(LHS: Size, RHS: AlignVal);
1128 Size = Bld.CreateNUWMul(LHS: Size, RHS: AlignVal);
1129
1130 // Allocate space for this VLA object to be globalized.
1131 llvm::Value *AllocArgs[] = {Size};
1132 llvm::CallBase *VoidPtr =
1133 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1134 M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared),
1135 args: AllocArgs, name: VD->getName());
1136 VoidPtr->addRetAttr(Attr: llvm::Attribute::get(
1137 Context&: CGM.getLLVMContext(), Kind: llvm::Attribute::Alignment, Val: Align.getQuantity()));
1138
1139 return std::make_pair(x&: VoidPtr, y&: Size);
1140}
1141
1142void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1143 CodeGenFunction &CGF,
1144 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1145 // Deallocate the memory for each globalized VLA object
1146 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1147 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1148 args: {AddrSizePair.first, AddrSizePair.second});
1149}
1150
1151void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1152 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1153 return;
1154
1155 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1156 if (I != FunctionGlobalizedDecls.end()) {
1157 // Deallocate the memory for each globalized VLA object that was
1158 // globalized in the prolog (i.e. emitGenericVarsProlog).
1159 for (const auto &AddrSizePair :
1160 llvm::reverse(C&: I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1161 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1162 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1163 args: {AddrSizePair.first, AddrSizePair.second});
1164 }
1165 // Deallocate the memory for each globalized value
1166 for (auto &Rec : llvm::reverse(C&: I->getSecond().LocalVarData)) {
1167 const auto *VD = cast<VarDecl>(Val: Rec.first);
1168 I->getSecond().MappedParams->restore(CGF);
1169
1170 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1171 CGF.getTypeSize(Ty: VD->getType())};
1172 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1173 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1174 args: FreeArgs);
1175 }
1176 }
1177}
1178
1179void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1180 const OMPExecutableDirective &D,
1181 SourceLocation Loc,
1182 llvm::Function *OutlinedFn,
1183 ArrayRef<llvm::Value *> CapturedVars) {
1184 if (!CGF.HaveInsertPoint())
1185 return;
1186
1187 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1188
1189 RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
1190 /*Name=*/".zero.addr");
1191 CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
1192 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1193 // We don't emit any thread id function call in bare kernel, but because the
1194 // outlined function has a pointer argument, we emit a nullptr here.
1195 if (IsBareKernel)
1196 OutlinedFnArgs.push_back(Elt: llvm::ConstantPointerNull::get(T: CGM.VoidPtrTy));
1197 else
1198 OutlinedFnArgs.push_back(Elt: emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1199 OutlinedFnArgs.push_back(Elt: ZeroAddr.getPointer());
1200 OutlinedFnArgs.append(in_start: CapturedVars.begin(), in_end: CapturedVars.end());
1201 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: OutlinedFnArgs);
1202}
1203
1204void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1205 SourceLocation Loc,
1206 llvm::Function *OutlinedFn,
1207 ArrayRef<llvm::Value *> CapturedVars,
1208 const Expr *IfCond,
1209 llvm::Value *NumThreads) {
1210 if (!CGF.HaveInsertPoint())
1211 return;
1212
1213 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1214 NumThreads](CodeGenFunction &CGF,
1215 PrePostActionTy &Action) {
1216 CGBuilderTy &Bld = CGF.Builder;
1217 llvm::Value *NumThreadsVal = NumThreads;
1218 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1219 llvm::Value *ID = llvm::ConstantPointerNull::get(T: CGM.Int8PtrTy);
1220 if (WFn)
1221 ID = Bld.CreateBitOrPointerCast(V: WFn, DestTy: CGM.Int8PtrTy);
1222 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(V: OutlinedFn, DestTy: CGM.Int8PtrTy);
1223
1224 // Create a private scope that will globalize the arguments
1225 // passed from the outside of the target region.
1226 // TODO: Is that needed?
1227 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1228
1229 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1230 Ty: llvm::ArrayType::get(ElementType: CGM.VoidPtrTy, NumElements: CapturedVars.size()),
1231 Name: "captured_vars_addrs");
1232 // There's something to share.
1233 if (!CapturedVars.empty()) {
1234 // Prepare for parallel region. Indicate the outlined function.
1235 ASTContext &Ctx = CGF.getContext();
1236 unsigned Idx = 0;
1237 for (llvm::Value *V : CapturedVars) {
1238 Address Dst = Bld.CreateConstArrayGEP(Addr: CapturedVarsAddrs, Index: Idx);
1239 llvm::Value *PtrV;
1240 if (V->getType()->isIntegerTy())
1241 PtrV = Bld.CreateIntToPtr(V, DestTy: CGF.VoidPtrTy);
1242 else
1243 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, DestTy: CGF.VoidPtrTy);
1244 CGF.EmitStoreOfScalar(Value: PtrV, Addr: Dst, /*Volatile=*/false,
1245 Ty: Ctx.getPointerType(T: Ctx.VoidPtrTy));
1246 ++Idx;
1247 }
1248 }
1249
1250 llvm::Value *IfCondVal = nullptr;
1251 if (IfCond)
1252 IfCondVal = Bld.CreateIntCast(V: CGF.EvaluateExprAsBool(E: IfCond), DestTy: CGF.Int32Ty,
1253 /* isSigned */ false);
1254 else
1255 IfCondVal = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: 1);
1256
1257 if (!NumThreadsVal)
1258 NumThreadsVal = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: -1);
1259 else
1260 NumThreadsVal = Bld.CreateZExtOrTrunc(V: NumThreadsVal, DestTy: CGF.Int32Ty);
1261
1262 assert(IfCondVal && "Expected a value");
1263 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1264 llvm::Value *Args[] = {
1265 RTLoc,
1266 getThreadID(CGF, Loc),
1267 IfCondVal,
1268 NumThreadsVal,
1269 llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: -1),
1270 FnPtr,
1271 ID,
1272 Bld.CreateBitOrPointerCast(V: CapturedVarsAddrs.emitRawPointer(CGF),
1273 DestTy: CGF.VoidPtrPtrTy),
1274 llvm::ConstantInt::get(Ty: CGM.SizeTy, V: CapturedVars.size())};
1275 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1276 M&: CGM.getModule(), FnID: OMPRTL___kmpc_parallel_51),
1277 args: Args);
1278 };
1279
1280 RegionCodeGenTy RCG(ParallelGen);
1281 RCG(CGF);
1282}
1283
1284void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1285 // Always emit simple barriers!
1286 if (!CGF.HaveInsertPoint())
1287 return;
1288 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1289 // This function does not use parameters, so we can emit just default values.
1290 llvm::Value *Args[] = {
1291 llvm::ConstantPointerNull::get(
1292 T: cast<llvm::PointerType>(Val: getIdentTyPointerTy())),
1293 llvm::ConstantInt::get(Ty: CGF.Int32Ty, /*V=*/0, /*isSigned=*/IsSigned: true)};
1294 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1295 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier_simple_spmd),
1296 args: Args);
1297}
1298
1299void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1300 SourceLocation Loc,
1301 OpenMPDirectiveKind Kind, bool,
1302 bool) {
1303 // Always emit simple barriers!
1304 if (!CGF.HaveInsertPoint())
1305 return;
1306 // Build call __kmpc_cancel_barrier(loc, thread_id);
1307 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1308 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1309 getThreadID(CGF, Loc)};
1310
1311 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1312 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier),
1313 args: Args);
1314}
1315
1316void CGOpenMPRuntimeGPU::emitCriticalRegion(
1317 CodeGenFunction &CGF, StringRef CriticalName,
1318 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1319 const Expr *Hint) {
1320 llvm::BasicBlock *LoopBB = CGF.createBasicBlock(name: "omp.critical.loop");
1321 llvm::BasicBlock *TestBB = CGF.createBasicBlock(name: "omp.critical.test");
1322 llvm::BasicBlock *SyncBB = CGF.createBasicBlock(name: "omp.critical.sync");
1323 llvm::BasicBlock *BodyBB = CGF.createBasicBlock(name: "omp.critical.body");
1324 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: "omp.critical.exit");
1325
1326 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1327
1328 // Get the mask of active threads in the warp.
1329 llvm::Value *Mask = CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1330 M&: CGM.getModule(), FnID: OMPRTL___kmpc_warp_active_thread_mask));
1331 // Fetch team-local id of the thread.
1332 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1333
1334 // Get the width of the team.
1335 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1336
1337 // Initialize the counter variable for the loop.
1338 QualType Int32Ty =
1339 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1340 Address Counter = CGF.CreateMemTemp(T: Int32Ty, Name: "critical_counter");
1341 LValue CounterLVal = CGF.MakeAddrLValue(Addr: Counter, T: Int32Ty);
1342 CGF.EmitStoreOfScalar(value: llvm::Constant::getNullValue(Ty: CGM.Int32Ty), lvalue: CounterLVal,
1343 /*isInit=*/true);
1344
1345 // Block checks if loop counter exceeds upper bound.
1346 CGF.EmitBlock(BB: LoopBB);
1347 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
1348 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(LHS: CounterVal, RHS: TeamWidth);
1349 CGF.Builder.CreateCondBr(Cond: CmpLoopBound, True: TestBB, False: ExitBB);
1350
1351 // Block tests which single thread should execute region, and which threads
1352 // should go straight to synchronisation point.
1353 CGF.EmitBlock(BB: TestBB);
1354 CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
1355 llvm::Value *CmpThreadToCounter =
1356 CGF.Builder.CreateICmpEQ(LHS: ThreadID, RHS: CounterVal);
1357 CGF.Builder.CreateCondBr(Cond: CmpThreadToCounter, True: BodyBB, False: SyncBB);
1358
1359 // Block emits the body of the critical region.
1360 CGF.EmitBlock(BB: BodyBB);
1361
1362 // Output the critical statement.
1363 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1364 Hint);
1365
1366 // After the body surrounded by the critical region, the single executing
1367 // thread will jump to the synchronisation point.
1368 // Block waits for all threads in current team to finish then increments the
1369 // counter variable and returns to the loop.
1370 CGF.EmitBlock(BB: SyncBB);
1371 // Reconverge active threads in the warp.
1372 (void)CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1373 M&: CGM.getModule(), FnID: OMPRTL___kmpc_syncwarp),
1374 args: Mask);
1375
1376 llvm::Value *IncCounterVal =
1377 CGF.Builder.CreateNSWAdd(LHS: CounterVal, RHS: CGF.Builder.getInt32(C: 1));
1378 CGF.EmitStoreOfScalar(value: IncCounterVal, lvalue: CounterLVal);
1379 CGF.EmitBranch(Block: LoopBB);
1380
1381 // Block that is reached when all threads in the team complete the region.
1382 CGF.EmitBlock(BB: ExitBB, /*IsFinished=*/true);
1383}
1384
1385/// Cast value to the specified type.
1386static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1387 QualType ValTy, QualType CastTy,
1388 SourceLocation Loc) {
1389 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1390 "Cast type must sized.");
1391 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1392 "Val type must sized.");
1393 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(T: CastTy);
1394 if (ValTy == CastTy)
1395 return Val;
1396 if (CGF.getContext().getTypeSizeInChars(T: ValTy) ==
1397 CGF.getContext().getTypeSizeInChars(T: CastTy))
1398 return CGF.Builder.CreateBitCast(V: Val, DestTy: LLVMCastTy);
1399 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1400 return CGF.Builder.CreateIntCast(V: Val, DestTy: LLVMCastTy,
1401 isSigned: CastTy->hasSignedIntegerRepresentation());
1402 Address CastItem = CGF.CreateMemTemp(T: CastTy);
1403 Address ValCastItem = CastItem.withElementType(ElemTy: Val->getType());
1404 CGF.EmitStoreOfScalar(Value: Val, Addr: ValCastItem, /*Volatile=*/false, Ty: ValTy,
1405 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1406 TBAAInfo: TBAAAccessInfo());
1407 return CGF.EmitLoadOfScalar(Addr: CastItem, /*Volatile=*/false, Ty: CastTy, Loc,
1408 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1409 TBAAInfo: TBAAAccessInfo());
1410}
1411
1412///
1413/// Design of OpenMP reductions on the GPU
1414///
1415/// Consider a typical OpenMP program with one or more reduction
1416/// clauses:
1417///
1418/// float foo;
1419/// double bar;
1420/// #pragma omp target teams distribute parallel for \
1421/// reduction(+:foo) reduction(*:bar)
1422/// for (int i = 0; i < N; i++) {
1423/// foo += A[i]; bar *= B[i];
1424/// }
1425///
1426/// where 'foo' and 'bar' are reduced across all OpenMP threads in
1427/// all teams. In our OpenMP implementation on the NVPTX device an
1428/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1429/// within a team are mapped to CUDA threads within a threadblock.
1430/// Our goal is to efficiently aggregate values across all OpenMP
1431/// threads such that:
1432///
1433/// - the compiler and runtime are logically concise, and
1434/// - the reduction is performed efficiently in a hierarchical
1435/// manner as follows: within OpenMP threads in the same warp,
1436/// across warps in a threadblock, and finally across teams on
1437/// the NVPTX device.
1438///
1439/// Introduction to Decoupling
1440///
1441/// We would like to decouple the compiler and the runtime so that the
1442/// latter is ignorant of the reduction variables (number, data types)
1443/// and the reduction operators. This allows a simpler interface
1444/// and implementation while still attaining good performance.
1445///
1446/// Pseudocode for the aforementioned OpenMP program generated by the
1447/// compiler is as follows:
1448///
1449/// 1. Create private copies of reduction variables on each OpenMP
1450/// thread: 'foo_private', 'bar_private'
1451/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1452/// to it and writes the result in 'foo_private' and 'bar_private'
1453/// respectively.
1454/// 3. Call the OpenMP runtime on the GPU to reduce within a team
1455/// and store the result on the team master:
1456///
1457/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1458/// reduceData, shuffleReduceFn, interWarpCpyFn)
1459///
1460/// where:
1461/// struct ReduceData {
1462/// double *foo;
1463/// double *bar;
1464/// } reduceData
1465/// reduceData.foo = &foo_private
1466/// reduceData.bar = &bar_private
1467///
1468/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1469/// auxiliary functions generated by the compiler that operate on
1470/// variables of type 'ReduceData'. They aid the runtime perform
1471/// algorithmic steps in a data agnostic manner.
1472///
1473/// 'shuffleReduceFn' is a pointer to a function that reduces data
1474/// of type 'ReduceData' across two OpenMP threads (lanes) in the
1475/// same warp. It takes the following arguments as input:
1476///
1477/// a. variable of type 'ReduceData' on the calling lane,
1478/// b. its lane_id,
1479/// c. an offset relative to the current lane_id to generate a
1480/// remote_lane_id. The remote lane contains the second
1481/// variable of type 'ReduceData' that is to be reduced.
1482/// d. an algorithm version parameter determining which reduction
1483/// algorithm to use.
1484///
1485/// 'shuffleReduceFn' retrieves data from the remote lane using
1486/// efficient GPU shuffle intrinsics and reduces, using the
1487/// algorithm specified by the 4th parameter, the two operands
1488/// element-wise. The result is written to the first operand.
1489///
1490/// Different reduction algorithms are implemented in different
1491/// runtime functions, all calling 'shuffleReduceFn' to perform
1492/// the essential reduction step. Therefore, based on the 4th
1493/// parameter, this function behaves slightly differently to
1494/// cooperate with the runtime to ensure correctness under
1495/// different circumstances.
1496///
1497/// 'InterWarpCpyFn' is a pointer to a function that transfers
1498/// reduced variables across warps. It tunnels, through CUDA
1499/// shared memory, the thread-private data of type 'ReduceData'
1500/// from lane 0 of each warp to a lane in the first warp.
1501/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1502/// The last team writes the global reduced value to memory.
1503///
1504/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1505/// reduceData, shuffleReduceFn, interWarpCpyFn,
1506/// scratchpadCopyFn, loadAndReduceFn)
1507///
1508/// 'scratchpadCopyFn' is a helper that stores reduced
1509/// data from the team master to a scratchpad array in
1510/// global memory.
1511///
1512/// 'loadAndReduceFn' is a helper that loads data from
1513/// the scratchpad array and reduces it with the input
1514/// operand.
1515///
1516/// These compiler generated functions hide address
1517/// calculation and alignment information from the runtime.
1518/// 5. if ret == 1:
1519/// The team master of the last team stores the reduced
1520/// result to the globals in memory.
1521/// foo += reduceData.foo; bar *= reduceData.bar
1522///
1523///
1524/// Warp Reduction Algorithms
1525///
1526/// On the warp level, we have three algorithms implemented in the
1527/// OpenMP runtime depending on the number of active lanes:
1528///
1529/// Full Warp Reduction
1530///
1531/// The reduce algorithm within a warp where all lanes are active
1532/// is implemented in the runtime as follows:
1533///
1534/// full_warp_reduce(void *reduce_data,
1535/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1536/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1537/// ShuffleReduceFn(reduce_data, 0, offset, 0);
1538/// }
1539///
1540/// The algorithm completes in log(2, WARPSIZE) steps.
1541///
1542/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1543/// not used therefore we save instructions by not retrieving lane_id
1544/// from the corresponding special registers. The 4th parameter, which
1545/// represents the version of the algorithm being used, is set to 0 to
1546/// signify full warp reduction.
1547///
1548/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1549///
1550/// #reduce_elem refers to an element in the local lane's data structure
1551/// #remote_elem is retrieved from a remote lane
1552/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1553/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1554///
1555/// Contiguous Partial Warp Reduction
1556///
1557/// This reduce algorithm is used within a warp where only the first
1558/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1559/// number of OpenMP threads in a parallel region is not a multiple of
1560/// WARPSIZE. The algorithm is implemented in the runtime as follows:
1561///
1562/// void
1563/// contiguous_partial_reduce(void *reduce_data,
1564/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1565/// int size, int lane_id) {
1566/// int curr_size;
1567/// int offset;
1568/// curr_size = size;
1569/// mask = curr_size/2;
1570/// while (offset>0) {
1571/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1572/// curr_size = (curr_size+1)/2;
1573/// offset = curr_size/2;
1574/// }
1575/// }
1576///
1577/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1578///
1579/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1580/// if (lane_id < offset)
1581/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1582/// else
1583/// reduce_elem = remote_elem
1584///
1585/// This algorithm assumes that the data to be reduced are located in a
1586/// contiguous subset of lanes starting from the first. When there is
1587/// an odd number of active lanes, the data in the last lane is not
1588/// aggregated with any other lane's dat but is instead copied over.
1589///
1590/// Dispersed Partial Warp Reduction
1591///
1592/// This algorithm is used within a warp when any discontiguous subset of
1593/// lanes are active. It is used to implement the reduction operation
1594/// across lanes in an OpenMP simd region or in a nested parallel region.
1595///
1596/// void
1597/// dispersed_partial_reduce(void *reduce_data,
1598/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1599/// int size, remote_id;
1600/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1601/// do {
1602/// remote_id = next_active_lane_id_right_after_me();
1603/// # the above function returns 0 of no active lane
1604/// # is present right after the current lane.
1605/// size = number_of_active_lanes_in_this_warp();
1606/// logical_lane_id /= 2;
1607/// ShuffleReduceFn(reduce_data, logical_lane_id,
1608/// remote_id-1-threadIdx.x, 2);
1609/// } while (logical_lane_id % 2 == 0 && size > 1);
1610/// }
1611///
1612/// There is no assumption made about the initial state of the reduction.
1613/// Any number of lanes (>=1) could be active at any position. The reduction
1614/// result is returned in the first active lane.
1615///
1616/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1617///
1618/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1619/// if (lane_id % 2 == 0 && offset > 0)
1620/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1621/// else
1622/// reduce_elem = remote_elem
1623///
1624///
1625/// Intra-Team Reduction
1626///
1627/// This function, as implemented in the runtime call
1628/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1629/// threads in a team. It first reduces within a warp using the
1630/// aforementioned algorithms. We then proceed to gather all such
1631/// reduced values at the first warp.
1632///
1633/// The runtime makes use of the function 'InterWarpCpyFn', which copies
1634/// data from each of the "warp master" (zeroth lane of each warp, where
1635/// warp-reduced data is held) to the zeroth warp. This step reduces (in
1636/// a mathematical sense) the problem of reduction across warp masters in
1637/// a block to the problem of warp reduction.
1638///
1639///
1640/// Inter-Team Reduction
1641///
1642/// Once a team has reduced its data to a single value, it is stored in
1643/// a global scratchpad array. Since each team has a distinct slot, this
1644/// can be done without locking.
1645///
1646/// The last team to write to the scratchpad array proceeds to reduce the
1647/// scratchpad array. One or more workers in the last team use the helper
1648/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
1649/// the k'th worker reduces every k'th element.
1650///
1651/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1652/// reduce across workers and compute a globally reduced value.
1653///
1654void CGOpenMPRuntimeGPU::emitReduction(
1655 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
1656 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
1657 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
1658 if (!CGF.HaveInsertPoint())
1659 return;
1660
1661 bool ParallelReduction = isOpenMPParallelDirective(DKind: Options.ReductionKind);
1662 bool TeamsReduction = isOpenMPTeamsDirective(DKind: Options.ReductionKind);
1663
1664 ASTContext &C = CGM.getContext();
1665
1666 if (Options.SimpleReduction) {
1667 assert(!TeamsReduction && !ParallelReduction &&
1668 "Invalid reduction selection in emitReduction.");
1669 (void)ParallelReduction;
1670 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
1671 ReductionOps, Options);
1672 return;
1673 }
1674
1675 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
1676 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
1677 int Cnt = 0;
1678 for (const Expr *DRE : Privates) {
1679 PrivatesReductions[Cnt] = cast<DeclRefExpr>(Val: DRE)->getDecl();
1680 ++Cnt;
1681 }
1682 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
1683 C&: CGM.getContext(), EscapedDecls: PrivatesReductions, EscapedDeclsForTeams: {}, MappedDeclsFields&: VarFieldMap, BufSize: 1);
1684
1685 if (TeamsReduction)
1686 TeamsReductions.push_back(Elt: ReductionRec);
1687
1688 // Source location for the ident struct
1689 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1690
1691 using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
1692 InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
1693 CGF.AllocaInsertPt->getIterator());
1694 InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
1695 CGF.Builder.GetInsertPoint());
1696 llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
1697 CodeGenIP, CGF.SourceLocToDebugLoc(Location: Loc));
1698 llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
1699
1700 CodeGenFunction::OMPPrivateScope Scope(CGF);
1701 unsigned Idx = 0;
1702 for (const Expr *Private : Privates) {
1703 llvm::Type *ElementType;
1704 llvm::Value *Variable;
1705 llvm::Value *PrivateVariable;
1706 llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
1707 ElementType = CGF.ConvertTypeForMem(T: Private->getType());
1708 const auto *RHSVar =
1709 cast<VarDecl>(Val: cast<DeclRefExpr>(Val: RHSExprs[Idx])->getDecl());
1710 PrivateVariable = CGF.GetAddrOfLocalVar(VD: RHSVar).emitRawPointer(CGF);
1711 const auto *LHSVar =
1712 cast<VarDecl>(Val: cast<DeclRefExpr>(Val: LHSExprs[Idx])->getDecl());
1713 Variable = CGF.GetAddrOfLocalVar(VD: LHSVar).emitRawPointer(CGF);
1714 llvm::OpenMPIRBuilder::EvalKind EvalKind;
1715 switch (CGF.getEvaluationKind(T: Private->getType())) {
1716 case TEK_Scalar:
1717 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
1718 break;
1719 case TEK_Complex:
1720 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
1721 break;
1722 case TEK_Aggregate:
1723 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
1724 break;
1725 }
1726 auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
1727 llvm::Value **LHSPtr, llvm::Value **RHSPtr,
1728 llvm::Function *NewFunc) {
1729 CGF.Builder.restoreIP(IP: CodeGenIP);
1730 auto *CurFn = CGF.CurFn;
1731 CGF.CurFn = NewFunc;
1732
1733 *LHSPtr = CGF.GetAddrOfLocalVar(
1734 VD: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: LHSExprs[I])->getDecl()))
1735 .emitRawPointer(CGF);
1736 *RHSPtr = CGF.GetAddrOfLocalVar(
1737 VD: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: RHSExprs[I])->getDecl()))
1738 .emitRawPointer(CGF);
1739
1740 emitSingleReductionCombiner(CGF, ReductionOp: ReductionOps[I], PrivateRef: Privates[I],
1741 LHS: cast<DeclRefExpr>(Val: LHSExprs[I]),
1742 RHS: cast<DeclRefExpr>(Val: RHSExprs[I]));
1743
1744 CGF.CurFn = CurFn;
1745
1746 return InsertPointTy(CGF.Builder.GetInsertBlock(),
1747 CGF.Builder.GetInsertPoint());
1748 };
1749 ReductionInfos.emplace_back(Args: llvm::OpenMPIRBuilder::ReductionInfo(
1750 ElementType, Variable, PrivateVariable, EvalKind,
1751 /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
1752 Idx++;
1753 }
1754
1755 llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
1756 cantFail(ValOrErr: OMPBuilder.createReductionsGPU(
1757 Loc: OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, IsNoWait: false, IsTeamsReduction: TeamsReduction,
1758 ReductionGenCBKind: llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
1759 GridValue: CGF.getTarget().getGridValue(),
1760 ReductionBufNum: C.getLangOpts().OpenMPCUDAReductionBufNum, SrcLocInfo: RTLoc));
1761 CGF.Builder.restoreIP(IP: AfterIP);
1762}
1763
1764const VarDecl *
1765CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
1766 const VarDecl *NativeParam) const {
1767 if (!NativeParam->getType()->isReferenceType())
1768 return NativeParam;
1769 QualType ArgType = NativeParam->getType();
1770 QualifierCollector QC;
1771 const Type *NonQualTy = QC.strip(type: ArgType);
1772 QualType PointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
1773 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
1774 if (Attr->getCaptureKind() == OMPC_map) {
1775 PointeeTy = CGM.getContext().getAddrSpaceQualType(T: PointeeTy,
1776 AddressSpace: LangAS::opencl_global);
1777 }
1778 }
1779 ArgType = CGM.getContext().getPointerType(T: PointeeTy);
1780 QC.addRestrict();
1781 enum { NVPTX_local_addr = 5 };
1782 QC.addAddressSpace(space: getLangASFromTargetAS(TargetAS: NVPTX_local_addr));
1783 ArgType = QC.apply(Context: CGM.getContext(), QT: ArgType);
1784 if (isa<ImplicitParamDecl>(Val: NativeParam))
1785 return ImplicitParamDecl::Create(
1786 C&: CGM.getContext(), /*DC=*/nullptr, IdLoc: NativeParam->getLocation(),
1787 Id: NativeParam->getIdentifier(), T: ArgType, ParamKind: ImplicitParamKind::Other);
1788 return ParmVarDecl::Create(
1789 C&: CGM.getContext(),
1790 DC: const_cast<DeclContext *>(NativeParam->getDeclContext()),
1791 StartLoc: NativeParam->getBeginLoc(), IdLoc: NativeParam->getLocation(),
1792 Id: NativeParam->getIdentifier(), T: ArgType,
1793 /*TInfo=*/nullptr, S: SC_None, /*DefArg=*/nullptr);
1794}
1795
1796Address
1797CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
1798 const VarDecl *NativeParam,
1799 const VarDecl *TargetParam) const {
1800 assert(NativeParam != TargetParam &&
1801 NativeParam->getType()->isReferenceType() &&
1802 "Native arg must not be the same as target arg.");
1803 Address LocalAddr = CGF.GetAddrOfLocalVar(VD: TargetParam);
1804 QualType NativeParamType = NativeParam->getType();
1805 QualifierCollector QC;
1806 const Type *NonQualTy = QC.strip(type: NativeParamType);
1807 QualType NativePointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
1808 unsigned NativePointeeAddrSpace =
1809 CGF.getTypes().getTargetAddressSpace(T: NativePointeeTy);
1810 QualType TargetTy = TargetParam->getType();
1811 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(Addr: LocalAddr, /*Volatile=*/false,
1812 Ty: TargetTy, Loc: SourceLocation());
1813 // Cast to native address space.
1814 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1815 V: TargetAddr,
1816 DestTy: llvm::PointerType::get(C&: CGF.getLLVMContext(), AddressSpace: NativePointeeAddrSpace));
1817 Address NativeParamAddr = CGF.CreateMemTemp(T: NativeParamType);
1818 CGF.EmitStoreOfScalar(Value: TargetAddr, Addr: NativeParamAddr, /*Volatile=*/false,
1819 Ty: NativeParamType);
1820 return NativeParamAddr;
1821}
1822
1823void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
1824 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
1825 ArrayRef<llvm::Value *> Args) const {
1826 SmallVector<llvm::Value *, 4> TargetArgs;
1827 TargetArgs.reserve(N: Args.size());
1828 auto *FnType = OutlinedFn.getFunctionType();
1829 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
1830 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
1831 TargetArgs.append(in_start: std::next(x: Args.begin(), n: I), in_end: Args.end());
1832 break;
1833 }
1834 llvm::Type *TargetType = FnType->getParamType(i: I);
1835 llvm::Value *NativeArg = Args[I];
1836 if (!TargetType->isPointerTy()) {
1837 TargetArgs.emplace_back(Args&: NativeArg);
1838 continue;
1839 }
1840 TargetArgs.emplace_back(
1841 Args: CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(V: NativeArg, DestTy: TargetType));
1842 }
1843 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: TargetArgs);
1844}
1845
1846/// Emit function which wraps the outline parallel region
1847/// and controls the arguments which are passed to this function.
1848/// The wrapper ensures that the outlined function is called
1849/// with the correct arguments when data is shared.
1850llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
1851 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
1852 ASTContext &Ctx = CGM.getContext();
1853 const auto &CS = *D.getCapturedStmt(RegionKind: OMPD_parallel);
1854
1855 // Create a function that takes as argument the source thread.
1856 FunctionArgList WrapperArgs;
1857 QualType Int16QTy =
1858 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
1859 QualType Int32QTy =
1860 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
1861 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1862 /*Id=*/nullptr, Int16QTy,
1863 ImplicitParamKind::Other);
1864 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1865 /*Id=*/nullptr, Int32QTy,
1866 ImplicitParamKind::Other);
1867 WrapperArgs.emplace_back(Args: &ParallelLevelArg);
1868 WrapperArgs.emplace_back(Args: &WrapperArg);
1869
1870 const CGFunctionInfo &CGFI =
1871 CGM.getTypes().arrangeBuiltinFunctionDeclaration(resultType: Ctx.VoidTy, args: WrapperArgs);
1872
1873 auto *Fn = llvm::Function::Create(
1874 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
1875 N: Twine(OutlinedParallelFn->getName(), "_wrapper"), M: &CGM.getModule());
1876
1877 // Ensure we do not inline the function. This is trivially true for the ones
1878 // passed to __kmpc_fork_call but the ones calles in serialized regions
1879 // could be inlined. This is not a perfect but it is closer to the invariant
1880 // we want, namely, every data environment starts with a new function.
1881 // TODO: We should pass the if condition to the runtime function and do the
1882 // handling there. Much cleaner code.
1883 Fn->addFnAttr(Kind: llvm::Attribute::NoInline);
1884
1885 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
1886 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
1887 Fn->setDoesNotRecurse();
1888
1889 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1890 CGF.StartFunction(GD: GlobalDecl(), RetTy: Ctx.VoidTy, Fn, FnInfo: CGFI, Args: WrapperArgs,
1891 Loc: D.getBeginLoc(), StartLoc: D.getBeginLoc());
1892
1893 const auto *RD = CS.getCapturedRecordDecl();
1894 auto CurField = RD->field_begin();
1895
1896 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
1897 /*Name=*/".zero.addr");
1898 CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
1899 // Get the array of arguments.
1900 SmallVector<llvm::Value *, 8> Args;
1901
1902 Args.emplace_back(Args: CGF.GetAddrOfLocalVar(VD: &WrapperArg).emitRawPointer(CGF));
1903 Args.emplace_back(Args: ZeroAddr.emitRawPointer(CGF));
1904
1905 CGBuilderTy &Bld = CGF.Builder;
1906 auto CI = CS.capture_begin();
1907
1908 // Use global memory for data sharing.
1909 // Handle passing of global args to workers.
1910 RawAddress GlobalArgs =
1911 CGF.CreateDefaultAlignTempAlloca(Ty: CGF.VoidPtrPtrTy, Name: "global_args");
1912 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
1913 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
1914 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1915 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_shared_variables),
1916 args: DataSharingArgs);
1917
1918 // Retrieve the shared variables from the list of references returned
1919 // by the runtime. Pass the variables to the outlined function.
1920 Address SharedArgListAddress = Address::invalid();
1921 if (CS.capture_size() > 0 ||
1922 isOpenMPLoopBoundSharingDirective(Kind: D.getDirectiveKind())) {
1923 SharedArgListAddress = CGF.EmitLoadOfPointer(
1924 Ptr: GlobalArgs, PtrTy: CGF.getContext()
1925 .getPointerType(T: CGF.getContext().VoidPtrTy)
1926 .castAs<PointerType>());
1927 }
1928 unsigned Idx = 0;
1929 if (isOpenMPLoopBoundSharingDirective(Kind: D.getDirectiveKind())) {
1930 Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
1931 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1932 Addr: Src, Ty: Bld.getPtrTy(AddrSpace: 0), ElementTy: CGF.SizeTy);
1933 llvm::Value *LB = CGF.EmitLoadOfScalar(
1934 Addr: TypedAddress,
1935 /*Volatile=*/false,
1936 Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
1937 Loc: cast<OMPLoopDirective>(Val: D).getLowerBoundVariable()->getExprLoc());
1938 Args.emplace_back(Args&: LB);
1939 ++Idx;
1940 Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
1941 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(Addr: Src, Ty: Bld.getPtrTy(AddrSpace: 0),
1942 ElementTy: CGF.SizeTy);
1943 llvm::Value *UB = CGF.EmitLoadOfScalar(
1944 Addr: TypedAddress,
1945 /*Volatile=*/false,
1946 Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
1947 Loc: cast<OMPLoopDirective>(Val: D).getUpperBoundVariable()->getExprLoc());
1948 Args.emplace_back(Args&: UB);
1949 ++Idx;
1950 }
1951 if (CS.capture_size() > 0) {
1952 ASTContext &CGFContext = CGF.getContext();
1953 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
1954 QualType ElemTy = CurField->getType();
1955 Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: I + Idx);
1956 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1957 Addr: Src, Ty: CGF.ConvertTypeForMem(T: CGFContext.getPointerType(T: ElemTy)),
1958 ElementTy: CGF.ConvertTypeForMem(T: ElemTy));
1959 llvm::Value *Arg = CGF.EmitLoadOfScalar(Addr: TypedAddress,
1960 /*Volatile=*/false,
1961 Ty: CGFContext.getPointerType(T: ElemTy),
1962 Loc: CI->getLocation());
1963 if (CI->capturesVariableByCopy() &&
1964 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
1965 Arg = castValueToType(CGF, Val: Arg, ValTy: ElemTy, CastTy: CGFContext.getUIntPtrType(),
1966 Loc: CI->getLocation());
1967 }
1968 Args.emplace_back(Args&: Arg);
1969 }
1970 }
1971
1972 emitOutlinedFunctionCall(CGF, Loc: D.getBeginLoc(), OutlinedFn: OutlinedParallelFn, Args);
1973 CGF.FinishFunction();
1974 return Fn;
1975}
1976
1977void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
1978 const Decl *D) {
1979 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1980 return;
1981
1982 assert(D && "Expected function or captured|block decl.");
1983 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
1984 "Function is registered already.");
1985 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
1986 "Team is set but not processed.");
1987 const Stmt *Body = nullptr;
1988 bool NeedToDelayGlobalization = false;
1989 if (const auto *FD = dyn_cast<FunctionDecl>(Val: D)) {
1990 Body = FD->getBody();
1991 } else if (const auto *BD = dyn_cast<BlockDecl>(Val: D)) {
1992 Body = BD->getBody();
1993 } else if (const auto *CD = dyn_cast<CapturedDecl>(Val: D)) {
1994 Body = CD->getBody();
1995 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
1996 if (NeedToDelayGlobalization &&
1997 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1998 return;
1999 }
2000 if (!Body)
2001 return;
2002 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
2003 VarChecker.Visit(S: Body);
2004 const RecordDecl *GlobalizedVarsRecord =
2005 VarChecker.getGlobalizedRecord(IsInTTDRegion);
2006 TeamAndReductions.first = nullptr;
2007 TeamAndReductions.second.clear();
2008 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
2009 VarChecker.getEscapedVariableLengthDecls();
2010 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
2011 VarChecker.getDelayedVariableLengthDecls();
2012 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
2013 DelayedVariableLengthDecls.empty())
2014 return;
2015 auto I = FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
2016 I->getSecond().MappedParams =
2017 std::make_unique<CodeGenFunction::OMPMapVars>();
2018 I->getSecond().EscapedParameters.insert(
2019 I: VarChecker.getEscapedParameters().begin(),
2020 E: VarChecker.getEscapedParameters().end());
2021 I->getSecond().EscapedVariableLengthDecls.append(
2022 in_start: EscapedVariableLengthDecls.begin(), in_end: EscapedVariableLengthDecls.end());
2023 I->getSecond().DelayedVariableLengthDecls.append(
2024 in_start: DelayedVariableLengthDecls.begin(), in_end: DelayedVariableLengthDecls.end());
2025 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2026 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
2027 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
2028 Data.try_emplace(Key: VD);
2029 }
2030 if (!NeedToDelayGlobalization) {
2031 emitGenericVarsProlog(CGF, Loc: D->getBeginLoc());
2032 struct GlobalizationScope final : EHScopeStack::Cleanup {
2033 GlobalizationScope() = default;
2034
2035 void Emit(CodeGenFunction &CGF, Flags flags) override {
2036 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
2037 .emitGenericVarsEpilog(CGF);
2038 }
2039 };
2040 CGF.EHStack.pushCleanup<GlobalizationScope>(Kind: NormalAndEHCleanup);
2041 }
2042}
2043
2044Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
2045 const VarDecl *VD) {
2046 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
2047 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2048 auto AS = LangAS::Default;
2049 switch (A->getAllocatorType()) {
2050 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2051 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2052 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2053 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2054 break;
2055 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2056 return Address::invalid();
2057 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2058 // TODO: implement aupport for user-defined allocators.
2059 return Address::invalid();
2060 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2061 AS = LangAS::cuda_constant;
2062 break;
2063 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2064 AS = LangAS::cuda_shared;
2065 break;
2066 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2067 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2068 break;
2069 }
2070 llvm::Type *VarTy = CGF.ConvertTypeForMem(T: VD->getType());
2071 auto *GV = new llvm::GlobalVariable(
2072 CGM.getModule(), VarTy, /*isConstant=*/false,
2073 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(T: VarTy),
2074 VD->getName(),
2075 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
2076 CGM.getContext().getTargetAddressSpace(AS));
2077 CharUnits Align = CGM.getContext().getDeclAlign(D: VD);
2078 GV->setAlignment(Align.getAsAlign());
2079 return Address(
2080 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2081 V: GV, DestTy: CGF.Builder.getPtrTy(AddrSpace: CGM.getContext().getTargetAddressSpace(
2082 AS: VD->getType().getAddressSpace()))),
2083 VarTy, Align);
2084 }
2085
2086 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
2087 return Address::invalid();
2088
2089 VD = VD->getCanonicalDecl();
2090 auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
2091 if (I == FunctionGlobalizedDecls.end())
2092 return Address::invalid();
2093 auto VDI = I->getSecond().LocalVarData.find(Key: VD);
2094 if (VDI != I->getSecond().LocalVarData.end())
2095 return VDI->second.PrivateAddr;
2096 if (VD->hasAttrs()) {
2097 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
2098 E(VD->attr_end());
2099 IT != E; ++IT) {
2100 auto VDI = I->getSecond().LocalVarData.find(
2101 Key: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: IT->getRef())->getDecl())
2102 ->getCanonicalDecl());
2103 if (VDI != I->getSecond().LocalVarData.end())
2104 return VDI->second.PrivateAddr;
2105 }
2106 }
2107
2108 return Address::invalid();
2109}
2110
2111void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
2112 FunctionGlobalizedDecls.erase(Val: CGF.CurFn);
2113 CGOpenMPRuntime::functionFinished(CGF);
2114}
2115
2116void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
2117 CodeGenFunction &CGF, const OMPLoopDirective &S,
2118 OpenMPDistScheduleClauseKind &ScheduleKind,
2119 llvm::Value *&Chunk) const {
2120 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2121 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
2122 ScheduleKind = OMPC_DIST_SCHEDULE_static;
2123 Chunk = CGF.EmitScalarConversion(
2124 Src: RT.getGPUNumThreads(CGF),
2125 SrcTy: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
2126 DstTy: S.getIterationVariable()->getType(), Loc: S.getBeginLoc());
2127 return;
2128 }
2129 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
2130 CGF, S, ScheduleKind, Chunk);
2131}
2132
2133void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
2134 CodeGenFunction &CGF, const OMPLoopDirective &S,
2135 OpenMPScheduleClauseKind &ScheduleKind,
2136 const Expr *&ChunkExpr) const {
2137 ScheduleKind = OMPC_SCHEDULE_static;
2138 // Chunk size is 1 in this case.
2139 llvm::APInt ChunkSize(32, 1);
2140 ChunkExpr = IntegerLiteral::Create(C: CGF.getContext(), V: ChunkSize,
2141 type: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
2142 l: SourceLocation());
2143}
2144
2145void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
2146 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
2147 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
2148 " Expected target-based directive.");
2149 const CapturedStmt *CS = D.getCapturedStmt(RegionKind: OMPD_target);
2150 for (const CapturedStmt::Capture &C : CS->captures()) {
2151 // Capture variables captured by reference in lambdas for target-based
2152 // directives.
2153 if (!C.capturesVariable())
2154 continue;
2155 const VarDecl *VD = C.getCapturedVar();
2156 const auto *RD = VD->getType()
2157 .getCanonicalType()
2158 .getNonReferenceType()
2159 ->getAsCXXRecordDecl();
2160 if (!RD || !RD->isLambda())
2161 continue;
2162 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
2163 LValue VDLVal;
2164 if (VD->getType().getCanonicalType()->isReferenceType())
2165 VDLVal = CGF.EmitLoadOfReferenceLValue(RefAddr: VDAddr, RefTy: VD->getType());
2166 else
2167 VDLVal = CGF.MakeAddrLValue(
2168 Addr: VDAddr, T: VD->getType().getCanonicalType().getNonReferenceType());
2169 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
2170 FieldDecl *ThisCapture = nullptr;
2171 RD->getCaptureFields(Captures, ThisCapture);
2172 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
2173 LValue ThisLVal =
2174 CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: ThisCapture);
2175 llvm::Value *CXXThis = CGF.LoadCXXThis();
2176 CGF.EmitStoreOfScalar(value: CXXThis, lvalue: ThisLVal);
2177 }
2178 for (const LambdaCapture &LC : RD->captures()) {
2179 if (LC.getCaptureKind() != LCK_ByRef)
2180 continue;
2181 const ValueDecl *VD = LC.getCapturedVar();
2182 // FIXME: For now VD is always a VarDecl because OpenMP does not support
2183 // capturing structured bindings in lambdas yet.
2184 if (!CS->capturesVariable(Var: cast<VarDecl>(Val: VD)))
2185 continue;
2186 auto It = Captures.find(Val: VD);
2187 assert(It != Captures.end() && "Found lambda capture without field.");
2188 LValue VarLVal = CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: It->second);
2189 Address VDAddr = CGF.GetAddrOfLocalVar(VD: cast<VarDecl>(Val: VD));
2190 if (VD->getType().getCanonicalType()->isReferenceType())
2191 VDAddr = CGF.EmitLoadOfReferenceLValue(RefAddr: VDAddr,
2192 RefTy: VD->getType().getCanonicalType())
2193 .getAddress();
2194 CGF.EmitStoreOfScalar(value: VDAddr.emitRawPointer(CGF), lvalue: VarLVal);
2195 }
2196 }
2197}
2198
2199bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
2200 LangAS &AS) {
2201 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
2202 return false;
2203 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2204 switch(A->getAllocatorType()) {
2205 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2206 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2207 // Not supported, fallback to the default mem space.
2208 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2209 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2210 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2211 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2212 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2213 AS = LangAS::Default;
2214 return true;
2215 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2216 AS = LangAS::cuda_constant;
2217 return true;
2218 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2219 AS = LangAS::cuda_shared;
2220 return true;
2221 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2222 llvm_unreachable("Expected predefined allocator for the variables with the "
2223 "static storage.");
2224 }
2225 return false;
2226}
2227
2228// Get current OffloadArch and ignore any unknown values
2229static OffloadArch getOffloadArch(CodeGenModule &CGM) {
2230 if (!CGM.getTarget().hasFeature(Feature: "ptx"))
2231 return OffloadArch::UNKNOWN;
2232 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
2233 if (Feature.getValue()) {
2234 OffloadArch Arch = StringToOffloadArch(S: Feature.getKey());
2235 if (Arch != OffloadArch::UNKNOWN)
2236 return Arch;
2237 }
2238 }
2239 return OffloadArch::UNKNOWN;
2240}
2241
2242/// Check to see if target architecture supports unified addressing which is
2243/// a restriction for OpenMP requires clause "unified_shared_memory".
2244void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
2245 for (const OMPClause *Clause : D->clauselists()) {
2246 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
2247 OffloadArch Arch = getOffloadArch(CGM);
2248 switch (Arch) {
2249 case OffloadArch::SM_20:
2250 case OffloadArch::SM_21:
2251 case OffloadArch::SM_30:
2252 case OffloadArch::SM_32_:
2253 case OffloadArch::SM_35:
2254 case OffloadArch::SM_37:
2255 case OffloadArch::SM_50:
2256 case OffloadArch::SM_52:
2257 case OffloadArch::SM_53: {
2258 SmallString<256> Buffer;
2259 llvm::raw_svector_ostream Out(Buffer);
2260 Out << "Target architecture " << OffloadArchToString(A: Arch)
2261 << " does not support unified addressing";
2262 CGM.Error(loc: Clause->getBeginLoc(), error: Out.str());
2263 return;
2264 }
2265 case OffloadArch::SM_60:
2266 case OffloadArch::SM_61:
2267 case OffloadArch::SM_62:
2268 case OffloadArch::SM_70:
2269 case OffloadArch::SM_72:
2270 case OffloadArch::SM_75:
2271 case OffloadArch::SM_80:
2272 case OffloadArch::SM_86:
2273 case OffloadArch::SM_87:
2274 case OffloadArch::SM_89:
2275 case OffloadArch::SM_90:
2276 case OffloadArch::SM_90a:
2277 case OffloadArch::SM_100:
2278 case OffloadArch::SM_100a:
2279 case OffloadArch::SM_101:
2280 case OffloadArch::SM_101a:
2281 case OffloadArch::SM_120:
2282 case OffloadArch::SM_120a:
2283 case OffloadArch::GFX600:
2284 case OffloadArch::GFX601:
2285 case OffloadArch::GFX602:
2286 case OffloadArch::GFX700:
2287 case OffloadArch::GFX701:
2288 case OffloadArch::GFX702:
2289 case OffloadArch::GFX703:
2290 case OffloadArch::GFX704:
2291 case OffloadArch::GFX705:
2292 case OffloadArch::GFX801:
2293 case OffloadArch::GFX802:
2294 case OffloadArch::GFX803:
2295 case OffloadArch::GFX805:
2296 case OffloadArch::GFX810:
2297 case OffloadArch::GFX9_GENERIC:
2298 case OffloadArch::GFX900:
2299 case OffloadArch::GFX902:
2300 case OffloadArch::GFX904:
2301 case OffloadArch::GFX906:
2302 case OffloadArch::GFX908:
2303 case OffloadArch::GFX909:
2304 case OffloadArch::GFX90a:
2305 case OffloadArch::GFX90c:
2306 case OffloadArch::GFX9_4_GENERIC:
2307 case OffloadArch::GFX942:
2308 case OffloadArch::GFX950:
2309 case OffloadArch::GFX10_1_GENERIC:
2310 case OffloadArch::GFX1010:
2311 case OffloadArch::GFX1011:
2312 case OffloadArch::GFX1012:
2313 case OffloadArch::GFX1013:
2314 case OffloadArch::GFX10_3_GENERIC:
2315 case OffloadArch::GFX1030:
2316 case OffloadArch::GFX1031:
2317 case OffloadArch::GFX1032:
2318 case OffloadArch::GFX1033:
2319 case OffloadArch::GFX1034:
2320 case OffloadArch::GFX1035:
2321 case OffloadArch::GFX1036:
2322 case OffloadArch::GFX11_GENERIC:
2323 case OffloadArch::GFX1100:
2324 case OffloadArch::GFX1101:
2325 case OffloadArch::GFX1102:
2326 case OffloadArch::GFX1103:
2327 case OffloadArch::GFX1150:
2328 case OffloadArch::GFX1151:
2329 case OffloadArch::GFX1152:
2330 case OffloadArch::GFX1153:
2331 case OffloadArch::GFX12_GENERIC:
2332 case OffloadArch::GFX1200:
2333 case OffloadArch::GFX1201:
2334 case OffloadArch::GFX1250:
2335 case OffloadArch::AMDGCNSPIRV:
2336 case OffloadArch::Generic:
2337 case OffloadArch::GRANITERAPIDS:
2338 case OffloadArch::BMG_G21:
2339 case OffloadArch::UNUSED:
2340 case OffloadArch::UNKNOWN:
2341 break;
2342 case OffloadArch::LAST:
2343 llvm_unreachable("Unexpected GPU arch.");
2344 }
2345 }
2346 }
2347 CGOpenMPRuntime::processRequiresDirective(D);
2348}
2349
2350llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
2351 CGBuilderTy &Bld = CGF.Builder;
2352 llvm::Module *M = &CGF.CGM.getModule();
2353 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
2354 llvm::Function *F = M->getFunction(Name: LocSize);
2355 if (!F) {
2356 F = llvm::Function::Create(Ty: llvm::FunctionType::get(Result: CGF.Int32Ty, Params: {}, isVarArg: false),
2357 Linkage: llvm::GlobalVariable::ExternalLinkage, N: LocSize,
2358 M: &CGF.CGM.getModule());
2359 }
2360 return Bld.CreateCall(Callee: F, Args: {}, Name: "nvptx_num_threads");
2361}
2362
2363llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
2364 ArrayRef<llvm::Value *> Args{};
2365 return CGF.EmitRuntimeCall(
2366 callee: OMPBuilder.getOrCreateRuntimeFunction(
2367 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_hardware_thread_id_in_block),
2368 args: Args);
2369}
2370