1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX, AMDGCN and SPIR-V.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CGDebugInfo.h"
16#include "CodeGenFunction.h"
17#include "clang/AST/Attr.h"
18#include "clang/AST/DeclOpenMP.h"
19#include "clang/AST/OpenMPClause.h"
20#include "clang/AST/StmtOpenMP.h"
21#include "clang/AST/StmtVisitor.h"
22#include "clang/Basic/Cuda.h"
23#include "llvm/ADT/SmallPtrSet.h"
24#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
25#include "llvm/Frontend/OpenMP/OMPGridValues.h"
26
27using namespace clang;
28using namespace CodeGen;
29using namespace llvm::omp;
30
31namespace {
32/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
33class NVPTXActionTy final : public PrePostActionTy {
34 llvm::FunctionCallee EnterCallee = nullptr;
35 ArrayRef<llvm::Value *> EnterArgs;
36 llvm::FunctionCallee ExitCallee = nullptr;
37 ArrayRef<llvm::Value *> ExitArgs;
38 bool Conditional = false;
39 llvm::BasicBlock *ContBlock = nullptr;
40
41public:
42 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
43 ArrayRef<llvm::Value *> EnterArgs,
44 llvm::FunctionCallee ExitCallee,
45 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
46 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
47 ExitArgs(ExitArgs), Conditional(Conditional) {}
48 void Enter(CodeGenFunction &CGF) override {
49 llvm::Value *EnterRes = CGF.EmitRuntimeCall(callee: EnterCallee, args: EnterArgs);
50 if (Conditional) {
51 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(Arg: EnterRes);
52 auto *ThenBlock = CGF.createBasicBlock(name: "omp_if.then");
53 ContBlock = CGF.createBasicBlock(name: "omp_if.end");
54 // Generate the branch (If-stmt)
55 CGF.Builder.CreateCondBr(Cond: CallBool, True: ThenBlock, False: ContBlock);
56 CGF.EmitBlock(BB: ThenBlock);
57 }
58 }
59 void Done(CodeGenFunction &CGF) {
60 // Emit the rest of blocks/branches
61 CGF.EmitBranch(Block: ContBlock);
62 CGF.EmitBlock(BB: ContBlock, IsFinished: true);
63 }
64 void Exit(CodeGenFunction &CGF) override {
65 CGF.EmitRuntimeCall(callee: ExitCallee, args: ExitArgs);
66 }
67};
68
69/// A class to track the execution mode when codegening directives within
70/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
71/// to the target region and used by nested directives such as 'parallel'
72/// to emit optimized code.
73class ExecutionRuntimeModesRAII {
74private:
75 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
76 CGOpenMPRuntimeGPU::EM_Unknown;
77 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
78
79public:
80 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
81 CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
82 : ExecMode(ExecMode) {
83 SavedExecMode = ExecMode;
84 ExecMode = EntryMode;
85 }
86 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
87};
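// Illustrative use only, mirroring the kernel emitters later in this file:
//
//   {
//     ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
//     // ... emit the target region with SPMD execution mode ...
//   } // The previous execution mode is restored here.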
88
89static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
90 RefExpr = RefExpr->IgnoreParens();
91 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: RefExpr)) {
92 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
93 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base))
94 Base = TempASE->getBase()->IgnoreParenImpCasts();
95 RefExpr = Base;
96 } else if (auto *OASE = dyn_cast<ArraySectionExpr>(Val: RefExpr)) {
97 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
98 while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Val: Base))
99 Base = TempOASE->getBase()->IgnoreParenImpCasts();
100 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base))
101 Base = TempASE->getBase()->IgnoreParenImpCasts();
102 RefExpr = Base;
103 }
104 RefExpr = RefExpr->IgnoreParenImpCasts();
105 if (const auto *DE = dyn_cast<DeclRefExpr>(Val: RefExpr))
106 return cast<ValueDecl>(Val: DE->getDecl()->getCanonicalDecl());
107 const auto *ME = cast<MemberExpr>(Val: RefExpr);
108 return cast<ValueDecl>(Val: ME->getMemberDecl()->getCanonicalDecl());
109}
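// For illustration only: for a reference expression written as 'a[i][j]' or
// 'a[0:n]', the loops above peel off the subscript/section expressions and the
// canonical declaration of 'a' is returned; a plain 'x' or 's.field' resolves
// to the canonical declaration of 'x' or 'field' respectively.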
110
111static RecordDecl *buildRecordForGlobalizedVars(
112 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
113 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
114 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
115 &MappedDeclsFields,
116 int BufSize) {
117 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
118 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
119 return nullptr;
120 SmallVector<VarsDataTy, 4> GlobalizedVars;
121 for (const ValueDecl *D : EscapedDecls)
122 GlobalizedVars.emplace_back(Args: C.getDeclAlign(D), Args&: D);
123 for (const ValueDecl *D : EscapedDeclsForTeams)
124 GlobalizedVars.emplace_back(Args: C.getDeclAlign(D), Args&: D);
125
126 // Build struct _globalized_locals_ty {
127 // /* globalized vars */[WarpSize] align (decl_align)
128 // /* globalized vars */ for EscapedDeclsForTeams
129 // };
130 RecordDecl *GlobalizedRD = C.buildImplicitRecord(Name: "_globalized_locals_ty");
131 GlobalizedRD->startDefinition();
132 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(llvm::from_range,
133 EscapedDeclsForTeams);
134 for (const auto &Pair : GlobalizedVars) {
135 const ValueDecl *VD = Pair.second;
136 QualType Type = VD->getType();
137 if (Type->isLValueReferenceType())
138 Type = C.getPointerType(T: Type.getNonReferenceType());
139 else
140 Type = Type.getNonReferenceType();
141 SourceLocation Loc = VD->getLocation();
142 FieldDecl *Field;
143 if (SingleEscaped.count(Ptr: VD)) {
144 Field = FieldDecl::Create(
145 C, DC: GlobalizedRD, StartLoc: Loc, IdLoc: Loc, Id: VD->getIdentifier(), T: Type,
146 TInfo: C.getTrivialTypeSourceInfo(T: Type, Loc: SourceLocation()),
147 /*BW=*/nullptr, /*Mutable=*/false,
148 /*InitStyle=*/ICIS_NoInit);
149 Field->setAccess(AS_public);
150 if (VD->hasAttrs()) {
151 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
152 E(VD->getAttrs().end());
153 I != E; ++I)
154 Field->addAttr(A: *I);
155 }
156 } else {
157 if (BufSize > 1) {
158 llvm::APInt ArraySize(32, BufSize);
159 Type = C.getConstantArrayType(EltTy: Type, ArySize: ArraySize, SizeExpr: nullptr,
160 ASM: ArraySizeModifier::Normal, IndexTypeQuals: 0);
161 }
162 Field = FieldDecl::Create(
163 C, DC: GlobalizedRD, StartLoc: Loc, IdLoc: Loc, Id: VD->getIdentifier(), T: Type,
164 TInfo: C.getTrivialTypeSourceInfo(T: Type, Loc: SourceLocation()),
165 /*BW=*/nullptr, /*Mutable=*/false,
166 /*InitStyle=*/ICIS_NoInit);
167 Field->setAccess(AS_public);
168 llvm::APInt Align(32, Pair.first.getQuantity());
169 Field->addAttr(A: AlignedAttr::CreateImplicit(
170 Ctx&: C, /*IsAlignmentExpr=*/true,
171 Alignment: IntegerLiteral::Create(C, V: Align,
172 type: C.getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
173 l: SourceLocation()),
174 Range: {}, S: AlignedAttr::GNU_aligned));
175 }
176 GlobalizedRD->addDecl(D: Field);
177 MappedDeclsFields.try_emplace(Key: VD, Args&: Field);
178 }
179 GlobalizedRD->completeDefinition();
180 return GlobalizedRD;
181}
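// A sketch of the record built above, assuming a warp size of 32, one escaped
// parallel-region variable 'int x' and one teams-level variable 'double d'
// (names and alignments are illustrative only):
//
//   struct _globalized_locals_ty {
//     int x[32] __attribute__((aligned(4))); // one slot per lane in the warp
//     double d;                              // teams-level copies are not widened
//   };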
182
183/// Get the list of variables that can escape their declaration context.
184class CheckVarsEscapingDeclContext final
185 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
186 CodeGenFunction &CGF;
187 llvm::SetVector<const ValueDecl *> EscapedDecls;
188 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
189 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
190 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
191 RecordDecl *GlobalizedRD = nullptr;
192 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
193 bool AllEscaped = false;
194 bool IsForCombinedParallelRegion = false;
195
196 void markAsEscaped(const ValueDecl *VD) {
197 // Do not globalize declare target variables.
198 if (!isa<VarDecl>(Val: VD) ||
199 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
200 return;
201 VD = cast<ValueDecl>(Val: VD->getCanonicalDecl());
202 // Use user-specified allocation.
203 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
204 return;
205 // Variables captured by value must be globalized.
206 bool IsCaptured = false;
207 if (auto *CSI = CGF.CapturedStmtInfo) {
208 if (const FieldDecl *FD = CSI->lookup(VD: cast<VarDecl>(Val: VD))) {
209 // Check if we need to capture a variable that was already captured by
210 // value in the outer region.
211 IsCaptured = true;
212 if (!IsForCombinedParallelRegion) {
213 if (!FD->hasAttrs())
214 return;
215 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
216 if (!Attr)
217 return;
218 if (((Attr->getCaptureKind() != OMPC_map) &&
219 !isOpenMPPrivate(Kind: Attr->getCaptureKind())) ||
220 ((Attr->getCaptureKind() == OMPC_map) &&
221 !FD->getType()->isAnyPointerType()))
222 return;
223 }
224 if (!FD->getType()->isReferenceType()) {
225 assert(!VD->getType()->isVariablyModifiedType() &&
226 "Parameter captured by value with variably modified type");
227 EscapedParameters.insert(Ptr: VD);
228 } else if (!IsForCombinedParallelRegion) {
229 return;
230 }
231 }
232 }
233 if ((!CGF.CapturedStmtInfo ||
234 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
235 VD->getType()->isReferenceType())
236 // Do not globalize variables with reference type.
237 return;
238 if (VD->getType()->isVariablyModifiedType()) {
239 // If not captured at the target region level then mark the escaped
240 // variable as delayed.
241 if (IsCaptured)
242 EscapedVariableLengthDecls.insert(X: VD);
243 else
244 DelayedVariableLengthDecls.insert(X: VD);
245 } else
246 EscapedDecls.insert(X: VD);
247 }
248
249 void VisitValueDecl(const ValueDecl *VD) {
250 if (VD->getType()->isLValueReferenceType())
251 markAsEscaped(VD);
252 if (const auto *VarD = dyn_cast<VarDecl>(Val: VD)) {
253 if (!isa<ParmVarDecl>(Val: VarD) && VarD->hasInit()) {
254 const bool SavedAllEscaped = AllEscaped;
255 AllEscaped = VD->getType()->isLValueReferenceType();
256 Visit(S: VarD->getInit());
257 AllEscaped = SavedAllEscaped;
258 }
259 }
260 }
261 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
262 ArrayRef<OMPClause *> Clauses,
263 bool IsCombinedParallelRegion) {
264 if (!S)
265 return;
266 for (const CapturedStmt::Capture &C : S->captures()) {
267 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
268 const ValueDecl *VD = C.getCapturedVar();
269 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
270 if (IsCombinedParallelRegion) {
271 // Check if the variable is privatized in the combined construct and
272 // those private copies must be shared in the inner parallel
273 // directive.
274 IsForCombinedParallelRegion = false;
275 for (const OMPClause *C : Clauses) {
276 if (!isOpenMPPrivate(Kind: C->getClauseKind()) ||
277 C->getClauseKind() == OMPC_reduction ||
278 C->getClauseKind() == OMPC_linear ||
279 C->getClauseKind() == OMPC_private)
280 continue;
281 ArrayRef<const Expr *> Vars;
282 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(Val: C))
283 Vars = PC->getVarRefs();
284 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(Val: C))
285 Vars = PC->getVarRefs();
286 else
287 llvm_unreachable("Unexpected clause.");
288 for (const auto *E : Vars) {
289 const Decl *D =
290 cast<DeclRefExpr>(Val: E)->getDecl()->getCanonicalDecl();
291 if (D == VD->getCanonicalDecl()) {
292 IsForCombinedParallelRegion = true;
293 break;
294 }
295 }
296 if (IsForCombinedParallelRegion)
297 break;
298 }
299 }
300 markAsEscaped(VD);
301 if (isa<OMPCapturedExprDecl>(Val: VD))
302 VisitValueDecl(VD);
303 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
304 }
305 }
306 }
307
308 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
309 assert(!GlobalizedRD &&
310 "Record for globalized variables is built already.");
311 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
312 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
313 if (IsInTTDRegion)
314 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
315 else
316 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
317 GlobalizedRD = ::buildRecordForGlobalizedVars(
318 C&: CGF.getContext(), EscapedDecls: EscapedDeclsForParallel, EscapedDeclsForTeams,
319 MappedDeclsFields, BufSize: WarpSize);
320 }
321
322public:
323 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
324 ArrayRef<const ValueDecl *> TeamsReductions)
325 : CGF(CGF), EscapedDecls(llvm::from_range, TeamsReductions) {}
326 ~CheckVarsEscapingDeclContext() = default;
327 void VisitDeclStmt(const DeclStmt *S) {
328 if (!S)
329 return;
330 for (const Decl *D : S->decls())
331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(Val: D))
332 VisitValueDecl(VD);
333 }
334 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335 if (!D)
336 return;
337 if (!D->hasAssociatedStmt())
338 return;
339 if (const auto *S =
340 dyn_cast_or_null<CapturedStmt>(Val: D->getAssociatedStmt())) {
341 // Do not analyze directives that do not actually require capturing,
342 // like `omp for` or `omp simd` directives.
343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
344 getOpenMPCaptureRegions(CaptureRegions, DKind: D->getDirectiveKind());
345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346 VisitStmt(S: S->getCapturedStmt());
347 return;
348 }
349 VisitOpenMPCapturedStmt(
350 S, Clauses: D->clauses(),
351 IsCombinedParallelRegion: CaptureRegions.back() == OMPD_parallel &&
352 isOpenMPDistributeDirective(DKind: D->getDirectiveKind()));
353 }
354 }
355 void VisitCapturedStmt(const CapturedStmt *S) {
356 if (!S)
357 return;
358 for (const CapturedStmt::Capture &C : S->captures()) {
359 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360 const ValueDecl *VD = C.getCapturedVar();
361 markAsEscaped(VD);
362 if (isa<OMPCapturedExprDecl>(Val: VD))
363 VisitValueDecl(VD);
364 }
365 }
366 }
367 void VisitLambdaExpr(const LambdaExpr *E) {
368 if (!E)
369 return;
370 for (const LambdaCapture &C : E->captures()) {
371 if (C.capturesVariable()) {
372 if (C.getCaptureKind() == LCK_ByRef) {
373 const ValueDecl *VD = C.getCapturedVar();
374 markAsEscaped(VD);
375 if (E->isInitCapture(Capture: &C) || isa<OMPCapturedExprDecl>(Val: VD))
376 VisitValueDecl(VD);
377 }
378 }
379 }
380 }
381 void VisitBlockExpr(const BlockExpr *E) {
382 if (!E)
383 return;
384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385 if (C.isByRef()) {
386 const VarDecl *VD = C.getVariable();
387 markAsEscaped(VD);
388 if (isa<OMPCapturedExprDecl>(Val: VD) || VD->isInitCapture())
389 VisitValueDecl(VD);
390 }
391 }
392 }
393 void VisitCallExpr(const CallExpr *E) {
394 if (!E)
395 return;
396 for (const Expr *Arg : E->arguments()) {
397 if (!Arg)
398 continue;
399 if (Arg->isLValue()) {
400 const bool SavedAllEscaped = AllEscaped;
401 AllEscaped = true;
402 Visit(S: Arg);
403 AllEscaped = SavedAllEscaped;
404 } else {
405 Visit(S: Arg);
406 }
407 }
408 Visit(S: E->getCallee());
409 }
410 void VisitDeclRefExpr(const DeclRefExpr *E) {
411 if (!E)
412 return;
413 const ValueDecl *VD = E->getDecl();
414 if (AllEscaped)
415 markAsEscaped(VD);
416 if (isa<OMPCapturedExprDecl>(Val: VD))
417 VisitValueDecl(VD);
418 else if (VD->isInitCapture())
419 VisitValueDecl(VD);
420 }
421 void VisitUnaryOperator(const UnaryOperator *E) {
422 if (!E)
423 return;
424 if (E->getOpcode() == UO_AddrOf) {
425 const bool SavedAllEscaped = AllEscaped;
426 AllEscaped = true;
427 Visit(S: E->getSubExpr());
428 AllEscaped = SavedAllEscaped;
429 } else {
430 Visit(S: E->getSubExpr());
431 }
432 }
433 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434 if (!E)
435 return;
436 if (E->getCastKind() == CK_ArrayToPointerDecay) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(S: E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(S: E->getSubExpr());
443 }
444 }
445 void VisitExpr(const Expr *E) {
446 if (!E)
447 return;
448 bool SavedAllEscaped = AllEscaped;
449 if (!E->isLValue())
450 AllEscaped = false;
451 for (const Stmt *Child : E->children())
452 if (Child)
453 Visit(S: Child);
454 AllEscaped = SavedAllEscaped;
455 }
456 void VisitStmt(const Stmt *S) {
457 if (!S)
458 return;
459 for (const Stmt *Child : S->children())
460 if (Child)
461 Visit(S: Child);
462 }
463
464 /// Returns the record that holds all the escaped local variables and that is
465 /// used instead of their original storage.
466 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467 if (!GlobalizedRD)
468 buildRecordForGlobalizedVars(IsInTTDRegion);
469 return GlobalizedRD;
470 }
471
472 /// Returns the field in the globalized record for the escaped variable.
473 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474 assert(GlobalizedRD &&
475 "Record for globalized variables must be generated already.");
476 return MappedDeclsFields.lookup(Val: VD);
477 }
478
479 /// Returns the list of the escaped local variables/parameters.
480 ArrayRef<const ValueDecl *> getEscapedDecls() const {
481 return EscapedDecls.getArrayRef();
482 }
483
484 /// Returns the set of escaped local variables that are actually parameters
485 /// passed by value.
486 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487 return EscapedParameters;
488 }
489
490 /// Returns the list of the escaped variables with the variably modified
491 /// types.
492 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493 return EscapedVariableLengthDecls.getArrayRef();
494 }
495
496 /// Returns the list of the delayed variables with the variably modified
497 /// types.
498 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499 return DelayedVariableLengthDecls.getArrayRef();
500 }
501};
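// For illustration only: a variable captured by reference in an enclosed
// construct (VisitCapturedStmt) or whose address is taken (VisitUnaryOperator
// with UO_AddrOf) is marked as escaped and later globalized instead of being
// kept in thread-private storage, e.g.
//
//   int x = 0;
//   #pragma omp parallel shared(x) // 'x' is captured by reference -> escapes
//   x += 1;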
502} // anonymous namespace
503
504CGOpenMPRuntimeGPU::ExecutionMode
505CGOpenMPRuntimeGPU::getExecutionMode() const {
506 return CurrentExecutionMode;
507}
508
509CGOpenMPRuntimeGPU::DataSharingMode
510CGOpenMPRuntimeGPU::getDataSharingMode() const {
511 return CurrentDataSharingMode;
512}
513
514/// Check for inner (nested) SPMD construct, if any
515static bool hasNestedSPMDDirective(ASTContext &Ctx,
516 const OMPExecutableDirective &D) {
517 const auto *CS = D.getInnermostCapturedStmt();
518 const auto *Body =
519 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
520 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
521
522 if (const auto *NestedDir =
523 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
524 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
525 switch (D.getDirectiveKind()) {
526 case OMPD_target:
527 if (isOpenMPParallelDirective(DKind))
528 return true;
529 if (DKind == OMPD_teams) {
530 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
531 /*IgnoreCaptured=*/true);
532 if (!Body)
533 return false;
534 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
535 if (const auto *NND =
536 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
537 DKind = NND->getDirectiveKind();
538 if (isOpenMPParallelDirective(DKind))
539 return true;
540 }
541 }
542 return false;
543 case OMPD_target_teams:
544 return isOpenMPParallelDirective(DKind);
545 case OMPD_target_simd:
546 case OMPD_target_parallel:
547 case OMPD_target_parallel_for:
548 case OMPD_target_parallel_for_simd:
549 case OMPD_target_teams_distribute:
550 case OMPD_target_teams_distribute_simd:
551 case OMPD_target_teams_distribute_parallel_for:
552 case OMPD_target_teams_distribute_parallel_for_simd:
553 case OMPD_parallel:
554 case OMPD_for:
555 case OMPD_parallel_for:
556 case OMPD_parallel_master:
557 case OMPD_parallel_sections:
558 case OMPD_for_simd:
559 case OMPD_parallel_for_simd:
560 case OMPD_cancel:
561 case OMPD_cancellation_point:
562 case OMPD_ordered:
563 case OMPD_threadprivate:
564 case OMPD_allocate:
565 case OMPD_task:
566 case OMPD_simd:
567 case OMPD_sections:
568 case OMPD_section:
569 case OMPD_single:
570 case OMPD_master:
571 case OMPD_critical:
572 case OMPD_taskyield:
573 case OMPD_barrier:
574 case OMPD_taskwait:
575 case OMPD_taskgroup:
576 case OMPD_atomic:
577 case OMPD_flush:
578 case OMPD_depobj:
579 case OMPD_scan:
580 case OMPD_teams:
581 case OMPD_target_data:
582 case OMPD_target_exit_data:
583 case OMPD_target_enter_data:
584 case OMPD_distribute:
585 case OMPD_distribute_simd:
586 case OMPD_distribute_parallel_for:
587 case OMPD_distribute_parallel_for_simd:
588 case OMPD_teams_distribute:
589 case OMPD_teams_distribute_simd:
590 case OMPD_teams_distribute_parallel_for:
591 case OMPD_teams_distribute_parallel_for_simd:
592 case OMPD_target_update:
593 case OMPD_declare_simd:
594 case OMPD_declare_variant:
595 case OMPD_begin_declare_variant:
596 case OMPD_end_declare_variant:
597 case OMPD_declare_target:
598 case OMPD_end_declare_target:
599 case OMPD_declare_reduction:
600 case OMPD_declare_mapper:
601 case OMPD_taskloop:
602 case OMPD_taskloop_simd:
603 case OMPD_master_taskloop:
604 case OMPD_master_taskloop_simd:
605 case OMPD_parallel_master_taskloop:
606 case OMPD_parallel_master_taskloop_simd:
607 case OMPD_requires:
608 case OMPD_unknown:
609 default:
610 llvm_unreachable("Unexpected directive.");
611 }
612 }
613
614 return false;
615}
616
617static bool supportsSPMDExecutionMode(ASTContext &Ctx,
618 const OMPExecutableDirective &D) {
619 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
620 switch (DirectiveKind) {
621 case OMPD_target:
622 case OMPD_target_teams:
623 return hasNestedSPMDDirective(Ctx, D);
624 case OMPD_target_parallel_loop:
625 case OMPD_target_parallel:
626 case OMPD_target_parallel_for:
627 case OMPD_target_parallel_for_simd:
628 case OMPD_target_teams_distribute_parallel_for:
629 case OMPD_target_teams_distribute_parallel_for_simd:
630 case OMPD_target_simd:
631 case OMPD_target_teams_distribute_simd:
632 return true;
633 case OMPD_target_teams_distribute:
634 return false;
635 case OMPD_target_teams_loop:
636 // Whether this is true or not depends on how the directive will
637 // eventually be emitted.
638 if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(Val: &D))
639 return TTLD->canBeParallelFor();
640 return false;
641 case OMPD_parallel:
642 case OMPD_for:
643 case OMPD_parallel_for:
644 case OMPD_parallel_master:
645 case OMPD_parallel_sections:
646 case OMPD_for_simd:
647 case OMPD_parallel_for_simd:
648 case OMPD_cancel:
649 case OMPD_cancellation_point:
650 case OMPD_ordered:
651 case OMPD_threadprivate:
652 case OMPD_allocate:
653 case OMPD_task:
654 case OMPD_simd:
655 case OMPD_sections:
656 case OMPD_section:
657 case OMPD_single:
658 case OMPD_master:
659 case OMPD_critical:
660 case OMPD_taskyield:
661 case OMPD_barrier:
662 case OMPD_taskwait:
663 case OMPD_taskgroup:
664 case OMPD_atomic:
665 case OMPD_flush:
666 case OMPD_depobj:
667 case OMPD_scan:
668 case OMPD_teams:
669 case OMPD_target_data:
670 case OMPD_target_exit_data:
671 case OMPD_target_enter_data:
672 case OMPD_distribute:
673 case OMPD_distribute_simd:
674 case OMPD_distribute_parallel_for:
675 case OMPD_distribute_parallel_for_simd:
676 case OMPD_teams_distribute:
677 case OMPD_teams_distribute_simd:
678 case OMPD_teams_distribute_parallel_for:
679 case OMPD_teams_distribute_parallel_for_simd:
680 case OMPD_target_update:
681 case OMPD_declare_simd:
682 case OMPD_declare_variant:
683 case OMPD_begin_declare_variant:
684 case OMPD_end_declare_variant:
685 case OMPD_declare_target:
686 case OMPD_end_declare_target:
687 case OMPD_declare_reduction:
688 case OMPD_declare_mapper:
689 case OMPD_taskloop:
690 case OMPD_taskloop_simd:
691 case OMPD_master_taskloop:
692 case OMPD_master_taskloop_simd:
693 case OMPD_parallel_master_taskloop:
694 case OMPD_parallel_master_taskloop_simd:
695 case OMPD_requires:
696 case OMPD_unknown:
697 default:
698 break;
699 }
700 llvm_unreachable(
701 "Unknown programming model for OpenMP directive on NVPTX target.");
702}
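// For illustration only, two source-level examples of the classification above:
//
//   #pragma omp target teams distribute parallel for // combined 'parallel'
//   for (int i = 0; i < N; ++i) {}                    //   -> SPMD mode
//
//   #pragma omp target teams distribute               // no nested 'parallel'
//   for (int i = 0; i < N; ++i) {}                    //   -> generic (non-SPMD)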
703
704void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
705 StringRef ParentName,
706 llvm::Function *&OutlinedFn,
707 llvm::Constant *&OutlinedFnID,
708 bool IsOffloadEntry,
709 const RegionCodeGenTy &CodeGen) {
710 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
711 EntryFunctionState EST;
712 WrapperFunctionsMap.clear();
713
714 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
715 assert(!IsBareKernel && "bare kernel should not be at generic mode");
716
717 // Emit target region as a standalone region.
718 class NVPTXPrePostActionTy : public PrePostActionTy {
719 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
720 const OMPExecutableDirective &D;
721
722 public:
723 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
724 const OMPExecutableDirective &D)
725 : EST(EST), D(D) {}
726 void Enter(CodeGenFunction &CGF) override {
727 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
728 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
729 // Skip target region initialization.
730 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
731 }
732 void Exit(CodeGenFunction &CGF) override {
733 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
734 RT.clearLocThreadIdInsertPt(CGF);
735 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
736 }
737 } Action(EST, D);
738 CodeGen.setAction(Action);
739 IsInTTDRegion = true;
740 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
741 IsOffloadEntry, CodeGen);
742 IsInTTDRegion = false;
743}
744
745void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
746 CodeGenFunction &CGF,
747 EntryFunctionState &EST, bool IsSPMD) {
748 llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs;
749 Attrs.ExecFlags =
750 IsSPMD ? llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD
751 : llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
752 computeMinAndMaxThreadsAndTeams(D, CGF, Attrs);
753
754 CGBuilderTy &Bld = CGF.Builder;
755 Bld.restoreIP(IP: OMPBuilder.createTargetInit(Loc: Bld, Attrs));
756 if (!IsSPMD)
757 emitGenericVarsProlog(CGF, Loc: EST.Loc);
758}
759
760void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
761 EntryFunctionState &EST,
762 bool IsSPMD) {
763 if (!IsSPMD)
764 emitGenericVarsEpilog(CGF);
765
766 // This is temporary until we remove the fixed-size buffer.
767 ASTContext &C = CGM.getContext();
768 RecordDecl *StaticRD = C.buildImplicitRecord(
769 Name: "_openmp_teams_reduction_type_$_", TK: RecordDecl::TagKind::Union);
770 StaticRD->startDefinition();
771 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
772 CanQualType RecTy = C.getCanonicalTagType(TD: TeamReductionRec);
773 auto *Field = FieldDecl::Create(
774 C, DC: StaticRD, StartLoc: SourceLocation(), IdLoc: SourceLocation(), Id: nullptr, T: RecTy,
775 TInfo: C.getTrivialTypeSourceInfo(T: RecTy, Loc: SourceLocation()),
776 /*BW=*/nullptr, /*Mutable=*/false,
777 /*InitStyle=*/ICIS_NoInit);
778 Field->setAccess(AS_public);
779 StaticRD->addDecl(D: Field);
780 }
781 StaticRD->completeDefinition();
782 CanQualType StaticTy = C.getCanonicalTagType(TD: StaticRD);
783 llvm::Type *LLVMReductionsBufferTy =
784 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
785 const auto &DL = CGM.getModule().getDataLayout();
786 uint64_t ReductionDataSize =
787 TeamsReductions.empty()
788 ? 0
789 : DL.getTypeAllocSize(Ty: LLVMReductionsBufferTy).getFixedValue();
790 CGBuilderTy &Bld = CGF.Builder;
791 OMPBuilder.createTargetDeinit(Loc: Bld, TeamsReductionDataSize: ReductionDataSize,
792 TeamsReductionBufferLength: C.getLangOpts().OpenMPCUDAReductionBufNum);
793 TeamsReductions.clear();
794}
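// For example (sizes illustrative): if two target regions perform team
// reductions over a 'double' and an 'int[8]', the union built above has an
// allocation size of max(sizeof(double), sizeof(int[8])) = 32 bytes, and that
// is the ReductionDataSize reported to createTargetDeinit.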
795
796void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
797 StringRef ParentName,
798 llvm::Function *&OutlinedFn,
799 llvm::Constant *&OutlinedFnID,
800 bool IsOffloadEntry,
801 const RegionCodeGenTy &CodeGen) {
802 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
803 EntryFunctionState EST;
804
805 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
806
807 // Emit target region as a standalone region.
808 class NVPTXPrePostActionTy : public PrePostActionTy {
809 CGOpenMPRuntimeGPU &RT;
810 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
811 bool IsBareKernel;
812 DataSharingMode Mode;
813 const OMPExecutableDirective &D;
814
815 public:
816 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
817 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
818 bool IsBareKernel, const OMPExecutableDirective &D)
819 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
820 Mode(RT.CurrentDataSharingMode), D(D) {}
821 void Enter(CodeGenFunction &CGF) override {
822 if (IsBareKernel) {
823 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
824 return;
825 }
826 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
827 // Skip target region initialization.
828 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
829 }
830 void Exit(CodeGenFunction &CGF) override {
831 if (IsBareKernel) {
832 RT.CurrentDataSharingMode = Mode;
833 return;
834 }
835 RT.clearLocThreadIdInsertPt(CGF);
836 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
837 }
838 } Action(*this, EST, IsBareKernel, D);
839 CodeGen.setAction(Action);
840 IsInTTDRegion = true;
841 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
842 IsOffloadEntry, CodeGen);
843 IsInTTDRegion = false;
844}
845
846void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
847 const OMPExecutableDirective &D, StringRef ParentName,
848 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
849 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
850 if (!IsOffloadEntry) // Nothing to do.
851 return;
852
853 assert(!ParentName.empty() && "Invalid target region parent name!");
854
855 bool Mode = supportsSPMDExecutionMode(Ctx&: CGM.getContext(), D);
856 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
857 if (Mode || IsBareKernel)
858 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
859 CodeGen);
860 else
861 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
862 CodeGen);
863}
864
865CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
866 : CGOpenMPRuntime(CGM) {
867 llvm::OpenMPIRBuilderConfig Config(
868 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
869 CGM.getLangOpts().OpenMPOffloadMandatory,
870 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
871 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
872 Config.setDefaultTargetAS(
873 CGM.getContext().getTargetInfo().getTargetAddressSpace(AS: LangAS::Default));
874 Config.setRuntimeCC(CGM.getRuntimeCC());
875
876 OMPBuilder.setConfig(Config);
877
878 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
879 llvm_unreachable("OpenMP can only handle device code.");
880
881 if (CGM.getLangOpts().OpenMPCUDAMode)
882 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
883
884 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
885 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
886 return;
887
888 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTargetDebug,
889 Name: "__omp_rtl_debug_kind");
890 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTeamSubscription,
891 Name: "__omp_rtl_assume_teams_oversubscription");
892 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPThreadSubscription,
893 Name: "__omp_rtl_assume_threads_oversubscription");
894 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoThreadState,
895 Name: "__omp_rtl_assume_no_thread_state");
896 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoNestedParallelism,
897 Name: "__omp_rtl_assume_no_nested_parallelism");
898}
899
900void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
901 ProcBindKind ProcBind,
902 SourceLocation Loc) {
903 // Nothing to do.
904}
905
906llvm::Value *CGOpenMPRuntimeGPU::emitMessageClause(CodeGenFunction &CGF,
907 const Expr *Message,
908 SourceLocation Loc) {
909 CGM.getDiags().Report(Loc, DiagID: diag::warn_omp_gpu_unsupported_clause)
910 << getOpenMPClauseName(C: OMPC_message);
911 return nullptr;
912}
913
914llvm::Value *
915CGOpenMPRuntimeGPU::emitSeverityClause(OpenMPSeverityClauseKind Severity,
916 SourceLocation Loc) {
917 CGM.getDiags().Report(Loc, DiagID: diag::warn_omp_gpu_unsupported_clause)
918 << getOpenMPClauseName(C: OMPC_severity);
919 return nullptr;
920}
921
922void CGOpenMPRuntimeGPU::emitNumThreadsClause(
923 CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc,
924 OpenMPNumThreadsClauseModifier Modifier, OpenMPSeverityClauseKind Severity,
925 SourceLocation SeverityLoc, const Expr *Message,
926 SourceLocation MessageLoc) {
927 if (Modifier == OMPC_NUMTHREADS_strict) {
928 CGM.getDiags().Report(Loc,
929 DiagID: diag::warn_omp_gpu_unsupported_modifier_for_clause)
930 << "strict" << getOpenMPClauseName(C: OMPC_num_threads);
931 return;
932 }
933
934 // Nothing to do.
935}
936
937void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
938 const Expr *NumTeams,
939 const Expr *ThreadLimit,
940 SourceLocation Loc) {}
941
942llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
943 CodeGenFunction &CGF, const OMPExecutableDirective &D,
944 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
945 const RegionCodeGenTy &CodeGen) {
946 // Emit target region as a standalone region.
947 bool PrevIsInTTDRegion = IsInTTDRegion;
948 IsInTTDRegion = false;
949 auto *OutlinedFun =
950 cast<llvm::Function>(Val: CGOpenMPRuntime::emitParallelOutlinedFunction(
951 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
952 IsInTTDRegion = PrevIsInTTDRegion;
953 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
954 llvm::Function *WrapperFun =
955 createParallelDataSharingWrapper(OutlinedParallelFn: OutlinedFun, D);
956 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
957 }
958
959 return OutlinedFun;
960}
961
962/// Get list of lastprivate variables from the teams distribute ... or
963/// teams {distribute ...} directives.
964static void
965getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
966 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
967 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
968 "expected teams directive.");
969 const OMPExecutableDirective *Dir = &D;
970 if (!isOpenMPDistributeDirective(DKind: D.getDirectiveKind())) {
971 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
972 Ctx,
973 Body: D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
974 /*IgnoreCaptured=*/true))) {
975 Dir = dyn_cast_or_null<OMPExecutableDirective>(Val: S);
976 if (Dir && !isOpenMPDistributeDirective(DKind: Dir->getDirectiveKind()))
977 Dir = nullptr;
978 }
979 }
980 if (!Dir)
981 return;
982 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
983 for (const Expr *E : C->getVarRefs())
984 Vars.push_back(Elt: getPrivateItem(RefExpr: E));
985 }
986}
987
988/// Get list of reduction variables from the teams ... directives.
989static void
990getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
991 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
992 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
993 "expected teams directive.");
994 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
995 for (const Expr *E : C->privates())
996 Vars.push_back(Elt: getPrivateItem(RefExpr: E));
997 }
998}
999
1000llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
1001 CodeGenFunction &CGF, const OMPExecutableDirective &D,
1002 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
1003 const RegionCodeGenTy &CodeGen) {
1004 SourceLocation Loc = D.getBeginLoc();
1005
1006 const RecordDecl *GlobalizedRD = nullptr;
1007 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1008 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1009 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1010 // Globalize team reductions variable unconditionally in all modes.
1011 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1012 getTeamsReductionVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
1013 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1014 getDistributeLastprivateVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
1015 if (!LastPrivatesReductions.empty()) {
1016 GlobalizedRD = ::buildRecordForGlobalizedVars(
1017 C&: CGM.getContext(), EscapedDecls: {}, EscapedDeclsForTeams: LastPrivatesReductions, MappedDeclsFields,
1018 BufSize: WarpSize);
1019 }
1020 } else if (!LastPrivatesReductions.empty()) {
1021 assert(!TeamAndReductions.first &&
1022 "Previous team declaration is not expected.");
1023 TeamAndReductions.first = D.getCapturedStmt(RegionKind: OMPD_teams)->getCapturedDecl();
1024 std::swap(LHS&: TeamAndReductions.second, RHS&: LastPrivatesReductions);
1025 }
1026
1027 // Emit target region as a standalone region.
1028 class NVPTXPrePostActionTy : public PrePostActionTy {
1029 SourceLocation &Loc;
1030 const RecordDecl *GlobalizedRD;
1031 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1032 &MappedDeclsFields;
1033
1034 public:
1035 NVPTXPrePostActionTy(
1036 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1037 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1038 &MappedDeclsFields)
1039 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1040 MappedDeclsFields(MappedDeclsFields) {}
1041 void Enter(CodeGenFunction &CGF) override {
1042 auto &Rt =
1043 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1044 if (GlobalizedRD) {
1045 auto I = Rt.FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
1046 I->getSecond().MappedParams =
1047 std::make_unique<CodeGenFunction::OMPMapVars>();
1048 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1049 for (const auto &Pair : MappedDeclsFields) {
1050 assert(Pair.getFirst()->isCanonicalDecl() &&
1051 "Expected canonical declaration");
1052 Data.try_emplace(Key: Pair.getFirst());
1053 }
1054 }
1055 Rt.emitGenericVarsProlog(CGF, Loc);
1056 }
1057 void Exit(CodeGenFunction &CGF) override {
1058 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1059 .emitGenericVarsEpilog(CGF);
1060 }
1061 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1062 CodeGen.setAction(Action);
1063 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1064 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1065
1066 return OutlinedFun;
1067}
1068
1069void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1070 SourceLocation Loc) {
1071 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1072 return;
1073
1074 CGBuilderTy &Bld = CGF.Builder;
1075
1076 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1077 if (I == FunctionGlobalizedDecls.end())
1078 return;
1079
1080 for (auto &Rec : I->getSecond().LocalVarData) {
1081 const auto *VD = cast<VarDecl>(Val: Rec.first);
1082 bool EscapedParam = I->getSecond().EscapedParameters.count(Ptr: Rec.first);
1083 QualType VarTy = VD->getType();
1084
1085 // Get the local allocation of a firstprivate variable before sharing
1086 llvm::Value *ParValue;
1087 if (EscapedParam) {
1088 LValue ParLVal =
1089 CGF.MakeAddrLValue(Addr: CGF.GetAddrOfLocalVar(VD), T: VD->getType());
1090 ParValue = CGF.EmitLoadOfScalar(lvalue: ParLVal, Loc);
1091 }
1092
1093 // Allocate space for the variable to be globalized
1094 llvm::Value *AllocArgs[] = {CGF.getTypeSize(Ty: VD->getType())};
1095 llvm::CallBase *VoidPtr =
1096 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1097 M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared),
1098 args: AllocArgs, name: VD->getName());
1099 // FIXME: We should use the variable's actual alignment as an argument.
1100 VoidPtr->addRetAttr(Attr: llvm::Attribute::get(
1101 Context&: CGM.getLLVMContext(), Kind: llvm::Attribute::Alignment,
1102 Val: CGM.getContext().getTargetInfo().getNewAlign() / 8));
1103
1104 // Cast the void pointer and get the address of the globalized variable.
1105 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1106 V: VoidPtr, DestTy: Bld.getPtrTy(AddrSpace: 0), Name: VD->getName() + "_on_stack");
1107 LValue VarAddr =
1108 CGF.MakeNaturalAlignPointeeRawAddrLValue(V: CastedVoidPtr, T: VarTy);
1109 Rec.second.PrivateAddr = VarAddr.getAddress();
1110 Rec.second.GlobalizedVal = VoidPtr;
1111
1112 // Assign the local allocation to the newly globalized location.
1113 if (EscapedParam) {
1114 CGF.EmitStoreOfScalar(value: ParValue, lvalue: VarAddr);
1115 I->getSecond().MappedParams->setVarAddr(CGF, LocalVD: VD, TempAddr: VarAddr.getAddress());
1116 }
1117 if (auto *DI = CGF.getDebugInfo())
1118 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(Loc: VD->getLocation()));
1119 }
1120
1121 for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
1122 const auto *VD = cast<VarDecl>(Val: ValueD);
1123 std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
1124 getKmpcAllocShared(CGF, VD);
1125 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(Args&: AddrSizePair);
1126 LValue Base = CGF.MakeAddrLValue(V: AddrSizePair.first, T: VD->getType(),
1127 Alignment: CGM.getContext().getDeclAlign(D: VD),
1128 Source: AlignmentSource::Decl);
1129 I->getSecond().MappedParams->setVarAddr(CGF, LocalVD: VD, TempAddr: Base.getAddress());
1130 }
1131 I->getSecond().MappedParams->apply(CGF);
1132}
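// A rough sketch of what the prolog above produces for a globalized 'int x'
// (illustrative, not the exact IR):
//
//   %x = call align 8 ptr @__kmpc_alloc_shared(i64 4)
//   ; escaped by-value parameters additionally copy their incoming value
//   ; into the new location before the parameter mapping is applied.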
1133
1134bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
1135 const VarDecl *VD) const {
1136 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1137 if (I == FunctionGlobalizedDecls.end())
1138 return false;
1139
1140 // Check whether the variable declaration is delayed:
1141 return llvm::is_contained(Range: I->getSecond().DelayedVariableLengthDecls, Element: VD);
1142}
1143
1144std::pair<llvm::Value *, llvm::Value *>
1145CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
1146 const VarDecl *VD) {
1147 CGBuilderTy &Bld = CGF.Builder;
1148
1149 // Compute size and alignment.
1150 llvm::Value *Size = CGF.getTypeSize(Ty: VD->getType());
1151 CharUnits Align = CGM.getContext().getDeclAlign(D: VD);
1152 Size = Bld.CreateNUWAdd(
1153 LHS: Size, RHS: llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity() - 1));
1154 llvm::Value *AlignVal =
1155 llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity());
1156 Size = Bld.CreateUDiv(LHS: Size, RHS: AlignVal);
1157 Size = Bld.CreateNUWMul(LHS: Size, RHS: AlignVal);
1158
1159 // Allocate space for this VLA object to be globalized.
1160 llvm::Value *AllocArgs[] = {Size};
1161 llvm::CallBase *VoidPtr =
1162 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1163 M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared),
1164 args: AllocArgs, name: VD->getName());
1165 VoidPtr->addRetAttr(Attr: llvm::Attribute::get(
1166 Context&: CGM.getLLVMContext(), Kind: llvm::Attribute::Alignment, Val: Align.getQuantity()));
1167
1168 return std::make_pair(x&: VoidPtr, y&: Size);
1169}
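// Worked example of the size rounding above (illustrative numbers): for a VLA
// occupying 10 bytes with a declared alignment of 8, Size becomes
// ((10 + 7) / 8) * 8 = 16, i.e. the allocation is padded up to a multiple of
// the alignment before being passed to __kmpc_alloc_shared.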
1170
1171void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1172 CodeGenFunction &CGF,
1173 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1174 // Deallocate the memory for each globalized VLA object
1175 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1176 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1177 args: {AddrSizePair.first, AddrSizePair.second});
1178}
1179
1180void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1181 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1182 return;
1183
1184 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1185 if (I != FunctionGlobalizedDecls.end()) {
1186 // Deallocate the memory for each globalized VLA object that was
1187 // globalized in the prolog (i.e. emitGenericVarsProlog).
1188 for (const auto &AddrSizePair :
1189 llvm::reverse(C&: I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1190 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1191 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1192 args: {AddrSizePair.first, AddrSizePair.second});
1193 }
1194 // Deallocate the memory for each globalized value
1195 for (auto &Rec : llvm::reverse(C&: I->getSecond().LocalVarData)) {
1196 const auto *VD = cast<VarDecl>(Val: Rec.first);
1197 I->getSecond().MappedParams->restore(CGF);
1198
1199 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1200 CGF.getTypeSize(Ty: VD->getType())};
1201 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1202 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1203 args: FreeArgs);
1204 }
1205 }
1206}
1207
1208void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1209 const OMPExecutableDirective &D,
1210 SourceLocation Loc,
1211 llvm::Function *OutlinedFn,
1212 ArrayRef<llvm::Value *> CapturedVars) {
1213 if (!CGF.HaveInsertPoint())
1214 return;
1215
1216 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1217
1218 RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
1219 /*Name=*/".zero.addr");
1220 CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
1221 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1222 // We don't emit any thread id function call in a bare kernel, but because the
1223 // outlined function has a pointer argument, we emit a nullptr here.
1224 if (IsBareKernel)
1225 OutlinedFnArgs.push_back(Elt: llvm::ConstantPointerNull::get(T: CGM.VoidPtrTy));
1226 else
1227 OutlinedFnArgs.push_back(Elt: emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1228 OutlinedFnArgs.push_back(Elt: ZeroAddr.getPointer());
1229 OutlinedFnArgs.append(in_start: CapturedVars.begin(), in_end: CapturedVars.end());
1230 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: OutlinedFnArgs);
1231}
1232
1233void CGOpenMPRuntimeGPU::emitParallelCall(
1234 CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
1235 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond,
1236 llvm::Value *NumThreads, OpenMPNumThreadsClauseModifier NumThreadsModifier,
1237 OpenMPSeverityClauseKind Severity, const Expr *Message) {
1238 if (!CGF.HaveInsertPoint())
1239 return;
1240
1241 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1242 NumThreads](CodeGenFunction &CGF,
1243 PrePostActionTy &Action) {
1244 CGBuilderTy &Bld = CGF.Builder;
1245 llvm::Value *NumThreadsVal = NumThreads;
1246 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1247 llvm::PointerType *FnPtrTy = llvm::PointerType::get(
1248 C&: CGF.getLLVMContext(), AddressSpace: CGM.getDataLayout().getProgramAddressSpace());
1249
1250 llvm::Value *ID = llvm::ConstantPointerNull::get(T: FnPtrTy);
1251 if (WFn)
1252 ID = Bld.CreateBitOrPointerCast(V: WFn, DestTy: FnPtrTy);
1253
1254 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(V: OutlinedFn, DestTy: FnPtrTy);
1255
1256 // Create a private scope that will globalize the arguments
1257 // passed from the outside of the target region.
1258 // TODO: Is that needed?
1259 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1260
1261 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1262 Ty: llvm::ArrayType::get(ElementType: CGM.VoidPtrTy, NumElements: CapturedVars.size()),
1263 Name: "captured_vars_addrs");
1264 // There's something to share.
1265 if (!CapturedVars.empty()) {
1266 // Prepare for parallel region. Indicate the outlined function.
1267 ASTContext &Ctx = CGF.getContext();
1268 unsigned Idx = 0;
1269 for (llvm::Value *V : CapturedVars) {
1270 Address Dst = Bld.CreateConstArrayGEP(Addr: CapturedVarsAddrs, Index: Idx);
1271 llvm::Value *PtrV;
1272 if (V->getType()->isIntegerTy())
1273 PtrV = Bld.CreateIntToPtr(V, DestTy: CGF.VoidPtrTy);
1274 else
1275 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, DestTy: CGF.VoidPtrTy);
1276 CGF.EmitStoreOfScalar(Value: PtrV, Addr: Dst, /*Volatile=*/false,
1277 Ty: Ctx.getPointerType(T: Ctx.VoidPtrTy));
1278 ++Idx;
1279 }
1280 }
1281
1282 llvm::Value *IfCondVal = nullptr;
1283 if (IfCond)
1284 IfCondVal = Bld.CreateIntCast(V: CGF.EvaluateExprAsBool(E: IfCond), DestTy: CGF.Int32Ty,
1285 /* isSigned */ false);
1286 else
1287 IfCondVal = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: 1);
1288
1289 if (!NumThreadsVal)
1290 NumThreadsVal = llvm::ConstantInt::getAllOnesValue(Ty: CGF.Int32Ty);
1291 else
1292 NumThreadsVal = Bld.CreateZExtOrTrunc(V: NumThreadsVal, DestTy: CGF.Int32Ty);
1293
1294 // No strict prescriptiveness for the number of threads.
1295 llvm::Value *StrictNumThreadsVal = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: 0);
1296
1297 assert(IfCondVal && "Expected a value");
1298 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1299 llvm::Value *Args[] = {
1300 RTLoc,
1301 getThreadID(CGF, Loc),
1302 IfCondVal,
1303 NumThreadsVal,
1304 llvm::ConstantInt::getAllOnesValue(Ty: CGF.Int32Ty),
1305 FnPtr,
1306 ID,
1307 Bld.CreateBitOrPointerCast(V: CapturedVarsAddrs.emitRawPointer(CGF),
1308 DestTy: CGF.VoidPtrPtrTy),
1309 llvm::ConstantInt::get(Ty: CGM.SizeTy, V: CapturedVars.size()),
1310 StrictNumThreadsVal};
1311
1312 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1313 M&: CGM.getModule(), FnID: OMPRTL___kmpc_parallel_60),
1314 args: Args);
1315 };
1316
1317 RegionCodeGenTy RCG(ParallelGen);
1318 RCG(CGF);
1319}
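// For reference, the Args array above lines up with the __kmpc_parallel_60
// entry point roughly as follows (names are illustrative): source location,
// thread id, if-condition, number of threads, proc-bind (-1, i.e. unset),
// outlined function, wrapper function id, captured-variable array, number of
// captured variables, and the strict num_threads flag (0 here).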
1320
1321void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1322 // Always emit simple barriers!
1323 if (!CGF.HaveInsertPoint())
1324 return;
1325 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1326 // This function does not use parameters, so we can emit just default values.
1327 llvm::Value *Args[] = {
1328 llvm::ConstantPointerNull::get(
1329 T: cast<llvm::PointerType>(Val: getIdentTyPointerTy())),
1330 llvm::ConstantInt::get(Ty: CGF.Int32Ty, /*V=*/0, /*isSigned=*/IsSigned: true)};
1331 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1332 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier_simple_spmd),
1333 args: Args);
1334}
1335
1336void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1337 SourceLocation Loc,
1338 OpenMPDirectiveKind Kind, bool,
1339 bool) {
1340 // Always emit simple barriers!
1341 if (!CGF.HaveInsertPoint())
1342 return;
1343 // Build call __kmpc_cancel_barrier(loc, thread_id);
1344 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1345 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1346 getThreadID(CGF, Loc)};
1347
1348 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1349 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier),
1350 args: Args);
1351}
1352
1353void CGOpenMPRuntimeGPU::emitCriticalRegion(
1354 CodeGenFunction &CGF, StringRef CriticalName,
1355 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1356 const Expr *Hint) {
1357 llvm::BasicBlock *LoopBB = CGF.createBasicBlock(name: "omp.critical.loop");
1358 llvm::BasicBlock *TestBB = CGF.createBasicBlock(name: "omp.critical.test");
1359 llvm::BasicBlock *SyncBB = CGF.createBasicBlock(name: "omp.critical.sync");
1360 llvm::BasicBlock *BodyBB = CGF.createBasicBlock(name: "omp.critical.body");
1361 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: "omp.critical.exit");
1362
1363 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1364
1365 // Get the mask of active threads in the warp.
1366 llvm::Value *Mask = CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1367 M&: CGM.getModule(), FnID: OMPRTL___kmpc_warp_active_thread_mask));
1368 // Fetch team-local id of the thread.
1369 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1370
1371 // Get the width of the team.
1372 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1373
1374 // Initialize the counter variable for the loop.
1375 QualType Int32Ty =
1376 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1377 Address Counter = CGF.CreateMemTemp(T: Int32Ty, Name: "critical_counter");
1378 LValue CounterLVal = CGF.MakeAddrLValue(Addr: Counter, T: Int32Ty);
1379 CGF.EmitStoreOfScalar(value: llvm::Constant::getNullValue(Ty: CGM.Int32Ty), lvalue: CounterLVal,
1380 /*isInit=*/true);
1381
1382 // Block checks if loop counter exceeds upper bound.
1383 CGF.EmitBlock(BB: LoopBB);
1384 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
1385 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(LHS: CounterVal, RHS: TeamWidth);
1386 CGF.Builder.CreateCondBr(Cond: CmpLoopBound, True: TestBB, False: ExitBB);
1387
1388 // Block tests which single thread should execute region, and which threads
1389 // should go straight to synchronisation point.
1390 CGF.EmitBlock(BB: TestBB);
1391 CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
1392 llvm::Value *CmpThreadToCounter =
1393 CGF.Builder.CreateICmpEQ(LHS: ThreadID, RHS: CounterVal);
1394 CGF.Builder.CreateCondBr(Cond: CmpThreadToCounter, True: BodyBB, False: SyncBB);
1395
1396 // Block emits the body of the critical region.
1397 CGF.EmitBlock(BB: BodyBB);
1398
1399 // Output the critical statement.
1400 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1401 Hint);
1402
1403 // After the body surrounded by the critical region, the single executing
1404 // thread will jump to the synchronisation point.
1405 // Block waits for all threads in the current team to finish, then increments the
1406 // counter variable and returns to the loop.
1407 CGF.EmitBlock(BB: SyncBB);
1408 // Reconverge active threads in the warp.
1409 (void)CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1410 M&: CGM.getModule(), FnID: OMPRTL___kmpc_syncwarp),
1411 args: Mask);
1412
1413 llvm::Value *IncCounterVal =
1414 CGF.Builder.CreateNSWAdd(LHS: CounterVal, RHS: CGF.Builder.getInt32(C: 1));
1415 CGF.EmitStoreOfScalar(value: IncCounterVal, lvalue: CounterLVal);
1416 CGF.EmitBranch(Block: LoopBB);
1417
1418 // Block that is reached when all threads in the team complete the region.
1419 CGF.EmitBlock(BB: ExitBB, /*IsFinished=*/true);
1420}
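// Pseudocode of the sequentialized critical region emitted above
// (illustrative):
//
//   for (counter = 0; counter < num_threads_in_team; ++counter) {
//     if (thread_id == counter)
//       <critical body>;
//     __kmpc_syncwarp(mask);
//   }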
1421
1422/// Cast value to the specified type.
1423static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1424 QualType ValTy, QualType CastTy,
1425 SourceLocation Loc) {
1426 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1427 "Cast type must be sized.");
1428 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1429 "Val type must be sized.");
1430 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(T: CastTy);
1431 if (ValTy == CastTy)
1432 return Val;
1433 if (CGF.getContext().getTypeSizeInChars(T: ValTy) ==
1434 CGF.getContext().getTypeSizeInChars(T: CastTy))
1435 return CGF.Builder.CreateBitCast(V: Val, DestTy: LLVMCastTy);
1436 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1437 return CGF.Builder.CreateIntCast(V: Val, DestTy: LLVMCastTy,
1438 isSigned: CastTy->hasSignedIntegerRepresentation());
1439 Address CastItem = CGF.CreateMemTemp(T: CastTy);
1440 Address ValCastItem = CastItem.withElementType(ElemTy: Val->getType());
1441 CGF.EmitStoreOfScalar(Value: Val, Addr: ValCastItem, /*Volatile=*/false, Ty: ValTy,
1442 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1443 TBAAInfo: TBAAAccessInfo());
1444 return CGF.EmitLoadOfScalar(Addr: CastItem, /*Volatile=*/false, Ty: CastTy, Loc,
1445 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1446 TBAAInfo: TBAAAccessInfo());
1447}
1448
1449///
1450/// Design of OpenMP reductions on the GPU
1451///
1452/// Consider a typical OpenMP program with one or more reduction
1453/// clauses:
1454///
1455/// float foo;
1456/// double bar;
1457/// #pragma omp target teams distribute parallel for \
1458/// reduction(+:foo) reduction(*:bar)
1459/// for (int i = 0; i < N; i++) {
1460/// foo += A[i]; bar *= B[i];
1461/// }
1462///
1463/// where 'foo' and 'bar' are reduced across all OpenMP threads in
1464/// all teams. In our OpenMP implementation on the NVPTX device an
1465/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1466/// within a team are mapped to CUDA threads within a threadblock.
1467/// Our goal is to efficiently aggregate values across all OpenMP
1468/// threads such that:
1469///
1470/// - the compiler and runtime are logically concise, and
1471/// - the reduction is performed efficiently in a hierarchical
1472/// manner as follows: within OpenMP threads in the same warp,
1473/// across warps in a threadblock, and finally across teams on
1474/// the NVPTX device.
1475///
1476/// Introduction to Decoupling
1477///
1478/// We would like to decouple the compiler and the runtime so that the
1479/// latter is ignorant of the reduction variables (number, data types)
1480/// and the reduction operators. This allows a simpler interface
1481/// and implementation while still attaining good performance.
1482///
1483/// Pseudocode for the aforementioned OpenMP program generated by the
1484/// compiler is as follows:
1485///
1486/// 1. Create private copies of reduction variables on each OpenMP
1487/// thread: 'foo_private', 'bar_private'
1488/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1489/// to it and writes the result in 'foo_private' and 'bar_private'
1490/// respectively.
1491/// 3. Call the OpenMP runtime on the GPU to reduce within a team
1492/// and store the result on the team master:
1493///
1494/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1495/// reduceData, shuffleReduceFn, interWarpCpyFn)
1496///
1497/// where:
1498/// struct ReduceData {
1499/// double *foo;
1500/// double *bar;
1501/// } reduceData
1502/// reduceData.foo = &foo_private
1503/// reduceData.bar = &bar_private
1504///
1505/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1506/// auxiliary functions generated by the compiler that operate on
/// variables of type 'ReduceData'. They help the runtime perform
/// algorithmic steps in a data-agnostic manner.
1509///
1510/// 'shuffleReduceFn' is a pointer to a function that reduces data
1511/// of type 'ReduceData' across two OpenMP threads (lanes) in the
1512/// same warp. It takes the following arguments as input:
1513///
1514/// a. variable of type 'ReduceData' on the calling lane,
1515/// b. its lane_id,
1516/// c. an offset relative to the current lane_id to generate a
1517/// remote_lane_id. The remote lane contains the second
1518/// variable of type 'ReduceData' that is to be reduced.
1519/// d. an algorithm version parameter determining which reduction
1520/// algorithm to use.
1521///
1522/// 'shuffleReduceFn' retrieves data from the remote lane using
1523/// efficient GPU shuffle intrinsics and reduces, using the
1524/// algorithm specified by the 4th parameter, the two operands
1525/// element-wise. The result is written to the first operand.
1526///
1527/// Different reduction algorithms are implemented in different
1528/// runtime functions, all calling 'shuffleReduceFn' to perform
1529/// the essential reduction step. Therefore, based on the 4th
1530/// parameter, this function behaves slightly differently to
1531/// cooperate with the runtime to ensure correctness under
1532/// different circumstances.
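///
///    As an illustrative sketch (the exact helper is generated by the
///    compiler; the lane and offset arguments are 16-bit integers), the
///    shape of 'shuffleReduceFn' is roughly:
///
///      void shuffleReduceFn(void *reduce_data, int16_t lane_id,
///                           int16_t remote_lane_offset, int16_t algo_version);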
1533///
/// 'interWarpCpyFn' is a pointer to a function that transfers
1535/// reduced variables across warps. It tunnels, through CUDA
1536/// shared memory, the thread-private data of type 'ReduceData'
1537/// from lane 0 of each warp to a lane in the first warp.
1538/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1539/// The last team writes the global reduced value to memory.
1540///
1541/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1542/// reduceData, shuffleReduceFn, interWarpCpyFn,
1543/// scratchpadCopyFn, loadAndReduceFn)
1544///
1545/// 'scratchpadCopyFn' is a helper that stores reduced
1546/// data from the team master to a scratchpad array in
1547/// global memory.
1548///
1549/// 'loadAndReduceFn' is a helper that loads data from
1550/// the scratchpad array and reduces it with the input
1551/// operand.
1552///
/// These compiler-generated functions hide address
1554/// calculation and alignment information from the runtime.
1555/// 5. if ret == 1:
1556/// The team master of the last team stores the reduced
1557/// result to the globals in memory.
1558/// foo += reduceData.foo; bar *= reduceData.bar
1559///
1560///
1561/// Warp Reduction Algorithms
1562///
1563/// On the warp level, we have three algorithms implemented in the
1564/// OpenMP runtime depending on the number of active lanes:
1565///
1566/// Full Warp Reduction
1567///
1568/// The reduce algorithm within a warp where all lanes are active
1569/// is implemented in the runtime as follows:
1570///
1571/// full_warp_reduce(void *reduce_data,
1572/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1573/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1574/// ShuffleReduceFn(reduce_data, 0, offset, 0);
1575/// }
1576///
/// The algorithm completes in log2(WARPSIZE) steps.
1578///
/// 'ShuffleReduceFn' is called here with lane_id set to 0 because the
/// lane_id is not used; this saves the instructions needed to retrieve it
/// from the corresponding special registers. The 4th parameter, which
1582/// represents the version of the algorithm being used, is set to 0 to
1583/// signify full warp reduction.
1584///
1585/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1586///
1587/// #reduce_elem refers to an element in the local lane's data structure
1588/// #remote_elem is retrieved from a remote lane
1589/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1590/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
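///
///    As an illustrative note (a sketch, not the literal runtime code), the
///    per-element 'shuffle_down' of a 32-bit value is realized through the
///    runtime shuffle helper wrapping the hardware warp-shuffle intrinsic:
///
///      remote_elem = __kmpc_shuffle_int32(reduce_elem, offset, WARPSIZE);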
1591///
1592/// Contiguous Partial Warp Reduction
1593///
1594/// This reduce algorithm is used within a warp where only the first
1595/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1596/// number of OpenMP threads in a parallel region is not a multiple of
1597/// WARPSIZE. The algorithm is implemented in the runtime as follows:
1598///
1599/// void
1600/// contiguous_partial_reduce(void *reduce_data,
1601/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1602/// int size, int lane_id) {
1603/// int curr_size;
1604/// int offset;
1605/// curr_size = size;
///     offset = curr_size/2;
///     while (offset > 0) {
1608/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1609/// curr_size = (curr_size+1)/2;
1610/// offset = curr_size/2;
1611/// }
1612/// }
1613///
1614/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1615///
1616/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1617/// if (lane_id < offset)
1618/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1619/// else
1620/// reduce_elem = remote_elem
1621///
1622/// This algorithm assumes that the data to be reduced are located in a
1623/// contiguous subset of lanes starting from the first. When there is
1624/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
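///
/// For example, tracing the pseudocode above with size == 5, the loop calls
/// 'ShuffleReduceFn' with offsets 2, 1 and 1, after which lane 0 holds the
/// reduction over all five lanes.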
1626///
1627/// Dispersed Partial Warp Reduction
1628///
1629/// This algorithm is used within a warp when any discontiguous subset of
1630/// lanes are active. It is used to implement the reduction operation
1631/// across lanes in an OpenMP simd region or in a nested parallel region.
1632///
1633/// void
1634/// dispersed_partial_reduce(void *reduce_data,
1635/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1636/// int size, remote_id;
1637/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1638/// do {
1639/// remote_id = next_active_lane_id_right_after_me();
///       # the above function returns 0 if no active lane
1641/// # is present right after the current lane.
1642/// size = number_of_active_lanes_in_this_warp();
1643/// logical_lane_id /= 2;
1644/// ShuffleReduceFn(reduce_data, logical_lane_id,
1645/// remote_id-1-threadIdx.x, 2);
1646/// } while (logical_lane_id % 2 == 0 && size > 1);
1647/// }
1648///
1649/// There is no assumption made about the initial state of the reduction.
1650/// Any number of lanes (>=1) could be active at any position. The reduction
1651/// result is returned in the first active lane.
1652///
1653/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1654///
1655/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1656/// if (lane_id % 2 == 0 && offset > 0)
1657/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1658/// else
1659/// reduce_elem = remote_elem
1660///
1661///
1662/// Intra-Team Reduction
1663///
1664/// This function, as implemented in the runtime call
1665/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1666/// threads in a team. It first reduces within a warp using the
1667/// aforementioned algorithms. We then proceed to gather all such
1668/// reduced values at the first warp.
1669///
/// The runtime makes use of the function 'interWarpCpyFn', which copies
/// data from each of the "warp masters" (the zeroth lane of each warp, where
1672/// warp-reduced data is held) to the zeroth warp. This step reduces (in
1673/// a mathematical sense) the problem of reduction across warp masters in
1674/// a block to the problem of warp reduction.
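///
/// A rough sketch of this gather step (illustrative only; the real copy is
/// performed element-wise by the compiler-generated 'interWarpCpyFn' through
/// a shared-memory transfer medium):
///
///   if (lane_id == 0)                        // warp master
///     transfer_medium[warp_id] = my_reduced_elem;
///   barrier();
///   if (warp_id == 0)
///     my_reduced_elem = transfer_medium[lane_id];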
1675///
1676///
1677/// Inter-Team Reduction
1678///
1679/// Once a team has reduced its data to a single value, it is stored in
1680/// a global scratchpad array. Since each team has a distinct slot, this
1681/// can be done without locking.
1682///
1683/// The last team to write to the scratchpad array proceeds to reduce the
1684/// scratchpad array. One or more workers in the last team use the helper
/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
1686/// the k'th worker reduces every k'th element.
1687///
1688/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1689/// reduce across workers and compute a globally reduced value.
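///
/// Conceptually (an illustrative sketch, not the literal runtime code):
///
///   scratchpad[team_id] = team_reduced_value;       // via scratchpadCopyFn
///   if (this team is the last to finish) {
///     for (k = thread_id; k < num_teams; k += num_threads)
///       loadAndReduceFn(reduceData, scratchpad, k);  // k'th worker reduces
///                                                    // every k'th element
///     __kmpc_nvptx_parallel_reduce_nowait_v2(...);   // reduce across workers
///   }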
1690///
1691void CGOpenMPRuntimeGPU::emitReduction(
1692 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
1693 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
1694 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
1695 if (!CGF.HaveInsertPoint())
1696 return;
1697
1698 bool ParallelReduction = isOpenMPParallelDirective(DKind: Options.ReductionKind);
1699 bool TeamsReduction = isOpenMPTeamsDirective(DKind: Options.ReductionKind);
1700
1701 ASTContext &C = CGM.getContext();
1702
1703 if (Options.SimpleReduction) {
1704 assert(!TeamsReduction && !ParallelReduction &&
1705 "Invalid reduction selection in emitReduction.");
1706 (void)ParallelReduction;
1707 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
1708 ReductionOps, Options);
1709 return;
1710 }
1711
1712 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
1713 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
1714 int Cnt = 0;
1715 for (const Expr *DRE : Privates) {
1716 PrivatesReductions[Cnt] = cast<DeclRefExpr>(Val: DRE)->getDecl();
1717 ++Cnt;
1718 }
1719 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
1720 C&: CGM.getContext(), EscapedDecls: PrivatesReductions, EscapedDeclsForTeams: {}, MappedDeclsFields&: VarFieldMap, BufSize: 1);
1721
1722 if (TeamsReduction)
1723 TeamsReductions.push_back(Elt: ReductionRec);
1724
1725 // Source location for the ident struct
1726 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1727
1728 using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
1729 InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
1730 CGF.AllocaInsertPt->getIterator());
1731 InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
1732 CGF.Builder.GetInsertPoint());
1733 llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
1734 CodeGenIP, CGF.SourceLocToDebugLoc(Location: Loc));
1735 llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo, 2> ReductionInfos;
1736
1737 CodeGenFunction::OMPPrivateScope Scope(CGF);
1738 unsigned Idx = 0;
1739 for (const Expr *Private : Privates) {
1740 llvm::Type *ElementType;
1741 llvm::Value *Variable;
1742 llvm::Value *PrivateVariable;
1743 llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
1744 ElementType = CGF.ConvertTypeForMem(T: Private->getType());
1745 const auto *RHSVar =
1746 cast<VarDecl>(Val: cast<DeclRefExpr>(Val: RHSExprs[Idx])->getDecl());
1747 PrivateVariable = CGF.GetAddrOfLocalVar(VD: RHSVar).emitRawPointer(CGF);
1748 const auto *LHSVar =
1749 cast<VarDecl>(Val: cast<DeclRefExpr>(Val: LHSExprs[Idx])->getDecl());
1750 Variable = CGF.GetAddrOfLocalVar(VD: LHSVar).emitRawPointer(CGF);
1751 llvm::OpenMPIRBuilder::EvalKind EvalKind;
1752 switch (CGF.getEvaluationKind(T: Private->getType())) {
1753 case TEK_Scalar:
1754 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
1755 break;
1756 case TEK_Complex:
1757 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
1758 break;
1759 case TEK_Aggregate:
1760 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
1761 break;
1762 }
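    // Non-atomic combiner callback used by the OpenMPIRBuilder: it looks up
    // the addresses of the LHS and RHS variables for element I and emits the
    // single reduction combiner into the builder-created function, temporarily
    // redirecting CGF.CurFn to that function.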
1763 auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
1764 llvm::Value **LHSPtr, llvm::Value **RHSPtr,
1765 llvm::Function *NewFunc) {
1766 CGF.Builder.restoreIP(IP: CodeGenIP);
1767 auto *CurFn = CGF.CurFn;
1768 CGF.CurFn = NewFunc;
1769
1770 *LHSPtr = CGF.GetAddrOfLocalVar(
1771 VD: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: LHSExprs[I])->getDecl()))
1772 .emitRawPointer(CGF);
1773 *RHSPtr = CGF.GetAddrOfLocalVar(
1774 VD: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: RHSExprs[I])->getDecl()))
1775 .emitRawPointer(CGF);
1776
1777 emitSingleReductionCombiner(CGF, ReductionOp: ReductionOps[I], PrivateRef: Privates[I],
1778 LHS: cast<DeclRefExpr>(Val: LHSExprs[I]),
1779 RHS: cast<DeclRefExpr>(Val: RHSExprs[I]));
1780
1781 CGF.CurFn = CurFn;
1782
1783 return InsertPointTy(CGF.Builder.GetInsertBlock(),
1784 CGF.Builder.GetInsertPoint());
1785 };
1786 ReductionInfos.emplace_back(Args: llvm::OpenMPIRBuilder::ReductionInfo(
1787 ElementType, Variable, PrivateVariable, EvalKind,
1788 /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen,
1789 /*DataPtrPtrGen=*/nullptr));
1790 Idx++;
1791 }
1792
1793 llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
1794 cantFail(ValOrErr: OMPBuilder.createReductionsGPU(
1795 Loc: OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, /*IsByRef=*/{}, IsNoWait: false,
1796 IsTeamsReduction: TeamsReduction, ReductionGenCBKind: llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
1797 GridValue: CGF.getTarget().getGridValue(),
1798 ReductionBufNum: C.getLangOpts().OpenMPCUDAReductionBufNum, SrcLocInfo: RTLoc));
1799 CGF.Builder.restoreIP(IP: AfterIP);
1800}
1801
1802const VarDecl *
1803CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
1804 const VarDecl *NativeParam) const {
1805 if (!NativeParam->getType()->isReferenceType())
1806 return NativeParam;
1807 QualType ArgType = NativeParam->getType();
1808 QualifierCollector QC;
1809 const Type *NonQualTy = QC.strip(type: ArgType);
1810 QualType PointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
1811 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
1812 if (Attr->getCaptureKind() == OMPC_map) {
1813 PointeeTy = CGM.getContext().getAddrSpaceQualType(T: PointeeTy,
1814 AddressSpace: LangAS::opencl_global);
1815 }
1816 }
1817 ArgType = CGM.getContext().getPointerType(T: PointeeTy);
1818 QC.addRestrict();
1819 enum { NVPTX_local_addr = 5 };
1820 QC.addAddressSpace(space: getLangASFromTargetAS(TargetAS: NVPTX_local_addr));
1821 ArgType = QC.apply(Context: CGM.getContext(), QT: ArgType);
1822 if (isa<ImplicitParamDecl>(Val: NativeParam))
1823 return ImplicitParamDecl::Create(
1824 C&: CGM.getContext(), /*DC=*/nullptr, IdLoc: NativeParam->getLocation(),
1825 Id: NativeParam->getIdentifier(), T: ArgType, ParamKind: ImplicitParamKind::Other);
1826 return ParmVarDecl::Create(
1827 C&: CGM.getContext(),
1828 DC: const_cast<DeclContext *>(NativeParam->getDeclContext()),
1829 StartLoc: NativeParam->getBeginLoc(), IdLoc: NativeParam->getLocation(),
1830 Id: NativeParam->getIdentifier(), T: ArgType,
1831 /*TInfo=*/nullptr, S: SC_None, /*DefArg=*/nullptr);
1832}
1833
1834Address
1835CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
1836 const VarDecl *NativeParam,
1837 const VarDecl *TargetParam) const {
1838 assert(NativeParam != TargetParam &&
1839 NativeParam->getType()->isReferenceType() &&
1840 "Native arg must not be the same as target arg.");
1841 Address LocalAddr = CGF.GetAddrOfLocalVar(VD: TargetParam);
1842 QualType NativeParamType = NativeParam->getType();
1843 QualifierCollector QC;
1844 const Type *NonQualTy = QC.strip(type: NativeParamType);
1845 QualType NativePointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
1846 unsigned NativePointeeAddrSpace =
1847 CGF.getTypes().getTargetAddressSpace(T: NativePointeeTy);
1848 QualType TargetTy = TargetParam->getType();
1849 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(Addr: LocalAddr, /*Volatile=*/false,
1850 Ty: TargetTy, Loc: SourceLocation());
1851 // Cast to native address space.
1852 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1853 V: TargetAddr,
1854 DestTy: llvm::PointerType::get(C&: CGF.getLLVMContext(), AddressSpace: NativePointeeAddrSpace));
1855 Address NativeParamAddr = CGF.CreateMemTemp(T: NativeParamType);
1856 CGF.EmitStoreOfScalar(Value: TargetAddr, Addr: NativeParamAddr, /*Volatile=*/false,
1857 Ty: NativeParamType);
1858 return NativeParamAddr;
1859}
1860
1861void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
1862 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
1863 ArrayRef<llvm::Value *> Args) const {
1864 SmallVector<llvm::Value *, 4> TargetArgs;
1865 TargetArgs.reserve(N: Args.size());
1866 auto *FnType = OutlinedFn.getFunctionType();
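  // Adjust pointer arguments to the address space expected by the outlined
  // function; non-pointer arguments and trailing variadic arguments are
  // forwarded unchanged.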
1867 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
1868 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
1869 TargetArgs.append(in_start: std::next(x: Args.begin(), n: I), in_end: Args.end());
1870 break;
1871 }
1872 llvm::Type *TargetType = FnType->getParamType(i: I);
1873 llvm::Value *NativeArg = Args[I];
1874 if (!TargetType->isPointerTy()) {
1875 TargetArgs.emplace_back(Args&: NativeArg);
1876 continue;
1877 }
1878 TargetArgs.emplace_back(
1879 Args: CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(V: NativeArg, DestTy: TargetType));
1880 }
1881 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: TargetArgs);
1882}
1883
1884/// Emit function which wraps the outline parallel region
1885/// and controls the arguments which are passed to this function.
1886/// The wrapper ensures that the outlined function is called
1887/// with the correct arguments when data is shared.
1888llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
1889 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
1890 ASTContext &Ctx = CGM.getContext();
1891 const auto &CS = *D.getCapturedStmt(RegionKind: OMPD_parallel);
1892
1893 // Create a function that takes as argument the source thread.
1894 FunctionArgList WrapperArgs;
1895 QualType Int16QTy =
1896 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
1897 QualType Int32QTy =
1898 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
1899 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1900 /*Id=*/nullptr, Int16QTy,
1901 ImplicitParamKind::Other);
1902 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1903 /*Id=*/nullptr, Int32QTy,
1904 ImplicitParamKind::Other);
1905 WrapperArgs.emplace_back(Args: &ParallelLevelArg);
1906 WrapperArgs.emplace_back(Args: &WrapperArg);
1907
1908 const CGFunctionInfo &CGFI =
1909 CGM.getTypes().arrangeBuiltinFunctionDeclaration(resultType: Ctx.VoidTy, args: WrapperArgs);
1910
1911 auto *Fn = llvm::Function::Create(
1912 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
1913 N: Twine(OutlinedParallelFn->getName(), "_wrapper"), M: &CGM.getModule());
1914
  // Ensure we do not inline the function. This is trivially true for the ones
  // passed to __kmpc_fork_call, but the ones called in serialized regions
  // could be inlined. This is not perfect, but it is closer to the invariant
  // we want, namely, that every data environment starts with a new function.
1919 // TODO: We should pass the if condition to the runtime function and do the
1920 // handling there. Much cleaner code.
1921 Fn->addFnAttr(Kind: llvm::Attribute::NoInline);
1922
1923 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
1924 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
1925 Fn->setDoesNotRecurse();
1926
1927 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1928 CGF.StartFunction(GD: GlobalDecl(), RetTy: Ctx.VoidTy, Fn, FnInfo: CGFI, Args: WrapperArgs,
1929 Loc: D.getBeginLoc(), StartLoc: D.getBeginLoc());
1930
1931 const auto *RD = CS.getCapturedRecordDecl();
1932 auto CurField = RD->field_begin();
1933
1934 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
1935 /*Name=*/".zero.addr");
1936 CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
1937 // Get the array of arguments.
1938 SmallVector<llvm::Value *, 8> Args;
1939
1940 Args.emplace_back(Args: CGF.GetAddrOfLocalVar(VD: &WrapperArg).emitRawPointer(CGF));
1941 Args.emplace_back(Args: ZeroAddr.emitRawPointer(CGF));
1942
1943 CGBuilderTy &Bld = CGF.Builder;
1944 auto CI = CS.capture_begin();
1945
1946 // Use global memory for data sharing.
1947 // Handle passing of global args to workers.
1948 RawAddress GlobalArgs =
1949 CGF.CreateDefaultAlignTempAlloca(Ty: CGF.VoidPtrPtrTy, Name: "global_args");
1950 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
1951 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
1952 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1953 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_shared_variables),
1954 args: DataSharingArgs);
1955
1956 // Retrieve the shared variables from the list of references returned
1957 // by the runtime. Pass the variables to the outlined function.
1958 Address SharedArgListAddress = Address::invalid();
1959 if (CS.capture_size() > 0 ||
1960 isOpenMPLoopBoundSharingDirective(Kind: D.getDirectiveKind())) {
1961 SharedArgListAddress = CGF.EmitLoadOfPointer(
1962 Ptr: GlobalArgs, PtrTy: CGF.getContext()
1963 .getPointerType(T: CGF.getContext().VoidPtrTy)
1964 .castAs<PointerType>());
1965 }
1966 unsigned Idx = 0;
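  // For loop-bound sharing directives, the first two entries of the shared
  // argument list point to the lower and upper loop bounds; forward them to
  // the outlined function ahead of the captured variables.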
1967 if (isOpenMPLoopBoundSharingDirective(Kind: D.getDirectiveKind())) {
1968 Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
1969 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1970 Addr: Src, Ty: Bld.getPtrTy(AddrSpace: 0), ElementTy: CGF.SizeTy);
1971 llvm::Value *LB = CGF.EmitLoadOfScalar(
1972 Addr: TypedAddress,
1973 /*Volatile=*/false,
1974 Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
1975 Loc: cast<OMPLoopDirective>(Val: D).getLowerBoundVariable()->getExprLoc());
1976 Args.emplace_back(Args&: LB);
1977 ++Idx;
1978 Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
1979 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(Addr: Src, Ty: Bld.getPtrTy(AddrSpace: 0),
1980 ElementTy: CGF.SizeTy);
1981 llvm::Value *UB = CGF.EmitLoadOfScalar(
1982 Addr: TypedAddress,
1983 /*Volatile=*/false,
1984 Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
1985 Loc: cast<OMPLoopDirective>(Val: D).getUpperBoundVariable()->getExprLoc());
1986 Args.emplace_back(Args&: UB);
1987 ++Idx;
1988 }
1989 if (CS.capture_size() > 0) {
1990 ASTContext &CGFContext = CGF.getContext();
1991 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
1992 QualType ElemTy = CurField->getType();
1993 Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: I + Idx);
1994 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1995 Addr: Src, Ty: CGF.ConvertTypeForMem(T: CGFContext.getPointerType(T: ElemTy)),
1996 ElementTy: CGF.ConvertTypeForMem(T: ElemTy));
1997 llvm::Value *Arg = CGF.EmitLoadOfScalar(Addr: TypedAddress,
1998 /*Volatile=*/false,
1999 Ty: CGFContext.getPointerType(T: ElemTy),
2000 Loc: CI->getLocation());
2001 if (CI->capturesVariableByCopy() &&
2002 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
2003 Arg = castValueToType(CGF, Val: Arg, ValTy: ElemTy, CastTy: CGFContext.getUIntPtrType(),
2004 Loc: CI->getLocation());
2005 }
2006 Args.emplace_back(Args&: Arg);
2007 }
2008 }
2009
2010 emitOutlinedFunctionCall(CGF, Loc: D.getBeginLoc(), OutlinedFn: OutlinedParallelFn, Args);
2011 CGF.FinishFunction();
2012 return Fn;
2013}
2014
2015void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
2016 const Decl *D) {
2017 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
2018 return;
2019
2020 assert(D && "Expected function or captured|block decl.");
2021 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
2022 "Function is registered already.");
2023 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
2024 "Team is set but not processed.");
2025 const Stmt *Body = nullptr;
2026 bool NeedToDelayGlobalization = false;
2027 if (const auto *FD = dyn_cast<FunctionDecl>(Val: D)) {
2028 Body = FD->getBody();
2029 } else if (const auto *BD = dyn_cast<BlockDecl>(Val: D)) {
2030 Body = BD->getBody();
2031 } else if (const auto *CD = dyn_cast<CapturedDecl>(Val: D)) {
2032 Body = CD->getBody();
2033 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
2034 if (NeedToDelayGlobalization &&
2035 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
2036 return;
2037 }
2038 if (!Body)
2039 return;
2040 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
2041 VarChecker.Visit(S: Body);
2042 const RecordDecl *GlobalizedVarsRecord =
2043 VarChecker.getGlobalizedRecord(IsInTTDRegion);
2044 TeamAndReductions.first = nullptr;
2045 TeamAndReductions.second.clear();
2046 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
2047 VarChecker.getEscapedVariableLengthDecls();
2048 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
2049 VarChecker.getDelayedVariableLengthDecls();
2050 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
2051 DelayedVariableLengthDecls.empty())
2052 return;
2053 auto I = FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
2054 I->getSecond().MappedParams =
2055 std::make_unique<CodeGenFunction::OMPMapVars>();
2056 I->getSecond().EscapedParameters.insert(
2057 I: VarChecker.getEscapedParameters().begin(),
2058 E: VarChecker.getEscapedParameters().end());
2059 I->getSecond().EscapedVariableLengthDecls.append(
2060 in_start: EscapedVariableLengthDecls.begin(), in_end: EscapedVariableLengthDecls.end());
2061 I->getSecond().DelayedVariableLengthDecls.append(
2062 in_start: DelayedVariableLengthDecls.begin(), in_end: DelayedVariableLengthDecls.end());
2063 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2064 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
2065 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
2066 Data.try_emplace(Key: VD);
2067 }
2068 if (!NeedToDelayGlobalization) {
2069 emitGenericVarsProlog(CGF, Loc: D->getBeginLoc());
2070 struct GlobalizationScope final : EHScopeStack::Cleanup {
2071 GlobalizationScope() = default;
2072
2073 void Emit(CodeGenFunction &CGF, Flags flags) override {
2074 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
2075 .emitGenericVarsEpilog(CGF);
2076 }
2077 };
2078 CGF.EHStack.pushCleanup<GlobalizationScope>(Kind: NormalAndEHCleanup);
2079 }
2080}
2081
2082Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
2083 const VarDecl *VD) {
2084 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
2085 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2086 auto AS = LangAS::Default;
2087 switch (A->getAllocatorType()) {
2088 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2089 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2090 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2091 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2092 break;
2093 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2094 return Address::invalid();
2095 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      // TODO: implement support for user-defined allocators.
2097 return Address::invalid();
2098 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2099 AS = LangAS::cuda_constant;
2100 break;
2101 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2102 AS = LangAS::cuda_shared;
2103 break;
2104 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2105 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2106 break;
2107 }
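    // Emit the variable as an internal global in the selected memory space and
    // cast its address back to the address space expected for the declaration.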
2108 llvm::Type *VarTy = CGF.ConvertTypeForMem(T: VD->getType());
2109 auto *GV = new llvm::GlobalVariable(
2110 CGM.getModule(), VarTy, /*isConstant=*/false,
2111 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(T: VarTy),
2112 VD->getName(),
2113 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
2114 CGM.getContext().getTargetAddressSpace(AS));
2115 CharUnits Align = CGM.getContext().getDeclAlign(D: VD);
2116 GV->setAlignment(Align.getAsAlign());
2117 return Address(
2118 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2119 V: GV, DestTy: CGF.Builder.getPtrTy(AddrSpace: CGM.getContext().getTargetAddressSpace(
2120 AS: VD->getType().getAddressSpace()))),
2121 VarTy, Align);
2122 }
2123
2124 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
2125 return Address::invalid();
2126
2127 VD = VD->getCanonicalDecl();
2128 auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
2129 if (I == FunctionGlobalizedDecls.end())
2130 return Address::invalid();
2131 auto VDI = I->getSecond().LocalVarData.find(Key: VD);
2132 if (VDI != I->getSecond().LocalVarData.end())
2133 return VDI->second.PrivateAddr;
2134 if (VD->hasAttrs()) {
2135 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
2136 E(VD->attr_end());
2137 IT != E; ++IT) {
2138 auto VDI = I->getSecond().LocalVarData.find(
2139 Key: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: IT->getRef())->getDecl())
2140 ->getCanonicalDecl());
2141 if (VDI != I->getSecond().LocalVarData.end())
2142 return VDI->second.PrivateAddr;
2143 }
2144 }
2145
2146 return Address::invalid();
2147}
2148
2149void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
2150 FunctionGlobalizedDecls.erase(Val: CGF.CurFn);
2151 CGOpenMPRuntime::functionFinished(CGF);
2152}
2153
2154void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
2155 CodeGenFunction &CGF, const OMPLoopDirective &S,
2156 OpenMPDistScheduleClauseKind &ScheduleKind,
2157 llvm::Value *&Chunk) const {
2158 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2159 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
2160 ScheduleKind = OMPC_DIST_SCHEDULE_static;
2161 Chunk = CGF.EmitScalarConversion(
2162 Src: RT.getGPUNumThreads(CGF),
2163 SrcTy: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
2164 DstTy: S.getIterationVariable()->getType(), Loc: S.getBeginLoc());
2165 return;
2166 }
2167 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
2168 CGF, S, ScheduleKind, Chunk);
2169}
2170
2171void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
2172 CodeGenFunction &CGF, const OMPLoopDirective &S,
2173 OpenMPScheduleClauseKind &ScheduleKind,
2174 const Expr *&ChunkExpr) const {
2175 ScheduleKind = OMPC_SCHEDULE_static;
2176 // Chunk size is 1 in this case.
2177 llvm::APInt ChunkSize(32, 1);
2178 ChunkExpr = IntegerLiteral::Create(C: CGF.getContext(), V: ChunkSize,
2179 type: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
2180 l: SourceLocation());
2181}
2182
2183void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
2184 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
2185 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         "Expected target-based directive.");
2187 const CapturedStmt *CS = D.getCapturedStmt(RegionKind: OMPD_target);
2188 for (const CapturedStmt::Capture &C : CS->captures()) {
2189 // Capture variables captured by reference in lambdas for target-based
2190 // directives.
2191 if (!C.capturesVariable())
2192 continue;
2193 const VarDecl *VD = C.getCapturedVar();
2194 const auto *RD = VD->getType()
2195 .getCanonicalType()
2196 .getNonReferenceType()
2197 ->getAsCXXRecordDecl();
2198 if (!RD || !RD->isLambda())
2199 continue;
2200 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
2201 LValue VDLVal;
2202 if (VD->getType().getCanonicalType()->isReferenceType())
2203 VDLVal = CGF.EmitLoadOfReferenceLValue(RefAddr: VDAddr, RefTy: VD->getType());
2204 else
2205 VDLVal = CGF.MakeAddrLValue(
2206 Addr: VDAddr, T: VD->getType().getCanonicalType().getNonReferenceType());
2207 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
2208 FieldDecl *ThisCapture = nullptr;
2209 RD->getCaptureFields(Captures, ThisCapture);
2210 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
2211 LValue ThisLVal =
2212 CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: ThisCapture);
2213 llvm::Value *CXXThis = CGF.LoadCXXThis();
2214 CGF.EmitStoreOfScalar(value: CXXThis, lvalue: ThisLVal);
2215 }
2216 for (const LambdaCapture &LC : RD->captures()) {
2217 if (LC.getCaptureKind() != LCK_ByRef)
2218 continue;
2219 const ValueDecl *VD = LC.getCapturedVar();
2220 // FIXME: For now VD is always a VarDecl because OpenMP does not support
2221 // capturing structured bindings in lambdas yet.
2222 if (!CS->capturesVariable(Var: cast<VarDecl>(Val: VD)))
2223 continue;
2224 auto It = Captures.find(Val: VD);
2225 assert(It != Captures.end() && "Found lambda capture without field.");
2226 LValue VarLVal = CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: It->second);
2227 Address VDAddr = CGF.GetAddrOfLocalVar(VD: cast<VarDecl>(Val: VD));
2228 if (VD->getType().getCanonicalType()->isReferenceType())
2229 VDAddr = CGF.EmitLoadOfReferenceLValue(RefAddr: VDAddr,
2230 RefTy: VD->getType().getCanonicalType())
2231 .getAddress();
2232 CGF.EmitStoreOfScalar(value: VDAddr.emitRawPointer(CGF), lvalue: VarLVal);
2233 }
2234 }
2235}
2236
2237bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
2238 LangAS &AS) {
2239 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
2240 return false;
2241 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
  switch (A->getAllocatorType()) {
2243 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2244 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2245 // Not supported, fallback to the default mem space.
2246 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2247 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2248 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2249 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2250 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2251 AS = LangAS::Default;
2252 return true;
2253 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2254 AS = LangAS::cuda_constant;
2255 return true;
2256 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2257 AS = LangAS::cuda_shared;
2258 return true;
2259 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2260 llvm_unreachable("Expected predefined allocator for the variables with the "
2261 "static storage.");
2262 }
2263 return false;
2264}
2265
// Get the current OffloadArch and ignore any unknown values.
2267static OffloadArch getOffloadArch(CodeGenModule &CGM) {
2268 if (!CGM.getTarget().hasFeature(Feature: "ptx"))
2269 return OffloadArch::UNKNOWN;
2270 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
2271 if (Feature.getValue()) {
2272 OffloadArch Arch = StringToOffloadArch(S: Feature.getKey());
2273 if (Arch != OffloadArch::UNKNOWN)
2274 return Arch;
2275 }
2276 }
2277 return OffloadArch::UNKNOWN;
2278}
2279
2280/// Check to see if target architecture supports unified addressing which is
2281/// a restriction for OpenMP requires clause "unified_shared_memory".
2282void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
2283 for (const OMPClause *Clause : D->clauselists()) {
2284 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
2285 OffloadArch Arch = getOffloadArch(CGM);
2286 switch (Arch) {
2287 case OffloadArch::SM_20:
2288 case OffloadArch::SM_21:
2289 case OffloadArch::SM_30:
2290 case OffloadArch::SM_32_:
2291 case OffloadArch::SM_35:
2292 case OffloadArch::SM_37:
2293 case OffloadArch::SM_50:
2294 case OffloadArch::SM_52:
2295 case OffloadArch::SM_53: {
2296 SmallString<256> Buffer;
2297 llvm::raw_svector_ostream Out(Buffer);
2298 Out << "Target architecture " << OffloadArchToString(A: Arch)
2299 << " does not support unified addressing";
2300 CGM.Error(loc: Clause->getBeginLoc(), error: Out.str());
2301 return;
2302 }
2303 case OffloadArch::SM_60:
2304 case OffloadArch::SM_61:
2305 case OffloadArch::SM_62:
2306 case OffloadArch::SM_70:
2307 case OffloadArch::SM_72:
2308 case OffloadArch::SM_75:
2309 case OffloadArch::SM_80:
2310 case OffloadArch::SM_86:
2311 case OffloadArch::SM_87:
2312 case OffloadArch::SM_88:
2313 case OffloadArch::SM_89:
2314 case OffloadArch::SM_90:
2315 case OffloadArch::SM_90a:
2316 case OffloadArch::SM_100:
2317 case OffloadArch::SM_100a:
2318 case OffloadArch::SM_101:
2319 case OffloadArch::SM_101a:
2320 case OffloadArch::SM_103:
2321 case OffloadArch::SM_103a:
2322 case OffloadArch::SM_110:
2323 case OffloadArch::SM_110a:
2324 case OffloadArch::SM_120:
2325 case OffloadArch::SM_120a:
2326 case OffloadArch::SM_121:
2327 case OffloadArch::SM_121a:
2328 case OffloadArch::GFX600:
2329 case OffloadArch::GFX601:
2330 case OffloadArch::GFX602:
2331 case OffloadArch::GFX700:
2332 case OffloadArch::GFX701:
2333 case OffloadArch::GFX702:
2334 case OffloadArch::GFX703:
2335 case OffloadArch::GFX704:
2336 case OffloadArch::GFX705:
2337 case OffloadArch::GFX801:
2338 case OffloadArch::GFX802:
2339 case OffloadArch::GFX803:
2340 case OffloadArch::GFX805:
2341 case OffloadArch::GFX810:
2342 case OffloadArch::GFX9_GENERIC:
2343 case OffloadArch::GFX900:
2344 case OffloadArch::GFX902:
2345 case OffloadArch::GFX904:
2346 case OffloadArch::GFX906:
2347 case OffloadArch::GFX908:
2348 case OffloadArch::GFX909:
2349 case OffloadArch::GFX90a:
2350 case OffloadArch::GFX90c:
2351 case OffloadArch::GFX9_4_GENERIC:
2352 case OffloadArch::GFX942:
2353 case OffloadArch::GFX950:
2354 case OffloadArch::GFX10_1_GENERIC:
2355 case OffloadArch::GFX1010:
2356 case OffloadArch::GFX1011:
2357 case OffloadArch::GFX1012:
2358 case OffloadArch::GFX1013:
2359 case OffloadArch::GFX10_3_GENERIC:
2360 case OffloadArch::GFX1030:
2361 case OffloadArch::GFX1031:
2362 case OffloadArch::GFX1032:
2363 case OffloadArch::GFX1033:
2364 case OffloadArch::GFX1034:
2365 case OffloadArch::GFX1035:
2366 case OffloadArch::GFX1036:
2367 case OffloadArch::GFX11_GENERIC:
2368 case OffloadArch::GFX1100:
2369 case OffloadArch::GFX1101:
2370 case OffloadArch::GFX1102:
2371 case OffloadArch::GFX1103:
2372 case OffloadArch::GFX1150:
2373 case OffloadArch::GFX1151:
2374 case OffloadArch::GFX1152:
2375 case OffloadArch::GFX1153:
2376 case OffloadArch::GFX12_GENERIC:
2377 case OffloadArch::GFX1200:
2378 case OffloadArch::GFX1201:
2379 case OffloadArch::GFX1250:
2380 case OffloadArch::GFX1251:
2381 case OffloadArch::GFX1310:
2382 case OffloadArch::AMDGCNSPIRV:
2383 case OffloadArch::Generic:
2384 case OffloadArch::GRANITERAPIDS:
2385 case OffloadArch::BMG_G21:
2386 case OffloadArch::UNUSED:
2387 case OffloadArch::UNKNOWN:
2388 break;
2389 case OffloadArch::LAST:
2390 llvm_unreachable("Unexpected GPU arch.");
2391 }
2392 }
2393 }
2394 CGOpenMPRuntime::processRequiresDirective(D);
2395}
2396
2397llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
2398 CGBuilderTy &Bld = CGF.Builder;
2399 llvm::Module *M = &CGF.CGM.getModule();
2400 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
2401 llvm::Function *F = M->getFunction(Name: LocSize);
2402 if (!F) {
2403 F = llvm::Function::Create(Ty: llvm::FunctionType::get(Result: CGF.Int32Ty, Params: {}, isVarArg: false),
2404 Linkage: llvm::GlobalVariable::ExternalLinkage, N: LocSize,
2405 M: &CGF.CGM.getModule());
2406 }
2407 return Bld.CreateCall(Callee: F, Args: {}, Name: "nvptx_num_threads");
2408}
2409
2410llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
2411 ArrayRef<llvm::Value *> Args{};
2412 return CGF.EmitRuntimeCall(
2413 callee: OMPBuilder.getOrCreateRuntimeFunction(
2414 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_hardware_thread_id_in_block),
2415 args: Args);
2416}
2417