//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//
13
#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MathExtras.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;
29
30namespace {
31/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32class NVPTXActionTy final : public PrePostActionTy {
33 llvm::FunctionCallee EnterCallee = nullptr;
34 ArrayRef<llvm::Value *> EnterArgs;
35 llvm::FunctionCallee ExitCallee = nullptr;
36 ArrayRef<llvm::Value *> ExitArgs;
37 bool Conditional = false;
38 llvm::BasicBlock *ContBlock = nullptr;
39
40public:
41 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42 ArrayRef<llvm::Value *> EnterArgs,
43 llvm::FunctionCallee ExitCallee,
44 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46 ExitArgs(ExitArgs), Conditional(Conditional) {}
47 void Enter(CodeGenFunction &CGF) override {
48 llvm::Value *EnterRes = CGF.EmitRuntimeCall(callee: EnterCallee, args: EnterArgs);
49 if (Conditional) {
50 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(Arg: EnterRes);
51 auto *ThenBlock = CGF.createBasicBlock(name: "omp_if.then");
52 ContBlock = CGF.createBasicBlock(name: "omp_if.end");
53 // Generate the branch (If-stmt)
54 CGF.Builder.CreateCondBr(Cond: CallBool, True: ThenBlock, False: ContBlock);
55 CGF.EmitBlock(BB: ThenBlock);
56 }
57 }
58 void Done(CodeGenFunction &CGF) {
59 // Emit the rest of blocks/branches
60 CGF.EmitBranch(Block: ContBlock);
61 CGF.EmitBlock(BB: ContBlock, IsFinished: true);
62 }
63 void Exit(CodeGenFunction &CGF) override {
64 CGF.EmitRuntimeCall(callee: ExitCallee, args: ExitArgs);
65 }
66};
67
68/// A class to track the execution mode when codegening directives within
69/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70/// to the target region and used by containing directives such as 'parallel'
71/// to emit optimized code.
72class ExecutionRuntimeModesRAII {
73private:
74 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75 CGOpenMPRuntimeGPU::EM_Unknown;
76 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77
78public:
79 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
80 CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
81 : ExecMode(ExecMode) {
82 SavedExecMode = ExecMode;
83 ExecMode = EntryMode;
84 }
85 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86};
87
88static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
89 RefExpr = RefExpr->IgnoreParens();
90 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: RefExpr)) {
91 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
92 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base))
93 Base = TempASE->getBase()->IgnoreParenImpCasts();
94 RefExpr = Base;
95 } else if (auto *OASE = dyn_cast<ArraySectionExpr>(Val: RefExpr)) {
96 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
97 while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Val: Base))
98 Base = TempOASE->getBase()->IgnoreParenImpCasts();
99 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base))
100 Base = TempASE->getBase()->IgnoreParenImpCasts();
101 RefExpr = Base;
102 }
103 RefExpr = RefExpr->IgnoreParenImpCasts();
104 if (const auto *DE = dyn_cast<DeclRefExpr>(Val: RefExpr))
105 return cast<ValueDecl>(Val: DE->getDecl()->getCanonicalDecl());
106 const auto *ME = cast<MemberExpr>(Val: RefExpr);
107 return cast<ValueDecl>(Val: ME->getMemberDecl()->getCanonicalDecl());
108}
109
110static RecordDecl *buildRecordForGlobalizedVars(
111 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
112 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
113 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
114 &MappedDeclsFields,
115 int BufSize) {
116 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
117 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
118 return nullptr;
119 SmallVector<VarsDataTy, 4> GlobalizedVars;
120 for (const ValueDecl *D : EscapedDecls)
121 GlobalizedVars.emplace_back(Args: C.getDeclAlign(D), Args&: D);
122 for (const ValueDecl *D : EscapedDeclsForTeams)
123 GlobalizedVars.emplace_back(Args: C.getDeclAlign(D), Args&: D);
124
125 // Build struct _globalized_locals_ty {
126 // /* globalized vars */[WarSize] align (decl_align)
127 // /* globalized vars */ for EscapedDeclsForTeams
128 // };
129 RecordDecl *GlobalizedRD = C.buildImplicitRecord(Name: "_globalized_locals_ty");
130 GlobalizedRD->startDefinition();
131 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
132 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
133 for (const auto &Pair : GlobalizedVars) {
134 const ValueDecl *VD = Pair.second;
135 QualType Type = VD->getType();
136 if (Type->isLValueReferenceType())
137 Type = C.getPointerType(T: Type.getNonReferenceType());
138 else
139 Type = Type.getNonReferenceType();
140 SourceLocation Loc = VD->getLocation();
141 FieldDecl *Field;
142 if (SingleEscaped.count(Ptr: VD)) {
143 Field = FieldDecl::Create(
144 C, DC: GlobalizedRD, StartLoc: Loc, IdLoc: Loc, Id: VD->getIdentifier(), T: Type,
145 TInfo: C.getTrivialTypeSourceInfo(T: Type, Loc: SourceLocation()),
146 /*BW=*/nullptr, /*Mutable=*/false,
147 /*InitStyle=*/ICIS_NoInit);
148 Field->setAccess(AS_public);
149 if (VD->hasAttrs()) {
150 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
151 E(VD->getAttrs().end());
152 I != E; ++I)
153 Field->addAttr(A: *I);
154 }
155 } else {
156 if (BufSize > 1) {
157 llvm::APInt ArraySize(32, BufSize);
158 Type = C.getConstantArrayType(EltTy: Type, ArySize: ArraySize, SizeExpr: nullptr,
159 ASM: ArraySizeModifier::Normal, IndexTypeQuals: 0);
160 }
161 Field = FieldDecl::Create(
162 C, DC: GlobalizedRD, StartLoc: Loc, IdLoc: Loc, Id: VD->getIdentifier(), T: Type,
163 TInfo: C.getTrivialTypeSourceInfo(T: Type, Loc: SourceLocation()),
164 /*BW=*/nullptr, /*Mutable=*/false,
165 /*InitStyle=*/ICIS_NoInit);
166 Field->setAccess(AS_public);
167 llvm::APInt Align(32, Pair.first.getQuantity());
168 Field->addAttr(A: AlignedAttr::CreateImplicit(
169 Ctx&: C, /*IsAlignmentExpr=*/true,
170 Alignment: IntegerLiteral::Create(C, V: Align,
171 type: C.getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
172 l: SourceLocation()),
173 Range: {}, S: AlignedAttr::GNU_aligned));
174 }
175 GlobalizedRD->addDecl(D: Field);
176 MappedDeclsFields.try_emplace(Key: VD, Args&: Field);
177 }
178 GlobalizedRD->completeDefinition();
179 return GlobalizedRD;
180}
181
182/// Get the list of variables that can escape their declaration context.
183class CheckVarsEscapingDeclContext final
184 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185 CodeGenFunction &CGF;
186 llvm::SetVector<const ValueDecl *> EscapedDecls;
187 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
188 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
189 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
190 RecordDecl *GlobalizedRD = nullptr;
191 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
192 bool AllEscaped = false;
193 bool IsForCombinedParallelRegion = false;
194
195 void markAsEscaped(const ValueDecl *VD) {
196 // Do not globalize declare target variables.
197 if (!isa<VarDecl>(Val: VD) ||
198 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
199 return;
200 VD = cast<ValueDecl>(Val: VD->getCanonicalDecl());
201 // Use user-specified allocation.
202 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
203 return;
204 // Variables captured by value must be globalized.
205 bool IsCaptured = false;
206 if (auto *CSI = CGF.CapturedStmtInfo) {
207 if (const FieldDecl *FD = CSI->lookup(VD: cast<VarDecl>(Val: VD))) {
208 // Check if need to capture the variable that was already captured by
209 // value in the outer region.
210 IsCaptured = true;
211 if (!IsForCombinedParallelRegion) {
212 if (!FD->hasAttrs())
213 return;
214 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
215 if (!Attr)
216 return;
217 if (((Attr->getCaptureKind() != OMPC_map) &&
218 !isOpenMPPrivate(Kind: Attr->getCaptureKind())) ||
219 ((Attr->getCaptureKind() == OMPC_map) &&
220 !FD->getType()->isAnyPointerType()))
221 return;
222 }
223 if (!FD->getType()->isReferenceType()) {
224 assert(!VD->getType()->isVariablyModifiedType() &&
225 "Parameter captured by value with variably modified type");
226 EscapedParameters.insert(Ptr: VD);
227 } else if (!IsForCombinedParallelRegion) {
228 return;
229 }
230 }
231 }
232 if ((!CGF.CapturedStmtInfo ||
233 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
234 VD->getType()->isReferenceType())
235 // Do not globalize variables with reference type.
236 return;
237 if (VD->getType()->isVariablyModifiedType()) {
238 // If not captured at the target region level then mark the escaped
239 // variable as delayed.
240 if (IsCaptured)
241 EscapedVariableLengthDecls.insert(X: VD);
242 else
243 DelayedVariableLengthDecls.insert(X: VD);
244 } else
245 EscapedDecls.insert(X: VD);
246 }
247
248 void VisitValueDecl(const ValueDecl *VD) {
249 if (VD->getType()->isLValueReferenceType())
250 markAsEscaped(VD);
251 if (const auto *VarD = dyn_cast<VarDecl>(Val: VD)) {
252 if (!isa<ParmVarDecl>(Val: VarD) && VarD->hasInit()) {
253 const bool SavedAllEscaped = AllEscaped;
254 AllEscaped = VD->getType()->isLValueReferenceType();
255 Visit(S: VarD->getInit());
256 AllEscaped = SavedAllEscaped;
257 }
258 }
259 }
260 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
261 ArrayRef<OMPClause *> Clauses,
262 bool IsCombinedParallelRegion) {
263 if (!S)
264 return;
265 for (const CapturedStmt::Capture &C : S->captures()) {
266 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
267 const ValueDecl *VD = C.getCapturedVar();
268 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
269 if (IsCombinedParallelRegion) {
270 // Check if the variable is privatized in the combined construct and
271 // those private copies must be shared in the inner parallel
272 // directive.
273 IsForCombinedParallelRegion = false;
274 for (const OMPClause *C : Clauses) {
275 if (!isOpenMPPrivate(Kind: C->getClauseKind()) ||
276 C->getClauseKind() == OMPC_reduction ||
277 C->getClauseKind() == OMPC_linear ||
278 C->getClauseKind() == OMPC_private)
279 continue;
280 ArrayRef<const Expr *> Vars;
281 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(Val: C))
282 Vars = PC->getVarRefs();
283 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(Val: C))
284 Vars = PC->getVarRefs();
285 else
286 llvm_unreachable("Unexpected clause.");
287 for (const auto *E : Vars) {
288 const Decl *D =
289 cast<DeclRefExpr>(Val: E)->getDecl()->getCanonicalDecl();
290 if (D == VD->getCanonicalDecl()) {
291 IsForCombinedParallelRegion = true;
292 break;
293 }
294 }
295 if (IsForCombinedParallelRegion)
296 break;
297 }
298 }
299 markAsEscaped(VD);
300 if (isa<OMPCapturedExprDecl>(Val: VD))
301 VisitValueDecl(VD);
302 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
303 }
304 }
305 }
306
307 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
308 assert(!GlobalizedRD &&
309 "Record for globalized variables is built already.");
310 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
311 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
312 if (IsInTTDRegion)
313 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
314 else
315 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
316 GlobalizedRD = ::buildRecordForGlobalizedVars(
317 C&: CGF.getContext(), EscapedDecls: EscapedDeclsForParallel, EscapedDeclsForTeams,
318 MappedDeclsFields, BufSize: WarpSize);
319 }
320
321public:
322 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
323 ArrayRef<const ValueDecl *> TeamsReductions)
324 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
325 }
326 virtual ~CheckVarsEscapingDeclContext() = default;
327 void VisitDeclStmt(const DeclStmt *S) {
328 if (!S)
329 return;
330 for (const Decl *D : S->decls())
331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(Val: D))
332 VisitValueDecl(VD);
333 }
334 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335 if (!D)
336 return;
337 if (!D->hasAssociatedStmt())
338 return;
339 if (const auto *S =
340 dyn_cast_or_null<CapturedStmt>(Val: D->getAssociatedStmt())) {
341 // Do not analyze directives that do not actually require capturing,
342 // like `omp for` or `omp simd` directives.
343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
344 getOpenMPCaptureRegions(CaptureRegions, DKind: D->getDirectiveKind());
345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346 VisitStmt(S: S->getCapturedStmt());
347 return;
348 }
349 VisitOpenMPCapturedStmt(
350 S, Clauses: D->clauses(),
351 IsCombinedParallelRegion: CaptureRegions.back() == OMPD_parallel &&
352 isOpenMPDistributeDirective(DKind: D->getDirectiveKind()));
353 }
354 }
355 void VisitCapturedStmt(const CapturedStmt *S) {
356 if (!S)
357 return;
358 for (const CapturedStmt::Capture &C : S->captures()) {
359 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360 const ValueDecl *VD = C.getCapturedVar();
361 markAsEscaped(VD);
362 if (isa<OMPCapturedExprDecl>(Val: VD))
363 VisitValueDecl(VD);
364 }
365 }
366 }
367 void VisitLambdaExpr(const LambdaExpr *E) {
368 if (!E)
369 return;
370 for (const LambdaCapture &C : E->captures()) {
371 if (C.capturesVariable()) {
372 if (C.getCaptureKind() == LCK_ByRef) {
373 const ValueDecl *VD = C.getCapturedVar();
374 markAsEscaped(VD);
375 if (E->isInitCapture(Capture: &C) || isa<OMPCapturedExprDecl>(Val: VD))
376 VisitValueDecl(VD);
377 }
378 }
379 }
380 }
381 void VisitBlockExpr(const BlockExpr *E) {
382 if (!E)
383 return;
384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385 if (C.isByRef()) {
386 const VarDecl *VD = C.getVariable();
387 markAsEscaped(VD);
388 if (isa<OMPCapturedExprDecl>(Val: VD) || VD->isInitCapture())
389 VisitValueDecl(VD);
390 }
391 }
392 }
393 void VisitCallExpr(const CallExpr *E) {
394 if (!E)
395 return;
396 for (const Expr *Arg : E->arguments()) {
397 if (!Arg)
398 continue;
399 if (Arg->isLValue()) {
400 const bool SavedAllEscaped = AllEscaped;
401 AllEscaped = true;
402 Visit(S: Arg);
403 AllEscaped = SavedAllEscaped;
404 } else {
405 Visit(S: Arg);
406 }
407 }
408 Visit(S: E->getCallee());
409 }
410 void VisitDeclRefExpr(const DeclRefExpr *E) {
411 if (!E)
412 return;
413 const ValueDecl *VD = E->getDecl();
414 if (AllEscaped)
415 markAsEscaped(VD);
416 if (isa<OMPCapturedExprDecl>(Val: VD))
417 VisitValueDecl(VD);
418 else if (VD->isInitCapture())
419 VisitValueDecl(VD);
420 }
421 void VisitUnaryOperator(const UnaryOperator *E) {
422 if (!E)
423 return;
424 if (E->getOpcode() == UO_AddrOf) {
425 const bool SavedAllEscaped = AllEscaped;
426 AllEscaped = true;
427 Visit(S: E->getSubExpr());
428 AllEscaped = SavedAllEscaped;
429 } else {
430 Visit(S: E->getSubExpr());
431 }
432 }
433 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434 if (!E)
435 return;
436 if (E->getCastKind() == CK_ArrayToPointerDecay) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(S: E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(S: E->getSubExpr());
443 }
444 }
445 void VisitExpr(const Expr *E) {
446 if (!E)
447 return;
448 bool SavedAllEscaped = AllEscaped;
449 if (!E->isLValue())
450 AllEscaped = false;
451 for (const Stmt *Child : E->children())
452 if (Child)
453 Visit(S: Child);
454 AllEscaped = SavedAllEscaped;
455 }
456 void VisitStmt(const Stmt *S) {
457 if (!S)
458 return;
459 for (const Stmt *Child : S->children())
460 if (Child)
461 Visit(S: Child);
462 }
463
464 /// Returns the record that handles all the escaped local variables and used
465 /// instead of their original storage.
466 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467 if (!GlobalizedRD)
468 buildRecordForGlobalizedVars(IsInTTDRegion);
469 return GlobalizedRD;
470 }
471
472 /// Returns the field in the globalized record for the escaped variable.
473 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474 assert(GlobalizedRD &&
475 "Record for globalized variables must be generated already.");
476 return MappedDeclsFields.lookup(Val: VD);
477 }
478
479 /// Returns the list of the escaped local variables/parameters.
480 ArrayRef<const ValueDecl *> getEscapedDecls() const {
481 return EscapedDecls.getArrayRef();
482 }
483
484 /// Checks if the escaped local variable is actually a parameter passed by
485 /// value.
486 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487 return EscapedParameters;
488 }
489
490 /// Returns the list of the escaped variables with the variably modified
491 /// types.
492 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493 return EscapedVariableLengthDecls.getArrayRef();
494 }
495
496 /// Returns the list of the delayed variables with the variably modified
497 /// types.
498 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499 return DelayedVariableLengthDecls.getArrayRef();
500 }
501};
502} // anonymous namespace
503
504CGOpenMPRuntimeGPU::ExecutionMode
505CGOpenMPRuntimeGPU::getExecutionMode() const {
506 return CurrentExecutionMode;
507}
508
509CGOpenMPRuntimeGPU::DataSharingMode
510CGOpenMPRuntimeGPU::getDataSharingMode() const {
511 return CurrentDataSharingMode;
512}
513
514/// Check for inner (nested) SPMD construct, if any
515static bool hasNestedSPMDDirective(ASTContext &Ctx,
516 const OMPExecutableDirective &D) {
517 const auto *CS = D.getInnermostCapturedStmt();
518 const auto *Body =
519 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
520 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
521
522 if (const auto *NestedDir =
523 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
524 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
525 switch (D.getDirectiveKind()) {
526 case OMPD_target:
527 if (isOpenMPParallelDirective(DKind))
528 return true;
529 if (DKind == OMPD_teams) {
530 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
531 /*IgnoreCaptured=*/true);
532 if (!Body)
533 return false;
534 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
535 if (const auto *NND =
536 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
537 DKind = NND->getDirectiveKind();
538 if (isOpenMPParallelDirective(DKind))
539 return true;
540 }
541 }
542 return false;
543 case OMPD_target_teams:
544 return isOpenMPParallelDirective(DKind);
545 case OMPD_target_simd:
546 case OMPD_target_parallel:
547 case OMPD_target_parallel_for:
548 case OMPD_target_parallel_for_simd:
549 case OMPD_target_teams_distribute:
550 case OMPD_target_teams_distribute_simd:
551 case OMPD_target_teams_distribute_parallel_for:
552 case OMPD_target_teams_distribute_parallel_for_simd:
553 case OMPD_parallel:
554 case OMPD_for:
555 case OMPD_parallel_for:
556 case OMPD_parallel_master:
557 case OMPD_parallel_sections:
558 case OMPD_for_simd:
559 case OMPD_parallel_for_simd:
560 case OMPD_cancel:
561 case OMPD_cancellation_point:
562 case OMPD_ordered:
563 case OMPD_threadprivate:
564 case OMPD_allocate:
565 case OMPD_task:
566 case OMPD_simd:
567 case OMPD_sections:
568 case OMPD_section:
569 case OMPD_single:
570 case OMPD_master:
571 case OMPD_critical:
572 case OMPD_taskyield:
573 case OMPD_barrier:
574 case OMPD_taskwait:
575 case OMPD_taskgroup:
576 case OMPD_atomic:
577 case OMPD_flush:
578 case OMPD_depobj:
579 case OMPD_scan:
580 case OMPD_teams:
581 case OMPD_target_data:
582 case OMPD_target_exit_data:
583 case OMPD_target_enter_data:
584 case OMPD_distribute:
585 case OMPD_distribute_simd:
586 case OMPD_distribute_parallel_for:
587 case OMPD_distribute_parallel_for_simd:
588 case OMPD_teams_distribute:
589 case OMPD_teams_distribute_simd:
590 case OMPD_teams_distribute_parallel_for:
591 case OMPD_teams_distribute_parallel_for_simd:
592 case OMPD_target_update:
593 case OMPD_declare_simd:
594 case OMPD_declare_variant:
595 case OMPD_begin_declare_variant:
596 case OMPD_end_declare_variant:
597 case OMPD_declare_target:
598 case OMPD_end_declare_target:
599 case OMPD_declare_reduction:
600 case OMPD_declare_mapper:
601 case OMPD_taskloop:
602 case OMPD_taskloop_simd:
603 case OMPD_master_taskloop:
604 case OMPD_master_taskloop_simd:
605 case OMPD_parallel_master_taskloop:
606 case OMPD_parallel_master_taskloop_simd:
607 case OMPD_requires:
608 case OMPD_unknown:
609 default:
610 llvm_unreachable("Unexpected directive.");
611 }
612 }
613
614 return false;
615}
616
617static bool supportsSPMDExecutionMode(ASTContext &Ctx,
618 const OMPExecutableDirective &D) {
619 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
620 switch (DirectiveKind) {
621 case OMPD_target:
622 case OMPD_target_teams:
623 return hasNestedSPMDDirective(Ctx, D);
624 case OMPD_target_parallel_loop:
625 case OMPD_target_parallel:
626 case OMPD_target_parallel_for:
627 case OMPD_target_parallel_for_simd:
628 case OMPD_target_teams_distribute_parallel_for:
629 case OMPD_target_teams_distribute_parallel_for_simd:
630 case OMPD_target_simd:
631 case OMPD_target_teams_distribute_simd:
632 return true;
633 case OMPD_target_teams_distribute:
634 return false;
635 case OMPD_target_teams_loop:
636 // Whether this is true or not depends on how the directive will
637 // eventually be emitted.
638 if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(Val: &D))
639 return TTLD->canBeParallelFor();
640 return false;
641 case OMPD_parallel:
642 case OMPD_for:
643 case OMPD_parallel_for:
644 case OMPD_parallel_master:
645 case OMPD_parallel_sections:
646 case OMPD_for_simd:
647 case OMPD_parallel_for_simd:
648 case OMPD_cancel:
649 case OMPD_cancellation_point:
650 case OMPD_ordered:
651 case OMPD_threadprivate:
652 case OMPD_allocate:
653 case OMPD_task:
654 case OMPD_simd:
655 case OMPD_sections:
656 case OMPD_section:
657 case OMPD_single:
658 case OMPD_master:
659 case OMPD_critical:
660 case OMPD_taskyield:
661 case OMPD_barrier:
662 case OMPD_taskwait:
663 case OMPD_taskgroup:
664 case OMPD_atomic:
665 case OMPD_flush:
666 case OMPD_depobj:
667 case OMPD_scan:
668 case OMPD_teams:
669 case OMPD_target_data:
670 case OMPD_target_exit_data:
671 case OMPD_target_enter_data:
672 case OMPD_distribute:
673 case OMPD_distribute_simd:
674 case OMPD_distribute_parallel_for:
675 case OMPD_distribute_parallel_for_simd:
676 case OMPD_teams_distribute:
677 case OMPD_teams_distribute_simd:
678 case OMPD_teams_distribute_parallel_for:
679 case OMPD_teams_distribute_parallel_for_simd:
680 case OMPD_target_update:
681 case OMPD_declare_simd:
682 case OMPD_declare_variant:
683 case OMPD_begin_declare_variant:
684 case OMPD_end_declare_variant:
685 case OMPD_declare_target:
686 case OMPD_end_declare_target:
687 case OMPD_declare_reduction:
688 case OMPD_declare_mapper:
689 case OMPD_taskloop:
690 case OMPD_taskloop_simd:
691 case OMPD_master_taskloop:
692 case OMPD_master_taskloop_simd:
693 case OMPD_parallel_master_taskloop:
694 case OMPD_parallel_master_taskloop_simd:
695 case OMPD_requires:
696 case OMPD_unknown:
697 default:
698 break;
699 }
700 llvm_unreachable(
701 "Unknown programming model for OpenMP directive on NVPTX target.");
702}
703
704void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
705 StringRef ParentName,
706 llvm::Function *&OutlinedFn,
707 llvm::Constant *&OutlinedFnID,
708 bool IsOffloadEntry,
709 const RegionCodeGenTy &CodeGen) {
710 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
711 EntryFunctionState EST;
712 WrapperFunctionsMap.clear();
713
714 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
715 assert(!IsBareKernel && "bare kernel should not be at generic mode");
716
717 // Emit target region as a standalone region.
718 class NVPTXPrePostActionTy : public PrePostActionTy {
719 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
720 const OMPExecutableDirective &D;
721
722 public:
723 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
724 const OMPExecutableDirective &D)
725 : EST(EST), D(D) {}
726 void Enter(CodeGenFunction &CGF) override {
727 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
728 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
729 // Skip target region initialization.
730 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
731 }
732 void Exit(CodeGenFunction &CGF) override {
733 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
734 RT.clearLocThreadIdInsertPt(CGF);
735 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
736 }
737 } Action(EST, D);
738 CodeGen.setAction(Action);
739 IsInTTDRegion = true;
740 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
741 IsOffloadEntry, CodeGen);
742 IsInTTDRegion = false;
743}
744
745void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
746 CodeGenFunction &CGF,
747 EntryFunctionState &EST, bool IsSPMD) {
748 int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
749 MaxTeamsVal = -1;
750 computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
751 MinTeamsVal, MaxTeamsVal);
752
753 CGBuilderTy &Bld = CGF.Builder;
754 Bld.restoreIP(IP: OMPBuilder.createTargetInit(
755 Loc: Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
756 if (!IsSPMD)
757 emitGenericVarsProlog(CGF, Loc: EST.Loc);
758}
759
760void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
761 EntryFunctionState &EST,
762 bool IsSPMD) {
763 if (!IsSPMD)
764 emitGenericVarsEpilog(CGF);
765
766 // This is temporary until we remove the fixed sized buffer.
767 ASTContext &C = CGM.getContext();
768 RecordDecl *StaticRD = C.buildImplicitRecord(
769 Name: "_openmp_teams_reduction_type_$_", TK: RecordDecl::TagKind::Union);
770 StaticRD->startDefinition();
771 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
772 QualType RecTy = C.getRecordType(Decl: TeamReductionRec);
773 auto *Field = FieldDecl::Create(
774 C, DC: StaticRD, StartLoc: SourceLocation(), IdLoc: SourceLocation(), Id: nullptr, T: RecTy,
775 TInfo: C.getTrivialTypeSourceInfo(T: RecTy, Loc: SourceLocation()),
776 /*BW=*/nullptr, /*Mutable=*/false,
777 /*InitStyle=*/ICIS_NoInit);
778 Field->setAccess(AS_public);
779 StaticRD->addDecl(D: Field);
780 }
781 StaticRD->completeDefinition();
782 QualType StaticTy = C.getRecordType(Decl: StaticRD);
783 llvm::Type *LLVMReductionsBufferTy =
784 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
785 const auto &DL = CGM.getModule().getDataLayout();
786 uint64_t ReductionDataSize =
787 TeamsReductions.empty()
788 ? 0
789 : DL.getTypeAllocSize(Ty: LLVMReductionsBufferTy).getFixedValue();
790 CGBuilderTy &Bld = CGF.Builder;
791 OMPBuilder.createTargetDeinit(Loc: Bld, TeamsReductionDataSize: ReductionDataSize,
792 TeamsReductionBufferLength: C.getLangOpts().OpenMPCUDAReductionBufNum);
793 TeamsReductions.clear();
794}
795
796void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
797 StringRef ParentName,
798 llvm::Function *&OutlinedFn,
799 llvm::Constant *&OutlinedFnID,
800 bool IsOffloadEntry,
801 const RegionCodeGenTy &CodeGen) {
802 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
803 EntryFunctionState EST;
804
805 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
806
807 // Emit target region as a standalone region.
808 class NVPTXPrePostActionTy : public PrePostActionTy {
809 CGOpenMPRuntimeGPU &RT;
810 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
811 bool IsBareKernel;
812 DataSharingMode Mode;
813 const OMPExecutableDirective &D;
814
815 public:
816 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
817 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
818 bool IsBareKernel, const OMPExecutableDirective &D)
819 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
820 Mode(RT.CurrentDataSharingMode), D(D) {}
821 void Enter(CodeGenFunction &CGF) override {
822 if (IsBareKernel) {
823 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
824 return;
825 }
826 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
827 // Skip target region initialization.
828 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
829 }
830 void Exit(CodeGenFunction &CGF) override {
831 if (IsBareKernel) {
832 RT.CurrentDataSharingMode = Mode;
833 return;
834 }
835 RT.clearLocThreadIdInsertPt(CGF);
836 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
837 }
838 } Action(*this, EST, IsBareKernel, D);
839 CodeGen.setAction(Action);
840 IsInTTDRegion = true;
841 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
842 IsOffloadEntry, CodeGen);
843 IsInTTDRegion = false;
844}
845
846void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
847 const OMPExecutableDirective &D, StringRef ParentName,
848 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
849 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
850 if (!IsOffloadEntry) // Nothing to do.
851 return;
852
853 assert(!ParentName.empty() && "Invalid target region parent name!");
854
855 bool Mode = supportsSPMDExecutionMode(Ctx&: CGM.getContext(), D);
856 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
857 if (Mode || IsBareKernel)
858 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
859 CodeGen);
860 else
861 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
862 CodeGen);
863}
864
865CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
866 : CGOpenMPRuntime(CGM) {
867 llvm::OpenMPIRBuilderConfig Config(
868 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
869 CGM.getLangOpts().OpenMPOffloadMandatory,
870 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
871 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
872 OMPBuilder.setConfig(Config);
873
874 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
875 llvm_unreachable("OpenMP can only handle device code.");
876
877 if (CGM.getLangOpts().OpenMPCUDAMode)
878 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
879
880 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
881 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
882 return;
883
884 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTargetDebug,
885 Name: "__omp_rtl_debug_kind");
886 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTeamSubscription,
887 Name: "__omp_rtl_assume_teams_oversubscription");
888 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPThreadSubscription,
889 Name: "__omp_rtl_assume_threads_oversubscription");
890 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoThreadState,
891 Name: "__omp_rtl_assume_no_thread_state");
892 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoNestedParallelism,
893 Name: "__omp_rtl_assume_no_nested_parallelism");
894}
895
void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
                                            ProcBindKind ProcBind,
                                            SourceLocation Loc) {
  // Nothing to do: no code is emitted for proc_bind on the GPU device.
}
901
void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
                                              llvm::Value *NumThreads,
                                              SourceLocation Loc) {
  // Nothing to do here: on the GPU the num_threads value is passed straight
  // into the parallel region emission (see emitParallelCall's NumThreads
  // parameter) instead of a separate runtime call.
}
907
void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                            const Expr *NumTeams,
                                            const Expr *ThreadLimit,
                                            SourceLocation Loc) {
  // Nothing to do: no code is emitted for num_teams/thread_limit here.
}
912
913llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
914 CodeGenFunction &CGF, const OMPExecutableDirective &D,
915 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
916 const RegionCodeGenTy &CodeGen) {
917 // Emit target region as a standalone region.
918 bool PrevIsInTTDRegion = IsInTTDRegion;
919 IsInTTDRegion = false;
920 auto *OutlinedFun =
921 cast<llvm::Function>(Val: CGOpenMPRuntime::emitParallelOutlinedFunction(
922 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
923 IsInTTDRegion = PrevIsInTTDRegion;
924 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
925 llvm::Function *WrapperFun =
926 createParallelDataSharingWrapper(OutlinedParallelFn: OutlinedFun, D);
927 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
928 }
929
930 return OutlinedFun;
931}
932
933/// Get list of lastprivate variables from the teams distribute ... or
934/// teams {distribute ...} directives.
935static void
936getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
937 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
938 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
939 "expected teams directive.");
940 const OMPExecutableDirective *Dir = &D;
941 if (!isOpenMPDistributeDirective(DKind: D.getDirectiveKind())) {
942 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
943 Ctx,
944 Body: D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
945 /*IgnoreCaptured=*/true))) {
946 Dir = dyn_cast_or_null<OMPExecutableDirective>(Val: S);
947 if (Dir && !isOpenMPDistributeDirective(DKind: Dir->getDirectiveKind()))
948 Dir = nullptr;
949 }
950 }
951 if (!Dir)
952 return;
953 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
954 for (const Expr *E : C->getVarRefs())
955 Vars.push_back(Elt: getPrivateItem(RefExpr: E));
956 }
957}
958
959/// Get list of reduction variables from the teams ... directives.
960static void
961getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
962 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
963 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
964 "expected teams directive.");
965 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
966 for (const Expr *E : C->privates())
967 Vars.push_back(Elt: getPrivateItem(RefExpr: E));
968 }
969}
970
/// Outline a 'teams' region for the GPU.
///
/// In SPMD mode, lastprivate variables of the enclosed distribute construct
/// are gathered and a record type is built for their globalization. In
/// generic mode, the teams reduction variables are stashed in
/// TeamAndReductions for later processing. A pre/post action wraps the
/// outlined body with the generic-mode variable prolog/epilog.
llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    CodeGenFunction &CGF, const OMPExecutableDirective &D,
    const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
    const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
  // Globalize team reductions variable unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    getDistributeLastprivateVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      // Build a record with one field per globalized variable; the mapping
      // from each variable to its field is recorded in MappedDeclsFields.
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          C&: CGM.getContext(), EscapedDecls: std::nullopt, EscapedDeclsForTeams: LastPrivatesReductions,
          MappedDeclsFields, BufSize: WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    // Generic mode: remember the captured teams declaration and its reduction
    // variables for later globalization.
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(RegionKind: OMPD_teams)->getCapturedDecl();
    std::swap(LHS&: TeamAndReductions.second, RHS&: LastPrivatesReductions);
  }

  // Emit target region as a standalone region.
  // The Enter hook registers the globalized variables (if any) and emits the
  // generic-mode prolog; the Exit hook emits the matching epilog.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        // Seed the per-function globalized-variable table with an entry for
        // every mapped declaration.
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(KV: std::make_pair(x: Pair.getFirst(), y: MappedVarData()));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      CGF, D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}
1039
/// Emit the generic-mode variable prolog: for every declaration recorded in
/// FunctionGlobalizedDecls for the current function, allocate device-shared
/// storage through __kmpc_alloc_shared and redirect the variable to it.
/// Escaped parameters have their original value copied into the new storage.
void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc) {
  // Globalization only applies in generic data-sharing mode.
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;

  for (auto &Rec : I->getSecond().LocalVarData) {
    const auto *VD = cast<VarDecl>(Val: Rec.first);
    bool EscapedParam = I->getSecond().EscapedParameters.count(Ptr: Rec.first);
    QualType VarTy = VD->getType();

    // Get the local allocation of a firstprivate variable before sharing
    llvm::Value *ParValue;
    if (EscapedParam) {
      LValue ParLVal =
          CGF.MakeAddrLValue(Addr: CGF.GetAddrOfLocalVar(VD), T: VD->getType());
      ParValue = CGF.EmitLoadOfScalar(lvalue: ParLVal, Loc);
    }

    // Allocate space for the variable to be globalized
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(Ty: VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
                                M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared),
                            args: AllocArgs, name: VD->getName());
    // FIXME: We should use the variables actual alignment as an argument.
    VoidPtr->addRetAttr(Attr: llvm::Attribute::get(
        Context&: CGM.getLLVMContext(), Kind: llvm::Attribute::Alignment,
        Val: CGM.getContext().getTargetInfo().getNewAlign() / 8));

    // Cast the void pointer and get the address of the globalized variable.
    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(T: VarTy)->getPointerTo();
    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        V: VoidPtr, DestTy: VarPtrTy, Name: VD->getName() + "_on_stack");
    LValue VarAddr =
        CGF.MakeNaturalAlignPointeeRawAddrLValue(V: CastedVoidPtr, T: VarTy);
    // Record both addresses: the typed one for subsequent uses of the
    // variable and the raw one for the matching __kmpc_free_shared.
    Rec.second.PrivateAddr = VarAddr.getAddress();
    Rec.second.GlobalizedVal = VoidPtr;

    // Assign the local allocation to the newly globalized location.
    if (EscapedParam) {
      CGF.EmitStoreOfScalar(value: ParValue, lvalue: VarAddr);
      I->getSecond().MappedParams->setVarAddr(CGF, LocalVD: VD, TempAddr: VarAddr.getAddress());
    }
    if (auto *DI = CGF.getDebugInfo())
      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(Loc: VD->getLocation()));
  }

  // Globalize escaped variable-length declarations; their size is only known
  // at runtime, so the allocation goes through getKmpcAllocShared.
  for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
    const auto *VD = cast<VarDecl>(Val: ValueD);
    std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
        getKmpcAllocShared(CGF, VD);
    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(Args&: AddrSizePair);
    LValue Base = CGF.MakeAddrLValue(V: AddrSizePair.first, T: VD->getType(),
                                     Alignment: CGM.getContext().getDeclAlign(D: VD),
                                     Source: AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, LocalVD: VD, TempAddr: Base.getAddress());
  }
  // Activate all the address remappings recorded above.
  I->getSecond().MappedParams->apply(CGF);
}
1105
1106bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
1107 const VarDecl *VD) const {
1108 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1109 if (I == FunctionGlobalizedDecls.end())
1110 return false;
1111
1112 // Check variable declaration is delayed:
1113 return llvm::is_contained(Range: I->getSecond().DelayedVariableLengthDecls, Element: VD);
1114}
1115
1116std::pair<llvm::Value *, llvm::Value *>
1117CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
1118 const VarDecl *VD) {
1119 CGBuilderTy &Bld = CGF.Builder;
1120
1121 // Compute size and alignment.
1122 llvm::Value *Size = CGF.getTypeSize(Ty: VD->getType());
1123 CharUnits Align = CGM.getContext().getDeclAlign(D: VD);
1124 Size = Bld.CreateNUWAdd(
1125 LHS: Size, RHS: llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity() - 1));
1126 llvm::Value *AlignVal =
1127 llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity());
1128 Size = Bld.CreateUDiv(LHS: Size, RHS: AlignVal);
1129 Size = Bld.CreateNUWMul(LHS: Size, RHS: AlignVal);
1130
1131 // Allocate space for this VLA object to be globalized.
1132 llvm::Value *AllocArgs[] = {Size};
1133 llvm::CallBase *VoidPtr =
1134 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1135 M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared),
1136 args: AllocArgs, name: VD->getName());
1137 VoidPtr->addRetAttr(Attr: llvm::Attribute::get(
1138 Context&: CGM.getLLVMContext(), Kind: llvm::Attribute::Alignment, Val: Align.getQuantity()));
1139
1140 return std::make_pair(x&: VoidPtr, y&: Size);
1141}
1142
1143void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1144 CodeGenFunction &CGF,
1145 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1146 // Deallocate the memory for each globalized VLA object
1147 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1148 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1149 args: {AddrSizePair.first, AddrSizePair.second});
1150}
1151
1152void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1153 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1154 return;
1155
1156 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1157 if (I != FunctionGlobalizedDecls.end()) {
1158 // Deallocate the memory for each globalized VLA object that was
1159 // globalized in the prolog (i.e. emitGenericVarsProlog).
1160 for (const auto &AddrSizePair :
1161 llvm::reverse(C&: I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1162 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1163 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1164 args: {AddrSizePair.first, AddrSizePair.second});
1165 }
1166 // Deallocate the memory for each globalized value
1167 for (auto &Rec : llvm::reverse(C&: I->getSecond().LocalVarData)) {
1168 const auto *VD = cast<VarDecl>(Val: Rec.first);
1169 I->getSecond().MappedParams->restore(CGF);
1170
1171 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1172 CGF.getTypeSize(Ty: VD->getType())};
1173 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1174 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1175 args: FreeArgs);
1176 }
1177 }
1178}
1179
1180void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1181 const OMPExecutableDirective &D,
1182 SourceLocation Loc,
1183 llvm::Function *OutlinedFn,
1184 ArrayRef<llvm::Value *> CapturedVars) {
1185 if (!CGF.HaveInsertPoint())
1186 return;
1187
1188 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1189
1190 RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
1191 /*Name=*/".zero.addr");
1192 CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
1193 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1194 // We don't emit any thread id function call in bare kernel, but because the
1195 // outlined function has a pointer argument, we emit a nullptr here.
1196 if (IsBareKernel)
1197 OutlinedFnArgs.push_back(Elt: llvm::ConstantPointerNull::get(T: CGM.VoidPtrTy));
1198 else
1199 OutlinedFnArgs.push_back(Elt: emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1200 OutlinedFnArgs.push_back(Elt: ZeroAddr.getPointer());
1201 OutlinedFnArgs.append(in_start: CapturedVars.begin(), in_end: CapturedVars.end());
1202 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: OutlinedFnArgs);
1203}
1204
1205void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1206 SourceLocation Loc,
1207 llvm::Function *OutlinedFn,
1208 ArrayRef<llvm::Value *> CapturedVars,
1209 const Expr *IfCond,
1210 llvm::Value *NumThreads) {
1211 if (!CGF.HaveInsertPoint())
1212 return;
1213
1214 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1215 NumThreads](CodeGenFunction &CGF,
1216 PrePostActionTy &Action) {
1217 CGBuilderTy &Bld = CGF.Builder;
1218 llvm::Value *NumThreadsVal = NumThreads;
1219 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1220 llvm::Value *ID = llvm::ConstantPointerNull::get(T: CGM.Int8PtrTy);
1221 if (WFn)
1222 ID = Bld.CreateBitOrPointerCast(V: WFn, DestTy: CGM.Int8PtrTy);
1223 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(V: OutlinedFn, DestTy: CGM.Int8PtrTy);
1224
1225 // Create a private scope that will globalize the arguments
1226 // passed from the outside of the target region.
1227 // TODO: Is that needed?
1228 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1229
1230 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1231 Ty: llvm::ArrayType::get(ElementType: CGM.VoidPtrTy, NumElements: CapturedVars.size()),
1232 Name: "captured_vars_addrs");
1233 // There's something to share.
1234 if (!CapturedVars.empty()) {
1235 // Prepare for parallel region. Indicate the outlined function.
1236 ASTContext &Ctx = CGF.getContext();
1237 unsigned Idx = 0;
1238 for (llvm::Value *V : CapturedVars) {
1239 Address Dst = Bld.CreateConstArrayGEP(Addr: CapturedVarsAddrs, Index: Idx);
1240 llvm::Value *PtrV;
1241 if (V->getType()->isIntegerTy())
1242 PtrV = Bld.CreateIntToPtr(V, DestTy: CGF.VoidPtrTy);
1243 else
1244 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, DestTy: CGF.VoidPtrTy);
1245 CGF.EmitStoreOfScalar(Value: PtrV, Addr: Dst, /*Volatile=*/false,
1246 Ty: Ctx.getPointerType(T: Ctx.VoidPtrTy));
1247 ++Idx;
1248 }
1249 }
1250
1251 llvm::Value *IfCondVal = nullptr;
1252 if (IfCond)
1253 IfCondVal = Bld.CreateIntCast(V: CGF.EvaluateExprAsBool(E: IfCond), DestTy: CGF.Int32Ty,
1254 /* isSigned */ false);
1255 else
1256 IfCondVal = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: 1);
1257
1258 if (!NumThreadsVal)
1259 NumThreadsVal = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: -1);
1260 else
1261 NumThreadsVal = Bld.CreateZExtOrTrunc(V: NumThreadsVal, DestTy: CGF.Int32Ty),
1262
1263 assert(IfCondVal && "Expected a value");
1264 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1265 llvm::Value *Args[] = {
1266 RTLoc,
1267 getThreadID(CGF, Loc),
1268 IfCondVal,
1269 NumThreadsVal,
1270 llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: -1),
1271 FnPtr,
1272 ID,
1273 Bld.CreateBitOrPointerCast(V: CapturedVarsAddrs.emitRawPointer(CGF),
1274 DestTy: CGF.VoidPtrPtrTy),
1275 llvm::ConstantInt::get(Ty: CGM.SizeTy, V: CapturedVars.size())};
1276 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1277 M&: CGM.getModule(), FnID: OMPRTL___kmpc_parallel_51),
1278 args: Args);
1279 };
1280
1281 RegionCodeGenTy RCG(ParallelGen);
1282 RCG(CGF);
1283}
1284
1285void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1286 // Always emit simple barriers!
1287 if (!CGF.HaveInsertPoint())
1288 return;
1289 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1290 // This function does not use parameters, so we can emit just default values.
1291 llvm::Value *Args[] = {
1292 llvm::ConstantPointerNull::get(
1293 T: cast<llvm::PointerType>(Val: getIdentTyPointerTy())),
1294 llvm::ConstantInt::get(Ty: CGF.Int32Ty, /*V=*/0, /*isSigned=*/IsSigned: true)};
1295 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1296 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier_simple_spmd),
1297 args: Args);
1298}
1299
1300void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1301 SourceLocation Loc,
1302 OpenMPDirectiveKind Kind, bool,
1303 bool) {
1304 // Always emit simple barriers!
1305 if (!CGF.HaveInsertPoint())
1306 return;
1307 // Build call __kmpc_cancel_barrier(loc, thread_id);
1308 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1309 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1310 getThreadID(CGF, Loc)};
1311
1312 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1313 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier),
1314 args: Args);
1315}
1316
/// Emit a critical region on the GPU by serializing the threads of the team:
/// every thread iterates a shared-shape counter loop from 0 to the team
/// width, and only the thread whose id equals the current counter value
/// executes the region body on that iteration; the rest proceed straight to
/// the synchronization point.
void CGOpenMPRuntimeGPU::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock(name: "omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock(name: "omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock(name: "omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock(name: "omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: "omp.critical.exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Get the mask of active threads in the warp.
  llvm::Value *Mask = CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
      M&: CGM.getModule(), FnID: OMPRTL___kmpc_warp_active_thread_mask));
  // Fetch team-local id of the thread.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // Get the width of the team.
  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Initialize the counter variable for the loop.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(T: Int32Ty, Name: "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Addr: Counter, T: Int32Ty);
  CGF.EmitStoreOfScalar(value: llvm::Constant::getNullValue(Ty: CGM.Int32Ty), lvalue: CounterLVal,
                        /*isInit=*/true);

  // Block checks if loop counter exceeds upper bound.
  CGF.EmitBlock(BB: LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(LHS: CounterVal, RHS: TeamWidth);
  CGF.Builder.CreateCondBr(Cond: CmpLoopBound, True: TestBB, False: ExitBB);

  // Block tests which single thread should execute region, and which threads
  // should go straight to synchronisation point.
  CGF.EmitBlock(BB: TestBB);
  CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(LHS: ThreadID, RHS: CounterVal);
  CGF.Builder.CreateCondBr(Cond: CmpThreadToCounter, True: BodyBB, False: SyncBB);

  // Block emits the body of the critical region.
  CGF.EmitBlock(BB: BodyBB);

  // Output the critical statement.
  // The base-class implementation emits the actual named critical region.
  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
                                      Hint);

  // After the body surrounded by the critical region, the single executing
  // thread will jump to the synchronisation point.
  // Block waits for all threads in current team to finish then increments the
  // counter variable and returns to the loop.
  CGF.EmitBlock(BB: SyncBB);
  // Reconverge active threads in the warp.
  (void)CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
                                M&: CGM.getModule(), FnID: OMPRTL___kmpc_syncwarp),
                            args: Mask);

  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(LHS: CounterVal, RHS: CGF.Builder.getInt32(C: 1));
  CGF.EmitStoreOfScalar(value: IncCounterVal, lvalue: CounterLVal);
  CGF.EmitBranch(Block: LoopBB);

  // Block that is reached when all threads in the team complete the region.
  CGF.EmitBlock(BB: ExitBB, /*IsFinished=*/true);
}
1385
1386/// Cast value to the specified type.
1387static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1388 QualType ValTy, QualType CastTy,
1389 SourceLocation Loc) {
1390 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1391 "Cast type must sized.");
1392 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1393 "Val type must sized.");
1394 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(T: CastTy);
1395 if (ValTy == CastTy)
1396 return Val;
1397 if (CGF.getContext().getTypeSizeInChars(T: ValTy) ==
1398 CGF.getContext().getTypeSizeInChars(T: CastTy))
1399 return CGF.Builder.CreateBitCast(V: Val, DestTy: LLVMCastTy);
1400 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1401 return CGF.Builder.CreateIntCast(V: Val, DestTy: LLVMCastTy,
1402 isSigned: CastTy->hasSignedIntegerRepresentation());
1403 Address CastItem = CGF.CreateMemTemp(T: CastTy);
1404 Address ValCastItem = CastItem.withElementType(ElemTy: Val->getType());
1405 CGF.EmitStoreOfScalar(Value: Val, Addr: ValCastItem, /*Volatile=*/false, Ty: ValTy,
1406 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1407 TBAAInfo: TBAAAccessInfo());
1408 return CGF.EmitLoadOfScalar(Addr: CastItem, /*Volatile=*/false, Ty: CastTy, Loc,
1409 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1410 TBAAInfo: TBAAAccessInfo());
1411}
1412
1413///
1414/// Design of OpenMP reductions on the GPU
1415///
1416/// Consider a typical OpenMP program with one or more reduction
1417/// clauses:
1418///
1419/// float foo;
1420/// double bar;
1421/// #pragma omp target teams distribute parallel for \
1422/// reduction(+:foo) reduction(*:bar)
1423/// for (int i = 0; i < N; i++) {
1424/// foo += A[i]; bar *= B[i];
1425/// }
1426///
1427/// where 'foo' and 'bar' are reduced across all OpenMP threads in
1428/// all teams. In our OpenMP implementation on the NVPTX device an
1429/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1430/// within a team are mapped to CUDA threads within a threadblock.
1431/// Our goal is to efficiently aggregate values across all OpenMP
1432/// threads such that:
1433///
1434/// - the compiler and runtime are logically concise, and
1435/// - the reduction is performed efficiently in a hierarchical
1436/// manner as follows: within OpenMP threads in the same warp,
1437/// across warps in a threadblock, and finally across teams on
1438/// the NVPTX device.
1439///
1440/// Introduction to Decoupling
1441///
1442/// We would like to decouple the compiler and the runtime so that the
1443/// latter is ignorant of the reduction variables (number, data types)
1444/// and the reduction operators. This allows a simpler interface
1445/// and implementation while still attaining good performance.
1446///
1447/// Pseudocode for the aforementioned OpenMP program generated by the
1448/// compiler is as follows:
1449///
1450/// 1. Create private copies of reduction variables on each OpenMP
1451/// thread: 'foo_private', 'bar_private'
1452/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1453/// to it and writes the result in 'foo_private' and 'bar_private'
1454/// respectively.
1455/// 3. Call the OpenMP runtime on the GPU to reduce within a team
1456/// and store the result on the team master:
1457///
1458/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1459/// reduceData, shuffleReduceFn, interWarpCpyFn)
1460///
1461/// where:
1462/// struct ReduceData {
1463/// double *foo;
1464/// double *bar;
1465/// } reduceData
1466/// reduceData.foo = &foo_private
1467/// reduceData.bar = &bar_private
1468///
1469/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1470/// auxiliary functions generated by the compiler that operate on
/// variables of type 'ReduceData'. They aid the runtime in performing
/// algorithmic steps in a data-agnostic manner.
1473///
1474/// 'shuffleReduceFn' is a pointer to a function that reduces data
1475/// of type 'ReduceData' across two OpenMP threads (lanes) in the
1476/// same warp. It takes the following arguments as input:
1477///
1478/// a. variable of type 'ReduceData' on the calling lane,
1479/// b. its lane_id,
1480/// c. an offset relative to the current lane_id to generate a
1481/// remote_lane_id. The remote lane contains the second
1482/// variable of type 'ReduceData' that is to be reduced.
1483/// d. an algorithm version parameter determining which reduction
1484/// algorithm to use.
1485///
1486/// 'shuffleReduceFn' retrieves data from the remote lane using
1487/// efficient GPU shuffle intrinsics and reduces, using the
1488/// algorithm specified by the 4th parameter, the two operands
1489/// element-wise. The result is written to the first operand.
1490///
1491/// Different reduction algorithms are implemented in different
1492/// runtime functions, all calling 'shuffleReduceFn' to perform
1493/// the essential reduction step. Therefore, based on the 4th
1494/// parameter, this function behaves slightly differently to
1495/// cooperate with the runtime to ensure correctness under
1496/// different circumstances.
1497///
1498/// 'InterWarpCpyFn' is a pointer to a function that transfers
1499/// reduced variables across warps. It tunnels, through CUDA
1500/// shared memory, the thread-private data of type 'ReduceData'
1501/// from lane 0 of each warp to a lane in the first warp.
1502/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1503/// The last team writes the global reduced value to memory.
1504///
1505/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1506/// reduceData, shuffleReduceFn, interWarpCpyFn,
1507/// scratchpadCopyFn, loadAndReduceFn)
1508///
1509/// 'scratchpadCopyFn' is a helper that stores reduced
1510/// data from the team master to a scratchpad array in
1511/// global memory.
1512///
1513/// 'loadAndReduceFn' is a helper that loads data from
1514/// the scratchpad array and reduces it with the input
1515/// operand.
1516///
1517/// These compiler generated functions hide address
1518/// calculation and alignment information from the runtime.
1519/// 5. if ret == 1:
1520/// The team master of the last team stores the reduced
1521/// result to the globals in memory.
1522/// foo += reduceData.foo; bar *= reduceData.bar
1523///
1524///
1525/// Warp Reduction Algorithms
1526///
1527/// On the warp level, we have three algorithms implemented in the
1528/// OpenMP runtime depending on the number of active lanes:
1529///
1530/// Full Warp Reduction
1531///
1532/// The reduce algorithm within a warp where all lanes are active
1533/// is implemented in the runtime as follows:
1534///
1535/// full_warp_reduce(void *reduce_data,
1536/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1537/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1538/// ShuffleReduceFn(reduce_data, 0, offset, 0);
1539/// }
1540///
1541/// The algorithm completes in log(2, WARPSIZE) steps.
1542///
1543/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1544/// not used therefore we save instructions by not retrieving lane_id
1545/// from the corresponding special registers. The 4th parameter, which
1546/// represents the version of the algorithm being used, is set to 0 to
1547/// signify full warp reduction.
1548///
1549/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1550///
1551/// #reduce_elem refers to an element in the local lane's data structure
1552/// #remote_elem is retrieved from a remote lane
1553/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1554/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1555///
1556/// Contiguous Partial Warp Reduction
1557///
1558/// This reduce algorithm is used within a warp where only the first
1559/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1560/// number of OpenMP threads in a parallel region is not a multiple of
1561/// WARPSIZE. The algorithm is implemented in the runtime as follows:
1562///
1563/// void
1564/// contiguous_partial_reduce(void *reduce_data,
1565/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1566/// int size, int lane_id) {
1567/// int curr_size;
1568/// int offset;
1569/// curr_size = size;
1570/// mask = curr_size/2;
1571/// while (offset>0) {
1572/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1573/// curr_size = (curr_size+1)/2;
1574/// offset = curr_size/2;
1575/// }
1576/// }
1577///
1578/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1579///
1580/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1581/// if (lane_id < offset)
1582/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1583/// else
1584/// reduce_elem = remote_elem
1585///
1586/// This algorithm assumes that the data to be reduced are located in a
1587/// contiguous subset of lanes starting from the first. When there is
1588/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
1590///
1591/// Dispersed Partial Warp Reduction
1592///
1593/// This algorithm is used within a warp when any discontiguous subset of
1594/// lanes are active. It is used to implement the reduction operation
1595/// across lanes in an OpenMP simd region or in a nested parallel region.
1596///
1597/// void
1598/// dispersed_partial_reduce(void *reduce_data,
1599/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1600/// int size, remote_id;
1601/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1602/// do {
1603/// remote_id = next_active_lane_id_right_after_me();
/// # the above function returns 0 if no active lane
1605/// # is present right after the current lane.
1606/// size = number_of_active_lanes_in_this_warp();
1607/// logical_lane_id /= 2;
1608/// ShuffleReduceFn(reduce_data, logical_lane_id,
1609/// remote_id-1-threadIdx.x, 2);
1610/// } while (logical_lane_id % 2 == 0 && size > 1);
1611/// }
1612///
1613/// There is no assumption made about the initial state of the reduction.
1614/// Any number of lanes (>=1) could be active at any position. The reduction
1615/// result is returned in the first active lane.
1616///
1617/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1618///
1619/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1620/// if (lane_id % 2 == 0 && offset > 0)
1621/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1622/// else
1623/// reduce_elem = remote_elem
1624///
1625///
1626/// Intra-Team Reduction
1627///
1628/// This function, as implemented in the runtime call
1629/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1630/// threads in a team. It first reduces within a warp using the
1631/// aforementioned algorithms. We then proceed to gather all such
1632/// reduced values at the first warp.
1633///
1634/// The runtime makes use of the function 'InterWarpCpyFn', which copies
1635/// data from each of the "warp master" (zeroth lane of each warp, where
1636/// warp-reduced data is held) to the zeroth warp. This step reduces (in
1637/// a mathematical sense) the problem of reduction across warp masters in
1638/// a block to the problem of warp reduction.
1639///
1640///
1641/// Inter-Team Reduction
1642///
1643/// Once a team has reduced its data to a single value, it is stored in
1644/// a global scratchpad array. Since each team has a distinct slot, this
1645/// can be done without locking.
1646///
1647/// The last team to write to the scratchpad array proceeds to reduce the
1648/// scratchpad array. One or more workers in the last team use the helper
1649/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
1650/// the k'th worker reduces every k'th element.
1651///
1652/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1653/// reduce across workers and compute a globally reduced value.
1654///
1655void CGOpenMPRuntimeGPU::emitReduction(
1656 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
1657 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
1658 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
1659 if (!CGF.HaveInsertPoint())
1660 return;
1661
1662 bool ParallelReduction = isOpenMPParallelDirective(DKind: Options.ReductionKind);
1663 bool DistributeReduction = isOpenMPDistributeDirective(DKind: Options.ReductionKind);
1664 bool TeamsReduction = isOpenMPTeamsDirective(DKind: Options.ReductionKind);
1665
1666 ASTContext &C = CGM.getContext();
1667
1668 if (Options.SimpleReduction) {
1669 assert(!TeamsReduction && !ParallelReduction &&
1670 "Invalid reduction selection in emitReduction.");
1671 (void)ParallelReduction;
1672 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
1673 ReductionOps, Options);
1674 return;
1675 }
1676
1677 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
1678 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
1679 int Cnt = 0;
1680 for (const Expr *DRE : Privates) {
1681 PrivatesReductions[Cnt] = cast<DeclRefExpr>(Val: DRE)->getDecl();
1682 ++Cnt;
1683 }
1684 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
1685 C&: CGM.getContext(), EscapedDecls: PrivatesReductions, EscapedDeclsForTeams: std::nullopt, MappedDeclsFields&: VarFieldMap, BufSize: 1);
1686
1687 if (TeamsReduction)
1688 TeamsReductions.push_back(Elt: ReductionRec);
1689
1690 // Source location for the ident struct
1691 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1692
1693 using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
1694 InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
1695 CGF.AllocaInsertPt->getIterator());
1696 InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
1697 CGF.Builder.GetInsertPoint());
1698 llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
1699 CodeGenIP, CGF.SourceLocToDebugLoc(Location: Loc));
1700 llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
1701
1702 CodeGenFunction::OMPPrivateScope Scope(CGF);
1703 unsigned Idx = 0;
1704 for (const Expr *Private : Privates) {
1705 llvm::Type *ElementType;
1706 llvm::Value *Variable;
1707 llvm::Value *PrivateVariable;
1708 llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
1709 ElementType = CGF.ConvertTypeForMem(T: Private->getType());
1710 const auto *RHSVar =
1711 cast<VarDecl>(Val: cast<DeclRefExpr>(Val: RHSExprs[Idx])->getDecl());
1712 PrivateVariable = CGF.GetAddrOfLocalVar(VD: RHSVar).emitRawPointer(CGF);
1713 const auto *LHSVar =
1714 cast<VarDecl>(Val: cast<DeclRefExpr>(Val: LHSExprs[Idx])->getDecl());
1715 Variable = CGF.GetAddrOfLocalVar(VD: LHSVar).emitRawPointer(CGF);
1716 llvm::OpenMPIRBuilder::EvalKind EvalKind;
1717 switch (CGF.getEvaluationKind(T: Private->getType())) {
1718 case TEK_Scalar:
1719 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
1720 break;
1721 case TEK_Complex:
1722 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
1723 break;
1724 case TEK_Aggregate:
1725 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
1726 break;
1727 }
1728 auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
1729 llvm::Value **LHSPtr, llvm::Value **RHSPtr,
1730 llvm::Function *NewFunc) {
1731 CGF.Builder.restoreIP(IP: CodeGenIP);
1732 auto *CurFn = CGF.CurFn;
1733 CGF.CurFn = NewFunc;
1734
1735 *LHSPtr = CGF.GetAddrOfLocalVar(
1736 VD: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: LHSExprs[I])->getDecl()))
1737 .emitRawPointer(CGF);
1738 *RHSPtr = CGF.GetAddrOfLocalVar(
1739 VD: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: RHSExprs[I])->getDecl()))
1740 .emitRawPointer(CGF);
1741
1742 emitSingleReductionCombiner(CGF, ReductionOp: ReductionOps[I], PrivateRef: Privates[I],
1743 LHS: cast<DeclRefExpr>(Val: LHSExprs[I]),
1744 RHS: cast<DeclRefExpr>(Val: RHSExprs[I]));
1745
1746 CGF.CurFn = CurFn;
1747
1748 return InsertPointTy(CGF.Builder.GetInsertBlock(),
1749 CGF.Builder.GetInsertPoint());
1750 };
1751 ReductionInfos.emplace_back(Args: llvm::OpenMPIRBuilder::ReductionInfo(
1752 ElementType, Variable, PrivateVariable, EvalKind,
1753 /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
1754 Idx++;
1755 }
1756
1757 CGF.Builder.restoreIP(IP: OMPBuilder.createReductionsGPU(
1758 Loc: OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, IsNoWait: false, IsTeamsReduction: TeamsReduction,
1759 HasDistribute: DistributeReduction, ReductionGenCBKind: llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
1760 GridValue: CGF.getTarget().getGridValue(), ReductionBufNum: C.getLangOpts().OpenMPCUDAReductionBufNum,
1761 SrcLocInfo: RTLoc));
1762 return;
1763}
1764
/// Translate a reference-typed native parameter into the parameter actually
/// used on the device: a restrict-qualified pointer placed in the NVPTX local
/// address space. For 'map'-captured data the pointee additionally gets the
/// global address space. Non-reference parameters are returned unchanged.
const VarDecl *
CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
                                       const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(type: ArgType);
  QualType PointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
  // Data captured via a 'map' clause lives in global memory on the device.
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(T: PointeeTy,
                                                        AddressSpace: LangAS::opencl_global);
    }
  }
  // Rebuild the parameter type: restrict pointer in the local address space.
  ArgType = CGM.getContext().getPointerType(T: PointeeTy);
  QC.addRestrict();
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(space: getLangASFromTargetAS(TargetAS: NVPTX_local_addr));
  ArgType = QC.apply(Context: CGM.getContext(), QT: ArgType);
  // Preserve the declaration kind of the original parameter.
  if (isa<ImplicitParamDecl>(Val: NativeParam))
    return ImplicitParamDecl::Create(
        C&: CGM.getContext(), /*DC=*/nullptr, IdLoc: NativeParam->getLocation(),
        Id: NativeParam->getIdentifier(), T: ArgType, ParamKind: ImplicitParamKind::Other);
  return ParmVarDecl::Create(
      C&: CGM.getContext(),
      DC: const_cast<DeclContext *>(NativeParam->getDeclContext()),
      StartLoc: NativeParam->getBeginLoc(), IdLoc: NativeParam->getLocation(),
      Id: NativeParam->getIdentifier(), T: ArgType,
      /*TInfo=*/nullptr, S: SC_None, /*DefArg=*/nullptr);
}
1796
/// Materialize an address for the native (reference-typed) parameter from the
/// translated target parameter: load the pointer argument, cast it into the
/// native pointee's address space, and store it into a fresh local temporary
/// whose address is returned.
Address
CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
                                        const VarDecl *NativeParam,
                                        const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(VD: TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(type: NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getTypes().getTargetAddressSpace(T: NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  // Load the incoming pointer value of the translated parameter.
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(Addr: LocalAddr, /*Volatile=*/false,
                                                 Ty: TargetTy, Loc: SourceLocation());
  // Cast to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      V: TargetAddr,
      DestTy: llvm::PointerType::get(C&: CGF.getLLVMContext(), AddressSpace: NativePointeeAddrSpace));
  // Spill the casted pointer into a temporary that stands in for the native
  // reference parameter.
  Address NativeParamAddr = CGF.CreateMemTemp(T: NativeParamType);
  CGF.EmitStoreOfScalar(Value: TargetAddr, Addr: NativeParamAddr, /*Volatile=*/false,
                        Ty: NativeParamType);
  return NativeParamAddr;
}
1823
1824void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
1825 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
1826 ArrayRef<llvm::Value *> Args) const {
1827 SmallVector<llvm::Value *, 4> TargetArgs;
1828 TargetArgs.reserve(N: Args.size());
1829 auto *FnType = OutlinedFn.getFunctionType();
1830 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
1831 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
1832 TargetArgs.append(in_start: std::next(x: Args.begin(), n: I), in_end: Args.end());
1833 break;
1834 }
1835 llvm::Type *TargetType = FnType->getParamType(i: I);
1836 llvm::Value *NativeArg = Args[I];
1837 if (!TargetType->isPointerTy()) {
1838 TargetArgs.emplace_back(Args&: NativeArg);
1839 continue;
1840 }
1841 TargetArgs.emplace_back(
1842 Args: CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(V: NativeArg, DestTy: TargetType));
1843 }
1844 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: TargetArgs);
1845}
1846
/// Emit function which wraps the outline parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(RegionKind: OMPD_parallel);

  // Create a function that takes as argument the source thread.
  // Signature: void <name>_wrapper(uint16 ParallelLevel, uint32 ThreadID).
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamKind::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamKind::Other);
  WrapperArgs.emplace_back(Args: &ParallelLevelArg);
  WrapperArgs.emplace_back(Args: &WrapperArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(resultType: Ctx.VoidTy, args: WrapperArgs);

  auto *Fn = llvm::Function::Create(
      Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
      N: Twine(OutlinedParallelFn->getName(), "_wrapper"), M: &CGM.getModule());

  // Ensure we do not inline the function. This is trivially true for the ones
  // passed to __kmpc_fork_call but the ones called in serialized regions
  // could be inlined. This is not perfect but it is closer to the invariant
  // we want, namely, every data environment starts with a new function.
  // TODO: We should pass the if condition to the runtime function and do the
  // handling there. Much cleaner code.
  Fn->addFnAttr(Kind: llvm::Attribute::NoInline);

  CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GD: GlobalDecl(), RetTy: Ctx.VoidTy, Fn, FnInfo: CGFI, Args: WrapperArgs,
                    Loc: D.getBeginLoc(), StartLoc: D.getBeginLoc());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  // The outlined function's bound-thread-id argument is always zero here.
  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  Args.emplace_back(Args: CGF.GetAddrOfLocalVar(VD: &WrapperArg).emitRawPointer(CGF));
  Args.emplace_back(Args: ZeroAddr.emitRawPointer(CGF));

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  RawAddress GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(Ty: CGF.VoidPtrPtrTy, Name: "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
                          M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_shared_variables),
                      args: DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  Address SharedArgListAddress = Address::invalid();
  if (CS.capture_size() > 0 ||
      isOpenMPLoopBoundSharingDirective(Kind: D.getDirectiveKind())) {
    SharedArgListAddress = CGF.EmitLoadOfPointer(
        Ptr: GlobalArgs, PtrTy: CGF.getContext()
                        .getPointerType(T: CGF.getContext().VoidPtrTy)
                        .castAs<PointerType>());
  }
  unsigned Idx = 0;
  // Loop-bound-sharing directives pass the lower and upper bound pointers as
  // the first two shared arguments.
  if (isOpenMPLoopBoundSharingDirective(Kind: D.getDirectiveKind())) {
    Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Addr: Src, Ty: CGF.SizeTy->getPointerTo(), ElementTy: CGF.SizeTy);
    llvm::Value *LB = CGF.EmitLoadOfScalar(
        Addr: TypedAddress,
        /*Volatile=*/false,
        Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
        Loc: cast<OMPLoopDirective>(Val: D).getLowerBoundVariable()->getExprLoc());
    Args.emplace_back(Args&: LB);
    ++Idx;
    Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Addr: Src, Ty: CGF.SizeTy->getPointerTo(), ElementTy: CGF.SizeTy);
    llvm::Value *UB = CGF.EmitLoadOfScalar(
        Addr: TypedAddress,
        /*Volatile=*/false,
        Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
        Loc: cast<OMPLoopDirective>(Val: D).getUpperBoundVariable()->getExprLoc());
    Args.emplace_back(Args&: UB);
    ++Idx;
  }
  // The remaining shared arguments correspond to the statement's captures.
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: I + Idx);
      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
          Addr: Src, Ty: CGF.ConvertTypeForMem(T: CGFContext.getPointerType(T: ElemTy)),
          ElementTy: CGF.ConvertTypeForMem(T: ElemTy));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(Addr: TypedAddress,
                                              /*Volatile=*/false,
                                              Ty: CGFContext.getPointerType(T: ElemTy),
                                              Loc: CI->getLocation());
      // By-copy captures of non-pointer values are passed as uintptr values.
      if (CI->capturesVariableByCopy() &&
          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
        Arg = castValueToType(CGF, Val: Arg, ValTy: ElemTy, CastTy: CGFContext.getUIntPtrType(),
                              Loc: CI->getLocation());
      }
      Args.emplace_back(Args&: Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, Loc: D.getBeginLoc(), OutlinedFn: OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}
1977
/// In generic data-sharing mode, detect the variables that escape this
/// function's data environment and register them for globalization. Unless
/// globalization must be delayed, emit the globalization prolog immediately
/// and push a cleanup that emits the matching epilog on function exit.
void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
                                            const Decl *D) {
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
         "Team is set but not processed.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(Val: D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(Val: D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(Val: D)) {
    Body = CD->getBody();
    // OpenMP captured regions delay globalization; in SPMD mode it is not
    // needed at all, so bail out early.
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  // Scan the body for declarations escaping the current context.
  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
  VarChecker.Visit(S: Body);
  const RecordDecl *GlobalizedVarsRecord =
      VarChecker.getGlobalizedRecord(IsInTTDRegion);
  TeamAndReductions.first = nullptr;
  TeamAndReductions.second.clear();
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
      VarChecker.getDelayedVariableLengthDecls();
  // Nothing escaped: no globalization bookkeeping required.
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
      DelayedVariableLengthDecls.empty())
    return;
  // Record the escaped parameters and declarations for this function.
  auto I = FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
  I->getSecond().MappedParams =
      std::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().EscapedParameters.insert(
      I: VarChecker.getEscapedParameters().begin(),
      E: VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      in_start: EscapedVariableLengthDecls.begin(), in_end: EscapedVariableLengthDecls.end());
  I->getSecond().DelayedVariableLengthDecls.append(
      in_start: DelayedVariableLengthDecls.begin(), in_end: DelayedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    Data.insert(KV: std::make_pair(x&: VD, y: MappedVarData()));
  }
  if (!NeedToDelayGlobalization) {
    // Emit the prolog now and arrange for the epilog to run on both normal
    // and exceptional exits.
    emitGenericVarsProlog(CGF, Loc: D->getBeginLoc());
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(Kind: NormalAndEHCleanup);
  }
}
2044
/// Return the device address to use for local variable \p VD: a dedicated
/// module-level global in the address space selected by an 'omp allocate'
/// attribute, or the globalized private address recorded for this function.
/// Returns an invalid Address when normal (stack) allocation should be used.
Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                      const VarDecl *VD) {
  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
    auto AS = LangAS::Default;
    switch (A->getAllocatorType()) {
      // Use the default allocator here as by default local vars are
      // threadlocal.
    case OMPAllocateDeclAttr::OMPNullMemAlloc:
    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
      // Follow the user decision - use default allocation.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      // TODO: implement support for user-defined allocators.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPConstMemAlloc:
      AS = LangAS::cuda_constant;
      break;
    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
      AS = LangAS::cuda_shared;
      break;
    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
      break;
    }
    // Materialize the variable as an internal global in the chosen address
    // space and hand back a pointer cast into the variable's own AS.
    llvm::Type *VarTy = CGF.ConvertTypeForMem(T: VD->getType());
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), VarTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(T: VarTy),
        VD->getName(),
        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(AS));
    CharUnits Align = CGM.getContext().getDeclAlign(D: VD);
    GV->setAlignment(Align.getAsAlign());
    return Address(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            V: GV, DestTy: VarTy->getPointerTo(AddrSpace: CGM.getContext().getTargetAddressSpace(
                    AS: VD->getType().getAddressSpace()))),
        VarTy, Align);
  }

  // Globalized addresses are only tracked in generic data-sharing mode.
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return Address::invalid();

  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(Key: VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  // Fall back to the globalized address of a variable this one references
  // (via OMPReferencedVarAttr), if any.
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
         E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          Key: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }

  return Address::invalid();
}
2113
/// Drop the per-function globalization bookkeeping for the finished function,
/// then run the base-class cleanup.
void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(Val: CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}
2118
2119void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
2120 CodeGenFunction &CGF, const OMPLoopDirective &S,
2121 OpenMPDistScheduleClauseKind &ScheduleKind,
2122 llvm::Value *&Chunk) const {
2123 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2124 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
2125 ScheduleKind = OMPC_DIST_SCHEDULE_static;
2126 Chunk = CGF.EmitScalarConversion(
2127 Src: RT.getGPUNumThreads(CGF),
2128 SrcTy: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
2129 DstTy: S.getIterationVariable()->getType(), Loc: S.getBeginLoc());
2130 return;
2131 }
2132 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
2133 CGF, S, ScheduleKind, Chunk);
2134}
2135
2136void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
2137 CodeGenFunction &CGF, const OMPLoopDirective &S,
2138 OpenMPScheduleClauseKind &ScheduleKind,
2139 const Expr *&ChunkExpr) const {
2140 ScheduleKind = OMPC_SCHEDULE_static;
2141 // Chunk size is 1 in this case.
2142 llvm::APInt ChunkSize(32, 1);
2143 ChunkExpr = IntegerLiteral::Create(C: CGF.getContext(), V: ChunkSize,
2144 type: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
2145 l: SourceLocation());
2146}
2147
/// For each lambda object captured by a target-based directive, rewrite its
/// by-reference capture fields (and a captured 'this', if any) to hold the
/// device-side addresses of the corresponding captured entities.
void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         " Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(RegionKind: OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    // Only lambda objects need their capture fields adjusted.
    if (!RD || !RD->isLambda())
      continue;
    // Form an lvalue for the lambda object, dereferencing a reference
    // capture if necessary.
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(RefAddr: VDAddr, RefTy: VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          Addr: VDAddr, T: VD->getType().getCanonicalType().getNonReferenceType());
    llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    // Store the current 'this' pointer into the lambda's 'this' capture field.
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(value: CXXThis, lvalue: ThisLVal);
    }
    // Store the local address of each by-reference captured variable into the
    // matching lambda capture field.
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const ValueDecl *VD = LC.getCapturedVar();
      // FIXME: For now VD is always a VarDecl because OpenMP does not support
      // capturing structured bindings in lambdas yet.
      if (!CS->capturesVariable(Var: cast<VarDecl>(Val: VD)))
        continue;
      auto It = Captures.find(Val: VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(VD: cast<VarDecl>(Val: VD));
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(RefAddr: VDAddr,
                                               RefTy: VD->getType().getCanonicalType())
                     .getAddress();
      CGF.EmitStoreOfScalar(value: VDAddr.emitRawPointer(CGF), lvalue: VarLVal);
    }
  }
}
2201
2202bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
2203 LangAS &AS) {
2204 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
2205 return false;
2206 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2207 switch(A->getAllocatorType()) {
2208 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2209 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2210 // Not supported, fallback to the default mem space.
2211 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2212 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2213 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2214 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2215 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2216 AS = LangAS::Default;
2217 return true;
2218 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2219 AS = LangAS::cuda_constant;
2220 return true;
2221 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2222 AS = LangAS::cuda_shared;
2223 return true;
2224 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2225 llvm_unreachable("Expected predefined allocator for the variables with the "
2226 "static storage.");
2227 }
2228 return false;
2229}
2230
2231// Get current OffloadArch and ignore any unknown values
2232static OffloadArch getOffloadArch(CodeGenModule &CGM) {
2233 if (!CGM.getTarget().hasFeature(Feature: "ptx"))
2234 return OffloadArch::UNKNOWN;
2235 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
2236 if (Feature.getValue()) {
2237 OffloadArch Arch = StringToOffloadArch(S: Feature.getKey());
2238 if (Arch != OffloadArch::UNKNOWN)
2239 return Arch;
2240 }
2241 }
2242 return OffloadArch::UNKNOWN;
2243}
2244
/// Check to see if target architecture supports unified addressing which is
/// a restriction for OpenMP requires clause "unified_shared_memory".
void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
  for (const OMPClause *Clause : D->clauselists()) {
    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
      OffloadArch Arch = getOffloadArch(CGM);
      // Diagnose architectures known to lack unified addressing; all newer
      // NVPTX and all listed AMDGCN architectures are accepted.
      switch (Arch) {
      case OffloadArch::SM_20:
      case OffloadArch::SM_21:
      case OffloadArch::SM_30:
      case OffloadArch::SM_32_:
      case OffloadArch::SM_35:
      case OffloadArch::SM_37:
      case OffloadArch::SM_50:
      case OffloadArch::SM_52:
      case OffloadArch::SM_53: {
        SmallString<256> Buffer;
        llvm::raw_svector_ostream Out(Buffer);
        Out << "Target architecture " << OffloadArchToString(A: Arch)
            << " does not support unified addressing";
        CGM.Error(loc: Clause->getBeginLoc(), error: Out.str());
        return;
      }
      case OffloadArch::SM_60:
      case OffloadArch::SM_61:
      case OffloadArch::SM_62:
      case OffloadArch::SM_70:
      case OffloadArch::SM_72:
      case OffloadArch::SM_75:
      case OffloadArch::SM_80:
      case OffloadArch::SM_86:
      case OffloadArch::SM_87:
      case OffloadArch::SM_89:
      case OffloadArch::SM_90:
      case OffloadArch::SM_90a:
      case OffloadArch::GFX600:
      case OffloadArch::GFX601:
      case OffloadArch::GFX602:
      case OffloadArch::GFX700:
      case OffloadArch::GFX701:
      case OffloadArch::GFX702:
      case OffloadArch::GFX703:
      case OffloadArch::GFX704:
      case OffloadArch::GFX705:
      case OffloadArch::GFX801:
      case OffloadArch::GFX802:
      case OffloadArch::GFX803:
      case OffloadArch::GFX805:
      case OffloadArch::GFX810:
      case OffloadArch::GFX9_GENERIC:
      case OffloadArch::GFX900:
      case OffloadArch::GFX902:
      case OffloadArch::GFX904:
      case OffloadArch::GFX906:
      case OffloadArch::GFX908:
      case OffloadArch::GFX909:
      case OffloadArch::GFX90a:
      case OffloadArch::GFX90c:
      case OffloadArch::GFX940:
      case OffloadArch::GFX941:
      case OffloadArch::GFX942:
      case OffloadArch::GFX10_1_GENERIC:
      case OffloadArch::GFX1010:
      case OffloadArch::GFX1011:
      case OffloadArch::GFX1012:
      case OffloadArch::GFX1013:
      case OffloadArch::GFX10_3_GENERIC:
      case OffloadArch::GFX1030:
      case OffloadArch::GFX1031:
      case OffloadArch::GFX1032:
      case OffloadArch::GFX1033:
      case OffloadArch::GFX1034:
      case OffloadArch::GFX1035:
      case OffloadArch::GFX1036:
      case OffloadArch::GFX11_GENERIC:
      case OffloadArch::GFX1100:
      case OffloadArch::GFX1101:
      case OffloadArch::GFX1102:
      case OffloadArch::GFX1103:
      case OffloadArch::GFX1150:
      case OffloadArch::GFX1151:
      case OffloadArch::GFX1152:
      case OffloadArch::GFX12_GENERIC:
      case OffloadArch::GFX1200:
      case OffloadArch::GFX1201:
      case OffloadArch::AMDGCNSPIRV:
      case OffloadArch::Generic:
      case OffloadArch::UNUSED:
      case OffloadArch::UNKNOWN:
        break;
      case OffloadArch::LAST:
        llvm_unreachable("Unexpected GPU arch.");
      }
    }
  }
  CGOpenMPRuntime::processRequiresDirective(D);
}
2342
2343llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
2344 CGBuilderTy &Bld = CGF.Builder;
2345 llvm::Module *M = &CGF.CGM.getModule();
2346 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
2347 llvm::Function *F = M->getFunction(Name: LocSize);
2348 if (!F) {
2349 F = llvm::Function::Create(
2350 Ty: llvm::FunctionType::get(Result: CGF.Int32Ty, Params: std::nullopt, isVarArg: false),
2351 Linkage: llvm::GlobalVariable::ExternalLinkage, N: LocSize, M: &CGF.CGM.getModule());
2352 }
2353 return Bld.CreateCall(Callee: F, Args: std::nullopt, Name: "nvptx_num_threads");
2354}
2355
2356llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
2357 ArrayRef<llvm::Value *> Args{};
2358 return CGF.EmitRuntimeCall(
2359 callee: OMPBuilder.getOrCreateRuntimeFunction(
2360 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_hardware_thread_id_in_block),
2361 args: Args);
2362}
2363