CGOpenMPRuntimeGPU.cpp source code [llvm_projects/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp]

1	//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This provides a generalized class for OpenMP runtime code generation
10	// specialized by GPU targets NVPTX and AMDGCN.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "CGOpenMPRuntimeGPU.h"
15	#include "CGDebugInfo.h"
16	#include "CodeGenFunction.h"
17	#include "clang/AST/Attr.h"
18	#include "clang/AST/DeclOpenMP.h"
19	#include "clang/AST/OpenMPClause.h"
20	#include "clang/AST/StmtOpenMP.h"
21	#include "clang/AST/StmtVisitor.h"
22	#include "clang/Basic/Cuda.h"
23	#include "llvm/ADT/SmallPtrSet.h"
24	#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
25	#include "llvm/Frontend/OpenMP/OMPGridValues.h"
26
27	using namespace clang;
28	using namespace CodeGen;
29	using namespace llvm::omp;
30
31	namespace {
32	/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
33	class NVPTXActionTy final : public PrePostActionTy {
34	llvm::FunctionCallee EnterCallee = nullptr;
35	ArrayRef<llvm::Value *> EnterArgs;
36	llvm::FunctionCallee ExitCallee = nullptr;
37	ArrayRef<llvm::Value *> ExitArgs;
38	bool Conditional = false;
39	llvm::BasicBlock ContBlock = nullptr*;
40
41	public:
42	NVPTXActionTy(llvm::FunctionCallee EnterCallee,
43	ArrayRef<llvm::Value *> EnterArgs,
44	llvm::FunctionCallee ExitCallee,
45	ArrayRef<llvm::Value > ExitArgs, bool* Conditional = false)
46	: EnterCallee (EnterCallee), EnterArgs (EnterArgs), ExitCallee (ExitCallee),
47	ExitArgs (ExitArgs), Conditional(Conditional) {}
48	void Enter(CodeGenFunction &CGF) override {
49	llvm::Value *EnterRes = CGF.EmitRuntimeCall(callee: EnterCallee, args: EnterArgs);
50	if (Conditional) {
51	llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(Arg: EnterRes);
52	auto *ThenBlock = CGF.createBasicBlock(name: "omp_if.then");
53	ContBlock = CGF.createBasicBlock(name: "omp_if.end");
54	// Generate the branch (If-stmt)
55	CGF.Builder.CreateCondBr(Cond: CallBool, True: ThenBlock, False: ContBlock);
56	CGF.EmitBlock(BB: ThenBlock);
57	}
58	}
59	void Done(CodeGenFunction &CGF) {
60	// Emit the rest of blocks/branches
61	CGF.EmitBranch(Block: ContBlock);
62	CGF.EmitBlock(BB: ContBlock, IsFinished: true);
63	}
64	void Exit(CodeGenFunction &CGF) override {
65	CGF.EmitRuntimeCall(callee: ExitCallee, args: ExitArgs);
66	}
67	};
68
69	/// A class to track the execution mode when codegening directives within
70	/// a target region. The appropriate mode (SPMD\|NON-SPMD) is set on entry
71	/// to the target region and used by containing directives such as 'parallel'
72	/// to emit optimized code.
73	class ExecutionRuntimeModesRAII {
74	private:
75	CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
76	CGOpenMPRuntimeGPU::EM_Unknown;
77	CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
78
79	public:
80	ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
81	CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
82	: ExecMode(ExecMode) {
83	SavedExecMode = ExecMode;
84	ExecMode = EntryMode;
85	}
86	~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
87	};
88
89	static const ValueDecl getPrivateItem(const* Expr *RefExpr) {
90	RefExpr = RefExpr->IgnoreParens();
91	if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: RefExpr)) {
92	const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
93	while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base))
94	Base = TempASE->getBase()->IgnoreParenImpCasts();
95	RefExpr = Base;
96	} else if (auto *OASE = dyn_cast<ArraySectionExpr>(Val: RefExpr)) {
97	const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
98	while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Val: Base))
99	Base = TempOASE->getBase()->IgnoreParenImpCasts();
100	while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base))
101	Base = TempASE->getBase()->IgnoreParenImpCasts();
102	RefExpr = Base;
103	}
104	RefExpr = RefExpr->IgnoreParenImpCasts();
105	if (const auto *DE = dyn_cast<DeclRefExpr>(Val: RefExpr))
106	return cast<ValueDecl>(Val: DE->getDecl()->getCanonicalDecl());
107	const auto *ME = cast<MemberExpr>(Val: RefExpr);
108	return cast<ValueDecl>(Val: ME->getMemberDecl()->getCanonicalDecl());
109	}
110
111	static RecordDecl *buildRecordForGlobalizedVars(
112	ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
113	ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
114	llvm::SmallDenseMap<const ValueDecl , const* FieldDecl *>
115	&MappedDeclsFields,
116	int BufSize) {
117	using VarsDataTy = std::pair<CharUnits /Align/, const ValueDecl *>;
118	if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
119	return nullptr;
120	SmallVector<VarsDataTy, `4`> GlobalizedVars;
121	for (const ValueDecl *D : EscapedDecls)
122	GlobalizedVars.emplace_back(Args: C.getDeclAlign(D), Args&: D);
123	for (const ValueDecl *D : EscapedDeclsForTeams)
124	GlobalizedVars.emplace_back(Args: C.getDeclAlign(D), Args&: D);
125
126	// Build struct _globalized_locals_ty {
127	// / globalized vars /[WarSize] align (decl_align)
128	// / globalized vars / for EscapedDeclsForTeams
129	// };
130	RecordDecl *GlobalizedRD = C.buildImplicitRecord(Name: "_globalized_locals_ty");
131	GlobalizedRD->startDefinition();
132	llvm::SmallPtrSet<const ValueDecl *, `16`> SingleEscaped(llvm::from_range,
133	EscapedDeclsForTeams);
134	for (const auto &Pair : GlobalizedVars) {
135	const ValueDecl *VD = Pair.second;
136	QualType Type = VD->getType();
137	if (Type ->isLValueReferenceType())
138	Type = C.getPointerType(T: Type.getNonReferenceType());
139	else
140	Type = Type.getNonReferenceType();
141	SourceLocation Loc = VD->getLocation();
142	FieldDecl *Field;
143	if (SingleEscaped.count(Ptr: VD)) {
144	Field = FieldDecl::Create(
145	C, DC: GlobalizedRD, StartLoc: Loc, IdLoc: Loc, Id: VD->getIdentifier(), T: Type,
146	TInfo: C.getTrivialTypeSourceInfo(T: Type, Loc: SourceLocation ()),
147	/BW=/nullptr, /Mutable=/false,
148	/InitStyle=/ICIS_NoInit);
149	Field->setAccess(AS_public);
150	if (VD->hasAttrs()) {
151	for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
152	E(VD->getAttrs().end());
153	I != E; ++I)
154	Field->addAttr(A: *I);
155	}
156	} else {
157	if (BufSize > `1`) {
158	llvm::APInt ArraySize(`32`, BufSize);
159	Type = C.getConstantArrayType(EltTy: Type, ArySize: ArraySize, SizeExpr: nullptr,
160	ASM: ArraySizeModifier::Normal, IndexTypeQuals: `0`);
161	}
162	Field = FieldDecl::Create(
163	C, DC: GlobalizedRD, StartLoc: Loc, IdLoc: Loc, Id: VD->getIdentifier(), T: Type,
164	TInfo: C.getTrivialTypeSourceInfo(T: Type, Loc: SourceLocation ()),
165	/BW=/nullptr, /Mutable=/false,
166	/InitStyle=/ICIS_NoInit);
167	Field->setAccess(AS_public);
168	llvm::APInt Align(`32`, Pair.first.getQuantity());
169	Field->addAttr(A: AlignedAttr::CreateImplicit(
170	Ctx&: C, /IsAlignmentExpr=/true,
171	Alignment: IntegerLiteral::Create(C, V: Align,
172	type: C.getIntTypeForBitwidth(DestWidth: `32`, /Signed=/`0`),
173	l: SourceLocation ()),
174	Range: {}, S: AlignedAttr::GNU_aligned));
175	}
176	GlobalizedRD->addDecl(D: Field);
177	MappedDeclsFields.try_emplace(Key: VD, Args&: Field);
178	}
179	GlobalizedRD->completeDefinition();
180	return GlobalizedRD;
181	}
182
183	/// Get the list of variables that can escape their declaration context.
184	class CheckVarsEscapingDeclContext final
185	: public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
186	CodeGenFunction &CGF;
187	llvm::SetVector<const ValueDecl *> EscapedDecls;
188	llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
189	llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
190	llvm::SmallPtrSet<const Decl *, `4`> EscapedParameters;
191	RecordDecl GlobalizedRD = nullptr*;
192	llvm::SmallDenseMap<const ValueDecl , const* FieldDecl *> MappedDeclsFields;
193	bool AllEscaped = false;
194	bool IsForCombinedParallelRegion = false;
195
196	void markAsEscaped(const ValueDecl *VD) {
197	// Do not globalize declare target variables.
198	if (!isa<VarDecl>(Val: VD) \|\|
199	OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
200	return;
201	VD = cast<ValueDecl>(Val: VD->getCanonicalDecl());
202	// Use user-specified allocation.
203	if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
204	return;
205	// Variables captured by value must be globalized.
206	bool IsCaptured = false;
207	if (auto *CSI = CGF.CapturedStmtInfo) {
208	if (const FieldDecl *FD = CSI->lookup(VD: cast<VarDecl>(Val: VD))) {
209	// Check if need to capture the variable that was already captured by
210	// value in the outer region.
211	IsCaptured = true;
212	if (!IsForCombinedParallelRegion) {
213	if (!FD->hasAttrs())
214	return;
215	const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
216	if (!Attr)
217	return;
218	if (((Attr->getCaptureKind() != OMPC_map) &&
219	!isOpenMPPrivate(Kind: Attr->getCaptureKind())) \|\|
220	((Attr->getCaptureKind() == OMPC_map) &&
221	!FD->getType()->isAnyPointerType()))
222	return;
223	}
224	if (!FD->getType()->isReferenceType()) {
225	assert(!VD->getType()->isVariablyModifiedType() &&
226	"Parameter captured by value with variably modified type");
227	EscapedParameters.insert(Ptr: VD);
228	} else if (!IsForCombinedParallelRegion) {
229	return;
230	}
231	}
232	}
233	if ((!CGF.CapturedStmtInfo \|\|
234	(IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
235	VD->getType()->isReferenceType())
236	// Do not globalize variables with reference type.
237	return;
238	if (VD->getType()->isVariablyModifiedType()) {
239	// If not captured at the target region level then mark the escaped
240	// variable as delayed.
241	if (IsCaptured)
242	EscapedVariableLengthDecls.insert(X: VD);
243	else
244	DelayedVariableLengthDecls.insert(X: VD);
245	} else
246	EscapedDecls.insert(X: VD);
247	}
248
249	void VisitValueDecl(const ValueDecl *VD) {
250	if (VD->getType()->isLValueReferenceType())
251	markAsEscaped(VD);
252	if (const auto *VarD = dyn_cast<VarDecl>(Val: VD)) {
253	if (!isa<ParmVarDecl>(Val: VarD) && VarD->hasInit()) {
254	const bool SavedAllEscaped = AllEscaped;
255	AllEscaped = VD->getType()->isLValueReferenceType();
256	Visit(S: VarD->getInit());
257	AllEscaped = SavedAllEscaped;
258	}
259	}
260	}
261	void VisitOpenMPCapturedStmt(const CapturedStmt *S,
262	ArrayRef<OMPClause *> Clauses,
263	bool IsCombinedParallelRegion) {
264	if (!S)
265	return;
266	for (const CapturedStmt::Capture &C : S->captures()) {
267	if (C.capturesVariable() && !C.capturesVariableByCopy()) {
268	const ValueDecl *VD = C.getCapturedVar();
269	bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
270	if (IsCombinedParallelRegion) {
271	// Check if the variable is privatized in the combined construct and
272	// those private copies must be shared in the inner parallel
273	// directive.
274	IsForCombinedParallelRegion = false;
275	for (const OMPClause *C : Clauses) {
276	if (!isOpenMPPrivate(Kind: C->getClauseKind()) \|\|
277	C->getClauseKind() == OMPC_reduction \|\|
278	C->getClauseKind() == OMPC_linear \|\|
279	C->getClauseKind() == OMPC_private)
280	continue;
281	ArrayRef<const Expr *> Vars;
282	if (const auto *PC = dyn_cast<OMPFirstprivateClause>(Val: C))
283	Vars = PC->getVarRefs();
284	else if (const auto *PC = dyn_cast<OMPLastprivateClause>(Val: C))
285	Vars = PC->getVarRefs();
286	else
287	llvm_unreachable("Unexpected clause.");
288	for (const auto *E : Vars) {
289	const Decl *D =
290	cast<DeclRefExpr>(Val: E)->getDecl()->getCanonicalDecl();
291	if (D == VD->getCanonicalDecl()) {
292	IsForCombinedParallelRegion = true;
293	break;
294	}
295	}
296	if (IsForCombinedParallelRegion)
297	break;
298	}
299	}
300	markAsEscaped(VD);
301	if (isa<OMPCapturedExprDecl>(Val: VD))
302	VisitValueDecl(VD);
303	IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
304	}
305	}
306	}
307
308	void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
309	assert(!GlobalizedRD &&
310	"Record for globalized variables is built already.");
311	ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
312	unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
313	if (IsInTTDRegion)
314	EscapedDeclsForTeams = EscapedDecls.getArrayRef();
315	else
316	EscapedDeclsForParallel = EscapedDecls.getArrayRef();
317	GlobalizedRD = ::buildRecordForGlobalizedVars(
318	C&: CGF.getContext(), EscapedDecls: EscapedDeclsForParallel, EscapedDeclsForTeams,
319	MappedDeclsFields, BufSize: WarpSize);
320	}
321
322	public:
323	CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
324	ArrayRef<const ValueDecl *> TeamsReductions)
325	: CGF(CGF), EscapedDecls (llvm::from_range, TeamsReductions) {}
326	~CheckVarsEscapingDeclContext() = default;
327	void VisitDeclStmt(const DeclStmt *S) {
328	if (!S)
329	return;
330	for (const Decl *D : S->decls())
331	if (const auto *VD = dyn_cast_or_null<ValueDecl>(Val: D))
332	VisitValueDecl(VD);
333	}
334	void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335	if (!D)
336	return;
337	if (!D->hasAssociatedStmt())
338	return;
339	if (const auto *S =
340	dyn_cast_or_null<CapturedStmt>(Val: D->getAssociatedStmt())) {
341	// Do not analyze directives that do not actually require capturing,
342	// like `omp for` or `omp simd` directives.
343	llvm::SmallVector<OpenMPDirectiveKind, `4`> CaptureRegions;
344	getOpenMPCaptureRegions(CaptureRegions, DKind: D->getDirectiveKind());
345	if (CaptureRegions.size() == `1` && CaptureRegions.back() == OMPD_unknown) {
346	VisitStmt(S: S->getCapturedStmt());
347	return;
348	}
349	VisitOpenMPCapturedStmt(
350	S, Clauses: D->clauses(),
351	IsCombinedParallelRegion: CaptureRegions.back() == OMPD_parallel &&
352	isOpenMPDistributeDirective(DKind: D->getDirectiveKind()));
353	}
354	}
355	void VisitCapturedStmt(const CapturedStmt *S) {
356	if (!S)
357	return;
358	for (const CapturedStmt::Capture &C : S->captures()) {
359	if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360	const ValueDecl *VD = C.getCapturedVar();
361	markAsEscaped(VD);
362	if (isa<OMPCapturedExprDecl>(Val: VD))
363	VisitValueDecl(VD);
364	}
365	}
366	}
367	void VisitLambdaExpr(const LambdaExpr *E) {
368	if (!E)
369	return;
370	for (const LambdaCapture &C : E->captures()) {
371	if (C.capturesVariable()) {
372	if (C.getCaptureKind() == LCK_ByRef) {
373	const ValueDecl *VD = C.getCapturedVar();
374	markAsEscaped(VD);
375	if (E->isInitCapture(Capture: &C) \|\| isa<OMPCapturedExprDecl>(Val: VD))
376	VisitValueDecl(VD);
377	}
378	}
379	}
380	}
381	void VisitBlockExpr(const BlockExpr *E) {
382	if (!E)
383	return;
384	for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385	if (C.isByRef()) {
386	const VarDecl *VD = C.getVariable();
387	markAsEscaped(VD);
388	if (isa<OMPCapturedExprDecl>(Val: VD) \|\| VD->isInitCapture())
389	VisitValueDecl(VD);
390	}
391	}
392	}
393	void VisitCallExpr(const CallExpr *E) {
394	if (!E)
395	return;
396	for (const Expr *Arg : E->arguments()) {
397	if (!Arg)
398	continue;
399	if (Arg->isLValue()) {
400	const bool SavedAllEscaped = AllEscaped;
401	AllEscaped = true;
402	Visit(S: Arg);
403	AllEscaped = SavedAllEscaped;
404	} else {
405	Visit(S: Arg);
406	}
407	}
408	Visit(S: E->getCallee());
409	}
410	void VisitDeclRefExpr(const DeclRefExpr *E) {
411	if (!E)
412	return;
413	const ValueDecl *VD = E->getDecl();
414	if (AllEscaped)
415	markAsEscaped(VD);
416	if (isa<OMPCapturedExprDecl>(Val: VD))
417	VisitValueDecl(VD);
418	else if (VD->isInitCapture())
419	VisitValueDecl(VD);
420	}
421	void VisitUnaryOperator(const UnaryOperator *E) {
422	if (!E)
423	return;
424	if (E->getOpcode() == UO_AddrOf) {
425	const bool SavedAllEscaped = AllEscaped;
426	AllEscaped = true;
427	Visit(S: E->getSubExpr());
428	AllEscaped = SavedAllEscaped;
429	} else {
430	Visit(S: E->getSubExpr());
431	}
432	}
433	void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434	if (!E)
435	return;
436	if (E->getCastKind() == CK_ArrayToPointerDecay) {
437	const bool SavedAllEscaped = AllEscaped;
438	AllEscaped = true;
439	Visit(S: E->getSubExpr());
440	AllEscaped = SavedAllEscaped;
441	} else {
442	Visit(S: E->getSubExpr());
443	}
444	}
445	void VisitExpr(const Expr *E) {
446	if (!E)
447	return;
448	bool SavedAllEscaped = AllEscaped;
449	if (!E->isLValue())
450	AllEscaped = false;
451	for (const Stmt *Child : E->children())
452	if (Child)
453	Visit(S: Child);
454	AllEscaped = SavedAllEscaped;
455	}
456	void VisitStmt(const Stmt *S) {
457	if (!S)
458	return;
459	for (const Stmt *Child : S->children())
460	if (Child)
461	Visit(S: Child);
462	}
463
464	/// Returns the record that handles all the escaped local variables and used
465	/// instead of their original storage.
466	const RecordDecl getGlobalizedRecord(bool* IsInTTDRegion) {
467	if (!GlobalizedRD)
468	buildRecordForGlobalizedVars(IsInTTDRegion);
469	return GlobalizedRD;
470	}
471
472	/// Returns the field in the globalized record for the escaped variable.
473	const FieldDecl getFieldForGlobalizedVar(const* ValueDecl VD) const* {
474	assert(GlobalizedRD &&
475	"Record for globalized variables must be generated already.");
476	return MappedDeclsFields.lookup(Val: VD);
477	}
478
479	/// Returns the list of the escaped local variables/parameters.
480	ArrayRef<const ValueDecl > getEscapedDecls() const* {
481	return EscapedDecls.getArrayRef();
482	}
483
484	/// Checks if the escaped local variable is actually a parameter passed by
485	/// value.
486	const llvm::SmallPtrSetImpl<const Decl > &getEscapedParameters() const* {
487	return EscapedParameters;
488	}
489
490	/// Returns the list of the escaped variables with the variably modified
491	/// types.
492	ArrayRef<const ValueDecl > getEscapedVariableLengthDecls() const* {
493	return EscapedVariableLengthDecls.getArrayRef();
494	}
495
496	/// Returns the list of the delayed variables with the variably modified
497	/// types.
498	ArrayRef<const ValueDecl > getDelayedVariableLengthDecls() const* {
499	return DelayedVariableLengthDecls.getArrayRef();
500	}
501	};
502	} // anonymous namespace
503
504	CGOpenMPRuntimeGPU::ExecutionMode
505	CGOpenMPRuntimeGPU::getExecutionMode() const {
506	return CurrentExecutionMode;
507	}
508
509	CGOpenMPRuntimeGPU::DataSharingMode
510	CGOpenMPRuntimeGPU::getDataSharingMode() const {
511	return CurrentDataSharingMode;
512	}
513
514	/// Check for inner (nested) SPMD construct, if any
515	static bool hasNestedSPMDDirective(ASTContext &Ctx,
516	const OMPExecutableDirective &D) {
517	const auto *CS = D.getInnermostCapturedStmt();
518	const auto *Body =
519	CS->getCapturedStmt()->IgnoreContainers(/IgnoreCaptured=/true);
520	const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
521
522	if (const auto *NestedDir =
523	dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
524	OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
525	switch (D.getDirectiveKind()) {
526	case OMPD_target:
527	if (isOpenMPParallelDirective(DKind))
528	return true;
529	if (DKind == OMPD_teams) {
530	Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
531	/IgnoreCaptured=/true);
532	if (!Body)
533	return false;
534	ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
535	if (const auto *NND =
536	dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
537	DKind = NND->getDirectiveKind();
538	if (isOpenMPParallelDirective(DKind))
539	return true;
540	}
541	}
542	return false;
543	case OMPD_target_teams:
544	return isOpenMPParallelDirective(DKind);
545	case OMPD_target_simd:
546	case OMPD_target_parallel:
547	case OMPD_target_parallel_for:
548	case OMPD_target_parallel_for_simd:
549	case OMPD_target_teams_distribute:
550	case OMPD_target_teams_distribute_simd:
551	case OMPD_target_teams_distribute_parallel_for:
552	case OMPD_target_teams_distribute_parallel_for_simd:
553	case OMPD_parallel:
554	case OMPD_for:
555	case OMPD_parallel_for:
556	case OMPD_parallel_master:
557	case OMPD_parallel_sections:
558	case OMPD_for_simd:
559	case OMPD_parallel_for_simd:
560	case OMPD_cancel:
561	case OMPD_cancellation_point:
562	case OMPD_ordered:
563	case OMPD_threadprivate:
564	case OMPD_allocate:
565	case OMPD_task:
566	case OMPD_simd:
567	case OMPD_sections:
568	case OMPD_section:
569	case OMPD_single:
570	case OMPD_master:
571	case OMPD_critical:
572	case OMPD_taskyield:
573	case OMPD_barrier:
574	case OMPD_taskwait:
575	case OMPD_taskgroup:
576	case OMPD_atomic:
577	case OMPD_flush:
578	case OMPD_depobj:
579	case OMPD_scan:
580	case OMPD_teams:
581	case OMPD_target_data:
582	case OMPD_target_exit_data:
583	case OMPD_target_enter_data:
584	case OMPD_distribute:
585	case OMPD_distribute_simd:
586	case OMPD_distribute_parallel_for:
587	case OMPD_distribute_parallel_for_simd:
588	case OMPD_teams_distribute:
589	case OMPD_teams_distribute_simd:
590	case OMPD_teams_distribute_parallel_for:
591	case OMPD_teams_distribute_parallel_for_simd:
592	case OMPD_target_update:
593	case OMPD_declare_simd:
594	case OMPD_declare_variant:
595	case OMPD_begin_declare_variant:
596	case OMPD_end_declare_variant:
597	case OMPD_declare_target:
598	case OMPD_end_declare_target:
599	case OMPD_declare_reduction:
600	case OMPD_declare_mapper:
601	case OMPD_taskloop:
602	case OMPD_taskloop_simd:
603	case OMPD_master_taskloop:
604	case OMPD_master_taskloop_simd:
605	case OMPD_parallel_master_taskloop:
606	case OMPD_parallel_master_taskloop_simd:
607	case OMPD_requires:
608	case OMPD_unknown:
609	default:
610	llvm_unreachable("Unexpected directive.");
611	}
612	}
613
614	return false;
615	}
616
617	static bool supportsSPMDExecutionMode(ASTContext &Ctx,
618	const OMPExecutableDirective &D) {
619	OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
620	switch (DirectiveKind) {
621	case OMPD_target:
622	case OMPD_target_teams:
623	return hasNestedSPMDDirective(Ctx, D);
624	case OMPD_target_parallel_loop:
625	case OMPD_target_parallel:
626	case OMPD_target_parallel_for:
627	case OMPD_target_parallel_for_simd:
628	case OMPD_target_teams_distribute_parallel_for:
629	case OMPD_target_teams_distribute_parallel_for_simd:
630	case OMPD_target_simd:
631	case OMPD_target_teams_distribute_simd:
632	return true;
633	case OMPD_target_teams_distribute:
634	return false;
635	case OMPD_target_teams_loop:
636	// Whether this is true or not depends on how the directive will
637	// eventually be emitted.
638	if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(Val: &D))
639	return TTLD->canBeParallelFor();
640	return false;
641	case OMPD_parallel:
642	case OMPD_for:
643	case OMPD_parallel_for:
644	case OMPD_parallel_master:
645	case OMPD_parallel_sections:
646	case OMPD_for_simd:
647	case OMPD_parallel_for_simd:
648	case OMPD_cancel:
649	case OMPD_cancellation_point:
650	case OMPD_ordered:
651	case OMPD_threadprivate:
652	case OMPD_allocate:
653	case OMPD_task:
654	case OMPD_simd:
655	case OMPD_sections:
656	case OMPD_section:
657	case OMPD_single:
658	case OMPD_master:
659	case OMPD_critical:
660	case OMPD_taskyield:
661	case OMPD_barrier:
662	case OMPD_taskwait:
663	case OMPD_taskgroup:
664	case OMPD_atomic:
665	case OMPD_flush:
666	case OMPD_depobj:
667	case OMPD_scan:
668	case OMPD_teams:
669	case OMPD_target_data:
670	case OMPD_target_exit_data:
671	case OMPD_target_enter_data:
672	case OMPD_distribute:
673	case OMPD_distribute_simd:
674	case OMPD_distribute_parallel_for:
675	case OMPD_distribute_parallel_for_simd:
676	case OMPD_teams_distribute:
677	case OMPD_teams_distribute_simd:
678	case OMPD_teams_distribute_parallel_for:
679	case OMPD_teams_distribute_parallel_for_simd:
680	case OMPD_target_update:
681	case OMPD_declare_simd:
682	case OMPD_declare_variant:
683	case OMPD_begin_declare_variant:
684	case OMPD_end_declare_variant:
685	case OMPD_declare_target:
686	case OMPD_end_declare_target:
687	case OMPD_declare_reduction:
688	case OMPD_declare_mapper:
689	case OMPD_taskloop:
690	case OMPD_taskloop_simd:
691	case OMPD_master_taskloop:
692	case OMPD_master_taskloop_simd:
693	case OMPD_parallel_master_taskloop:
694	case OMPD_parallel_master_taskloop_simd:
695	case OMPD_requires:
696	case OMPD_unknown:
697	default:
698	break;
699	}
700	llvm_unreachable(
701	"Unknown programming model for OpenMP directive on NVPTX target.");
702	}
703
704	void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
705	StringRef ParentName,
706	llvm::Function *&OutlinedFn,
707	llvm::Constant *&OutlinedFnID,
708	bool IsOffloadEntry,
709	const RegionCodeGenTy &CodeGen) {
710	ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
711	EntryFunctionState EST;
712	WrapperFunctionsMap.clear();
713
714	[[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
715	assert(!IsBareKernel && "bare kernel should not be at generic mode");
716
717	// Emit target region as a standalone region.
718	class NVPTXPrePostActionTy : public PrePostActionTy {
719	CGOpenMPRuntimeGPU::EntryFunctionState &EST;
720	const OMPExecutableDirective &D;
721
722	public:
723	NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
724	const OMPExecutableDirective &D)
725	: EST(EST), D(D) {}
726	void Enter(CodeGenFunction &CGF) override {
727	auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
728	RT.emitKernelInit(D, CGF, EST, / IsSPMD / false);
729	// Skip target region initialization.
730	RT.setLocThreadIdInsertPt(CGF, /AtCurrentPoint=/true);
731	}
732	void Exit(CodeGenFunction &CGF) override {
733	auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
734	RT.clearLocThreadIdInsertPt(CGF);
735	RT.emitKernelDeinit(CGF, EST, / IsSPMD / false);
736	}
737	} Action(EST, D);
738	CodeGen.setAction(Action);
739	IsInTTDRegion = true;
740	emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
741	IsOffloadEntry, CodeGen);
742	IsInTTDRegion = false;
743	}
744
745	void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
746	CodeGenFunction &CGF,
747	EntryFunctionState &EST, bool IsSPMD) {
748	llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs;
749	Attrs.ExecFlags =
750	IsSPMD ? llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD
751	: llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
752	computeMinAndMaxThreadsAndTeams(D, CGF, Attrs);
753
754	CGBuilderTy &Bld = CGF.Builder;
755	Bld.restoreIP(IP: OMPBuilder.createTargetInit(Loc: Bld, Attrs));
756	if (!IsSPMD)
757	emitGenericVarsProlog(CGF, Loc: EST.Loc);
758	}
759
760	void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
761	EntryFunctionState &EST,
762	bool IsSPMD) {
763	if (!IsSPMD)
764	emitGenericVarsEpilog(CGF);
765
766	// This is temporary until we remove the fixed sized buffer.
767	ASTContext &C = CGM.getContext();
768	RecordDecl *StaticRD = C.buildImplicitRecord(
769	Name: "_openmp_teams_reduction_type_$_", TK: RecordDecl::TagKind::Union);
770	StaticRD->startDefinition();
771	for (const RecordDecl *TeamReductionRec : TeamsReductions) {
772	QualType RecTy = C.getRecordType(Decl: TeamReductionRec);
773	auto *Field = FieldDecl::Create(
774	C, DC: StaticRD, StartLoc: SourceLocation (), IdLoc: SourceLocation (), Id: nullptr, T: RecTy,
775	TInfo: C.getTrivialTypeSourceInfo(T: RecTy, Loc: SourceLocation ()),
776	/BW=/nullptr, /Mutable=/false,
777	/InitStyle=/ICIS_NoInit);
778	Field->setAccess(AS_public);
779	StaticRD->addDecl(D: Field);
780	}
781	StaticRD->completeDefinition();
782	QualType StaticTy = C.getRecordType(Decl: StaticRD);
783	llvm::Type *LLVMReductionsBufferTy =
784	CGM.getTypes().ConvertTypeForMem(T: StaticTy);
785	const auto &DL = CGM.getModule().getDataLayout();
786	uint64_t ReductionDataSize =
787	TeamsReductions.empty()
788	? `0`
789	: DL.getTypeAllocSize(Ty: LLVMReductionsBufferTy).getFixedValue();
790	CGBuilderTy &Bld = CGF.Builder;
791	OMPBuilder.createTargetDeinit(Loc: Bld, TeamsReductionDataSize: ReductionDataSize,
792	TeamsReductionBufferLength: C.getLangOpts().OpenMPCUDAReductionBufNum);
793	TeamsReductions.clear();
794	}
795
796	void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
797	StringRef ParentName,
798	llvm::Function *&OutlinedFn,
799	llvm::Constant *&OutlinedFnID,
800	bool IsOffloadEntry,
801	const RegionCodeGenTy &CodeGen) {
802	ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
803	EntryFunctionState EST;
804
805	bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
806
807	// Emit target region as a standalone region.
808	class NVPTXPrePostActionTy : public PrePostActionTy {
809	CGOpenMPRuntimeGPU &RT;
810	CGOpenMPRuntimeGPU::EntryFunctionState &EST;
811	bool IsBareKernel;
812	DataSharingMode Mode;
813	const OMPExecutableDirective &D;
814
815	public:
816	NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
817	CGOpenMPRuntimeGPU::EntryFunctionState &EST,
818	bool IsBareKernel, const OMPExecutableDirective &D)
819	: RT(RT), EST(EST), IsBareKernel(IsBareKernel),
820	Mode(RT.CurrentDataSharingMode), D(D) {}
821	void Enter(CodeGenFunction &CGF) override {
822	if (IsBareKernel) {
823	RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
824	return;
825	}
826	RT.emitKernelInit(D, CGF, EST, / IsSPMD / true);
827	// Skip target region initialization.
828	RT.setLocThreadIdInsertPt(CGF, /AtCurrentPoint=/true);
829	}
830	void Exit(CodeGenFunction &CGF) override {
831	if (IsBareKernel) {
832	RT.CurrentDataSharingMode = Mode;
833	return;
834	}
835	RT.clearLocThreadIdInsertPt(CGF);
836	RT.emitKernelDeinit(CGF, EST, / IsSPMD / true);
837	}
838	} Action(*this, EST, IsBareKernel, D);
839	CodeGen.setAction(Action);
840	IsInTTDRegion = true;
841	emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
842	IsOffloadEntry, CodeGen);
843	IsInTTDRegion = false;
844	}
845
846	void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
847	const OMPExecutableDirective &D, StringRef ParentName,
848	llvm::Function &OutlinedFn, llvm::Constant &OutlinedFnID,
849	bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
850	if (!IsOffloadEntry) // Nothing to do.
851	return;
852
853	assert(!ParentName.empty() && "Invalid target region parent name!");
854
855	bool Mode = supportsSPMDExecutionMode(Ctx&: CGM.getContext(), D);
856	bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
857	if (Mode \|\| IsBareKernel)
858	emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
859	CodeGen);
860	else
861	emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
862	CodeGen);
863	}
864
865	CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
866	: CGOpenMPRuntime (CGM) {
867	llvm::OpenMPIRBuilderConfig Config(
868	CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
869	CGM.getLangOpts().OpenMPOffloadMandatory,
870	/HasRequiresReverseOffload/ false, /HasRequiresUnifiedAddress/ false,
871	hasRequiresUnifiedSharedMemory(), /HasRequiresDynamicAllocators/ false);
872	OMPBuilder.setConfig(Config);
873
874	if (!CGM.getLangOpts().OpenMPIsTargetDevice)
875	llvm_unreachable("OpenMP can only handle device code.");
876
877	if (CGM.getLangOpts().OpenMPCUDAMode)
878	CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
879
880	llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
881	if (CGM.getLangOpts().NoGPULib \|\| CGM.getLangOpts().OMPHostIRFile.empty())
882	return;
883
884	OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTargetDebug,
885	Name: "__omp_rtl_debug_kind");
886	OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTeamSubscription,
887	Name: "__omp_rtl_assume_teams_oversubscription");
888	OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPThreadSubscription,
889	Name: "__omp_rtl_assume_threads_oversubscription");
890	OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoThreadState,
891	Name: "__omp_rtl_assume_no_thread_state");
892	OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoNestedParallelism,
893	Name: "__omp_rtl_assume_no_nested_parallelism");
894	}
895
896	void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
897	ProcBindKind ProcBind,
898	SourceLocation Loc) {
899	// Nothing to do.
900	}
901
902	void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
903	llvm::Value *NumThreads,
904	SourceLocation Loc) {
905	// Nothing to do.
906	}
907
908	void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
909	const Expr *NumTeams,
910	const Expr *ThreadLimit,
911	SourceLocation Loc) {}
912
913	llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
914	CodeGenFunction &CGF, const OMPExecutableDirective &D,
915	const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
916	const RegionCodeGenTy &CodeGen) {
917	// Emit target region as a standalone region.
918	bool PrevIsInTTDRegion = IsInTTDRegion;
919	IsInTTDRegion = false;
920	auto *OutlinedFun =
921	cast<llvm::Function>(Val: CGOpenMPRuntime::emitParallelOutlinedFunction(
922	CGF, D, ThreadIDVar, InnermostKind, CodeGen));
923	IsInTTDRegion = PrevIsInTTDRegion;
924	if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
925	llvm::Function *WrapperFun =
926	createParallelDataSharingWrapper(OutlinedParallelFn: OutlinedFun, D);
927	WrapperFunctionsMap [OutlinedFun] = WrapperFun;
928	}
929
930	return OutlinedFun;
931	}
932
933	/// Get list of lastprivate variables from the teams distribute ... or
934	/// teams {distribute ...} directives.
935	static void
936	getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
937	llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
938	assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
939	"expected teams directive.");
940	const OMPExecutableDirective *Dir = &D;
941	if (!isOpenMPDistributeDirective(DKind: D.getDirectiveKind())) {
942	if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
943	Ctx,
944	Body: D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
945	/IgnoreCaptured=/true))) {
946	Dir = dyn_cast_or_null<OMPExecutableDirective>(Val: S);
947	if (Dir && !isOpenMPDistributeDirective(DKind: Dir->getDirectiveKind()))
948	Dir = nullptr;
949	}
950	}
951	if (!Dir)
952	return;
953	for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
954	for (const Expr *E : C->getVarRefs())
955	Vars.push_back(Elt: getPrivateItem(RefExpr: E));
956	}
957	}
958
959	/// Get list of reduction variables from the teams ... directives.
960	static void
961	getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
962	llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
963	assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
964	"expected teams directive.");
965	for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
966	for (const Expr *E : C->privates())
967	Vars.push_back(Elt: getPrivateItem(RefExpr: E));
968	}
969	}
970
971	llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
972	CodeGenFunction &CGF, const OMPExecutableDirective &D,
973	const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
974	const RegionCodeGenTy &CodeGen) {
975	SourceLocation Loc = D.getBeginLoc();
976
977	const RecordDecl GlobalizedRD = nullptr*;
978	llvm::SmallVector<const ValueDecl *, `4`> LastPrivatesReductions;
979	llvm::SmallDenseMap<const ValueDecl , const* FieldDecl *> MappedDeclsFields;
980	unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
981	// Globalize team reductions variable unconditionally in all modes.
982	if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
983	getTeamsReductionVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
984	if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
985	getDistributeLastprivateVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
986	if (!LastPrivatesReductions.empty()) {
987	GlobalizedRD = ::buildRecordForGlobalizedVars(
988	C&: CGM.getContext(), EscapedDecls: {}, EscapedDeclsForTeams: LastPrivatesReductions, MappedDeclsFields,
989	BufSize: WarpSize);
990	}
991	} else if (!LastPrivatesReductions.empty()) {
992	assert(!TeamAndReductions.first &&
993	"Previous team declaration is not expected.");
994	TeamAndReductions.first = D.getCapturedStmt(RegionKind: OMPD_teams)->getCapturedDecl();
995	std::swap(LHS&: TeamAndReductions.second, RHS&: LastPrivatesReductions);
996	}
997
998	// Emit target region as a standalone region.
999	class NVPTXPrePostActionTy : public PrePostActionTy {
1000	SourceLocation &Loc;
1001	const RecordDecl *GlobalizedRD;
1002	llvm::SmallDenseMap<const ValueDecl , const* FieldDecl *>
1003	&MappedDeclsFields;
1004
1005	public:
1006	NVPTXPrePostActionTy(
1007	SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1008	llvm::SmallDenseMap<const ValueDecl , const* FieldDecl *>
1009	&MappedDeclsFields)
1010	: Loc(Loc), GlobalizedRD(GlobalizedRD),
1011	MappedDeclsFields(MappedDeclsFields) {}
1012	void Enter(CodeGenFunction &CGF) override {
1013	auto &Rt =
1014	static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1015	if (GlobalizedRD) {
1016	auto I = Rt.FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
1017	I ->getSecond().MappedParams =
1018	std::make_unique<CodeGenFunction::OMPMapVars>();
1019	DeclToAddrMapTy &Data = I ->getSecond().LocalVarData;
1020	for (const auto &Pair : MappedDeclsFields) {
1021	assert(Pair.getFirst()->isCanonicalDecl() &&
1022	"Expected canonical declaration");
1023	Data.try_emplace(Key: Pair.getFirst());
1024	}
1025	}
1026	Rt.emitGenericVarsProlog(CGF, Loc);
1027	}
1028	void Exit(CodeGenFunction &CGF) override {
1029	static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1030	.emitGenericVarsEpilog(CGF);
1031	}
1032	} Action(Loc, GlobalizedRD, MappedDeclsFields);
1033	CodeGen.setAction(Action);
1034	llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1035	CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1036
1037	return OutlinedFun;
1038	}
1039
1040	void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1041	SourceLocation Loc) {
1042	if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1043	return;
1044
1045	CGBuilderTy &Bld = CGF.Builder;
1046
1047	const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1048	if (I == FunctionGlobalizedDecls.end())
1049	return;
1050
1051	for (auto &Rec : I ->getSecond().LocalVarData) {
1052	const auto *VD = cast<VarDecl>(Val: Rec.first);
1053	bool EscapedParam = I ->getSecond().EscapedParameters.count(Ptr: Rec.first);
1054	QualType VarTy = VD->getType();
1055
1056	// Get the local allocation of a firstprivate variable before sharing
1057	llvm::Value *ParValue;
1058	if (EscapedParam) {
1059	LValue ParLVal =
1060	CGF.MakeAddrLValue(Addr: CGF.GetAddrOfLocalVar(VD), T: VD->getType());
1061	ParValue = CGF.EmitLoadOfScalar(lvalue: ParLVal, Loc);
1062	}
1063
1064	// Allocate space for the variable to be globalized
1065	llvm::Value *AllocArgs[] = {CGF.getTypeSize(Ty: VD->getType())};
1066	llvm::CallBase *VoidPtr =
1067	CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1068	M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared),
1069	args: AllocArgs, name: VD->getName());
1070	// FIXME: We should use the variables actual alignment as an argument.
1071	VoidPtr->addRetAttr(Attr: llvm::Attribute::get(
1072	Context&: CGM.getLLVMContext(), Kind: llvm::Attribute::Alignment,
1073	Val: CGM.getContext().getTargetInfo().getNewAlign() / `8`));
1074
1075	// Cast the void pointer and get the address of the globalized variable.
1076	llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1077	V: VoidPtr, DestTy: Bld.getPtrTy(AddrSpace: `0`), Name: VD->getName() + "_on_stack");
1078	LValue VarAddr =
1079	CGF.MakeNaturalAlignPointeeRawAddrLValue(V: CastedVoidPtr, T: VarTy);
1080	Rec.second.PrivateAddr = VarAddr.getAddress();
1081	Rec.second.GlobalizedVal = VoidPtr;
1082
1083	// Assign the local allocation to the newly globalized location.
1084	if (EscapedParam) {
1085	CGF.EmitStoreOfScalar(value: ParValue, lvalue: VarAddr);
1086	I ->getSecond().MappedParams ->setVarAddr(CGF, LocalVD: VD, TempAddr: VarAddr.getAddress());
1087	}
1088	if (auto *DI = CGF.getDebugInfo())
1089	VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(Loc: VD->getLocation()));
1090	}
1091
1092	for (const auto *ValueD : I ->getSecond().EscapedVariableLengthDecls) {
1093	const auto *VD = cast<VarDecl>(Val: ValueD);
1094	std::pair<llvm::Value , llvm::Value > AddrSizePair =
1095	getKmpcAllocShared(CGF, VD);
1096	I ->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(Args&: AddrSizePair);
1097	LValue Base = CGF.MakeAddrLValue(V: AddrSizePair.first, T: VD->getType(),
1098	Alignment: CGM.getContext().getDeclAlign(D: VD),
1099	Source: AlignmentSource::Decl);
1100	I ->getSecond().MappedParams ->setVarAddr(CGF, LocalVD: VD, TempAddr: Base.getAddress());
1101	}
1102	I ->getSecond().MappedParams ->apply(CGF);
1103	}
1104
1105	bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
1106	const VarDecl VD) const* {
1107	const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1108	if (I == FunctionGlobalizedDecls.end())
1109	return false;
1110
1111	// Check variable declaration is delayed:
1112	return llvm::is_contained(Range: I ->getSecond().DelayedVariableLengthDecls, Element: VD);
1113	}
1114
1115	std::pair<llvm::Value , llvm::Value >
1116	CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
1117	const VarDecl *VD) {
1118	CGBuilderTy &Bld = CGF.Builder;
1119
1120	// Compute size and alignment.
1121	llvm::Value *Size = CGF.getTypeSize(Ty: VD->getType());
1122	CharUnits Align = CGM.getContext().getDeclAlign(D: VD);
1123	Size = Bld.CreateNUWAdd(
1124	LHS: Size, RHS: llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity() - `1`));
1125	llvm::Value *AlignVal =
1126	llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity());
1127	Size = Bld.CreateUDiv(LHS: Size, RHS: AlignVal);
1128	Size = Bld.CreateNUWMul(LHS: Size, RHS: AlignVal);
1129
1130	// Allocate space for this VLA object to be globalized.
1131	llvm::Value *AllocArgs[] = {Size};
1132	llvm::CallBase *VoidPtr =
1133	CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1134	M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared),
1135	args: AllocArgs, name: VD->getName());
1136	VoidPtr->addRetAttr(Attr: llvm::Attribute::get(
1137	Context&: CGM.getLLVMContext(), Kind: llvm::Attribute::Alignment, Val: Align.getQuantity()));
1138
1139	return std::make_pair(x&: VoidPtr, y&: Size);
1140	}
1141
1142	void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1143	CodeGenFunction &CGF,
1144	const std::pair<llvm::Value , llvm::Value > &AddrSizePair) {
1145	// Deallocate the memory for each globalized VLA object
1146	CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1147	M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1148	args: {AddrSizePair.first, AddrSizePair.second});
1149	}
1150
1151	void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1152	if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1153	return;
1154
1155	const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1156	if (I != FunctionGlobalizedDecls.end()) {
1157	// Deallocate the memory for each globalized VLA object that was
1158	// globalized in the prolog (i.e. emitGenericVarsProlog).
1159	for (const auto &AddrSizePair :
1160	llvm::reverse(C&: I ->getSecond().EscapedVariableLengthDeclsAddrs)) {
1161	CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1162	M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1163	args: {AddrSizePair.first, AddrSizePair.second});
1164	}
1165	// Deallocate the memory for each globalized value
1166	for (auto &Rec : llvm::reverse(C&: I ->getSecond().LocalVarData)) {
1167	const auto *VD = cast<VarDecl>(Val: Rec.first);
1168	I ->getSecond().MappedParams ->restore(CGF);
1169
1170	llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1171	CGF.getTypeSize(Ty: VD->getType())};
1172	CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1173	M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1174	args: FreeArgs);
1175	}
1176	}
1177	}
1178
1179	void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1180	const OMPExecutableDirective &D,
1181	SourceLocation Loc,
1182	llvm::Function *OutlinedFn,
1183	ArrayRef<llvm::Value *> CapturedVars) {
1184	if (!CGF.HaveInsertPoint())
1185	return;
1186
1187	bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1188
1189	RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
1190	/Name=/".zero.addr");
1191	CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/C/ `0`), Addr: ZeroAddr);
1192	llvm::SmallVector<llvm::Value *, `16`> OutlinedFnArgs;
1193	// We don't emit any thread id function call in bare kernel, but because the
1194	// outlined function has a pointer argument, we emit a nullptr here.
1195	if (IsBareKernel)
1196	OutlinedFnArgs.push_back(Elt: llvm::ConstantPointerNull::get(T: CGM.VoidPtrTy));
1197	else
1198	OutlinedFnArgs.push_back(Elt: emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1199	OutlinedFnArgs.push_back(Elt: ZeroAddr.getPointer());
1200	OutlinedFnArgs.append(in_start: CapturedVars.begin(), in_end: CapturedVars.end());
1201	emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: OutlinedFnArgs);
1202	}
1203
1204	void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1205	SourceLocation Loc,
1206	llvm::Function *OutlinedFn,
1207	ArrayRef<llvm::Value *> CapturedVars,
1208	const Expr *IfCond,
1209	llvm::Value *NumThreads) {
1210	if (!CGF.HaveInsertPoint())
1211	return;
1212
1213	auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1214	NumThreads](CodeGenFunction &CGF,
1215	PrePostActionTy &Action) {
1216	CGBuilderTy &Bld = CGF.Builder;
1217	llvm::Value *NumThreadsVal = NumThreads;
1218	llvm::Function *WFn = WrapperFunctionsMap [OutlinedFn];
1219	llvm::Value *ID = llvm::ConstantPointerNull::get(T: CGM.Int8PtrTy);
1220	if (WFn)
1221	ID = Bld.CreateBitOrPointerCast(V: WFn, DestTy: CGM.Int8PtrTy);
1222	llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(V: OutlinedFn, DestTy: CGM.Int8PtrTy);
1223
1224	// Create a private scope that will globalize the arguments
1225	// passed from the outside of the target region.
1226	// TODO: Is that needed?
1227	CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1228
1229	Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1230	Ty: llvm::ArrayType::get(ElementType: CGM.VoidPtrTy, NumElements: CapturedVars.size()),
1231	Name: "captured_vars_addrs");
1232	// There's something to share.
1233	if (!CapturedVars.empty()) {
1234	// Prepare for parallel region. Indicate the outlined function.
1235	ASTContext &Ctx = CGF.getContext();
1236	unsigned Idx = `0`;
1237	for (llvm::Value *V : CapturedVars) {
1238	Address Dst = Bld.CreateConstArrayGEP(Addr: CapturedVarsAddrs, Index: Idx);
1239	llvm::Value *PtrV;
1240	if (V->getType()->isIntegerTy())
1241	PtrV = Bld.CreateIntToPtr(V, DestTy: CGF.VoidPtrTy);
1242	else
1243	PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, DestTy: CGF.VoidPtrTy);
1244	CGF.EmitStoreOfScalar(Value: PtrV, Addr: Dst, /Volatile=/false,
1245	Ty: Ctx.getPointerType(T: Ctx.VoidPtrTy));
1246	++Idx;
1247	}
1248	}
1249
1250	llvm::Value IfCondVal = nullptr*;
1251	if (IfCond)
1252	IfCondVal = Bld.CreateIntCast(V: CGF.EvaluateExprAsBool(E: IfCond), DestTy: CGF.Int32Ty,
1253	/ isSigned / false);
1254	else
1255	IfCondVal = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: `1`);
1256
1257	if (!NumThreadsVal)
1258	NumThreadsVal = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: -`1`);
1259	else
1260	NumThreadsVal = Bld.CreateZExtOrTrunc(V: NumThreadsVal, DestTy: CGF.Int32Ty);
1261
1262	assert(IfCondVal && "Expected a value");
1263	llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1264	llvm::Value *Args[] = {
1265	RTLoc,
1266	getThreadID(CGF, Loc),
1267	IfCondVal,
1268	NumThreadsVal,
1269	llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: -`1`),
1270	FnPtr,
1271	ID,
1272	Bld.CreateBitOrPointerCast(V: CapturedVarsAddrs.emitRawPointer(CGF),
1273	DestTy: CGF.VoidPtrPtrTy),
1274	llvm::ConstantInt::get(Ty: CGM.SizeTy, V: CapturedVars.size())};
1275	CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1276	M&: CGM.getModule(), FnID: OMPRTL___kmpc_parallel_51),
1277	args: Args);
1278	};
1279
1280	RegionCodeGenTy RCG(ParallelGen);
1281	RCG (CGF);
1282	}
1283
1284	void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1285	// Always emit simple barriers!
1286	if (!CGF.HaveInsertPoint())
1287	return;
1288	// Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1289	// This function does not use parameters, so we can emit just default values.
1290	llvm::Value *Args[] = {
1291	llvm::ConstantPointerNull::get(
1292	T: cast<llvm::PointerType>(Val: getIdentTyPointerTy())),
1293	llvm::ConstantInt::get(Ty: CGF.Int32Ty, /V=/`0`, /isSigned=/IsSigned: true)};
1294	CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1295	M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier_simple_spmd),
1296	args: Args);
1297	}
1298
1299	void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1300	SourceLocation Loc,
1301	OpenMPDirectiveKind Kind, bool,
1302	bool) {
1303	// Always emit simple barriers!
1304	if (!CGF.HaveInsertPoint())
1305	return;
1306	// Build call __kmpc_cancel_barrier(loc, thread_id);
1307	unsigned Flags = getDefaultFlagsForBarriers(Kind);
1308	llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1309	getThreadID(CGF, Loc)};
1310
1311	CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1312	M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier),
1313	args: Args);
1314	}
1315
1316	void CGOpenMPRuntimeGPU::emitCriticalRegion(
1317	CodeGenFunction &CGF, StringRef CriticalName,
1318	const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1319	const Expr *Hint) {
1320	llvm::BasicBlock *LoopBB = CGF.createBasicBlock(name: "omp.critical.loop");
1321	llvm::BasicBlock *TestBB = CGF.createBasicBlock(name: "omp.critical.test");
1322	llvm::BasicBlock *SyncBB = CGF.createBasicBlock(name: "omp.critical.sync");
1323	llvm::BasicBlock *BodyBB = CGF.createBasicBlock(name: "omp.critical.body");
1324	llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: "omp.critical.exit");
1325
1326	auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1327
1328	// Get the mask of active threads in the warp.
1329	llvm::Value *Mask = CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1330	M&: CGM.getModule(), FnID: OMPRTL___kmpc_warp_active_thread_mask));
1331	// Fetch team-local id of the thread.
1332	llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1333
1334	// Get the width of the team.
1335	llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1336
1337	// Initialize the counter variable for the loop.
1338	QualType Int32Ty =
1339	CGF.getContext().getIntTypeForBitwidth(/DestWidth=/`32`, /Signed=/`0`);
1340	Address Counter = CGF.CreateMemTemp(T: Int32Ty, Name: "critical_counter");
1341	LValue CounterLVal = CGF.MakeAddrLValue(Addr: Counter, T: Int32Ty);
1342	CGF.EmitStoreOfScalar(value: llvm::Constant::getNullValue(Ty: CGM.Int32Ty), lvalue: CounterLVal,
1343	/isInit=/true);
1344
1345	// Block checks if loop counter exceeds upper bound.
1346	CGF.EmitBlock(BB: LoopBB);
1347	llvm::Value *CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
1348	llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(LHS: CounterVal, RHS: TeamWidth);
1349	CGF.Builder.CreateCondBr(Cond: CmpLoopBound, True: TestBB, False: ExitBB);
1350
1351	// Block tests which single thread should execute region, and which threads
1352	// should go straight to synchronisation point.
1353	CGF.EmitBlock(BB: TestBB);
1354	CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
1355	llvm::Value *CmpThreadToCounter =
1356	CGF.Builder.CreateICmpEQ(LHS: ThreadID, RHS: CounterVal);
1357	CGF.Builder.CreateCondBr(Cond: CmpThreadToCounter, True: BodyBB, False: SyncBB);
1358
1359	// Block emits the body of the critical region.
1360	CGF.EmitBlock(BB: BodyBB);
1361
1362	// Output the critical statement.
1363	CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1364	Hint);
1365
1366	// After the body surrounded by the critical region, the single executing
1367	// thread will jump to the synchronisation point.
1368	// Block waits for all threads in current team to finish then increments the
1369	// counter variable and returns to the loop.
1370	CGF.EmitBlock(BB: SyncBB);
1371	// Reconverge active threads in the warp.
1372	(void)CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1373	M&: CGM.getModule(), FnID: OMPRTL___kmpc_syncwarp),
1374	args: Mask);
1375
1376	llvm::Value *IncCounterVal =
1377	CGF.Builder.CreateNSWAdd(LHS: CounterVal, RHS: CGF.Builder.getInt32(C: `1`));
1378	CGF.EmitStoreOfScalar(value: IncCounterVal, lvalue: CounterLVal);
1379	CGF.EmitBranch(Block: LoopBB);
1380
1381	// Block that is reached when all threads in the team complete the region.
1382	CGF.EmitBlock(BB: ExitBB, /IsFinished=/true);
1383	}
1384
1385	/// Cast value to the specified type.
1386	static llvm::Value castValueToType(CodeGenFunction &CGF, llvm::Value Val,
1387	QualType ValTy, QualType CastTy,
1388	SourceLocation Loc) {
1389	assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1390	"Cast type must sized.");
1391	assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1392	"Val type must sized.");
1393	llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(T: CastTy);
1394	if (ValTy == CastTy)
1395	return Val;
1396	if (CGF.getContext().getTypeSizeInChars(T: ValTy) ==
1397	CGF.getContext().getTypeSizeInChars(T: CastTy))
1398	return CGF.Builder.CreateBitCast(V: Val, DestTy: LLVMCastTy);
1399	if (CastTy ->isIntegerType() && ValTy ->isIntegerType())
1400	return CGF.Builder.CreateIntCast(V: Val, DestTy: LLVMCastTy,
1401	isSigned: CastTy ->hasSignedIntegerRepresentation());
1402	Address CastItem = CGF.CreateMemTemp(T: CastTy);
1403	Address ValCastItem = CastItem.withElementType(ElemTy: Val->getType());
1404	CGF.EmitStoreOfScalar(Value: Val, Addr: ValCastItem, /Volatile=/false, Ty: ValTy,
1405	BaseInfo: LValueBaseInfo (AlignmentSource::Type),
1406	TBAAInfo: TBAAAccessInfo ());
1407	return CGF.EmitLoadOfScalar(Addr: CastItem, /Volatile=/false, Ty: CastTy, Loc,
1408	BaseInfo: LValueBaseInfo (AlignmentSource::Type),
1409	TBAAInfo: TBAAAccessInfo ());
1410	}
1411
1412	///
1413	/// Design of OpenMP reductions on the GPU
1414	///
1415	/// Consider a typical OpenMP program with one or more reduction
1416	/// clauses:
1417	///
1418	/// float foo;
1419	/// double bar;
1420	/// #pragma omp target teams distribute parallel for \
1421	/// reduction(+:foo) reduction(:bar)*
1422	/// for (int i = 0; i < N; i++) {
1423	/// foo += A[i]; bar = B[i];*
1424	/// }
1425	///
1426	/// where 'foo' and 'bar' are reduced across all OpenMP threads in
1427	/// all teams. In our OpenMP implementation on the NVPTX device an
1428	/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1429	/// within a team are mapped to CUDA threads within a threadblock.
1430	/// Our goal is to efficiently aggregate values across all OpenMP
1431	/// threads such that:
1432	///
1433	/// - the compiler and runtime are logically concise, and
1434	/// - the reduction is performed efficiently in a hierarchical
1435	/// manner as follows: within OpenMP threads in the same warp,
1436	/// across warps in a threadblock, and finally across teams on
1437	/// the NVPTX device.
1438	///
1439	/// Introduction to Decoupling
1440	///
1441	/// We would like to decouple the compiler and the runtime so that the
1442	/// latter is ignorant of the reduction variables (number, data types)
1443	/// and the reduction operators. This allows a simpler interface
1444	/// and implementation while still attaining good performance.
1445	///
1446	/// Pseudocode for the aforementioned OpenMP program generated by the
1447	/// compiler is as follows:
1448	///
1449	/// 1. Create private copies of reduction variables on each OpenMP
1450	/// thread: 'foo_private', 'bar_private'
1451	/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1452	/// to it and writes the result in 'foo_private' and 'bar_private'
1453	/// respectively.
1454	/// 3. Call the OpenMP runtime on the GPU to reduce within a team
1455	/// and store the result on the team master:
1456	///
1457	/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1458	/// reduceData, shuffleReduceFn, interWarpCpyFn)
1459	///
1460	/// where:
1461	/// struct ReduceData {
1462	/// double foo;*
1463	/// double bar;*
1464	/// } reduceData
1465	/// reduceData.foo = &foo_private
1466	/// reduceData.bar = &bar_private
1467	///
1468	/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1469	/// auxiliary functions generated by the compiler that operate on
1470	/// variables of type 'ReduceData'. They aid the runtime perform
1471	/// algorithmic steps in a data agnostic manner.
1472	///
1473	/// 'shuffleReduceFn' is a pointer to a function that reduces data
1474	/// of type 'ReduceData' across two OpenMP threads (lanes) in the
1475	/// same warp. It takes the following arguments as input:
1476	///
1477	/// a. variable of type 'ReduceData' on the calling lane,
1478	/// b. its lane_id,
1479	/// c. an offset relative to the current lane_id to generate a
1480	/// remote_lane_id. The remote lane contains the second
1481	/// variable of type 'ReduceData' that is to be reduced.
1482	/// d. an algorithm version parameter determining which reduction
1483	/// algorithm to use.
1484	///
1485	/// 'shuffleReduceFn' retrieves data from the remote lane using
1486	/// efficient GPU shuffle intrinsics and reduces, using the
1487	/// algorithm specified by the 4th parameter, the two operands
1488	/// element-wise. The result is written to the first operand.
1489	///
1490	/// Different reduction algorithms are implemented in different
1491	/// runtime functions, all calling 'shuffleReduceFn' to perform
1492	/// the essential reduction step. Therefore, based on the 4th
1493	/// parameter, this function behaves slightly differently to
1494	/// cooperate with the runtime to ensure correctness under
1495	/// different circumstances.
1496	///
1497	/// 'InterWarpCpyFn' is a pointer to a function that transfers
1498	/// reduced variables across warps. It tunnels, through CUDA
1499	/// shared memory, the thread-private data of type 'ReduceData'
1500	/// from lane 0 of each warp to a lane in the first warp.
1501	/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1502	/// The last team writes the global reduced value to memory.
1503	///
1504	/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1505	/// reduceData, shuffleReduceFn, interWarpCpyFn,
1506	/// scratchpadCopyFn, loadAndReduceFn)
1507	///
1508	/// 'scratchpadCopyFn' is a helper that stores reduced
1509	/// data from the team master to a scratchpad array in
1510	/// global memory.
1511	///
1512	/// 'loadAndReduceFn' is a helper that loads data from
1513	/// the scratchpad array and reduces it with the input
1514	/// operand.
1515	///
1516	/// These compiler generated functions hide address
1517	/// calculation and alignment information from the runtime.
1518	/// 5. if ret == 1:
1519	/// The team master of the last team stores the reduced
1520	/// result to the globals in memory.
1521	/// foo += reduceData.foo; bar = reduceData.bar*
1522	///
1523	///
1524	/// Warp Reduction Algorithms
1525	///
1526	/// On the warp level, we have three algorithms implemented in the
1527	/// OpenMP runtime depending on the number of active lanes:
1528	///
1529	/// Full Warp Reduction
1530	///
1531	/// The reduce algorithm within a warp where all lanes are active
1532	/// is implemented in the runtime as follows:
1533	///
1534	/// full_warp_reduce(void reduce_data,*
1535	/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1536	/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1537	/// ShuffleReduceFn(reduce_data, 0, offset, 0);
1538	/// }
1539	///
1540	/// The algorithm completes in log(2, WARPSIZE) steps.
1541	///
1542	/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1543	/// not used therefore we save instructions by not retrieving lane_id
1544	/// from the corresponding special registers. The 4th parameter, which
1545	/// represents the version of the algorithm being used, is set to 0 to
1546	/// signify full warp reduction.
1547	///
1548	/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1549	///
1550	/// #reduce_elem refers to an element in the local lane's data structure
1551	/// #remote_elem is retrieved from a remote lane
1552	/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1553	/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1554	///
1555	/// Contiguous Partial Warp Reduction
1556	///
1557	/// This reduce algorithm is used within a warp where only the first
1558	/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1559	/// number of OpenMP threads in a parallel region is not a multiple of
1560	/// WARPSIZE. The algorithm is implemented in the runtime as follows:
1561	///
1562	/// void
1563	/// contiguous_partial_reduce(void reduce_data,*
1564	/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1565	/// int size, int lane_id) {
1566	/// int curr_size;
1567	/// int offset;
1568	/// curr_size = size;
1569	/// mask = curr_size/2;
1570	/// while (offset>0) {
1571	/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1572	/// curr_size = (curr_size+1)/2;
1573	/// offset = curr_size/2;
1574	/// }
1575	/// }
1576	///
1577	/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1578	///
1579	/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1580	/// if (lane_id < offset)
1581	/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1582	/// else
1583	/// reduce_elem = remote_elem
1584	///
1585	/// This algorithm assumes that the data to be reduced are located in a
1586	/// contiguous subset of lanes starting from the first. When there is
1587	/// an odd number of active lanes, the data in the last lane is not
1588	/// aggregated with any other lane's dat but is instead copied over.
1589	///
1590	/// Dispersed Partial Warp Reduction
1591	///
1592	/// This algorithm is used within a warp when any discontiguous subset of
1593	/// lanes are active. It is used to implement the reduction operation
1594	/// across lanes in an OpenMP simd region or in a nested parallel region.
1595	///
1596	/// void
1597	/// dispersed_partial_reduce(void reduce_data,*
1598	/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1599	/// int size, remote_id;
1600	/// int logical_lane_id = number_of_active_lanes_before_me() 2;*
1601	/// do {
1602	/// remote_id = next_active_lane_id_right_after_me();
1603	/// # the above function returns 0 of no active lane
1604	/// # is present right after the current lane.
1605	/// size = number_of_active_lanes_in_this_warp();
1606	/// logical_lane_id /= 2;
1607	/// ShuffleReduceFn(reduce_data, logical_lane_id,
1608	/// remote_id-1-threadIdx.x, 2);
1609	/// } while (logical_lane_id % 2 == 0 && size > 1);
1610	/// }
1611	///
1612	/// There is no assumption made about the initial state of the reduction.
1613	/// Any number of lanes (>=1) could be active at any position. The reduction
1614	/// result is returned in the first active lane.
1615	///
1616	/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1617	///
1618	/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1619	/// if (lane_id % 2 == 0 && offset > 0)
1620	/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1621	/// else
1622	/// reduce_elem = remote_elem
1623	///
1624	///
1625	/// Intra-Team Reduction
1626	///
1627	/// This function, as implemented in the runtime call
1628	/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1629	/// threads in a team. It first reduces within a warp using the
1630	/// aforementioned algorithms. We then proceed to gather all such
1631	/// reduced values at the first warp.
1632	///
1633	/// The runtime makes use of the function 'InterWarpCpyFn', which copies
1634	/// data from each of the "warp master" (zeroth lane of each warp, where
1635	/// warp-reduced data is held) to the zeroth warp. This step reduces (in
1636	/// a mathematical sense) the problem of reduction across warp masters in
1637	/// a block to the problem of warp reduction.
1638	///
1639	///
1640	/// Inter-Team Reduction
1641	///
1642	/// Once a team has reduced its data to a single value, it is stored in
1643	/// a global scratchpad array. Since each team has a distinct slot, this
1644	/// can be done without locking.
1645	///
1646	/// The last team to write to the scratchpad array proceeds to reduce the
1647	/// scratchpad array. One or more workers in the last team use the helper
1648	/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
1649	/// the k'th worker reduces every k'th element.
1650	///
1651	/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1652	/// reduce across workers and compute a globally reduced value.
1653	///
1654	void CGOpenMPRuntimeGPU::emitReduction(
1655	CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
1656	ArrayRef<const Expr > LHSExprs, ArrayRef<const* Expr *> RHSExprs,
1657	ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
1658	if (!CGF.HaveInsertPoint())
1659	return;
1660
1661	bool ParallelReduction = isOpenMPParallelDirective(DKind: Options.ReductionKind);
1662	bool TeamsReduction = isOpenMPTeamsDirective(DKind: Options.ReductionKind);
1663
1664	ASTContext &C = CGM.getContext();
1665
1666	if (Options.SimpleReduction) {
1667	assert(!TeamsReduction && !ParallelReduction &&
1668	"Invalid reduction selection in emitReduction.");
1669	(void)ParallelReduction;
1670	CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
1671	ReductionOps, Options);
1672	return;
1673	}
1674
1675	llvm::SmallDenseMap<const ValueDecl , const* FieldDecl *> VarFieldMap;
1676	llvm::SmallVector<const ValueDecl *, `4`> PrivatesReductions(Privates.size());
1677	int Cnt = `0`;
1678	for (const Expr *DRE : Privates) {
1679	PrivatesReductions [Cnt] = cast<DeclRefExpr>(Val: DRE)->getDecl();
1680	++Cnt;
1681	}
1682	const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
1683	C&: CGM.getContext(), EscapedDecls: PrivatesReductions, EscapedDeclsForTeams: {}, MappedDeclsFields&: VarFieldMap, BufSize: `1`);
1684
1685	if (TeamsReduction)
1686	TeamsReductions.push_back(Elt: ReductionRec);
1687
1688	// Source location for the ident struct
1689	llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1690
1691	using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
1692	InsertPointTy AllocaIP(CGF.AllocaInsertPt ->getParent(),
1693	CGF.AllocaInsertPt ->getIterator());
1694	InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
1695	CGF.Builder.GetInsertPoint());
1696	llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
1697	CodeGenIP, CGF.SourceLocToDebugLoc(Location: Loc));
1698	llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
1699
1700	CodeGenFunction::OMPPrivateScope Scope(CGF);
1701	unsigned Idx = `0`;
1702	for (const Expr *Private : Privates) {
1703	llvm::Type *ElementType;
1704	llvm::Value *Variable;
1705	llvm::Value *PrivateVariable;
1706	llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
1707	ElementType = CGF.ConvertTypeForMem(T: Private->getType());
1708	const auto *RHSVar =
1709	cast<VarDecl>(Val: cast<DeclRefExpr>(Val: RHSExprs [Idx])->getDecl());
1710	PrivateVariable = CGF.GetAddrOfLocalVar(VD: RHSVar).emitRawPointer(CGF);
1711	const auto *LHSVar =
1712	cast<VarDecl>(Val: cast<DeclRefExpr>(Val: LHSExprs [Idx])->getDecl());
1713	Variable = CGF.GetAddrOfLocalVar(VD: LHSVar).emitRawPointer(CGF);
1714	llvm::OpenMPIRBuilder::EvalKind EvalKind;
1715	switch (CGF.getEvaluationKind(T: Private->getType())) {
1716	case TEK_Scalar:
1717	EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
1718	break;
1719	case TEK_Complex:
1720	EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
1721	break;
1722	case TEK_Aggregate:
1723	EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
1724	break;
1725	}
1726	auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
1727	llvm::Value LHSPtr, llvm::Value RHSPtr,
1728	llvm::Function *NewFunc) {
1729	CGF.Builder.restoreIP(IP: CodeGenIP);
1730	auto *CurFn = CGF.CurFn;
1731	CGF.CurFn = NewFunc;
1732
1733	*LHSPtr = CGF.GetAddrOfLocalVar(
1734	VD: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: LHSExprs [I])->getDecl()))
1735	.emitRawPointer(CGF);
1736	*RHSPtr = CGF.GetAddrOfLocalVar(
1737	VD: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: RHSExprs [I])->getDecl()))
1738	.emitRawPointer(CGF);
1739
1740	emitSingleReductionCombiner(CGF, ReductionOp: ReductionOps [I], PrivateRef: Privates [I],
1741	LHS: cast<DeclRefExpr>(Val: LHSExprs [I]),
1742	RHS: cast<DeclRefExpr>(Val: RHSExprs [I]));
1743
1744	CGF.CurFn = CurFn;
1745
1746	return InsertPointTy (CGF.Builder.GetInsertBlock(),
1747	CGF.Builder.GetInsertPoint());
1748	};
1749	ReductionInfos.emplace_back(Args: llvm::OpenMPIRBuilder::ReductionInfo (
1750	ElementType, Variable, PrivateVariable, EvalKind,
1751	/ReductionGen=/nullptr, ReductionGen, AtomicReductionGen));
1752	Idx++;
1753	}
1754
1755	llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
1756	cantFail(ValOrErr: OMPBuilder.createReductionsGPU(
1757	Loc: OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, IsNoWait: false, IsTeamsReduction: TeamsReduction,
1758	ReductionGenCBKind: llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
1759	GridValue: CGF.getTarget().getGridValue(),
1760	ReductionBufNum: C.getLangOpts().OpenMPCUDAReductionBufNum, SrcLocInfo: RTLoc));
1761	CGF.Builder.restoreIP(IP: AfterIP);
1762	}
1763
1764	const VarDecl *
1765	CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
1766	const VarDecl NativeParam) const* {
1767	if (!NativeParam->getType()->isReferenceType())
1768	return NativeParam;
1769	QualType ArgType = NativeParam->getType();
1770	QualifierCollector QC;
1771	const Type *NonQualTy = QC.strip(type: ArgType);
1772	QualType PointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
1773	if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
1774	if (Attr->getCaptureKind() == OMPC_map) {
1775	PointeeTy = CGM.getContext().getAddrSpaceQualType(T: PointeeTy,
1776	AddressSpace: LangAS::opencl_global);
1777	}
1778	}
1779	ArgType = CGM.getContext().getPointerType(T: PointeeTy);
1780	QC.addRestrict();
1781	enum { NVPTX_local_addr = `5` };
1782	QC.addAddressSpace(space: getLangASFromTargetAS(TargetAS: NVPTX_local_addr));
1783	ArgType = QC.apply(Context: CGM.getContext(), QT: ArgType);
1784	if (isa<ImplicitParamDecl>(Val: NativeParam))
1785	return ImplicitParamDecl::Create(
1786	C&: CGM.getContext(), /DC=/nullptr, IdLoc: NativeParam->getLocation(),
1787	Id: NativeParam->getIdentifier(), T: ArgType, ParamKind: ImplicitParamKind::Other);
1788	return ParmVarDecl::Create(
1789	C&: CGM.getContext(),
1790	DC: const_cast<DeclContext *>(NativeParam->getDeclContext()),
1791	StartLoc: NativeParam->getBeginLoc(), IdLoc: NativeParam->getLocation(),
1792	Id: NativeParam->getIdentifier(), T: ArgType,
1793	/TInfo=/nullptr, S: SC_None, /DefArg=/nullptr);
1794	}
1795
1796	Address
1797	CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
1798	const VarDecl *NativeParam,
1799	const VarDecl TargetParam) const* {
1800	assert(NativeParam != TargetParam &&
1801	NativeParam->getType()->isReferenceType() &&
1802	"Native arg must not be the same as target arg.");
1803	Address LocalAddr = CGF.GetAddrOfLocalVar(VD: TargetParam);
1804	QualType NativeParamType = NativeParam->getType();
1805	QualifierCollector QC;
1806	const Type *NonQualTy = QC.strip(type: NativeParamType);
1807	QualType NativePointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
1808	unsigned NativePointeeAddrSpace =
1809	CGF.getTypes().getTargetAddressSpace(T: NativePointeeTy);
1810	QualType TargetTy = TargetParam->getType();
1811	llvm::Value TargetAddr = CGF.EmitLoadOfScalar(Addr: LocalAddr, /Volatile=/*false,
1812	Ty: TargetTy, Loc: SourceLocation ());
1813	// Cast to native address space.
1814	TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1815	V: TargetAddr,
1816	DestTy: llvm::PointerType::get(C&: CGF.getLLVMContext(), AddressSpace: NativePointeeAddrSpace));
1817	Address NativeParamAddr = CGF.CreateMemTemp(T: NativeParamType);
1818	CGF.EmitStoreOfScalar(Value: TargetAddr, Addr: NativeParamAddr, /Volatile=/false,
1819	Ty: NativeParamType);
1820	return NativeParamAddr;
1821	}
1822
1823	void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
1824	CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
1825	ArrayRef<llvm::Value > Args) const* {
1826	SmallVector<llvm::Value *, `4`> TargetArgs;
1827	TargetArgs.reserve(N: Args.size());
1828	auto *FnType = OutlinedFn.getFunctionType();
1829	for (unsigned I = `0`, E = Args.size(); I < E; ++I) {
1830	if (FnType->isVarArg() && FnType->getNumParams() <= I) {
1831	TargetArgs.append(in_start: std::next(x: Args.begin(), n: I), in_end: Args.end());
1832	break;
1833	}
1834	llvm::Type *TargetType = FnType->getParamType(i: I);
1835	llvm::Value *NativeArg = Args [I];
1836	if (!TargetType->isPointerTy()) {
1837	TargetArgs.emplace_back(Args&: NativeArg);
1838	continue;
1839	}
1840	TargetArgs.emplace_back(
1841	Args: CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(V: NativeArg, DestTy: TargetType));
1842	}
1843	CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: TargetArgs);
1844	}
1845
1846	/// Emit function which wraps the outline parallel region
1847	/// and controls the arguments which are passed to this function.
1848	/// The wrapper ensures that the outlined function is called
1849	/// with the correct arguments when data is shared.
1850	llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
1851	llvm::Function OutlinedParallelFn, const* OMPExecutableDirective &D) {
1852	ASTContext &Ctx = CGM.getContext();
1853	const auto &CS = *D.getCapturedStmt(RegionKind: OMPD_parallel);
1854
1855	// Create a function that takes as argument the source thread.
1856	FunctionArgList WrapperArgs;
1857	QualType Int16QTy =
1858	Ctx.getIntTypeForBitwidth(/DestWidth=/`16`, /Signed=/false);
1859	QualType Int32QTy =
1860	Ctx.getIntTypeForBitwidth(/DestWidth=/`32`, /Signed=/false);
1861	ImplicitParamDecl ParallelLevelArg(Ctx, /DC=/nullptr, D.getBeginLoc(),
1862	/Id=/nullptr, Int16QTy,
1863	ImplicitParamKind::Other);
1864	ImplicitParamDecl WrapperArg(Ctx, /DC=/nullptr, D.getBeginLoc(),
1865	/Id=/nullptr, Int32QTy,
1866	ImplicitParamKind::Other);
1867	WrapperArgs.emplace_back(Args: &ParallelLevelArg);
1868	WrapperArgs.emplace_back(Args: &WrapperArg);
1869
1870	const CGFunctionInfo &CGFI =
1871	CGM.getTypes().arrangeBuiltinFunctionDeclaration(resultType: Ctx.VoidTy, args: WrapperArgs);
1872
1873	auto *Fn = llvm::Function::Create(
1874	Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
1875	N: Twine(OutlinedParallelFn->getName(), "_wrapper"), M: &CGM.getModule());
1876
1877	// Ensure we do not inline the function. This is trivially true for the ones
1878	// passed to __kmpc_fork_call but the ones calles in serialized regions
1879	// could be inlined. This is not a perfect but it is closer to the invariant
1880	// we want, namely, every data environment starts with a new function.
1881	// TODO: We should pass the if condition to the runtime function and do the
1882	// handling there. Much cleaner code.
1883	Fn->addFnAttr(Kind: llvm::Attribute::NoInline);
1884
1885	CGM.SetInternalFunctionAttributes(GD: GlobalDecl (), F: Fn, FI: CGFI);
1886	Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
1887	Fn->setDoesNotRecurse();
1888
1889	CodeGenFunction CGF(CGM, /suppressNewContext=/true);
1890	CGF.StartFunction(GD: GlobalDecl (), RetTy: Ctx.VoidTy, Fn, FnInfo: CGFI, Args: WrapperArgs,
1891	Loc: D.getBeginLoc(), StartLoc: D.getBeginLoc());
1892
1893	const auto *RD = CS.getCapturedRecordDecl();
1894	auto CurField = RD->field_begin();
1895
1896	Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
1897	/Name=/".zero.addr");
1898	CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/C/ `0`), Addr: ZeroAddr);
1899	// Get the array of arguments.
1900	SmallVector<llvm::Value *, `8`> Args;
1901
1902	Args.emplace_back(Args: CGF.GetAddrOfLocalVar(VD: &WrapperArg).emitRawPointer(CGF));
1903	Args.emplace_back(Args: ZeroAddr.emitRawPointer(CGF));
1904
1905	CGBuilderTy &Bld = CGF.Builder;
1906	auto CI = CS.capture_begin();
1907
1908	// Use global memory for data sharing.
1909	// Handle passing of global args to workers.
1910	RawAddress GlobalArgs =
1911	CGF.CreateDefaultAlignTempAlloca(Ty: CGF.VoidPtrPtrTy, Name: "global_args");
1912	llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
1913	llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
1914	CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1915	M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_shared_variables),
1916	args: DataSharingArgs);
1917
1918	// Retrieve the shared variables from the list of references returned
1919	// by the runtime. Pass the variables to the outlined function.
1920	Address SharedArgListAddress = Address::invalid();
1921	if (CS.capture_size() > `0` \|\|
1922	isOpenMPLoopBoundSharingDirective(Kind: D.getDirectiveKind())) {
1923	SharedArgListAddress = CGF.EmitLoadOfPointer(
1924	Ptr: GlobalArgs, PtrTy: CGF.getContext()
1925	.getPointerType(T: CGF.getContext().VoidPtrTy)
1926	.castAs<PointerType>());
1927	}
1928	unsigned Idx = `0`;
1929	if (isOpenMPLoopBoundSharingDirective(Kind: D.getDirectiveKind())) {
1930	Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
1931	Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1932	Addr: Src, Ty: Bld.getPtrTy(AddrSpace: `0`), ElementTy: CGF.SizeTy);
1933	llvm::Value *LB = CGF.EmitLoadOfScalar(
1934	Addr: TypedAddress,
1935	/Volatile=/false,
1936	Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
1937	Loc: cast<OMPLoopDirective>(Val: D).getLowerBoundVariable()->getExprLoc());
1938	Args.emplace_back(Args&: LB);
1939	++Idx;
1940	Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
1941	TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(Addr: Src, Ty: Bld.getPtrTy(AddrSpace: `0`),
1942	ElementTy: CGF.SizeTy);
1943	llvm::Value *UB = CGF.EmitLoadOfScalar(
1944	Addr: TypedAddress,
1945	/Volatile=/false,
1946	Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
1947	Loc: cast<OMPLoopDirective>(Val: D).getUpperBoundVariable()->getExprLoc());
1948	Args.emplace_back(Args&: UB);
1949	++Idx;
1950	}
1951	if (CS.capture_size() > `0`) {
1952	ASTContext &CGFContext = CGF.getContext();
1953	for (unsigned I = `0`, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
1954	QualType ElemTy = CurField ->getType();
1955	Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: I + Idx);
1956	Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1957	Addr: Src, Ty: CGF.ConvertTypeForMem(T: CGFContext.getPointerType(T: ElemTy)),
1958	ElementTy: CGF.ConvertTypeForMem(T: ElemTy));
1959	llvm::Value *Arg = CGF.EmitLoadOfScalar(Addr: TypedAddress,
1960	/Volatile=/false,
1961	Ty: CGFContext.getPointerType(T: ElemTy),
1962	Loc: CI->getLocation());
1963	if (CI->capturesVariableByCopy() &&
1964	!CI->getCapturedVar()->getType()->isAnyPointerType()) {
1965	Arg = castValueToType(CGF, Val: Arg, ValTy: ElemTy, CastTy: CGFContext.getUIntPtrType(),
1966	Loc: CI->getLocation());
1967	}
1968	Args.emplace_back(Args&: Arg);
1969	}
1970	}
1971
1972	emitOutlinedFunctionCall(CGF, Loc: D.getBeginLoc(), OutlinedFn: OutlinedParallelFn, Args);
1973	CGF.FinishFunction();
1974	return Fn;
1975	}
1976
1977	void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
1978	const Decl *D) {
1979	if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1980	return;
1981
1982	assert(D && "Expected function or captured\|block decl.");
1983	assert(FunctionGlobalizedDecls.count(CGF.CurFn) == `0` &&
1984	"Function is registered already.");
1985	assert((!TeamAndReductions.first \|\| TeamAndReductions.first == D) &&
1986	"Team is set but not processed.");
1987	const Stmt Body = nullptr*;
1988	bool NeedToDelayGlobalization = false;
1989	if (const auto *FD = dyn_cast<FunctionDecl>(Val: D)) {
1990	Body = FD->getBody();
1991	} else if (const auto *BD = dyn_cast<BlockDecl>(Val: D)) {
1992	Body = BD->getBody();
1993	} else if (const auto *CD = dyn_cast<CapturedDecl>(Val: D)) {
1994	Body = CD->getBody();
1995	NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
1996	if (NeedToDelayGlobalization &&
1997	getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1998	return;
1999	}
2000	if (!Body)
2001	return;
2002	CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
2003	VarChecker.Visit(S: Body);
2004	const RecordDecl *GlobalizedVarsRecord =
2005	VarChecker.getGlobalizedRecord(IsInTTDRegion);
2006	TeamAndReductions.first = nullptr;
2007	TeamAndReductions.second.clear();
2008	ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
2009	VarChecker.getEscapedVariableLengthDecls();
2010	ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
2011	VarChecker.getDelayedVariableLengthDecls();
2012	if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
2013	DelayedVariableLengthDecls.empty())
2014	return;
2015	auto I = FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
2016	I ->getSecond().MappedParams =
2017	std::make_unique<CodeGenFunction::OMPMapVars>();
2018	I ->getSecond().EscapedParameters.insert(
2019	I: VarChecker.getEscapedParameters().begin(),
2020	E: VarChecker.getEscapedParameters().end());
2021	I ->getSecond().EscapedVariableLengthDecls.append(
2022	in_start: EscapedVariableLengthDecls.begin(), in_end: EscapedVariableLengthDecls.end());
2023	I ->getSecond().DelayedVariableLengthDecls.append(
2024	in_start: DelayedVariableLengthDecls.begin(), in_end: DelayedVariableLengthDecls.end());
2025	DeclToAddrMapTy &Data = I ->getSecond().LocalVarData;
2026	for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
2027	assert(VD->isCanonicalDecl() && "Expected canonical declaration");
2028	Data.try_emplace(Key: VD);
2029	}
2030	if (!NeedToDelayGlobalization) {
2031	emitGenericVarsProlog(CGF, Loc: D->getBeginLoc());
2032	struct GlobalizationScope final : EHScopeStack::Cleanup {
2033	GlobalizationScope() = default;
2034
2035	void Emit(CodeGenFunction &CGF, Flags flags) override {
2036	static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
2037	.emitGenericVarsEpilog(CGF);
2038	}
2039	};
2040	CGF.EHStack.pushCleanup<GlobalizationScope>(Kind: NormalAndEHCleanup);
2041	}
2042	}
2043
2044	Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
2045	const VarDecl *VD) {
2046	if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
2047	const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2048	auto AS = LangAS::Default;
2049	switch (A->getAllocatorType()) {
2050	case OMPAllocateDeclAttr::OMPNullMemAlloc:
2051	case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2052	case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2053	case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2054	break;
2055	case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2056	return Address::invalid();
2057	case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2058	// TODO: implement aupport for user-defined allocators.
2059	return Address::invalid();
2060	case OMPAllocateDeclAttr::OMPConstMemAlloc:
2061	AS = LangAS::cuda_constant;
2062	break;
2063	case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2064	AS = LangAS::cuda_shared;
2065	break;
2066	case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2067	case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2068	break;
2069	}
2070	llvm::Type *VarTy = CGF.ConvertTypeForMem(T: VD->getType());
2071	auto GV = new* llvm::GlobalVariable (
2072	CGM.getModule(), VarTy, /isConstant=/false,
2073	llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(T: VarTy),
2074	VD->getName(),
2075	/InsertBefore=/nullptr, llvm::GlobalValue::NotThreadLocal,
2076	CGM.getContext().getTargetAddressSpace(AS));
2077	CharUnits Align = CGM.getContext().getDeclAlign(D: VD);
2078	GV->setAlignment(Align.getAsAlign());
2079	return Address (
2080	CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2081	V: GV, DestTy: CGF.Builder.getPtrTy(AddrSpace: CGM.getContext().getTargetAddressSpace(
2082	AS: VD->getType().getAddressSpace()))),
2083	VarTy, Align);
2084	}
2085
2086	if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
2087	return Address::invalid();
2088
2089	VD = VD->getCanonicalDecl();
2090	auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
2091	if (I == FunctionGlobalizedDecls.end())
2092	return Address::invalid();
2093	auto VDI = I ->getSecond().LocalVarData.find(Key: VD);
2094	if (VDI != I ->getSecond().LocalVarData.end())
2095	return VDI->second.PrivateAddr;
2096	if (VD->hasAttrs()) {
2097	for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
2098	E(VD->attr_end());
2099	IT != E; ++IT) {
2100	auto VDI = I ->getSecond().LocalVarData.find(
2101	Key: cast<VarDecl>(Val: cast<DeclRefExpr>(Val: IT ->getRef())->getDecl())
2102	->getCanonicalDecl());
2103	if (VDI != I ->getSecond().LocalVarData.end())
2104	return VDI->second.PrivateAddr;
2105	}
2106	}
2107
2108	return Address::invalid();
2109	}
2110
2111	void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
2112	FunctionGlobalizedDecls.erase(Val: CGF.CurFn);
2113	CGOpenMPRuntime::functionFinished(CGF);
2114	}
2115
2116	void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
2117	CodeGenFunction &CGF, const OMPLoopDirective &S,
2118	OpenMPDistScheduleClauseKind &ScheduleKind,
2119	llvm::Value &Chunk) const* {
2120	auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2121	if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
2122	ScheduleKind = OMPC_DIST_SCHEDULE_static;
2123	Chunk = CGF.EmitScalarConversion(
2124	Src: RT.getGPUNumThreads(CGF),
2125	SrcTy: CGF.getContext().getIntTypeForBitwidth(DestWidth: `32`, /Signed=/`0`),
2126	DstTy: S.getIterationVariable()->getType(), Loc: S.getBeginLoc());
2127	return;
2128	}
2129	CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
2130	CGF, S, ScheduleKind, Chunk);
2131	}
2132
2133	void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
2134	CodeGenFunction &CGF, const OMPLoopDirective &S,
2135	OpenMPScheduleClauseKind &ScheduleKind,
2136	const Expr &ChunkExpr) const* {
2137	ScheduleKind = OMPC_SCHEDULE_static;
2138	// Chunk size is 1 in this case.
2139	llvm::APInt ChunkSize(`32`, `1`);
2140	ChunkExpr = IntegerLiteral::Create(C: CGF.getContext(), V: ChunkSize,
2141	type: CGF.getContext().getIntTypeForBitwidth(DestWidth: `32`, /Signed=/`0`),
2142	l: SourceLocation ());
2143	}
2144
2145	void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
2146	CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
2147	assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
2148	" Expected target-based directive.");
2149	const CapturedStmt *CS = D.getCapturedStmt(RegionKind: OMPD_target);
2150	for (const CapturedStmt::Capture &C : CS->captures()) {
2151	// Capture variables captured by reference in lambdas for target-based
2152	// directives.
2153	if (!C.capturesVariable())
2154	continue;
2155	const VarDecl *VD = C.getCapturedVar();
2156	const auto *RD = VD->getType()
2157	.getCanonicalType()
2158	.getNonReferenceType()
2159	->getAsCXXRecordDecl();
2160	if (!RD \|\| !RD->isLambda())
2161	continue;
2162	Address VDAddr = CGF.GetAddrOfLocalVar(VD);
2163	LValue VDLVal;
2164	if (VD->getType().getCanonicalType()->isReferenceType())
2165	VDLVal = CGF.EmitLoadOfReferenceLValue(RefAddr: VDAddr, RefTy: VD->getType());
2166	else
2167	VDLVal = CGF.MakeAddrLValue(
2168	Addr: VDAddr, T: VD->getType().getCanonicalType().getNonReferenceType());
2169	llvm::DenseMap<const ValueDecl , FieldDecl > Captures;
2170	FieldDecl ThisCapture = nullptr*;
2171	RD->getCaptureFields(Captures, ThisCapture);
2172	if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
2173	LValue ThisLVal =
2174	CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: ThisCapture);
2175	llvm::Value *CXXThis = CGF.LoadCXXThis();
2176	CGF.EmitStoreOfScalar(value: CXXThis, lvalue: ThisLVal);
2177	}
2178	for (const LambdaCapture &LC : RD->captures()) {
2179	if (LC.getCaptureKind() != LCK_ByRef)
2180	continue;
2181	const ValueDecl *VD = LC.getCapturedVar();
2182	// FIXME: For now VD is always a VarDecl because OpenMP does not support
2183	// capturing structured bindings in lambdas yet.
2184	if (!CS->capturesVariable(Var: cast<VarDecl>(Val: VD)))
2185	continue;
2186	auto It = Captures.find(Val: VD);
2187	assert(It != Captures.end() && "Found lambda capture without field.");
2188	LValue VarLVal = CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: It ->second);
2189	Address VDAddr = CGF.GetAddrOfLocalVar(VD: cast<VarDecl>(Val: VD));
2190	if (VD->getType().getCanonicalType()->isReferenceType())
2191	VDAddr = CGF.EmitLoadOfReferenceLValue(RefAddr: VDAddr,
2192	RefTy: VD->getType().getCanonicalType())
2193	.getAddress();
2194	CGF.EmitStoreOfScalar(value: VDAddr.emitRawPointer(CGF), lvalue: VarLVal);
2195	}
2196	}
2197	}
2198
2199	bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
2200	LangAS &AS) {
2201	if (!VD \|\| !VD->hasAttr<OMPAllocateDeclAttr>())
2202	return false;
2203	const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2204	switch(A->getAllocatorType()) {
2205	case OMPAllocateDeclAttr::OMPNullMemAlloc:
2206	case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2207	// Not supported, fallback to the default mem space.
2208	case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2209	case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2210	case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2211	case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2212	case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2213	AS = LangAS::Default;
2214	return true;
2215	case OMPAllocateDeclAttr::OMPConstMemAlloc:
2216	AS = LangAS::cuda_constant;
2217	return true;
2218	case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2219	AS = LangAS::cuda_shared;
2220	return true;
2221	case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2222	llvm_unreachable("Expected predefined allocator for the variables with the "
2223	"static storage.");
2224	}
2225	return false;
2226	}
2227
2228	// Get current OffloadArch and ignore any unknown values
2229	static OffloadArch getOffloadArch(CodeGenModule &CGM) {
2230	if (!CGM.getTarget().hasFeature(Feature: "ptx"))
2231	return OffloadArch::UNKNOWN;
2232	for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
2233	if (Feature.getValue()) {
2234	OffloadArch Arch = StringToOffloadArch(S: Feature.getKey());
2235	if (Arch != OffloadArch::UNKNOWN)
2236	return Arch;
2237	}
2238	}
2239	return OffloadArch::UNKNOWN;
2240	}
2241
2242	/// Check to see if target architecture supports unified addressing which is
2243	/// a restriction for OpenMP requires clause "unified_shared_memory".
2244	void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
2245	for (const OMPClause *Clause : D->clauselists()) {
2246	if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
2247	OffloadArch Arch = getOffloadArch(CGM);
2248	switch (Arch) {
2249	case OffloadArch::SM_20:
2250	case OffloadArch::SM_21:
2251	case OffloadArch::SM_30:
2252	case OffloadArch::SM_32_:
2253	case OffloadArch::SM_35:
2254	case OffloadArch::SM_37:
2255	case OffloadArch::SM_50:
2256	case OffloadArch::SM_52:
2257	case OffloadArch::SM_53: {
2258	SmallString<`256`> Buffer;
2259	llvm::raw_svector_ostream Out(Buffer);
2260	Out << "Target architecture " << OffloadArchToString(A: Arch)
2261	<< " does not support unified addressing";
2262	CGM.Error(loc: Clause->getBeginLoc(), error: Out.str());
2263	return;
2264	}
2265	case OffloadArch::SM_60:
2266	case OffloadArch::SM_61:
2267	case OffloadArch::SM_62:
2268	case OffloadArch::SM_70:
2269	case OffloadArch::SM_72:
2270	case OffloadArch::SM_75:
2271	case OffloadArch::SM_80:
2272	case OffloadArch::SM_86:
2273	case OffloadArch::SM_87:
2274	case OffloadArch::SM_89:
2275	case OffloadArch::SM_90:
2276	case OffloadArch::SM_90a:
2277	case OffloadArch::SM_100:
2278	case OffloadArch::SM_100a:
2279	case OffloadArch::SM_101:
2280	case OffloadArch::SM_101a:
2281	case OffloadArch::SM_120:
2282	case OffloadArch::SM_120a:
2283	case OffloadArch::GFX600:
2284	case OffloadArch::GFX601:
2285	case OffloadArch::GFX602:
2286	case OffloadArch::GFX700:
2287	case OffloadArch::GFX701:
2288	case OffloadArch::GFX702:
2289	case OffloadArch::GFX703:
2290	case OffloadArch::GFX704:
2291	case OffloadArch::GFX705:
2292	case OffloadArch::GFX801:
2293	case OffloadArch::GFX802:
2294	case OffloadArch::GFX803:
2295	case OffloadArch::GFX805:
2296	case OffloadArch::GFX810:
2297	case OffloadArch::GFX9_GENERIC:
2298	case OffloadArch::GFX900:
2299	case OffloadArch::GFX902:
2300	case OffloadArch::GFX904:
2301	case OffloadArch::GFX906:
2302	case OffloadArch::GFX908:
2303	case OffloadArch::GFX909:
2304	case OffloadArch::GFX90a:
2305	case OffloadArch::GFX90c:
2306	case OffloadArch::GFX9_4_GENERIC:
2307	case OffloadArch::GFX942:
2308	case OffloadArch::GFX950:
2309	case OffloadArch::GFX10_1_GENERIC:
2310	case OffloadArch::GFX1010:
2311	case OffloadArch::GFX1011:
2312	case OffloadArch::GFX1012:
2313	case OffloadArch::GFX1013:
2314	case OffloadArch::GFX10_3_GENERIC:
2315	case OffloadArch::GFX1030:
2316	case OffloadArch::GFX1031:
2317	case OffloadArch::GFX1032:
2318	case OffloadArch::GFX1033:
2319	case OffloadArch::GFX1034:
2320	case OffloadArch::GFX1035:
2321	case OffloadArch::GFX1036:
2322	case OffloadArch::GFX11_GENERIC:
2323	case OffloadArch::GFX1100:
2324	case OffloadArch::GFX1101:
2325	case OffloadArch::GFX1102:
2326	case OffloadArch::GFX1103:
2327	case OffloadArch::GFX1150:
2328	case OffloadArch::GFX1151:
2329	case OffloadArch::GFX1152:
2330	case OffloadArch::GFX1153:
2331	case OffloadArch::GFX12_GENERIC:
2332	case OffloadArch::GFX1200:
2333	case OffloadArch::GFX1201:
2334	case OffloadArch::GFX1250:
2335	case OffloadArch::AMDGCNSPIRV:
2336	case OffloadArch::Generic:
2337	case OffloadArch::GRANITERAPIDS:
2338	case OffloadArch::BMG_G21:
2339	case OffloadArch::UNUSED:
2340	case OffloadArch::UNKNOWN:
2341	break;
2342	case OffloadArch::LAST:
2343	llvm_unreachable("Unexpected GPU arch.");
2344	}
2345	}
2346	}
2347	CGOpenMPRuntime::processRequiresDirective(D);
2348	}
2349
2350	llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
2351	CGBuilderTy &Bld = CGF.Builder;
2352	llvm::Module *M = &CGF.CGM.getModule();
2353	const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
2354	llvm::Function *F = M->getFunction(Name: LocSize);
2355	if (!F) {
2356	F = llvm::Function::Create(Ty: llvm::FunctionType::get(Result: CGF.Int32Ty, Params: {}, isVarArg: false),
2357	Linkage: llvm::GlobalVariable::ExternalLinkage, N: LocSize,
2358	M: &CGF.CGM.getModule());
2359	}
2360	return Bld.CreateCall(Callee: F, Args: {}, Name: "nvptx_num_threads");
2361	}
2362
2363	llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
2364	ArrayRef<llvm::Value *> Args{};
2365	return CGF.EmitRuntimeCall(
2366	callee: OMPBuilder.getOrCreateRuntimeFunction(
2367	M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_hardware_thread_id_in_block),
2368	args: Args);
2369	}
2370

Browse the source code of llvm_projects/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp