CGCUDANV.cpp source code [llvm_projects/clang/lib/CodeGen/CGCUDANV.cpp]

1	//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This provides a class for CUDA code generation targeting the NVIDIA CUDA
10	// runtime library.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "CGCUDARuntime.h"
15	#include "CGCXXABI.h"
16	#include "CodeGenFunction.h"
17	#include "CodeGenModule.h"
18	#include "clang/AST/CharUnits.h"
19	#include "clang/AST/Decl.h"
20	#include "clang/Basic/Cuda.h"
21	#include "clang/CodeGen/CodeGenABITypes.h"
22	#include "clang/CodeGen/ConstantInitBuilder.h"
23	#include "llvm/ADT/StringRef.h"
24	#include "llvm/Frontend/Offloading/Utility.h"
25	#include "llvm/IR/BasicBlock.h"
26	#include "llvm/IR/Constants.h"
27	#include "llvm/IR/DerivedTypes.h"
28	#include "llvm/IR/GlobalValue.h"
29	#include "llvm/IR/ReplaceConstant.h"
30	#include "llvm/ProfileData/InstrProf.h"
31	#include "llvm/Support/Format.h"
32	#include "llvm/Support/MD5.h"
33	#include "llvm/Support/VirtualFileSystem.h"
34	#include "llvm/Transforms/Utils/ModuleUtils.h"
35
36	using namespace clang;
37	using namespace CodeGen;
38
39	namespace {
/// Magic number stored in the fat binary wrapper for CUDA.
constexpr unsigned CudaFatMagic = 0x466243b1;
/// Magic number stored in the fat binary wrapper for HIP; the bytes spell
/// "HIPF" when read from the most significant byte down.
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
42
43	class CGNVCUDARuntime : public CGCUDARuntime {
44
45	/// The prefix used for function calls and section names (CUDA, HIP, LLVM)
46	StringRef Prefix;
47
48	private:
49	llvm::IntegerType IntTy, SizeTy;
50	llvm::Type *VoidTy;
51	llvm::PointerType *PtrTy;
52
53	/// Convenience reference to LLVM Context
54	llvm::LLVMContext &Context;
55	/// Convenience reference to the current module
56	llvm::Module &TheModule;
57	/// Keeps track of kernel launch stubs and handles emitted in this module
58	struct KernelInfo {
59	llvm::Function Kernel; // stub function to help launch kernel*
60	const Decl *D;
61	};
62	llvm::SmallVector<KernelInfo, `16`> EmittedKernels;
63	// Map a kernel mangled name to a symbol for identifying kernel in host code
64	// For CUDA, the symbol for identifying the kernel is the same as the device
65	// stub function. For HIP, they are different.
66	llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
67	// Map a kernel handle to the kernel stub.
68	llvm::DenseMap<llvm::GlobalValue , llvm::Function > KernelStubs;
69	struct VarInfo {
70	llvm::GlobalVariable *Var;
71	const VarDecl *D;
72	DeviceVarFlags Flags;
73	};
74	llvm::SmallVector<VarInfo, `16`> DeviceVars;
75	/// Keeps track of variable containing handle of GPU binary. Populated by
76	/// ModuleCtorFunction() and used to create corresponding cleanup calls in
77	/// ModuleDtorFunction()
78	llvm::GlobalVariable GpuBinaryHandle = nullptr*;
79	/// Host-side shadow for the per-TU __llvm_profile_sections_<CUID> global,
80	/// emitted only for HIP host compiles when PGO is on. Registered via
81	/// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime
82	/// can locate the device-side table by name.
83	llvm::GlobalVariable OffloadProfShadow = nullptr*;
84	struct OffloadProfSectionShadowInfo {
85	llvm::GlobalVariable *Shadow;
86	std::string DeviceName;
87	};
88	llvm::SmallVector<OffloadProfSectionShadowInfo, `16`> OffloadProfSectionShadows;
89	/// Whether we generate relocatable device code.
90	bool RelocatableDeviceCode;
91	/// Mangle context for device.
92	std::unique_ptr<MangleContext> DeviceMC;
93
94	llvm::FunctionCallee getSetupArgumentFn() const;
95	llvm::FunctionCallee getLaunchFn() const;
96
97	llvm::FunctionType getRegisterGlobalsFnTy() const*;
98	llvm::FunctionType getCallbackFnTy() const*;
99	llvm::FunctionType getRegisterLinkedBinaryFnTy() const*;
100	std::string addPrefixToName(StringRef FuncName) const;
101	std::string addUnderscoredPrefixToName(StringRef FuncName) const;
102
103	/// Creates a function to register all kernel stubs generated in this module.
104	llvm::Function *makeRegisterGlobalsFn();
105
106	/// Helper function that generates a constant string and returns a pointer to
107	/// the start of the string. The result of this function can be used anywhere
108	/// where the C code specifies const char.*
109	llvm::Constant makeConstantString(const* std::string &Str,
110	const std::string &Name = "") {
111	return CGM.GetAddrOfConstantCString(Str, GlobalName: Name).getPointer();
112	}
113
114	/// Helper function which generates an initialized constant array from Str,
115	/// and optionally sets section name and alignment. AddNull specifies whether
116	/// the array should nave NUL termination.
117	llvm::Constant *makeConstantArray(StringRef Str,
118	StringRef Name = "",
119	StringRef SectionName = "",
120	unsigned Alignment = `0`,
121	bool AddNull = false) {
122	llvm::Constant *Value =
123	llvm::ConstantDataArray::getString(Context, Initializer: Str, AddNull);
124	auto GV = new* llvm::GlobalVariable(
125	TheModule, Value->getType(), /isConstant=/true,
126	llvm::GlobalValue::PrivateLinkage, Value, Name);
127	if (!SectionName.empty()) {
128	GV->setSection(SectionName);
129	// Mark the address as used which make sure that this section isn't
130	// merged and we will really have it in the object file.
131	GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
132	}
133	if (Alignment)
134	GV->setAlignment(llvm::Align(Alignment));
135	return GV;
136	}
137
138	/// Helper function that generates an empty dummy function returning void.
139	llvm::Function makeDummyFunction(llvm::FunctionType FnTy) {
140	assert(FnTy->getReturnType()->isVoidTy() &&
141	"Can only generate dummy functions returning void!");
142	llvm::Function *DummyFunc = llvm::Function::Create(
143	Ty: FnTy, Linkage: llvm::GlobalValue::InternalLinkage, N: "dummy", M: &TheModule);
144
145	llvm::BasicBlock *DummyBlock =
146	llvm::BasicBlock::Create(Context, Name: "", Parent: DummyFunc);
147	CGBuilderTy FuncBuilder(CGM, Context);
148	FuncBuilder.SetInsertPoint(DummyBlock);
149	FuncBuilder.CreateRetVoid();
150
151	return DummyFunc;
152	}
153
154	Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args);
155	Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
156	FunctionArgList &Args);
157	void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
158	void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
159	std::string getDeviceSideName(const NamedDecl *ND) override;
160
161	void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
162	bool Extern, bool Constant) {
163	DeviceVars.push_back(Elt: {.Var: &Var,
164	.D: VD,
165	.Flags: {DeviceVarFlags::Variable, Extern, Constant,
166	VD->hasAttr<HIPManagedAttr>(),
167	/Normalized/ false, `0`}});
168	}
169	void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
170	bool Extern, int Type) {
171	DeviceVars.push_back(Elt: {.Var: &Var,
172	.D: VD,
173	.Flags: {DeviceVarFlags::Surface, Extern, /Constant/ false,
174	/Managed/ false,
175	/Normalized/ false, Type}});
176	}
177	void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
178	bool Extern, int Type, bool Normalized) {
179	DeviceVars.push_back(Elt: {.Var: &Var,
180	.D: VD,
181	.Flags: {DeviceVarFlags::Texture, Extern, /Constant/ false,
182	/Managed/ false, Normalized, Type}});
183	}
184
185	/// Creates module constructor function
186	llvm::Function *makeModuleCtorFunction();
187	/// Creates module destructor function
188	llvm::Function *makeModuleDtorFunction();
189	/// Transform managed variables for device compilation.
190	void transformManagedVars();
191	/// Create offloading entries to register globals in RDC mode.
192	void createOffloadingEntries();
193	/// For HIP+PGO, emit the per-TU __llvm_profile_sections_<CUID> global.
194	/// On the device side, InstrProfiling emits the populated section-bounds
195	/// table only when the TU has real profile data. On the host side it is a
196	/// placeholder void shadow stored in*
197	/// OffloadProfShadow, registered later by makeRegisterGlobalsFn (non-RDC)
198	/// or createOffloadingEntries (RDC) so the runtime can locate the
199	/// device-side table by name.
200	void emitOffloadProfilingSections();
201
202	public:
203	CGNVCUDARuntime(CodeGenModule &CGM);
204
205	llvm::GlobalValue getKernelHandle(llvm::Function F, GlobalDecl GD) override;
206	llvm::Function getKernelStub(llvm::GlobalValue Handle) override {
207	auto Loc = KernelStubs.find(Val: Handle);
208	assert(Loc != KernelStubs.end());
209	return Loc ->second;
210	}
211	void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
212	void handleVarRegistration(const VarDecl *VD,
213	llvm::GlobalVariable &Var) override;
214	void
215	internalizeDeviceSideVar(const VarDecl *D,
216	llvm::GlobalValue::LinkageTypes &Linkage) override;
217
218	llvm::Function *finalizeModule() override;
219	};
220
221	} // end anonymous namespace
222
223	std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
224	return (Prefix + FuncName).str();
225	}
226	std::string
227	CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
228	return ("__" + Prefix + FuncName).str();
229	}
230
231	static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
232	// If the host and device have different C++ ABIs, mark it as the device
233	// mangle context so that the mangling needs to retrieve the additional
234	// device lambda mangling number instead of the regular host one.
235	if (CGM.getContext().getAuxTargetInfo() &&
236	CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
237	CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
238	return std::unique_ptr<MangleContext>(
239	CGM.getContext().createDeviceMangleContext(
240	T: *CGM.getContext().getAuxTargetInfo()));
241	}
242
243	return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
244	T: CGM.getContext().getAuxTargetInfo()));
245	}
246
247	CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
248	: CGCUDARuntime (CGM), Context(CGM.getLLVMContext()),
249	TheModule(CGM.getModule()),
250	RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
251	DeviceMC(InitDeviceMC(CGM)) {
252	IntTy = CGM.IntTy;
253	SizeTy = CGM.SizeTy;
254	VoidTy = CGM.VoidTy;
255	PtrTy = CGM.DefaultPtrTy;
256
257	if (CGM.getLangOpts().OffloadViaLLVM)
258	Prefix = "llvm";
259	else if (CGM.getLangOpts().HIP)
260	Prefix = "hip";
261	else
262	Prefix = "cuda";
263	}
264
265	llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
266	// cudaError_t cudaSetupArgument(void , size_t, size_t)*
267	llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
268	return CGM.CreateRuntimeFunction(
269	Ty: llvm::FunctionType::get(Result: IntTy, Params, isVarArg: false),
270	Name: addPrefixToName(FuncName: "SetupArgument"));
271	}
272
273	llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
274	if (CGM.getLangOpts().HIP) {
275	// hipError_t hipLaunchByPtr(char );*
276	return CGM.CreateRuntimeFunction(
277	Ty: llvm::FunctionType::get(Result: IntTy, Params: PtrTy, isVarArg: false), Name: "hipLaunchByPtr");
278	}
279	// cudaError_t cudaLaunch(char );*
280	return CGM.CreateRuntimeFunction(Ty: llvm::FunctionType::get(Result: IntTy, Params: PtrTy, isVarArg: false),
281	Name: "cudaLaunch");
282	}
283
284	llvm::FunctionType CGNVCUDARuntime::getRegisterGlobalsFnTy() const* {
285	return llvm::FunctionType::get(Result: VoidTy, Params: PtrTy, isVarArg: false);
286	}
287
288	llvm::FunctionType CGNVCUDARuntime::getCallbackFnTy() const* {
289	return llvm::FunctionType::get(Result: VoidTy, Params: PtrTy, isVarArg: false);
290	}
291
292	llvm::FunctionType CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const* {
293	llvm::Type *Params[] = {llvm::PointerType::getUnqual(C&: Context), PtrTy, PtrTy,
294	llvm::PointerType::getUnqual(C&: Context)};
295	return llvm::FunctionType::get(Result: VoidTy, Params, isVarArg: false);
296	}
297
298	std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
299	GlobalDecl GD;
300	// D could be either a kernel or a variable.
301	if (auto *FD = dyn_cast<FunctionDecl>(Val: ND))
302	GD = GlobalDecl (FD, KernelReferenceKind::Kernel);
303	else
304	GD = GlobalDecl (ND);
305	std::string DeviceSideName;
306	MangleContext *MC;
307	if (CGM.getLangOpts().CUDAIsDevice)
308	MC = &CGM.getCXXABI().getMangleContext();
309	else
310	MC = DeviceMC.get();
311	if (MC->shouldMangleDeclName(D: ND)) {
312	SmallString<`256`> Buffer;
313	llvm::raw_svector_ostream Out(Buffer);
314	MC->mangleName(GD, Out);
315	DeviceSideName = std::string (Out.str());
316	} else
317	DeviceSideName = std::string (ND->getIdentifier()->getName());
318
319	// Make unique name for device side static file-scope variable for HIP.
320	if (CGM.getContext().shouldExternalize(D: ND) &&
321	CGM.getLangOpts().GPURelocatableDeviceCode) {
322	SmallString<`256`> Buffer;
323	llvm::raw_svector_ostream Out(Buffer);
324	Out << DeviceSideName;
325	CGM.printPostfixForExternalizedDecl(OS&: Out, D: ND);
326	DeviceSideName = std::string (Out.str());
327	}
328	return DeviceSideName;
329	}
330
331	void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
332	FunctionArgList &Args) {
333	EmittedKernels.push_back(Elt: {.Kernel: CGF.CurFn, .D: CGF.CurFuncDecl});
334	if (auto *GV =
335	dyn_cast<llvm::GlobalVariable>(Val: KernelHandles [CGF.CurFn->getName()])) {
336	GV->setLinkage(CGF.CurFn->getLinkage());
337	GV->setInitializer(CGF.CurFn);
338	}
339	if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
340	CudaFeature::CUDA_USES_NEW_LAUNCH) \|\|
341	(CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) \|\|
342	(CGF.getLangOpts().OffloadViaLLVM))
343	emitDeviceStubBodyNew(CGF, Args);
344	else
345	emitDeviceStubBodyLegacy(CGF, Args);
346	}
347
348	/// Build the input as a sized array of pointers so that it can be launched by
349	/// the offloading runtime.
350	Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
351	FunctionArgList &Args) {
352	SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
353	for (auto &Arg : Args)
354	ArgTypes.push_back(Elt: CGF.ConvertTypeForMem(T: Arg->getType()));
355	llvm::StructType *KernelArgsTy = llvm::StructType::create(Elements: ArgTypes);
356	llvm::Type *KernelArgsPtrsTy = llvm::ArrayType::get(ElementType: PtrTy, NumElements: Args.size());
357
358	auto *Int32Ty = CGF.Builder.getInt32Ty();
359	KernelLaunchParamsTypes.push_back(Elt: Int32Ty);
360	KernelLaunchParamsTypes.push_back(Elt: PtrTy);
361
362	llvm::StructType *KernelLaunchParamsTy =
363	llvm::StructType::create(Elements: KernelLaunchParamsTypes);
364	Address KernelArgs = CGF.CreateTempAllocaWithoutCast(
365	Ty: KernelArgsTy, align: CharUnits::fromQuantity(Quantity: `16`), Name: "kernel_args");
366	Address KernelArgsPtrs = CGF.CreateTempAllocaWithoutCast(
367	Ty: KernelArgsPtrsTy, align: CharUnits::fromQuantity(Quantity: `16`), Name: "kernel_args_ptrs");
368	Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast(
369	Ty: KernelLaunchParamsTy, align: CharUnits::fromQuantity(Quantity: `16`),
370	Name: "kernel_launch_params");
371
372	CGF.Builder.CreateStore(Val: llvm::ConstantInt::get(Ty: Int32Ty, V: Args.size()),
373	Addr: CGF.Builder.CreateStructGEP(Addr: KernelLaunchParams, Index: `0`));
374	CGF.Builder.CreateStore(Val: KernelArgsPtrs.emitRawPointer(CGF),
375	Addr: CGF.Builder.CreateStructGEP(Addr: KernelLaunchParams, Index: `1`));
376
377	for (unsigned i = `0`; i < Args.size(); ++i) {
378	auto *ArgVal = CGF.Builder.CreateLoad(Addr: CGF.GetAddrOfLocalVar(VD: Args [i]));
379	Address ArgAddr = CGF.Builder.CreateStructGEP(Addr: KernelArgs, Index: i);
380	CGF.Builder.CreateStore(Val: ArgVal, Addr: ArgAddr);
381	CGF.Builder.CreateStore(Val: ArgAddr.emitRawPointer(CGF),
382	Addr: CGF.Builder.CreateConstArrayGEP(Addr: KernelArgsPtrs, Index: i));
383	}
384
385	return KernelLaunchParams;
386	}
387
388	Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF,
389	FunctionArgList &Args) {
390	// Calculate amount of space we will need for all arguments. If we have no
391	// args, allocate a single pointer so we still have a valid pointer to the
392	// argument array that we can pass to runtime, even if it will be unused.
393	Address KernelArgs = CGF.CreateTempAlloca(
394	Ty: PtrTy, UseAddrSpace: LangAS::Default, align: CharUnits::fromQuantity(Quantity: `16`), Name: "kernel_args",
395	ArraySize: llvm::ConstantInt::get(Ty: SizeTy, V: std::max<size_t>(a: `1`, b: Args.size())));
396	// Store pointers to the arguments in a locally allocated launch_args.
397	for (unsigned i = `0`; i < Args.size(); ++i) {
398	llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(VD: Args [i]).emitRawPointer(CGF);
399	llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(V: VarPtr, DestTy: PtrTy);
400	CGF.Builder.CreateDefaultAlignedStore(
401	Val: VoidVarPtr, Addr: CGF.Builder.CreateConstGEP1_32(
402	Ty: PtrTy, Ptr: KernelArgs.emitRawPointer(CGF), Idx0: i));
403	}
404	return KernelArgs;
405	}
406
407	// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
408	// array and kernels are launched using cudaLaunchKernel().
409	void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
410	FunctionArgList &Args) {
411	// Build the shadow stack entry at the very start of the function.
412	Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM
413	? prepareKernelArgsLLVMOffload(CGF, Args)
414	: prepareKernelArgs(CGF, Args);
415
416	llvm::BasicBlock *EndBlock = CGF.createBasicBlock(name: "setup.end");
417
418	// Lookup cudaLaunchKernel/hipLaunchKernel function.
419	// HIP kernel launching API name depends on -fgpu-default-stream option. For
420	// the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
421	// it is hipLaunchKernel_spt.
422	// cudaError_t cudaLaunchKernel(const void func, dim3 gridDim, dim3 blockDim,*
423	// void args, size_t sharedMem,
424	// cudaStream_t stream);
425	// hipError_t hipLaunchKernel[_spt](const void func, dim3 gridDim,*
426	// dim3 blockDim, void args,
427	// size_t sharedMem, hipStream_t stream);
428	TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
429	DeclContext *DC = TranslationUnitDecl::castToDeclContext(D: TUDecl);
430	std::string KernelLaunchAPI = "LaunchKernel";
431	if (CGF.getLangOpts().GPUDefaultStream ==
432	LangOptions::GPUDefaultStreamKind::PerThread) {
433	if (CGF.getLangOpts().HIP)
434	KernelLaunchAPI = KernelLaunchAPI + "_spt";
435	else if (CGF.getLangOpts().CUDA)
436	KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
437	}
438	auto LaunchKernelName = addPrefixToName(FuncName: KernelLaunchAPI);
439	const IdentifierInfo &cudaLaunchKernelII =
440	CGM.getContext().Idents.get(Name: LaunchKernelName);
441	FunctionDecl cudaLaunchKernelFD = nullptr*;
442	for (auto *Result : DC->lookup(Name: &cudaLaunchKernelII)) {
443	if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Val: Result))
444	cudaLaunchKernelFD = FD;
445	}
446
447	if (cudaLaunchKernelFD == nullptr) {
448	CGM.Error(loc: CGF.CurFuncDecl->getLocation(),
449	error: "Can't find declaration for " + LaunchKernelName);
450	return;
451	}
452	// Create temporary dim3 grid_dim, block_dim.
453	ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(i: `1`);
454	QualType Dim3Ty = GridDimParam->getType();
455	Address GridDim = CGF.CreateMemTempWithoutCast(
456	T: Dim3Ty, Align: CharUnits::fromQuantity(Quantity: `8`), Name: "grid_dim");
457	Address BlockDim = CGF.CreateMemTempWithoutCast(
458	T: Dim3Ty, Align: CharUnits::fromQuantity(Quantity: `8`), Name: "block_dim");
459	Address ShmemSize = CGF.CreateTempAlloca(Ty: SizeTy, UseAddrSpace: LangAS::Default,
460	align: CGM.getSizeAlign(), Name: "shmem_size");
461	Address Stream = CGF.CreateTempAlloca(Ty: PtrTy, UseAddrSpace: LangAS::Default,
462	align: CGM.getPointerAlign(), Name: "stream");
463	llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
464	Ty: llvm::FunctionType::get(Result: IntTy,
465	Params: {/gridDim=/GridDim.getType(),
466	/blockDim=/BlockDim.getType(),
467	/ShmemSize=/ShmemSize.getType(),
468	/Stream=/Stream.getType()},
469	/isVarArg=/false),
470	Name: addUnderscoredPrefixToName(FuncName: "PopCallConfiguration"));
471
472	CGF.EmitRuntimeCallOrInvoke(callee: cudaPopConfigFn, args: {GridDim.emitRawPointer(CGF),
473	BlockDim.emitRawPointer(CGF),
474	ShmemSize.emitRawPointer(CGF),
475	Stream.emitRawPointer(CGF)});
476
477	// Emit the call to cudaLaunch
478	llvm::Value *Kernel =
479	CGF.Builder.CreatePointerCast(V: KernelHandles [CGF.CurFn->getName()], DestTy: PtrTy);
480	CallArgList LaunchKernelArgs;
481	LaunchKernelArgs.add(rvalue: RValue::get(V: Kernel),
482	type: cudaLaunchKernelFD->getParamDecl(i: `0`)->getType());
483	LaunchKernelArgs.add(rvalue: RValue::getAggregate(addr: GridDim), type: Dim3Ty);
484	LaunchKernelArgs.add(rvalue: RValue::getAggregate(addr: BlockDim), type: Dim3Ty);
485	LaunchKernelArgs.add(rvalue: RValue::get(Addr: KernelArgs, CGF),
486	type: cudaLaunchKernelFD->getParamDecl(i: `3`)->getType());
487	LaunchKernelArgs.add(rvalue: RValue::get(V: CGF.Builder.CreateLoad(Addr: ShmemSize)),
488	type: cudaLaunchKernelFD->getParamDecl(i: `4`)->getType());
489	LaunchKernelArgs.add(rvalue: RValue::get(V: CGF.Builder.CreateLoad(Addr: Stream)),
490	type: cudaLaunchKernelFD->getParamDecl(i: `5`)->getType());
491
492	QualType QT = cudaLaunchKernelFD->getType();
493	QualType CQT = QT.getCanonicalType();
494	llvm::Type *Ty = CGM.getTypes().ConvertType(T: CQT);
495	llvm::FunctionType *FTy = cast<llvm::FunctionType>(Val: Ty);
496
497	const CGFunctionInfo &FI =
498	CGM.getTypes().arrangeFunctionDeclaration(GD: cudaLaunchKernelFD);
499	llvm::FunctionCallee cudaLaunchKernelFn =
500	CGM.CreateRuntimeFunction(Ty: FTy, Name: LaunchKernelName);
501	CGF.EmitCall(CallInfo: FI, Callee: CGCallee::forDirect(functionPtr: cudaLaunchKernelFn), ReturnValue: ReturnValueSlot (),
502	Args: LaunchKernelArgs);
503
504	// To prevent CUDA device stub functions from being merged by ICF in MSVC
505	// environment, create an unique global variable for each kernel and write to
506	// the variable in the device stub.
507	if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
508	!CGF.getLangOpts().HIP) {
509	llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Val: Kernel);
510	std::string GlobalVarName = (KernelFunction->getName() + ".id").str();
511
512	llvm::GlobalVariable *HandleVar =
513	CGM.getModule().getNamedGlobal(Name: GlobalVarName);
514	if (!HandleVar) {
515	HandleVar = new llvm::GlobalVariable(
516	CGM.getModule(), CGM.Int8Ty,
517	/Constant=/false, KernelFunction->getLinkage(),
518	llvm::ConstantInt::get(Ty: CGM.Int8Ty, V: `0`), GlobalVarName);
519	HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
520	HandleVar->setVisibility(KernelFunction->getVisibility());
521	if (KernelFunction->hasComdat())
522	HandleVar->setComdat(CGM.getModule().getOrInsertComdat(Name: GlobalVarName));
523	}
524
525	CGF.Builder.CreateAlignedStore(Val: llvm::ConstantInt::get(Ty: CGM.Int8Ty, V: `1`),
526	Addr: HandleVar, Align: CharUnits::One(),
527	/IsVolatile=/true);
528	}
529
530	CGF.EmitBranch(Block: EndBlock);
531
532	CGF.EmitBlock(BB: EndBlock);
533	}
534
535	void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
536	FunctionArgList &Args) {
537	// Emit a call to cudaSetupArgument for each arg in Args.
538	llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
539	llvm::BasicBlock *EndBlock = CGF.createBasicBlock(name: "setup.end");
540	CharUnits Offset = CharUnits::Zero();
541	for (const VarDecl *A : Args) {
542	auto TInfo = CGM.getContext().getTypeInfoInChars(T: A->getType());
543	Offset = Offset.alignTo(Align: TInfo.Align);
544	llvm::Value *Args[] = {
545	CGF.Builder.CreatePointerCast(
546	V: CGF.GetAddrOfLocalVar(VD: A).emitRawPointer(CGF), DestTy: PtrTy),
547	llvm::ConstantInt::get(Ty: SizeTy, V: TInfo.Width.getQuantity()),
548	llvm::ConstantInt::get(Ty: SizeTy, V: Offset.getQuantity()),
549	};
550	llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(callee: cudaSetupArgFn, args: Args);
551	llvm::Constant *Zero = llvm::ConstantInt::get(Ty: IntTy, V: `0`);
552	llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(LHS: CB, RHS: Zero);
553	llvm::BasicBlock *NextBlock = CGF.createBasicBlock(name: "setup.next");
554	CGF.Builder.CreateCondBr(Cond: CBZero, True: NextBlock, False: EndBlock);
555	CGF.EmitBlock(BB: NextBlock);
556	Offset += TInfo.Width;
557	}
558
559	// Emit the call to cudaLaunch
560	llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
561	llvm::Value *Arg =
562	CGF.Builder.CreatePointerCast(V: KernelHandles [CGF.CurFn->getName()], DestTy: PtrTy);
563	CGF.EmitRuntimeCallOrInvoke(callee: cudaLaunchFn, args: Arg);
564	CGF.EmitBranch(Block: EndBlock);
565
566	CGF.EmitBlock(BB: EndBlock);
567	}
568
569	// Replace the original variable Var with the address loaded from variable
570	// ManagedVar populated by HIP runtime.
571	static void replaceManagedVar(llvm::GlobalVariable *Var,
572	llvm::GlobalVariable *ManagedVar) {
573	SmallVector<SmallVector<llvm::User *, `8`>, `8`> WorkList;
574	for (auto &&VarUse : Var->uses()) {
575	WorkList.push_back(Elt: {VarUse.getUser()});
576	}
577	while (!WorkList.empty()) {
578	auto &&WorkItem = WorkList.pop_back_val();
579	auto *U = WorkItem.back();
580	if (isa<llvm::ConstantExpr>(Val: U)) {
581	for (auto &&UU : U->uses()) {
582	WorkItem.push_back(Elt: UU.getUser());
583	WorkList.push_back(Elt: WorkItem);
584	WorkItem.pop_back();
585	}
586	continue;
587	}
588	if (auto *I = dyn_cast<llvm::Instruction>(Val: U)) {
589	llvm::Value *OldV = Var;
590	llvm::Instruction NewV = new* llvm::LoadInst(
591	Var->getType(), ManagedVar, "ld.managed", false,
592	llvm::Align(Var->getAlignment()), I->getIterator());
593	WorkItem.pop_back();
594	// Replace constant expressions directly or indirectly using the managed
595	// variable with instructions.
596	for (auto &&Op : WorkItem) {
597	auto *CE = cast<llvm::ConstantExpr>(Val: Op);
598	auto *NewInst = CE->getAsInstruction();
599	NewInst->insertBefore(BB&: *I->getParent(), InsertPos: I->getIterator());
600	NewInst->replaceUsesOfWith(From: OldV, To: NewV);
601	OldV = CE;
602	NewV = NewInst;
603	}
604	I->replaceUsesOfWith(From: OldV, To: NewV);
605	} else {
606	llvm_unreachable("Invalid use of managed variable");
607	}
608	}
609	}
610
611	/// Creates a function that sets up state on the host side for CUDA objects that
612	/// have a presence on both the host and device sides. Specifically, registers
613	/// the host side of kernel functions and device global variables with the CUDA
614	/// runtime.
615	/// \code
616	/// void __cuda_register_globals(void* GpuBinaryHandle) {
617	/// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
618	/// ...
619	/// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
620	/// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
621	/// ...
622	/// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
623	/// }
624	/// \endcode
625	llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
626	// No need to register anything
627	if (EmittedKernels.empty() && DeviceVars.empty())
628	return nullptr;
629
630	llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
631	Ty: getRegisterGlobalsFnTy(), Linkage: llvm::GlobalValue::InternalLinkage,
632	N: addUnderscoredPrefixToName(FuncName: "_register_globals"), M: &TheModule);
633	llvm::BasicBlock *EntryBB =
634	llvm::BasicBlock::Create(Context, Name: "entry", Parent: RegisterKernelsFunc);
635	CGBuilderTy Builder(CGM, Context);
636	Builder.SetInsertPoint(EntryBB);
637
638	// void __cudaRegisterFunction(void , const char , char , const char ,*
639	// int, uint3, uint3, dim3, dim3, int)*
640	llvm::Type *RegisterFuncParams[] = {
641	PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
642	PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(C&: Context)};
643	llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
644	Ty: llvm::FunctionType::get(Result: IntTy, Params: RegisterFuncParams, isVarArg: false),
645	Name: addUnderscoredPrefixToName(FuncName: "RegisterFunction"));
646
647	// Extract GpuBinaryHandle passed as the first argument passed to
648	// __cuda_register_globals() and generate __cudaRegisterFunction() call for
649	// each emitted kernel.
650	llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
651	for (auto &&I : EmittedKernels) {
652	llvm::Constant *KernelName =
653	makeConstantString(Str: getDeviceSideName(ND: cast<NamedDecl>(Val: I.D)));
654	llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(T: PtrTy);
655	llvm::Value *Args[] = {
656	&GpuBinaryHandlePtr,
657	KernelHandles [I.Kernel->getName()],
658	KernelName,
659	KernelName,
660	llvm::ConstantInt::getAllOnesValue(Ty: IntTy),
661	NullPtr,
662	NullPtr,
663	NullPtr,
664	NullPtr,
665	llvm::ConstantPointerNull::get(T: llvm::PointerType::getUnqual(C&: Context))};
666	Builder.CreateCall(Callee: RegisterFunc, Args);
667	}
668
669	llvm::Type *VarSizeTy = IntTy;
670	// For HIP or CUDA 9.0+, device variable size is type of `size_t`.
671	if (CGM.getLangOpts().HIP \|\|
672	ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
673	VarSizeTy = SizeTy;
674
675	// void __cudaRegisterVar(void , char , char , const char ,*
676	// int, int, int, int)
677	llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
678	IntTy, VarSizeTy, IntTy, IntTy};
679	llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
680	Ty: llvm::FunctionType::get(Result: VoidTy, Params: RegisterVarParams, isVarArg: false),
681	Name: addUnderscoredPrefixToName(FuncName: "RegisterVar"));
682	// void __hipRegisterManagedVar(void , char , char , const char ,*
683	// size_t, unsigned)
684	llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
685	PtrTy, VarSizeTy, IntTy};
686	llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
687	Ty: llvm::FunctionType::get(Result: VoidTy, Params: RegisterManagedVarParams, isVarArg: false),
688	Name: addUnderscoredPrefixToName(FuncName: "RegisterManagedVar"));
689	// void __cudaRegisterSurface(void , const struct surfaceReference ,*
690	// const void , const char , int, int);*
691	llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
692	Ty: llvm::FunctionType::get(
693	Result: VoidTy, Params: {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, isVarArg: false),
694	Name: addUnderscoredPrefixToName(FuncName: "RegisterSurface"));
695	// void __cudaRegisterTexture(void , const struct textureReference ,*
696	// const void , const char , int, int, int)*
697	llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
698	Ty: llvm::FunctionType::get(
699	Result: VoidTy, Params: {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, isVarArg: false),
700	Name: addUnderscoredPrefixToName(FuncName: "RegisterTexture"));
701	for (auto &&Info : DeviceVars) {
702	llvm::GlobalVariable *Var = Info.Var;
703	assert((!Var->isDeclaration() \|\| Info.Flags.isManaged()) &&
704	"External variables should not show up here, except HIP managed "
705	"variables");
706	llvm::Constant *VarName = makeConstantString(Str: getDeviceSideName(ND: Info.D));
707	switch (Info.Flags.getKind()) {
708	case DeviceVarFlags::Variable: {
709	uint64_t VarSize =
710	CGM.getDataLayout().getTypeAllocSize(Ty: Var->getValueType());
711	if (Info.Flags.isManaged()) {
712	assert(Var->getName().ends_with(".managed") &&
713	"HIP managed variables not transformed");
714	auto *ManagedVar = CGM.getModule().getNamedGlobal(
715	Name: Var->getName().drop_back(N: StringRef(".managed").size()));
716	llvm::Value *Args[] = {
717	&GpuBinaryHandlePtr,
718	ManagedVar,
719	Var,
720	VarName,
721	llvm::ConstantInt::get(Ty: VarSizeTy, V: VarSize),
722	llvm::ConstantInt::get(Ty: IntTy, V: Var->getAlignment())};
723	if (!Var->isDeclaration())
724	Builder.CreateCall(Callee: RegisterManagedVar, Args);
725	} else {
726	llvm::Value *Args[] = {
727	&GpuBinaryHandlePtr,
728	Var,
729	VarName,
730	VarName,
731	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isExtern()),
732	llvm::ConstantInt::get(Ty: VarSizeTy, V: VarSize),
733	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isConstant()),
734	llvm::ConstantInt::get(Ty: IntTy, V: `0`)};
735	Builder.CreateCall(Callee: RegisterVar, Args);
736	}
737	break;
738	}
739	case DeviceVarFlags::Surface:
740	Builder.CreateCall(
741	Callee: RegisterSurf,
742	Args: {&GpuBinaryHandlePtr, Var, VarName, VarName,
743	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.getSurfTexType()),
744	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isExtern())});
745	break;
746	case DeviceVarFlags::Texture:
747	Builder.CreateCall(
748	Callee: RegisterTex,
749	Args: {&GpuBinaryHandlePtr, Var, VarName, VarName,
750	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.getSurfTexType()),
751	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isNormalized()),
752	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isExtern())});
753	break;
754	}
755	}
756
757	// Register the per-TU offload-profiling shadow so the host runtime can
758	// locate the matching device-side __llvm_profile_sections_<CUID>. We
759	// emit both __hipRegisterVar (so the HIP runtime can map the host
760	// shadow to the device symbol) and
761	// __llvm_profile_offload_register_shadow_variable (so the profile
762	// runtime adds the shadow to its drain list).
763	if (OffloadProfShadow) {
764	llvm::Constant *Name =
765	makeConstantString(Str: std::string (OffloadProfShadow->getName()));
766	llvm::Constant *IntZero = llvm::ConstantInt::get(Ty: IntTy, V: `0`);
767	llvm::Value *RegisterVarArgs[] = {
768	&GpuBinaryHandlePtr,
769	OffloadProfShadow,
770	Name,
771	Name,
772	IntZero,
773	llvm::ConstantInt::get(Ty: VarSizeTy,
774	V: CGM.getDataLayout().getPointerSize(/AS=/`0`)),
775	IntZero,
776	IntZero};
777	Builder.CreateCall(Callee: RegisterVar, Args: RegisterVarArgs);
778
779	llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
780	Ty: llvm::FunctionType::get(Result: VoidTy, Params: {PtrTy}, isVarArg: false),
781	Name: "__llvm_profile_offload_register_shadow_variable");
782	Builder.CreateCall(Callee: RegisterShadow, Args: {OffloadProfShadow});
783	}
784
785	if (!OffloadProfSectionShadows.empty()) {
786	llvm::FunctionCallee RegisterSectionShadow = CGM.CreateRuntimeFunction(
787	Ty: llvm::FunctionType::get(Result: VoidTy, Params: {PtrTy}, isVarArg: false),
788	Name: "__llvm_profile_offload_register_section_shadow_variable");
789	llvm::Constant *IntZero = llvm::ConstantInt::get(Ty: IntTy, V: `0`);
790	for (const auto &Info : OffloadProfSectionShadows) {
791	llvm::Constant *Name = makeConstantString(Str: Info.DeviceName);
792	llvm::Value *RegisterVarArgs[] = {
793	&GpuBinaryHandlePtr,
794	Info.Shadow,
795	Name,
796	Name,
797	IntZero,
798	llvm::ConstantInt::get(Ty: VarSizeTy,
799	V: CGM.getDataLayout().getPointerSize(/AS=/`0`)),
800	IntZero,
801	IntZero};
802	Builder.CreateCall(Callee: RegisterVar, Args: RegisterVarArgs);
803	Builder.CreateCall(Callee: RegisterSectionShadow, Args: {Info.Shadow});
804	}
805	}
806
807	Builder.CreateRetVoid();
808	return RegisterKernelsFunc;
809	}
810
811	/// Creates a global constructor function for the module:
812	///
813	/// For CUDA:
814	/// \code
815	/// void __cuda_module_ctor() {
816	/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
817	/// __cuda_register_globals(Handle);
818	/// }
819	/// \endcode
820	///
821	/// For HIP:
822	/// \code
823	/// void __hip_module_ctor() {
824	/// if (__hip_gpubin_handle == 0) {
825	/// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
826	/// __hip_register_globals(__hip_gpubin_handle);
827	/// }
828	/// }
829	/// \endcode
830	llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
831	bool IsHIP = CGM.getLangOpts().HIP;
832	bool IsCUDA = CGM.getLangOpts().CUDA;
833	// No need to generate ctors/dtors if there is no GPU binary.
834	StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
835	if (CudaGpuBinaryFileName.empty() && !IsHIP)
836	return nullptr;
837	if ((IsHIP \|\| (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
838	DeviceVars.empty())
839	return nullptr;
840
841	// void __{cuda\|hip}_register_globals(void handle);*
842	llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
843	// We always need a function to pass in as callback. Create a dummy
844	// implementation if we don't need to register anything.
845	if (RelocatableDeviceCode && !RegisterGlobalsFunc)
846	RegisterGlobalsFunc = makeDummyFunction(FnTy: getRegisterGlobalsFnTy());
847
848	// void * __{cuda\|hip}RegisterFatBinary(void );
849	llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
850	Ty: llvm::FunctionType::get(Result: PtrTy, Params: PtrTy, isVarArg: false),
851	Name: addUnderscoredPrefixToName(FuncName: "RegisterFatBinary"));
852	// struct { int magic, int version, void gpu_binary, void * dont_care };*
853	llvm::StructType *FatbinWrapperTy =
854	llvm::StructType::get(elt1: IntTy, elts: IntTy, elts: PtrTy, elts: PtrTy);
855
856	// Register GPU binary with the CUDA runtime, store returned handle in a
857	// global variable and save a reference in GpuBinaryHandle to be cleaned up
858	// in destructor on exit. Then associate all known kernels with the GPU binary
859	// handle so CUDA runtime can figure out what to call on the GPU side.
860	std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
861	if (!CudaGpuBinaryFileName.empty()) {
862	auto VFS = CGM.getFileSystem();
863	auto CudaGpuBinaryOrErr =
864	VFS ->getBufferForFile(Name: CudaGpuBinaryFileName, FileSize: -`1`, RequiresNullTerminator: false);
865	if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
866	CGM.getDiags().Report(DiagID: diag::err_cannot_open_file)
867	<< CudaGpuBinaryFileName << EC.message();
868	return nullptr;
869	}
870	CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
871	}
872
873	llvm::Function *ModuleCtorFunc = llvm::Function::Create(
874	Ty: llvm::FunctionType::get(Result: VoidTy, isVarArg: false),
875	Linkage: llvm::GlobalValue::InternalLinkage,
876	N: addUnderscoredPrefixToName(FuncName: "_module_ctor"), M: &TheModule);
877	llvm::BasicBlock *CtorEntryBB =
878	llvm::BasicBlock::Create(Context, Name: "entry", Parent: ModuleCtorFunc);
879	CGBuilderTy CtorBuilder(CGM, Context);
880
881	CtorBuilder.SetInsertPoint(CtorEntryBB);
882
883	const char *FatbinConstantName;
884	const char *FatbinSectionName;
885	const char *ModuleIDSectionName;
886	StringRef ModuleIDPrefix;
887	llvm::Constant *FatBinStr;
888	unsigned FatMagic;
889	if (IsHIP) {
890	// On macOS (Mach-O), section names must be in "segment,section" format.
891	FatbinConstantName =
892	CGM.getTriple().isMacOSX() ? "__HIP,__hip_fatbin" : ".hip_fatbin";
893	FatbinSectionName =
894	CGM.getTriple().isMacOSX() ? "__HIP,__fatbin" : ".hipFatBinSegment";
895
896	ModuleIDSectionName =
897	CGM.getTriple().isMacOSX() ? "__HIP,__module_id" : "__hip_module_id";
898	ModuleIDPrefix = "__hip_";
899
900	if (CudaGpuBinary) {
901	// If fatbin is available from early finalization, create a string
902	// literal containing the fat binary loaded from the given file.
903	const unsigned HIPCodeObjectAlign = `4096`;
904	FatBinStr = makeConstantArray(Str: std::string (CudaGpuBinary ->getBuffer()), Name: "",
905	SectionName: FatbinConstantName, Alignment: HIPCodeObjectAlign);
906	} else {
907	// If fatbin is not available, create an external symbol
908	// __hip_fatbin in section .hip_fatbin. The external symbol is supposed
909	// to contain the fat binary but will be populated somewhere else,
910	// e.g. by lld through link script.
911	FatBinStr = new llvm::GlobalVariable(
912	CGM.getModule(), CGM.Int8Ty,
913	/isConstant=/true, llvm::GlobalValue::ExternalLinkage, nullptr,
914	"__hip_fatbin" + (CGM.getLangOpts().CUID.empty()
915	? ""
916	: "_" + CGM.getContext().getCUIDHash()),
917	nullptr, llvm::GlobalVariable::NotThreadLocal);
918	cast<llvm::GlobalVariable>(Val: FatBinStr)->setSection(FatbinConstantName);
919	}
920
921	FatMagic = HIPFatMagic;
922	} else {
923	if (RelocatableDeviceCode)
924	FatbinConstantName = CGM.getTriple().isMacOSX()
925	? "__NV_CUDA,__nv_relfatbin"
926	: "__nv_relfatbin";
927	else
928	FatbinConstantName =
929	CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
930	// NVIDIA's cuobjdump looks for fatbins in this section.
931	FatbinSectionName =
932	CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
933
934	ModuleIDSectionName = CGM.getTriple().isMacOSX()
935	? "__NV_CUDA,__nv_module_id"
936	: "__nv_module_id";
937	ModuleIDPrefix = "__nv_";
938
939	// For CUDA, create a string literal containing the fat binary loaded from
940	// the given file.
941	FatBinStr = makeConstantArray(Str: std::string (CudaGpuBinary ->getBuffer()), Name: "",
942	SectionName: FatbinConstantName, Alignment: `8`);
943	FatMagic = CudaFatMagic;
944	}
945
946	// Create initialized wrapper structure that points to the loaded GPU binary
947	ConstantInitBuilder Builder(CGM);
948	auto Values = Builder.beginStruct(structTy: FatbinWrapperTy);
949	// Fatbin wrapper magic.
950	Values.addInt(intTy: IntTy, value: FatMagic);
951	// Fatbin version.
952	Values.addInt(intTy: IntTy, value: `1`);
953	// Data.
954	Values.add(value: FatBinStr);
955	// Unused in fatbin v1.
956	Values.add(value: llvm::ConstantPointerNull::get(T: PtrTy));
957	llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
958	args: addUnderscoredPrefixToName(FuncName: "_fatbin_wrapper"), args: CGM.getPointerAlign(),
959	/constant/ args: true);
960	FatbinWrapper->setSection(FatbinSectionName);
961	CGM.getSanitizerMetadata()->disableSanitizerForGlobal(GV: FatbinWrapper);
962
963	// There is only one HIP fat binary per linked module, however there are
964	// multiple constructor functions. Make sure the fat binary is registered
965	// only once. The constructor functions are executed by the dynamic loader
966	// before the program gains control. The dynamic loader cannot execute the
967	// constructor functions concurrently since doing that would not guarantee
968	// thread safety of the loaded program. Therefore we can assume sequential
969	// execution of constructor functions here.
970	if (IsHIP) {
971	auto Linkage = RelocatableDeviceCode ? llvm::GlobalValue::ExternalLinkage
972	: llvm::GlobalValue::InternalLinkage;
973	llvm::BasicBlock *IfBlock =
974	llvm::BasicBlock::Create(Context, Name: "if", Parent: ModuleCtorFunc);
975	llvm::BasicBlock *ExitBlock =
976	llvm::BasicBlock::Create(Context, Name: "exit", Parent: ModuleCtorFunc);
977	// The name, size, and initialization pattern of this variable is part
978	// of HIP ABI.
979	GpuBinaryHandle = new llvm::GlobalVariable(
980	TheModule, PtrTy, /isConstant=/false, Linkage,
981	/Initializer=/
982	!RelocatableDeviceCode ? llvm::ConstantPointerNull::get(T: PtrTy)
983	: nullptr,
984	"__hip_gpubin_handle" + (CGM.getLangOpts().CUID.empty()
985	? ""
986	: "_" + CGM.getContext().getCUIDHash()));
987	GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
988	// Prevent the weak symbol in different shared libraries being merged.
989	if (Linkage != llvm::GlobalValue::InternalLinkage)
990	GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
991	Address GpuBinaryAddr(
992	GpuBinaryHandle, PtrTy,
993	CharUnits::fromQuantity(Quantity: GpuBinaryHandle->getAlignment()));
994	{
995	auto *HandleValue = CtorBuilder.CreateLoad(Addr: GpuBinaryAddr);
996	llvm::Constant *Zero =
997	llvm::Constant::getNullValue(Ty: HandleValue->getType());
998	llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(LHS: HandleValue, RHS: Zero);
999	CtorBuilder.CreateCondBr(Cond: EQZero, True: IfBlock, False: ExitBlock);
1000	}
1001	{
1002	CtorBuilder.SetInsertPoint(IfBlock);
1003	// GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
1004	llvm::CallInst *RegisterFatbinCall =
1005	CtorBuilder.CreateCall(Callee: RegisterFatbinFunc, Args: FatbinWrapper);
1006	CtorBuilder.CreateStore(Val: RegisterFatbinCall, Addr: GpuBinaryAddr);
1007	CtorBuilder.CreateBr(Dest: ExitBlock);
1008	}
1009	{
1010	CtorBuilder.SetInsertPoint(ExitBlock);
1011	// Call __hip_register_globals(GpuBinaryHandle);
1012	if (RegisterGlobalsFunc) {
1013	auto *HandleValue = CtorBuilder.CreateLoad(Addr: GpuBinaryAddr);
1014	CtorBuilder.CreateCall(Callee: RegisterGlobalsFunc, Args: HandleValue);
1015	}
1016	}
1017	} else if (!RelocatableDeviceCode) {
1018	// Register binary with CUDA runtime. This is substantially different in
1019	// default mode vs. separate compilation!
1020	// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
1021	llvm::CallInst *RegisterFatbinCall =
1022	CtorBuilder.CreateCall(Callee: RegisterFatbinFunc, Args: FatbinWrapper);
1023	GpuBinaryHandle = new llvm::GlobalVariable(
1024	TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
1025	llvm::ConstantPointerNull::get(T: PtrTy), "__cuda_gpubin_handle");
1026	GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
1027	CtorBuilder.CreateAlignedStore(Val: RegisterFatbinCall, Addr: GpuBinaryHandle,
1028	Align: CGM.getPointerAlign());
1029
1030	// Call __cuda_register_globals(GpuBinaryHandle);
1031	if (RegisterGlobalsFunc)
1032	CtorBuilder.CreateCall(Callee: RegisterGlobalsFunc, Args: RegisterFatbinCall);
1033
1034	// Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
1035	if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
1036	CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
1037	// void __cudaRegisterFatBinaryEnd(void );
1038	llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
1039	Ty: llvm::FunctionType::get(Result: VoidTy, Params: PtrTy, isVarArg: false),
1040	Name: "__cudaRegisterFatBinaryEnd");
1041	CtorBuilder.CreateCall(Callee: RegisterFatbinEndFunc, Args: RegisterFatbinCall);
1042	}
1043	} else {
1044	// Generate a unique module ID.
1045	// Note that this is unique in a build (with some collision probability
1046	// inherent to MD5 hashing) as long as each compilation sees modules with
1047	// different `SourceFileName`s. Builds using absolute paths or paths
1048	// relative to the same base path should be OK. This is similar to the
1049	// guarantees for ThinLTO and GlobalValue's GUID.
1050	// If desired, a stronger uniqueness guarantee could be computed (with a
1051	// small refactoring) with `llvm::getUniqueModuleId`, which hashes the
1052	// module content (and, therefore, a compile-time tradeoff).
1053	SmallString<`64`> ModuleID;
1054	llvm::raw_svector_ostream OS(ModuleID);
1055	OS << ModuleIDPrefix
1056	<< llvm::format(Fmt: "%" PRIx64,
1057	Vals: llvm::MD5Hash(Str: TheModule.getSourceFileName()));
1058	llvm::Constant *ModuleIDConstant = makeConstantArray(
1059	Str: std::string(ModuleID), Name: "", SectionName: ModuleIDSectionName, Alignment: `32`, /AddNull=/true);
1060
1061	// Create an alias for the FatbinWrapper that nvcc will look for.
1062	llvm::GlobalAlias::create(Linkage: llvm::GlobalValue::ExternalLinkage,
1063	Name: Twine("__fatbinwrap") + ModuleID, Aliasee: FatbinWrapper);
1064
1065	// void __cudaRegisterLinkedBinary%ModuleID%(void ()(void ), void ,*
1066	// void , void ()(void ))
1067	SmallString<`128`> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
1068	RegisterLinkedBinaryName += ModuleID;
1069	llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
1070	Ty: getRegisterLinkedBinaryFnTy(), Name: RegisterLinkedBinaryName);
1071
1072	assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
1073	llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
1074	makeDummyFunction(FnTy: getCallbackFnTy())};
1075	CtorBuilder.CreateCall(Callee: RegisterLinkedBinaryFunc, Args);
1076	}
1077
1078	// Create destructor and register it with atexit() the way NVCC does it. Doing
1079	// it during regular destructor phase worked in CUDA before 9.2 but results in
1080	// double-free in 9.2.
1081	if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
1082	// extern "C" int atexit(void (f)(void));*
1083	llvm::FunctionType *AtExitTy =
1084	llvm::FunctionType::get(Result: IntTy, Params: CleanupFn->getType(), isVarArg: false);
1085	llvm::FunctionCallee AtExitFunc =
1086	CGM.CreateRuntimeFunction(Ty: AtExitTy, Name: "atexit", ExtraAttrs: llvm::AttributeList(),
1087	/Local=/true);
1088	CtorBuilder.CreateCall(Callee: AtExitFunc, Args: CleanupFn);
1089	}
1090
1091	CtorBuilder.CreateRetVoid();
1092	return ModuleCtorFunc;
1093	}
1094
1095	/// Creates a global destructor function that unregisters the GPU code blob
1096	/// registered by constructor.
1097	///
1098	/// For CUDA:
1099	/// \code
1100	/// void __cuda_module_dtor() {
1101	/// __cudaUnregisterFatBinary(Handle);
1102	/// }
1103	/// \endcode
1104	///
1105	/// For HIP:
1106	/// \code
1107	/// void __hip_module_dtor() {
1108	/// if (__hip_gpubin_handle) {
1109	/// __hipUnregisterFatBinary(__hip_gpubin_handle);
1110	/// __hip_gpubin_handle = 0;
1111	/// }
1112	/// }
1113	/// \endcode
1114	llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
1115	// No need for destructor if we don't have a handle to unregister.
1116	if (!GpuBinaryHandle)
1117	return nullptr;
1118
1119	// void __cudaUnregisterFatBinary(void * handle);*
1120	llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
1121	Ty: llvm::FunctionType::get(Result: VoidTy, Params: PtrTy, isVarArg: false),
1122	Name: addUnderscoredPrefixToName(FuncName: "UnregisterFatBinary"));
1123
1124	llvm::Function *ModuleDtorFunc = llvm::Function::Create(
1125	Ty: llvm::FunctionType::get(Result: VoidTy, isVarArg: false),
1126	Linkage: llvm::GlobalValue::InternalLinkage,
1127	N: addUnderscoredPrefixToName(FuncName: "_module_dtor"), M: &TheModule);
1128
1129	llvm::BasicBlock *DtorEntryBB =
1130	llvm::BasicBlock::Create(Context, Name: "entry", Parent: ModuleDtorFunc);
1131	CGBuilderTy DtorBuilder(CGM, Context);
1132	DtorBuilder.SetInsertPoint(DtorEntryBB);
1133
1134	Address GpuBinaryAddr(
1135	GpuBinaryHandle, GpuBinaryHandle->getValueType(),
1136	CharUnits::fromQuantity(Quantity: GpuBinaryHandle->getAlignment()));
1137	auto *HandleValue = DtorBuilder.CreateLoad(Addr: GpuBinaryAddr);
1138	// There is only one HIP fat binary per linked module, however there are
1139	// multiple destructor functions. Make sure the fat binary is unregistered
1140	// only once.
1141	if (CGM.getLangOpts().HIP) {
1142	llvm::BasicBlock *IfBlock =
1143	llvm::BasicBlock::Create(Context, Name: "if", Parent: ModuleDtorFunc);
1144	llvm::BasicBlock *ExitBlock =
1145	llvm::BasicBlock::Create(Context, Name: "exit", Parent: ModuleDtorFunc);
1146	llvm::Constant *Zero = llvm::Constant::getNullValue(Ty: HandleValue->getType());
1147	llvm::Value *NEZero = DtorBuilder.CreateICmpNE(LHS: HandleValue, RHS: Zero);
1148	DtorBuilder.CreateCondBr(Cond: NEZero, True: IfBlock, False: ExitBlock);
1149
1150	DtorBuilder.SetInsertPoint(IfBlock);
1151	DtorBuilder.CreateCall(Callee: UnregisterFatbinFunc, Args: HandleValue);
1152	DtorBuilder.CreateStore(Val: Zero, Addr: GpuBinaryAddr);
1153	DtorBuilder.CreateBr(Dest: ExitBlock);
1154
1155	DtorBuilder.SetInsertPoint(ExitBlock);
1156	} else {
1157	DtorBuilder.CreateCall(Callee: UnregisterFatbinFunc, Args: HandleValue);
1158	}
1159	DtorBuilder.CreateRetVoid();
1160	return ModuleDtorFunc;
1161	}
1162
1163	CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
1164	return new CGNVCUDARuntime (CGM);
1165	}
1166
1167	void CGNVCUDARuntime::internalizeDeviceSideVar(
1168	const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
1169	// For -fno-gpu-rdc, host-side shadows of external declarations of device-side
1170	// global variables become internal definitions. These have to be internal in
1171	// order to prevent name conflicts with global host variables with the same
1172	// name in a different TUs.
1173	//
1174	// For -fgpu-rdc, the shadow variables should not be internalized because
1175	// they may be accessed by different TU.
1176	if (CGM.getLangOpts().GPURelocatableDeviceCode)
1177	return;
1178
1179	// __shared__ variables are odd. Shadows do get created, but
1180	// they are not registered with the CUDA runtime, so they
1181	// can't really be used to access their device-side
1182	// counterparts. It's not clear yet whether it's nvcc's bug or
1183	// a feature, but we've got to do the same for compatibility.
1184	if (D->hasAttr<CUDADeviceAttr>() \|\| D->hasAttr<CUDAConstantAttr>() \|\|
1185	D->hasAttr<CUDASharedAttr>() \|\|
1186	D->getType()->isCUDADeviceBuiltinSurfaceType() \|\|
1187	D->getType()->isCUDADeviceBuiltinTextureType()) {
1188	Linkage = llvm::GlobalValue::InternalLinkage;
1189	}
1190	}
1191
1192	void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
1193	llvm::GlobalVariable &GV) {
1194	if (D->hasAttr<CUDADeviceAttr>() \|\| D->hasAttr<CUDAConstantAttr>()) {
1195	// Shadow variables and their properties must be registered with CUDA
1196	// runtime. Skip Extern global variables, which will be registered in
1197	// the TU where they are defined.
1198	//
1199	// Don't register a C++17 inline variable. The local symbol can be
1200	// discarded and referencing a discarded local symbol from outside the
1201	// comdat (__cuda_register_globals) is disallowed by the ELF spec.
1202	//
1203	// HIP managed variables need to be always recorded in device and host
1204	// compilations for transformation.
1205	//
1206	// HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
1207	// added to llvm.compiler-used, therefore they are safe to be registered.
1208	if ((!D->hasExternalStorage() && !D->isInline()) \|\|
1209	CGM.getContext().CUDADeviceVarODRUsedByHost.contains(key: D) \|\|
1210	D->hasAttr<HIPManagedAttr>()) {
1211	registerDeviceVar(VD: D, Var&: GV, Extern: !D->hasDefinition(),
1212	Constant: D->hasAttr<CUDAConstantAttr>());
1213	}
1214	} else if (D->getType()->isCUDADeviceBuiltinSurfaceType() \|\|
1215	D->getType()->isCUDADeviceBuiltinTextureType()) {
1216	// Builtin surfaces and textures and their template arguments are
1217	// also registered with CUDA runtime.
1218	const auto *TD = cast<ClassTemplateSpecializationDecl>(
1219	Val: D->getType()->castAsCXXRecordDecl());
1220	const TemplateArgumentList &Args = TD->getTemplateArgs();
1221	if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1222	assert(Args.size() == `2` &&
1223	"Unexpected number of template arguments of CUDA device "
1224	"builtin surface type.");
1225	auto SurfType = Args [`1`].getAsIntegral();
1226	if (!D->hasExternalStorage())
1227	registerDeviceSurf(VD: D, Var&: GV, Extern: !D->hasDefinition(), Type: SurfType.getSExtValue());
1228	} else {
1229	assert(Args.size() == `3` &&
1230	"Unexpected number of template arguments of CUDA device "
1231	"builtin texture type.");
1232	auto TexType = Args [`1`].getAsIntegral();
1233	auto Normalized = Args [`2`].getAsIntegral();
1234	if (!D->hasExternalStorage())
1235	registerDeviceTex(VD: D, Var&: GV, Extern: !D->hasDefinition(), Type: TexType.getSExtValue(),
1236	Normalized: Normalized.getZExtValue());
1237	}
1238	}
1239	}
1240
1241	// Transform managed variables to pointers to managed variables in device code.
1242	// Each use of the original managed variable is replaced by a load from the
1243	// transformed managed variable. The transformed managed variable contains
1244	// the address of managed memory which will be allocated by the runtime.
1245	void CGNVCUDARuntime::transformManagedVars() {
1246	for (auto &&Info : DeviceVars) {
1247	llvm::GlobalVariable *Var = Info.Var;
1248	if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1249	Info.Flags.isManaged()) {
1250	auto ManagedVar = new* llvm::GlobalVariable(
1251	CGM.getModule(), Var->getType(),
1252	/isConstant=/false, Var->getLinkage(),
1253	/Init=/Var->isDeclaration()
1254	? nullptr
1255	: llvm::ConstantPointerNull::get(T: Var->getType()),
1256	/Name=/"", /InsertBefore=/nullptr,
1257	llvm::GlobalVariable::NotThreadLocal,
1258	CGM.getContext().getTargetAddressSpace(AS: CGM.getLangOpts().CUDAIsDevice
1259	? LangAS::cuda_device
1260	: LangAS::Default));
1261	ManagedVar->setDSOLocal(Var->isDSOLocal());
1262	ManagedVar->setVisibility(Var->getVisibility());
1263	ManagedVar->setExternallyInitialized(true);
1264	replaceManagedVar(Var, ManagedVar);
1265	ManagedVar->takeName(V: Var);
1266	Var->setName(Twine(ManagedVar->getName()) + ".managed");
1267	// Keep managed variables even if they are not used in device code since
1268	// they need to be allocated by the runtime.
1269	if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
1270	assert(!ManagedVar->isDeclaration());
1271	CGM.addCompilerUsedGlobal(GV: Var);
1272	CGM.addCompilerUsedGlobal(GV: ManagedVar);
1273	}
1274	}
1275	}
1276	}
1277
1278	// Creates offloading entries for all the kernels and globals that must be
1279	// registered. The linker will provide a pointer to this section so we can
1280	// register the symbols with the linked device image.
1281	void CGNVCUDARuntime::createOffloadingEntries() {
1282	llvm::object::OffloadKind Kind = CGM.getLangOpts().HIP
1283	? llvm::object::OffloadKind::OFK_HIP
1284	: llvm::object::OffloadKind::OFK_Cuda;
1285	// For now, just spoof this as OpenMP because that's the runtime it uses.
1286	if (CGM.getLangOpts().OffloadViaLLVM)
1287	Kind = llvm::object::OffloadKind::OFK_OpenMP;
1288
1289	llvm::Module &M = CGM.getModule();
1290	for (KernelInfo &I : EmittedKernels)
1291	llvm::offloading::emitOffloadingEntry(
1292	M, Kind, Addr: KernelHandles [I.Kernel->getName()],
1293	Name: getDeviceSideName(ND: cast<NamedDecl>(Val: I.D)), /Flags=/Size: `0`, /Data=/Flags: `0`,
1294	Data: llvm::offloading::OffloadGlobalEntry);
1295
1296	for (VarInfo &I : DeviceVars) {
1297	uint64_t VarSize =
1298	CGM.getDataLayout().getTypeAllocSize(Ty: I.Var->getValueType());
1299	int32_t Flags =
1300	(I.Flags.isExtern()
1301	? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
1302	: `0`) \|
1303	(I.Flags.isConstant()
1304	? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
1305	: `0`) \|
1306	(I.Flags.isNormalized()
1307	? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
1308	: `0`);
1309	if (I.Flags.getKind() == DeviceVarFlags::Variable) {
1310	if (I.Flags.isManaged()) {
1311	assert(I.Var->getName().ends_with(".managed") &&
1312	"HIP managed variables not transformed");
1313
1314	auto *ManagedVar = M.getNamedGlobal(
1315	Name: I.Var->getName().drop_back(N: StringRef(".managed").size()));
1316	llvm::offloading::emitOffloadingEntry(
1317	M, Kind, Addr: I.Var, Name: getDeviceSideName(ND: I.D), Size: VarSize,
1318	Flags: llvm::offloading::OffloadGlobalManagedEntry \| Flags,
1319	/Data=/I.Var->getAlignment(), AuxAddr: ManagedVar);
1320	} else {
1321	llvm::offloading::emitOffloadingEntry(
1322	M, Kind, Addr: I.Var, Name: getDeviceSideName(ND: I.D), Size: VarSize,
1323	Flags: llvm::offloading::OffloadGlobalEntry \| Flags,
1324	/Data=/`0`);
1325	}
1326	} else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
1327	llvm::offloading::emitOffloadingEntry(
1328	M, Kind, Addr: I.Var, Name: getDeviceSideName(ND: I.D), Size: VarSize,
1329	Flags: llvm::offloading::OffloadGlobalSurfaceEntry \| Flags,
1330	Data: I.Flags.getSurfTexType());
1331	} else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
1332	llvm::offloading::emitOffloadingEntry(
1333	M, Kind, Addr: I.Var, Name: getDeviceSideName(ND: I.D), Size: VarSize,
1334	Flags: llvm::offloading::OffloadGlobalTextureEntry \| Flags,
1335	Data: I.Flags.getSurfTexType());
1336	}
1337	}
1338
1339	// Register the per-TU offload-profiling shadow. The offloading entry
1340	// makes the linker-wrapper emit the host __hipRegisterVar call in the
1341	// combined ctor. Separately emit a per-TU ctor that registers the
1342	// shadow with the profile runtime's drain list.
1343	if (OffloadProfShadow) {
1344	llvm::offloading::emitOffloadingEntry(
1345	M, Kind, Addr: OffloadProfShadow, Name: OffloadProfShadow->getName(),
1346	Size: CGM.getDataLayout().getPointerSize(/AS=/`0`),
1347	Flags: llvm::offloading::OffloadGlobalEntry, /Data=/`0`);
1348
1349	llvm::LLVMContext &Ctx = M.getContext();
1350	auto *PtrTy = llvm::PointerType::getUnqual(C&: Ctx);
1351	llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
1352	Ty: llvm::FunctionType::get(Result: VoidTy, Params: {PtrTy}, isVarArg: false),
1353	Name: "__llvm_profile_offload_register_shadow_variable");
1354	llvm::FunctionCallee RegisterSectionShadow = CGM.CreateRuntimeFunction(
1355	Ty: llvm::FunctionType::get(Result: VoidTy, Params: {PtrTy}, isVarArg: false),
1356	Name: "__llvm_profile_offload_register_section_shadow_variable");
1357	auto *CtorFn = llvm::Function::Create(
1358	Ty: llvm::FunctionType::get(Result: VoidTy, isVarArg: false),
1359	Linkage: llvm::GlobalValue::InternalLinkage,
1360	N: "__llvm_profile_register_shadow." + CGM.getContext().getCUIDHash(), M: &M);
1361	auto *Entry = llvm::BasicBlock::Create(Context&: Ctx, Name: "entry", Parent: CtorFn);
1362	llvm::IRBuilder<> B(Entry);
1363	B.CreateCall(Callee: RegisterShadow, Args: {OffloadProfShadow});
1364	for (const auto &Info : OffloadProfSectionShadows) {
1365	llvm::offloading::emitOffloadingEntry(
1366	M, Kind, Addr: Info.Shadow, Name: Info.DeviceName,
1367	Size: CGM.getDataLayout().getPointerSize(/AS=/`0`),
1368	Flags: llvm::offloading::OffloadGlobalEntry, /Data=/`0`);
1369	B.CreateCall(Callee: RegisterSectionShadow, Args: {Info.Shadow});
1370	}
1371	B.CreateRetVoid();
1372	llvm::appendToGlobalCtors(M, F: CtorFn, /Priority=/`65535`);
1373	}
1374	}
1375
1376	// For HIP host+device compiles with PGO enabled, emit the host-side shadow for
1377	// the per-TU __llvm_profile_sections_<CUID> global. Device-side section table
1378	// emission is owned by InstrProfiling so it can be gated on real profile data.
1379	void CGNVCUDARuntime::emitOffloadProfilingSections() {
1380	if (!CGM.getLangOpts().HIP)
1381	return;
1382	if (!CGM.getCodeGenOpts().hasProfileInstr())
1383	return;
1384
1385	StringRef CUIDHash = CGM.getContext().getCUIDHash();
1386	if (CUIDHash.empty())
1387	return;
1388
1389	llvm::Module &M = CGM.getModule();
1390	llvm::LLVMContext &Ctx = M.getContext();
1391	std::string Name = ("__llvm_profile_sections_" + CUIDHash).str();
1392
1393	// If the global already exists (e.g. another TU was merged in), don't
1394	// duplicate it.
1395	if (M.getNamedValue(Name))
1396	return;
1397
1398	if (CGM.getLangOpts().CUDAIsDevice) {
1399	// Device side: emit only the per-TU names postfix marker. The sections
1400	// struct is emitted later by the InstrProfiling pass, which emits it only
1401	// when the TU has profile data, avoiding dangling section references.
1402	unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace();
1403	std::string NamesVarPostfixVarName =
1404	std::string (llvm::getInstrProfNamesVarPostfixVarName());
1405	if (!M.getNamedValue(Name: NamesVarPostfixVarName)) {
1406	auto *NamesVarPostfix = llvm::ConstantDataArray::getString(
1407	Context&: Ctx, Initializer: (llvm::Twine("_") + CUIDHash).str(), AddNull: true);
1408	auto NamesGV = new* llvm::GlobalVariable(
1409	M, NamesVarPostfix->getType(), /isConstant=/true,
1410	llvm::GlobalValue::PrivateLinkage, NamesVarPostfix,
1411	NamesVarPostfixVarName,
1412	/InsertBefore=/nullptr, llvm::GlobalValue::NotThreadLocal,
1413	GlobalAS);
1414	CGM.addCompilerUsedGlobal(GV: NamesGV);
1415	}
1416	return;
1417	}
1418
1419	// Host side: emit an opaque void shadow. Layout doesn't matter — the*
1420	// runtime locates it by name via hipGetSymbolAddress and treats it as
1421	// the address of the device-side struct. Registration with the HIP
1422	// runtime is added by makeRegisterGlobalsFn (non-RDC) or
1423	// createOffloadingEntries (RDC).
1424	auto *PtrTy = llvm::PointerType::getUnqual(C&: Ctx);
1425	OffloadProfShadow = new llvm::GlobalVariable(
1426	M, PtrTy, /isConstant=/false, llvm::GlobalValue::ExternalLinkage,
1427	llvm::ConstantPointerNull::get(T: PtrTy), Name);
1428	CGM.addCompilerUsedGlobal(GV: OffloadProfShadow);
1429
1430	auto AddSectionShadow = [&](StringRef Kind, const Twine &DeviceName) {
1431	std::string ShadowName =
1432	(Twine("__llvm_profile_shadow_") + Kind + "_" + CUIDHash + "_" +
1433	Twine(OffloadProfSectionShadows.size()))
1434	.str();
1435	auto Shadow = new* llvm::GlobalVariable(
1436	M, PtrTy, /isConstant=/false, llvm::GlobalValue::ExternalLinkage,
1437	llvm::ConstantPointerNull::get(T: PtrTy), ShadowName);
1438	CGM.addCompilerUsedGlobal(GV: Shadow);
1439	OffloadProfSectionShadows.push_back(Elt: {.Shadow: Shadow, .DeviceName: DeviceName.str()});
1440	};
1441
1442	// Keep this order in sync with the runtime: data, counters, uniform counters,
1443	// then names.
1444	for (auto &&I : EmittedKernels) {
1445	std::string KernelName = getDeviceSideName(ND: cast<NamedDecl>(Val: I.D));
1446	AddSectionShadow ("data", Twine("__profd_") + KernelName);
1447	AddSectionShadow ("cnts", Twine("__profc_") + KernelName);
1448	AddSectionShadow ("ucnts", Twine("__llvm_prf_unifcnt_") + KernelName);
1449	AddSectionShadow ("names",
1450	Twine (llvm::getInstrProfNamesVarName()) + "_" + CUIDHash);
1451	}
1452	}
1453
1454	// Returns module constructor to be added.
1455	llvm::Function *CGNVCUDARuntime::finalizeModule() {
1456	transformManagedVars();
1457	emitOffloadProfilingSections();
1458	if (CGM.getLangOpts().CUDAIsDevice) {
1459	// Mark ODR-used device variables as compiler used to prevent it from being
1460	// eliminated by optimization. This is necessary for device variables
1461	// ODR-used by host functions. Sema correctly marks them as ODR-used no
1462	// matter whether they are ODR-used by device or host functions.
1463	//
1464	// We do not need to do this if the variable has used attribute since it
1465	// has already been added.
1466	//
1467	// Static device variables have been externalized at this point, therefore
1468	// variables with LLVM private or internal linkage need not be added.
1469	for (auto &&Info : DeviceVars) {
1470	auto Kind = Info.Flags.getKind();
1471	if (!Info.Var->isDeclaration() &&
1472	!llvm::GlobalValue::isLocalLinkage(Linkage: Info.Var->getLinkage()) &&
1473	(Kind == DeviceVarFlags::Variable \|\|
1474	Kind == DeviceVarFlags::Surface \|\|
1475	Kind == DeviceVarFlags::Texture) &&
1476	Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1477	CGM.addCompilerUsedGlobal(GV: Info.Var);
1478	}
1479	}
1480	return nullptr;
1481	}
1482	if (CGM.getLangOpts().OffloadViaLLVM \|\|
1483	(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
1484	createOffloadingEntries();
1485	else
1486	return makeModuleCtorFunction();
1487
1488	return nullptr;
1489	}
1490
1491	llvm::GlobalValue CGNVCUDARuntime::getKernelHandle(llvm::Function F,
1492	GlobalDecl GD) {
1493	auto Loc = KernelHandles.find(Val: F->getName());
1494	if (Loc != KernelHandles.end()) {
1495	auto OldHandle = Loc ->second;
1496	if (KernelStubs [OldHandle] == F)
1497	return OldHandle;
1498
1499	// We've found the function name, but F itself has changed, so we need to
1500	// update the references.
1501	if (CGM.getLangOpts().HIP) {
1502	// For HIP compilation the handle itself does not change, so we only need
1503	// to update the Stub value.
1504	KernelStubs [OldHandle] = F;
1505	return OldHandle;
1506	}
1507	// For non-HIP compilation, erase the old Stub and fall-through to creating
1508	// new entries.
1509	KernelStubs.erase(Val: OldHandle);
1510	}
1511
1512	if (!CGM.getLangOpts().HIP) {
1513	KernelHandles [F->getName()] = F;
1514	KernelStubs [F] = F;
1515	return F;
1516	}
1517
1518	auto Var = new* llvm::GlobalVariable(
1519	TheModule, F->getType(), /isConstant=/true, F->getLinkage(),
1520	/Initializer=/nullptr,
1521	CGM.getMangledName(
1522	GD: GD.getWithKernelReferenceKind(Kind: KernelReferenceKind::Kernel)));
1523	Var->setAlignment(CGM.getPointerAlign().getAsAlign());
1524	Var->setDSOLocal(F->isDSOLocal());
1525	Var->setVisibility(F->getVisibility());
1526	auto *FD = cast<FunctionDecl>(Val: GD.getDecl());
1527	auto *FT = FD->getPrimaryTemplate();
1528	if (!FT \|\| FT->isThisDeclarationADefinition())
1529	CGM.maybeSetTrivialComdat(D: FD, GO&: Var);
1530	KernelHandles [F->getName()] = Var;
1531	KernelStubs [Var] = F;
1532	return Var;
1533	}
1534

Browse the source code of llvm_projects/clang/lib/CodeGen/CGCUDANV.cpp