//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *PtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel mangled name to a symbol for identifying kernel in host code
  // For CUDA, the symbol for identifying the kernel is the same as the device
  // stub function. For HIP, they are different.
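  // E.g. (illustrative): for CUDA the handle for a kernel is the stub
  // function itself, while for HIP it is a separate constant global, named
  // with the kernel's mangled name, whose initializer is the stub; see
  // getKernelHandle() and emitDeviceStub() below.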
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer
  /// to the start of the string. The result of this function can be used
  /// anywhere the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
  }

  /// Helper function which generates an initialized constant array from Str,
  /// and optionally sets section name and alignment. AddNull specifies whether
  /// the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return GV;
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
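  // For instance (an illustrative case): with a Microsoft host ABI and an
  // Itanium device ABI, a kernel instantiated with a host-side lambda has to
  // be looked up on the device by its Itanium-mangled name, so the host side
  // must number the lambda the way the device compilation does.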
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  PtrTy = CGM.UnqualPtrTy;
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
                                   "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
                          llvm::PointerType::getUnqual(Context)};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // D could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make unique name for device side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV =
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
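// The emitted stub body is roughly equivalent to the following (a sketch;
// the kernel, its parameters, and `kern_handle` are illustrative):
//   void __device_stub__kern(int a, float *b) {
//     void *kernel_args[2] = {&a, &b};
//     dim3 grid_dim, block_dim;
//     size_t shmem_size;
//     cudaStream_t stream;
//     __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//     cudaLaunchKernel((const void *)kern_handle, grid_dim, block_dim,
//                      kernel_args, shmem_size, stream);
//   }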
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments. If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                        PtrTy, KernelArgs.emitRawPointer(CGF), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // HIP kernel launching API name depends on -fgpu-default-stream option. For
  // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
  // it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread) {
    if (CGF.getLangOpts().HIP)
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
    else if (CGF.getLangOpts().CUDA)
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
  }
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  const IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
                                                BlockDim.emitRawPointer(CGF),
                                                ShmemSize.emitRawPointer(CGF),
                                                Stream.emitRawPointer(CGF)});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);

  // To prevent CUDA device stub functions from being merged by ICF in MSVC
  // environment, create a unique global variable for each kernel and write to
  // the variable in the device stub.
  if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      !CGF.getLangOpts().HIP) {
    llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
    std::string GlobalVarName = (KernelFunction->getName() + ".id").str();

    llvm::GlobalVariable *HandleVar =
        CGM.getModule().getNamedGlobal(GlobalVarName);
    if (!HandleVar) {
      HandleVar = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*Constant=*/false, KernelFunction->getLinkage(),
          llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
      HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
      HandleVar->setVisibility(KernelFunction->getVisibility());
      if (KernelFunction->hasComdat())
        HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
    }

    CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
                                   HandleVar, CharUnits::One(),
                                   /*IsVolatile=*/true);
  }

  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

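// Legacy (pre-CUDA-9.0) launch: each argument is pushed into the pending
// launch configuration at its naturally aligned offset via cudaSetupArgument()
// and the kernel is then launched through cudaLaunch(). The emitted stub is
// roughly equivalent to (a sketch; the kernel, its parameters, the offsets,
// and `kern_handle` are illustrative):
//   void __device_stub__kern(int a, float *b) {
//     if (cudaSetupArgument(&a, sizeof(a), /*offset=*/0) == 0)
//       if (cudaSetupArgument(&b, sizeof(b), /*offset=*/8) == 0)
//         cudaLaunch((char *)kern_handle);
//   }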
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(
            CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
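// A use may also reach Var through a chain of ConstantExprs (e.g. a constant
// GEP into the variable); such chains are peeled apart on the worklist below
// and re-materialized as instructions operating on the loaded pointer.
// Conceptually (a sketch): a use of `&Var.field` becomes `tmp = *ManagedVar;`
// followed by a use of `&tmp->field`.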
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction();
        NewInst->insertBefore(*I->getParent(), I->getIterator());
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
      PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle, passed as the first argument to
  // __cuda_register_globals(), and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        KernelHandles[I.Kernel->getName()],
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size is of type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
                                     IntTy, VarSizeTy, IntTy, IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
                                            PtrTy, VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        assert(Var->getName().ends_with(".managed") &&
               "HIP managed variables not transformed");
        auto *ManagedVar = CGM.getModule().getNamedGlobal(
            Var->getName().drop_back(StringRef(".managed").size()));
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            ManagedVar,
            Var,
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Var,
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(PtrTy, PtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated elsewhere, e.g. by
      // lld through a link script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::ExternalLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/
        CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
        CudaGpuBinary
            ? "__hip_gpubin_handle"
            : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, PtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, PtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create destructor and register it with atexit() the way NVCC does it. Doing
  // it during regular destructor phase worked in CUDA before 9.2 but results in
  // double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, PtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with CUDA
    // runtime. Skip Extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler-used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
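// For example (a sketch): for `__managed__ int x = 0;` the module ends up
// with a pointer global that keeps the name `x` (filled in by the runtime
// with the address of the managed allocation) plus the original storage
// renamed to `x.managed`, and every device-side use of `x` loads through the
// pointer.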
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
                                                     ? LangAS::cuda_device
                                                     : LangAS::Default));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
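// Each emitted entry is, roughly, an instance of the offloading runtime's
// entry record (a sketch of the layout; the field names are illustrative):
//   struct __tgt_offload_entry {
//     void *addr;    // kernel handle or variable address
//     char *name;    // device-side (mangled) name
//     size_t size;   // 0 for kernels, allocation size for variables
//     int32_t flags; // OffloadGlobal* kind and attribute bits
//     int32_t data;  // extra payload, e.g. the surface/texture type
//   };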
void CGNVCUDARuntime::createOffloadingEntries() {
  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  llvm::Module &M = CGM.getModule();
  for (KernelInfo &I : EmittedKernels)
    llvm::offloading::emitOffloadingEntry(
        M, KernelHandles[I.Kernel->getName()],
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Size=*/0, /*Flags=*/0,
        llvm::offloading::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    int32_t Flags =
        (I.Flags.isExtern()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
             : 0) |
        (I.Flags.isConstant()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
             : 0) |
        (I.Flags.isNormalized()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
             : 0);
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
                               : llvm::offloading::OffloadGlobalEntry) |
              Flags,
          /*Data=*/0, Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    }
  }
}

// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  transformManagedVars();
  if (CGM.getLangOpts().CUDAIsDevice) {
    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}

llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    if (KernelStubs[OldHandle] == F)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (CGM.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = F;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall-through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  auto *FD = cast<FunctionDecl>(GD.getDecl());
  auto *FT = FD->getPrimaryTemplate();
  if (!FT || FT->isThisDeclarationADefinition())
    CGM.maybeSetTrivialComdat(*FD, *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}