//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *PtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel's mangled name to a symbol identifying the kernel in host
  // code. For CUDA, the symbol identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of the variable containing the handle of the GPU binary.
  /// Populated by ModuleCtorFunction() and used to create corresponding
  /// cleanup calls in ModuleDtorFunction().
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// the C code specifies a const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
  }

  /// Helper function that generates an initialized constant array from Str,
  /// and optionally sets the section name and alignment. AddNull specifies
  /// whether the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used to make sure this section isn't merged and
      // we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return GV;
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates the module constructor function.
  llvm::Function *makeModuleCtorFunction();
  /// Creates the module destructor function.
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, create a device mangle
  // context so that mangling retrieves the device-side lambda mangling number
  // instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  PtrTy = CGM.UnqualPtrTy;
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, PtrTy, false), "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
                          llvm::PointerType::getUnqual(Context)};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

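// Returns the device-side (mangled) name for ND, used when registering kernels
// and variables with the runtime. For externalized static variables in
// relocatable-device-code mode, a per-TU postfix is additionally appended so
// the name is unique across TUs.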
std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // D could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make a unique name for the device-side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV = dyn_cast<llvm::GlobalVariable>(
          KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
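//
// A rough sketch of the stub this emits, in C-like pseudocode (names are
// illustrative only, not the exact IR):
//
//   void __device_stub__kernel(T0 a, T1 b) {
//     void *kernel_args[2] = {&a, &b};
//     dim3 grid_dim, block_dim;
//     size_t shmem_size;
//     void *stream;
//     __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//     cudaLaunchKernel((void *)kernel_handle, grid_dim, block_dim,
//                      kernel_args, shmem_size, stream);
//   }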
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate the amount of space we will need for all arguments. If we have
  // no args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to the runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                        PtrTy, KernelArgs.emitRawPointer(CGF), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Look up the cudaLaunchKernel/hipLaunchKernel function.
  // The HIP kernel launching API name depends on the -fgpu-default-stream
  // option. For the default value 'legacy', it is hipLaunchKernel. For
  // 'per-thread', it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread) {
    if (CGF.getLangOpts().HIP)
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
    else if (CGF.getLangOpts().CUDA)
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
  }
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  const IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
                                                BlockDim.emitRawPointer(CGF),
                                                ShmemSize.emitRawPointer(CGF),
                                                Stream.emitRawPointer(CGF)});

  // Emit the call to cudaLaunch.
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);

  // To prevent CUDA device stub functions from being merged by ICF in an MSVC
  // environment, create a unique global variable for each kernel and write to
  // the variable in the device stub.
  if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      !CGF.getLangOpts().HIP) {
    llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
    std::string GlobalVarName = (KernelFunction->getName() + ".id").str();

    llvm::GlobalVariable *HandleVar =
        CGM.getModule().getNamedGlobal(GlobalVarName);
    if (!HandleVar) {
      HandleVar = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*Constant=*/false, KernelFunction->getLinkage(),
          llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
      HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
      HandleVar->setVisibility(KernelFunction->getVisibility());
      if (KernelFunction->hasComdat())
        HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
    }

    CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
                                   HandleVar, CharUnits::One(),
                                   /*IsVolatile=*/true);
  }

  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

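// A rough sketch of the legacy stub, in C-like pseudocode (names and offsets
// are illustrative only): each argument is pushed with cudaSetupArgument() at
// its aligned offset, and the launch is skipped if any setup call fails:
//
//   void __device_stub__kernel(T0 a, T1 b) {
//     if (cudaSetupArgument(&a, sizeof(a), offset_a) != 0) return;
//     if (cudaSetupArgument(&b, sizeof(b), offset_b) != 0) return;
//     cudaLaunch((char *)kernel_handle);
//   }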
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(
            CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch.
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from the variable
// ManagedVar populated by the HIP runtime.
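//
// Illustrative shape of the rewrite (IR names here are made up): a direct use
//   store i32 0, ptr @var
// becomes
//   %ld.managed = load ptr, ptr @var.pointer
//   store i32 0, ptr %ld.managed
// Uses inside constant expressions are first recreated as instructions so the
// loaded address can feed them.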
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction();
        NewInst->insertBefore(*I->getParent(), I->getIterator());
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects
/// that have a presence on both the host and device sides. Specifically,
/// registers the host side of kernel functions and device global variables
/// with the CUDA runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///   __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///   ...
///   __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///   ...
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything.
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
      PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract the GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        KernelHandles[I.Kernel->getName()],
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size parameter has type size_t.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
                                     IntTy, VarSizeTy, IntTy, IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
                                            PtrTy, VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        assert(Var->getName().ends_with(".managed") &&
               "HIP managed variables not transformed");
        auto *ManagedVar = CGM.getModule().getNamedGlobal(
            Var->getName().drop_back(StringRef(".managed").size()));
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            ManagedVar,
            Var,
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Var,
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///   Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///   __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///   if (__hip_gpubin_handle == 0) {
///     __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///     __hip_register_globals(__hip_gpubin_handle);
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as a callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(PtrTy, PtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);

  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable and save a reference in GpuBinaryHandle to be cleaned
  // up in the destructor on exit. Then associate all known kernels with the
  // GPU binary handle so the CUDA runtime can figure out what to call on the
  // GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If the fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create an initialized wrapper structure that points to the loaded GPU
  // binary.
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::ExternalLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/
        CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
        CudaGpuBinary
            ? "__hip_gpubin_handle"
            : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, PtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register the binary with the CUDA runtime. This is substantially
    // different in default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, PtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    //                                           void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create the destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2,
  // but results in a double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///   __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///   if (__hip_gpubin_handle) {
///     __hipUnregisterFatBinary(__hip_gpubin_handle);
///     __hip_gpubin_handle = 0;
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for a destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, PtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of
  // device-side global variables become internal definitions. These have to
  // be internal in order to prevent name conflicts with global host variables
  // with the same name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with the CUDA
    // runtime. Skip extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler.used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with the CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
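//
// For example (illustrative IR; address spaces and linkage elided), a managed
// variable
//   @x = global i32 0
// becomes, after the renaming below,
//   @x = externally_initialized global ptr null   ; filled in by the runtime
//   @x.managed = global i32 0                     ; original storage
// and every device-side use of the storage first loads the pointer from @x.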
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
                                                     ? LangAS::cuda_device
                                                     : LangAS::Default));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code,
      // since they need to be allocated by the runtime.
      if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
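//
// The exact entry record layout is owned by llvm/Frontend/Offloading and the
// offloading runtime, not by this file; conceptually each entry is roughly
//   { void *addr; char *name; size_t size; int32_t flags; int32_t data; }
// (an assumed sketch for illustration, not a definition this code controls).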
void CGNVCUDARuntime::createOffloadingEntries() {
  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  llvm::Module &M = CGM.getModule();
  for (KernelInfo &I : EmittedKernels)
    llvm::offloading::emitOffloadingEntry(
        M, KernelHandles[I.Kernel->getName()],
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Size=*/0, /*Flags=*/0,
        llvm::offloading::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    int32_t Flags =
        (I.Flags.isExtern()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
             : 0) |
        (I.Flags.isConstant()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
             : 0) |
        (I.Flags.isNormalized()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
             : 0);
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
                               : llvm::offloading::OffloadGlobalEntry) |
              Flags,
          /*Data=*/0, Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    }
  }
}

// Returns the module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  transformManagedVars();
  if (CGM.getLangOpts().CUDAIsDevice) {
    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}
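// Returns the symbol that host code uses to identify the kernel. For CUDA
// this is the device stub function itself. For HIP it is a separate global
// variable, initialized with the stub's address in emitDeviceStub(), so the
// handle stays stable even if the stub function is regenerated.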
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    if (KernelStubs[OldHandle] == F)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (CGM.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = F;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  auto *FD = cast<FunctionDecl>(GD.getDecl());
  auto *FT = FD->getPrimaryTemplate();
  if (!FT || FT->isThisDeclarationADefinition())
    CGM.maybeSetTrivialComdat(*FD, *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}