//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *PtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel mangled name to a symbol for identifying kernel in host code
  // For CUDA, the symbol for identifying the kernel is the same as the device
  // stub function. For HIP, they are different.
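  // E.g. (illustrative): for CUDA the handle for a kernel is the stub
  // function itself, while for HIP it is a separate constant global, named
  // with the kernel's mangled name, whose initializer is the stub; see
  // getKernelHandle() and emitDeviceStub() below.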
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer
  /// to the start of the string. The result of this function can be used
  /// anywhere the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
  }

  /// Helper function which generates an initialized constant array from Str,
  /// and optionally sets section name and alignment. AddNull specifies whether
  /// the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return GV;
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
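  // For instance (an illustrative case): with a Microsoft host ABI and an
  // Itanium device ABI, a kernel instantiated with a host-side lambda has to
  // be looked up on the device by its Itanium-mangled name, so the host side
  // must number the lambda the way the device compilation does.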
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  PtrTy = CGM.UnqualPtrTy;
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
                                   "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
                          llvm::PointerType::getUnqual(Context)};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // D could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make unique name for device side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV =
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
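// The emitted stub body is roughly equivalent to the following (a sketch;
// the kernel, its parameters, and `kern_handle` are illustrative):
//   void __device_stub__kern(int a, float *b) {
//     void *kernel_args[2] = {&a, &b};
//     dim3 grid_dim, block_dim;
//     size_t shmem_size;
//     cudaStream_t stream;
//     __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//     cudaLaunchKernel((const void *)kern_handle, grid_dim, block_dim,
//                      kernel_args, shmem_size, stream);
//   }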
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments. If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                        PtrTy, KernelArgs.emitRawPointer(CGF), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // HIP kernel launching API name depends on -fgpu-default-stream option. For
  // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
  // it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread) {
    if (CGF.getLangOpts().HIP)
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
    else if (CGF.getLangOpts().CUDA)
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
  }
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  const IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
                                                BlockDim.emitRawPointer(CGF),
                                                ShmemSize.emitRawPointer(CGF),
                                                Stream.emitRawPointer(CGF)});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);

  // To prevent CUDA device stub functions from being merged by ICF in MSVC
  // environment, create a unique global variable for each kernel and write to
  // the variable in the device stub.
  if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      !CGF.getLangOpts().HIP) {
    llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
    std::string GlobalVarName = (KernelFunction->getName() + ".id").str();

    llvm::GlobalVariable *HandleVar =
        CGM.getModule().getNamedGlobal(GlobalVarName);
    if (!HandleVar) {
      HandleVar = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*Constant=*/false, KernelFunction->getLinkage(),
          llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
      HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
      HandleVar->setVisibility(KernelFunction->getVisibility());
      if (KernelFunction->hasComdat())
        HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
    }

    CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
                                   HandleVar, CharUnits::One(),
                                   /*IsVolatile=*/true);
  }

  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

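// Legacy (pre-CUDA-9.0) launch: each argument is pushed into the pending
// launch configuration at its naturally aligned offset via cudaSetupArgument()
// and the kernel is then launched through cudaLaunch(). The emitted stub is
// roughly equivalent to (a sketch; the kernel, its parameters, the offsets,
// and `kern_handle` are illustrative):
//   void __device_stub__kern(int a, float *b) {
//     if (cudaSetupArgument(&a, sizeof(a), /*offset=*/0) == 0)
//       if (cudaSetupArgument(&b, sizeof(b), /*offset=*/8) == 0)
//         cudaLaunch((char *)kern_handle);
//   }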
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(
            CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
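// A use may also reach Var through a chain of ConstantExprs (e.g. a constant
// GEP into the variable); such chains are peeled apart on the worklist below
// and re-materialized as instructions operating on the loaded pointer.
// Conceptually (a sketch): a use of `&Var.field` becomes `tmp = *ManagedVar;`
// followed by a use of `&tmp->field`.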
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction();
        NewInst->insertBefore(*I->getParent(), I->getIterator());
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
      PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle, passed as the first argument to
  // __cuda_register_globals(), and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        KernelHandles[I.Kernel->getName()],
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size is of type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
                                     IntTy, VarSizeTy, IntTy, IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
                                            PtrTy, VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        assert(Var->getName().ends_with(".managed") &&
               "HIP managed variables not transformed");
        auto *ManagedVar = CGM.getModule().getNamedGlobal(
            Var->getName().drop_back(StringRef(".managed").size()));
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            ManagedVar,
            Var,
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Var,
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(PtrTy, PtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated elsewhere, e.g. by
      // lld through a link script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::ExternalLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/
        CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
        CudaGpuBinary
            ? "__hip_gpubin_handle"
            : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, PtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, PtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create destructor and register it with atexit() the way NVCC does it. Doing
  // it during regular destructor phase worked in CUDA before 9.2 but results in
  // double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, PtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with CUDA
    // runtime. Skip Extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler-used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
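// For example (a sketch): for `__managed__ int x = 0;` the module ends up
// with a pointer global that keeps the name `x` (filled in by the runtime
// with the address of the managed allocation) plus the original storage
// renamed to `x.managed`, and every device-side use of `x` loads through the
// pointer.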
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
                                                     ? LangAS::cuda_device
                                                     : LangAS::Default));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
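// Each emitted entry is, roughly, an instance of the offloading runtime's
// entry record (a sketch of the layout; the field names are illustrative):
//   struct __tgt_offload_entry {
//     void *addr;    // kernel handle or variable address
//     char *name;    // device-side (mangled) name
//     size_t size;   // 0 for kernels, allocation size for variables
//     int32_t flags; // OffloadGlobal* kind and attribute bits
//     int32_t data;  // extra payload, e.g. the surface/texture type
//   };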
void CGNVCUDARuntime::createOffloadingEntries() {
  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  llvm::Module &M = CGM.getModule();
  for (KernelInfo &I : EmittedKernels)
    llvm::offloading::emitOffloadingEntry(
        M, KernelHandles[I.Kernel->getName()],
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Size=*/0, /*Flags=*/0,
        llvm::offloading::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    int32_t Flags =
        (I.Flags.isExtern()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
             : 0) |
        (I.Flags.isConstant()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
             : 0) |
        (I.Flags.isNormalized()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
             : 0);
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
                               : llvm::offloading::OffloadGlobalEntry) |
              Flags,
          /*Data=*/0, Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    }
  }
}

// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  transformManagedVars();
  if (CGM.getLangOpts().CUDAIsDevice) {
    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}

llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    if (KernelStubs[OldHandle] == F)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (CGM.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = F;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall-through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  auto *FD = cast<FunctionDecl>(GD.getDecl());
  auto *FT = FD->getPrimaryTemplate();
  if (!FT || FT->isThisDeclarationADefinition())
    CGM.maybeSetTrivialComdat(*FD, *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}