| 1 | //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This provides a class for CUDA code generation targeting the NVIDIA CUDA |
| 10 | // runtime library. |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "CGCUDARuntime.h" |
| 15 | #include "CGCXXABI.h" |
| 16 | #include "CodeGenFunction.h" |
| 17 | #include "CodeGenModule.h" |
| 18 | #include "clang/AST/CharUnits.h" |
| 19 | #include "clang/AST/Decl.h" |
| 20 | #include "clang/Basic/Cuda.h" |
| 21 | #include "clang/CodeGen/CodeGenABITypes.h" |
| 22 | #include "clang/CodeGen/ConstantInitBuilder.h" |
| 23 | #include "llvm/ADT/StringRef.h" |
| 24 | #include "llvm/Frontend/Offloading/Utility.h" |
| 25 | #include "llvm/IR/BasicBlock.h" |
| 26 | #include "llvm/IR/Constants.h" |
| 27 | #include "llvm/IR/DerivedTypes.h" |
| 28 | #include "llvm/IR/ReplaceConstant.h" |
| 29 | #include "llvm/ProfileData/InstrProf.h" |
| 30 | #include "llvm/Support/Format.h" |
| 31 | #include "llvm/Support/VirtualFileSystem.h" |
| 32 | #include "llvm/Transforms/Utils/ModuleUtils.h" |
| 33 | |
| 34 | using namespace clang; |
| 35 | using namespace CodeGen; |
| 36 | |
| 37 | namespace { |
| 38 | constexpr unsigned CudaFatMagic = 0x466243b1; |
| 39 | constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" |
| 40 | |
| 41 | class CGNVCUDARuntime : public CGCUDARuntime { |
| 42 | |
| 43 | /// The prefix used for function calls and section names (CUDA, HIP, LLVM) |
| 44 | StringRef Prefix; |
| 45 | |
| 46 | private: |
| 47 | llvm::IntegerType *IntTy, *SizeTy; |
| 48 | llvm::Type *VoidTy; |
| 49 | llvm::PointerType *PtrTy; |
| 50 | |
| 51 | /// Convenience reference to LLVM Context |
| 52 | llvm::LLVMContext &Context; |
| 53 | /// Convenience reference to the current module |
| 54 | llvm::Module &TheModule; |
| 55 | /// Keeps track of kernel launch stubs and handles emitted in this module |
| 56 | struct KernelInfo { |
| 57 | llvm::Function *Kernel; // stub function to help launch kernel |
| 58 | const Decl *D; |
| 59 | }; |
| 60 | llvm::SmallVector<KernelInfo, 16> EmittedKernels; |
| 61 | // Map a kernel mangled name to a symbol for identifying kernel in host code |
| 62 | // For CUDA, the symbol for identifying the kernel is the same as the device |
| 63 | // stub function. For HIP, they are different. |
| 64 | llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles; |
| 65 | // Map a kernel handle to the kernel stub. |
| 66 | llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs; |
| 67 | struct VarInfo { |
| 68 | llvm::GlobalVariable *Var; |
| 69 | const VarDecl *D; |
| 70 | DeviceVarFlags Flags; |
| 71 | }; |
| 72 | llvm::SmallVector<VarInfo, 16> DeviceVars; |
| 73 | /// Keeps track of variable containing handle of GPU binary. Populated by |
| 74 | /// ModuleCtorFunction() and used to create corresponding cleanup calls in |
| 75 | /// ModuleDtorFunction() |
| 76 | llvm::GlobalVariable *GpuBinaryHandle = nullptr; |
| 77 | /// Host-side shadow for the per-TU __llvm_profile_sections_<CUID> global, |
| 78 | /// emitted only for HIP host compiles when PGO is on. Registered via |
| 79 | /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime |
| 80 | /// can locate the device-side table by name. |
| 81 | llvm::GlobalVariable *OffloadProfShadow = nullptr; |
| 82 | struct OffloadProfSectionShadowInfo { |
| 83 | llvm::GlobalVariable *Shadow; |
| 84 | std::string DeviceName; |
| 85 | }; |
| 86 | llvm::SmallVector<OffloadProfSectionShadowInfo, 16> OffloadProfSectionShadows; |
| 87 | /// Whether we generate relocatable device code. |
| 88 | bool RelocatableDeviceCode; |
| 89 | /// Mangle context for device. |
| 90 | std::unique_ptr<MangleContext> DeviceMC; |
| 91 | |
| 92 | llvm::FunctionCallee getSetupArgumentFn() const; |
| 93 | llvm::FunctionCallee getLaunchFn() const; |
| 94 | |
| 95 | llvm::FunctionType *getRegisterGlobalsFnTy() const; |
| 96 | llvm::FunctionType *getCallbackFnTy() const; |
| 97 | llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; |
| 98 | std::string addPrefixToName(StringRef FuncName) const; |
| 99 | std::string addUnderscoredPrefixToName(StringRef FuncName) const; |
| 100 | |
| 101 | /// Creates a function to register all kernel stubs generated in this module. |
| 102 | llvm::Function *makeRegisterGlobalsFn(); |
| 103 | |
| 104 | /// Helper function that generates a constant string and returns a pointer to |
| 105 | /// the start of the string. The result of this function can be used anywhere |
| 106 | /// where the C code specifies const char*. |
| 107 | llvm::Constant *makeConstantString(const std::string &Str, |
| 108 | const std::string &Name = "" ) { |
| 109 | return CGM.GetAddrOfConstantCString(Str, GlobalName: Name).getPointer(); |
| 110 | } |
| 111 | |
| 112 | /// Helper function which generates an initialized constant array from Str, |
| 113 | /// and optionally sets section name and alignment. AddNull specifies whether |
| 114 | /// the array should nave NUL termination. |
| 115 | llvm::Constant *makeConstantArray(StringRef Str, |
| 116 | StringRef Name = "" , |
| 117 | StringRef SectionName = "" , |
| 118 | unsigned Alignment = 0, |
| 119 | bool AddNull = false) { |
| 120 | llvm::Constant *Value = |
| 121 | llvm::ConstantDataArray::getString(Context, Initializer: Str, AddNull); |
| 122 | auto *GV = new llvm::GlobalVariable( |
| 123 | TheModule, Value->getType(), /*isConstant=*/true, |
| 124 | llvm::GlobalValue::PrivateLinkage, Value, Name); |
| 125 | if (!SectionName.empty()) { |
| 126 | GV->setSection(SectionName); |
| 127 | // Mark the address as used which make sure that this section isn't |
| 128 | // merged and we will really have it in the object file. |
| 129 | GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); |
| 130 | } |
| 131 | if (Alignment) |
| 132 | GV->setAlignment(llvm::Align(Alignment)); |
| 133 | return GV; |
| 134 | } |
| 135 | |
| 136 | /// Helper function that generates an empty dummy function returning void. |
| 137 | llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { |
| 138 | assert(FnTy->getReturnType()->isVoidTy() && |
| 139 | "Can only generate dummy functions returning void!" ); |
| 140 | llvm::Function *DummyFunc = llvm::Function::Create( |
| 141 | Ty: FnTy, Linkage: llvm::GlobalValue::InternalLinkage, N: "dummy" , M: &TheModule); |
| 142 | |
| 143 | llvm::BasicBlock *DummyBlock = |
| 144 | llvm::BasicBlock::Create(Context, Name: "" , Parent: DummyFunc); |
| 145 | CGBuilderTy FuncBuilder(CGM, Context); |
| 146 | FuncBuilder.SetInsertPoint(DummyBlock); |
| 147 | FuncBuilder.CreateRetVoid(); |
| 148 | |
| 149 | return DummyFunc; |
| 150 | } |
| 151 | |
| 152 | Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args); |
| 153 | Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, |
| 154 | FunctionArgList &Args); |
| 155 | void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); |
| 156 | void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); |
| 157 | std::string getDeviceSideName(const NamedDecl *ND) override; |
| 158 | |
| 159 | void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, |
| 160 | bool Extern, bool Constant) { |
| 161 | DeviceVars.push_back(Elt: {.Var: &Var, |
| 162 | .D: VD, |
| 163 | .Flags: {DeviceVarFlags::Variable, Extern, Constant, |
| 164 | VD->hasAttr<HIPManagedAttr>(), |
| 165 | /*Normalized*/ false, 0}}); |
| 166 | } |
| 167 | void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var, |
| 168 | bool Extern, int Type) { |
| 169 | DeviceVars.push_back(Elt: {.Var: &Var, |
| 170 | .D: VD, |
| 171 | .Flags: {DeviceVarFlags::Surface, Extern, /*Constant*/ false, |
| 172 | /*Managed*/ false, |
| 173 | /*Normalized*/ false, Type}}); |
| 174 | } |
| 175 | void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var, |
| 176 | bool Extern, int Type, bool Normalized) { |
| 177 | DeviceVars.push_back(Elt: {.Var: &Var, |
| 178 | .D: VD, |
| 179 | .Flags: {DeviceVarFlags::Texture, Extern, /*Constant*/ false, |
| 180 | /*Managed*/ false, Normalized, Type}}); |
| 181 | } |
| 182 | |
| 183 | /// Creates module constructor function |
| 184 | llvm::Function *makeModuleCtorFunction(); |
| 185 | /// Creates module destructor function |
| 186 | llvm::Function *makeModuleDtorFunction(); |
| 187 | /// Transform managed variables for device compilation. |
| 188 | void transformManagedVars(); |
| 189 | /// Create offloading entries to register globals in RDC mode. |
| 190 | void createOffloadingEntries(); |
| 191 | /// For HIP+PGO, emit the per-TU __llvm_profile_sections_<CUID> global. |
| 192 | /// On the device side, InstrProfiling emits the populated section-bounds |
| 193 | /// table only when the TU has real profile data. On the host side it is a |
| 194 | /// placeholder void* shadow stored in |
| 195 | /// OffloadProfShadow, registered later by makeRegisterGlobalsFn (non-RDC) |
| 196 | /// or createOffloadingEntries (RDC) so the runtime can locate the |
| 197 | /// device-side table by name. |
| 198 | void emitOffloadProfilingSections(); |
| 199 | |
| 200 | public: |
| 201 | CGNVCUDARuntime(CodeGenModule &CGM); |
| 202 | |
| 203 | llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override; |
| 204 | llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override { |
| 205 | auto Loc = KernelStubs.find(Val: Handle); |
| 206 | assert(Loc != KernelStubs.end()); |
| 207 | return Loc->second; |
| 208 | } |
| 209 | void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; |
| 210 | void handleVarRegistration(const VarDecl *VD, |
| 211 | llvm::GlobalVariable &Var) override; |
| 212 | void |
| 213 | internalizeDeviceSideVar(const VarDecl *D, |
| 214 | llvm::GlobalValue::LinkageTypes &Linkage) override; |
| 215 | |
| 216 | llvm::Function *finalizeModule() override; |
| 217 | }; |
| 218 | |
| 219 | } // end anonymous namespace |
| 220 | |
| 221 | std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { |
| 222 | return (Prefix + FuncName).str(); |
| 223 | } |
| 224 | std::string |
| 225 | CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { |
| 226 | return ("__" + Prefix + FuncName).str(); |
| 227 | } |
| 228 | |
| 229 | static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) { |
| 230 | // If the host and device have different C++ ABIs, mark it as the device |
| 231 | // mangle context so that the mangling needs to retrieve the additional |
| 232 | // device lambda mangling number instead of the regular host one. |
| 233 | if (CGM.getContext().getAuxTargetInfo() && |
| 234 | CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() && |
| 235 | CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) { |
| 236 | return std::unique_ptr<MangleContext>( |
| 237 | CGM.getContext().createDeviceMangleContext( |
| 238 | T: *CGM.getContext().getAuxTargetInfo())); |
| 239 | } |
| 240 | |
| 241 | return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext( |
| 242 | T: CGM.getContext().getAuxTargetInfo())); |
| 243 | } |
| 244 | |
| 245 | CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) |
| 246 | : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), |
| 247 | TheModule(CGM.getModule()), |
| 248 | RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), |
| 249 | DeviceMC(InitDeviceMC(CGM)) { |
| 250 | IntTy = CGM.IntTy; |
| 251 | SizeTy = CGM.SizeTy; |
| 252 | VoidTy = CGM.VoidTy; |
| 253 | PtrTy = CGM.DefaultPtrTy; |
| 254 | |
| 255 | if (CGM.getLangOpts().OffloadViaLLVM) |
| 256 | Prefix = "llvm" ; |
| 257 | else if (CGM.getLangOpts().HIP) |
| 258 | Prefix = "hip" ; |
| 259 | else |
| 260 | Prefix = "cuda" ; |
| 261 | } |
| 262 | |
| 263 | llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { |
| 264 | // cudaError_t cudaSetupArgument(void *, size_t, size_t) |
| 265 | llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy}; |
| 266 | return CGM.CreateRuntimeFunction( |
| 267 | Ty: llvm::FunctionType::get(Result: IntTy, Params, isVarArg: false), |
| 268 | Name: addPrefixToName(FuncName: "SetupArgument" )); |
| 269 | } |
| 270 | |
| 271 | llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const { |
| 272 | if (CGM.getLangOpts().HIP) { |
| 273 | // hipError_t hipLaunchByPtr(char *); |
| 274 | return CGM.CreateRuntimeFunction( |
| 275 | Ty: llvm::FunctionType::get(Result: IntTy, Params: PtrTy, isVarArg: false), Name: "hipLaunchByPtr" ); |
| 276 | } |
| 277 | // cudaError_t cudaLaunch(char *); |
| 278 | return CGM.CreateRuntimeFunction(Ty: llvm::FunctionType::get(Result: IntTy, Params: PtrTy, isVarArg: false), |
| 279 | Name: "cudaLaunch" ); |
| 280 | } |
| 281 | |
| 282 | llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { |
| 283 | return llvm::FunctionType::get(Result: VoidTy, Params: PtrTy, isVarArg: false); |
| 284 | } |
| 285 | |
| 286 | llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { |
| 287 | return llvm::FunctionType::get(Result: VoidTy, Params: PtrTy, isVarArg: false); |
| 288 | } |
| 289 | |
| 290 | llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { |
| 291 | llvm::Type *Params[] = {llvm::PointerType::getUnqual(C&: Context), PtrTy, PtrTy, |
| 292 | llvm::PointerType::getUnqual(C&: Context)}; |
| 293 | return llvm::FunctionType::get(Result: VoidTy, Params, isVarArg: false); |
| 294 | } |
| 295 | |
| 296 | std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) { |
| 297 | GlobalDecl GD; |
| 298 | // D could be either a kernel or a variable. |
| 299 | if (auto *FD = dyn_cast<FunctionDecl>(Val: ND)) |
| 300 | GD = GlobalDecl(FD, KernelReferenceKind::Kernel); |
| 301 | else |
| 302 | GD = GlobalDecl(ND); |
| 303 | std::string DeviceSideName; |
| 304 | MangleContext *MC; |
| 305 | if (CGM.getLangOpts().CUDAIsDevice) |
| 306 | MC = &CGM.getCXXABI().getMangleContext(); |
| 307 | else |
| 308 | MC = DeviceMC.get(); |
| 309 | if (MC->shouldMangleDeclName(D: ND)) { |
| 310 | SmallString<256> Buffer; |
| 311 | llvm::raw_svector_ostream Out(Buffer); |
| 312 | MC->mangleName(GD, Out); |
| 313 | DeviceSideName = std::string(Out.str()); |
| 314 | } else |
| 315 | DeviceSideName = std::string(ND->getIdentifier()->getName()); |
| 316 | |
| 317 | // Make unique name for device side static file-scope variable for HIP. |
| 318 | if (CGM.getContext().shouldExternalize(D: ND) && |
| 319 | CGM.getLangOpts().GPURelocatableDeviceCode) { |
| 320 | SmallString<256> Buffer; |
| 321 | llvm::raw_svector_ostream Out(Buffer); |
| 322 | Out << DeviceSideName; |
| 323 | CGM.printPostfixForExternalizedDecl(OS&: Out, D: ND); |
| 324 | DeviceSideName = std::string(Out.str()); |
| 325 | } |
| 326 | return DeviceSideName; |
| 327 | } |
| 328 | |
| 329 | void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, |
| 330 | FunctionArgList &Args) { |
| 331 | EmittedKernels.push_back(Elt: {.Kernel: CGF.CurFn, .D: CGF.CurFuncDecl}); |
| 332 | if (auto *GV = |
| 333 | dyn_cast<llvm::GlobalVariable>(Val: KernelHandles[CGF.CurFn->getName()])) { |
| 334 | GV->setLinkage(CGF.CurFn->getLinkage()); |
| 335 | GV->setInitializer(CGF.CurFn); |
| 336 | } |
| 337 | if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), |
| 338 | CudaFeature::CUDA_USES_NEW_LAUNCH) || |
| 339 | (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) || |
| 340 | (CGF.getLangOpts().OffloadViaLLVM)) |
| 341 | emitDeviceStubBodyNew(CGF, Args); |
| 342 | else |
| 343 | emitDeviceStubBodyLegacy(CGF, Args); |
| 344 | } |
| 345 | |
| 346 | /// Build the input as a sized array of pointers so that it can be launched by |
| 347 | /// the offloading runtime. |
| 348 | Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF, |
| 349 | FunctionArgList &Args) { |
| 350 | SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes; |
| 351 | for (auto &Arg : Args) |
| 352 | ArgTypes.push_back(Elt: CGF.ConvertTypeForMem(T: Arg->getType())); |
| 353 | llvm::StructType *KernelArgsTy = llvm::StructType::create(Elements: ArgTypes); |
| 354 | llvm::Type *KernelArgsPtrsTy = llvm::ArrayType::get(ElementType: PtrTy, NumElements: Args.size()); |
| 355 | |
| 356 | auto *Int32Ty = CGF.Builder.getInt32Ty(); |
| 357 | KernelLaunchParamsTypes.push_back(Elt: Int32Ty); |
| 358 | KernelLaunchParamsTypes.push_back(Elt: PtrTy); |
| 359 | |
| 360 | llvm::StructType *KernelLaunchParamsTy = |
| 361 | llvm::StructType::create(Elements: KernelLaunchParamsTypes); |
| 362 | Address KernelArgs = CGF.CreateTempAllocaWithoutCast( |
| 363 | Ty: KernelArgsTy, align: CharUnits::fromQuantity(Quantity: 16), Name: "kernel_args" ); |
| 364 | Address KernelArgsPtrs = CGF.CreateTempAllocaWithoutCast( |
| 365 | Ty: KernelArgsPtrsTy, align: CharUnits::fromQuantity(Quantity: 16), Name: "kernel_args_ptrs" ); |
| 366 | Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast( |
| 367 | Ty: KernelLaunchParamsTy, align: CharUnits::fromQuantity(Quantity: 16), |
| 368 | Name: "kernel_launch_params" ); |
| 369 | |
| 370 | CGF.Builder.CreateStore(Val: llvm::ConstantInt::get(Ty: Int32Ty, V: Args.size()), |
| 371 | Addr: CGF.Builder.CreateStructGEP(Addr: KernelLaunchParams, Index: 0)); |
| 372 | CGF.Builder.CreateStore(Val: KernelArgsPtrs.emitRawPointer(CGF), |
| 373 | Addr: CGF.Builder.CreateStructGEP(Addr: KernelLaunchParams, Index: 1)); |
| 374 | |
| 375 | for (unsigned i = 0; i < Args.size(); ++i) { |
| 376 | auto *ArgVal = CGF.Builder.CreateLoad(Addr: CGF.GetAddrOfLocalVar(VD: Args[i])); |
| 377 | Address ArgAddr = CGF.Builder.CreateStructGEP(Addr: KernelArgs, Index: i); |
| 378 | CGF.Builder.CreateStore(Val: ArgVal, Addr: ArgAddr); |
| 379 | CGF.Builder.CreateStore(Val: ArgAddr.emitRawPointer(CGF), |
| 380 | Addr: CGF.Builder.CreateConstArrayGEP(Addr: KernelArgsPtrs, Index: i)); |
| 381 | } |
| 382 | |
| 383 | return KernelLaunchParams; |
| 384 | } |
| 385 | |
| 386 | Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF, |
| 387 | FunctionArgList &Args) { |
| 388 | // Calculate amount of space we will need for all arguments. If we have no |
| 389 | // args, allocate a single pointer so we still have a valid pointer to the |
| 390 | // argument array that we can pass to runtime, even if it will be unused. |
| 391 | Address KernelArgs = CGF.CreateTempAlloca( |
| 392 | Ty: PtrTy, UseAddrSpace: LangAS::Default, align: CharUnits::fromQuantity(Quantity: 16), Name: "kernel_args" , |
| 393 | ArraySize: llvm::ConstantInt::get(Ty: SizeTy, V: std::max<size_t>(a: 1, b: Args.size()))); |
| 394 | // Store pointers to the arguments in a locally allocated launch_args. |
| 395 | for (unsigned i = 0; i < Args.size(); ++i) { |
| 396 | llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(VD: Args[i]).emitRawPointer(CGF); |
| 397 | llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(V: VarPtr, DestTy: PtrTy); |
| 398 | CGF.Builder.CreateDefaultAlignedStore( |
| 399 | Val: VoidVarPtr, Addr: CGF.Builder.CreateConstGEP1_32( |
| 400 | Ty: PtrTy, Ptr: KernelArgs.emitRawPointer(CGF), Idx0: i)); |
| 401 | } |
| 402 | return KernelArgs; |
| 403 | } |
| 404 | |
| 405 | // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local |
| 406 | // array and kernels are launched using cudaLaunchKernel(). |
| 407 | void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, |
| 408 | FunctionArgList &Args) { |
| 409 | // Build the shadow stack entry at the very start of the function. |
| 410 | Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM |
| 411 | ? prepareKernelArgsLLVMOffload(CGF, Args) |
| 412 | : prepareKernelArgs(CGF, Args); |
| 413 | |
| 414 | llvm::BasicBlock *EndBlock = CGF.createBasicBlock(name: "setup.end" ); |
| 415 | |
| 416 | // Lookup cudaLaunchKernel/hipLaunchKernel function. |
| 417 | // HIP kernel launching API name depends on -fgpu-default-stream option. For |
| 418 | // the default value 'legacy', it is hipLaunchKernel. For 'per-thread', |
| 419 | // it is hipLaunchKernel_spt. |
| 420 | // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, |
| 421 | // void **args, size_t sharedMem, |
| 422 | // cudaStream_t stream); |
| 423 | // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim, |
| 424 | // dim3 blockDim, void **args, |
| 425 | // size_t sharedMem, hipStream_t stream); |
| 426 | TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); |
| 427 | DeclContext *DC = TranslationUnitDecl::castToDeclContext(D: TUDecl); |
| 428 | std::string KernelLaunchAPI = "LaunchKernel" ; |
| 429 | if (CGF.getLangOpts().GPUDefaultStream == |
| 430 | LangOptions::GPUDefaultStreamKind::PerThread) { |
| 431 | if (CGF.getLangOpts().HIP) |
| 432 | KernelLaunchAPI = KernelLaunchAPI + "_spt" ; |
| 433 | else if (CGF.getLangOpts().CUDA) |
| 434 | KernelLaunchAPI = KernelLaunchAPI + "_ptsz" ; |
| 435 | } |
| 436 | auto LaunchKernelName = addPrefixToName(FuncName: KernelLaunchAPI); |
| 437 | const IdentifierInfo &cudaLaunchKernelII = |
| 438 | CGM.getContext().Idents.get(Name: LaunchKernelName); |
| 439 | FunctionDecl *cudaLaunchKernelFD = nullptr; |
| 440 | for (auto *Result : DC->lookup(Name: &cudaLaunchKernelII)) { |
| 441 | if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Val: Result)) |
| 442 | cudaLaunchKernelFD = FD; |
| 443 | } |
| 444 | |
| 445 | if (cudaLaunchKernelFD == nullptr) { |
| 446 | CGM.Error(loc: CGF.CurFuncDecl->getLocation(), |
| 447 | error: "Can't find declaration for " + LaunchKernelName); |
| 448 | return; |
| 449 | } |
| 450 | // Create temporary dim3 grid_dim, block_dim. |
| 451 | ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(i: 1); |
| 452 | QualType Dim3Ty = GridDimParam->getType(); |
| 453 | Address GridDim = CGF.CreateMemTempWithoutCast( |
| 454 | T: Dim3Ty, Align: CharUnits::fromQuantity(Quantity: 8), Name: "grid_dim" ); |
| 455 | Address BlockDim = CGF.CreateMemTempWithoutCast( |
| 456 | T: Dim3Ty, Align: CharUnits::fromQuantity(Quantity: 8), Name: "block_dim" ); |
| 457 | Address ShmemSize = CGF.CreateTempAlloca(Ty: SizeTy, UseAddrSpace: LangAS::Default, |
| 458 | align: CGM.getSizeAlign(), Name: "shmem_size" ); |
| 459 | Address Stream = CGF.CreateTempAlloca(Ty: PtrTy, UseAddrSpace: LangAS::Default, |
| 460 | align: CGM.getPointerAlign(), Name: "stream" ); |
| 461 | llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( |
| 462 | Ty: llvm::FunctionType::get(Result: IntTy, |
| 463 | Params: {/*gridDim=*/GridDim.getType(), |
| 464 | /*blockDim=*/BlockDim.getType(), |
| 465 | /*ShmemSize=*/ShmemSize.getType(), |
| 466 | /*Stream=*/Stream.getType()}, |
| 467 | /*isVarArg=*/false), |
| 468 | Name: addUnderscoredPrefixToName(FuncName: "PopCallConfiguration" )); |
| 469 | |
| 470 | CGF.EmitRuntimeCallOrInvoke(callee: cudaPopConfigFn, args: {GridDim.emitRawPointer(CGF), |
| 471 | BlockDim.emitRawPointer(CGF), |
| 472 | ShmemSize.emitRawPointer(CGF), |
| 473 | Stream.emitRawPointer(CGF)}); |
| 474 | |
| 475 | // Emit the call to cudaLaunch |
| 476 | llvm::Value *Kernel = |
| 477 | CGF.Builder.CreatePointerCast(V: KernelHandles[CGF.CurFn->getName()], DestTy: PtrTy); |
| 478 | CallArgList LaunchKernelArgs; |
| 479 | LaunchKernelArgs.add(rvalue: RValue::get(V: Kernel), |
| 480 | type: cudaLaunchKernelFD->getParamDecl(i: 0)->getType()); |
| 481 | LaunchKernelArgs.add(rvalue: RValue::getAggregate(addr: GridDim), type: Dim3Ty); |
| 482 | LaunchKernelArgs.add(rvalue: RValue::getAggregate(addr: BlockDim), type: Dim3Ty); |
| 483 | LaunchKernelArgs.add(rvalue: RValue::get(Addr: KernelArgs, CGF), |
| 484 | type: cudaLaunchKernelFD->getParamDecl(i: 3)->getType()); |
| 485 | LaunchKernelArgs.add(rvalue: RValue::get(V: CGF.Builder.CreateLoad(Addr: ShmemSize)), |
| 486 | type: cudaLaunchKernelFD->getParamDecl(i: 4)->getType()); |
| 487 | LaunchKernelArgs.add(rvalue: RValue::get(V: CGF.Builder.CreateLoad(Addr: Stream)), |
| 488 | type: cudaLaunchKernelFD->getParamDecl(i: 5)->getType()); |
| 489 | |
| 490 | QualType QT = cudaLaunchKernelFD->getType(); |
| 491 | QualType CQT = QT.getCanonicalType(); |
| 492 | llvm::Type *Ty = CGM.getTypes().ConvertType(T: CQT); |
| 493 | llvm::FunctionType *FTy = cast<llvm::FunctionType>(Val: Ty); |
| 494 | |
| 495 | const CGFunctionInfo &FI = |
| 496 | CGM.getTypes().arrangeFunctionDeclaration(GD: cudaLaunchKernelFD); |
| 497 | llvm::FunctionCallee cudaLaunchKernelFn = |
| 498 | CGM.CreateRuntimeFunction(Ty: FTy, Name: LaunchKernelName); |
| 499 | CGF.EmitCall(CallInfo: FI, Callee: CGCallee::forDirect(functionPtr: cudaLaunchKernelFn), ReturnValue: ReturnValueSlot(), |
| 500 | Args: LaunchKernelArgs); |
| 501 | |
| 502 | // To prevent CUDA device stub functions from being merged by ICF in MSVC |
| 503 | // environment, create an unique global variable for each kernel and write to |
| 504 | // the variable in the device stub. |
| 505 | if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() && |
| 506 | !CGF.getLangOpts().HIP) { |
| 507 | llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Val: Kernel); |
| 508 | std::string GlobalVarName = (KernelFunction->getName() + ".id" ).str(); |
| 509 | |
| 510 | llvm::GlobalVariable *HandleVar = |
| 511 | CGM.getModule().getNamedGlobal(Name: GlobalVarName); |
| 512 | if (!HandleVar) { |
| 513 | HandleVar = new llvm::GlobalVariable( |
| 514 | CGM.getModule(), CGM.Int8Ty, |
| 515 | /*Constant=*/false, KernelFunction->getLinkage(), |
| 516 | llvm::ConstantInt::get(Ty: CGM.Int8Ty, V: 0), GlobalVarName); |
| 517 | HandleVar->setDSOLocal(KernelFunction->isDSOLocal()); |
| 518 | HandleVar->setVisibility(KernelFunction->getVisibility()); |
| 519 | if (KernelFunction->hasComdat()) |
| 520 | HandleVar->setComdat(CGM.getModule().getOrInsertComdat(Name: GlobalVarName)); |
| 521 | } |
| 522 | |
| 523 | CGF.Builder.CreateAlignedStore(Val: llvm::ConstantInt::get(Ty: CGM.Int8Ty, V: 1), |
| 524 | Addr: HandleVar, Align: CharUnits::One(), |
| 525 | /*IsVolatile=*/true); |
| 526 | } |
| 527 | |
| 528 | CGF.EmitBranch(Block: EndBlock); |
| 529 | |
| 530 | CGF.EmitBlock(BB: EndBlock); |
| 531 | } |
| 532 | |
| 533 | void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, |
| 534 | FunctionArgList &Args) { |
| 535 | // Emit a call to cudaSetupArgument for each arg in Args. |
| 536 | llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); |
| 537 | llvm::BasicBlock *EndBlock = CGF.createBasicBlock(name: "setup.end" ); |
| 538 | CharUnits Offset = CharUnits::Zero(); |
| 539 | for (const VarDecl *A : Args) { |
| 540 | auto TInfo = CGM.getContext().getTypeInfoInChars(T: A->getType()); |
| 541 | Offset = Offset.alignTo(Align: TInfo.Align); |
| 542 | llvm::Value *Args[] = { |
| 543 | CGF.Builder.CreatePointerCast( |
| 544 | V: CGF.GetAddrOfLocalVar(VD: A).emitRawPointer(CGF), DestTy: PtrTy), |
| 545 | llvm::ConstantInt::get(Ty: SizeTy, V: TInfo.Width.getQuantity()), |
| 546 | llvm::ConstantInt::get(Ty: SizeTy, V: Offset.getQuantity()), |
| 547 | }; |
| 548 | llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(callee: cudaSetupArgFn, args: Args); |
| 549 | llvm::Constant *Zero = llvm::ConstantInt::get(Ty: IntTy, V: 0); |
| 550 | llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(LHS: CB, RHS: Zero); |
| 551 | llvm::BasicBlock *NextBlock = CGF.createBasicBlock(name: "setup.next" ); |
| 552 | CGF.Builder.CreateCondBr(Cond: CBZero, True: NextBlock, False: EndBlock); |
| 553 | CGF.EmitBlock(BB: NextBlock); |
| 554 | Offset += TInfo.Width; |
| 555 | } |
| 556 | |
| 557 | // Emit the call to cudaLaunch |
| 558 | llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); |
| 559 | llvm::Value *Arg = |
| 560 | CGF.Builder.CreatePointerCast(V: KernelHandles[CGF.CurFn->getName()], DestTy: PtrTy); |
| 561 | CGF.EmitRuntimeCallOrInvoke(callee: cudaLaunchFn, args: Arg); |
| 562 | CGF.EmitBranch(Block: EndBlock); |
| 563 | |
| 564 | CGF.EmitBlock(BB: EndBlock); |
| 565 | } |
| 566 | |
| 567 | // Replace the original variable Var with the address loaded from variable |
| 568 | // ManagedVar populated by HIP runtime. |
| 569 | static void replaceManagedVar(llvm::GlobalVariable *Var, |
| 570 | llvm::GlobalVariable *ManagedVar) { |
| 571 | SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList; |
| 572 | for (auto &&VarUse : Var->uses()) { |
| 573 | WorkList.push_back(Elt: {VarUse.getUser()}); |
| 574 | } |
| 575 | while (!WorkList.empty()) { |
| 576 | auto &&WorkItem = WorkList.pop_back_val(); |
| 577 | auto *U = WorkItem.back(); |
| 578 | if (isa<llvm::ConstantExpr>(Val: U)) { |
| 579 | for (auto &&UU : U->uses()) { |
| 580 | WorkItem.push_back(Elt: UU.getUser()); |
| 581 | WorkList.push_back(Elt: WorkItem); |
| 582 | WorkItem.pop_back(); |
| 583 | } |
| 584 | continue; |
| 585 | } |
| 586 | if (auto *I = dyn_cast<llvm::Instruction>(Val: U)) { |
| 587 | llvm::Value *OldV = Var; |
| 588 | llvm::Instruction *NewV = new llvm::LoadInst( |
| 589 | Var->getType(), ManagedVar, "ld.managed" , false, |
| 590 | llvm::Align(Var->getAlignment()), I->getIterator()); |
| 591 | WorkItem.pop_back(); |
| 592 | // Replace constant expressions directly or indirectly using the managed |
| 593 | // variable with instructions. |
| 594 | for (auto &&Op : WorkItem) { |
| 595 | auto *CE = cast<llvm::ConstantExpr>(Val: Op); |
| 596 | auto *NewInst = CE->getAsInstruction(); |
| 597 | NewInst->insertBefore(BB&: *I->getParent(), InsertPos: I->getIterator()); |
| 598 | NewInst->replaceUsesOfWith(From: OldV, To: NewV); |
| 599 | OldV = CE; |
| 600 | NewV = NewInst; |
| 601 | } |
| 602 | I->replaceUsesOfWith(From: OldV, To: NewV); |
| 603 | } else { |
| 604 | llvm_unreachable("Invalid use of managed variable" ); |
| 605 | } |
| 606 | } |
| 607 | } |
| 608 | |
| 609 | /// Creates a function that sets up state on the host side for CUDA objects that |
| 610 | /// have a presence on both the host and device sides. Specifically, registers |
| 611 | /// the host side of kernel functions and device global variables with the CUDA |
| 612 | /// runtime. |
| 613 | /// \code |
| 614 | /// void __cuda_register_globals(void** GpuBinaryHandle) { |
| 615 | /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); |
| 616 | /// ... |
| 617 | /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); |
| 618 | /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); |
| 619 | /// ... |
| 620 | /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); |
| 621 | /// } |
| 622 | /// \endcode |
| 623 | llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { |
| 624 | // No need to register anything |
| 625 | if (EmittedKernels.empty() && DeviceVars.empty()) |
| 626 | return nullptr; |
| 627 | |
| 628 | llvm::Function *RegisterKernelsFunc = llvm::Function::Create( |
| 629 | Ty: getRegisterGlobalsFnTy(), Linkage: llvm::GlobalValue::InternalLinkage, |
| 630 | N: addUnderscoredPrefixToName(FuncName: "_register_globals" ), M: &TheModule); |
| 631 | llvm::BasicBlock *EntryBB = |
| 632 | llvm::BasicBlock::Create(Context, Name: "entry" , Parent: RegisterKernelsFunc); |
| 633 | CGBuilderTy Builder(CGM, Context); |
| 634 | Builder.SetInsertPoint(EntryBB); |
| 635 | |
| 636 | // void __cudaRegisterFunction(void **, const char *, char *, const char *, |
| 637 | // int, uint3*, uint3*, dim3*, dim3*, int*) |
| 638 | llvm::Type *RegisterFuncParams[] = { |
| 639 | PtrTy, PtrTy, PtrTy, PtrTy, IntTy, |
| 640 | PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(C&: Context)}; |
| 641 | llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction( |
| 642 | Ty: llvm::FunctionType::get(Result: IntTy, Params: RegisterFuncParams, isVarArg: false), |
| 643 | Name: addUnderscoredPrefixToName(FuncName: "RegisterFunction" )); |
| 644 | |
| 645 | // Extract GpuBinaryHandle passed as the first argument passed to |
| 646 | // __cuda_register_globals() and generate __cudaRegisterFunction() call for |
| 647 | // each emitted kernel. |
| 648 | llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); |
| 649 | for (auto &&I : EmittedKernels) { |
| 650 | llvm::Constant *KernelName = |
| 651 | makeConstantString(Str: getDeviceSideName(ND: cast<NamedDecl>(Val: I.D))); |
| 652 | llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(T: PtrTy); |
| 653 | llvm::Value *Args[] = { |
| 654 | &GpuBinaryHandlePtr, |
| 655 | KernelHandles[I.Kernel->getName()], |
| 656 | KernelName, |
| 657 | KernelName, |
| 658 | llvm::ConstantInt::getAllOnesValue(Ty: IntTy), |
| 659 | NullPtr, |
| 660 | NullPtr, |
| 661 | NullPtr, |
| 662 | NullPtr, |
| 663 | llvm::ConstantPointerNull::get(T: llvm::PointerType::getUnqual(C&: Context))}; |
| 664 | Builder.CreateCall(Callee: RegisterFunc, Args); |
| 665 | } |
| 666 | |
| 667 | llvm::Type *VarSizeTy = IntTy; |
| 668 | // For HIP or CUDA 9.0+, device variable size is type of `size_t`. |
| 669 | if (CGM.getLangOpts().HIP || |
| 670 | ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90) |
| 671 | VarSizeTy = SizeTy; |
| 672 | |
| 673 | // void __cudaRegisterVar(void **, char *, char *, const char *, |
| 674 | // int, int, int, int) |
| 675 | llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy, |
| 676 | IntTy, VarSizeTy, IntTy, IntTy}; |
| 677 | llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( |
| 678 | Ty: llvm::FunctionType::get(Result: VoidTy, Params: RegisterVarParams, isVarArg: false), |
| 679 | Name: addUnderscoredPrefixToName(FuncName: "RegisterVar" )); |
| 680 | // void __hipRegisterManagedVar(void **, char *, char *, const char *, |
| 681 | // size_t, unsigned) |
| 682 | llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy, |
| 683 | PtrTy, VarSizeTy, IntTy}; |
| 684 | llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction( |
| 685 | Ty: llvm::FunctionType::get(Result: VoidTy, Params: RegisterManagedVarParams, isVarArg: false), |
| 686 | Name: addUnderscoredPrefixToName(FuncName: "RegisterManagedVar" )); |
| 687 | // void __cudaRegisterSurface(void **, const struct surfaceReference *, |
| 688 | // const void **, const char *, int, int); |
| 689 | llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction( |
| 690 | Ty: llvm::FunctionType::get( |
| 691 | Result: VoidTy, Params: {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, isVarArg: false), |
| 692 | Name: addUnderscoredPrefixToName(FuncName: "RegisterSurface" )); |
| 693 | // void __cudaRegisterTexture(void **, const struct textureReference *, |
| 694 | // const void **, const char *, int, int, int) |
| 695 | llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction( |
| 696 | Ty: llvm::FunctionType::get( |
| 697 | Result: VoidTy, Params: {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, isVarArg: false), |
| 698 | Name: addUnderscoredPrefixToName(FuncName: "RegisterTexture" )); |
| 699 | for (auto &&Info : DeviceVars) { |
| 700 | llvm::GlobalVariable *Var = Info.Var; |
| 701 | assert((!Var->isDeclaration() || Info.Flags.isManaged()) && |
| 702 | "External variables should not show up here, except HIP managed " |
| 703 | "variables" ); |
| 704 | llvm::Constant *VarName = makeConstantString(Str: getDeviceSideName(ND: Info.D)); |
| 705 | switch (Info.Flags.getKind()) { |
| 706 | case DeviceVarFlags::Variable: { |
| 707 | uint64_t VarSize = |
| 708 | CGM.getDataLayout().getTypeAllocSize(Ty: Var->getValueType()); |
| 709 | if (Info.Flags.isManaged()) { |
| 710 | assert(Var->getName().ends_with(".managed" ) && |
| 711 | "HIP managed variables not transformed" ); |
| 712 | auto *ManagedVar = CGM.getModule().getNamedGlobal( |
| 713 | Name: Var->getName().drop_back(N: StringRef(".managed" ).size())); |
| 714 | llvm::Value *Args[] = { |
| 715 | &GpuBinaryHandlePtr, |
| 716 | ManagedVar, |
| 717 | Var, |
| 718 | VarName, |
| 719 | llvm::ConstantInt::get(Ty: VarSizeTy, V: VarSize), |
| 720 | llvm::ConstantInt::get(Ty: IntTy, V: Var->getAlignment())}; |
| 721 | if (!Var->isDeclaration()) |
| 722 | Builder.CreateCall(Callee: RegisterManagedVar, Args); |
| 723 | } else { |
| 724 | llvm::Value *Args[] = { |
| 725 | &GpuBinaryHandlePtr, |
| 726 | Var, |
| 727 | VarName, |
| 728 | VarName, |
| 729 | llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isExtern()), |
| 730 | llvm::ConstantInt::get(Ty: VarSizeTy, V: VarSize), |
| 731 | llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isConstant()), |
| 732 | llvm::ConstantInt::get(Ty: IntTy, V: 0)}; |
| 733 | Builder.CreateCall(Callee: RegisterVar, Args); |
| 734 | } |
| 735 | break; |
| 736 | } |
| 737 | case DeviceVarFlags::Surface: |
| 738 | Builder.CreateCall( |
| 739 | Callee: RegisterSurf, |
| 740 | Args: {&GpuBinaryHandlePtr, Var, VarName, VarName, |
| 741 | llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.getSurfTexType()), |
| 742 | llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isExtern())}); |
| 743 | break; |
| 744 | case DeviceVarFlags::Texture: |
| 745 | Builder.CreateCall( |
| 746 | Callee: RegisterTex, |
| 747 | Args: {&GpuBinaryHandlePtr, Var, VarName, VarName, |
| 748 | llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.getSurfTexType()), |
| 749 | llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isNormalized()), |
| 750 | llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isExtern())}); |
| 751 | break; |
| 752 | } |
| 753 | } |
| 754 | |
| 755 | // Register the per-TU offload-profiling shadow so the host runtime can |
| 756 | // locate the matching device-side __llvm_profile_sections_<CUID>. We |
| 757 | // emit both __hipRegisterVar (so the HIP runtime can map the host |
| 758 | // shadow to the device symbol) and |
| 759 | // __llvm_profile_offload_register_shadow_variable (so the profile |
| 760 | // runtime adds the shadow to its drain list). |
| 761 | if (OffloadProfShadow) { |
| 762 | llvm::Constant *Name = |
| 763 | makeConstantString(Str: std::string(OffloadProfShadow->getName())); |
| 764 | llvm::Constant *IntZero = llvm::ConstantInt::get(Ty: IntTy, V: 0); |
| 765 | llvm::Value *RegisterVarArgs[] = { |
| 766 | &GpuBinaryHandlePtr, |
| 767 | OffloadProfShadow, |
| 768 | Name, |
| 769 | Name, |
| 770 | IntZero, |
| 771 | llvm::ConstantInt::get(Ty: VarSizeTy, |
| 772 | V: CGM.getDataLayout().getPointerSize(/*AS=*/0)), |
| 773 | IntZero, |
| 774 | IntZero}; |
| 775 | Builder.CreateCall(Callee: RegisterVar, Args: RegisterVarArgs); |
| 776 | |
| 777 | llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( |
| 778 | Ty: llvm::FunctionType::get(Result: VoidTy, Params: {PtrTy}, isVarArg: false), |
| 779 | Name: "__llvm_profile_offload_register_shadow_variable" ); |
| 780 | Builder.CreateCall(Callee: RegisterShadow, Args: {OffloadProfShadow}); |
| 781 | } |
| 782 | |
| 783 | if (!OffloadProfSectionShadows.empty()) { |
| 784 | llvm::FunctionCallee RegisterSectionShadow = CGM.CreateRuntimeFunction( |
| 785 | Ty: llvm::FunctionType::get(Result: VoidTy, Params: {PtrTy}, isVarArg: false), |
| 786 | Name: "__llvm_profile_offload_register_section_shadow_variable" ); |
| 787 | llvm::Constant *IntZero = llvm::ConstantInt::get(Ty: IntTy, V: 0); |
| 788 | for (const auto &Info : OffloadProfSectionShadows) { |
| 789 | llvm::Constant *Name = makeConstantString(Str: Info.DeviceName); |
| 790 | llvm::Value *RegisterVarArgs[] = { |
| 791 | &GpuBinaryHandlePtr, |
| 792 | Info.Shadow, |
| 793 | Name, |
| 794 | Name, |
| 795 | IntZero, |
| 796 | llvm::ConstantInt::get(Ty: VarSizeTy, |
| 797 | V: CGM.getDataLayout().getPointerSize(/*AS=*/0)), |
| 798 | IntZero, |
| 799 | IntZero}; |
| 800 | Builder.CreateCall(Callee: RegisterVar, Args: RegisterVarArgs); |
| 801 | Builder.CreateCall(Callee: RegisterSectionShadow, Args: {Info.Shadow}); |
| 802 | } |
| 803 | } |
| 804 | |
| 805 | Builder.CreateRetVoid(); |
| 806 | return RegisterKernelsFunc; |
| 807 | } |
| 808 | |
| 809 | /// Creates a global constructor function for the module: |
| 810 | /// |
| 811 | /// For CUDA: |
| 812 | /// \code |
| 813 | /// void __cuda_module_ctor() { |
| 814 | /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); |
| 815 | /// __cuda_register_globals(Handle); |
| 816 | /// } |
| 817 | /// \endcode |
| 818 | /// |
| 819 | /// For HIP: |
| 820 | /// \code |
| 821 | /// void __hip_module_ctor() { |
| 822 | /// if (__hip_gpubin_handle == 0) { |
| 823 | /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); |
| 824 | /// __hip_register_globals(__hip_gpubin_handle); |
| 825 | /// } |
| 826 | /// } |
| 827 | /// \endcode |
| 828 | llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { |
| 829 | bool IsHIP = CGM.getLangOpts().HIP; |
| 830 | bool IsCUDA = CGM.getLangOpts().CUDA; |
| 831 | // No need to generate ctors/dtors if there is no GPU binary. |
| 832 | StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; |
| 833 | if (CudaGpuBinaryFileName.empty() && !IsHIP) |
| 834 | return nullptr; |
| 835 | if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() && |
| 836 | DeviceVars.empty()) |
| 837 | return nullptr; |
| 838 | |
| 839 | // void __{cuda|hip}_register_globals(void* handle); |
| 840 | llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); |
| 841 | // We always need a function to pass in as callback. Create a dummy |
| 842 | // implementation if we don't need to register anything. |
| 843 | if (RelocatableDeviceCode && !RegisterGlobalsFunc) |
| 844 | RegisterGlobalsFunc = makeDummyFunction(FnTy: getRegisterGlobalsFnTy()); |
| 845 | |
| 846 | // void ** __{cuda|hip}RegisterFatBinary(void *); |
| 847 | llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( |
| 848 | Ty: llvm::FunctionType::get(Result: PtrTy, Params: PtrTy, isVarArg: false), |
| 849 | Name: addUnderscoredPrefixToName(FuncName: "RegisterFatBinary" )); |
| 850 | // struct { int magic, int version, void * gpu_binary, void * dont_care }; |
| 851 | llvm::StructType *FatbinWrapperTy = |
| 852 | llvm::StructType::get(elt1: IntTy, elts: IntTy, elts: PtrTy, elts: PtrTy); |
| 853 | |
| 854 | // Register GPU binary with the CUDA runtime, store returned handle in a |
| 855 | // global variable and save a reference in GpuBinaryHandle to be cleaned up |
| 856 | // in destructor on exit. Then associate all known kernels with the GPU binary |
| 857 | // handle so CUDA runtime can figure out what to call on the GPU side. |
| 858 | std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; |
| 859 | if (!CudaGpuBinaryFileName.empty()) { |
| 860 | auto VFS = CGM.getFileSystem(); |
| 861 | auto CudaGpuBinaryOrErr = |
| 862 | VFS->getBufferForFile(Name: CudaGpuBinaryFileName, FileSize: -1, RequiresNullTerminator: false); |
| 863 | if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { |
| 864 | CGM.getDiags().Report(DiagID: diag::err_cannot_open_file) |
| 865 | << CudaGpuBinaryFileName << EC.message(); |
| 866 | return nullptr; |
| 867 | } |
| 868 | CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); |
| 869 | } |
| 870 | |
| 871 | llvm::Function *ModuleCtorFunc = llvm::Function::Create( |
| 872 | Ty: llvm::FunctionType::get(Result: VoidTy, isVarArg: false), |
| 873 | Linkage: llvm::GlobalValue::InternalLinkage, |
| 874 | N: addUnderscoredPrefixToName(FuncName: "_module_ctor" ), M: &TheModule); |
| 875 | llvm::BasicBlock *CtorEntryBB = |
| 876 | llvm::BasicBlock::Create(Context, Name: "entry" , Parent: ModuleCtorFunc); |
| 877 | CGBuilderTy CtorBuilder(CGM, Context); |
| 878 | |
| 879 | CtorBuilder.SetInsertPoint(CtorEntryBB); |
| 880 | |
| 881 | const char *FatbinConstantName; |
| 882 | const char *FatbinSectionName; |
| 883 | const char *ModuleIDSectionName; |
| 884 | StringRef ModuleIDPrefix; |
| 885 | llvm::Constant *FatBinStr; |
| 886 | unsigned FatMagic; |
| 887 | if (IsHIP) { |
| 888 | // On macOS (Mach-O), section names must be in "segment,section" format. |
| 889 | FatbinConstantName = |
| 890 | CGM.getTriple().isMacOSX() ? "__HIP,__hip_fatbin" : ".hip_fatbin" ; |
| 891 | FatbinSectionName = |
| 892 | CGM.getTriple().isMacOSX() ? "__HIP,__fatbin" : ".hipFatBinSegment" ; |
| 893 | |
| 894 | ModuleIDSectionName = |
| 895 | CGM.getTriple().isMacOSX() ? "__HIP,__module_id" : "__hip_module_id" ; |
| 896 | ModuleIDPrefix = "__hip_" ; |
| 897 | |
| 898 | if (CudaGpuBinary) { |
| 899 | // If fatbin is available from early finalization, create a string |
| 900 | // literal containing the fat binary loaded from the given file. |
| 901 | const unsigned HIPCodeObjectAlign = 4096; |
| 902 | FatBinStr = makeConstantArray(Str: std::string(CudaGpuBinary->getBuffer()), Name: "" , |
| 903 | SectionName: FatbinConstantName, Alignment: HIPCodeObjectAlign); |
| 904 | } else { |
| 905 | // If fatbin is not available, create an external symbol |
| 906 | // __hip_fatbin in section .hip_fatbin. The external symbol is supposed |
| 907 | // to contain the fat binary but will be populated somewhere else, |
| 908 | // e.g. by lld through link script. |
| 909 | FatBinStr = new llvm::GlobalVariable( |
| 910 | CGM.getModule(), CGM.Int8Ty, |
| 911 | /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, |
| 912 | "__hip_fatbin" + (CGM.getLangOpts().CUID.empty() |
| 913 | ? "" |
| 914 | : "_" + CGM.getContext().getCUIDHash()), |
| 915 | nullptr, llvm::GlobalVariable::NotThreadLocal); |
| 916 | cast<llvm::GlobalVariable>(Val: FatBinStr)->setSection(FatbinConstantName); |
| 917 | } |
| 918 | |
| 919 | FatMagic = HIPFatMagic; |
| 920 | } else { |
| 921 | if (RelocatableDeviceCode) |
| 922 | FatbinConstantName = CGM.getTriple().isMacOSX() |
| 923 | ? "__NV_CUDA,__nv_relfatbin" |
| 924 | : "__nv_relfatbin" ; |
| 925 | else |
| 926 | FatbinConstantName = |
| 927 | CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin" ; |
| 928 | // NVIDIA's cuobjdump looks for fatbins in this section. |
| 929 | FatbinSectionName = |
| 930 | CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment" ; |
| 931 | |
| 932 | ModuleIDSectionName = CGM.getTriple().isMacOSX() |
| 933 | ? "__NV_CUDA,__nv_module_id" |
| 934 | : "__nv_module_id" ; |
| 935 | ModuleIDPrefix = "__nv_" ; |
| 936 | |
| 937 | // For CUDA, create a string literal containing the fat binary loaded from |
| 938 | // the given file. |
| 939 | FatBinStr = makeConstantArray(Str: std::string(CudaGpuBinary->getBuffer()), Name: "" , |
| 940 | SectionName: FatbinConstantName, Alignment: 8); |
| 941 | FatMagic = CudaFatMagic; |
| 942 | } |
| 943 | |
| 944 | // Create initialized wrapper structure that points to the loaded GPU binary |
| 945 | ConstantInitBuilder Builder(CGM); |
| 946 | auto Values = Builder.beginStruct(structTy: FatbinWrapperTy); |
| 947 | // Fatbin wrapper magic. |
| 948 | Values.addInt(intTy: IntTy, value: FatMagic); |
| 949 | // Fatbin version. |
| 950 | Values.addInt(intTy: IntTy, value: 1); |
| 951 | // Data. |
| 952 | Values.add(value: FatBinStr); |
| 953 | // Unused in fatbin v1. |
| 954 | Values.add(value: llvm::ConstantPointerNull::get(T: PtrTy)); |
| 955 | llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( |
| 956 | args: addUnderscoredPrefixToName(FuncName: "_fatbin_wrapper" ), args: CGM.getPointerAlign(), |
| 957 | /*constant*/ args: true); |
| 958 | FatbinWrapper->setSection(FatbinSectionName); |
| 959 | CGM.getSanitizerMetadata()->disableSanitizerForGlobal(GV: FatbinWrapper); |
| 960 | |
| 961 | // There is only one HIP fat binary per linked module, however there are |
| 962 | // multiple constructor functions. Make sure the fat binary is registered |
| 963 | // only once. The constructor functions are executed by the dynamic loader |
| 964 | // before the program gains control. The dynamic loader cannot execute the |
| 965 | // constructor functions concurrently since doing that would not guarantee |
| 966 | // thread safety of the loaded program. Therefore we can assume sequential |
| 967 | // execution of constructor functions here. |
| 968 | if (IsHIP) { |
| 969 | auto Linkage = RelocatableDeviceCode ? llvm::GlobalValue::ExternalLinkage |
| 970 | : llvm::GlobalValue::InternalLinkage; |
| 971 | llvm::BasicBlock *IfBlock = |
| 972 | llvm::BasicBlock::Create(Context, Name: "if" , Parent: ModuleCtorFunc); |
| 973 | llvm::BasicBlock *ExitBlock = |
| 974 | llvm::BasicBlock::Create(Context, Name: "exit" , Parent: ModuleCtorFunc); |
| 975 | // The name, size, and initialization pattern of this variable is part |
| 976 | // of HIP ABI. |
| 977 | GpuBinaryHandle = new llvm::GlobalVariable( |
| 978 | TheModule, PtrTy, /*isConstant=*/false, Linkage, |
| 979 | /*Initializer=*/ |
| 980 | !RelocatableDeviceCode ? llvm::ConstantPointerNull::get(T: PtrTy) |
| 981 | : nullptr, |
| 982 | "__hip_gpubin_handle" + (CGM.getLangOpts().CUID.empty() |
| 983 | ? "" |
| 984 | : "_" + CGM.getContext().getCUIDHash())); |
| 985 | GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); |
| 986 | // Prevent the weak symbol in different shared libraries being merged. |
| 987 | if (Linkage != llvm::GlobalValue::InternalLinkage) |
| 988 | GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); |
| 989 | Address GpuBinaryAddr( |
| 990 | GpuBinaryHandle, PtrTy, |
| 991 | CharUnits::fromQuantity(Quantity: GpuBinaryHandle->getAlignment())); |
| 992 | { |
| 993 | auto *HandleValue = CtorBuilder.CreateLoad(Addr: GpuBinaryAddr); |
| 994 | llvm::Constant *Zero = |
| 995 | llvm::Constant::getNullValue(Ty: HandleValue->getType()); |
| 996 | llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(LHS: HandleValue, RHS: Zero); |
| 997 | CtorBuilder.CreateCondBr(Cond: EQZero, True: IfBlock, False: ExitBlock); |
| 998 | } |
| 999 | { |
| 1000 | CtorBuilder.SetInsertPoint(IfBlock); |
| 1001 | // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); |
| 1002 | llvm::CallInst *RegisterFatbinCall = |
| 1003 | CtorBuilder.CreateCall(Callee: RegisterFatbinFunc, Args: FatbinWrapper); |
| 1004 | CtorBuilder.CreateStore(Val: RegisterFatbinCall, Addr: GpuBinaryAddr); |
| 1005 | CtorBuilder.CreateBr(Dest: ExitBlock); |
| 1006 | } |
| 1007 | { |
| 1008 | CtorBuilder.SetInsertPoint(ExitBlock); |
| 1009 | // Call __hip_register_globals(GpuBinaryHandle); |
| 1010 | if (RegisterGlobalsFunc) { |
| 1011 | auto *HandleValue = CtorBuilder.CreateLoad(Addr: GpuBinaryAddr); |
| 1012 | CtorBuilder.CreateCall(Callee: RegisterGlobalsFunc, Args: HandleValue); |
| 1013 | } |
| 1014 | } |
| 1015 | } else if (!RelocatableDeviceCode) { |
| 1016 | // Register binary with CUDA runtime. This is substantially different in |
| 1017 | // default mode vs. separate compilation! |
| 1018 | // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); |
| 1019 | llvm::CallInst *RegisterFatbinCall = |
| 1020 | CtorBuilder.CreateCall(Callee: RegisterFatbinFunc, Args: FatbinWrapper); |
| 1021 | GpuBinaryHandle = new llvm::GlobalVariable( |
| 1022 | TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage, |
| 1023 | llvm::ConstantPointerNull::get(T: PtrTy), "__cuda_gpubin_handle" ); |
| 1024 | GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); |
| 1025 | CtorBuilder.CreateAlignedStore(Val: RegisterFatbinCall, Addr: GpuBinaryHandle, |
| 1026 | Align: CGM.getPointerAlign()); |
| 1027 | |
| 1028 | // Call __cuda_register_globals(GpuBinaryHandle); |
| 1029 | if (RegisterGlobalsFunc) |
| 1030 | CtorBuilder.CreateCall(Callee: RegisterGlobalsFunc, Args: RegisterFatbinCall); |
| 1031 | |
| 1032 | // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it. |
| 1033 | if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), |
| 1034 | CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { |
| 1035 | // void __cudaRegisterFatBinaryEnd(void **); |
| 1036 | llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( |
| 1037 | Ty: llvm::FunctionType::get(Result: VoidTy, Params: PtrTy, isVarArg: false), |
| 1038 | Name: "__cudaRegisterFatBinaryEnd" ); |
| 1039 | CtorBuilder.CreateCall(Callee: RegisterFatbinEndFunc, Args: RegisterFatbinCall); |
| 1040 | } |
| 1041 | } else { |
| 1042 | // Generate a unique module ID. |
| 1043 | SmallString<64> ModuleID; |
| 1044 | llvm::raw_svector_ostream OS(ModuleID); |
| 1045 | OS << ModuleIDPrefix << llvm::format(Fmt: "%" PRIx64, Vals: FatbinWrapper->getGUID()); |
| 1046 | llvm::Constant *ModuleIDConstant = makeConstantArray( |
| 1047 | Str: std::string(ModuleID), Name: "" , SectionName: ModuleIDSectionName, Alignment: 32, /*AddNull=*/true); |
| 1048 | |
| 1049 | // Create an alias for the FatbinWrapper that nvcc will look for. |
| 1050 | llvm::GlobalAlias::create(Linkage: llvm::GlobalValue::ExternalLinkage, |
| 1051 | Name: Twine("__fatbinwrap" ) + ModuleID, Aliasee: FatbinWrapper); |
| 1052 | |
| 1053 | // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *, |
| 1054 | // void *, void (*)(void **)) |
| 1055 | SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary" ); |
| 1056 | RegisterLinkedBinaryName += ModuleID; |
| 1057 | llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( |
| 1058 | Ty: getRegisterLinkedBinaryFnTy(), Name: RegisterLinkedBinaryName); |
| 1059 | |
| 1060 | assert(RegisterGlobalsFunc && "Expecting at least dummy function!" ); |
| 1061 | llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant, |
| 1062 | makeDummyFunction(FnTy: getCallbackFnTy())}; |
| 1063 | CtorBuilder.CreateCall(Callee: RegisterLinkedBinaryFunc, Args); |
| 1064 | } |
| 1065 | |
| 1066 | // Create destructor and register it with atexit() the way NVCC does it. Doing |
| 1067 | // it during regular destructor phase worked in CUDA before 9.2 but results in |
| 1068 | // double-free in 9.2. |
| 1069 | if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { |
| 1070 | // extern "C" int atexit(void (*f)(void)); |
| 1071 | llvm::FunctionType *AtExitTy = |
| 1072 | llvm::FunctionType::get(Result: IntTy, Params: CleanupFn->getType(), isVarArg: false); |
| 1073 | llvm::FunctionCallee AtExitFunc = |
| 1074 | CGM.CreateRuntimeFunction(Ty: AtExitTy, Name: "atexit" , ExtraAttrs: llvm::AttributeList(), |
| 1075 | /*Local=*/true); |
| 1076 | CtorBuilder.CreateCall(Callee: AtExitFunc, Args: CleanupFn); |
| 1077 | } |
| 1078 | |
| 1079 | CtorBuilder.CreateRetVoid(); |
| 1080 | return ModuleCtorFunc; |
| 1081 | } |
| 1082 | |
| 1083 | /// Creates a global destructor function that unregisters the GPU code blob |
| 1084 | /// registered by constructor. |
| 1085 | /// |
| 1086 | /// For CUDA: |
| 1087 | /// \code |
| 1088 | /// void __cuda_module_dtor() { |
| 1089 | /// __cudaUnregisterFatBinary(Handle); |
| 1090 | /// } |
| 1091 | /// \endcode |
| 1092 | /// |
| 1093 | /// For HIP: |
| 1094 | /// \code |
| 1095 | /// void __hip_module_dtor() { |
| 1096 | /// if (__hip_gpubin_handle) { |
| 1097 | /// __hipUnregisterFatBinary(__hip_gpubin_handle); |
| 1098 | /// __hip_gpubin_handle = 0; |
| 1099 | /// } |
| 1100 | /// } |
| 1101 | /// \endcode |
| 1102 | llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { |
| 1103 | // No need for destructor if we don't have a handle to unregister. |
| 1104 | if (!GpuBinaryHandle) |
| 1105 | return nullptr; |
| 1106 | |
| 1107 | // void __cudaUnregisterFatBinary(void ** handle); |
| 1108 | llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( |
| 1109 | Ty: llvm::FunctionType::get(Result: VoidTy, Params: PtrTy, isVarArg: false), |
| 1110 | Name: addUnderscoredPrefixToName(FuncName: "UnregisterFatBinary" )); |
| 1111 | |
| 1112 | llvm::Function *ModuleDtorFunc = llvm::Function::Create( |
| 1113 | Ty: llvm::FunctionType::get(Result: VoidTy, isVarArg: false), |
| 1114 | Linkage: llvm::GlobalValue::InternalLinkage, |
| 1115 | N: addUnderscoredPrefixToName(FuncName: "_module_dtor" ), M: &TheModule); |
| 1116 | |
| 1117 | llvm::BasicBlock *DtorEntryBB = |
| 1118 | llvm::BasicBlock::Create(Context, Name: "entry" , Parent: ModuleDtorFunc); |
| 1119 | CGBuilderTy DtorBuilder(CGM, Context); |
| 1120 | DtorBuilder.SetInsertPoint(DtorEntryBB); |
| 1121 | |
| 1122 | Address GpuBinaryAddr( |
| 1123 | GpuBinaryHandle, GpuBinaryHandle->getValueType(), |
| 1124 | CharUnits::fromQuantity(Quantity: GpuBinaryHandle->getAlignment())); |
| 1125 | auto *HandleValue = DtorBuilder.CreateLoad(Addr: GpuBinaryAddr); |
| 1126 | // There is only one HIP fat binary per linked module, however there are |
| 1127 | // multiple destructor functions. Make sure the fat binary is unregistered |
| 1128 | // only once. |
| 1129 | if (CGM.getLangOpts().HIP) { |
| 1130 | llvm::BasicBlock *IfBlock = |
| 1131 | llvm::BasicBlock::Create(Context, Name: "if" , Parent: ModuleDtorFunc); |
| 1132 | llvm::BasicBlock *ExitBlock = |
| 1133 | llvm::BasicBlock::Create(Context, Name: "exit" , Parent: ModuleDtorFunc); |
| 1134 | llvm::Constant *Zero = llvm::Constant::getNullValue(Ty: HandleValue->getType()); |
| 1135 | llvm::Value *NEZero = DtorBuilder.CreateICmpNE(LHS: HandleValue, RHS: Zero); |
| 1136 | DtorBuilder.CreateCondBr(Cond: NEZero, True: IfBlock, False: ExitBlock); |
| 1137 | |
| 1138 | DtorBuilder.SetInsertPoint(IfBlock); |
| 1139 | DtorBuilder.CreateCall(Callee: UnregisterFatbinFunc, Args: HandleValue); |
| 1140 | DtorBuilder.CreateStore(Val: Zero, Addr: GpuBinaryAddr); |
| 1141 | DtorBuilder.CreateBr(Dest: ExitBlock); |
| 1142 | |
| 1143 | DtorBuilder.SetInsertPoint(ExitBlock); |
| 1144 | } else { |
| 1145 | DtorBuilder.CreateCall(Callee: UnregisterFatbinFunc, Args: HandleValue); |
| 1146 | } |
| 1147 | DtorBuilder.CreateRetVoid(); |
| 1148 | return ModuleDtorFunc; |
| 1149 | } |
| 1150 | |
| 1151 | CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { |
| 1152 | return new CGNVCUDARuntime(CGM); |
| 1153 | } |
| 1154 | |
| 1155 | void CGNVCUDARuntime::internalizeDeviceSideVar( |
| 1156 | const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) { |
| 1157 | // For -fno-gpu-rdc, host-side shadows of external declarations of device-side |
| 1158 | // global variables become internal definitions. These have to be internal in |
| 1159 | // order to prevent name conflicts with global host variables with the same |
| 1160 | // name in a different TUs. |
| 1161 | // |
| 1162 | // For -fgpu-rdc, the shadow variables should not be internalized because |
| 1163 | // they may be accessed by different TU. |
| 1164 | if (CGM.getLangOpts().GPURelocatableDeviceCode) |
| 1165 | return; |
| 1166 | |
| 1167 | // __shared__ variables are odd. Shadows do get created, but |
| 1168 | // they are not registered with the CUDA runtime, so they |
| 1169 | // can't really be used to access their device-side |
| 1170 | // counterparts. It's not clear yet whether it's nvcc's bug or |
| 1171 | // a feature, but we've got to do the same for compatibility. |
| 1172 | if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() || |
| 1173 | D->hasAttr<CUDASharedAttr>() || |
| 1174 | D->getType()->isCUDADeviceBuiltinSurfaceType() || |
| 1175 | D->getType()->isCUDADeviceBuiltinTextureType()) { |
| 1176 | Linkage = llvm::GlobalValue::InternalLinkage; |
| 1177 | } |
| 1178 | } |
| 1179 | |
| 1180 | void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D, |
| 1181 | llvm::GlobalVariable &GV) { |
| 1182 | if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) { |
| 1183 | // Shadow variables and their properties must be registered with CUDA |
| 1184 | // runtime. Skip Extern global variables, which will be registered in |
| 1185 | // the TU where they are defined. |
| 1186 | // |
| 1187 | // Don't register a C++17 inline variable. The local symbol can be |
| 1188 | // discarded and referencing a discarded local symbol from outside the |
| 1189 | // comdat (__cuda_register_globals) is disallowed by the ELF spec. |
| 1190 | // |
| 1191 | // HIP managed variables need to be always recorded in device and host |
| 1192 | // compilations for transformation. |
| 1193 | // |
| 1194 | // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are |
| 1195 | // added to llvm.compiler-used, therefore they are safe to be registered. |
| 1196 | if ((!D->hasExternalStorage() && !D->isInline()) || |
| 1197 | CGM.getContext().CUDADeviceVarODRUsedByHost.contains(key: D) || |
| 1198 | D->hasAttr<HIPManagedAttr>()) { |
| 1199 | registerDeviceVar(VD: D, Var&: GV, Extern: !D->hasDefinition(), |
| 1200 | Constant: D->hasAttr<CUDAConstantAttr>()); |
| 1201 | } |
| 1202 | } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() || |
| 1203 | D->getType()->isCUDADeviceBuiltinTextureType()) { |
| 1204 | // Builtin surfaces and textures and their template arguments are |
| 1205 | // also registered with CUDA runtime. |
| 1206 | const auto *TD = cast<ClassTemplateSpecializationDecl>( |
| 1207 | Val: D->getType()->castAsCXXRecordDecl()); |
| 1208 | const TemplateArgumentList &Args = TD->getTemplateArgs(); |
| 1209 | if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) { |
| 1210 | assert(Args.size() == 2 && |
| 1211 | "Unexpected number of template arguments of CUDA device " |
| 1212 | "builtin surface type." ); |
| 1213 | auto SurfType = Args[1].getAsIntegral(); |
| 1214 | if (!D->hasExternalStorage()) |
| 1215 | registerDeviceSurf(VD: D, Var&: GV, Extern: !D->hasDefinition(), Type: SurfType.getSExtValue()); |
| 1216 | } else { |
| 1217 | assert(Args.size() == 3 && |
| 1218 | "Unexpected number of template arguments of CUDA device " |
| 1219 | "builtin texture type." ); |
| 1220 | auto TexType = Args[1].getAsIntegral(); |
| 1221 | auto Normalized = Args[2].getAsIntegral(); |
| 1222 | if (!D->hasExternalStorage()) |
| 1223 | registerDeviceTex(VD: D, Var&: GV, Extern: !D->hasDefinition(), Type: TexType.getSExtValue(), |
| 1224 | Normalized: Normalized.getZExtValue()); |
| 1225 | } |
| 1226 | } |
| 1227 | } |
| 1228 | |
| 1229 | // Transform managed variables to pointers to managed variables in device code. |
| 1230 | // Each use of the original managed variable is replaced by a load from the |
| 1231 | // transformed managed variable. The transformed managed variable contains |
| 1232 | // the address of managed memory which will be allocated by the runtime. |
| 1233 | void CGNVCUDARuntime::transformManagedVars() { |
| 1234 | for (auto &&Info : DeviceVars) { |
| 1235 | llvm::GlobalVariable *Var = Info.Var; |
| 1236 | if (Info.Flags.getKind() == DeviceVarFlags::Variable && |
| 1237 | Info.Flags.isManaged()) { |
| 1238 | auto *ManagedVar = new llvm::GlobalVariable( |
| 1239 | CGM.getModule(), Var->getType(), |
| 1240 | /*isConstant=*/false, Var->getLinkage(), |
| 1241 | /*Init=*/Var->isDeclaration() |
| 1242 | ? nullptr |
| 1243 | : llvm::ConstantPointerNull::get(T: Var->getType()), |
| 1244 | /*Name=*/"" , /*InsertBefore=*/nullptr, |
| 1245 | llvm::GlobalVariable::NotThreadLocal, |
| 1246 | CGM.getContext().getTargetAddressSpace(AS: CGM.getLangOpts().CUDAIsDevice |
| 1247 | ? LangAS::cuda_device |
| 1248 | : LangAS::Default)); |
| 1249 | ManagedVar->setDSOLocal(Var->isDSOLocal()); |
| 1250 | ManagedVar->setVisibility(Var->getVisibility()); |
| 1251 | ManagedVar->setExternallyInitialized(true); |
| 1252 | replaceManagedVar(Var, ManagedVar); |
| 1253 | ManagedVar->takeName(V: Var); |
| 1254 | Var->setName(Twine(ManagedVar->getName()) + ".managed" ); |
| 1255 | // Keep managed variables even if they are not used in device code since |
| 1256 | // they need to be allocated by the runtime. |
| 1257 | if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) { |
| 1258 | assert(!ManagedVar->isDeclaration()); |
| 1259 | CGM.addCompilerUsedGlobal(GV: Var); |
| 1260 | CGM.addCompilerUsedGlobal(GV: ManagedVar); |
| 1261 | } |
| 1262 | } |
| 1263 | } |
| 1264 | } |
| 1265 | |
| 1266 | // Creates offloading entries for all the kernels and globals that must be |
| 1267 | // registered. The linker will provide a pointer to this section so we can |
| 1268 | // register the symbols with the linked device image. |
| 1269 | void CGNVCUDARuntime::createOffloadingEntries() { |
| 1270 | llvm::object::OffloadKind Kind = CGM.getLangOpts().HIP |
| 1271 | ? llvm::object::OffloadKind::OFK_HIP |
| 1272 | : llvm::object::OffloadKind::OFK_Cuda; |
| 1273 | // For now, just spoof this as OpenMP because that's the runtime it uses. |
| 1274 | if (CGM.getLangOpts().OffloadViaLLVM) |
| 1275 | Kind = llvm::object::OffloadKind::OFK_OpenMP; |
| 1276 | |
| 1277 | llvm::Module &M = CGM.getModule(); |
| 1278 | for (KernelInfo &I : EmittedKernels) |
| 1279 | llvm::offloading::emitOffloadingEntry( |
| 1280 | M, Kind, Addr: KernelHandles[I.Kernel->getName()], |
| 1281 | Name: getDeviceSideName(ND: cast<NamedDecl>(Val: I.D)), /*Flags=*/Size: 0, /*Data=*/Flags: 0, |
| 1282 | Data: llvm::offloading::OffloadGlobalEntry); |
| 1283 | |
| 1284 | for (VarInfo &I : DeviceVars) { |
| 1285 | uint64_t VarSize = |
| 1286 | CGM.getDataLayout().getTypeAllocSize(Ty: I.Var->getValueType()); |
| 1287 | int32_t Flags = |
| 1288 | (I.Flags.isExtern() |
| 1289 | ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern) |
| 1290 | : 0) | |
| 1291 | (I.Flags.isConstant() |
| 1292 | ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant) |
| 1293 | : 0) | |
| 1294 | (I.Flags.isNormalized() |
| 1295 | ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized) |
| 1296 | : 0); |
| 1297 | if (I.Flags.getKind() == DeviceVarFlags::Variable) { |
| 1298 | if (I.Flags.isManaged()) { |
| 1299 | assert(I.Var->getName().ends_with(".managed" ) && |
| 1300 | "HIP managed variables not transformed" ); |
| 1301 | |
| 1302 | auto *ManagedVar = M.getNamedGlobal( |
| 1303 | Name: I.Var->getName().drop_back(N: StringRef(".managed" ).size())); |
| 1304 | llvm::offloading::emitOffloadingEntry( |
| 1305 | M, Kind, Addr: I.Var, Name: getDeviceSideName(ND: I.D), Size: VarSize, |
| 1306 | Flags: llvm::offloading::OffloadGlobalManagedEntry | Flags, |
| 1307 | /*Data=*/I.Var->getAlignment(), AuxAddr: ManagedVar); |
| 1308 | } else { |
| 1309 | llvm::offloading::emitOffloadingEntry( |
| 1310 | M, Kind, Addr: I.Var, Name: getDeviceSideName(ND: I.D), Size: VarSize, |
| 1311 | Flags: llvm::offloading::OffloadGlobalEntry | Flags, |
| 1312 | /*Data=*/0); |
| 1313 | } |
| 1314 | } else if (I.Flags.getKind() == DeviceVarFlags::Surface) { |
| 1315 | llvm::offloading::emitOffloadingEntry( |
| 1316 | M, Kind, Addr: I.Var, Name: getDeviceSideName(ND: I.D), Size: VarSize, |
| 1317 | Flags: llvm::offloading::OffloadGlobalSurfaceEntry | Flags, |
| 1318 | Data: I.Flags.getSurfTexType()); |
| 1319 | } else if (I.Flags.getKind() == DeviceVarFlags::Texture) { |
| 1320 | llvm::offloading::emitOffloadingEntry( |
| 1321 | M, Kind, Addr: I.Var, Name: getDeviceSideName(ND: I.D), Size: VarSize, |
| 1322 | Flags: llvm::offloading::OffloadGlobalTextureEntry | Flags, |
| 1323 | Data: I.Flags.getSurfTexType()); |
| 1324 | } |
| 1325 | } |
| 1326 | |
| 1327 | // Register the per-TU offload-profiling shadow. The offloading entry |
| 1328 | // makes the linker-wrapper emit the host __hipRegisterVar call in the |
| 1329 | // combined ctor. Separately emit a per-TU ctor that registers the |
| 1330 | // shadow with the profile runtime's drain list. |
| 1331 | if (OffloadProfShadow) { |
| 1332 | llvm::offloading::emitOffloadingEntry( |
| 1333 | M, Kind, Addr: OffloadProfShadow, Name: OffloadProfShadow->getName(), |
| 1334 | Size: CGM.getDataLayout().getPointerSize(/*AS=*/0), |
| 1335 | Flags: llvm::offloading::OffloadGlobalEntry, /*Data=*/0); |
| 1336 | |
| 1337 | llvm::LLVMContext &Ctx = M.getContext(); |
| 1338 | auto *PtrTy = llvm::PointerType::getUnqual(C&: Ctx); |
| 1339 | llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( |
| 1340 | Ty: llvm::FunctionType::get(Result: VoidTy, Params: {PtrTy}, isVarArg: false), |
| 1341 | Name: "__llvm_profile_offload_register_shadow_variable" ); |
| 1342 | llvm::FunctionCallee RegisterSectionShadow = CGM.CreateRuntimeFunction( |
| 1343 | Ty: llvm::FunctionType::get(Result: VoidTy, Params: {PtrTy}, isVarArg: false), |
| 1344 | Name: "__llvm_profile_offload_register_section_shadow_variable" ); |
| 1345 | auto *CtorFn = llvm::Function::Create( |
| 1346 | Ty: llvm::FunctionType::get(Result: VoidTy, isVarArg: false), |
| 1347 | Linkage: llvm::GlobalValue::InternalLinkage, |
| 1348 | N: "__llvm_profile_register_shadow." + CGM.getContext().getCUIDHash(), M: &M); |
| 1349 | auto *Entry = llvm::BasicBlock::Create(Context&: Ctx, Name: "entry" , Parent: CtorFn); |
| 1350 | llvm::IRBuilder<> B(Entry); |
| 1351 | B.CreateCall(Callee: RegisterShadow, Args: {OffloadProfShadow}); |
| 1352 | for (const auto &Info : OffloadProfSectionShadows) { |
| 1353 | llvm::offloading::emitOffloadingEntry( |
| 1354 | M, Kind, Addr: Info.Shadow, Name: Info.DeviceName, |
| 1355 | Size: CGM.getDataLayout().getPointerSize(/*AS=*/0), |
| 1356 | Flags: llvm::offloading::OffloadGlobalEntry, /*Data=*/0); |
| 1357 | B.CreateCall(Callee: RegisterSectionShadow, Args: {Info.Shadow}); |
| 1358 | } |
| 1359 | B.CreateRetVoid(); |
| 1360 | llvm::appendToGlobalCtors(M, F: CtorFn, /*Priority=*/65535); |
| 1361 | } |
| 1362 | } |
| 1363 | |
| 1364 | // For HIP host+device compiles with PGO enabled, emit the host-side shadow for |
| 1365 | // the per-TU __llvm_profile_sections_<CUID> global. Device-side section table |
| 1366 | // emission is owned by InstrProfiling so it can be gated on real profile data. |
| 1367 | void CGNVCUDARuntime::emitOffloadProfilingSections() { |
| 1368 | if (!CGM.getLangOpts().HIP) |
| 1369 | return; |
| 1370 | if (!CGM.getCodeGenOpts().hasProfileInstr()) |
| 1371 | return; |
| 1372 | |
| 1373 | StringRef CUIDHash = CGM.getContext().getCUIDHash(); |
| 1374 | if (CUIDHash.empty()) |
| 1375 | return; |
| 1376 | |
| 1377 | llvm::Module &M = CGM.getModule(); |
| 1378 | llvm::LLVMContext &Ctx = M.getContext(); |
| 1379 | std::string Name = ("__llvm_profile_sections_" + CUIDHash).str(); |
| 1380 | |
| 1381 | // If the global already exists (e.g. another TU was merged in), don't |
| 1382 | // duplicate it. |
| 1383 | if (M.getNamedValue(Name)) |
| 1384 | return; |
| 1385 | |
| 1386 | if (CGM.getLangOpts().CUDAIsDevice) { |
| 1387 | // Device side: emit only the per-TU names postfix marker. The sections |
| 1388 | // struct is emitted later by the InstrProfiling pass, which emits it only |
| 1389 | // when the TU has profile data, avoiding dangling section references. |
| 1390 | unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace(); |
| 1391 | std::string NamesVarPostfixVarName = |
| 1392 | std::string(llvm::getInstrProfNamesVarPostfixVarName()); |
| 1393 | if (!M.getNamedValue(Name: NamesVarPostfixVarName)) { |
| 1394 | auto *NamesVarPostfix = llvm::ConstantDataArray::getString( |
| 1395 | Context&: Ctx, Initializer: (llvm::Twine("_" ) + CUIDHash).str(), AddNull: true); |
| 1396 | auto *NamesGV = new llvm::GlobalVariable( |
| 1397 | M, NamesVarPostfix->getType(), /*isConstant=*/true, |
| 1398 | llvm::GlobalValue::PrivateLinkage, NamesVarPostfix, |
| 1399 | NamesVarPostfixVarName, |
| 1400 | /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, |
| 1401 | GlobalAS); |
| 1402 | CGM.addCompilerUsedGlobal(GV: NamesGV); |
| 1403 | } |
| 1404 | return; |
| 1405 | } |
| 1406 | |
| 1407 | // Host side: emit an opaque void* shadow. Layout doesn't matter — the |
| 1408 | // runtime locates it by name via hipGetSymbolAddress and treats it as |
| 1409 | // the address of the device-side struct. Registration with the HIP |
| 1410 | // runtime is added by makeRegisterGlobalsFn (non-RDC) or |
| 1411 | // createOffloadingEntries (RDC). |
| 1412 | auto *PtrTy = llvm::PointerType::getUnqual(C&: Ctx); |
| 1413 | OffloadProfShadow = new llvm::GlobalVariable( |
| 1414 | M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage, |
| 1415 | llvm::ConstantPointerNull::get(T: PtrTy), Name); |
| 1416 | CGM.addCompilerUsedGlobal(GV: OffloadProfShadow); |
| 1417 | |
| 1418 | auto AddSectionShadow = [&](StringRef Kind, const Twine &DeviceName) { |
| 1419 | std::string ShadowName = |
| 1420 | (Twine("__llvm_profile_shadow_" ) + Kind + "_" + CUIDHash + "_" + |
| 1421 | Twine(OffloadProfSectionShadows.size())) |
| 1422 | .str(); |
| 1423 | auto *Shadow = new llvm::GlobalVariable( |
| 1424 | M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage, |
| 1425 | llvm::ConstantPointerNull::get(T: PtrTy), ShadowName); |
| 1426 | CGM.addCompilerUsedGlobal(GV: Shadow); |
| 1427 | OffloadProfSectionShadows.push_back(Elt: {.Shadow: Shadow, .DeviceName: DeviceName.str()}); |
| 1428 | }; |
| 1429 | |
| 1430 | // Keep this order in sync with the runtime: data, counters, uniform counters, |
| 1431 | // then names. |
| 1432 | for (auto &&I : EmittedKernels) { |
| 1433 | std::string KernelName = getDeviceSideName(ND: cast<NamedDecl>(Val: I.D)); |
| 1434 | AddSectionShadow("data" , Twine("__profd_" ) + KernelName); |
| 1435 | AddSectionShadow("cnts" , Twine("__profc_" ) + KernelName); |
| 1436 | AddSectionShadow("ucnts" , Twine("__llvm_prf_unifcnt_" ) + KernelName); |
| 1437 | AddSectionShadow("names" , |
| 1438 | Twine(llvm::getInstrProfNamesVarName()) + "_" + CUIDHash); |
| 1439 | } |
| 1440 | } |
| 1441 | |
| 1442 | // Returns module constructor to be added. |
| 1443 | llvm::Function *CGNVCUDARuntime::finalizeModule() { |
| 1444 | transformManagedVars(); |
| 1445 | emitOffloadProfilingSections(); |
| 1446 | if (CGM.getLangOpts().CUDAIsDevice) { |
| 1447 | // Mark ODR-used device variables as compiler used to prevent it from being |
| 1448 | // eliminated by optimization. This is necessary for device variables |
| 1449 | // ODR-used by host functions. Sema correctly marks them as ODR-used no |
| 1450 | // matter whether they are ODR-used by device or host functions. |
| 1451 | // |
| 1452 | // We do not need to do this if the variable has used attribute since it |
| 1453 | // has already been added. |
| 1454 | // |
| 1455 | // Static device variables have been externalized at this point, therefore |
| 1456 | // variables with LLVM private or internal linkage need not be added. |
| 1457 | for (auto &&Info : DeviceVars) { |
| 1458 | auto Kind = Info.Flags.getKind(); |
| 1459 | if (!Info.Var->isDeclaration() && |
| 1460 | !llvm::GlobalValue::isLocalLinkage(Linkage: Info.Var->getLinkage()) && |
| 1461 | (Kind == DeviceVarFlags::Variable || |
| 1462 | Kind == DeviceVarFlags::Surface || |
| 1463 | Kind == DeviceVarFlags::Texture) && |
| 1464 | Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) { |
| 1465 | CGM.addCompilerUsedGlobal(GV: Info.Var); |
| 1466 | } |
| 1467 | } |
| 1468 | return nullptr; |
| 1469 | } |
| 1470 | if (CGM.getLangOpts().OffloadViaLLVM || |
| 1471 | (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)) |
| 1472 | createOffloadingEntries(); |
| 1473 | else |
| 1474 | return makeModuleCtorFunction(); |
| 1475 | |
| 1476 | return nullptr; |
| 1477 | } |
| 1478 | |
| 1479 | llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F, |
| 1480 | GlobalDecl GD) { |
| 1481 | auto Loc = KernelHandles.find(Val: F->getName()); |
| 1482 | if (Loc != KernelHandles.end()) { |
| 1483 | auto OldHandle = Loc->second; |
| 1484 | if (KernelStubs[OldHandle] == F) |
| 1485 | return OldHandle; |
| 1486 | |
| 1487 | // We've found the function name, but F itself has changed, so we need to |
| 1488 | // update the references. |
| 1489 | if (CGM.getLangOpts().HIP) { |
| 1490 | // For HIP compilation the handle itself does not change, so we only need |
| 1491 | // to update the Stub value. |
| 1492 | KernelStubs[OldHandle] = F; |
| 1493 | return OldHandle; |
| 1494 | } |
| 1495 | // For non-HIP compilation, erase the old Stub and fall-through to creating |
| 1496 | // new entries. |
| 1497 | KernelStubs.erase(Val: OldHandle); |
| 1498 | } |
| 1499 | |
| 1500 | if (!CGM.getLangOpts().HIP) { |
| 1501 | KernelHandles[F->getName()] = F; |
| 1502 | KernelStubs[F] = F; |
| 1503 | return F; |
| 1504 | } |
| 1505 | |
| 1506 | auto *Var = new llvm::GlobalVariable( |
| 1507 | TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(), |
| 1508 | /*Initializer=*/nullptr, |
| 1509 | CGM.getMangledName( |
| 1510 | GD: GD.getWithKernelReferenceKind(Kind: KernelReferenceKind::Kernel))); |
| 1511 | Var->setAlignment(CGM.getPointerAlign().getAsAlign()); |
| 1512 | Var->setDSOLocal(F->isDSOLocal()); |
| 1513 | Var->setVisibility(F->getVisibility()); |
| 1514 | auto *FD = cast<FunctionDecl>(Val: GD.getDecl()); |
| 1515 | auto *FT = FD->getPrimaryTemplate(); |
| 1516 | if (!FT || FT->isThisDeclarationADefinition()) |
| 1517 | CGM.maybeSetTrivialComdat(D: *FD, GO&: *Var); |
| 1518 | KernelHandles[F->getName()] = Var; |
| 1519 | KernelStubs[Var] = F; |
| 1520 | return Var; |
| 1521 | } |
| 1522 | |