//===- AMDGPU.cpp ---------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }
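
  // Illustrative sketch (assumed lowering, hypothetical kernel): under HIP, a
  // kernel parameter declared as `float *P` is initially converted to a
  // generic `ptr` (address space 0); coerceKernelArgumentType rewrites it to
  // `ptr addrspace(1)`, since LangAS::cuda_device maps to the global address
  // space on amdgcn.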

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}
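
// Illustrative sketch of the limit: a homogeneous aggregate of four doubles
// needs 4 members * 2 registers = 8 registers and is accepted; eight members
// of a 128-bit vector type (8 * 4 = 32 registers) would exceed the
// 16-register budget.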

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}
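
// Illustrative sketch of the estimate: a <3 x float> vector counts as 3
// registers even though its reported in-memory size is 128 bits; a <4 x half>
// vector is packed into 2 registers; and a struct { double D; int I; } counts
// as 2 + 1 = 3 registers.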

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned ArgumentIndex = 0;
  const unsigned numFixedArguments = FI.getNumRequiredArgs();

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      bool FixedArgument = ArgumentIndex++ < numFixedArguments;
      Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
    }
  }
}

RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                QualType Ty, AggValueSlot Slot) const {
  const bool IsIndirect = false;
  const bool AllowHigherAlign = false;
  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                          getContext().getTypeInfoInChars(Ty),
                          CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}
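
// Illustrative sketch (assumed lowering): a 2-byte aggregate return value is
// coerced to i16, a 4-byte one to i32, and an 8-byte one to [2 x i32]; larger
// aggregates that still fit the 16-register estimate are returned with their
// natural direct type, and anything bigger falls back to the default
// (indirect, sret-style) handling.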

/// For kernels, all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to the global address space when using byref. This would require
  // implementing a new kind of coercion of the in-memory type for indirect
  // arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}
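
// Illustrative sketch (assumed lowering, hypothetical kernel): for a HIP
// kernel taking `struct S { int V[16]; }` by value, the argument is lowered
// to an indirect-aliased pointer in the constant address space, which shows
// up in IR roughly as `ptr addrspace(4) byref(%struct.S) align 4 %s`; under
// OpenCL the same aggregate stays direct with CanBeFlattened = false.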

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (Variadic) {
    return ABIArgInfo::getDirect(/*T=*/nullptr,
                                 /*Offset=*/0,
                                 /*Padding=*/nullptr,
                                 /*CanBeFlattened=*/false,
                                 /*Align=*/0);
  }

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // the function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}
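
// Illustrative sketch: while the 16-register estimate still has room, a
// 12-byte struct argument is passed directly and charged 3 registers; once
// the estimate runs out, the same struct is instead passed by reference in
// the private address space (address space 5 on amdgcn).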

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T,
                                 QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
} // namespace

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified.
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
  }
}
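
// Illustrative sketch of the resulting IR attributes (assumed defaults): an
// OpenCL kernel with no explicit attributes gets
// "amdgpu-flat-work-group-size"="1,256"; a HIP kernel uses the
// --gpu-max-threads-per-block value in place of 256; and an explicit
// __attribute__((amdgpu_num_vgpr(32))) additionally emits
// "amdgpu-num-vgpr"="32".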

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "__oclc_ABI_version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV &&
      !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
    return;

  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
      llvm::CodeObjectVersionKind::COV_None)
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}
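
// Illustrative sketch of the emitted control constant (assuming code object
// v5, i.e. a CodeObjectVersion value of 500):
//
//   @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4)
//       constant i32 500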

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in the private and local address spaces, a null
// pointer in the generic address space is emitted which is cast to a
// pointer in the local or private address space.
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}
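
// Illustrative sketch: a null pointer constant in the private address space
// is emitted roughly as
//
//   addrspacecast (ptr null to ptr addrspace(5))
//
// instead of `ptr addrspace(5) null`, since the target's null value for
// private and local pointers is not the all-zero bit pattern.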

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}
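
// Illustrative sketch of the mapping: HIPAgent with acquire ordering yields
// the sync scope "agent-one-as", the same scope with seq_cst yields "agent",
// and a system-scope seq_cst operation uses the default (empty) scope name.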

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on the stack, stores the block literal
/// to it, and passes its pointer to the block invoke function. The kernel
/// has the "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}
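
// Illustrative sketch (hypothetical names, assumed lowering): for an invoke
// function
//
//   define internal void @__block_invoke(ptr %block, ptr addrspace(3) %lds)
//
// the wrapper built above looks roughly like
//
//   define internal amdgpu_kernel void @__block_invoke_kernel(
//       <block literal struct> %block_literal, ptr addrspace(3) %local_arg1) {
//   entry:
//     %block = alloca <block literal struct>, addrspace(5)
//     store <block literal struct> %block_literal, ptr addrspace(5) %block
//     %cast = addrspacecast ptr addrspace(5) %block to ptr
//     call void @__block_invoke(ptr %cast, ptr addrspace(3) %local_arg1)
//     ret void
//   }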

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal to Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}
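
// Illustrative sketch: __attribute__((amdgpu_flat_work_group_size(64, 256)))
// yields "amdgpu-flat-work-group-size"="64,256"; with only
// __attribute__((reqd_work_group_size(8, 8, 2))) under OpenCL, the flat size
// is the product of the dimensions and the attribute becomes "128,128".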

void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal to Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}
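
// Illustrative sketch: __attribute__((amdgpu_waves_per_eu(2, 4))) yields
// "amdgpu-waves-per-eu"="2,4", while a minimum-only form such as
// __attribute__((amdgpu_waves_per_eu(2))) yields just "2".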

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}