//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
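  // For example, a HIP kernel parameter declared as `int *P` is lowered with a
  // global (`addrspace(1)`) pointer type rather than a generic `ptr`, so the
  // backend does not have to go through flat addressing for it:
  //
  //   __global__ void k(int *P);  // roughly lowers to: ptr addrspace(1) %P
  //
  // Non-pointer types and pointers outside FromAS are returned unchanged.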
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;

  llvm::FixedVectorType *
  getOptimalVectorMemoryType(llvm::FixedVectorType *T,
                             const LangOptions &Opt) const override {
    // We have legal instructions for 96-bit so 3x32 can be supported.
    // FIXME: This check should be a subtarget feature as technically SI doesn't
    // support it.
    if (T->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(T) == 96)
      return T;
    return DefaultABIInfo::getOptimalVectorMemoryType(T, Opt);
  }
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
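///
/// For example, under this scheme a <4 x float> takes four registers, a
/// <4 x half> packs into two, and a struct of two floats takes two.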
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned ArgumentIndex = 0;
  const unsigned numFixedArguments = FI.getNumRequiredArgs();

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      bool FixedArgument = ArgumentIndex++ < numFixedArguments;
      Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
    }
  }
}

RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                QualType Ty, AggValueSlot Slot) const {
  const bool IsIndirect = false;
  const bool AllowHigherAlign = false;
  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                          getContext().getTypeInfoInChars(Ty),
                          CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
}

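// Return small aggregates directly in registers: for example, a struct of two
// ints (64 bits) comes back as [2 x i32], while anything estimated to need
// more than MaxNumRegsForArgsRet registers falls through to the default
// (indirect) return path below.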
ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}

/// For kernels, all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
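///
/// Aggregates are passed byref in the constant address space (address space 4
/// on amdgcn) via getIndirectAliased, so they are read straight out of the
/// kernarg segment rather than copied; e.g. a struct parameter is lowered to
/// `ptr addrspace(4) byref(%struct.S)`.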
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing a
  // new kind of coercion of the in-memory type for indirect arguments.
  if (LTy == OrigLTy && isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

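// Non-kernel arguments share a budget of MaxNumRegsForArgsRet registers,
// tracked through NumRegsLeft. For example, a struct of 20 ints needs 20
// registers, exceeds the 16-register budget, and is therefore passed byref in
// the private address space instead of directly.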
ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (Variadic) {
    return ABIArgInfo::getDirect(/*T=*/nullptr,
                                 /*Offset=*/0,
                                 /*Padding=*/nullptr,
                                 /*CanBeFlattened=*/false,
                                 /*Align=*/0);
  }

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
                                     RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // the function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  bool supportsLibCall() const override { return false; }
  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getDeviceKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T, QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  void setTargetAtomicMetadata(CodeGenFunction &CGF,
                               llvm::Instruction &AtomicInst,
                               const AtomicExpr *Expr = nullptr) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<DeviceKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<DeviceKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
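    // For example, an OpenCL kernel with no size attributes ends up with
    // "amdgpu-flat-work-group-size"="1,256", and a HIP kernel gets "1,<n>"
    // where <n> is the --gpu-max-threads-per-block value.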
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
  }
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0, which results in
// incorrectly transformed IR. Therefore, instead of emitting null pointers in
// the private and local address spaces, a null pointer in the generic address
// space is emitted and then addrspacecast to a pointer in the local or private
// address space.
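//
// For example, a null pointer in the private address space (addrspace(5) on
// amdgcn) is emitted as:
//
//   addrspacecast (ptr null to ptr addrspace(5))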
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

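// Map a Clang SyncScope onto an LLVM sync scope name. For example, a relaxed
// OpenCL work-group atomic lowers to the "workgroup-one-as" scope, a seq_cst
// one uses the plain "workgroup" scope, and system scope maps to the default
// (empty) sync scope.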
llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  // OpenCL assumes by default that atomic scopes are per-address space for
  // non-sequentially consistent operations.
  if (Scope >= SyncScope::OpenCLWorkGroup &&
      Scope <= SyncScope::OpenCLSubGroup &&
      Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}

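// Attach AMDGPU-specific metadata to atomic instructions. For instance, when
// fine-grained and remote memory are not enabled in the active atomic options,
// an atomicrmw ends up carrying !amdgpu.no.fine.grained.memory and
// !amdgpu.no.remote.memory, which lets the backend pick less conservative
// lowerings.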
void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
    CodeGenFunction &CGF, llvm::Instruction &AtomicInst,
    const AtomicExpr *AE) const {
  auto *RMW = dyn_cast<llvm::AtomicRMWInst>(&AtomicInst);
  auto *CmpX = dyn_cast<llvm::AtomicCmpXchgInst>(&AtomicInst);

  // OpenCL and old style HIP atomics consider atomics targeting thread private
  // memory to be undefined.
  //
  // TODO: This is probably undefined for atomic load/store, but there's not
  // much direct codegen benefit to knowing this.
  if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) ||
       (CmpX &&
        CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) &&
      AE && AE->threadPrivateMemoryAtomicsAreUndefined()) {
    llvm::MDBuilder MDHelper(CGF.getLLVMContext());
    llvm::MDNode *ASRange = MDHelper.createRange(
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS),
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1));
    AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange);
  }

  if (!RMW)
    return;

  AtomicOptions AO = CGF.CGM.getAtomicOpts();
  llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
  if (!AO.getOption(clang::AtomicOptionKind::FineGrainedMemory))
    RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);
  if (!AO.getOption(clang::AtomicOptionKind::RemoteMemory))
    RMW->setMetadata("amdgpu.no.remote.memory", Empty);
  if (AO.getOption(clang::AtomicOptionKind::IgnoreDenormalMode) &&
      RMW->getOperation() == llvm::AtomicRMWInst::FAdd &&
      RMW->getType()->isFloatTy())
    RMW->setMetadata("amdgpu.ignore.denormal.mode", Empty);
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_DeviceKernel));
}

/// Return IR struct type for rtinfo struct in rocm-device-libs used for device
/// enqueue.
///
/// ptr addrspace(1) kernel_object, i32 private_segment_size,
/// i32 group_segment_size
static llvm::StructType *
getAMDGPURuntimeHandleType(llvm::LLVMContext &C,
                           llvm::Type *KernelDescriptorPtrTy) {
  llvm::Type *Int32 = llvm::Type::getInt32Ty(C);
  return llvm::StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32},
                                  "block.runtime.handle.t");
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on stack and stores the block literal
/// to it and passes its pointer to the block invoke function. The kernel
/// has "enqueued-block" function attribute and kernel argument metadata.
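///
/// Roughly (address spaces and exact types elided), for a block invoke
/// function @__foo_block_invoke this emits something like:
///
///   define internal amdgpu_kernel void @__foo_block_invoke_kernel(
///       <block literal struct> %block, ...) {
///     %tmp = alloca <block literal struct>
///     store <block literal struct> %block, ... %tmp
///     call void @__foo_block_invoke(... %tmp, ...)
///     ret void
///   }
///
/// together with an associated @__foo_block_invoke_kernel.runtime.handle
/// global in the ".amdgpu.kernel.runtime.handle" section that the runtime
/// uses to launch the enqueued block.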
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }

  llvm::Module &Mod = CGF.CGM.getModule();
  const llvm::DataLayout &DL = Mod.getDataLayout();

  llvm::Twine Name = Invoke->getName() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);

  // The kernel itself can be internal, the runtime does not directly access the
  // kernel address (only the kernel descriptor).
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &Mod);
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = DL.getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  llvm::StructType *HandleTy = getAMDGPURuntimeHandleType(
      C, llvm::PointerType::get(C, DL.getDefaultGlobalsAddressSpace()));
  llvm::Constant *RuntimeHandleInitializer =
      llvm::ConstantAggregateZero::get(HandleTy);

  llvm::Twine RuntimeHandleName = F->getName() + ".runtime.handle";

  // The runtime needs access to the runtime handle as an external symbol. The
  // runtime handle will need to be made external later, in
  // AMDGPUExportOpenCLEnqueuedBlocks. The kernel itself has a hidden reference
  // inside the runtime handle, and is not directly referenced.

  // TODO: We would initialize the first field by declaring F->getName() + ".kd"
  // to reference the kernel descriptor. The runtime wouldn't need to bother
  // setting it. We would need to have a final symbol name though.
  // TODO: Can we directly use an external symbol with getGlobalIdentifier?
  auto *RuntimeHandle = new llvm::GlobalVariable(
      Mod, HandleTy,
      /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
      /*Initializer=*/RuntimeHandleInitializer, RuntimeHandleName,
      /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
      DL.getDefaultGlobalsAddressSpace(),
      /*isExternallyInitialized=*/true);

  llvm::MDNode *HandleAsMD =
      llvm::MDNode::get(C, llvm::ValueAsMetadata::get(RuntimeHandle));
  F->setMetadata(llvm::LLVMContext::MD_associated, HandleAsMD);

  RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");

  CGF.CGM.addUsedGlobal(F);
  CGF.CGM.addUsedGlobal(RuntimeHandle);
  return RuntimeHandle;
}

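// Translate AMDGPUFlatWorkGroupSizeAttr (or a ReqdWorkGroupSizeAttr) into the
// "amdgpu-flat-work-group-size" function attribute. For example,
// __attribute__((amdgpu_flat_work_group_size(32, 256))) yields "32,256", and
// reqd_work_group_size(8, 8, 4) yields "256,256" since the flat size is the
// product of the three dimensions.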
void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  auto Eval = [&](Expr *E) {
    return E->EvaluateKnownConstInt(getContext()).getExtValue();
  };
  if (FlatWGS) {
    Min = Eval(FlatWGS->getMin());
    Max = Eval(FlatWGS->getMax());
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = Eval(ReqdWGS->getXDim()) * Eval(ReqdWGS->getYDim()) *
                Eval(ReqdWGS->getZDim());

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

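// Translate AMDGPUWavesPerEUAttr into the "amdgpu-waves-per-eu" function
// attribute, e.g. __attribute__((amdgpu_waves_per_eu(2, 4))) yields "2,4",
// and the single-argument form yields just the minimum.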
void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}