1//===- AMDGPU.cpp ---------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ABIInfoImpl.h"
10#include "TargetInfo.h"
11#include "llvm/ADT/StringExtras.h"
12#include "llvm/Support/AMDGPUAddrSpace.h"
13
14using namespace clang;
15using namespace clang::CodeGen;
16
17//===----------------------------------------------------------------------===//
18// AMDGPU ABI Implementation
19//===----------------------------------------------------------------------===//
20
21namespace {
22
// ABI lowering for AMDGPU: classifies kernel and non-kernel arguments and
// return values, tracks an estimate of the 32-bit registers consumed, and
// lowers va_arg.
class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  // Budget of 32-bit registers available for passing arguments / the return
  // value before falling back to memory-based passing.
  static const unsigned MaxNumRegsForArgsRet = 16;

  // Estimate of 32-bit registers needed to pass Ty directly.
  uint64_t numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Val: Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(C&: Ty->getContext(), AddressSpace: ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  // Kernel arguments are passed via the kernarg buffer; see definition below.
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  // NumRegsLeft is decremented as direct arguments consume the register budget.
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;

  llvm::FixedVectorType *
  getOptimalVectorMemoryType(llvm::FixedVectorType *T,
                             const LangOptions &Opt) const override {
    // We have legal instructions for 96-bit so 3x32 can be supported.
    // FIXME: This check should be a subtarget feature as technically SI doesn't
    // support it.
    if (T->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(Ty: T) == 96)
      return T;
    return DefaultABIInfo::getOptimalVectorMemoryType(T, Opt);
  }
};
67
bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  // Any element type may form a homogeneous aggregate on AMDGPU; the only
  // restriction applied is the total register count checked in
  // isHomogeneousAggregateSmallEnough.
  return true;
}
71
72bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
73 const Type *Base, uint64_t Members) const {
74 uint32_t NumRegs = (getContext().getTypeSize(T: Base) + 31) / 32;
75
76 // Homogeneous Aggregates may occupy at most 16 registers.
77 return Members * NumRegs <= MaxNumRegsForArgsRet;
78}
79
80/// Estimate number of registers the type will use when passed in registers.
81uint64_t AMDGPUABIInfo::numRegsForType(QualType Ty) const {
82 uint64_t NumRegs = 0;
83
84 if (const VectorType *VT = Ty->getAs<VectorType>()) {
85 // Compute from the number of elements. The reported size is based on the
86 // in-memory size, which includes the padding 4th element for 3-vectors.
87 QualType EltTy = VT->getElementType();
88 uint64_t EltSize = getContext().getTypeSize(T: EltTy);
89
90 // 16-bit element vectors should be passed as packed.
91 if (EltSize == 16)
92 return (VT->getNumElements() + 1) / 2;
93
94 uint64_t EltNumRegs = (EltSize + 31) / 32;
95 return EltNumRegs * VT->getNumElements();
96 }
97
98 if (const auto *RD = Ty->getAsRecordDecl()) {
99 assert(!RD->hasFlexibleArrayMember());
100
101 for (const FieldDecl *Field : RD->fields()) {
102 QualType FieldTy = Field->getType();
103 NumRegs += numRegsForType(Ty: FieldTy);
104 }
105
106 return NumRegs;
107 }
108
109 return (getContext().getTypeSize(T: Ty) + 31) / 32;
110}
111
112void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
113 llvm::CallingConv::ID CC = FI.getCallingConvention();
114
115 if (!getCXXABI().classifyReturnType(FI))
116 FI.getReturnInfo() = classifyReturnType(RetTy: FI.getReturnType());
117
118 unsigned ArgumentIndex = 0;
119 const unsigned numFixedArguments = FI.getNumRequiredArgs();
120
121 unsigned NumRegsLeft = MaxNumRegsForArgsRet;
122 for (auto &Arg : FI.arguments()) {
123 if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
124 Arg.info = classifyKernelArgumentType(Ty: Arg.type);
125 } else {
126 bool FixedArgument = ArgumentIndex++ < numFixedArguments;
127 Arg.info = classifyArgumentType(Ty: Arg.type, Variadic: !FixedArgument, NumRegsLeft);
128 }
129 }
130}
131
132RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
133 QualType Ty, AggValueSlot Slot) const {
134 const bool IsIndirect = false;
135 const bool AllowHigherAlign = false;
136 return emitVoidPtrVAArg(CGF, VAListAddr, ValueTy: Ty, IsIndirect,
137 ValueInfo: getContext().getTypeInfoInChars(T: Ty),
138 SlotSizeAndAlign: CharUnits::fromQuantity(Quantity: 4), AllowHigherAlign, Slot);
139}
140
141ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
142 if (isAggregateTypeForABI(T: RetTy)) {
143 // Records with non-trivial destructors/copy-constructors should not be
144 // returned by value.
145 if (!getRecordArgABI(T: RetTy, CXXABI&: getCXXABI())) {
146 // Ignore empty structs/unions.
147 if (isEmptyRecord(Context&: getContext(), T: RetTy, AllowArrays: true))
148 return ABIArgInfo::getIgnore();
149
150 // Lower single-element structs to just return a regular value.
151 if (const Type *SeltTy = isSingleElementStruct(T: RetTy, Context&: getContext()))
152 return ABIArgInfo::getDirect(T: CGT.ConvertType(T: QualType(SeltTy, 0)));
153
154 if (const auto *RD = RetTy->getAsRecordDecl();
155 RD && RD->hasFlexibleArrayMember())
156 return DefaultABIInfo::classifyReturnType(RetTy);
157
158 // Pack aggregates <= 4 bytes into single VGPR or pair.
159 uint64_t Size = getContext().getTypeSize(T: RetTy);
160 if (Size <= 16)
161 return ABIArgInfo::getDirect(T: llvm::Type::getInt16Ty(C&: getVMContext()));
162
163 if (Size <= 32)
164 return ABIArgInfo::getDirect(T: llvm::Type::getInt32Ty(C&: getVMContext()));
165
166 if (Size <= 64) {
167 llvm::Type *I32Ty = llvm::Type::getInt32Ty(C&: getVMContext());
168 return ABIArgInfo::getDirect(T: llvm::ArrayType::get(ElementType: I32Ty, NumElements: 2));
169 }
170
171 if (numRegsForType(Ty: RetTy) <= MaxNumRegsForArgsRet)
172 return ABIArgInfo::getDirect();
173 }
174 }
175
176 // Otherwise just do the default thing.
177 return DefaultABIInfo::classifyReturnType(RetTy);
178}
179
/// For kernels all parameters are really passed in a special buffer. It doesn't
/// make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  // Single-element structs collapse to their sole member's type.
  if (const Type *SeltTy = isSingleElementStruct(T: Ty, Context&: getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(T: Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    // For HIP, rewrite default-address-space pointer arguments to the device
    // (global) address space; see coerceKernelArgumentType.
    LTy = coerceKernelArgumentType(
        Ty: OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(AS: LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(AS: LangAS::cuda_device));
  }

  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing a
  // new kind of coercion of the in-memory type when for indirect arguments.
  if (LTy == OrigLTy && isAggregateTypeForABI(T: Ty)) {
    // Aggregates that required no pointer coercion are passed byref in the
    // constant address space (the kernarg segment).
    return ABIArgInfo::getIndirectAliased(
        Alignment: getContext().getTypeAlignInChars(T: Ty),
        AddrSpace: getContext().getTargetAddressSpace(AS: LangAS::opencl_constant),
        Realign: false /*Realign*/, Padding: nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(T: LTy, Offset: 0, Padding: nullptr, CanBeFlattened: false);
}
213
/// Classify a non-kernel function argument. NumRegsLeft is a running estimate
/// of the remaining 32-bit argument registers; it is decremented for direct
/// arguments and clamped so it never underflows.
ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  // Variadic arguments are always passed directly and unflattened.
  if (Variadic) {
    return ABIArgInfo::getDirect(/*T=*/nullptr,
                                 /*Offset=*/0,
                                 /*Padding=*/nullptr,
                                 /*CanBeFlattened=*/false,
                                 /*Align=*/0);
  }

  if (isAggregateTypeForABI(T: Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(T: Ty, CXXABI&: getCXXABI()))
      return getNaturalAlignIndirect(Ty, AddrSpace: getDataLayout().getAllocaAddrSpace(),
                                     ByVal: RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(Context&: getContext(), T: Ty, AllowArrays: true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(T: Ty, Context&: getContext()))
      return ABIArgInfo::getDirect(T: CGT.ConvertType(T: QualType(SeltTy, 0)));

    // Flexible array members fall back to the default lowering.
    if (const auto *RD = Ty->getAsRecordDecl();
        RD && RD->hasFlexibleArrayMember())
      return DefaultABIInfo::classifyArgumentType(RetTy: Ty);

    // Pack aggregates <= 8 bytes into single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(T: Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      // Charge the budget, clamping at zero.
      NumRegsLeft -= std::min(a: NumRegsLeft, b: NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(T: llvm::Type::getInt16Ty(C&: getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(T: llvm::Type::getInt32Ty(C&: getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(C&: getVMContext());
      return ABIArgInfo::getDirect(T: llvm::ArrayType::get(ElementType: I32Ty, NumElements: 2));
    }

    // Larger aggregates go direct only if they fit the remaining budget.
    if (NumRegsLeft > 0) {
      uint64_t NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // function ABI.
    return ABIArgInfo::getIndirectAliased(
        Alignment: getContext().getTypeAlignInChars(T: Ty),
        AddrSpace: getContext().getTargetAddressSpace(AS: LangAS::opencl_private));
  }

  // Otherwise just do the default thing, still charging direct arguments
  // against the register estimate.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(RetTy: Ty);
  if (!ArgInfo.isIndirect()) {
    uint64_t NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(a: NumRegs, b: uint64_t{NumRegsLeft});
  }

  return ArgInfo;
}
290
// Target hooks for AMDGPU: attribute emission, address-space policy, null
// pointer representation, sync scopes, atomic metadata, and device-enqueue
// kernel creation.
class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(args&: CGT)) {}

  bool supportsLibCall() const override { return false; }
  // Translate source-level attributes (work-group size, waves-per-EU, SGPR/
  // VGPR counts, cluster dims) into "amdgpu-*" IR function attributes.
  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getDeviceKernelCallingConv() const override;

  // Null pointers are not always bit-pattern zero; see definition.
  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
      llvm::PointerType *T, QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        TargetAS: getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  StringRef getLLVMSyncScopeStr(const LangOptions &LangOpts, SyncScope Scope,
                                llvm::AtomicOrdering Ordering) const override;
  void setTargetAtomicMetadata(CodeGenFunction &CGF,
                               llvm::Instruction &AtomicInst,
                               const AtomicExpr *Expr = nullptr) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
325}
326
327static bool requiresAMDGPUProtectedVisibility(const Decl *D,
328 llvm::GlobalValue *GV) {
329 if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
330 return false;
331
332 return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
333 (D->hasAttr<DeviceKernelAttr>() ||
334 (isa<FunctionDecl>(Val: D) && D->hasAttr<CUDAGlobalAttr>()) ||
335 (isa<VarDecl>(Val: D) &&
336 (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
337 cast<VarDecl>(Val: D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
338 cast<VarDecl>(Val: D)->getType()->isCUDADeviceBuiltinTextureType())));
339}
340
// Translate AMDGPU-related declaration attributes into "amdgpu-*" IR function
// attributes on F.
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  // reqd_work_group_size is only honored for OpenCL.
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<DeviceKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, A: FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(X: DefaultMaxWorkGroupSize);
    F->addFnAttr(Kind: "amdgpu-flat-work-group-size", Val: AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, A: Attr);

  // A zero SGPR/VGPR count means "unspecified"; no attribute is emitted.
  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr(Kind: "amdgpu-num-sgpr", Val: llvm::utostr(X: NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr(Kind: "amdgpu-num-vgpr", Val: llvm::utostr(X: NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(Ctx: M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(Ctx: M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(Ctx: M.getContext())
                           .getExtValue()
                     : 1;

    // Encode as "X,Y,Z".
    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr(Kind: "amdgpu-max-num-workgroups", Val: AttrVal.str());
  }

  if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
    // Unspecified cluster dimensions default to 1.
    auto GetExprVal = [&](const auto &E) {
      return E ? E->EvaluateKnownConstInt(M.getContext()).getExtValue() : 1;
    };
    unsigned X = GetExprVal(Attr->getX());
    unsigned Y = GetExprVal(Attr->getY());
    unsigned Z = GetExprVal(Attr->getZ());
    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;
    F->addFnAttr(Kind: "amdgpu-cluster-dims", Val: AttrVal.str());
  }

  // OpenCL doesn't support cluster feature.
  const TargetInfo &TTI = M.getContext().getTargetInfo();
  if ((IsOpenCLKernel &&
       TTI.hasFeatureEnabled(Features: TTI.getTargetOpts().FeatureMap, Name: "clusters")) ||
      FD->hasAttr<CUDANoClusterAttr>())
    F->addFnAttr(Kind: "amdgpu-cluster-dims", Val: "0,0,0");
}
424
425void AMDGPUTargetCodeGenInfo::setTargetAttributes(
426 const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
427 if (requiresAMDGPUProtectedVisibility(D, GV)) {
428 GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
429 GV->setDSOLocal(true);
430 }
431
432 if (GV->isDeclaration())
433 return;
434
435 llvm::Function *F = dyn_cast<llvm::Function>(Val: GV);
436 if (!F)
437 return;
438
439 const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(Val: D);
440 if (FD)
441 setFunctionDeclAttributes(FD, F, M);
442 if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
443 F->addFnAttr(Kind: "amdgpu-ieee", Val: "false");
444 if (getABIInfo().getCodeGenOpts().AMDGPUExpandWaitcntProfiling)
445 F->addFnAttr(Kind: "amdgpu-expand-waitcnt-profiling");
446}
447
unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const {
  // All device kernels use the amdgpu_kernel calling convention.
  return llvm::CallingConv::AMDGPU_KERNEL;
}
451
452// Currently LLVM assumes null pointers always have value 0,
453// which results in incorrectly transformed IR. Therefore, instead of
454// emitting null pointers in private and local address spaces, a null
455// pointer in generic address space is emitted which is casted to a
456// pointer in local or private address space.
457llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
458 const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
459 QualType QT) const {
460 if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
461 return llvm::ConstantPointerNull::get(T: PT);
462
463 auto &Ctx = CGM.getContext();
464 auto NPT = llvm::PointerType::get(
465 C&: PT->getContext(), AddressSpace: Ctx.getTargetAddressSpace(AS: LangAS::opencl_generic));
466 return llvm::ConstantExpr::getAddrSpaceCast(
467 C: llvm::ConstantPointerNull::get(T: NPT), Ty: PT);
468}
469
470LangAS
471AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
472 const VarDecl *D) const {
473 assert(!CGM.getLangOpts().OpenCL &&
474 !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
475 "Address space agnostic languages only");
476 LangAS DefaultGlobalAS = getLangASFromTargetAS(
477 TargetAS: CGM.getContext().getTargetAddressSpace(AS: LangAS::opencl_global));
478 if (!D)
479 return DefaultGlobalAS;
480
481 LangAS AddrSpace = D->getType().getAddressSpace();
482 if (AddrSpace != LangAS::Default)
483 return AddrSpace;
484
485 // Only promote to address space 4 if VarDecl has constant initialization.
486 if (D->getType().isConstantStorage(Ctx: CGM.getContext(), ExcludeCtor: false, ExcludeDtor: false) &&
487 D->hasConstantInitialization()) {
488 if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
489 return *ConstAS;
490 }
491 return DefaultGlobalAS;
492}
493
// Map a Clang SyncScope to the AMDGPU LLVM sync-scope string, appending the
// "-one-as" variant for relaxed OpenCL scopes.
StringRef AMDGPUTargetCodeGenInfo::getLLVMSyncScopeStr(
    const LangOptions &LangOpts, SyncScope Scope,
    llvm::AtomicOrdering Ordering) const {

  // OpenCL assumes by default that atomic scopes are per-address space for
  // non-sequentially consistent operations.
  bool IsOneAs = (Scope >= SyncScope::OpenCLWorkGroup &&
                  Scope <= SyncScope::OpenCLSubGroup &&
                  Ordering != llvm::AtomicOrdering::SequentiallyConsistent);

  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    return IsOneAs ? "singlethread-one-as" : "singlethread";
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    return IsOneAs ? "wavefront-one-as" : "wavefront";
  case SyncScope::HIPCluster:
  case SyncScope::ClusterScope:
    assert(!IsOneAs && "OpenCL does not have cluster scope");
    return "cluster";
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    return IsOneAs ? "workgroup-one-as" : "workgroup";
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    return IsOneAs ? "agent-one-as" : "agent";
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    // The empty string is LLVM's system (default) sync scope.
    return IsOneAs ? "one-as" : "";
  }
  llvm_unreachable("Unknown SyncScope enum");
}
531
// Attach AMDGPU-specific metadata to atomic instructions: an address-space
// exclusion range when thread-private atomics are undefined, and the
// fine-grained/remote/denormal hints derived from the active atomic options.
void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
    CodeGenFunction &CGF, llvm::Instruction &AtomicInst,
    const AtomicExpr *AE) const {
  auto *RMW = dyn_cast<llvm::AtomicRMWInst>(Val: &AtomicInst);
  auto *CmpX = dyn_cast<llvm::AtomicCmpXchgInst>(Val: &AtomicInst);

  // OpenCL and old style HIP atomics consider atomics targeting thread private
  // memory to be undefined.
  //
  // TODO: This is probably undefined for atomic load/store, but there's not
  // much direct codegen benefit to knowing this.
  if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) ||
       (CmpX &&
        CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) &&
      AE && AE->threadPrivateMemoryAtomicsAreUndefined()) {
    // Exclude exactly the private address space via !noalias.addrspace.
    llvm::MDBuilder MDHelper(CGF.getLLVMContext());
    llvm::MDNode *ASRange = MDHelper.createRange(
        Lo: llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS),
        Hi: llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1));
    AtomicInst.setMetadata(KindID: llvm::LLVMContext::MD_noalias_addrspace, Node: ASRange);
  }

  // The remaining metadata only applies to atomicrmw instructions.
  if (!RMW)
    return;

  AtomicOptions AO = CGF.CGM.getAtomicOpts();
  llvm::MDNode *Empty = llvm::MDNode::get(Context&: CGF.getLLVMContext(), MDs: {});
  if (!AO.getOption(Kind: clang::AtomicOptionKind::FineGrainedMemory))
    RMW->setMetadata(Kind: "amdgpu.no.fine.grained.memory", Node: Empty);
  if (!AO.getOption(Kind: clang::AtomicOptionKind::RemoteMemory))
    RMW->setMetadata(Kind: "amdgpu.no.remote.memory", Node: Empty);
  // Denormal-mode relaxation only applies to float fadd.
  if (AO.getOption(Kind: clang::AtomicOptionKind::IgnoreDenormalMode) &&
      RMW->getOperation() == llvm::AtomicRMWInst::FAdd &&
      RMW->getType()->isFloatTy())
    RMW->setMetadata(Kind: "amdgpu.ignore.denormal.mode", Node: Empty);
}
568
bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  // Never emit aliases for static extern "C" functions on this target.
  return false;
}
572
bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  // Always emit bit-field separators in DWARF for this target.
  return true;
}
576
void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  // Rewrite the function type so CUDA kernels carry the device-kernel CC.
  FT = getABIInfo().getContext().adjustFunctionType(
      Fn: FT, EInfo: FT->getExtInfo().withCallingConv(cc: CC_DeviceKernel));
}
582
583/// Return IR struct type for rtinfo struct in rocm-device-libs used for device
584/// enqueue.
585///
586/// ptr addrspace(1) kernel_object, i32 private_segment_size,
587/// i32 group_segment_size
588
589static llvm::StructType *
590getAMDGPURuntimeHandleType(llvm::LLVMContext &C,
591 llvm::Type *KernelDescriptorPtrTy) {
592 llvm::Type *Int32 = llvm::Type::getInt32Ty(C);
593 return llvm::StructType::create(Context&: C, Elements: {KernelDescriptorPtrTy, Int32, Int32},
594 Name: "block.runtime.handle.t");
595}
596
597/// Create an OpenCL kernel for an enqueued block.
598///
599/// The type of the first argument (the block literal) is the struct type
600/// of the block literal instead of a pointer type. The first argument
601/// (block literal) is passed directly by value to the kernel. The kernel
602/// allocates the same type of struct on stack and stores the block literal
603/// to it and passes its pointer to the block invoke function. The kernel
604/// has "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  // Parallel per-argument metadata arrays; entry 0 describes the by-value
  // block literal, subsequent entries the invoke function's extra parameters.
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  // First kernel argument: the block literal itself, passed by value.
  ArgTys.push_back(Elt: BlockTy);
  ArgTypeNames.push_back(Elt: llvm::MDString::get(Context&: C, Str: "__block_literal"));
  AddressQuals.push_back(Elt: llvm::ConstantAsMetadata::get(C: Builder.getInt32(C: 0)));
  ArgBaseTypeNames.push_back(Elt: llvm::MDString::get(Context&: C, Str: "__block_literal"));
  ArgTypeQuals.push_back(Elt: llvm::MDString::get(Context&: C, Str: ""));
  AccessQuals.push_back(Elt: llvm::MDString::get(Context&: C, Str: "none"));
  ArgNames.push_back(Elt: llvm::MDString::get(Context&: C, Str: "block_literal"));
  // Remaining kernel arguments mirror the invoke function's parameters 1..N.
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(Elt: InvokeFT->getParamType(i: I));
    ArgTypeNames.push_back(Elt: llvm::MDString::get(Context&: C, Str: "void*"));
    AddressQuals.push_back(Elt: llvm::ConstantAsMetadata::get(C: Builder.getInt32(C: 3)));
    AccessQuals.push_back(Elt: llvm::MDString::get(Context&: C, Str: "none"));
    ArgBaseTypeNames.push_back(Elt: llvm::MDString::get(Context&: C, Str: "void*"));
    ArgTypeQuals.push_back(Elt: llvm::MDString::get(Context&: C, Str: ""));
    ArgNames.push_back(
        Elt: llvm::MDString::get(Context&: C, Str: (Twine("local_arg") + Twine(I)).str()));
  }

  llvm::Module &Mod = CGF.CGM.getModule();
  const llvm::DataLayout &DL = Mod.getDataLayout();

  llvm::Twine Name = Invoke->getName() + "_kernel";
  auto *FT = llvm::FunctionType::get(Result: llvm::Type::getVoidTy(C), Params: ArgTys, isVarArg: false);

  // The kernel itself can be internal, the runtime does not directly access the
  // kernel address (only the kernel descriptor).
  auto *F = llvm::Function::Create(Ty: FT, Linkage: llvm::GlobalValue::InternalLinkage, N: Name,
                                   M: &Mod);
  F->setCallingConv(getDeviceKernelCallingConv());

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(attrs&: KernelAttrs);
  F->addFnAttrs(Attrs: KernelAttrs);

  // Build the kernel body: spill the by-value block literal to a stack slot
  // and call the invoke function with its address plus the forwarded args.
  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(Context&: C, Name: "entry", Parent: F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = DL.getPrefTypeAlign(Ty: BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(Ty: BlockTy, ArraySize: nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(Val: F->arg_begin(), Ptr: BlockPtr, Align: BlockAlign);
  auto *Cast = Builder.CreatePointerCast(V: BlockPtr, DestTy: InvokeFT->getParamType(i: 0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Elt: Cast);
  for (llvm::Argument &A : llvm::drop_begin(RangeOrContainer: F->args()))
    Args.push_back(Elt: &A);
  llvm::CallInst *call = Builder.CreateCall(Callee: Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  // Attach the OpenCL kernel-argument metadata collected above.
  F->setMetadata(Kind: "kernel_arg_addr_space", Node: llvm::MDNode::get(Context&: C, MDs: AddressQuals));
  F->setMetadata(Kind: "kernel_arg_access_qual", Node: llvm::MDNode::get(Context&: C, MDs: AccessQuals));
  F->setMetadata(Kind: "kernel_arg_type", Node: llvm::MDNode::get(Context&: C, MDs: ArgTypeNames));
  F->setMetadata(Kind: "kernel_arg_base_type",
                 Node: llvm::MDNode::get(Context&: C, MDs: ArgBaseTypeNames));
  F->setMetadata(Kind: "kernel_arg_type_qual", Node: llvm::MDNode::get(Context&: C, MDs: ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata(Kind: "kernel_arg_name", Node: llvm::MDNode::get(Context&: C, MDs: ArgNames));

  llvm::StructType *HandleTy = getAMDGPURuntimeHandleType(
      C, KernelDescriptorPtrTy: llvm::PointerType::get(C, AddressSpace: DL.getDefaultGlobalsAddressSpace()));
  llvm::Constant *RuntimeHandleInitializer =
      llvm::ConstantAggregateZero::get(Ty: HandleTy);

  llvm::Twine RuntimeHandleName = F->getName() + ".runtime.handle";

  // The runtime needs access to the runtime handle as an external symbol. The
  // runtime handle will need to be made external later, in
  // AMDGPUExportOpenCLEnqueuedBlocks. The kernel itself has a hidden reference
  // inside the runtime handle, and is not directly referenced.

  // TODO: We would initialize the first field by declaring F->getName() + ".kd"
  // to reference the kernel descriptor. The runtime wouldn't need to bother
  // setting it. We would need to have a final symbol name though.
  // TODO: Can we directly use an external symbol with getGlobalIdentifier?
  auto *RuntimeHandle = new llvm::GlobalVariable(
      Mod, HandleTy,
      /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
      /*Initializer=*/RuntimeHandleInitializer, RuntimeHandleName,
      /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
      DL.getDefaultGlobalsAddressSpace(),
      /*isExternallyInitialized=*/true);

  // Tie the kernel's lifetime to the handle via !associated.
  llvm::MDNode *HandleAsMD =
      llvm::MDNode::get(Context&: C, MDs: llvm::ValueAsMetadata::get(V: RuntimeHandle));
  F->setMetadata(KindID: llvm::LLVMContext::MD_associated, Node: HandleAsMD);

  RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");

  // Keep both symbols alive through optimization.
  CGF.CGM.addUsedGlobal(GV: F);
  CGF.CGM.addUsedGlobal(GV: RuntimeHandle);
  return RuntimeHandle;
}
715
716void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
717 llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
718 const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
719 int32_t *MaxThreadsVal) {
720 unsigned Min = 0;
721 unsigned Max = 0;
722 auto Eval = [&](Expr *E) {
723 return E->EvaluateKnownConstInt(Ctx: getContext()).getExtValue();
724 };
725 if (FlatWGS) {
726 Min = Eval(FlatWGS->getMin());
727 Max = Eval(FlatWGS->getMax());
728 }
729 if (ReqdWGS && Min == 0 && Max == 0)
730 Min = Max = Eval(ReqdWGS->getXDim()) * Eval(ReqdWGS->getYDim()) *
731 Eval(ReqdWGS->getZDim());
732
733 if (Min != 0) {
734 assert(Min <= Max && "Min must be less than or equal Max");
735
736 if (MinThreadsVal)
737 *MinThreadsVal = Min;
738 if (MaxThreadsVal)
739 *MaxThreadsVal = Max;
740 std::string AttrVal = llvm::utostr(X: Min) + "," + llvm::utostr(X: Max);
741 if (F)
742 F->addFnAttr(Kind: "amdgpu-flat-work-group-size", Val: AttrVal);
743 } else
744 assert(Max == 0 && "Max must be zero");
745}
746
747void CodeGenModule::handleAMDGPUWavesPerEUAttr(
748 llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
749 unsigned Min =
750 Attr->getMin()->EvaluateKnownConstInt(Ctx: getContext()).getExtValue();
751 unsigned Max =
752 Attr->getMax()
753 ? Attr->getMax()->EvaluateKnownConstInt(Ctx: getContext()).getExtValue()
754 : 0;
755
756 if (Min != 0) {
757 assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
758
759 std::string AttrVal = llvm::utostr(X: Min);
760 if (Max != 0)
761 AttrVal = AttrVal + "," + llvm::utostr(X: Max);
762 F->addFnAttr(Kind: "amdgpu-waves-per-eu", Val: AttrVal);
763 } else
764 assert(Max == 0 && "Max must be zero");
765}
766
// Factory for the AMDGPU TargetCodeGenInfo.
std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(args&: CGM.getTypes());
}
771