1//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Implements the AMDGPU specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUSubtarget.h"
15#include "AMDGPUCallLowering.h"
16#include "AMDGPUInstructionSelector.h"
17#include "AMDGPULegalizerInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "R600Subtarget.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
23#include "llvm/CodeGen/MachineScheduler.h"
24#include "llvm/CodeGen/TargetFrameLowering.h"
25#include "llvm/IR/DiagnosticInfo.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/IR/IntrinsicsR600.h"
28#include "llvm/IR/MDBuilder.h"
29#include <algorithm>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "amdgpu-subtarget"
34
35// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
36// allows the given function to achieve an occupancy of NWaves waves per
37// SIMD / EU, taking into account only the function's *maximum* workgroup size.
38unsigned
39AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
40 const Function &F) const {
41 const unsigned WaveSize = getWavefrontSize();
42 const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
43 const unsigned WavesPerWorkgroup =
44 std::max(a: 1u, b: (WorkGroupSize + WaveSize - 1) / WaveSize);
45
46 const unsigned WorkGroupsPerCU =
47 std::max(a: 1u, b: (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
48
49 return getLocalMemorySize() / WorkGroupsPerCU;
50}
51
52std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
53 uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
54
55 // LDS granularity accounted for by aligning the queried LDS size to the
56 // allocation block size.
57 const unsigned Granularity = std::max(a: LDSAllocationGranularity, b: 1u);
58 LDSBytes = alignTo(Value: LDSBytes, Align: Granularity);
59 const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(a: LDSBytes, b: 1u);
60
61 // Queried LDS size may be larger than available on a CU, in which case we
62 // consider the only achievable occupancy to be 1, in line with what we
63 // consider the occupancy to be when the number of requested registers in a
64 // particular bank is higher than the number of available ones in that bank.
65 if (!MaxWGsLDS)
66 return {1, 1};
67
68 const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();
69
70 auto PropsFromWGSize = [=](unsigned WGSize)
71 -> std::tuple<const unsigned, const unsigned, unsigned> {
72 unsigned WavesPerWG = divideCeil(Numerator: WGSize, Denominator: WaveSize);
73 unsigned WGsPerCU = std::min(a: getMaxWorkGroupsPerCU(FlatWorkGroupSize: WGSize), b: MaxWGsLDS);
74 return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
75 };
76
77 // The maximum group size will generally yield the minimum number of
78 // workgroups, maximum number of waves, and minimum occupancy. The opposite is
79 // generally true for the minimum group size. LDS or barrier ressource
80 // limitations can flip those minimums/maximums.
81 const auto [MinWGSize, MaxWGSize] = FlatWorkGroupSizes;
82 auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
83 auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);
84
85 // It is possible that we end up with flipped minimum and maximum number of
86 // waves per CU when the number of minimum/maximum concurrent groups on the CU
87 // is limited by LDS usage or barrier resources.
88 if (MinWavesPerCU >= MaxWavesPerCU) {
89 std::swap(a&: MinWavesPerCU, b&: MaxWavesPerCU);
90 } else {
91 const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();
92
93 // Look for a potential smaller group size than the maximum which decreases
94 // the concurrent number of waves on the CU for the same number of
95 // concurrent workgroups on the CU.
96 unsigned MinWavesPerCUForWGSize =
97 divideCeil(Numerator: WaveSlotsPerCU, Denominator: MinWGsPerCU + 1) * MinWGsPerCU;
98 if (MinWavesPerCU > MinWavesPerCUForWGSize) {
99 unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
100 if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
101 // There may exist a smaller group size than the maximum that achieves
102 // the minimum number of waves per CU. This group size is the largest
103 // possible size that requires MaxWavesPerWG - E waves where E is
104 // maximized under the following constraints.
105 // 1. 0 <= E <= ExcessSlotsPerWG
106 // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
107 MinWavesPerCU -= MinWGsPerCU * std::min(a: ExcessSlotsPerWG,
108 b: MaxWavesPerWG - MinWavesPerWG);
109 }
110 }
111
112 // Look for a potential larger group size than the minimum which increases
113 // the concurrent number of waves on the CU for the same number of
114 // concurrent workgroups on the CU.
115 unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
116 if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
117 // There may exist a larger group size than the minimum that achieves the
118 // maximum number of waves per CU. This group size is the smallest
119 // possible size that requires MinWavesPerWG + L waves where L is
120 // maximized under the following constraints.
121 // 1. 0 <= L <= LeftoverSlotsPerWG
122 // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
123 MaxWavesPerCU += MaxWGsPerCU * std::min(a: LeftoverSlotsPerWG,
124 b: ((MaxWGSize - 1) / WaveSize) + 1 -
125 MinWavesPerWG);
126 }
127 }
128
129 // Return the minimum/maximum number of waves on any EU, assuming that all
130 // wavefronts are spread across all EUs as evenly as possible.
131 return {std::clamp(val: MinWavesPerCU / getEUsPerCU(), lo: 1U, hi: WavesPerEU),
132 std::clamp(val: divideCeil(Numerator: MaxWavesPerCU, Denominator: getEUsPerCU()), lo: 1U, hi: WavesPerEU)};
133}
134
135std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
136 const MachineFunction &MF) const {
137 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
138 return getOccupancyWithWorkGroupSizes(LDSBytes: MFI->getLDSSize(), F: MF.getFunction());
139}
140
141std::pair<unsigned, unsigned>
142AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
143 switch (CC) {
144 case CallingConv::AMDGPU_VS:
145 case CallingConv::AMDGPU_LS:
146 case CallingConv::AMDGPU_HS:
147 case CallingConv::AMDGPU_ES:
148 case CallingConv::AMDGPU_GS:
149 case CallingConv::AMDGPU_PS:
150 return std::pair(1, getWavefrontSize());
151 default:
152 return std::pair(1u, getMaxFlatWorkGroupSize());
153 }
154}
155
156std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
157 const Function &F) const {
158 // Default minimum/maximum flat work group sizes.
159 std::pair<unsigned, unsigned> Default =
160 getDefaultFlatWorkGroupSize(CC: F.getCallingConv());
161
162 // Requested minimum/maximum flat work group sizes.
163 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
164 F, Name: "amdgpu-flat-work-group-size", Default);
165
166 // Make sure requested minimum is less than requested maximum.
167 if (Requested.first > Requested.second)
168 return Default;
169
170 // Make sure requested values do not violate subtarget's specifications.
171 if (Requested.first < getMinFlatWorkGroupSize())
172 return Default;
173 if (Requested.second > getMaxFlatWorkGroupSize())
174 return Default;
175
176 return Requested;
177}
178
179bool AMDGPUSubtarget::isSingleWavefrontWorkgroup(const Function &F) const {
180 return getFlatWorkGroupSizes(F).second <= getWavefrontSize();
181}
182
183std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
184 std::pair<unsigned, unsigned> RequestedWavesPerEU,
185 std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes) const {
186 // Default minimum/maximum number of waves per EU. The range of flat workgroup
187 // sizes limits the achievable maximum, and we aim to support enough waves per
188 // EU so that we can concurrently execute all waves of a single workgroup of
189 // maximum size on a CU.
190 std::pair<unsigned, unsigned> Default = {
191 getWavesPerEUForWorkGroup(FlatWorkGroupSize: FlatWorkGroupSizes.second),
192 getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
193 Default.first = std::min(a: Default.first, b: Default.second);
194
195 // Make sure requested minimum is within the default range and lower than the
196 // requested maximum. The latter must not violate target specification.
197 if (RequestedWavesPerEU.first < Default.first ||
198 RequestedWavesPerEU.first > Default.second ||
199 RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
200 RequestedWavesPerEU.second > getMaxWavesPerEU())
201 return Default;
202
203 // We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
204 RequestedWavesPerEU.second =
205 std::min(a: RequestedWavesPerEU.second, b: Default.second);
206 return RequestedWavesPerEU;
207}
208
209std::pair<unsigned, unsigned>
210AMDGPUSubtarget::getWavesPerEU(const Function &F) const {
211 // Default/requested minimum/maximum flat work group sizes.
212 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
213 // Minimum number of bytes allocated in the LDS.
214 unsigned LDSBytes =
215 AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-lds-size", Default: {0, UINT32_MAX},
216 /*OnlyFirstRequired=*/true)
217 .first;
218 return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
219}
220
221std::pair<unsigned, unsigned>
222AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
223 unsigned LDSBytes, const Function &F) const {
224 // Default minimum/maximum number of waves per execution unit.
225 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
226
227 // Requested minimum/maximum number of waves per execution unit.
228 std::pair<unsigned, unsigned> Requested =
229 AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu", Default, OnlyFirstRequired: true);
230 return getEffectiveWavesPerEU(RequestedWavesPerEU: Requested, FlatWorkGroupSizes, LDSBytes);
231}
232
233std::optional<unsigned>
234AMDGPUSubtarget::getReqdWorkGroupSize(const Function &Kernel,
235 unsigned Dim) const {
236 auto *Node = Kernel.getMetadata(Kind: "reqd_work_group_size");
237 if (Node && Node->getNumOperands() == 3)
238 return mdconst::extract<ConstantInt>(MD: Node->getOperand(I: Dim))->getZExtValue();
239 return std::nullopt;
240}
241
242bool AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim(
243 const Function &F, bool RequiresUniformYZ) const {
244 auto *Node = F.getMetadata(Kind: "reqd_work_group_size");
245 if (!Node || Node->getNumOperands() != 3)
246 return false;
247 unsigned XLen =
248 mdconst::extract<ConstantInt>(MD: Node->getOperand(I: 0))->getZExtValue();
249 unsigned YLen =
250 mdconst::extract<ConstantInt>(MD: Node->getOperand(I: 1))->getZExtValue();
251 unsigned ZLen =
252 mdconst::extract<ConstantInt>(MD: Node->getOperand(I: 2))->getZExtValue();
253
254 bool Is1D = YLen <= 1 && ZLen <= 1;
255 bool IsXLargeEnough =
256 isPowerOf2_32(Value: XLen) && (!RequiresUniformYZ || XLen >= getWavefrontSize());
257 return Is1D || IsXLargeEnough;
258}
259
260bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
261 return isMesa3DOS() && !AMDGPU::isShader(CC: F.getCallingConv());
262}
263
264unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
265 unsigned Dimension) const {
266 std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(Kernel, Dim: Dimension);
267 if (ReqdSize)
268 return *ReqdSize - 1;
269 return getFlatWorkGroupSizes(F: Kernel).second - 1;
270}
271
272bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
273 for (int I = 0; I < 3; ++I) {
274 if (getMaxWorkitemID(Kernel: Func, Dimension: I) > 0)
275 return false;
276 }
277
278 // If the function may call the WWM intrinsic, just return false as
279 // all threads will be active at some point
280 if (!Func.hasFnAttribute(Kind: "amdgpu-no-wwm"))
281 return false;
282
283 return true;
284}
285
286bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
287 Function *Kernel = I->getFunction();
288 unsigned MinSize = 0;
289 unsigned MaxSize = getFlatWorkGroupSizes(F: *Kernel).second;
290 bool IdQuery = false;
291
292 // If reqd_work_group_size is present it narrows value down.
293 if (auto *CI = dyn_cast<CallInst>(Val: I)) {
294 const Function *F = CI->getCalledFunction();
295 if (F) {
296 unsigned Dim = UINT_MAX;
297 switch (F->getIntrinsicID()) {
298 case Intrinsic::amdgcn_workitem_id_x:
299 case Intrinsic::r600_read_tidig_x:
300 IdQuery = true;
301 [[fallthrough]];
302 case Intrinsic::r600_read_local_size_x:
303 Dim = 0;
304 break;
305 case Intrinsic::amdgcn_workitem_id_y:
306 case Intrinsic::r600_read_tidig_y:
307 IdQuery = true;
308 [[fallthrough]];
309 case Intrinsic::r600_read_local_size_y:
310 Dim = 1;
311 break;
312 case Intrinsic::amdgcn_workitem_id_z:
313 case Intrinsic::r600_read_tidig_z:
314 IdQuery = true;
315 [[fallthrough]];
316 case Intrinsic::r600_read_local_size_z:
317 Dim = 2;
318 break;
319 default:
320 break;
321 }
322
323 if (Dim <= 3) {
324 std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(Kernel: *Kernel, Dim);
325 if (ReqdSize)
326 MinSize = MaxSize = *ReqdSize;
327 }
328 }
329 }
330
331 if (!MaxSize)
332 return false;
333
334 // Range metadata is [Lo, Hi). For ID query we need to pass max size
335 // as Hi. For size query we need to pass Hi + 1.
336 if (IdQuery)
337 MinSize = 0;
338 else
339 ++MaxSize;
340
341 APInt Lower{32, MinSize};
342 APInt Upper{32, MaxSize};
343 if (auto *CI = dyn_cast<CallBase>(Val: I)) {
344 ConstantRange Range(Lower, Upper);
345 CI->addRangeRetAttr(CR: Range);
346 } else {
347 MDBuilder MDB(I->getContext());
348 MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lo: Lower, Hi: Upper);
349 I->setMetadata(KindID: LLVMContext::MD_range, Node: MaxWorkGroupSizeRange);
350 }
351 return true;
352}
353
354unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
355
356 // We don't allocate the segment if we know the implicit arguments weren't
357 // used, even if the ABI implies we need them.
358 if (F.hasFnAttribute(Kind: "amdgpu-no-implicitarg-ptr"))
359 return 0;
360
361 if (isMesaKernel(F))
362 return 16;
363
364 // Assume all implicit inputs are used by default
365 const Module *M = F.getParent();
366 unsigned NBytes =
367 AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
368 return F.getFnAttributeAsParsedInteger(Kind: "amdgpu-implicitarg-num-bytes",
369 Default: NBytes);
370}
371
372uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
373 Align &MaxAlign) const {
374 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
375 F.getCallingConv() == CallingConv::SPIR_KERNEL);
376
377 const DataLayout &DL = F.getDataLayout();
378 uint64_t ExplicitArgBytes = 0;
379 MaxAlign = Align(1);
380
381 for (const Argument &Arg : F.args()) {
382 if (Arg.hasAttribute(Kind: "amdgpu-hidden-argument"))
383 continue;
384
385 const bool IsByRef = Arg.hasByRefAttr();
386 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
387 Align Alignment = DL.getValueOrABITypeAlignment(
388 Alignment: IsByRef ? Arg.getParamAlign() : std::nullopt, Ty: ArgTy);
389 uint64_t AllocSize = DL.getTypeAllocSize(Ty: ArgTy);
390 ExplicitArgBytes = alignTo(Size: ExplicitArgBytes, A: Alignment) + AllocSize;
391 MaxAlign = std::max(a: MaxAlign, b: Alignment);
392 }
393
394 return ExplicitArgBytes;
395}
396
397unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
398 Align &MaxAlign) const {
399 if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
400 F.getCallingConv() != CallingConv::SPIR_KERNEL)
401 return 0;
402
403 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
404
405 unsigned ExplicitOffset = getExplicitKernelArgOffset();
406
407 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
408 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
409 if (ImplicitBytes != 0) {
410 const Align Alignment = getAlignmentForImplicitArgPtr();
411 TotalSize = alignTo(Size: ExplicitArgBytes, A: Alignment) + ImplicitBytes;
412 MaxAlign = std::max(a: MaxAlign, b: Alignment);
413 }
414
415 // Being able to dereference past the end is useful for emitting scalar loads.
416 return alignTo(Value: TotalSize, Align: 4);
417}
418
419AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
420 return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
421 : AMDGPUDwarfFlavour::Wave64;
422}
423
424const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
425 if (MF.getTarget().getTargetTriple().isAMDGCN())
426 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
427 return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
428}
429
430const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
431 if (TM.getTargetTriple().isAMDGCN())
432 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
433 return static_cast<const AMDGPUSubtarget &>(
434 TM.getSubtarget<R600Subtarget>(F));
435}
436
437// FIXME: This has no reason to be in subtarget
438SmallVector<unsigned>
439AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
440 return AMDGPU::getIntegerVecAttribute(F, Name: "amdgpu-max-num-workgroups", Size: 3,
441 DefaultVal: std::numeric_limits<uint32_t>::max());
442}
443