1//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Implements the AMDGPU specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUSubtarget.h"
15#include "AMDGPUCallLowering.h"
16#include "AMDGPUInstructionSelector.h"
17#include "AMDGPULegalizerInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "R600Subtarget.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
23#include "llvm/CodeGen/MachineScheduler.h"
24#include "llvm/CodeGen/TargetFrameLowering.h"
25#include "llvm/IR/DiagnosticInfo.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/IR/IntrinsicsR600.h"
28#include "llvm/IR/MDBuilder.h"
29#include <algorithm>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "amdgpu-subtarget"
34
// Base-subtarget constructor: only records the target triple; all feature
// state is initialized by the derived subtargets (GCNSubtarget/R600Subtarget).
AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}
36
37bool AMDGPUSubtarget::useRealTrue16Insts() const {
38 return hasTrue16BitInsts() && EnableRealTrue16Insts;
39}
40
41// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
42// allows the given function to achieve an occupancy of NWaves waves per
43// SIMD / EU, taking into account only the function's *maximum* workgroup size.
44unsigned
45AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
46 const Function &F) const {
47 const unsigned WaveSize = getWavefrontSize();
48 const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
49 const unsigned WavesPerWorkgroup =
50 std::max(a: 1u, b: (WorkGroupSize + WaveSize - 1) / WaveSize);
51
52 const unsigned WorkGroupsPerCU =
53 std::max(a: 1u, b: (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
54
55 return getLocalMemorySize() / WorkGroupsPerCU;
56}
57
58std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
59 uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
60
61 // FIXME: We should take into account the LDS allocation granularity.
62 const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(a: LDSBytes, b: 1u);
63
64 // Queried LDS size may be larger than available on a CU, in which case we
65 // consider the only achievable occupancy to be 1, in line with what we
66 // consider the occupancy to be when the number of requested registers in a
67 // particular bank is higher than the number of available ones in that bank.
68 if (!MaxWGsLDS)
69 return {1, 1};
70
71 const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();
72
73 auto PropsFromWGSize = [=](unsigned WGSize)
74 -> std::tuple<const unsigned, const unsigned, unsigned> {
75 unsigned WavesPerWG = divideCeil(Numerator: WGSize, Denominator: WaveSize);
76 unsigned WGsPerCU = std::min(a: getMaxWorkGroupsPerCU(FlatWorkGroupSize: WGSize), b: MaxWGsLDS);
77 return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
78 };
79
80 // The maximum group size will generally yield the minimum number of
81 // workgroups, maximum number of waves, and minimum occupancy. The opposite is
82 // generally true for the minimum group size. LDS or barrier ressource
83 // limitations can flip those minimums/maximums.
84 const auto [MinWGSize, MaxWGSize] = FlatWorkGroupSizes;
85 auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
86 auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);
87
88 // It is possible that we end up with flipped minimum and maximum number of
89 // waves per CU when the number of minimum/maximum concurrent groups on the CU
90 // is limited by LDS usage or barrier resources.
91 if (MinWavesPerCU >= MaxWavesPerCU) {
92 std::swap(a&: MinWavesPerCU, b&: MaxWavesPerCU);
93 } else {
94 const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();
95
96 // Look for a potential smaller group size than the maximum which decreases
97 // the concurrent number of waves on the CU for the same number of
98 // concurrent workgroups on the CU.
99 unsigned MinWavesPerCUForWGSize =
100 divideCeil(Numerator: WaveSlotsPerCU, Denominator: MinWGsPerCU + 1) * MinWGsPerCU;
101 if (MinWavesPerCU > MinWavesPerCUForWGSize) {
102 unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
103 if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
104 // There may exist a smaller group size than the maximum that achieves
105 // the minimum number of waves per CU. This group size is the largest
106 // possible size that requires MaxWavesPerWG - E waves where E is
107 // maximized under the following constraints.
108 // 1. 0 <= E <= ExcessSlotsPerWG
109 // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
110 MinWavesPerCU -= MinWGsPerCU * std::min(a: ExcessSlotsPerWG,
111 b: MaxWavesPerWG - MinWavesPerWG);
112 }
113 }
114
115 // Look for a potential larger group size than the minimum which increases
116 // the concurrent number of waves on the CU for the same number of
117 // concurrent workgroups on the CU.
118 unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
119 if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
120 // There may exist a larger group size than the minimum that achieves the
121 // maximum number of waves per CU. This group size is the smallest
122 // possible size that requires MinWavesPerWG + L waves where L is
123 // maximized under the following constraints.
124 // 1. 0 <= L <= LeftoverSlotsPerWG
125 // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
126 MaxWavesPerCU += MaxWGsPerCU * std::min(a: LeftoverSlotsPerWG,
127 b: ((MaxWGSize - 1) / WaveSize) + 1 -
128 MinWavesPerWG);
129 }
130 }
131
132 // Return the minimum/maximum number of waves on any EU, assuming that all
133 // wavefronts are spread across all EUs as evenly as possible.
134 return {std::clamp(val: MinWavesPerCU / getEUsPerCU(), lo: 1U, hi: WavesPerEU),
135 std::clamp(val: divideCeil(Numerator: MaxWavesPerCU, Denominator: getEUsPerCU()), lo: 1U, hi: WavesPerEU)};
136}
137
138std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
139 const MachineFunction &MF) const {
140 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
141 return getOccupancyWithWorkGroupSizes(LDSBytes: MFI->getLDSSize(), F: MF.getFunction());
142}
143
144std::pair<unsigned, unsigned>
145AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
146 switch (CC) {
147 case CallingConv::AMDGPU_VS:
148 case CallingConv::AMDGPU_LS:
149 case CallingConv::AMDGPU_HS:
150 case CallingConv::AMDGPU_ES:
151 case CallingConv::AMDGPU_GS:
152 case CallingConv::AMDGPU_PS:
153 return std::pair(1, getWavefrontSize());
154 default:
155 return std::pair(1u, getMaxFlatWorkGroupSize());
156 }
157}
158
159std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
160 const Function &F) const {
161 // Default minimum/maximum flat work group sizes.
162 std::pair<unsigned, unsigned> Default =
163 getDefaultFlatWorkGroupSize(CC: F.getCallingConv());
164
165 // Requested minimum/maximum flat work group sizes.
166 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
167 F, Name: "amdgpu-flat-work-group-size", Default);
168
169 // Make sure requested minimum is less than requested maximum.
170 if (Requested.first > Requested.second)
171 return Default;
172
173 // Make sure requested values do not violate subtarget's specifications.
174 if (Requested.first < getMinFlatWorkGroupSize())
175 return Default;
176 if (Requested.second > getMaxFlatWorkGroupSize())
177 return Default;
178
179 return Requested;
180}
181
182std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
183 std::pair<unsigned, unsigned> RequestedWavesPerEU,
184 std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes) const {
185 // Default minimum/maximum number of waves per EU. The range of flat workgroup
186 // sizes limits the achievable maximum, and we aim to support enough waves per
187 // EU so that we can concurrently execute all waves of a single workgroup of
188 // maximum size on a CU.
189 std::pair<unsigned, unsigned> Default = {
190 getWavesPerEUForWorkGroup(FlatWorkGroupSize: FlatWorkGroupSizes.second),
191 getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
192 Default.first = std::min(a: Default.first, b: Default.second);
193
194 // Make sure requested minimum is within the default range and lower than the
195 // requested maximum. The latter must not violate target specification.
196 if (RequestedWavesPerEU.first < Default.first ||
197 RequestedWavesPerEU.first > Default.second ||
198 RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
199 RequestedWavesPerEU.second > getMaxWavesPerEU())
200 return Default;
201
202 // We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
203 RequestedWavesPerEU.second =
204 std::min(a: RequestedWavesPerEU.second, b: Default.second);
205 return RequestedWavesPerEU;
206}
207
208std::pair<unsigned, unsigned>
209AMDGPUSubtarget::getWavesPerEU(const Function &F) const {
210 // Default/requested minimum/maximum flat work group sizes.
211 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
212 // Minimum number of bytes allocated in the LDS.
213 unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-lds-size",
214 Default: {0, UINT32_MAX}, OnlyFirstRequired: true)
215 .first;
216 return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
217}
218
219std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
220 const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
221 // Minimum number of bytes allocated in the LDS.
222 unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-lds-size",
223 Default: {0, UINT32_MAX}, OnlyFirstRequired: true)
224 .first;
225 return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
226}
227
228std::pair<unsigned, unsigned>
229AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
230 unsigned LDSBytes, const Function &F) const {
231 // Default minimum/maximum number of waves per execution unit.
232 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
233
234 // Requested minimum/maximum number of waves per execution unit.
235 std::pair<unsigned, unsigned> Requested =
236 AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu", Default, OnlyFirstRequired: true);
237 return getEffectiveWavesPerEU(RequestedWavesPerEU: Requested, FlatWorkGroupSizes, LDSBytes);
238}
239
240static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
241 auto *Node = Kernel.getMetadata(Kind: "reqd_work_group_size");
242 if (Node && Node->getNumOperands() == 3)
243 return mdconst::extract<ConstantInt>(MD: Node->getOperand(I: Dim))->getZExtValue();
244 return std::numeric_limits<unsigned>::max();
245}
246
247bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
248 return isMesa3DOS() && !AMDGPU::isShader(CC: F.getCallingConv());
249}
250
251unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
252 unsigned Dimension) const {
253 unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dim: Dimension);
254 if (ReqdSize != std::numeric_limits<unsigned>::max())
255 return ReqdSize - 1;
256 return getFlatWorkGroupSizes(F: Kernel).second - 1;
257}
258
259bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
260 for (int I = 0; I < 3; ++I) {
261 if (getMaxWorkitemID(Kernel: Func, Dimension: I) > 0)
262 return false;
263 }
264
265 return true;
266}
267
268bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
269 Function *Kernel = I->getParent()->getParent();
270 unsigned MinSize = 0;
271 unsigned MaxSize = getFlatWorkGroupSizes(F: *Kernel).second;
272 bool IdQuery = false;
273
274 // If reqd_work_group_size is present it narrows value down.
275 if (auto *CI = dyn_cast<CallInst>(Val: I)) {
276 const Function *F = CI->getCalledFunction();
277 if (F) {
278 unsigned Dim = UINT_MAX;
279 switch (F->getIntrinsicID()) {
280 case Intrinsic::amdgcn_workitem_id_x:
281 case Intrinsic::r600_read_tidig_x:
282 IdQuery = true;
283 [[fallthrough]];
284 case Intrinsic::r600_read_local_size_x:
285 Dim = 0;
286 break;
287 case Intrinsic::amdgcn_workitem_id_y:
288 case Intrinsic::r600_read_tidig_y:
289 IdQuery = true;
290 [[fallthrough]];
291 case Intrinsic::r600_read_local_size_y:
292 Dim = 1;
293 break;
294 case Intrinsic::amdgcn_workitem_id_z:
295 case Intrinsic::r600_read_tidig_z:
296 IdQuery = true;
297 [[fallthrough]];
298 case Intrinsic::r600_read_local_size_z:
299 Dim = 2;
300 break;
301 default:
302 break;
303 }
304
305 if (Dim <= 3) {
306 unsigned ReqdSize = getReqdWorkGroupSize(Kernel: *Kernel, Dim);
307 if (ReqdSize != std::numeric_limits<unsigned>::max())
308 MinSize = MaxSize = ReqdSize;
309 }
310 }
311 }
312
313 if (!MaxSize)
314 return false;
315
316 // Range metadata is [Lo, Hi). For ID query we need to pass max size
317 // as Hi. For size query we need to pass Hi + 1.
318 if (IdQuery)
319 MinSize = 0;
320 else
321 ++MaxSize;
322
323 APInt Lower{32, MinSize};
324 APInt Upper{32, MaxSize};
325 if (auto *CI = dyn_cast<CallBase>(Val: I)) {
326 ConstantRange Range(Lower, Upper);
327 CI->addRangeRetAttr(CR: Range);
328 } else {
329 MDBuilder MDB(I->getContext());
330 MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lo: Lower, Hi: Upper);
331 I->setMetadata(KindID: LLVMContext::MD_range, Node: MaxWorkGroupSizeRange);
332 }
333 return true;
334}
335
336unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
337 assert(AMDGPU::isKernel(F.getCallingConv()));
338
339 // We don't allocate the segment if we know the implicit arguments weren't
340 // used, even if the ABI implies we need them.
341 if (F.hasFnAttribute(Kind: "amdgpu-no-implicitarg-ptr"))
342 return 0;
343
344 if (isMesaKernel(F))
345 return 16;
346
347 // Assume all implicit inputs are used by default
348 const Module *M = F.getParent();
349 unsigned NBytes =
350 AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
351 return F.getFnAttributeAsParsedInteger(Kind: "amdgpu-implicitarg-num-bytes",
352 Default: NBytes);
353}
354
355uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
356 Align &MaxAlign) const {
357 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
358 F.getCallingConv() == CallingConv::SPIR_KERNEL);
359
360 const DataLayout &DL = F.getDataLayout();
361 uint64_t ExplicitArgBytes = 0;
362 MaxAlign = Align(1);
363
364 for (const Argument &Arg : F.args()) {
365 if (Arg.hasAttribute(Kind: "amdgpu-hidden-argument"))
366 continue;
367
368 const bool IsByRef = Arg.hasByRefAttr();
369 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
370 Align Alignment = DL.getValueOrABITypeAlignment(
371 Alignment: IsByRef ? Arg.getParamAlign() : std::nullopt, Ty: ArgTy);
372 uint64_t AllocSize = DL.getTypeAllocSize(Ty: ArgTy);
373 ExplicitArgBytes = alignTo(Size: ExplicitArgBytes, A: Alignment) + AllocSize;
374 MaxAlign = std::max(a: MaxAlign, b: Alignment);
375 }
376
377 return ExplicitArgBytes;
378}
379
380unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
381 Align &MaxAlign) const {
382 if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
383 F.getCallingConv() != CallingConv::SPIR_KERNEL)
384 return 0;
385
386 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
387
388 unsigned ExplicitOffset = getExplicitKernelArgOffset();
389
390 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
391 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
392 if (ImplicitBytes != 0) {
393 const Align Alignment = getAlignmentForImplicitArgPtr();
394 TotalSize = alignTo(Size: ExplicitArgBytes, A: Alignment) + ImplicitBytes;
395 MaxAlign = std::max(a: MaxAlign, b: Alignment);
396 }
397
398 // Being able to dereference past the end is useful for emitting scalar loads.
399 return alignTo(Value: TotalSize, Align: 4);
400}
401
402AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
403 return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
404 : AMDGPUDwarfFlavour::Wave64;
405}
406
407const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
408 if (MF.getTarget().getTargetTriple().isAMDGCN())
409 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
410 return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
411}
412
413const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
414 if (TM.getTargetTriple().isAMDGCN())
415 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
416 return static_cast<const AMDGPUSubtarget &>(
417 TM.getSubtarget<R600Subtarget>(F));
418}
419
420// FIXME: This has no reason to be in subtarget
421SmallVector<unsigned>
422AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
423 return AMDGPU::getIntegerVecAttribute(F, Name: "amdgpu-max-num-workgroups", Size: 3,
424 DefaultVal: std::numeric_limits<uint32_t>::max());
425}
426