//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
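//
// A worked example under assumed, hypothetical parameters: with a wavefront
// size of 64, a maximum workgroup size of 256, 4 EUs per CU, and NWaves = 8,
// each workgroup needs ceil(256 / 64) = 4 waves, a CU can hold
// (8 * 4) / 4 = 8 such workgroups, and each may therefore use at most
// getLocalMemorySize() / 8 bytes of LDS.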
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
    uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {

  // FIXME: We should take into account the LDS allocation granularity.
  const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);

  // Queried LDS size may be larger than available on a CU, in which case we
  // consider the only achievable occupancy to be 1, in line with what we
  // consider the occupancy to be when the number of requested registers in a
  // particular bank is higher than the number of available ones in that bank.
  if (!MaxWGsLDS)
    return {1, 1};

  const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();

  auto PropsFromWGSize = [=](unsigned WGSize)
      -> std::tuple<const unsigned, const unsigned, unsigned> {
    unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
    unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);
    return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
  };

  // The maximum group size will generally yield the minimum number of
  // workgroups, maximum number of waves, and minimum occupancy. The opposite is
  // generally true for the minimum group size. LDS or barrier resource
  // limitations can flip those minimums/maximums.
  const auto [MinWGSize, MaxWGSize] = FlatWorkGroupSizes;
  auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
  auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);

  // It is possible that we end up with flipped minimum and maximum number of
  // waves per CU when the number of minimum/maximum concurrent groups on the CU
  // is limited by LDS usage or barrier resources.
  if (MinWavesPerCU >= MaxWavesPerCU) {
    std::swap(MinWavesPerCU, MaxWavesPerCU);
  } else {
    const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();

    // Look for a potential smaller group size than the maximum which decreases
    // the concurrent number of waves on the CU for the same number of
    // concurrent workgroups on the CU.
    unsigned MinWavesPerCUForWGSize =
        divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
    if (MinWavesPerCU > MinWavesPerCUForWGSize) {
      unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
      if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
        // There may exist a smaller group size than the maximum that achieves
        // the minimum number of waves per CU. This group size is the largest
        // possible size that requires MaxWavesPerWG - E waves where E is
        // maximized under the following constraints.
        // 1. 0 <= E <= ExcessSlotsPerWG
        // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
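        // E.g., with hypothetical values ExcessSlotsPerWG = 3,
        // MaxWavesPerWG = 8, and MinWavesPerWG = 2: E = min(3, 8 - 2) = 3,
        // so the minimum drops by 3 * MinWGsPerCU waves.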
        MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
                                                MaxWavesPerWG - MinWavesPerWG);
      }
    }

    // Look for a potential larger group size than the minimum which increases
    // the concurrent number of waves on the CU for the same number of
    // concurrent workgroups on the CU.
    unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
    if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
      // There may exist a larger group size than the minimum that achieves the
      // maximum number of waves per CU. This group size is the smallest
      // possible size that requires MinWavesPerWG + L waves where L is
      // maximized under the following constraints.
      // 1. 0 <= L <= LeftoverSlotsPerWG
      // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
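      // E.g., with hypothetical values LeftoverSlotsPerWG = 5,
      // MaxWGSize = 256, WaveSize = 64, and MinWavesPerWG = 2: constraint 2
      // caps L at ((256 - 1) / 64) + 1 - 2 = 2, so L = min(5, 2) = 2.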
      MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
                                              ((MaxWGSize - 1) / WaveSize) + 1 -
                                                  MinWavesPerWG);
    }
  }

  // Return the minimum/maximum number of waves on any EU, assuming that all
  // wavefronts are spread across all EUs as evenly as possible.
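  // E.g., assuming 4 EUs per CU: MinWavesPerCU = 10 spreads to 10 / 4 = 2
  // waves on the least-loaded EU, while MaxWavesPerCU = 10 puts
  // divideCeil(10, 4) = 3 on the busiest one (both clamped to
  // [1, WavesPerEU]).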
  return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),
          std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U,
                     WavesPerEU)};
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
    const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
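  // The attribute value encodes "min,max"; e.g., a function annotated with
  // "amdgpu-flat-work-group-size"="128,256" requests workgroups of between
  // 128 and 256 work items.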
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> RequestedWavesPerEU,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes) const {
  // Default minimum/maximum number of waves per EU. The range of flat workgroup
  // sizes limits the achievable maximum, and we aim to support enough waves per
  // EU so that we can concurrently execute all waves of a single workgroup of
  // maximum size on a CU.
  std::pair<unsigned, unsigned> Default = {
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
      getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
  Default.first = std::min(Default.first, Default.second);

  // Make sure requested minimum is within the default range and lower than the
  // requested maximum. The latter must not violate target specification.
  if (RequestedWavesPerEU.first < Default.first ||
      RequestedWavesPerEU.first > Default.second ||
      RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
      RequestedWavesPerEU.second > getMaxWavesPerEU())
    return Default;

  // We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
  RequestedWavesPerEU.second =
      std::min(RequestedWavesPerEU.second, Default.second);
  return RequestedWavesPerEU;
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getWavesPerEU(const Function &F) const {
  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
  // Minimum number of bytes allocated in the LDS.
  unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
                                                      {0, UINT32_MAX}, true)
                          .first;
  return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Minimum number of bytes allocated in the LDS.
  unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
                                                      {0, UINT32_MAX}, true)
                          .first;
  return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
                               unsigned LDSBytes, const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
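  // The attribute value is "min[,max]"; e.g., "amdgpu-waves-per-eu"="2,4"
  // requests between 2 and 4 waves per EU, while "amdgpu-waves-per-eu"="2"
  // constrains only the minimum.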
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes);
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
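  // The OpenCL-style metadata holds one constant per dimension; e.g., a kernel
  // carrying !reqd_work_group_size !{i32 256, i32 1, i32 1} yields 256 for
  // Dim == 0.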
  auto *Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
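  // E.g., with a required workgroup size of 256, an ID query gets the range
  // [0, 256) while a size query gets [256, 257).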
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    if (Arg.hasAttribute("amdgpu-hidden-argument"))
      continue;

    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
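  // E.g., hypothetically, 36 bytes of explicit arguments with an 8-byte
  // implicit-arg alignment give TotalSize = alignTo(36, 8) + ImplicitBytes =
  // 40 + ImplicitBytes, which is then rounded up to a multiple of 4.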
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().isAMDGCN())
    return static_cast<const AMDGPUSubtarget &>(
        MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(
      MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().isAMDGCN())
    return static_cast<const AMDGPUSubtarget &>(
        TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}

// FIXME: This has no reason to be in subtarget
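// E.g., a kernel annotated with "amdgpu-max-num-workgroups"="16,8,1" bounds
// the dispatch to at most 16x8x1 workgroups; when the attribute is absent,
// each dimension defaults to UINT32_MAX, i.e. effectively unbounded.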
SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3,
                                        std::numeric_limits<uint32_t>::max());
}