1//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Base class for AMDGPU specific classes of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
17#include "llvm/ADT/SmallVector.h"
18#include "llvm/IR/CallingConv.h"
19#include "llvm/Support/Alignment.h"
20#include "llvm/TargetParser/Triple.h"
21
22namespace llvm {
23
// Opaque enum declaration with a fixed underlying type: this is enough for
// getAMDGPUDwarfFlavour() to return the enum by value without this header
// needing the enumerator list.
enum AMDGPUDwarfFlavour : unsigned;

// These types are only named through references or pointers in this header,
// so forward declarations are sufficient.
class Function;
class Instruction;
class MachineFunction;
class TargetMachine;
29
30class AMDGPUSubtarget {
31public:
32 enum Generation {
33 INVALID = 0,
34 R600 = 1,
35 R700 = 2,
36 EVERGREEN = 3,
37 NORTHERN_ISLANDS = 4,
38 SOUTHERN_ISLANDS = 5,
39 SEA_ISLANDS = 6,
40 VOLCANIC_ISLANDS = 7,
41 GFX9 = 8,
42 GFX10 = 9,
43 GFX11 = 10,
44 GFX12 = 11,
45 };
46
47private:
48 Triple TargetTriple;
49
50protected:
51 bool GCN3Encoding = false;
52 bool Has16BitInsts = false;
53 bool HasTrue16BitInsts = false;
54 bool HasFP8ConversionScaleInsts = false;
55 bool HasBF8ConversionScaleInsts = false;
56 bool HasFP4ConversionScaleInsts = false;
57 bool HasFP6BF6ConversionScaleInsts = false;
58 bool HasF16BF16ToFP6BF6ConversionScaleInsts = false;
59 bool HasCvtPkF16F32Inst = false;
60 bool HasF32ToF16BF16ConversionSRInsts = false;
61 bool EnableRealTrue16Insts = false;
62 bool HasBF16ConversionInsts = false;
63 bool HasMadMixInsts = false;
64 bool HasMadMacF32Insts = false;
65 bool HasDsSrc2Insts = false;
66 bool HasSDWA = false;
67 bool HasVOP3PInsts = false;
68 bool HasMulI24 = true;
69 bool HasMulU24 = true;
70 bool HasSMulHi = false;
71 bool HasInv2PiInlineImm = false;
72 bool HasFminFmaxLegacy = true;
73 bool EnablePromoteAlloca = false;
74 bool HasTrigReducedRange = false;
75 bool FastFMAF32 = false;
76 unsigned EUsPerCU = 4;
77 unsigned MaxWavesPerEU = 10;
78 unsigned LocalMemorySize = 0;
79 unsigned AddressableLocalMemorySize = 0;
80 char WavefrontSizeLog2 = 0;
81
82public:
83 AMDGPUSubtarget(Triple TT);
84
85 static const AMDGPUSubtarget &get(const MachineFunction &MF);
86 static const AMDGPUSubtarget &get(const TargetMachine &TM,
87 const Function &F);
88
89 /// \returns Default range flat work group size for a calling convention.
90 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
91
92 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
93 /// for function \p F, or minimum/maximum flat work group sizes explicitly
94 /// requested using "amdgpu-flat-work-group-size" attribute attached to
95 /// function \p F.
96 ///
97 /// \returns Subtarget's default values if explicitly requested values cannot
98 /// be converted to integer, or violate subtarget's specifications.
99 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
100
101 /// \returns Subtarget's default pair of minimum/maximum number of waves per
102 /// execution unit for function \p F, or minimum/maximum number of waves per
103 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
104 /// attached to function \p F.
105 ///
106 /// \returns Subtarget's default values if explicitly requested values cannot
107 /// be converted to integer, violate subtarget's specifications, or are not
108 /// compatible with minimum/maximum number of waves limited by flat work group
109 /// size, register usage, and/or lds usage.
110 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
111
112 /// Overload which uses the specified values for the flat work group sizes,
113 /// rather than querying the function itself. \p FlatWorkGroupSizes Should
114 /// correspond to the function's value for getFlatWorkGroupSizes.
115 std::pair<unsigned, unsigned>
116 getWavesPerEU(const Function &F,
117 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
118
119 /// Overload which uses the specified values for the flat workgroup sizes and
120 /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
121 /// should correspond to the function's value for getFlatWorkGroupSizes and \p
122 /// LDSBytes to the per-workgroup LDS allocation.
123 std::pair<unsigned, unsigned>
124 getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
125 unsigned LDSBytes, const Function &F) const;
126
127 /// Returns the target minimum/maximum number of waves per EU. This is based
128 /// on the minimum/maximum number of \p RequestedWavesPerEU and further
129 /// limited by the maximum achievable occupancy derived from the range of \p
130 /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
131 std::pair<unsigned, unsigned>
132 getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
133 std::pair<unsigned, unsigned> FlatWorkGroupSizes,
134 unsigned LDSBytes) const;
135
136 /// Return the amount of LDS that can be used that will not restrict the
137 /// occupancy lower than WaveCount.
138 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
139 const Function &) const;
140
141 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
142 /// be achieved when the only function running on a CU is \p F and each
143 /// workgroup running the function requires \p LDSBytes bytes of LDS space.
144 /// This notably depends on the range of allowed flat group sizes for the
145 /// function and hardware characteristics.
146 std::pair<unsigned, unsigned>
147 getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const {
148 return getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes: getFlatWorkGroupSizes(F));
149 }
150
151 /// Overload which uses the specified values for the flat work group sizes,
152 /// rather than querying the function itself. \p FlatWorkGroupSizes should
153 /// correspond to the function's value for getFlatWorkGroupSizes.
154 std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
155 uint32_t LDSBytes,
156 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
157
158 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
159 /// be achieved when the only function running on a CU is \p MF. This notably
160 /// depends on the range of allowed flat group sizes for the function, the
161 /// amount of per-workgroup LDS space required by the function, and hardware
162 /// characteristics.
163 std::pair<unsigned, unsigned>
164 getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;
165
166 bool isAmdHsaOS() const {
167 return TargetTriple.getOS() == Triple::AMDHSA;
168 }
169
170 bool isAmdPalOS() const {
171 return TargetTriple.getOS() == Triple::AMDPAL;
172 }
173
174 bool isMesa3DOS() const {
175 return TargetTriple.getOS() == Triple::Mesa3D;
176 }
177
178 bool isMesaKernel(const Function &F) const;
179
180 bool isAmdHsaOrMesa(const Function &F) const {
181 return isAmdHsaOS() || isMesaKernel(F);
182 }
183
184 bool isGCN() const { return TargetTriple.isAMDGCN(); }
185
186 bool isGCN3Encoding() const {
187 return GCN3Encoding;
188 }
189
190 bool has16BitInsts() const {
191 return Has16BitInsts;
192 }
193
194 /// Return true if the subtarget supports True16 instructions.
195 bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
196
197 /// Return true if real (non-fake) variants of True16 instructions using
198 /// 16-bit registers should be code-generated. Fake True16 instructions are
199 /// identical to non-fake ones except that they take 32-bit registers as
200 /// operands and always use their low halves.
201 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
202 // supported and the support for fake True16 instructions is removed.
203 bool useRealTrue16Insts() const;
204
205 bool hasBF16ConversionInsts() const {
206 return HasBF16ConversionInsts;
207 }
208
209 bool hasMadMixInsts() const {
210 return HasMadMixInsts;
211 }
212
213 bool hasFP8ConversionScaleInsts() const { return HasFP8ConversionScaleInsts; }
214
215 bool hasBF8ConversionScaleInsts() const { return HasBF8ConversionScaleInsts; }
216
217 bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; }
218
219 bool hasFP6BF6ConversionScaleInsts() const {
220 return HasFP6BF6ConversionScaleInsts;
221 }
222
223 bool hasF16BF16ToFP6BF6ConversionScaleInsts() const {
224 return HasF16BF16ToFP6BF6ConversionScaleInsts;
225 }
226
227 bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; }
228
229 bool hasF32ToF16BF16ConversionSRInsts() const {
230 return HasF32ToF16BF16ConversionSRInsts;
231 }
232
233 bool hasMadMacF32Insts() const {
234 return HasMadMacF32Insts || !isGCN();
235 }
236
237 bool hasDsSrc2Insts() const {
238 return HasDsSrc2Insts;
239 }
240
241 bool hasSDWA() const {
242 return HasSDWA;
243 }
244
245 bool hasVOP3PInsts() const {
246 return HasVOP3PInsts;
247 }
248
249 bool hasMulI24() const {
250 return HasMulI24;
251 }
252
253 bool hasMulU24() const {
254 return HasMulU24;
255 }
256
257 bool hasSMulHi() const {
258 return HasSMulHi;
259 }
260
261 bool hasInv2PiInlineImm() const {
262 return HasInv2PiInlineImm;
263 }
264
265 bool hasFminFmaxLegacy() const {
266 return HasFminFmaxLegacy;
267 }
268
269 bool hasTrigReducedRange() const {
270 return HasTrigReducedRange;
271 }
272
273 bool hasFastFMAF32() const {
274 return FastFMAF32;
275 }
276
277 bool isPromoteAllocaEnabled() const {
278 return EnablePromoteAlloca;
279 }
280
281 unsigned getWavefrontSize() const {
282 return 1 << WavefrontSizeLog2;
283 }
284
285 unsigned getWavefrontSizeLog2() const {
286 return WavefrontSizeLog2;
287 }
288
289 /// Return the maximum number of bytes of LDS available for all workgroups
290 /// running on the same WGP or CU.
291 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
292 /// limited to 64k.
293 unsigned getLocalMemorySize() const {
294 return LocalMemorySize;
295 }
296
297 /// Return the maximum number of bytes of LDS that can be allocated to a
298 /// single workgroup.
299 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
300 /// 128k in total.
301 unsigned getAddressableLocalMemorySize() const {
302 return AddressableLocalMemorySize;
303 }
304
305 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
306 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
307 /// CU mode into account.
308 unsigned getEUsPerCU() const { return EUsPerCU; }
309
310 Align getAlignmentForImplicitArgPtr() const {
311 return isAmdHsaOS() ? Align(8) : Align(4);
312 }
313
314 /// Returns the offset in bytes from the start of the input buffer
315 /// of the first explicit kernel argument.
316 unsigned getExplicitKernelArgOffset() const {
317 switch (TargetTriple.getOS()) {
318 case Triple::AMDHSA:
319 case Triple::AMDPAL:
320 case Triple::Mesa3D:
321 return 0;
322 case Triple::UnknownOS:
323 default:
324 // For legacy reasons unknown/other is treated as a different version of
325 // mesa.
326 return 36;
327 }
328
329 llvm_unreachable("invalid triple OS");
330 }
331
332 /// \returns Maximum number of work groups per compute unit supported by the
333 /// subtarget and limited by given \p FlatWorkGroupSize.
334 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
335
336 /// \returns Minimum flat work group size supported by the subtarget.
337 virtual unsigned getMinFlatWorkGroupSize() const = 0;
338
339 /// \returns Maximum flat work group size supported by the subtarget.
340 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
341
342 /// \returns Number of waves per execution unit required to support the given
343 /// \p FlatWorkGroupSize.
344 virtual unsigned
345 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
346
347 /// \returns Minimum number of waves per execution unit supported by the
348 /// subtarget.
349 virtual unsigned getMinWavesPerEU() const = 0;
350
351 /// \returns Maximum number of waves per execution unit supported by the
352 /// subtarget without any kind of limitation.
353 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
354
355 /// Return the maximum workitem ID value in the function, for the given (0, 1,
356 /// 2) dimension.
357 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
358
359 /// Return the number of work groups for the function.
360 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
361
362 /// Return true if only a single workitem can be active in a wave.
363 bool isSingleLaneExecution(const Function &Kernel) const;
364
365 /// Creates value range metadata on an workitemid.* intrinsic call or load.
366 bool makeLIDRangeMetadata(Instruction *I) const;
367
368 /// \returns Number of bytes of arguments that are passed to a shader or
369 /// kernel in addition to the explicit ones declared for the function.
370 unsigned getImplicitArgNumBytes(const Function &F) const;
371 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
372 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
373
374 /// \returns Corresponding DWARF register number mapping flavour for the
375 /// \p WavefrontSize.
376 AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
377
378 virtual ~AMDGPUSubtarget() = default;
379};
380
381} // end namespace llvm
382
383#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
384