1//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Base class for AMDGPU specific classes of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
17#include "llvm/ADT/SmallVector.h"
18#include "llvm/IR/CallingConv.h"
19#include "llvm/Support/Alignment.h"
20#include "llvm/TargetParser/Triple.h"
21
22namespace llvm {
23
24enum AMDGPUDwarfFlavour : unsigned;
25class Function;
26class Instruction;
27class MachineFunction;
28class TargetMachine;
29
30class AMDGPUSubtarget {
31public:
32 enum Generation {
33 INVALID = 0,
34 R600 = 1,
35 R700 = 2,
36 EVERGREEN = 3,
37 NORTHERN_ISLANDS = 4,
38 SOUTHERN_ISLANDS = 5,
39 SEA_ISLANDS = 6,
40 VOLCANIC_ISLANDS = 7,
41 GFX9 = 8,
42 GFX10 = 9,
43 GFX11 = 10,
44 GFX12 = 11,
45 GFX13 = 12,
46 };
47
48private:
49 Triple TargetTriple;
50
51protected:
52 bool HasMulI24 = true;
53 bool HasMulU24 = true;
54 bool HasSMulHi = false;
55 bool HasFminFmaxLegacy = true;
56
57 unsigned EUsPerCU = 4;
58 unsigned MaxWavesPerEU = 10;
59 unsigned LocalMemorySize = 0;
60 unsigned AddressableLocalMemorySize = 0;
61 char WavefrontSizeLog2 = 0;
62
63public:
64 AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}
65
66 static const AMDGPUSubtarget &get(const MachineFunction &MF);
67 static const AMDGPUSubtarget &get(const TargetMachine &TM,
68 const Function &F);
69
70 /// \returns Default range flat work group size for a calling convention.
71 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
72
73 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
74 /// for function \p F, or minimum/maximum flat work group sizes explicitly
75 /// requested using "amdgpu-flat-work-group-size" attribute attached to
76 /// function \p F.
77 ///
78 /// \returns Subtarget's default values if explicitly requested values cannot
79 /// be converted to integer, or violate subtarget's specifications.
80 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
81
82 /// \returns The required size of workgroups that will be used to execute \p F
83 /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
84 /// metadata. Otherwise, returns std::nullopt.
85 std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
86 unsigned Dim) const;
87
88 /// \returns true if \p F will execute in a manner that leaves the X
89 /// dimensions of the workitem ID evenly tiling wavefronts - that is, if X /
90 /// wavefrontsize is uniform. This is true if either the Y and Z block
91 /// dimensions are known to always be 1 or if the X dimension will always be a
92 /// power of 2. If \p RequireUniformYZ is true, it also ensures that the Y and
93 /// Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
94 /// wavesize64 would ordinarily pass this test, it won't with
95 /// \pRequiresUniformYZ).
96 ///
97 /// This information is currently only gathered from the !reqd_work_group_size
98 /// metadata on \p F, but this may be improved in the future.
99 bool hasWavefrontsEvenlySplittingXDim(const Function &F,
100 bool REquiresUniformYZ = false) const;
101
102 /// \returns Subtarget's default pair of minimum/maximum number of waves per
103 /// execution unit for function \p F, or minimum/maximum number of waves per
104 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
105 /// attached to function \p F.
106 ///
107 /// \returns Subtarget's default values if explicitly requested values cannot
108 /// be converted to integer, violate subtarget's specifications, or are not
109 /// compatible with minimum/maximum number of waves limited by flat work group
110 /// size, register usage, and/or lds usage.
111 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
112
113 /// Overload which uses the specified values for the flat work group sizes,
114 /// rather than querying the function itself. \p FlatWorkGroupSizes Should
115 /// correspond to the function's value for getFlatWorkGroupSizes.
116 std::pair<unsigned, unsigned>
117 getWavesPerEU(const Function &F,
118 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
119
120 /// Overload which uses the specified values for the flat workgroup sizes and
121 /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
122 /// should correspond to the function's value for getFlatWorkGroupSizes and \p
123 /// LDSBytes to the per-workgroup LDS allocation.
124 std::pair<unsigned, unsigned>
125 getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
126 unsigned LDSBytes, const Function &F) const;
127
128 /// Returns the target minimum/maximum number of waves per EU. This is based
129 /// on the minimum/maximum number of \p RequestedWavesPerEU and further
130 /// limited by the maximum achievable occupancy derived from the range of \p
131 /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
132 std::pair<unsigned, unsigned>
133 getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
134 std::pair<unsigned, unsigned> FlatWorkGroupSizes,
135 unsigned LDSBytes) const;
136
137 /// Return the amount of LDS that can be used that will not restrict the
138 /// occupancy lower than WaveCount.
139 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
140 const Function &) const;
141
142 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
143 /// be achieved when the only function running on a CU is \p F and each
144 /// workgroup running the function requires \p LDSBytes bytes of LDS space.
145 /// This notably depends on the range of allowed flat group sizes for the
146 /// function and hardware characteristics.
147 std::pair<unsigned, unsigned>
148 getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const {
149 return getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes: getFlatWorkGroupSizes(F));
150 }
151
152 /// Overload which uses the specified values for the flat work group sizes,
153 /// rather than querying the function itself. \p FlatWorkGroupSizes should
154 /// correspond to the function's value for getFlatWorkGroupSizes.
155 std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
156 uint32_t LDSBytes,
157 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
158
159 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
160 /// be achieved when the only function running on a CU is \p MF. This notably
161 /// depends on the range of allowed flat group sizes for the function, the
162 /// amount of per-workgroup LDS space required by the function, and hardware
163 /// characteristics.
164 std::pair<unsigned, unsigned>
165 getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;
166
167 bool isAmdHsaOS() const {
168 return TargetTriple.getOS() == Triple::AMDHSA;
169 }
170
171 bool isAmdPalOS() const {
172 return TargetTriple.getOS() == Triple::AMDPAL;
173 }
174
175 bool isMesa3DOS() const {
176 return TargetTriple.getOS() == Triple::Mesa3D;
177 }
178
179 bool isMesaKernel(const Function &F) const;
180
181 bool isAmdHsaOrMesa(const Function &F) const {
182 return isAmdHsaOS() || isMesaKernel(F);
183 }
184
185 bool isGCN() const { return TargetTriple.isAMDGCN(); }
186
187 //==---------------------------------------------------------------------===//
188 // TableGen-generated feature getters.
189 //==---------------------------------------------------------------------===//
190#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
191 virtual bool GETTER() const { return false; }
192#include "AMDGPUGenSubtargetInfo.inc"
193 //==---------------------------------------------------------------------===//
194
195 /// Return true if real (non-fake) variants of True16 instructions using
196 /// 16-bit registers should be code-generated. Fake True16 instructions are
197 /// identical to non-fake ones except that they take 32-bit registers as
198 /// operands and always use their low halves.
199 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
200 // supported and the support for fake True16 instructions is removed.
201 bool useRealTrue16Insts() const {
202 return hasTrue16BitInsts() && enableRealTrue16Insts();
203 }
204
205 bool hasMulI24() const {
206 return HasMulI24;
207 }
208
209 bool hasMulU24() const {
210 return HasMulU24;
211 }
212
213 bool hasSMulHi() const {
214 return HasSMulHi;
215 }
216
217 bool hasFminFmaxLegacy() const {
218 return HasFminFmaxLegacy;
219 }
220
221 unsigned getWavefrontSize() const {
222 return 1 << WavefrontSizeLog2;
223 }
224
225 unsigned getWavefrontSizeLog2() const {
226 return WavefrontSizeLog2;
227 }
228
229 /// Return the maximum number of bytes of LDS available for all workgroups
230 /// running on the same WGP or CU.
231 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
232 /// limited to 64k.
233 unsigned getLocalMemorySize() const {
234 return LocalMemorySize;
235 }
236
237 /// Return the maximum number of bytes of LDS that can be allocated to a
238 /// single workgroup.
239 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
240 /// 128k in total.
241 unsigned getAddressableLocalMemorySize() const {
242 return AddressableLocalMemorySize;
243 }
244
245 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
246 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
247 /// CU mode into account.
248 unsigned getEUsPerCU() const { return EUsPerCU; }
249
250 Align getAlignmentForImplicitArgPtr() const {
251 return isAmdHsaOS() ? Align(8) : Align(4);
252 }
253
254 /// Returns the offset in bytes from the start of the input buffer
255 /// of the first explicit kernel argument.
256 unsigned getExplicitKernelArgOffset() const {
257 switch (TargetTriple.getOS()) {
258 case Triple::AMDHSA:
259 case Triple::AMDPAL:
260 case Triple::Mesa3D:
261 return 0;
262 case Triple::UnknownOS:
263 default:
264 // For legacy reasons unknown/other is treated as a different version of
265 // mesa.
266 return 36;
267 }
268
269 llvm_unreachable("invalid triple OS");
270 }
271
272 /// \returns Maximum number of work groups per compute unit supported by the
273 /// subtarget and limited by given \p FlatWorkGroupSize.
274 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
275
276 /// \returns Minimum flat work group size supported by the subtarget.
277 virtual unsigned getMinFlatWorkGroupSize() const = 0;
278
279 /// \returns Maximum flat work group size supported by the subtarget.
280 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
281
282 /// \returns Number of waves per execution unit required to support the given
283 /// \p FlatWorkGroupSize.
284 virtual unsigned
285 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
286
287 /// \returns Minimum number of waves per execution unit supported by the
288 /// subtarget.
289 virtual unsigned getMinWavesPerEU() const = 0;
290
291 /// \returns Maximum number of waves per execution unit supported by the
292 /// subtarget without any kind of limitation.
293 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
294
295 /// Return the maximum workitem ID value in the function, for the given (0, 1,
296 /// 2) dimension.
297 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
298
299 /// Return the number of work groups for the function.
300 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
301
302 /// Return true if only a single workitem can be active in a wave.
303 bool isSingleLaneExecution(const Function &Kernel) const;
304
305 /// Creates value range metadata on an workitemid.* intrinsic call or load.
306 bool makeLIDRangeMetadata(Instruction *I) const;
307
308 /// \returns Number of bytes of arguments that are passed to a shader or
309 /// kernel in addition to the explicit ones declared for the function.
310 unsigned getImplicitArgNumBytes(const Function &F) const;
311 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
312 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
313
314 /// \returns Corresponding DWARF register number mapping flavour for the
315 /// \p WavefrontSize.
316 AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
317
318 virtual ~AMDGPUSubtarget() = default;
319};
320
321} // end namespace llvm
322
323#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
324