//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Base class for AMDGPU specific classes of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/TargetParser/Triple.h"
#include <cstdint>
#include <optional>
#include <utility>
21
22namespace llvm {
23
24enum AMDGPUDwarfFlavour : unsigned;
25class Function;
26class Instruction;
27class MachineFunction;
28class TargetMachine;
29
30class AMDGPUSubtarget {
31public:
32 enum Generation {
33 INVALID = 0,
34 R600 = 1,
35 R700 = 2,
36 EVERGREEN = 3,
37 NORTHERN_ISLANDS = 4,
38 SOUTHERN_ISLANDS = 5,
39 SEA_ISLANDS = 6,
40 VOLCANIC_ISLANDS = 7,
41 GFX9 = 8,
42 GFX10 = 9,
43 GFX11 = 10,
44 GFX12 = 11,
45 GFX13 = 12,
46 };
47
48private:
49 Triple TargetTriple;
50
51protected:
52 bool HasMulI24 = true;
53 bool HasMulU24 = true;
54 bool HasSMulHi = false;
55 bool HasFminFmaxLegacy = true;
56
57 unsigned EUsPerCU = 4;
58 unsigned MaxWavesPerEU = 10;
59 unsigned LocalMemorySize = 0;
60 unsigned AddressableLocalMemorySize = 0;
61 char WavefrontSizeLog2 = 0;
62
63public:
64 AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}
65
66 static const AMDGPUSubtarget &get(const MachineFunction &MF);
67 static const AMDGPUSubtarget &get(const TargetMachine &TM,
68 const Function &F);
69
70 /// \returns Default range flat work group size for a calling convention.
71 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
72
73 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
74 /// for function \p F, or minimum/maximum flat work group sizes explicitly
75 /// requested using "amdgpu-flat-work-group-size" attribute attached to
76 /// function \p F.
77 ///
78 /// \returns Subtarget's default values if explicitly requested values cannot
79 /// be converted to integer, or violate subtarget's specifications.
80 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
81
82 /// \returns The required size of workgroups that will be used to execute \p F
83 /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
84 /// metadata. Otherwise, returns std::nullopt.
85 std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
86 unsigned Dim) const;
87
88 /// \returns true if \p F will execute in a manner that leaves the X
89 /// dimensions of the workitem ID evenly tiling wavefronts - that is, if X /
90 /// wavefrontsize is uniform. This is true if either the Y and Z block
91 /// dimensions are known to always be 1 or if the X dimension will always be a
92 /// power of 2. If \p RequireUniformYZ is true, it also ensures that the Y and
93 /// Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
94 /// wavesize64 would ordinarily pass this test, it won't with
95 /// \pRequiresUniformYZ).
96 ///
97 /// This information is currently only gathered from the !reqd_work_group_size
98 /// metadata on \p F, but this may be improved in the future.
99 bool hasWavefrontsEvenlySplittingXDim(const Function &F,
100 bool REquiresUniformYZ = false) const;
101
102 /// \returns Subtarget's default pair of minimum/maximum number of waves per
103 /// execution unit for function \p F, or minimum/maximum number of waves per
104 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
105 /// attached to function \p F.
106 ///
107 /// \returns Subtarget's default values if explicitly requested values cannot
108 /// be converted to integer, violate subtarget's specifications, or are not
109 /// compatible with minimum/maximum number of waves limited by flat work group
110 /// size, register usage, and/or lds usage.
111 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
112
113 /// Overload which uses the specified values for the flat workgroup sizes and
114 /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
115 /// should correspond to the function's value for getFlatWorkGroupSizes and \p
116 /// LDSBytes to the per-workgroup LDS allocation.
117 std::pair<unsigned, unsigned>
118 getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
119 unsigned LDSBytes, const Function &F) const;
120
121 /// Returns the target minimum/maximum number of waves per EU. This is based
122 /// on the minimum/maximum number of \p RequestedWavesPerEU and further
123 /// limited by the maximum achievable occupancy derived from the range of \p
124 /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
125 std::pair<unsigned, unsigned>
126 getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
127 std::pair<unsigned, unsigned> FlatWorkGroupSizes,
128 unsigned LDSBytes) const;
129
130 /// Return the amount of LDS that can be used that will not restrict the
131 /// occupancy lower than WaveCount.
132 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
133 const Function &) const;
134
135 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
136 /// be achieved when the only function running on a CU is \p F and each
137 /// workgroup running the function requires \p LDSBytes bytes of LDS space.
138 /// This notably depends on the range of allowed flat group sizes for the
139 /// function and hardware characteristics.
140 std::pair<unsigned, unsigned>
141 getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const {
142 return getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes: getFlatWorkGroupSizes(F));
143 }
144
145 /// Overload which uses the specified values for the flat work group sizes,
146 /// rather than querying the function itself. \p FlatWorkGroupSizes should
147 /// correspond to the function's value for getFlatWorkGroupSizes.
148 std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
149 uint32_t LDSBytes,
150 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
151
152 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
153 /// be achieved when the only function running on a CU is \p MF. This notably
154 /// depends on the range of allowed flat group sizes for the function, the
155 /// amount of per-workgroup LDS space required by the function, and hardware
156 /// characteristics.
157 std::pair<unsigned, unsigned>
158 getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;
159
160 bool isAmdHsaOS() const {
161 return TargetTriple.getOS() == Triple::AMDHSA;
162 }
163
164 bool isAmdPalOS() const {
165 return TargetTriple.getOS() == Triple::AMDPAL;
166 }
167
168 bool isMesa3DOS() const {
169 return TargetTriple.getOS() == Triple::Mesa3D;
170 }
171
172 bool isMesaKernel(const Function &F) const;
173
174 bool isAmdHsaOrMesa(const Function &F) const {
175 return isAmdHsaOS() || isMesaKernel(F);
176 }
177
178 bool isGCN() const { return TargetTriple.isAMDGCN(); }
179
180 //==---------------------------------------------------------------------===//
181 // TableGen-generated feature getters.
182 //==---------------------------------------------------------------------===//
183#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
184 virtual bool GETTER() const { return false; }
185#include "AMDGPUGenSubtargetInfo.inc"
186 //==---------------------------------------------------------------------===//
187
188 /// Return true if real (non-fake) variants of True16 instructions using
189 /// 16-bit registers should be code-generated. Fake True16 instructions are
190 /// identical to non-fake ones except that they take 32-bit registers as
191 /// operands and always use their low halves.
192 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
193 // supported and the support for fake True16 instructions is removed.
194 bool useRealTrue16Insts() const {
195 return hasTrue16BitInsts() && enableRealTrue16Insts();
196 }
197
198 bool hasMulI24() const {
199 return HasMulI24;
200 }
201
202 bool hasMulU24() const {
203 return HasMulU24;
204 }
205
206 bool hasSMulHi() const {
207 return HasSMulHi;
208 }
209
210 bool hasFminFmaxLegacy() const {
211 return HasFminFmaxLegacy;
212 }
213
214 unsigned getWavefrontSize() const {
215 return 1 << WavefrontSizeLog2;
216 }
217
218 unsigned getWavefrontSizeLog2() const {
219 return WavefrontSizeLog2;
220 }
221
222 /// Return the maximum number of bytes of LDS available for all workgroups
223 /// running on the same WGP or CU.
224 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
225 /// limited to 64k.
226 unsigned getLocalMemorySize() const {
227 return LocalMemorySize;
228 }
229
230 /// Return the maximum number of bytes of LDS that can be allocated to a
231 /// single workgroup.
232 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
233 /// 128k in total.
234 unsigned getAddressableLocalMemorySize() const {
235 return AddressableLocalMemorySize;
236 }
237
238 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
239 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
240 /// CU mode into account.
241 unsigned getEUsPerCU() const { return EUsPerCU; }
242
243 Align getAlignmentForImplicitArgPtr() const {
244 return isAmdHsaOS() ? Align(8) : Align(4);
245 }
246
247 /// Returns the offset in bytes from the start of the input buffer
248 /// of the first explicit kernel argument.
249 unsigned getExplicitKernelArgOffset() const {
250 switch (TargetTriple.getOS()) {
251 case Triple::AMDHSA:
252 case Triple::AMDPAL:
253 case Triple::Mesa3D:
254 return 0;
255 case Triple::UnknownOS:
256 default:
257 // For legacy reasons unknown/other is treated as a different version of
258 // mesa.
259 return 36;
260 }
261
262 llvm_unreachable("invalid triple OS");
263 }
264
265 /// \returns Maximum number of work groups per compute unit supported by the
266 /// subtarget and limited by given \p FlatWorkGroupSize.
267 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
268
269 /// \returns Minimum flat work group size supported by the subtarget.
270 virtual unsigned getMinFlatWorkGroupSize() const = 0;
271
272 /// \returns Maximum flat work group size supported by the subtarget.
273 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
274
275 /// \returns Number of waves per execution unit required to support the given
276 /// \p FlatWorkGroupSize.
277 virtual unsigned
278 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
279
280 /// \returns Minimum number of waves per execution unit supported by the
281 /// subtarget.
282 virtual unsigned getMinWavesPerEU() const = 0;
283
284 /// \returns Maximum number of waves per execution unit supported by the
285 /// subtarget without any kind of limitation.
286 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
287
288 /// Return the maximum workitem ID value in the function, for the given (0, 1,
289 /// 2) dimension.
290 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
291
292 /// Return the number of work groups for the function.
293 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
294
295 /// Return true if only a single workitem can be active in a wave.
296 bool isSingleLaneExecution(const Function &Kernel) const;
297
298 /// Creates value range metadata on an workitemid.* intrinsic call or load.
299 bool makeLIDRangeMetadata(Instruction *I) const;
300
301 /// \returns Number of bytes of arguments that are passed to a shader or
302 /// kernel in addition to the explicit ones declared for the function.
303 unsigned getImplicitArgNumBytes(const Function &F) const;
304 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
305 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
306
307 /// \returns Corresponding DWARF register number mapping flavour for the
308 /// \p WavefrontSize.
309 AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
310
311 virtual ~AMDGPUSubtarget() = default;
312};
313
314} // end namespace llvm
315
316#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
317