1//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Base class for AMDGPU specific classes of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
17#include "llvm/ADT/SmallVector.h"
18#include "llvm/IR/CallingConv.h"
19#include "llvm/Support/Alignment.h"
20#include "llvm/TargetParser/Triple.h"
21
22namespace llvm {
23
24enum AMDGPUDwarfFlavour : unsigned;
25class Function;
26class Instruction;
27class MachineFunction;
28class TargetMachine;
29
30class AMDGPUSubtarget {
31public:
32 enum Generation {
33 INVALID = 0,
34 R600 = 1,
35 R700 = 2,
36 EVERGREEN = 3,
37 NORTHERN_ISLANDS = 4,
38 SOUTHERN_ISLANDS = 5,
39 SEA_ISLANDS = 6,
40 VOLCANIC_ISLANDS = 7,
41 GFX9 = 8,
42 GFX10 = 9,
43 GFX11 = 10,
44 GFX12 = 11,
45 GFX13 = 12,
46 };
47
48private:
49 const Triple &TargetTriple;
50
51protected:
52 bool HasMulI24 = true;
53 bool HasMulU24 = true;
54 bool HasSMulHi = false;
55 bool HasFminFmaxLegacy = true;
56
57 unsigned EUsPerCU = 4;
58 unsigned MaxWavesPerEU = 10;
59 unsigned LocalMemorySize = 0;
60 unsigned AddressableLocalMemorySize = 0;
61 char WavefrontSizeLog2 = 0;
62 unsigned FlatOffsetBitWidth = 0;
63
64public:
65 AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}
66
67 static const AMDGPUSubtarget &get(const MachineFunction &MF);
68 static const AMDGPUSubtarget &get(const TargetMachine &TM,
69 const Function &F);
70
71 /// \returns Default range flat work group size for a calling convention.
72 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
73
74 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
75 /// for function \p F, or minimum/maximum flat work group sizes explicitly
76 /// requested using "amdgpu-flat-work-group-size" attribute attached to
77 /// function \p F.
78 ///
79 /// \returns Subtarget's default values if explicitly requested values cannot
80 /// be converted to integer, or violate subtarget's specifications.
81 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
82
83 /// \returns The required size of workgroups that will be used to execute \p F
84 /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
85 /// metadata. Otherwise, returns std::nullopt.
86 std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
87 unsigned Dim) const;
88
89 /// \returns true if \p F will execute in a manner that leaves the X
90 /// dimensions of the workitem ID evenly tiling wavefronts - that is, if X /
91 /// wavefrontsize is uniform. This is true if either the Y and Z block
92 /// dimensions are known to always be 1 or if the X dimension will always be a
93 /// power of 2. If \p RequireUniformYZ is true, it also ensures that the Y and
94 /// Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
95 /// wavesize64 would ordinarily pass this test, it won't with
96 /// \pRequiresUniformYZ).
97 ///
98 /// This information is currently only gathered from the !reqd_work_group_size
99 /// metadata on \p F, but this may be improved in the future.
100 bool hasWavefrontsEvenlySplittingXDim(const Function &F,
101 bool REquiresUniformYZ = false) const;
102
103 /// \returns Subtarget's default pair of minimum/maximum number of waves per
104 /// execution unit for function \p F, or minimum/maximum number of waves per
105 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
106 /// attached to function \p F.
107 ///
108 /// \returns Subtarget's default values if explicitly requested values cannot
109 /// be converted to integer, violate subtarget's specifications, or are not
110 /// compatible with minimum/maximum number of waves limited by flat work group
111 /// size, register usage, and/or lds usage.
112 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
113
114 /// Overload which uses the specified values for the flat workgroup sizes and
115 /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
116 /// should correspond to the function's value for getFlatWorkGroupSizes and \p
117 /// LDSBytes to the per-workgroup LDS allocation.
118 std::pair<unsigned, unsigned>
119 getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
120 unsigned LDSBytes, const Function &F) const;
121
122 /// Returns the target minimum/maximum number of waves per EU. This is based
123 /// on the minimum/maximum number of \p RequestedWavesPerEU and further
124 /// limited by the maximum achievable occupancy derived from the range of \p
125 /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
126 std::pair<unsigned, unsigned>
127 getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
128 std::pair<unsigned, unsigned> FlatWorkGroupSizes,
129 unsigned LDSBytes) const;
130
131 /// Return the amount of LDS that can be used that will not restrict the
132 /// occupancy lower than WaveCount.
133 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
134 const Function &) const;
135
136 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
137 /// be achieved when the only function running on a CU is \p F and each
138 /// workgroup running the function requires \p LDSBytes bytes of LDS space.
139 /// This notably depends on the range of allowed flat group sizes for the
140 /// function and hardware characteristics.
141 std::pair<unsigned, unsigned>
142 getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const {
143 return getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes: getFlatWorkGroupSizes(F));
144 }
145
146 /// Overload which uses the specified values for the flat work group sizes,
147 /// rather than querying the function itself. \p FlatWorkGroupSizes should
148 /// correspond to the function's value for getFlatWorkGroupSizes.
149 std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
150 uint32_t LDSBytes,
151 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
152
153 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
154 /// be achieved when the only function running on a CU is \p MF. This notably
155 /// depends on the range of allowed flat group sizes for the function, the
156 /// amount of per-workgroup LDS space required by the function, and hardware
157 /// characteristics.
158 std::pair<unsigned, unsigned>
159 getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;
160
161 bool isAmdHsaOS() const {
162 return TargetTriple.getOS() == Triple::AMDHSA;
163 }
164
165 bool isAmdPalOS() const {
166 return TargetTriple.getOS() == Triple::AMDPAL;
167 }
168
169 bool isMesa3DOS() const {
170 return TargetTriple.getOS() == Triple::Mesa3D;
171 }
172
173 bool isMesaKernel(const Function &F) const;
174
175 bool isAmdHsaOrMesa(const Function &F) const {
176 return isAmdHsaOS() || isMesaKernel(F);
177 }
178
179 bool isGCN() const { return TargetTriple.isAMDGCN(); }
180
181 //==---------------------------------------------------------------------===//
182 // TableGen-generated feature getters.
183 //==---------------------------------------------------------------------===//
184#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
185 virtual bool GETTER() const { return false; }
186#include "AMDGPUGenSubtargetInfo.inc"
187 //==---------------------------------------------------------------------===//
188
189 /// Return true if real (non-fake) variants of True16 instructions using
190 /// 16-bit registers should be code-generated. Fake True16 instructions are
191 /// identical to non-fake ones except that they take 32-bit registers as
192 /// operands and always use their low halves.
193 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
194 // supported and the support for fake True16 instructions is removed.
195 bool useRealTrue16Insts() const {
196 return hasTrue16BitInsts() && enableRealTrue16Insts();
197 }
198
199 bool hasMulI24() const {
200 return HasMulI24;
201 }
202
203 bool hasMulU24() const {
204 return HasMulU24;
205 }
206
207 bool hasSMulHi() const {
208 return HasSMulHi;
209 }
210
211 bool hasFminFmaxLegacy() const {
212 return HasFminFmaxLegacy;
213 }
214
215 unsigned getWavefrontSize() const {
216 return 1 << WavefrontSizeLog2;
217 }
218
219 unsigned getWavefrontSizeLog2() const {
220 return WavefrontSizeLog2;
221 }
222
223 /// Return the maximum number of bytes of LDS available for all workgroups
224 /// running on the same WGP or CU.
225 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
226 /// limited to 64k.
227 unsigned getLocalMemorySize() const {
228 return LocalMemorySize;
229 }
230
231 /// Return the maximum number of bytes of LDS that can be allocated to a
232 /// single workgroup.
233 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
234 /// 128k in total.
235 unsigned getAddressableLocalMemorySize() const {
236 return AddressableLocalMemorySize;
237 }
238
239 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
240 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
241 /// CU mode into account.
242 unsigned getEUsPerCU() const { return EUsPerCU; }
243
244 Align getAlignmentForImplicitArgPtr() const {
245 return isAmdHsaOS() ? Align(8) : Align(4);
246 }
247
248 /// Returns the offset in bytes from the start of the input buffer
249 /// of the first explicit kernel argument.
250 unsigned getExplicitKernelArgOffset() const {
251 switch (TargetTriple.getOS()) {
252 case Triple::AMDHSA:
253 case Triple::AMDPAL:
254 case Triple::Mesa3D:
255 return 0;
256 case Triple::UnknownOS:
257 default:
258 // For legacy reasons unknown/other is treated as a different version of
259 // mesa.
260 return 36;
261 }
262
263 llvm_unreachable("invalid triple OS");
264 }
265
266 /// \returns Maximum number of work groups per compute unit supported by the
267 /// subtarget and limited by given \p FlatWorkGroupSize.
268 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
269
270 /// \returns Minimum flat work group size supported by the subtarget.
271 virtual unsigned getMinFlatWorkGroupSize() const = 0;
272
273 /// \returns Maximum flat work group size supported by the subtarget.
274 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
275
276 /// \returns Number of waves per execution unit required to support the given
277 /// \p FlatWorkGroupSize.
278 virtual unsigned
279 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
280
281 /// \returns Minimum number of waves per execution unit supported by the
282 /// subtarget.
283 virtual unsigned getMinWavesPerEU() const = 0;
284
285 /// \returns Maximum number of waves per execution unit supported by the
286 /// subtarget without any kind of limitation.
287 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
288
289 /// Return the maximum workitem ID value in the function, for the given (0, 1,
290 /// 2) dimension.
291 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
292
293 /// Return the number of work groups for the function.
294 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
295
296 /// Return true if only a single workitem can be active in a wave.
297 bool isSingleLaneExecution(const Function &Kernel) const;
298
299 /// Creates value range metadata on an workitemid.* intrinsic call or load.
300 bool makeLIDRangeMetadata(Instruction *I) const;
301
302 /// \returns Number of bytes of arguments that are passed to a shader or
303 /// kernel in addition to the explicit ones declared for the function.
304 unsigned getImplicitArgNumBytes(const Function &F) const;
305 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
306 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
307
308 /// \returns Corresponding DWARF register number mapping flavour for the
309 /// \p WavefrontSize.
310 AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
311
312 virtual ~AMDGPUSubtarget() = default;
313};
314
315} // end namespace llvm
316
317#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
318