1//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Base class for AMDGPU specific classes of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
17#include "llvm/ADT/SmallVector.h"
18#include "llvm/IR/CallingConv.h"
19#include "llvm/Support/Alignment.h"
20#include "llvm/TargetParser/Triple.h"
21
22namespace llvm {
23
24enum AMDGPUDwarfFlavour : unsigned;
25class Function;
26class Instruction;
27class MachineFunction;
28class TargetMachine;
29
30class AMDGPUSubtarget {
31public:
32 enum Generation {
33 INVALID = 0,
34 R600 = 1,
35 R700 = 2,
36 EVERGREEN = 3,
37 NORTHERN_ISLANDS = 4,
38 SOUTHERN_ISLANDS = 5,
39 SEA_ISLANDS = 6,
40 VOLCANIC_ISLANDS = 7,
41 GFX9 = 8,
42 GFX10 = 9,
43 GFX11 = 10,
44 GFX12 = 11,
45 GFX13 = 12,
46 };
47
48private:
49 const Triple &TargetTriple;
50
51protected:
52 bool HasMulI24 = true;
53 bool HasMulU24 = true;
54 bool HasSMulHi = false;
55 bool HasFminFmaxLegacy = true;
56
57 unsigned EUsPerCU = 4;
58 unsigned MaxWavesPerEU = 10;
59 unsigned LocalMemorySize = 0;
60 unsigned AddressableLocalMemorySize = 0;
61 unsigned LDSAllocationGranularity = 0;
62 char WavefrontSizeLog2 = 0;
63 unsigned FlatOffsetBitWidth = 0;
64
65public:
66 AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}
67
68 static const AMDGPUSubtarget &get(const MachineFunction &MF);
69 static const AMDGPUSubtarget &get(const TargetMachine &TM,
70 const Function &F);
71
72 /// \returns Default range flat work group size for a calling convention.
73 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
74
75 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
76 /// for function \p F, or minimum/maximum flat work group sizes explicitly
77 /// requested using "amdgpu-flat-work-group-size" attribute attached to
78 /// function \p F.
79 ///
80 /// \returns Subtarget's default values if explicitly requested values cannot
81 /// be converted to integer, or violate subtarget's specifications.
82 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
83
84 /// \returns true if the maximum flat work-group size for \p F is at most the
85 /// wavefront size, so a work-group may fit in a single wavefront.
86 bool isSingleWavefrontWorkgroup(const Function &F) const;
87
88 /// \returns The required size of workgroups that will be used to execute \p F
89 /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
90 /// metadata. Otherwise, returns std::nullopt.
91 std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
92 unsigned Dim) const;
93
94 /// \returns true if \p F will execute in a manner that leaves the X
95 /// dimensions of the workitem ID evenly tiling wavefronts - that is, if X /
96 /// wavefrontsize is uniform. This is true if either the Y and Z block
97 /// dimensions are known to always be 1 or if the X dimension will always be a
98 /// power of 2. If \p RequireUniformYZ is true, it also ensures that the Y and
99 /// Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
100 /// wavesize64 would ordinarily pass this test, it won't with
101 /// \pRequiresUniformYZ).
102 ///
103 /// This information is currently only gathered from the !reqd_work_group_size
104 /// metadata on \p F, but this may be improved in the future.
105 bool hasWavefrontsEvenlySplittingXDim(const Function &F,
106 bool REquiresUniformYZ = false) const;
107
108 /// \returns Subtarget's default pair of minimum/maximum number of waves per
109 /// execution unit for function \p F, or minimum/maximum number of waves per
110 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
111 /// attached to function \p F.
112 ///
113 /// \returns Subtarget's default values if explicitly requested values cannot
114 /// be converted to integer, violate subtarget's specifications, or are not
115 /// compatible with minimum/maximum number of waves limited by flat work group
116 /// size, register usage, and/or lds usage.
117 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
118
119 /// Overload which uses the specified values for the flat workgroup sizes and
120 /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
121 /// should correspond to the function's value for getFlatWorkGroupSizes and \p
122 /// LDSBytes to the per-workgroup LDS allocation.
123 std::pair<unsigned, unsigned>
124 getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
125 unsigned LDSBytes, const Function &F) const;
126
127 /// Returns the target minimum/maximum number of waves per EU. This is based
128 /// on the minimum/maximum number of \p RequestedWavesPerEU and further
129 /// limited by the maximum achievable occupancy derived from the range of \p
130 /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
131 std::pair<unsigned, unsigned>
132 getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
133 std::pair<unsigned, unsigned> FlatWorkGroupSizes,
134 unsigned LDSBytes) const;
135
136 /// Return the amount of LDS that can be used that will not restrict the
137 /// occupancy lower than WaveCount.
138 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
139 const Function &) const;
140
141 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
142 /// be achieved when the only function running on a CU is \p F and each
143 /// workgroup running the function requires \p LDSBytes bytes of LDS space.
144 /// This notably depends on the range of allowed flat group sizes for the
145 /// function and hardware characteristics.
146 std::pair<unsigned, unsigned>
147 getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const {
148 return getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes: getFlatWorkGroupSizes(F));
149 }
150
151 /// Overload which uses the specified values for the flat work group sizes,
152 /// rather than querying the function itself. \p FlatWorkGroupSizes should
153 /// correspond to the function's value for getFlatWorkGroupSizes.
154 std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
155 uint32_t LDSBytes,
156 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
157
158 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
159 /// be achieved when the only function running on a CU is \p MF. This notably
160 /// depends on the range of allowed flat group sizes for the function, the
161 /// amount of per-workgroup LDS space required by the function, and hardware
162 /// characteristics.
163 std::pair<unsigned, unsigned>
164 getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;
165
166 bool isAmdHsaOS() const {
167 return TargetTriple.getOS() == Triple::AMDHSA;
168 }
169
170 bool isAmdPalOS() const {
171 return TargetTriple.getOS() == Triple::AMDPAL;
172 }
173
174 bool isMesa3DOS() const {
175 return TargetTriple.getOS() == Triple::Mesa3D;
176 }
177
178 bool isMesaKernel(const Function &F) const;
179
180 bool isAmdHsaOrMesa(const Function &F) const {
181 return isAmdHsaOS() || isMesaKernel(F);
182 }
183
184 bool isGCN() const { return TargetTriple.isAMDGCN(); }
185
186 //==---------------------------------------------------------------------===//
187 // TableGen-generated feature getters.
188 //==---------------------------------------------------------------------===//
189#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
190 virtual bool GETTER() const { return false; }
191#include "AMDGPUGenSubtargetInfo.inc"
192 //==---------------------------------------------------------------------===//
193
194 /// Return true if real (non-fake) variants of True16 instructions using
195 /// 16-bit registers should be code-generated. Fake True16 instructions are
196 /// identical to non-fake ones except that they take 32-bit registers as
197 /// operands and always use their low halves.
198 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
199 // supported and the support for fake True16 instructions is removed.
200 bool useRealTrue16Insts() const {
201 return hasTrue16BitInsts() && enableRealTrue16Insts();
202 }
203
204 bool hasMulI24() const {
205 return HasMulI24;
206 }
207
208 bool hasMulU24() const {
209 return HasMulU24;
210 }
211
212 bool hasSMulHi() const {
213 return HasSMulHi;
214 }
215
216 bool hasFminFmaxLegacy() const {
217 return HasFminFmaxLegacy;
218 }
219
220 unsigned getWavefrontSize() const {
221 return 1 << WavefrontSizeLog2;
222 }
223
224 unsigned getWavefrontSizeLog2() const {
225 return WavefrontSizeLog2;
226 }
227
228 /// Return the maximum number of bytes of LDS available for all workgroups
229 /// running on the same WGP or CU.
230 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
231 /// limited to 64k.
232 unsigned getLocalMemorySize() const {
233 return LocalMemorySize;
234 }
235
236 /// Return the maximum number of bytes of LDS that can be allocated to a
237 /// single workgroup.
238 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
239 /// 128k in total.
240 unsigned getAddressableLocalMemorySize() const {
241 return AddressableLocalMemorySize;
242 }
243
244 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
245 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
246 /// CU mode into account.
247 unsigned getEUsPerCU() const { return EUsPerCU; }
248
249 Align getAlignmentForImplicitArgPtr() const {
250 return isAmdHsaOS() ? Align(8) : Align(4);
251 }
252
253 /// Returns the offset in bytes from the start of the input buffer
254 /// of the first explicit kernel argument.
255 unsigned getExplicitKernelArgOffset() const {
256 switch (TargetTriple.getOS()) {
257 case Triple::AMDHSA:
258 case Triple::AMDPAL:
259 case Triple::Mesa3D:
260 return 0;
261 case Triple::UnknownOS:
262 default:
263 // For legacy reasons unknown/other is treated as a different version of
264 // mesa.
265 return 36;
266 }
267
268 llvm_unreachable("invalid triple OS");
269 }
270
271 /// \returns Maximum number of work groups per compute unit supported by the
272 /// subtarget and limited by given \p FlatWorkGroupSize.
273 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
274
275 /// \returns Minimum flat work group size supported by the subtarget.
276 virtual unsigned getMinFlatWorkGroupSize() const = 0;
277
278 /// \returns Maximum flat work group size supported by the subtarget.
279 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
280
281 /// \returns Number of waves per execution unit required to support the given
282 /// \p FlatWorkGroupSize.
283 virtual unsigned
284 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
285
286 /// \returns Minimum number of waves per execution unit supported by the
287 /// subtarget.
288 virtual unsigned getMinWavesPerEU() const = 0;
289
290 /// \returns Maximum number of waves per execution unit supported by the
291 /// subtarget without any kind of limitation.
292 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
293
294 /// Return the maximum workitem ID value in the function, for the given (0, 1,
295 /// 2) dimension.
296 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
297
298 /// Return the number of work groups for the function.
299 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
300
301 /// Return true if only a single workitem can be active in a wave.
302 bool isSingleLaneExecution(const Function &Kernel) const;
303
304 /// Creates value range metadata on an workitemid.* intrinsic call or load.
305 bool makeLIDRangeMetadata(Instruction *I) const;
306
307 /// \returns Number of bytes of arguments that are passed to a shader or
308 /// kernel in addition to the explicit ones declared for the function.
309 unsigned getImplicitArgNumBytes(const Function &F) const;
310 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
311 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
312
313 /// \returns Corresponding DWARF register number mapping flavour for the
314 /// \p WavefrontSize.
315 AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
316
317 virtual ~AMDGPUSubtarget() = default;
318};
319
320} // end namespace llvm
321
322#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
323