1 | //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //==-----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Base class for AMDGPU specific classes of TargetSubtarget. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H |
15 | #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H |
16 | |
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/TargetParser/Triple.h"
#include <cstdint>
#include <utility>
21 | |
22 | namespace llvm { |
23 | |
24 | enum AMDGPUDwarfFlavour : unsigned; |
25 | class Function; |
26 | class Instruction; |
27 | class MachineFunction; |
28 | class TargetMachine; |
29 | |
30 | class AMDGPUSubtarget { |
31 | public: |
32 | enum Generation { |
33 | INVALID = 0, |
34 | R600 = 1, |
35 | R700 = 2, |
36 | EVERGREEN = 3, |
37 | NORTHERN_ISLANDS = 4, |
38 | SOUTHERN_ISLANDS = 5, |
39 | SEA_ISLANDS = 6, |
40 | VOLCANIC_ISLANDS = 7, |
41 | GFX9 = 8, |
42 | GFX10 = 9, |
43 | GFX11 = 10, |
44 | GFX12 = 11, |
45 | }; |
46 | |
47 | private: |
48 | Triple TargetTriple; |
49 | |
50 | protected: |
51 | bool GCN3Encoding = false; |
52 | bool Has16BitInsts = false; |
53 | bool HasTrue16BitInsts = false; |
54 | bool HasFP8ConversionScaleInsts = false; |
55 | bool HasBF8ConversionScaleInsts = false; |
56 | bool HasFP4ConversionScaleInsts = false; |
57 | bool HasFP6BF6ConversionScaleInsts = false; |
58 | bool HasF16BF16ToFP6BF6ConversionScaleInsts = false; |
59 | bool HasCvtPkF16F32Inst = false; |
60 | bool HasF32ToF16BF16ConversionSRInsts = false; |
61 | bool EnableRealTrue16Insts = false; |
62 | bool HasBF16ConversionInsts = false; |
63 | bool HasMadMixInsts = false; |
64 | bool HasMadMacF32Insts = false; |
65 | bool HasDsSrc2Insts = false; |
66 | bool HasSDWA = false; |
67 | bool HasVOP3PInsts = false; |
68 | bool HasMulI24 = true; |
69 | bool HasMulU24 = true; |
70 | bool HasSMulHi = false; |
71 | bool HasInv2PiInlineImm = false; |
72 | bool HasFminFmaxLegacy = true; |
73 | bool EnablePromoteAlloca = false; |
74 | bool HasTrigReducedRange = false; |
75 | bool FastFMAF32 = false; |
76 | unsigned EUsPerCU = 4; |
77 | unsigned MaxWavesPerEU = 10; |
78 | unsigned LocalMemorySize = 0; |
79 | unsigned AddressableLocalMemorySize = 0; |
80 | char WavefrontSizeLog2 = 0; |
81 | |
82 | public: |
83 | AMDGPUSubtarget(Triple TT); |
84 | |
85 | static const AMDGPUSubtarget &get(const MachineFunction &MF); |
86 | static const AMDGPUSubtarget &get(const TargetMachine &TM, |
87 | const Function &F); |
88 | |
89 | /// \returns Default range flat work group size for a calling convention. |
90 | std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; |
91 | |
92 | /// \returns Subtarget's default pair of minimum/maximum flat work group sizes |
93 | /// for function \p F, or minimum/maximum flat work group sizes explicitly |
94 | /// requested using "amdgpu-flat-work-group-size" attribute attached to |
95 | /// function \p F. |
96 | /// |
97 | /// \returns Subtarget's default values if explicitly requested values cannot |
98 | /// be converted to integer, or violate subtarget's specifications. |
99 | std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; |
100 | |
101 | /// \returns Subtarget's default pair of minimum/maximum number of waves per |
102 | /// execution unit for function \p F, or minimum/maximum number of waves per |
103 | /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute |
104 | /// attached to function \p F. |
105 | /// |
106 | /// \returns Subtarget's default values if explicitly requested values cannot |
107 | /// be converted to integer, violate subtarget's specifications, or are not |
108 | /// compatible with minimum/maximum number of waves limited by flat work group |
109 | /// size, register usage, and/or lds usage. |
110 | std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; |
111 | |
112 | /// Overload which uses the specified values for the flat work group sizes, |
113 | /// rather than querying the function itself. \p FlatWorkGroupSizes Should |
114 | /// correspond to the function's value for getFlatWorkGroupSizes. |
115 | std::pair<unsigned, unsigned> |
116 | getWavesPerEU(const Function &F, |
117 | std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; |
118 | |
119 | /// Overload which uses the specified values for the flat workgroup sizes and |
120 | /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes |
121 | /// should correspond to the function's value for getFlatWorkGroupSizes and \p |
122 | /// LDSBytes to the per-workgroup LDS allocation. |
123 | std::pair<unsigned, unsigned> |
124 | getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes, |
125 | unsigned LDSBytes, const Function &F) const; |
126 | |
127 | /// Returns the target minimum/maximum number of waves per EU. This is based |
128 | /// on the minimum/maximum number of \p RequestedWavesPerEU and further |
129 | /// limited by the maximum achievable occupancy derived from the range of \p |
130 | /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. |
131 | std::pair<unsigned, unsigned> |
132 | getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU, |
133 | std::pair<unsigned, unsigned> FlatWorkGroupSizes, |
134 | unsigned LDSBytes) const; |
135 | |
136 | /// Return the amount of LDS that can be used that will not restrict the |
137 | /// occupancy lower than WaveCount. |
138 | unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, |
139 | const Function &) const; |
140 | |
141 | /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can |
142 | /// be achieved when the only function running on a CU is \p F and each |
143 | /// workgroup running the function requires \p LDSBytes bytes of LDS space. |
144 | /// This notably depends on the range of allowed flat group sizes for the |
145 | /// function and hardware characteristics. |
146 | std::pair<unsigned, unsigned> |
147 | getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const { |
148 | return getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes: getFlatWorkGroupSizes(F)); |
149 | } |
150 | |
151 | /// Overload which uses the specified values for the flat work group sizes, |
152 | /// rather than querying the function itself. \p FlatWorkGroupSizes should |
153 | /// correspond to the function's value for getFlatWorkGroupSizes. |
154 | std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes( |
155 | uint32_t LDSBytes, |
156 | std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; |
157 | |
158 | /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can |
159 | /// be achieved when the only function running on a CU is \p MF. This notably |
160 | /// depends on the range of allowed flat group sizes for the function, the |
161 | /// amount of per-workgroup LDS space required by the function, and hardware |
162 | /// characteristics. |
163 | std::pair<unsigned, unsigned> |
164 | getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const; |
165 | |
166 | bool isAmdHsaOS() const { |
167 | return TargetTriple.getOS() == Triple::AMDHSA; |
168 | } |
169 | |
170 | bool isAmdPalOS() const { |
171 | return TargetTriple.getOS() == Triple::AMDPAL; |
172 | } |
173 | |
174 | bool isMesa3DOS() const { |
175 | return TargetTriple.getOS() == Triple::Mesa3D; |
176 | } |
177 | |
178 | bool isMesaKernel(const Function &F) const; |
179 | |
180 | bool isAmdHsaOrMesa(const Function &F) const { |
181 | return isAmdHsaOS() || isMesaKernel(F); |
182 | } |
183 | |
184 | bool isGCN() const { return TargetTriple.isAMDGCN(); } |
185 | |
186 | bool isGCN3Encoding() const { |
187 | return GCN3Encoding; |
188 | } |
189 | |
190 | bool has16BitInsts() const { |
191 | return Has16BitInsts; |
192 | } |
193 | |
194 | /// Return true if the subtarget supports True16 instructions. |
195 | bool hasTrue16BitInsts() const { return HasTrue16BitInsts; } |
196 | |
197 | /// Return true if real (non-fake) variants of True16 instructions using |
198 | /// 16-bit registers should be code-generated. Fake True16 instructions are |
199 | /// identical to non-fake ones except that they take 32-bit registers as |
200 | /// operands and always use their low halves. |
201 | // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully |
202 | // supported and the support for fake True16 instructions is removed. |
203 | bool useRealTrue16Insts() const; |
204 | |
205 | bool hasBF16ConversionInsts() const { |
206 | return HasBF16ConversionInsts; |
207 | } |
208 | |
209 | bool hasMadMixInsts() const { |
210 | return HasMadMixInsts; |
211 | } |
212 | |
213 | bool hasFP8ConversionScaleInsts() const { return HasFP8ConversionScaleInsts; } |
214 | |
215 | bool hasBF8ConversionScaleInsts() const { return HasBF8ConversionScaleInsts; } |
216 | |
217 | bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; } |
218 | |
219 | bool hasFP6BF6ConversionScaleInsts() const { |
220 | return HasFP6BF6ConversionScaleInsts; |
221 | } |
222 | |
223 | bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { |
224 | return HasF16BF16ToFP6BF6ConversionScaleInsts; |
225 | } |
226 | |
227 | bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; } |
228 | |
229 | bool hasF32ToF16BF16ConversionSRInsts() const { |
230 | return HasF32ToF16BF16ConversionSRInsts; |
231 | } |
232 | |
233 | bool hasMadMacF32Insts() const { |
234 | return HasMadMacF32Insts || !isGCN(); |
235 | } |
236 | |
237 | bool hasDsSrc2Insts() const { |
238 | return HasDsSrc2Insts; |
239 | } |
240 | |
241 | bool hasSDWA() const { |
242 | return HasSDWA; |
243 | } |
244 | |
245 | bool hasVOP3PInsts() const { |
246 | return HasVOP3PInsts; |
247 | } |
248 | |
249 | bool hasMulI24() const { |
250 | return HasMulI24; |
251 | } |
252 | |
253 | bool hasMulU24() const { |
254 | return HasMulU24; |
255 | } |
256 | |
257 | bool hasSMulHi() const { |
258 | return HasSMulHi; |
259 | } |
260 | |
261 | bool hasInv2PiInlineImm() const { |
262 | return HasInv2PiInlineImm; |
263 | } |
264 | |
265 | bool hasFminFmaxLegacy() const { |
266 | return HasFminFmaxLegacy; |
267 | } |
268 | |
269 | bool hasTrigReducedRange() const { |
270 | return HasTrigReducedRange; |
271 | } |
272 | |
273 | bool hasFastFMAF32() const { |
274 | return FastFMAF32; |
275 | } |
276 | |
277 | bool isPromoteAllocaEnabled() const { |
278 | return EnablePromoteAlloca; |
279 | } |
280 | |
281 | unsigned getWavefrontSize() const { |
282 | return 1 << WavefrontSizeLog2; |
283 | } |
284 | |
285 | unsigned getWavefrontSizeLog2() const { |
286 | return WavefrontSizeLog2; |
287 | } |
288 | |
289 | /// Return the maximum number of bytes of LDS available for all workgroups |
290 | /// running on the same WGP or CU. |
291 | /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is |
292 | /// limited to 64k. |
293 | unsigned getLocalMemorySize() const { |
294 | return LocalMemorySize; |
295 | } |
296 | |
297 | /// Return the maximum number of bytes of LDS that can be allocated to a |
298 | /// single workgroup. |
299 | /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has |
300 | /// 128k in total. |
301 | unsigned getAddressableLocalMemorySize() const { |
302 | return AddressableLocalMemorySize; |
303 | } |
304 | |
305 | /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the |
306 | /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs. |
307 | /// CU mode into account. |
308 | unsigned getEUsPerCU() const { return EUsPerCU; } |
309 | |
310 | Align getAlignmentForImplicitArgPtr() const { |
311 | return isAmdHsaOS() ? Align(8) : Align(4); |
312 | } |
313 | |
314 | /// Returns the offset in bytes from the start of the input buffer |
315 | /// of the first explicit kernel argument. |
316 | unsigned getExplicitKernelArgOffset() const { |
317 | switch (TargetTriple.getOS()) { |
318 | case Triple::AMDHSA: |
319 | case Triple::AMDPAL: |
320 | case Triple::Mesa3D: |
321 | return 0; |
322 | case Triple::UnknownOS: |
323 | default: |
324 | // For legacy reasons unknown/other is treated as a different version of |
325 | // mesa. |
326 | return 36; |
327 | } |
328 | |
329 | llvm_unreachable("invalid triple OS" ); |
330 | } |
331 | |
332 | /// \returns Maximum number of work groups per compute unit supported by the |
333 | /// subtarget and limited by given \p FlatWorkGroupSize. |
334 | virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0; |
335 | |
336 | /// \returns Minimum flat work group size supported by the subtarget. |
337 | virtual unsigned getMinFlatWorkGroupSize() const = 0; |
338 | |
339 | /// \returns Maximum flat work group size supported by the subtarget. |
340 | virtual unsigned getMaxFlatWorkGroupSize() const = 0; |
341 | |
342 | /// \returns Number of waves per execution unit required to support the given |
343 | /// \p FlatWorkGroupSize. |
344 | virtual unsigned |
345 | getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0; |
346 | |
347 | /// \returns Minimum number of waves per execution unit supported by the |
348 | /// subtarget. |
349 | virtual unsigned getMinWavesPerEU() const = 0; |
350 | |
351 | /// \returns Maximum number of waves per execution unit supported by the |
352 | /// subtarget without any kind of limitation. |
353 | unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } |
354 | |
355 | /// Return the maximum workitem ID value in the function, for the given (0, 1, |
356 | /// 2) dimension. |
357 | unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const; |
358 | |
359 | /// Return the number of work groups for the function. |
360 | SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const; |
361 | |
362 | /// Return true if only a single workitem can be active in a wave. |
363 | bool isSingleLaneExecution(const Function &Kernel) const; |
364 | |
365 | /// Creates value range metadata on an workitemid.* intrinsic call or load. |
366 | bool makeLIDRangeMetadata(Instruction *I) const; |
367 | |
368 | /// \returns Number of bytes of arguments that are passed to a shader or |
369 | /// kernel in addition to the explicit ones declared for the function. |
370 | unsigned getImplicitArgNumBytes(const Function &F) const; |
371 | uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; |
372 | unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; |
373 | |
374 | /// \returns Corresponding DWARF register number mapping flavour for the |
375 | /// \p WavefrontSize. |
376 | AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const; |
377 | |
378 | virtual ~AMDGPUSubtarget() = default; |
379 | }; |
380 | |
381 | } // end namespace llvm |
382 | |
383 | #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H |
384 | |