1 | //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //==-----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Base class for AMDGPU specific classes of TargetSubtarget. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H |
15 | #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H |
16 | |
17 | #include "llvm/IR/CallingConv.h" |
18 | #include "llvm/Support/Alignment.h" |
19 | #include "llvm/TargetParser/Triple.h" |
20 | |
21 | namespace llvm { |
22 | |
23 | enum AMDGPUDwarfFlavour : unsigned; |
24 | class Function; |
25 | class Instruction; |
26 | class MachineFunction; |
27 | class TargetMachine; |
28 | |
29 | class AMDGPUSubtarget { |
30 | public: |
31 | enum Generation { |
32 | INVALID = 0, |
33 | R600 = 1, |
34 | R700 = 2, |
35 | EVERGREEN = 3, |
36 | NORTHERN_ISLANDS = 4, |
37 | SOUTHERN_ISLANDS = 5, |
38 | SEA_ISLANDS = 6, |
39 | VOLCANIC_ISLANDS = 7, |
40 | GFX9 = 8, |
41 | GFX10 = 9, |
42 | GFX11 = 10, |
43 | GFX12 = 11, |
44 | }; |
45 | |
46 | private: |
47 | Triple TargetTriple; |
48 | |
49 | protected: |
50 | bool GCN3Encoding = false; |
51 | bool Has16BitInsts = false; |
52 | bool HasTrue16BitInsts = false; |
53 | bool EnableRealTrue16Insts = false; |
54 | bool HasMadMixInsts = false; |
55 | bool HasMadMacF32Insts = false; |
56 | bool HasDsSrc2Insts = false; |
57 | bool HasSDWA = false; |
58 | bool HasVOP3PInsts = false; |
59 | bool HasMulI24 = true; |
60 | bool HasMulU24 = true; |
61 | bool HasSMulHi = false; |
62 | bool HasInv2PiInlineImm = false; |
63 | bool HasFminFmaxLegacy = true; |
64 | bool EnablePromoteAlloca = false; |
65 | bool HasTrigReducedRange = false; |
66 | bool FastFMAF32 = false; |
67 | unsigned EUsPerCU = 4; |
68 | unsigned MaxWavesPerEU = 10; |
69 | unsigned LocalMemorySize = 0; |
70 | unsigned AddressableLocalMemorySize = 0; |
71 | char WavefrontSizeLog2 = 0; |
72 | |
73 | public: |
74 | AMDGPUSubtarget(Triple TT); |
75 | |
76 | static const AMDGPUSubtarget &get(const MachineFunction &MF); |
77 | static const AMDGPUSubtarget &get(const TargetMachine &TM, |
78 | const Function &F); |
79 | |
80 | /// \returns Default range flat work group size for a calling convention. |
81 | std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; |
82 | |
83 | /// \returns Subtarget's default pair of minimum/maximum flat work group sizes |
84 | /// for function \p F, or minimum/maximum flat work group sizes explicitly |
85 | /// requested using "amdgpu-flat-work-group-size" attribute attached to |
86 | /// function \p F. |
87 | /// |
88 | /// \returns Subtarget's default values if explicitly requested values cannot |
89 | /// be converted to integer, or violate subtarget's specifications. |
90 | std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; |
91 | |
92 | /// \returns Subtarget's default pair of minimum/maximum number of waves per |
93 | /// execution unit for function \p F, or minimum/maximum number of waves per |
94 | /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute |
95 | /// attached to function \p F. |
96 | /// |
97 | /// \returns Subtarget's default values if explicitly requested values cannot |
98 | /// be converted to integer, violate subtarget's specifications, or are not |
99 | /// compatible with minimum/maximum number of waves limited by flat work group |
100 | /// size, register usage, and/or lds usage. |
101 | std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const { |
102 | // Default/requested minimum/maximum flat work group sizes. |
103 | std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); |
104 | return getWavesPerEU(F, FlatWorkGroupSizes); |
105 | } |
106 | |
107 | /// Overload which uses the specified values for the flat work group sizes, |
108 | /// rather than querying the function itself. \p FlatWorkGroupSizes Should |
109 | /// correspond to the function's value for getFlatWorkGroupSizes. |
110 | std::pair<unsigned, unsigned> |
111 | getWavesPerEU(const Function &F, |
112 | std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; |
113 | std::pair<unsigned, unsigned> getEffectiveWavesPerEU( |
114 | std::pair<unsigned, unsigned> WavesPerEU, |
115 | std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; |
116 | |
117 | /// Return the amount of LDS that can be used that will not restrict the |
118 | /// occupancy lower than WaveCount. |
119 | unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, |
120 | const Function &) const; |
121 | |
122 | /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if |
123 | /// the given LDS memory size is the only constraint. |
124 | unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; |
125 | |
126 | unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; |
127 | |
128 | bool isAmdHsaOS() const { |
129 | return TargetTriple.getOS() == Triple::AMDHSA; |
130 | } |
131 | |
132 | bool isAmdPalOS() const { |
133 | return TargetTriple.getOS() == Triple::AMDPAL; |
134 | } |
135 | |
136 | bool isMesa3DOS() const { |
137 | return TargetTriple.getOS() == Triple::Mesa3D; |
138 | } |
139 | |
140 | bool isMesaKernel(const Function &F) const; |
141 | |
142 | bool isAmdHsaOrMesa(const Function &F) const { |
143 | return isAmdHsaOS() || isMesaKernel(F); |
144 | } |
145 | |
146 | bool isGCN() const { |
147 | return TargetTriple.getArch() == Triple::amdgcn; |
148 | } |
149 | |
150 | bool isGCN3Encoding() const { |
151 | return GCN3Encoding; |
152 | } |
153 | |
154 | bool has16BitInsts() const { |
155 | return Has16BitInsts; |
156 | } |
157 | |
158 | /// Return true if the subtarget supports True16 instructions. |
159 | bool hasTrue16BitInsts() const { return HasTrue16BitInsts; } |
160 | |
161 | /// Return true if real (non-fake) variants of True16 instructions using |
162 | /// 16-bit registers should be code-generated. Fake True16 instructions are |
163 | /// identical to non-fake ones except that they take 32-bit registers as |
164 | /// operands and always use their low halves. |
165 | // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully |
166 | // supported and the support for fake True16 instructions is removed. |
167 | bool useRealTrue16Insts() const; |
168 | |
169 | bool hasMadMixInsts() const { |
170 | return HasMadMixInsts; |
171 | } |
172 | |
173 | bool hasMadMacF32Insts() const { |
174 | return HasMadMacF32Insts || !isGCN(); |
175 | } |
176 | |
177 | bool hasDsSrc2Insts() const { |
178 | return HasDsSrc2Insts; |
179 | } |
180 | |
181 | bool hasSDWA() const { |
182 | return HasSDWA; |
183 | } |
184 | |
185 | bool hasVOP3PInsts() const { |
186 | return HasVOP3PInsts; |
187 | } |
188 | |
189 | bool hasMulI24() const { |
190 | return HasMulI24; |
191 | } |
192 | |
193 | bool hasMulU24() const { |
194 | return HasMulU24; |
195 | } |
196 | |
197 | bool hasSMulHi() const { |
198 | return HasSMulHi; |
199 | } |
200 | |
201 | bool hasInv2PiInlineImm() const { |
202 | return HasInv2PiInlineImm; |
203 | } |
204 | |
205 | bool hasFminFmaxLegacy() const { |
206 | return HasFminFmaxLegacy; |
207 | } |
208 | |
209 | bool hasTrigReducedRange() const { |
210 | return HasTrigReducedRange; |
211 | } |
212 | |
213 | bool hasFastFMAF32() const { |
214 | return FastFMAF32; |
215 | } |
216 | |
217 | bool isPromoteAllocaEnabled() const { |
218 | return EnablePromoteAlloca; |
219 | } |
220 | |
221 | unsigned getWavefrontSize() const { |
222 | return 1 << WavefrontSizeLog2; |
223 | } |
224 | |
225 | unsigned getWavefrontSizeLog2() const { |
226 | return WavefrontSizeLog2; |
227 | } |
228 | |
229 | unsigned getLocalMemorySize() const { |
230 | return LocalMemorySize; |
231 | } |
232 | |
233 | unsigned getAddressableLocalMemorySize() const { |
234 | return AddressableLocalMemorySize; |
235 | } |
236 | |
237 | /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the |
238 | /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs. |
239 | /// CU mode into account. |
240 | unsigned getEUsPerCU() const { return EUsPerCU; } |
241 | |
242 | Align getAlignmentForImplicitArgPtr() const { |
243 | return isAmdHsaOS() ? Align(8) : Align(4); |
244 | } |
245 | |
246 | /// Returns the offset in bytes from the start of the input buffer |
247 | /// of the first explicit kernel argument. |
248 | unsigned getExplicitKernelArgOffset() const { |
249 | switch (TargetTriple.getOS()) { |
250 | case Triple::AMDHSA: |
251 | case Triple::AMDPAL: |
252 | case Triple::Mesa3D: |
253 | return 0; |
254 | case Triple::UnknownOS: |
255 | default: |
256 | // For legacy reasons unknown/other is treated as a different version of |
257 | // mesa. |
258 | return 36; |
259 | } |
260 | |
261 | llvm_unreachable("invalid triple OS" ); |
262 | } |
263 | |
264 | /// \returns Maximum number of work groups per compute unit supported by the |
265 | /// subtarget and limited by given \p FlatWorkGroupSize. |
266 | virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0; |
267 | |
268 | /// \returns Minimum flat work group size supported by the subtarget. |
269 | virtual unsigned getMinFlatWorkGroupSize() const = 0; |
270 | |
271 | /// \returns Maximum flat work group size supported by the subtarget. |
272 | virtual unsigned getMaxFlatWorkGroupSize() const = 0; |
273 | |
274 | /// \returns Number of waves per execution unit required to support the given |
275 | /// \p FlatWorkGroupSize. |
276 | virtual unsigned |
277 | getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0; |
278 | |
279 | /// \returns Minimum number of waves per execution unit supported by the |
280 | /// subtarget. |
281 | virtual unsigned getMinWavesPerEU() const = 0; |
282 | |
283 | /// \returns Maximum number of waves per execution unit supported by the |
284 | /// subtarget without any kind of limitation. |
285 | unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } |
286 | |
287 | /// Return the maximum workitem ID value in the function, for the given (0, 1, |
288 | /// 2) dimension. |
289 | unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const; |
290 | |
291 | /// Return the number of work groups for the function. |
292 | SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const; |
293 | |
294 | /// Return true if only a single workitem can be active in a wave. |
295 | bool isSingleLaneExecution(const Function &Kernel) const; |
296 | |
297 | /// Creates value range metadata on an workitemid.* intrinsic call or load. |
298 | bool makeLIDRangeMetadata(Instruction *I) const; |
299 | |
300 | /// \returns Number of bytes of arguments that are passed to a shader or |
301 | /// kernel in addition to the explicit ones declared for the function. |
302 | unsigned getImplicitArgNumBytes(const Function &F) const; |
303 | uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; |
304 | unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; |
305 | |
306 | /// \returns Corresponding DWARF register number mapping flavour for the |
307 | /// \p WavefrontSize. |
308 | AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const; |
309 | |
310 | virtual ~AMDGPUSubtarget() = default; |
311 | }; |
312 | |
313 | } // end namespace llvm |
314 | |
315 | #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H |
316 | |