AMDGPUSubtarget.h source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h]

1	//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//==-----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Base class for AMDGPU specific classes of TargetSubtarget.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15	#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
17	#include "llvm/ADT/SmallVector.h"
18	#include "llvm/IR/CallingConv.h"
19	#include "llvm/Support/Alignment.h"
20	#include "llvm/TargetParser/Triple.h"
21
22	namespace llvm {
23
// Forward declarations. These types are only named in function declarations
// (parameters and return types) in this header, so their full definitions are
// not required here.
enum AMDGPUDwarfFlavour : unsigned;
class Function;
class Instruction;
class MachineFunction;
class TargetMachine;
29
30	class AMDGPUSubtarget {
31	public:
32	enum Generation {
33	INVALID = `0`,
34	R600 = `1`,
35	R700 = `2`,
36	EVERGREEN = `3`,
37	NORTHERN_ISLANDS = `4`,
38	SOUTHERN_ISLANDS = `5`,
39	SEA_ISLANDS = `6`,
40	VOLCANIC_ISLANDS = `7`,
41	GFX9 = `8`,
42	GFX10 = `9`,
43	GFX11 = `10`,
44	GFX12 = `11`,
45	};
46
47	private:
48	Triple TargetTriple;
49
50	protected:
51	bool GCN3Encoding = false;
52	bool Has16BitInsts = false;
53	bool HasTrue16BitInsts = false;
54	bool HasFP8ConversionScaleInsts = false;
55	bool HasBF8ConversionScaleInsts = false;
56	bool HasFP4ConversionScaleInsts = false;
57	bool HasFP6BF6ConversionScaleInsts = false;
58	bool HasF16BF16ToFP6BF6ConversionScaleInsts = false;
59	bool HasCvtPkF16F32Inst = false;
60	bool HasF32ToF16BF16ConversionSRInsts = false;
61	bool EnableRealTrue16Insts = false;
62	bool HasBF16ConversionInsts = false;
63	bool HasMadMixInsts = false;
64	bool HasMadMacF32Insts = false;
65	bool HasDsSrc2Insts = false;
66	bool HasSDWA = false;
67	bool HasVOP3PInsts = false;
68	bool HasMulI24 = true;
69	bool HasMulU24 = true;
70	bool HasSMulHi = false;
71	bool HasInv2PiInlineImm = false;
72	bool HasFminFmaxLegacy = true;
73	bool EnablePromoteAlloca = false;
74	bool HasTrigReducedRange = false;
75	bool FastFMAF32 = false;
76	unsigned EUsPerCU = `4`;
77	unsigned MaxWavesPerEU = `10`;
78	unsigned LocalMemorySize = `0`;
79	unsigned AddressableLocalMemorySize = `0`;
80	char WavefrontSizeLog2 = `0`;
81
82	public:
83	AMDGPUSubtarget(Triple TT);
84
85	static const AMDGPUSubtarget &get(const MachineFunction &MF);
86	static const AMDGPUSubtarget &get(const TargetMachine &TM,
87	const Function &F);
88
89	/// \returns Default range flat work group size for a calling convention.
90	std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
91
92	/// \returns Subtarget's default pair of minimum/maximum flat work group sizes
93	/// for function \p F, or minimum/maximum flat work group sizes explicitly
94	/// requested using "amdgpu-flat-work-group-size" attribute attached to
95	/// function \p F.
96	///
97	/// \returns Subtarget's default values if explicitly requested values cannot
98	/// be converted to integer, or violate subtarget's specifications.
99	std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
100
101	/// \returns Subtarget's default pair of minimum/maximum number of waves per
102	/// execution unit for function \p F, or minimum/maximum number of waves per
103	/// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
104	/// attached to function \p F.
105	///
106	/// \returns Subtarget's default values if explicitly requested values cannot
107	/// be converted to integer, violate subtarget's specifications, or are not
108	/// compatible with minimum/maximum number of waves limited by flat work group
109	/// size, register usage, and/or lds usage.
110	std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
111
112	/// Overload which uses the specified values for the flat work group sizes,
113	/// rather than querying the function itself. \p FlatWorkGroupSizes Should
114	/// correspond to the function's value for getFlatWorkGroupSizes.
115	std::pair<unsigned, unsigned>
116	getWavesPerEU(const Function &F,
117	std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
118
119	/// Overload which uses the specified values for the flat workgroup sizes and
120	/// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
121	/// should correspond to the function's value for getFlatWorkGroupSizes and \p
122	/// LDSBytes to the per-workgroup LDS allocation.
123	std::pair<unsigned, unsigned>
124	getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
125	unsigned LDSBytes, const Function &F) const;
126
127	/// Returns the target minimum/maximum number of waves per EU. This is based
128	/// on the minimum/maximum number of \p RequestedWavesPerEU and further
129	/// limited by the maximum achievable occupancy derived from the range of \p
130	/// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
131	std::pair<unsigned, unsigned>
132	getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
133	std::pair<unsigned, unsigned> FlatWorkGroupSizes,
134	unsigned LDSBytes) const;
135
136	/// Return the amount of LDS that can be used that will not restrict the
137	/// occupancy lower than WaveCount.
138	unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
139	const Function &) const;
140
141	/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
142	/// be achieved when the only function running on a CU is \p F and each
143	/// workgroup running the function requires \p LDSBytes bytes of LDS space.
144	/// This notably depends on the range of allowed flat group sizes for the
145	/// function and hardware characteristics.
146	std::pair<unsigned, unsigned>
147	getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const {
148	return getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes: getFlatWorkGroupSizes(F));
149	}
150
151	/// Overload which uses the specified values for the flat work group sizes,
152	/// rather than querying the function itself. \p FlatWorkGroupSizes should
153	/// correspond to the function's value for getFlatWorkGroupSizes.
154	std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
155	uint32_t LDSBytes,
156	std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
157
158	/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
159	/// be achieved when the only function running on a CU is \p MF. This notably
160	/// depends on the range of allowed flat group sizes for the function, the
161	/// amount of per-workgroup LDS space required by the function, and hardware
162	/// characteristics.
163	std::pair<unsigned, unsigned>
164	getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;
165
166	bool isAmdHsaOS() const {
167	return TargetTriple.getOS() == Triple::AMDHSA;
168	}
169
170	bool isAmdPalOS() const {
171	return TargetTriple.getOS() == Triple::AMDPAL;
172	}
173
174	bool isMesa3DOS() const {
175	return TargetTriple.getOS() == Triple::Mesa3D;
176	}
177
178	bool isMesaKernel(const Function &F) const;
179
180	bool isAmdHsaOrMesa(const Function &F) const {
181	return isAmdHsaOS() \|\| isMesaKernel(F);
182	}
183
184	bool isGCN() const { return TargetTriple.isAMDGCN(); }
185
186	bool isGCN3Encoding() const {
187	return GCN3Encoding;
188	}
189
190	bool has16BitInsts() const {
191	return Has16BitInsts;
192	}
193
194	/// Return true if the subtarget supports True16 instructions.
195	bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
196
197	/// Return true if real (non-fake) variants of True16 instructions using
198	/// 16-bit registers should be code-generated. Fake True16 instructions are
199	/// identical to non-fake ones except that they take 32-bit registers as
200	/// operands and always use their low halves.
201	// TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
202	// supported and the support for fake True16 instructions is removed.
203	bool useRealTrue16Insts() const;
204
205	bool hasBF16ConversionInsts() const {
206	return HasBF16ConversionInsts;
207	}
208
209	bool hasMadMixInsts() const {
210	return HasMadMixInsts;
211	}
212
213	bool hasFP8ConversionScaleInsts() const { return HasFP8ConversionScaleInsts; }
214
215	bool hasBF8ConversionScaleInsts() const { return HasBF8ConversionScaleInsts; }
216
217	bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; }
218
219	bool hasFP6BF6ConversionScaleInsts() const {
220	return HasFP6BF6ConversionScaleInsts;
221	}
222
223	bool hasF16BF16ToFP6BF6ConversionScaleInsts() const {
224	return HasF16BF16ToFP6BF6ConversionScaleInsts;
225	}
226
227	bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; }
228
229	bool hasF32ToF16BF16ConversionSRInsts() const {
230	return HasF32ToF16BF16ConversionSRInsts;
231	}
232
233	bool hasMadMacF32Insts() const {
234	return HasMadMacF32Insts \|\| !isGCN();
235	}
236
237	bool hasDsSrc2Insts() const {
238	return HasDsSrc2Insts;
239	}
240
241	bool hasSDWA() const {
242	return HasSDWA;
243	}
244
245	bool hasVOP3PInsts() const {
246	return HasVOP3PInsts;
247	}
248
249	bool hasMulI24() const {
250	return HasMulI24;
251	}
252
253	bool hasMulU24() const {
254	return HasMulU24;
255	}
256
257	bool hasSMulHi() const {
258	return HasSMulHi;
259	}
260
261	bool hasInv2PiInlineImm() const {
262	return HasInv2PiInlineImm;
263	}
264
265	bool hasFminFmaxLegacy() const {
266	return HasFminFmaxLegacy;
267	}
268
269	bool hasTrigReducedRange() const {
270	return HasTrigReducedRange;
271	}
272
273	bool hasFastFMAF32() const {
274	return FastFMAF32;
275	}
276
277	bool isPromoteAllocaEnabled() const {
278	return EnablePromoteAlloca;
279	}
280
281	unsigned getWavefrontSize() const {
282	return `1` << WavefrontSizeLog2;
283	}
284
285	unsigned getWavefrontSizeLog2() const {
286	return WavefrontSizeLog2;
287	}
288
289	/// Return the maximum number of bytes of LDS available for all workgroups
290	/// running on the same WGP or CU.
291	/// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
292	/// limited to 64k.
293	unsigned getLocalMemorySize() const {
294	return LocalMemorySize;
295	}
296
297	/// Return the maximum number of bytes of LDS that can be allocated to a
298	/// single workgroup.
299	/// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
300	/// 128k in total.
301	unsigned getAddressableLocalMemorySize() const {
302	return AddressableLocalMemorySize;
303	}
304
305	/// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
306	/// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
307	/// CU mode into account.
308	unsigned getEUsPerCU() const { return EUsPerCU; }
309
310	Align getAlignmentForImplicitArgPtr() const {
311	return isAmdHsaOS() ? Align (`8`) : Align (`4`);
312	}
313
314	/// Returns the offset in bytes from the start of the input buffer
315	/// of the first explicit kernel argument.
316	unsigned getExplicitKernelArgOffset() const {
317	switch (TargetTriple.getOS()) {
318	case Triple::AMDHSA:
319	case Triple::AMDPAL:
320	case Triple::Mesa3D:
321	return `0`;
322	case Triple::UnknownOS:
323	default:
324	// For legacy reasons unknown/other is treated as a different version of
325	// mesa.
326	return `36`;
327	}
328
329	llvm_unreachable("invalid triple OS");
330	}
331
332	/// \returns Maximum number of work groups per compute unit supported by the
333	/// subtarget and limited by given \p FlatWorkGroupSize.
334	virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = `0`;
335
336	/// \returns Minimum flat work group size supported by the subtarget.
337	virtual unsigned getMinFlatWorkGroupSize() const = `0`;
338
339	/// \returns Maximum flat work group size supported by the subtarget.
340	virtual unsigned getMaxFlatWorkGroupSize() const = `0`;
341
342	/// \returns Number of waves per execution unit required to support the given
343	/// \p FlatWorkGroupSize.
344	virtual unsigned
345	getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = `0`;
346
347	/// \returns Minimum number of waves per execution unit supported by the
348	/// subtarget.
349	virtual unsigned getMinWavesPerEU() const = `0`;
350
351	/// \returns Maximum number of waves per execution unit supported by the
352	/// subtarget without any kind of limitation.
353	unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
354
355	/// Return the maximum workitem ID value in the function, for the given (0, 1,
356	/// 2) dimension.
357	unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
358
359	/// Return the number of work groups for the function.
360	SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
361
362	/// Return true if only a single workitem can be active in a wave.
363	bool isSingleLaneExecution(const Function &Kernel) const;
364
365	/// Creates value range metadata on an workitemid. intrinsic call or load.*
366	bool makeLIDRangeMetadata(Instruction I) const*;
367
368	/// \returns Number of bytes of arguments that are passed to a shader or
369	/// kernel in addition to the explicit ones declared for the function.
370	unsigned getImplicitArgNumBytes(const Function &F) const;
371	uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
372	unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
373
374	/// \returns Corresponding DWARF register number mapping flavour for the
375	/// \p WavefrontSize.
376	AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
377
378	virtual ~AMDGPUSubtarget() = default;
379	};
380
381	} // end namespace llvm
382
383	#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
384

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h