AMDGPUSubtarget.h source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h]

1	//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//==-----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Base class for AMDGPU specific classes of TargetSubtarget.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15	#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
17	#include "llvm/ADT/SmallVector.h"
18	#include "llvm/IR/CallingConv.h"
19	#include "llvm/Support/Alignment.h"
20	#include "llvm/TargetParser/Triple.h"
21
22	namespace llvm {
23
// Forward declarations: this header only names these types in declarations
// (by reference, by pointer, or as a return type), so their full definitions
// are not needed here.
enum AMDGPUDwarfFlavour : unsigned;
class Function;
class Instruction;
class MachineFunction;
class TargetMachine;
29
30	class AMDGPUSubtarget {
31	public:
32	enum Generation {
33	INVALID = `0`,
34	R600 = `1`,
35	R700 = `2`,
36	EVERGREEN = `3`,
37	NORTHERN_ISLANDS = `4`,
38	SOUTHERN_ISLANDS = `5`,
39	SEA_ISLANDS = `6`,
40	VOLCANIC_ISLANDS = `7`,
41	GFX9 = `8`,
42	GFX10 = `9`,
43	GFX11 = `10`,
44	GFX12 = `11`,
45	GFX13 = `12`,
46	};
47
48	private:
49	const Triple &TargetTriple;
50
51	protected:
52	bool HasMulI24 = true;
53	bool HasMulU24 = true;
54	bool HasSMulHi = false;
55	bool HasFminFmaxLegacy = true;
56
57	unsigned EUsPerCU = `4`;
58	unsigned MaxWavesPerEU = `10`;
59	unsigned LocalMemorySize = `0`;
60	unsigned AddressableLocalMemorySize = `0`;
61	char WavefrontSizeLog2 = `0`;
62	unsigned FlatOffsetBitWidth = `0`;
63
64	public:
65	AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}
66
67	static const AMDGPUSubtarget &get(const MachineFunction &MF);
68	static const AMDGPUSubtarget &get(const TargetMachine &TM,
69	const Function &F);
70
71	/// \returns Default range flat work group size for a calling convention.
72	std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
73
74	/// \returns Subtarget's default pair of minimum/maximum flat work group sizes
75	/// for function \p F, or minimum/maximum flat work group sizes explicitly
76	/// requested using "amdgpu-flat-work-group-size" attribute attached to
77	/// function \p F.
78	///
79	/// \returns Subtarget's default values if explicitly requested values cannot
80	/// be converted to integer, or violate subtarget's specifications.
81	std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
82
83	/// \returns The required size of workgroups that will be used to execute \p F
84	/// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
85	/// metadata. Otherwise, returns std::nullopt.
86	std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
87	unsigned Dim) const;
88
89	/// \returns true if \p F will execute in a manner that leaves the X
90	/// dimensions of the workitem ID evenly tiling wavefronts - that is, if X /
91	/// wavefrontsize is uniform. This is true if either the Y and Z block
92	/// dimensions are known to always be 1 or if the X dimension will always be a
93	/// power of 2. If \p RequireUniformYZ is true, it also ensures that the Y and
94	/// Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
95	/// wavesize64 would ordinarily pass this test, it won't with
96	/// \pRequiresUniformYZ).
97	///
98	/// This information is currently only gathered from the !reqd_work_group_size
99	/// metadata on \p F, but this may be improved in the future.
100	bool hasWavefrontsEvenlySplittingXDim(const Function &F,
101	bool REquiresUniformYZ = false) const;
102
103	/// \returns Subtarget's default pair of minimum/maximum number of waves per
104	/// execution unit for function \p F, or minimum/maximum number of waves per
105	/// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
106	/// attached to function \p F.
107	///
108	/// \returns Subtarget's default values if explicitly requested values cannot
109	/// be converted to integer, violate subtarget's specifications, or are not
110	/// compatible with minimum/maximum number of waves limited by flat work group
111	/// size, register usage, and/or lds usage.
112	std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
113
114	/// Overload which uses the specified values for the flat workgroup sizes and
115	/// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
116	/// should correspond to the function's value for getFlatWorkGroupSizes and \p
117	/// LDSBytes to the per-workgroup LDS allocation.
118	std::pair<unsigned, unsigned>
119	getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
120	unsigned LDSBytes, const Function &F) const;
121
122	/// Returns the target minimum/maximum number of waves per EU. This is based
123	/// on the minimum/maximum number of \p RequestedWavesPerEU and further
124	/// limited by the maximum achievable occupancy derived from the range of \p
125	/// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
126	std::pair<unsigned, unsigned>
127	getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
128	std::pair<unsigned, unsigned> FlatWorkGroupSizes,
129	unsigned LDSBytes) const;
130
131	/// Return the amount of LDS that can be used that will not restrict the
132	/// occupancy lower than WaveCount.
133	unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
134	const Function &) const;
135
136	/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
137	/// be achieved when the only function running on a CU is \p F and each
138	/// workgroup running the function requires \p LDSBytes bytes of LDS space.
139	/// This notably depends on the range of allowed flat group sizes for the
140	/// function and hardware characteristics.
141	std::pair<unsigned, unsigned>
142	getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const {
143	return getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes: getFlatWorkGroupSizes(F));
144	}
145
146	/// Overload which uses the specified values for the flat work group sizes,
147	/// rather than querying the function itself. \p FlatWorkGroupSizes should
148	/// correspond to the function's value for getFlatWorkGroupSizes.
149	std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
150	uint32_t LDSBytes,
151	std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
152
153	/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
154	/// be achieved when the only function running on a CU is \p MF. This notably
155	/// depends on the range of allowed flat group sizes for the function, the
156	/// amount of per-workgroup LDS space required by the function, and hardware
157	/// characteristics.
158	std::pair<unsigned, unsigned>
159	getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;
160
161	bool isAmdHsaOS() const {
162	return TargetTriple.getOS() == Triple::AMDHSA;
163	}
164
165	bool isAmdPalOS() const {
166	return TargetTriple.getOS() == Triple::AMDPAL;
167	}
168
169	bool isMesa3DOS() const {
170	return TargetTriple.getOS() == Triple::Mesa3D;
171	}
172
173	bool isMesaKernel(const Function &F) const;
174
175	bool isAmdHsaOrMesa(const Function &F) const {
176	return isAmdHsaOS() \|\| isMesaKernel(F);
177	}
178
179	bool isGCN() const { return TargetTriple.isAMDGCN(); }
180
181	//==---------------------------------------------------------------------===//
182	// TableGen-generated feature getters.
183	//==---------------------------------------------------------------------===//
184	#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
185	virtual bool GETTER() const { return false; }
186	#include "AMDGPUGenSubtargetInfo.inc"
187	//==---------------------------------------------------------------------===//
188
189	/// Return true if real (non-fake) variants of True16 instructions using
190	/// 16-bit registers should be code-generated. Fake True16 instructions are
191	/// identical to non-fake ones except that they take 32-bit registers as
192	/// operands and always use their low halves.
193	// TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
194	// supported and the support for fake True16 instructions is removed.
195	bool useRealTrue16Insts() const {
196	return hasTrue16BitInsts() && enableRealTrue16Insts();
197	}
198
199	bool hasMulI24() const {
200	return HasMulI24;
201	}
202
203	bool hasMulU24() const {
204	return HasMulU24;
205	}
206
207	bool hasSMulHi() const {
208	return HasSMulHi;
209	}
210
211	bool hasFminFmaxLegacy() const {
212	return HasFminFmaxLegacy;
213	}
214
215	unsigned getWavefrontSize() const {
216	return `1` << WavefrontSizeLog2;
217	}
218
219	unsigned getWavefrontSizeLog2() const {
220	return WavefrontSizeLog2;
221	}
222
223	/// Return the maximum number of bytes of LDS available for all workgroups
224	/// running on the same WGP or CU.
225	/// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
226	/// limited to 64k.
227	unsigned getLocalMemorySize() const {
228	return LocalMemorySize;
229	}
230
231	/// Return the maximum number of bytes of LDS that can be allocated to a
232	/// single workgroup.
233	/// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
234	/// 128k in total.
235	unsigned getAddressableLocalMemorySize() const {
236	return AddressableLocalMemorySize;
237	}
238
239	/// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
240	/// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
241	/// CU mode into account.
242	unsigned getEUsPerCU() const { return EUsPerCU; }
243
244	Align getAlignmentForImplicitArgPtr() const {
245	return isAmdHsaOS() ? Align (`8`) : Align (`4`);
246	}
247
248	/// Returns the offset in bytes from the start of the input buffer
249	/// of the first explicit kernel argument.
250	unsigned getExplicitKernelArgOffset() const {
251	switch (TargetTriple.getOS()) {
252	case Triple::AMDHSA:
253	case Triple::AMDPAL:
254	case Triple::Mesa3D:
255	return `0`;
256	case Triple::UnknownOS:
257	default:
258	// For legacy reasons unknown/other is treated as a different version of
259	// mesa.
260	return `36`;
261	}
262
263	llvm_unreachable("invalid triple OS");
264	}
265
266	/// \returns Maximum number of work groups per compute unit supported by the
267	/// subtarget and limited by given \p FlatWorkGroupSize.
268	virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = `0`;
269
270	/// \returns Minimum flat work group size supported by the subtarget.
271	virtual unsigned getMinFlatWorkGroupSize() const = `0`;
272
273	/// \returns Maximum flat work group size supported by the subtarget.
274	virtual unsigned getMaxFlatWorkGroupSize() const = `0`;
275
276	/// \returns Number of waves per execution unit required to support the given
277	/// \p FlatWorkGroupSize.
278	virtual unsigned
279	getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = `0`;
280
281	/// \returns Minimum number of waves per execution unit supported by the
282	/// subtarget.
283	virtual unsigned getMinWavesPerEU() const = `0`;
284
285	/// \returns Maximum number of waves per execution unit supported by the
286	/// subtarget without any kind of limitation.
287	unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
288
289	/// Return the maximum workitem ID value in the function, for the given (0, 1,
290	/// 2) dimension.
291	unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
292
293	/// Return the number of work groups for the function.
294	SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
295
296	/// Return true if only a single workitem can be active in a wave.
297	bool isSingleLaneExecution(const Function &Kernel) const;
298
299	/// Creates value range metadata on an workitemid. intrinsic call or load.*
300	bool makeLIDRangeMetadata(Instruction I) const*;
301
302	/// \returns Number of bytes of arguments that are passed to a shader or
303	/// kernel in addition to the explicit ones declared for the function.
304	unsigned getImplicitArgNumBytes(const Function &F) const;
305	uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
306	unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
307
308	/// \returns Corresponding DWARF register number mapping flavour for the
309	/// \p WavefrontSize.
310	AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
311
312	virtual ~AMDGPUSubtarget() = default;
313	};
314
315	} // end namespace llvm
316
317	#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
318

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h