AMDGPUSubtarget.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp]

1	//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Implements the AMDGPU specific subclass of TargetSubtarget.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "AMDGPUSubtarget.h"
15	#include "AMDGPUCallLowering.h"
16	#include "AMDGPUInstructionSelector.h"
17	#include "AMDGPULegalizerInfo.h"
18	#include "AMDGPURegisterBankInfo.h"
19	#include "R600Subtarget.h"
20	#include "SIMachineFunctionInfo.h"
21	#include "Utils/AMDGPUBaseInfo.h"
22	#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
23	#include "llvm/CodeGen/MachineScheduler.h"
24	#include "llvm/CodeGen/TargetFrameLowering.h"
25	#include "llvm/IR/DiagnosticInfo.h"
26	#include "llvm/IR/IntrinsicsAMDGPU.h"
27	#include "llvm/IR/IntrinsicsR600.h"
28	#include "llvm/IR/MDBuilder.h"
29	#include <algorithm>
30
31	using namespace llvm;
32
33	#define DEBUG_TYPE "amdgpu-subtarget"
34
35	// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
36	// allows the given function to achieve an occupancy of NWaves waves per
37	// SIMD / EU, taking into account only the function's maximum* workgroup size.*
38	unsigned
39	AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
40	const Function &F) const {
41	const unsigned WaveSize = getWavefrontSize();
42	const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
43	const unsigned WavesPerWorkgroup =
44	std::max(a: `1u`, b: (WorkGroupSize + WaveSize - `1`) / WaveSize);
45
46	const unsigned WorkGroupsPerCU =
47	std::max(a: `1u`, b: (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
48
49	return getLocalMemorySize() / WorkGroupsPerCU;
50	}
51
52	std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
53	uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
54
55	// FIXME: We should take into account the LDS allocation granularity.
56	const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(a: LDSBytes, b: `1u`);
57
58	// Queried LDS size may be larger than available on a CU, in which case we
59	// consider the only achievable occupancy to be 1, in line with what we
60	// consider the occupancy to be when the number of requested registers in a
61	// particular bank is higher than the number of available ones in that bank.
62	if (!MaxWGsLDS)
63	return {`1`, `1`};
64
65	const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();
66
67	auto PropsFromWGSize = [=](unsigned WGSize)
68	-> std::tuple<const unsigned, const unsigned, unsigned> {
69	unsigned WavesPerWG = divideCeil(Numerator: WGSize, Denominator: WaveSize);
70	unsigned WGsPerCU = std::min(a: getMaxWorkGroupsPerCU(FlatWorkGroupSize: WGSize), b: MaxWGsLDS);
71	return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
72	};
73
74	// The maximum group size will generally yield the minimum number of
75	// workgroups, maximum number of waves, and minimum occupancy. The opposite is
76	// generally true for the minimum group size. LDS or barrier ressource
77	// limitations can flip those minimums/maximums.
78	const auto [MinWGSize, MaxWGSize] = FlatWorkGroupSizes;
79	auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize (MinWGSize);
80	auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize (MaxWGSize);
81
82	// It is possible that we end up with flipped minimum and maximum number of
83	// waves per CU when the number of minimum/maximum concurrent groups on the CU
84	// is limited by LDS usage or barrier resources.
85	if (MinWavesPerCU >= MaxWavesPerCU) {
86	std::swap(a&: MinWavesPerCU, b&: MaxWavesPerCU);
87	} else {
88	const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();
89
90	// Look for a potential smaller group size than the maximum which decreases
91	// the concurrent number of waves on the CU for the same number of
92	// concurrent workgroups on the CU.
93	unsigned MinWavesPerCUForWGSize =
94	divideCeil(Numerator: WaveSlotsPerCU, Denominator: MinWGsPerCU + `1`) * MinWGsPerCU;
95	if (MinWavesPerCU > MinWavesPerCUForWGSize) {
96	unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
97	if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
98	// There may exist a smaller group size than the maximum that achieves
99	// the minimum number of waves per CU. This group size is the largest
100	// possible size that requires MaxWavesPerWG - E waves where E is
101	// maximized under the following constraints.
102	// 1. 0 <= E <= ExcessSlotsPerWG
103	// 2. (MaxWavesPerWG - E) WaveSize >= MinWGSize*
104	MinWavesPerCU -= MinWGsPerCU * std::min(a: ExcessSlotsPerWG,
105	b: MaxWavesPerWG - MinWavesPerWG);
106	}
107	}
108
109	// Look for a potential larger group size than the minimum which increases
110	// the concurrent number of waves on the CU for the same number of
111	// concurrent workgroups on the CU.
112	unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
113	if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
114	// There may exist a larger group size than the minimum that achieves the
115	// maximum number of waves per CU. This group size is the smallest
116	// possible size that requires MinWavesPerWG + L waves where L is
117	// maximized under the following constraints.
118	// 1. 0 <= L <= LeftoverSlotsPerWG
119	// 2. (MinWavesPerWG + L - 1) WaveSize <= MaxWGSize*
120	MaxWavesPerCU += MaxWGsPerCU * std::min(a: LeftoverSlotsPerWG,
121	b: ((MaxWGSize - `1`) / WaveSize) + `1` -
122	MinWavesPerWG);
123	}
124	}
125
126	// Return the minimum/maximum number of waves on any EU, assuming that all
127	// wavefronts are spread across all EUs as evenly as possible.
128	return {std::clamp(val: MinWavesPerCU / getEUsPerCU(), lo: `1U`, hi: WavesPerEU),
129	std::clamp(val: divideCeil(Numerator: MaxWavesPerCU, Denominator: getEUsPerCU()), lo: `1U`, hi: WavesPerEU)};
130	}
131
132	std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
133	const MachineFunction &MF) const {
134	const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
135	return getOccupancyWithWorkGroupSizes(LDSBytes: MFI->getLDSSize(), F: MF.getFunction());
136	}
137
138	std::pair<unsigned, unsigned>
139	AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
140	switch (CC) {
141	case CallingConv::AMDGPU_VS:
142	case CallingConv::AMDGPU_LS:
143	case CallingConv::AMDGPU_HS:
144	case CallingConv::AMDGPU_ES:
145	case CallingConv::AMDGPU_GS:
146	case CallingConv::AMDGPU_PS:
147	return std::pair(`1`, getWavefrontSize());
148	default:
149	return std::pair(`1u`, getMaxFlatWorkGroupSize());
150	}
151	}
152
153	std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
154	const Function &F) const {
155	// Default minimum/maximum flat work group sizes.
156	std::pair<unsigned, unsigned> Default =
157	getDefaultFlatWorkGroupSize(CC: F.getCallingConv());
158
159	// Requested minimum/maximum flat work group sizes.
160	std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
161	F, Name: "amdgpu-flat-work-group-size", Default);
162
163	// Make sure requested minimum is less than requested maximum.
164	if (Requested.first > Requested.second)
165	return Default;
166
167	// Make sure requested values do not violate subtarget's specifications.
168	if (Requested.first < getMinFlatWorkGroupSize())
169	return Default;
170	if (Requested.second > getMaxFlatWorkGroupSize())
171	return Default;
172
173	return Requested;
174	}
175
176	std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
177	std::pair<unsigned, unsigned> RequestedWavesPerEU,
178	std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes) const {
179	// Default minimum/maximum number of waves per EU. The range of flat workgroup
180	// sizes limits the achievable maximum, and we aim to support enough waves per
181	// EU so that we can concurrently execute all waves of a single workgroup of
182	// maximum size on a CU.
183	std::pair<unsigned, unsigned> Default = {
184	getWavesPerEUForWorkGroup(FlatWorkGroupSize: FlatWorkGroupSizes.second),
185	getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
186	Default.first = std::min(a: Default.first, b: Default.second);
187
188	// Make sure requested minimum is within the default range and lower than the
189	// requested maximum. The latter must not violate target specification.
190	if (RequestedWavesPerEU.first < Default.first \|\|
191	RequestedWavesPerEU.first > Default.second \|\|
192	RequestedWavesPerEU.first > RequestedWavesPerEU.second \|\|
193	RequestedWavesPerEU.second > getMaxWavesPerEU())
194	return Default;
195
196	// We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
197	RequestedWavesPerEU.second =
198	std::min(a: RequestedWavesPerEU.second, b: Default.second);
199	return RequestedWavesPerEU;
200	}
201
202	std::pair<unsigned, unsigned>
203	AMDGPUSubtarget::getWavesPerEU(const Function &F) const {
204	// Default/requested minimum/maximum flat work group sizes.
205	std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
206	// Minimum number of bytes allocated in the LDS.
207	unsigned LDSBytes =
208	AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-lds-size", Default: {`0`, UINT32_MAX},
209	/OnlyFirstRequired=/true)
210	.first;
211	return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
212	}
213
214	std::pair<unsigned, unsigned>
215	AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
216	unsigned LDSBytes, const Function &F) const {
217	// Default minimum/maximum number of waves per execution unit.
218	std::pair<unsigned, unsigned> Default(`1`, getMaxWavesPerEU());
219
220	// Requested minimum/maximum number of waves per execution unit.
221	std::pair<unsigned, unsigned> Requested =
222	AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu", Default, OnlyFirstRequired: true);
223	return getEffectiveWavesPerEU(RequestedWavesPerEU: Requested, FlatWorkGroupSizes, LDSBytes);
224	}
225
226	std::optional<unsigned>
227	AMDGPUSubtarget::getReqdWorkGroupSize(const Function &Kernel,
228	unsigned Dim) const {
229	auto *Node = Kernel.getMetadata(Kind: "reqd_work_group_size");
230	if (Node && Node->getNumOperands() == `3`)
231	return mdconst::extract<ConstantInt>(MD: Node->getOperand(I: Dim))->getZExtValue();
232	return std::nullopt;
233	}
234
235	bool AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim(
236	const Function &F, bool RequiresUniformYZ) const {
237	auto *Node = F.getMetadata(Kind: "reqd_work_group_size");
238	if (!Node \|\| Node->getNumOperands() != `3`)
239	return false;
240	unsigned XLen =
241	mdconst::extract<ConstantInt>(MD: Node->getOperand(I: `0`))->getZExtValue();
242	unsigned YLen =
243	mdconst::extract<ConstantInt>(MD: Node->getOperand(I: `1`))->getZExtValue();
244	unsigned ZLen =
245	mdconst::extract<ConstantInt>(MD: Node->getOperand(I: `2`))->getZExtValue();
246
247	bool Is1D = YLen <= `1` && ZLen <= `1`;
248	bool IsXLargeEnough =
249	isPowerOf2_32(Value: XLen) && (!RequiresUniformYZ \|\| XLen >= getWavefrontSize());
250	return Is1D \|\| IsXLargeEnough;
251	}
252
253	bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
254	return isMesa3DOS() && !AMDGPU::isShader(CC: F.getCallingConv());
255	}
256
257	unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
258	unsigned Dimension) const {
259	std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(Kernel, Dim: Dimension);
260	if (ReqdSize)
261	return *ReqdSize - `1`;
262	return getFlatWorkGroupSizes(F: Kernel).second - `1`;
263	}
264
265	bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
266	for (int I = `0`; I < `3`; ++I) {
267	if (getMaxWorkitemID(Kernel: Func, Dimension: I) > `0`)
268	return false;
269	}
270
271	return true;
272	}
273
274	bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction I) const* {
275	Function *Kernel = I->getFunction();
276	unsigned MinSize = `0`;
277	unsigned MaxSize = getFlatWorkGroupSizes(F: *Kernel).second;
278	bool IdQuery = false;
279
280	// If reqd_work_group_size is present it narrows value down.
281	if (auto *CI = dyn_cast<CallInst>(Val: I)) {
282	const Function *F = CI->getCalledFunction();
283	if (F) {
284	unsigned Dim = UINT_MAX;
285	switch (F->getIntrinsicID()) {
286	case Intrinsic::amdgcn_workitem_id_x:
287	case Intrinsic::r600_read_tidig_x:
288	IdQuery = true;
289	[[fallthrough]];
290	case Intrinsic::r600_read_local_size_x:
291	Dim = `0`;
292	break;
293	case Intrinsic::amdgcn_workitem_id_y:
294	case Intrinsic::r600_read_tidig_y:
295	IdQuery = true;
296	[[fallthrough]];
297	case Intrinsic::r600_read_local_size_y:
298	Dim = `1`;
299	break;
300	case Intrinsic::amdgcn_workitem_id_z:
301	case Intrinsic::r600_read_tidig_z:
302	IdQuery = true;
303	[[fallthrough]];
304	case Intrinsic::r600_read_local_size_z:
305	Dim = `2`;
306	break;
307	default:
308	break;
309	}
310
311	if (Dim <= `3`) {
312	std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(Kernel: *Kernel, Dim);
313	if (ReqdSize)
314	MinSize = MaxSize = *ReqdSize;
315	}
316	}
317	}
318
319	if (!MaxSize)
320	return false;
321
322	// Range metadata is [Lo, Hi). For ID query we need to pass max size
323	// as Hi. For size query we need to pass Hi + 1.
324	if (IdQuery)
325	MinSize = `0`;
326	else
327	++MaxSize;
328
329	APInt Lower{`32`, MinSize};
330	APInt Upper{`32`, MaxSize};
331	if (auto *CI = dyn_cast<CallBase>(Val: I)) {
332	ConstantRange Range(Lower, Upper);
333	CI->addRangeRetAttr(CR: Range);
334	} else {
335	MDBuilder MDB(I->getContext());
336	MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lo: Lower, Hi: Upper);
337	I->setMetadata(KindID: LLVMContext::MD_range, Node: MaxWorkGroupSizeRange);
338	}
339	return true;
340	}
341
342	unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
343
344	// We don't allocate the segment if we know the implicit arguments weren't
345	// used, even if the ABI implies we need them.
346	if (F.hasFnAttribute(Kind: "amdgpu-no-implicitarg-ptr"))
347	return `0`;
348
349	if (isMesaKernel(F))
350	return `16`;
351
352	// Assume all implicit inputs are used by default
353	const Module *M = F.getParent();
354	unsigned NBytes =
355	AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5 ? `256` : `56`;
356	return F.getFnAttributeAsParsedInteger(Kind: "amdgpu-implicitarg-num-bytes",
357	Default: NBytes);
358	}
359
360	uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
361	Align &MaxAlign) const {
362	assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL \|\|
363	F.getCallingConv() == CallingConv::SPIR_KERNEL);
364
365	const DataLayout &DL = F.getDataLayout();
366	uint64_t ExplicitArgBytes = `0`;
367	MaxAlign = Align (`1`);
368
369	for (const Argument &Arg : F.args()) {
370	if (Arg.hasAttribute(Kind: "amdgpu-hidden-argument"))
371	continue;
372
373	const bool IsByRef = Arg.hasByRefAttr();
374	Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
375	Align Alignment = DL.getValueOrABITypeAlignment(
376	Alignment: IsByRef ? Arg.getParamAlign() : std::nullopt, Ty: ArgTy);
377	uint64_t AllocSize = DL.getTypeAllocSize(Ty: ArgTy);
378	ExplicitArgBytes = alignTo(Size: ExplicitArgBytes, A: Alignment) + AllocSize;
379	MaxAlign = std::max(a: MaxAlign, b: Alignment);
380	}
381
382	return ExplicitArgBytes;
383	}
384
385	unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
386	Align &MaxAlign) const {
387	if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
388	F.getCallingConv() != CallingConv::SPIR_KERNEL)
389	return `0`;
390
391	uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
392
393	unsigned ExplicitOffset = getExplicitKernelArgOffset();
394
395	uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
396	unsigned ImplicitBytes = getImplicitArgNumBytes(F);
397	if (ImplicitBytes != `0`) {
398	const Align Alignment = getAlignmentForImplicitArgPtr();
399	TotalSize = alignTo(Size: ExplicitArgBytes, A: Alignment) + ImplicitBytes;
400	MaxAlign = std::max(a: MaxAlign, b: Alignment);
401	}
402
403	// Being able to dereference past the end is useful for emitting scalar loads.
404	return alignTo(Value: TotalSize, Align: `4`);
405	}
406
407	AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
408	return getWavefrontSize() == `32` ? AMDGPUDwarfFlavour::Wave32
409	: AMDGPUDwarfFlavour::Wave64;
410	}
411
412	const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
413	if (MF.getTarget().getTargetTriple().isAMDGCN())
414	return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
415	return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
416	}
417
418	const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
419	if (TM.getTargetTriple().isAMDGCN())
420	return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
421	return static_cast<const AMDGPUSubtarget &>(
422	TM.getSubtarget<R600Subtarget>(F));
423	}
424
425	// FIXME: This has no reason to be in subtarget
426	SmallVector<unsigned>
427	AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
428	return AMDGPU::getIntegerVecAttribute(F, Name: "amdgpu-max-num-workgroups", Size: `3`,
429	DefaultVal: std::numeric_limits<uint32_t>::max());
430	}
431

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp