//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
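//
// A worked example under assumed, hypothetical parameters: with a wavefront
// size of 64, a maximum workgroup size of 256, 4 EUs per CU, and NWaves = 8,
// each workgroup needs ceil(256 / 64) = 4 waves, a CU can hold
// (8 * 4) / 4 = 8 such workgroups, and each may therefore use at most
// getLocalMemorySize() / 8 bytes of LDS.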
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
    uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {

  // FIXME: We should take into account the LDS allocation granularity.
  const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);

  // Queried LDS size may be larger than available on a CU, in which case we
  // consider the only achievable occupancy to be 1, in line with what we
  // consider the occupancy to be when the number of requested registers in a
  // particular bank is higher than the number of available ones in that bank.
  if (!MaxWGsLDS)
    return {1, 1};

  const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();

  auto PropsFromWGSize = [=](unsigned WGSize)
      -> std::tuple<const unsigned, const unsigned, unsigned> {
    unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
    unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);
    return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
  };

  // The maximum group size will generally yield the minimum number of
  // workgroups, maximum number of waves, and minimum occupancy. The opposite is
  // generally true for the minimum group size. LDS or barrier resource
  // limitations can flip those minimums/maximums.
  const auto [MinWGSize, MaxWGSize] = FlatWorkGroupSizes;
  auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
  auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);

  // It is possible that we end up with flipped minimum and maximum number of
  // waves per CU when the number of minimum/maximum concurrent groups on the CU
  // is limited by LDS usage or barrier resources.
  if (MinWavesPerCU >= MaxWavesPerCU) {
    std::swap(MinWavesPerCU, MaxWavesPerCU);
  } else {
    const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();

    // Look for a potential smaller group size than the maximum which decreases
    // the concurrent number of waves on the CU for the same number of
    // concurrent workgroups on the CU.
    unsigned MinWavesPerCUForWGSize =
        divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
    if (MinWavesPerCU > MinWavesPerCUForWGSize) {
      unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
      if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
        // There may exist a smaller group size than the maximum that achieves
        // the minimum number of waves per CU. This group size is the largest
        // possible size that requires MaxWavesPerWG - E waves where E is
        // maximized under the following constraints.
        // 1. 0 <= E <= ExcessSlotsPerWG
        // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
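        // E.g., with hypothetical values ExcessSlotsPerWG = 3,
        // MaxWavesPerWG = 8, and MinWavesPerWG = 2: E = min(3, 8 - 2) = 3,
        // so the minimum drops by 3 * MinWGsPerCU waves.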
        MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
                                                MaxWavesPerWG - MinWavesPerWG);
      }
    }

    // Look for a potential larger group size than the minimum which increases
    // the concurrent number of waves on the CU for the same number of
    // concurrent workgroups on the CU.
    unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
    if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
      // There may exist a larger group size than the minimum that achieves the
      // maximum number of waves per CU. This group size is the smallest
      // possible size that requires MinWavesPerWG + L waves where L is
      // maximized under the following constraints.
      // 1. 0 <= L <= LeftoverSlotsPerWG
      // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
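      // E.g., with hypothetical values LeftoverSlotsPerWG = 5,
      // MaxWGSize = 256, WaveSize = 64, and MinWavesPerWG = 2: constraint 2
      // caps L at ((256 - 1) / 64) + 1 - 2 = 2, so L = min(5, 2) = 2.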
      MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
                                              ((MaxWGSize - 1) / WaveSize) + 1 -
                                                  MinWavesPerWG);
    }
  }

  // Return the minimum/maximum number of waves on any EU, assuming that all
  // wavefronts are spread across all EUs as evenly as possible.
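  // E.g., assuming 4 EUs per CU: MinWavesPerCU = 10 spreads to 10 / 4 = 2
  // waves on the least-loaded EU, while MaxWavesPerCU = 10 puts
  // divideCeil(10, 4) = 3 on the busiest one (both clamped to
  // [1, WavesPerEU]).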
  return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),
          std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U,
                     WavesPerEU)};
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
    const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
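  // The attribute value encodes "min,max"; e.g., a function annotated with
  // "amdgpu-flat-work-group-size"="128,256" requests workgroups of between
  // 128 and 256 work items.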
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> RequestedWavesPerEU,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes) const {
  // Default minimum/maximum number of waves per EU. The range of flat workgroup
  // sizes limits the achievable maximum, and we aim to support enough waves per
  // EU so that we can concurrently execute all waves of a single workgroup of
  // maximum size on a CU.
  std::pair<unsigned, unsigned> Default = {
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
      getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
  Default.first = std::min(Default.first, Default.second);

  // Make sure requested minimum is within the default range and lower than the
  // requested maximum. The latter must not violate target specification.
  if (RequestedWavesPerEU.first < Default.first ||
      RequestedWavesPerEU.first > Default.second ||
      RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
      RequestedWavesPerEU.second > getMaxWavesPerEU())
    return Default;

  // We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
  RequestedWavesPerEU.second =
      std::min(RequestedWavesPerEU.second, Default.second);
  return RequestedWavesPerEU;
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getWavesPerEU(const Function &F) const {
  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
  // Minimum number of bytes allocated in the LDS.
  unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
                                                      {0, UINT32_MAX}, true)
                          .first;
  return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Minimum number of bytes allocated in the LDS.
  unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
                                                      {0, UINT32_MAX}, true)
                          .first;
  return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
                               unsigned LDSBytes, const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
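  // The attribute value is "min[,max]"; e.g., "amdgpu-waves-per-eu"="2,4"
  // requests between 2 and 4 waves per EU, while "amdgpu-waves-per-eu"="2"
  // constrains only the minimum.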
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes);
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
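  // The OpenCL-style metadata holds one constant per dimension; e.g., a kernel
  // carrying !reqd_work_group_size !{i32 256, i32 1, i32 1} yields 256 for
  // Dim == 0.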
  auto *Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
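  // E.g., with a required workgroup size of 256, an ID query gets the range
  // [0, 256) while a size query gets [256, 257).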
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    if (Arg.hasAttribute("amdgpu-hidden-argument"))
      continue;

    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
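  // E.g., hypothetically, 36 bytes of explicit arguments with an 8-byte
  // implicit-arg alignment give TotalSize = alignTo(36, 8) + ImplicitBytes =
  // 40 + ImplicitBytes, which is then rounded up to a multiple of 4.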
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().isAMDGCN())
    return static_cast<const AMDGPUSubtarget &>(
        MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(
      MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().isAMDGCN())
    return static_cast<const AMDGPUSubtarget &>(
        TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}

// FIXME: This has no reason to be in subtarget
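// E.g., a kernel annotated with "amdgpu-max-num-workgroups"="16,8,1" bounds
// the dispatch to at most 16x8x1 workgroups; when the attribute is absent,
// each dimension defaults to UINT32_MAX, i.e. effectively unbounded.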
SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3,
                                        std::numeric_limits<uint32_t>::max());
}