//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

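// Returns the statically known callee of a call pseudo, looking through
// pointer casts and aliases, or nullptr if the callee operand is an immediate
// (expected to be zero, as used for indirect call pseudos).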
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

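// Returns true if \p Reg has any use that is not an implicit use on a FLAT
// instruction. Such uses (e.g. inline assembly) mean flat_scratch really is
// needed.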
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

bool AMDGPUResourceUsageAnalysis::runOnMachineFunction(MachineFunction &MF) {
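  // Bail out if no TargetPassConfig is available (e.g. when the analysis is
  // run outside of a normal codegen pipeline); there is nothing useful to
  // compute in that case.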
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
          AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

  ResourceInfo = analyzeResourceUsage(MF, AssumedStackSizeForDynamicSizeObjects,
                                      AssumedStackSizeForExternalCall);

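  // This is an analysis pass; the machine function is never modified.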
  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
    if (ST.hasMAIInsts())
      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  Info.CalleeSegmentSize = 0;

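  // With calls in the function, conservatively scan every operand of every
  // instruction to find the highest register index used, and account for the
  // assumed stack usage of callees at each call site.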
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

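        // Classify the register and compute its width in units of 32-bit
        // registers (e.g. a VReg_128 register covers four consecutive VGPRs).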
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
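        // The highest 32-bit register index this operand touches is its
        // hardware index plus its width minus one; e.g. a 4-wide register
        // starting at index 4 covers indexes 4..7.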
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
          return F == &MF.getFunction();
        };

        if (Callee && !isSameFunction(MF, Callee))
          Info.Callees.push_back(Callee);

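        // Treat calls without a statically known callee, and calls to
        // declarations, as external: assume the external-call stack size and
        // conservative resource flags for them below.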
        bool IsIndirect = !Callee || Callee->isDeclaration();

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            Info.CalleeSegmentSize = std::max(
                Info.CalleeSegmentSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect) {
          Info.CalleeSegmentSize =
              std::max(Info.CalleeSegmentSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        }
      }
    }
  }

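  // Convert the highest used register index of each kind into a count; -1 (no
  // uses seen) becomes zero.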
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;

  return Info;
}