AMDGPUResourceUsageAnalysis.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp]

1	//===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources -----------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// \brief Analyzes how many registers and other resources are used by
11	/// functions.
12	///
13	/// The results of this analysis are used to fill the register usage, flat
14	/// usage, etc. into hardware registers.
15	///
16	//===----------------------------------------------------------------------===//
17
18	#include "AMDGPUResourceUsageAnalysis.h"
19	#include "AMDGPU.h"
20	#include "GCNSubtarget.h"
21	#include "SIMachineFunctionInfo.h"
22	#include "llvm/CodeGen/MachineFrameInfo.h"
23	#include "llvm/CodeGen/MachineModuleInfo.h"
24	#include "llvm/CodeGen/TargetPassConfig.h"
25	#include "llvm/IR/GlobalValue.h"
26	#include "llvm/Target/TargetMachine.h"
27
28	using namespace llvm;
29	using namespace llvm::AMDGPU;
30
31	#define DEBUG_TYPE "amdgpu-resource-usage"
32
33	char llvm::AMDGPUResourceUsageAnalysisWrapperPass::ID = `0`;
34	char &llvm::AMDGPUResourceUsageAnalysisID =
35	AMDGPUResourceUsageAnalysisWrapperPass::ID;
36
37	// In code object v4 and older, we need to tell the runtime some amount ahead of
38	// time if we don't know the true stack size. Assume a smaller number if this is
39	// only due to dynamic / non-entry block allocas.
40	static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
41	"amdgpu-assume-external-call-stack-size",
42	cl::desc ("Assumed stack use of any external call (in bytes)"), cl::Hidden,
43	cl::init(Val: `16384`));
44
45	static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
46	"amdgpu-assume-dynamic-stack-object-size",
47	cl::desc ("Assumed extra stack use if there are any "
48	"variable sized objects (in bytes)"),
49	cl::Hidden, cl::init(Val: `4096`));
50
51	INITIALIZE_PASS(AMDGPUResourceUsageAnalysisWrapperPass, DEBUG_TYPE,
52	"Function register usage analysis", true, true)
53
54	static const Function getCalleeFunction(const* MachineOperand &Op) {
55	if (Op.isImm()) {
56	assert(Op.getImm() == `0`);
57	return nullptr;
58	}
59	return cast<Function>(Val: Op.getGlobal()->stripPointerCastsAndAliases());
60	}
61
62	static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
63	const SIInstrInfo &TII, unsigned Reg) {
64	for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
65	if (!UseOp.isImplicit() \|\| !TII.isFLAT(MI: *UseOp.getParent()))
66	return true;
67	}
68
69	return false;
70	}
71
72	bool AMDGPUResourceUsageAnalysisWrapperPass::runOnMachineFunction(
73	MachineFunction &MF) {
74	auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
75	if (!TPC)
76	return false;
77
78	const TargetMachine &TM = TPC->getTM<TargetMachine>();
79	const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
80
81	// By default, for code object v5 and later, track only the minimum scratch
82	// size
83	uint32_t AssumedStackSizeForDynamicSizeObjects =
84	clAssumedStackSizeForDynamicSizeObjects;
85	uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
86	if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
87	AMDGPU::AMDHSA_COV5 \|\|
88	STI.getTargetTriple().getOS() == Triple::AMDPAL) {
89	if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
90	AssumedStackSizeForDynamicSizeObjects = `0`;
91	if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
92	AssumedStackSizeForExternalCall = `0`;
93	}
94
95	ResourceInfo = AMDGPUResourceUsageAnalysisImpl ().analyzeResourceUsage(
96	MF, AssumedStackSizeForDynamicSizeObjects,
97	AssumedStackSizeForExternalCall);
98
99	return false;
100	}
101
102	AnalysisKey AMDGPUResourceUsageAnalysis::Key;
103	AMDGPUResourceUsageAnalysis::Result
104	AMDGPUResourceUsageAnalysis::run(MachineFunction &MF,
105	MachineFunctionAnalysisManager &MFAM) {
106	const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
107
108	// By default, for code object v5 and later, track only the minimum scratch
109	// size
110	uint32_t AssumedStackSizeForDynamicSizeObjects =
111	clAssumedStackSizeForDynamicSizeObjects;
112	uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
113	if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
114	AMDGPU::AMDHSA_COV5 \|\|
115	STI.getTargetTriple().getOS() == Triple::AMDPAL) {
116	if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
117	AssumedStackSizeForDynamicSizeObjects = `0`;
118	if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
119	AssumedStackSizeForExternalCall = `0`;
120	}
121
122	return AMDGPUResourceUsageAnalysisImpl ().analyzeResourceUsage(
123	MF, AssumedStackSizeForDynamicSizeObjects,
124	AssumedStackSizeForExternalCall);
125	}
126
127	AMDGPUResourceUsageAnalysisImpl::SIFunctionResourceInfo
128	AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
129	const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
130	uint32_t AssumedStackSizeForExternalCall) const {
131	SIFunctionResourceInfo Info;
132
133	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
134	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
135	const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
136	const MachineRegisterInfo &MRI = MF.getRegInfo();
137	const SIInstrInfo *TII = ST.getInstrInfo();
138	const SIRegisterInfo &TRI = TII->getRegisterInfo();
139
140	Info.UsesFlatScratch = MRI.isPhysRegUsed(PhysReg: AMDGPU::FLAT_SCR_LO) \|\|
141	MRI.isPhysRegUsed(PhysReg: AMDGPU::FLAT_SCR_HI) \|\|
142	MRI.isLiveIn(Reg: MFI->getPreloadedReg(
143	Value: AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
144
145	Info.NumNamedBarrier = MFI->getNumNamedBarriers();
146
147	// Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
148	// instructions aren't used to access the scratch buffer. Inline assembly may
149	// need it though.
150	//
151	// If we only have implicit uses of flat_scr on flat instructions, it is not
152	// really needed.
153	if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
154	(!hasAnyNonFlatUseOfReg(MRI, TII: *TII, Reg: AMDGPU::FLAT_SCR) &&
155	!hasAnyNonFlatUseOfReg(MRI, TII: *TII, Reg: AMDGPU::FLAT_SCR_LO) &&
156	!hasAnyNonFlatUseOfReg(MRI, TII: *TII, Reg: AMDGPU::FLAT_SCR_HI))) {
157	Info.UsesFlatScratch = false;
158	}
159
160	Info.PrivateSegmentSize = FrameInfo.getStackSize();
161
162	// Assume a big number if there are any unknown sized objects.
163	Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
164	if (Info.HasDynamicallySizedStack)
165	Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
166
167	if (MFI->isStackRealigned())
168	Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
169
170	Info.UsesVCC =
171	MRI.isPhysRegUsed(PhysReg: AMDGPU::VCC_LO) \|\| MRI.isPhysRegUsed(PhysReg: AMDGPU::VCC_HI);
172	Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, RC: AMDGPU::SGPR_32RegClass,
173	/IncludeCalls=/false);
174	if (ST.hasMAIInsts())
175	Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, RC: AMDGPU::AGPR_32RegClass,
176	/IncludeCalls=/false);
177
178	// If there are no calls, MachineRegisterInfo can tell us the used register
179	// count easily.
180	// A tail call isn't considered a call for MachineFrameInfo's purposes.
181	if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
182	Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, RC: AMDGPU::VGPR_32RegClass,
183	/IncludeCalls=/false);
184	return Info;
185	}
186
187	int32_t MaxVGPR = -`1`;
188	Info.CalleeSegmentSize = `0`;
189
190	for (const MachineBasicBlock &MBB : MF) {
191	for (const MachineInstr &MI : MBB) {
192	for (unsigned I = `0`; I < MI.getNumOperands(); ++I) {
193	const MachineOperand &MO = MI.getOperand(i: I);
194
195	if (!MO.isReg())
196	continue;
197
198	Register Reg = MO.getReg();
199	switch (Reg) {
200	case AMDGPU::NoRegister:
201	assert(MI.isDebugInstr() &&
202	"Instruction uses invalid noreg register");
203	continue;
204
205	case AMDGPU::XNACK_MASK:
206	case AMDGPU::XNACK_MASK_LO:
207	case AMDGPU::XNACK_MASK_HI:
208	llvm_unreachable("xnack_mask registers should not be used");
209
210	case AMDGPU::LDS_DIRECT:
211	llvm_unreachable("lds_direct register should not be used");
212
213	case AMDGPU::TBA:
214	case AMDGPU::TBA_LO:
215	case AMDGPU::TBA_HI:
216	case AMDGPU::TMA:
217	case AMDGPU::TMA_LO:
218	case AMDGPU::TMA_HI:
219	llvm_unreachable("trap handler registers should not be used");
220
221	case AMDGPU::SRC_VCCZ:
222	llvm_unreachable("src_vccz register should not be used");
223
224	case AMDGPU::SRC_EXECZ:
225	llvm_unreachable("src_execz register should not be used");
226
227	case AMDGPU::SRC_SCC:
228	llvm_unreachable("src_scc register should not be used");
229
230	default:
231	break;
232	}
233
234	const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);
235	assert((!RC \|\| TRI.isVGPRClass(RC) \|\| TRI.isSGPRClass(RC) \|\|
236	TRI.isAGPRClass(RC) \|\| AMDGPU::TTMP_32RegClass.contains(Reg) \|\|
237	AMDGPU::TTMP_64RegClass.contains(Reg) \|\|
238	AMDGPU::TTMP_128RegClass.contains(Reg) \|\|
239	AMDGPU::TTMP_256RegClass.contains(Reg) \|\|
240	AMDGPU::TTMP_512RegClass.contains(Reg)) &&
241	"Unknown register class");
242
243	if (!RC \|\| !TRI.isVGPRClass(RC))
244	continue;
245
246	if (MI.isCall() \|\| MI.isMetaInstruction())
247	continue;
248
249	unsigned Width = divideCeil(Numerator: TRI.getRegSizeInBits(RC: *RC), Denominator: `32`);
250	unsigned HWReg = TRI.getHWRegIndex(Reg);
251	int MaxUsed = HWReg + Width - `1`;
252	MaxVGPR = std::max(a: MaxUsed, b: MaxVGPR);
253	}
254
255	if (MI.isCall()) {
256	// Pseudo used just to encode the underlying global. Is there a better
257	// way to track this?
258
259	// TODO: Some of the generic call-like pseudos do not encode the callee,
260	// so we overly conservatively treat this as an indirect call.
261	const MachineOperand *CalleeOp =
262	TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::callee);
263
264	const Function *Callee =
265	CalleeOp ? getCalleeFunction(Op: CalleeOp) : nullptr*;
266
267	auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
268	return F == &MF.getFunction();
269	};
270
271	if (Callee && !isSameFunction (MF, Callee))
272	Info.Callees.push_back(Elt: Callee);
273
274	bool IsIndirect = !Callee \|\| Callee->isDeclaration();
275
276	// FIXME: Call site could have norecurse on it
277	if (!Callee \|\| !Callee->doesNotRecurse()) {
278	Info.HasRecursion = true;
279
280	// TODO: If we happen to know there is no stack usage in the
281	// callgraph, we don't need to assume an infinitely growing stack.
282	if (!MI.isReturn()) {
283	// We don't need to assume an unknown stack size for tail calls.
284
285	// FIXME: This only benefits in the case where the kernel does not
286	// directly call the tail called function. If a kernel directly
287	// calls a tail recursive function, we'll assume maximum stack size
288	// based on the regular call instruction.
289	Info.CalleeSegmentSize = std::max(
290	a: Info.CalleeSegmentSize,
291	b: static_cast<uint64_t>(AssumedStackSizeForExternalCall));
292	}
293	}
294
295	if (IsIndirect) {
296	Info.CalleeSegmentSize =
297	std::max(a: Info.CalleeSegmentSize,
298	b: static_cast<uint64_t>(AssumedStackSizeForExternalCall));
299
300	// Register usage of indirect calls gets handled later
301	Info.UsesVCC = true;
302	Info.UsesFlatScratch = ST.hasFlatAddressSpace();
303	Info.HasDynamicallySizedStack = true;
304	Info.HasIndirectCall = true;
305	}
306	}
307	}
308	}
309
310	Info.NumVGPR = MaxVGPR + `1`;
311
312	return Info;
313	}
314

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp