//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead
// of time if we don't know the true stack size. Assume a smaller number if
// this is only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

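// Return the statically known callee of a call instruction, or null when the
// callee operand is an immediate 0; a null result is treated as an indirect
// call by the analysis below.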
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

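// Return true if Reg has any use that is not an implicit operand of a FLAT
// instruction, e.g. an explicit use or a use in inline assembly.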
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

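// Walk the call graph in post order so that callee resource info is already
// computed when a caller is analyzed; functions not reached by the traversal
// are picked up in a second pass below.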
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
      AssumedStackSizeForExternalCall = 0;
  }

  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  // It's possible we have unreachable functions in the module which weren't
  // visited by the PO traversal. Make sure we have some resource counts to
  // report.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Skip already visited functions
      continue;

    SIFunctionResourceInfo &Info = CI.first->second;
    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM,
    uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

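  // Track the highest register index seen for each register file. -1 means no
  // register of that kind is used; the final counts below are index + 1.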
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
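        // The operand occupies register indices [HWReg, HWReg + Width - 1];
        // fold its highest index into the running per-file maximum.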
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}