//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));
INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

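// Resolve the callee of a call pseudo, looking through pointer casts and
// aliases. A zero immediate callee operand means the target is unknown (an
// indirect call), in which case this returns null.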
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

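// Return true if Reg has any use that is not an implicit operand of a FLAT
// instruction. Uses that are only implicit operands of flat instructions do
// not by themselves require flat scratch initialization.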
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

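// The total SGPR count adds the extra SGPRs the hardware reserves for VCC,
// the flat scratch register, and the XNACK mask on top of the explicitly used
// SGPRs.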
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

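// On subtargets with gfx90a instructions, AGPRs and VGPRs are allocated out of
// a unified register file, so the total accounts for both; otherwise it is the
// maximum of the two counts.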
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);

  // By default, for code object v5 and later, track only the minimum scratch
  // size
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
      AssumedStackSizeForExternalCall = 0;
  }

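  // Walk the call graph in post order so callees are visited before their
  // callers; each caller can then fold its callees' already-computed resource
  // usage into its own.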
  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  // It's possible we have unreachable functions in the module which weren't
  // visited by the PO traversal. Make sure we have some resource counts to
  // report.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Skip already visited functions
      continue;

    SIFunctionResourceInfo &Info = CI.first->second;
    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

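// Compute the register and stack usage of a single machine function. Direct
// callees have already been analyzed by the post-order traversal, so their
// cumulative usage can simply be folded in here.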
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM,
    uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

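  // The function makes calls, so scan every operand of every instruction,
  // tracking the highest SGPR, VGPR, and AGPR index touched as well as the
  // largest callee stack frame seen.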
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

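        // Classify the register by its class: IsSGPR/IsAGPR select which
        // counter to update, and Width is the number of 32-bit registers the
        // class covers.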
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

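      // For calls, fold in the callee's precomputed usage, or fall back to
      // conservative assumptions for indirect and external callees.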
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}