//===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// \brief Analyzes how many registers and other resources are used by
11/// functions.
12///
13/// The results of this analysis are used to fill the register usage, flat
14/// usage, etc. into hardware registers.
15///
16//===----------------------------------------------------------------------===//
17
18#include "AMDGPUResourceUsageAnalysis.h"
19#include "AMDGPU.h"
20#include "AMDGPUTargetMachine.h"
21#include "GCNSubtarget.h"
22#include "SIMachineFunctionInfo.h"
23#include "llvm/CodeGen/MachineFrameInfo.h"
24#include "llvm/CodeGen/MachineModuleInfo.h"
25#include "llvm/CodeGen/TargetPassConfig.h"
26#include "llvm/IR/GlobalValue.h"
27#include "llvm/Target/TargetMachine.h"
28
29using namespace llvm;
30using namespace llvm::AMDGPU;
31
32#define DEBUG_TYPE "amdgpu-resource-usage"
33
34char llvm::AMDGPUResourceUsageAnalysisWrapperPass::ID = 0;
35char &llvm::AMDGPUResourceUsageAnalysisID =
36 AMDGPUResourceUsageAnalysisWrapperPass::ID;
37
38// In code object v4 and older, we need to tell the runtime some amount ahead of
39// time if we don't know the true stack size. Assume a smaller number if this is
40// only due to dynamic / non-entry block allocas.
41static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
42 "amdgpu-assume-external-call-stack-size",
43 cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
44 cl::init(Val: 16384));
45
46static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
47 "amdgpu-assume-dynamic-stack-object-size",
48 cl::desc("Assumed extra stack use if there are any "
49 "variable sized objects (in bytes)"),
50 cl::Hidden, cl::init(Val: 4096));
51
52INITIALIZE_PASS(AMDGPUResourceUsageAnalysisWrapperPass, DEBUG_TYPE,
53 "Function register usage analysis", true, true)
54
55static const Function *getCalleeFunction(const MachineOperand &Op) {
56 if (Op.isImm()) {
57 assert(Op.getImm() == 0);
58 return nullptr;
59 }
60 return cast<Function>(Val: Op.getGlobal()->stripPointerCastsAndAliases());
61}
62
63static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
64 const SIInstrInfo &TII, unsigned Reg) {
65 for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
66 if (!UseOp.isImplicit() || !TII.isFLAT(MI: *UseOp.getParent()))
67 return true;
68 }
69
70 return false;
71}
72
73bool AMDGPUResourceUsageAnalysisWrapperPass::runOnMachineFunction(
74 MachineFunction &MF) {
75 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
76 if (!TPC)
77 return false;
78
79 const TargetMachine &TM = TPC->getTM<TargetMachine>();
80 const MCSubtargetInfo &STI = TM.getMCSubtargetInfo();
81
82 // By default, for code object v5 and later, track only the minimum scratch
83 // size
84 uint32_t AssumedStackSizeForDynamicSizeObjects =
85 clAssumedStackSizeForDynamicSizeObjects;
86 uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
87 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
88 AMDGPU::AMDHSA_COV5 ||
89 STI.getTargetTriple().getOS() == Triple::AMDPAL) {
90 if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
91 AssumedStackSizeForDynamicSizeObjects = 0;
92 if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
93 AssumedStackSizeForExternalCall = 0;
94 }
95
96 ResourceInfo = AMDGPUResourceUsageAnalysisImpl().analyzeResourceUsage(
97 MF, AssumedStackSizeForDynamicSizeObjects,
98 AssumedStackSizeForExternalCall);
99
100 return false;
101}
102
103AnalysisKey AMDGPUResourceUsageAnalysis::Key;
104AMDGPUResourceUsageAnalysis::Result
105AMDGPUResourceUsageAnalysis::run(MachineFunction &MF,
106 MachineFunctionAnalysisManager &MFAM) {
107 const MCSubtargetInfo &STI = TM.getMCSubtargetInfo();
108
109 // By default, for code object v5 and later, track only the minimum scratch
110 // size
111 uint32_t AssumedStackSizeForDynamicSizeObjects =
112 clAssumedStackSizeForDynamicSizeObjects;
113 uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
114 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
115 AMDGPU::AMDHSA_COV5 ||
116 STI.getTargetTriple().getOS() == Triple::AMDPAL) {
117 if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
118 AssumedStackSizeForDynamicSizeObjects = 0;
119 if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
120 AssumedStackSizeForExternalCall = 0;
121 }
122
123 return AMDGPUResourceUsageAnalysisImpl().analyzeResourceUsage(
124 MF, AssumedStackSizeForDynamicSizeObjects,
125 AssumedStackSizeForExternalCall);
126}
127
128AMDGPUResourceUsageAnalysisImpl::SIFunctionResourceInfo
129AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
130 const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
131 uint32_t AssumedStackSizeForExternalCall) const {
132 SIFunctionResourceInfo Info;
133
134 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
135 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
136 const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
137 const MachineRegisterInfo &MRI = MF.getRegInfo();
138 const SIInstrInfo *TII = ST.getInstrInfo();
139 const SIRegisterInfo &TRI = TII->getRegisterInfo();
140
141 Info.UsesFlatScratch = MRI.isPhysRegUsed(PhysReg: AMDGPU::FLAT_SCR_LO) ||
142 MRI.isPhysRegUsed(PhysReg: AMDGPU::FLAT_SCR_HI) ||
143 MRI.isLiveIn(Reg: MFI->getPreloadedReg(
144 Value: AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
145
146 Info.NumNamedBarrier = MFI->getNumNamedBarriers();
147
148 // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
149 // instructions aren't used to access the scratch buffer. Inline assembly may
150 // need it though.
151 //
152 // If we only have implicit uses of flat_scr on flat instructions, it is not
153 // really needed.
154 if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
155 (!hasAnyNonFlatUseOfReg(MRI, TII: *TII, Reg: AMDGPU::FLAT_SCR) &&
156 !hasAnyNonFlatUseOfReg(MRI, TII: *TII, Reg: AMDGPU::FLAT_SCR_LO) &&
157 !hasAnyNonFlatUseOfReg(MRI, TII: *TII, Reg: AMDGPU::FLAT_SCR_HI))) {
158 Info.UsesFlatScratch = false;
159 }
160
161 Info.PrivateSegmentSize = FrameInfo.getStackSize();
162
163 // Assume a big number if there are any unknown sized objects.
164 Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
165 if (Info.HasDynamicallySizedStack)
166 Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
167
168 if (MFI->isStackRealigned())
169 Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
170
171 Info.UsesVCC =
172 MRI.isPhysRegUsed(PhysReg: AMDGPU::VCC_LO) || MRI.isPhysRegUsed(PhysReg: AMDGPU::VCC_HI);
173 Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, RC: AMDGPU::SGPR_32RegClass,
174 /*IncludeCalls=*/false);
175 if (ST.hasMAIInsts())
176 Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, RC: AMDGPU::AGPR_32RegClass,
177 /*IncludeCalls=*/false);
178
179 // If there are no calls, MachineRegisterInfo can tell us the used register
180 // count easily.
181 // A tail call isn't considered a call for MachineFrameInfo's purposes.
182 if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
183 Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, RC: AMDGPU::VGPR_32RegClass,
184 /*IncludeCalls=*/false);
185 return Info;
186 }
187
188 int32_t MaxVGPR = -1;
189 Info.CalleeSegmentSize = 0;
190
191 for (const MachineBasicBlock &MBB : MF) {
192 for (const MachineInstr &MI : MBB) {
193 for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
194 const MachineOperand &MO = MI.getOperand(i: I);
195
196 if (!MO.isReg())
197 continue;
198
199 Register Reg = MO.getReg();
200 switch (Reg) {
201 case AMDGPU::NoRegister:
202 assert(MI.isDebugInstr() &&
203 "Instruction uses invalid noreg register");
204 continue;
205
206 case AMDGPU::XNACK_MASK:
207 case AMDGPU::XNACK_MASK_LO:
208 case AMDGPU::XNACK_MASK_HI:
209 llvm_unreachable("xnack_mask registers should not be used");
210
211 case AMDGPU::LDS_DIRECT:
212 llvm_unreachable("lds_direct register should not be used");
213
214 case AMDGPU::TBA:
215 case AMDGPU::TBA_LO:
216 case AMDGPU::TBA_HI:
217 case AMDGPU::TMA:
218 case AMDGPU::TMA_LO:
219 case AMDGPU::TMA_HI:
220 llvm_unreachable("trap handler registers should not be used");
221
222 case AMDGPU::SRC_VCCZ:
223 llvm_unreachable("src_vccz register should not be used");
224
225 case AMDGPU::SRC_EXECZ:
226 llvm_unreachable("src_execz register should not be used");
227
228 case AMDGPU::SRC_SCC:
229 llvm_unreachable("src_scc register should not be used");
230
231 default:
232 break;
233 }
234
235 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);
236 assert((!RC || TRI.isVGPRClass(RC) || TRI.isSGPRClass(RC) ||
237 TRI.isAGPRClass(RC) || AMDGPU::TTMP_32RegClass.contains(Reg) ||
238 AMDGPU::TTMP_64RegClass.contains(Reg) ||
239 AMDGPU::TTMP_128RegClass.contains(Reg) ||
240 AMDGPU::TTMP_256RegClass.contains(Reg) ||
241 AMDGPU::TTMP_512RegClass.contains(Reg)) &&
242 "Unknown register class");
243
244 if (!RC || !TRI.isVGPRClass(RC))
245 continue;
246
247 if (MI.isCall() || MI.isMetaInstruction())
248 continue;
249
250 unsigned Width = divideCeil(Numerator: TRI.getRegSizeInBits(RC: *RC), Denominator: 32);
251 unsigned HWReg = TRI.getHWRegIndex(Reg);
252 int MaxUsed = HWReg + Width - 1;
253 MaxVGPR = std::max(a: MaxUsed, b: MaxVGPR);
254 }
255
256 if (MI.isCall()) {
257 // Pseudo used just to encode the underlying global. Is there a better
258 // way to track this?
259
260 // TODO: Some of the generic call-like pseudos do not encode the callee,
261 // so we overly conservatively treat this as an indirect call.
262 const MachineOperand *CalleeOp =
263 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::callee);
264
265 const Function *Callee =
266 CalleeOp ? getCalleeFunction(Op: *CalleeOp) : nullptr;
267
268 auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
269 return F == &MF.getFunction();
270 };
271
272 if (Callee && !isSameFunction(MF, Callee))
273 Info.Callees.push_back(Elt: Callee);
274
275 bool IsIndirect = !Callee || Callee->isDeclaration();
276 Info.HasIndirectCall |= IsIndirect;
277
278 bool IsChainCall = MI.getOpcode() == AMDGPU::SI_TCRETURN_CHAIN;
279 Info.HasNonChainIndirectCall |= (!IsChainCall && IsIndirect);
280
281 // In object linking mode the linker has the full cross-TU view. It
282 // propagates resource usage across both direct calls to external
283 // declarations and true indirect calls. Skip the compile-time
284 // conservative assumptions so that the locally emitted metadata
285 // describes this function's own usage only.
286 if (AMDGPUTargetMachine::EnableObjectLinking)
287 continue;
288
289 // FIXME: Call site could have norecurse on it
290 if (!Callee || !Callee->doesNotRecurse()) {
291 Info.HasRecursion = true;
292
293 // TODO: If we happen to know there is no stack usage in the
294 // callgraph, we don't need to assume an infinitely growing stack.
295 if (!MI.isReturn()) {
296 // We don't need to assume an unknown stack size for tail calls.
297
298 // FIXME: This only benefits in the case where the kernel does not
299 // directly call the tail called function. If a kernel directly
300 // calls a tail recursive function, we'll assume maximum stack size
301 // based on the regular call instruction.
302 Info.CalleeSegmentSize = std::max(
303 a: Info.CalleeSegmentSize,
304 b: static_cast<uint64_t>(AssumedStackSizeForExternalCall));
305 }
306 }
307
308 if (IsIndirect) {
309 Info.CalleeSegmentSize =
310 std::max(a: Info.CalleeSegmentSize,
311 b: static_cast<uint64_t>(AssumedStackSizeForExternalCall));
312
313 // Register usage of indirect calls gets handled later
314 Info.UsesVCC = true;
315 Info.UsesFlatScratch = ST.hasFlatAddressSpace();
316 Info.HasDynamicallySizedStack = true;
317 }
318 }
319 }
320 }
321
322 Info.NumVGPR = MaxVGPR + 1;
323
324 return Info;
325}
326