//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysisWrapperPass::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID =
    AMDGPUResourceUsageAnalysisWrapperPass::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysisWrapperPass, DEBUG_TYPE,
                "Function register usage analysis", true, true)

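// Return the Function referenced by a call pseudo's callee operand. An
// immediate operand (always 0) means the callee is unknown, e.g. for an
// indirect call, and nullptr is returned instead.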
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

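// Return true if \p Reg has any use that is not an implicit operand of a FLAT
// instruction. For flat_scratch this distinguishes a genuine use (e.g. inline
// assembly) from the implicit uses added to FLAT instructions.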
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

bool AMDGPUResourceUsageAnalysisWrapperPass::runOnMachineFunction(
    MachineFunction &MF) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
          AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

  ResourceInfo = AMDGPUResourceUsageAnalysisImpl().analyzeResourceUsage(
      MF, AssumedStackSizeForDynamicSizeObjects,
      AssumedStackSizeForExternalCall);

  return false;
}

AnalysisKey AMDGPUResourceUsageAnalysis::Key;
AMDGPUResourceUsageAnalysis::Result
AMDGPUResourceUsageAnalysis::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
          AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

  return AMDGPUResourceUsageAnalysisImpl().analyzeResourceUsage(
      MF, AssumedStackSizeForDynamicSizeObjects,
      AssumedStackSizeForExternalCall);
}

AMDGPUResourceUsageAnalysisImpl::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
    const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  Info.NumNamedBarrier = MFI->getNumNamedBarriers();

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

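  // If the stack had to be realigned, frame lowering may insert padding of up
  // to the maximum alignment, so account for that in the worst case.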
  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
  Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass,
                                                /*IncludeCalls=*/false);
  if (ST.hasMAIInsts())
    Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass,
                                          /*IncludeCalls=*/false);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
                                          /*IncludeCalls=*/false);
    return Info;
  }

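  // With calls present, scan every instruction to find the highest VGPR index
  // that is actually referenced, and accumulate the stack requirements of the
  // call sites we encounter.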
  int32_t MaxVGPR = -1;
  Info.CalleeSegmentSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
        const MachineOperand &MO = MI.getOperand(I);

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);
        assert((!RC || TRI.isVGPRClass(RC) || TRI.isSGPRClass(RC) ||
                TRI.isAGPRClass(RC) || AMDGPU::TTMP_32RegClass.contains(Reg) ||
                AMDGPU::TTMP_64RegClass.contains(Reg) ||
                AMDGPU::TTMP_128RegClass.contains(Reg) ||
                AMDGPU::TTMP_256RegClass.contains(Reg) ||
                AMDGPU::TTMP_512RegClass.contains(Reg)) &&
               "Unknown register class");

        if (!RC || !TRI.isVGPRClass(RC))
          continue;

        if (MI.isCall() || MI.isMetaInstruction())
          continue;

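        // Width is the operand's register class size in units of 32-bit
        // registers; HWReg is the hardware index of the first of them, so the
        // highest VGPR index this operand touches is HWReg + Width - 1.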
        unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32);
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        MaxVGPR = std::max(MaxUsed, MaxVGPR);
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        // TODO: Some of the generic call-like pseudos do not encode the callee,
        // so we overly conservatively treat this as an indirect call.
        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee =
            CalleeOp ? getCalleeFunction(*CalleeOp) : nullptr;

        auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
          return F == &MF.getFunction();
        };

        if (Callee && !isSameFunction(MF, Callee))
          Info.Callees.push_back(Callee);

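        // A callee that is only a declaration has no visible body, so its
        // resource usage is unknown here; treat it like an indirect call and
        // fall back to the conservative assumptions below.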
        bool IsIndirect = !Callee || Callee->isDeclaration();

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            Info.CalleeSegmentSize = std::max(
                Info.CalleeSegmentSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect) {
          Info.CalleeSegmentSize =
              std::max(Info.CalleeSegmentSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        }
      }
    }
  }

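  // MaxVGPR is the highest VGPR index seen (-1 if none), so the number of
  // VGPRs used is one past it.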
  Info.NumVGPR = MaxVGPR + 1;

  return Info;
}