//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

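// Returns the statically known callee of a call pseudo, looking through
// pointer casts and aliases, or nullptr if the callee operand is an immediate
// (expected to be zero, as used for indirect call pseudos).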
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

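// Returns true if \p Reg has any use that is not an implicit use on a FLAT
// instruction. Such uses (e.g. inline assembly) mean flat_scratch really is
// needed.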
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

bool AMDGPUResourceUsageAnalysis::runOnMachineFunction(MachineFunction &MF) {
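  // Bail out if no TargetPassConfig is available (e.g. when the analysis is
  // run outside of a normal codegen pipeline); there is nothing useful to
  // compute in that case.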
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
          AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

  ResourceInfo = analyzeResourceUsage(MF, AssumedStackSizeForDynamicSizeObjects,
                                      AssumedStackSizeForExternalCall);

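  // This is an analysis pass; the machine function is never modified.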
  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
    if (ST.hasMAIInsts())
      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  Info.CalleeSegmentSize = 0;

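  // With calls in the function, conservatively scan every operand of every
  // instruction to find the highest register index used, and account for the
  // assumed stack usage of callees at each call site.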
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

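        // Classify the register and compute its width in units of 32-bit
        // registers (e.g. a VReg_128 register covers four consecutive VGPRs).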
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
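        // The highest 32-bit register index this operand touches is its
        // hardware index plus its width minus one; e.g. a 4-wide register
        // starting at index 4 covers indexes 4..7.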
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
          return F == &MF.getFunction();
        };

        if (Callee && !isSameFunction(MF, Callee))
          Info.Callees.push_back(Callee);

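        // Treat calls without a statically known callee, and calls to
        // declarations, as external: assume the external-call stack size and
        // conservative resource flags for them below.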
        bool IsIndirect = !Callee || Callee->isDeclaration();

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            Info.CalleeSegmentSize = std::max(
                Info.CalleeSegmentSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect) {
          Info.CalleeSegmentSize =
              std::max(Info.CalleeSegmentSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        }
      }
    }
  }

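  // Convert the highest used register index of each kind into a count; -1 (no
  // uses seen) becomes zero.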
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;

  return Info;
}