//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead
// of time if we don't know the true stack size. Assume a smaller number if
// this is only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

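// Return the statically known callee of a call instruction, or null when the
// callee operand is an immediate 0; a null result is treated as an indirect
// call by the analysis below.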
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

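// Return true if Reg has any use that is not an implicit operand of a FLAT
// instruction, e.g. an explicit use or a use in inline assembly.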
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

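// Walk the call graph in post order so that callee resource info is already
// computed when a caller is analyzed; functions not reached by the traversal
// are picked up in a second pass below.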
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
      AssumedStackSizeForExternalCall = 0;
  }

  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  // It's possible we have unreachable functions in the module which weren't
  // visited by the PO traversal. Make sure we have some resource counts to
  // report.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Skip already visited functions
      continue;

    SIFunctionResourceInfo &Info = CI.first->second;
    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM,
    uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

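  // Track the highest register index seen for each register file. -1 means no
  // register of that kind is used; the final counts below are index + 1.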
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
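        // The operand occupies register indices [HWReg, HWReg + Width - 1];
        // fold its highest index into the running per-file maximum.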
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}