//===-- SILowerSGPRSpills.cpp ---------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Handle SGPR spills. This pass takes the place of PrologEpilogInserter for all
// SGPR spills, so must insert CSR SGPR spills as well as expand them.
//
// This pass must never create new SGPR virtual registers.
//
// FIXME: Must stop RegScavenger spills in later passes.
//
//===----------------------------------------------------------------------===//
17 | |
18 | #include "AMDGPU.h" |
19 | #include "GCNSubtarget.h" |
20 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
21 | #include "SIMachineFunctionInfo.h" |
22 | #include "llvm/CodeGen/LiveIntervals.h" |
23 | #include "llvm/CodeGen/MachineFrameInfo.h" |
24 | #include "llvm/CodeGen/RegisterScavenging.h" |
25 | #include "llvm/InitializePasses.h" |
26 | |
27 | using namespace llvm; |
28 | |
29 | #define DEBUG_TYPE "si-lower-sgpr-spills" |
30 | |
31 | using MBBVector = SmallVector<MachineBasicBlock *, 4>; |
32 | |
33 | namespace { |
34 | |
35 | class SILowerSGPRSpills : public MachineFunctionPass { |
36 | private: |
37 | const SIRegisterInfo *TRI = nullptr; |
38 | const SIInstrInfo *TII = nullptr; |
39 | LiveIntervals *LIS = nullptr; |
40 | SlotIndexes *Indexes = nullptr; |
41 | |
42 | // Save and Restore blocks of the current function. Typically there is a |
43 | // single save block, unless Windows EH funclets are involved. |
44 | MBBVector SaveBlocks; |
45 | MBBVector RestoreBlocks; |
46 | |
47 | public: |
48 | static char ID; |
49 | |
50 | SILowerSGPRSpills() : MachineFunctionPass(ID) {} |
51 | |
52 | void calculateSaveRestoreBlocks(MachineFunction &MF); |
53 | bool spillCalleeSavedRegs(MachineFunction &MF, |
54 | SmallVectorImpl<int> &CalleeSavedFIs); |
55 | void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS); |
56 | |
57 | bool runOnMachineFunction(MachineFunction &MF) override; |
58 | |
59 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
60 | AU.setPreservesAll(); |
61 | MachineFunctionPass::getAnalysisUsage(AU); |
62 | } |
63 | |
64 | MachineFunctionProperties getClearedProperties() const override { |
65 | // SILowerSGPRSpills introduces new Virtual VGPRs for spilling SGPRs. |
66 | return MachineFunctionProperties() |
67 | .set(MachineFunctionProperties::Property::IsSSA) |
68 | .set(MachineFunctionProperties::Property::NoVRegs); |
69 | } |
70 | }; |
71 | |
72 | } // end anonymous namespace |
73 | |
74 | char SILowerSGPRSpills::ID = 0; |
75 | |
76 | INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE, |
77 | "SI lower SGPR spill instructions" , false, false) |
78 | INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) |
79 | INITIALIZE_PASS_DEPENDENCY(VirtRegMap) |
80 | INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, |
81 | "SI lower SGPR spill instructions" , false, false) |
82 | |
83 | char &llvm::SILowerSGPRSpillsID = SILowerSGPRSpills::ID; |
84 | |
85 | /// Insert spill code for the callee-saved registers used in the function. |
86 | static void insertCSRSaves(MachineBasicBlock &SaveBlock, |
87 | ArrayRef<CalleeSavedInfo> CSI, SlotIndexes *Indexes, |
88 | LiveIntervals *LIS) { |
89 | MachineFunction &MF = *SaveBlock.getParent(); |
90 | const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); |
91 | const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); |
92 | const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); |
93 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
94 | const SIRegisterInfo *RI = ST.getRegisterInfo(); |
95 | |
96 | MachineBasicBlock::iterator I = SaveBlock.begin(); |
97 | if (!TFI->spillCalleeSavedRegisters(MBB&: SaveBlock, MI: I, CSI, TRI)) { |
98 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
99 | |
100 | for (const CalleeSavedInfo &CS : CSI) { |
101 | // Insert the spill to the stack frame. |
102 | MCRegister Reg = CS.getReg(); |
103 | |
104 | MachineInstrSpan MIS(I, &SaveBlock); |
105 | const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( |
106 | Reg, VT: Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); |
107 | |
108 | // If this value was already livein, we probably have a direct use of the |
109 | // incoming register value, so don't kill at the spill point. This happens |
110 | // since we pass some special inputs (workgroup IDs) in the callee saved |
111 | // range. |
112 | const bool IsLiveIn = MRI.isLiveIn(Reg); |
113 | TII.storeRegToStackSlot(MBB&: SaveBlock, MI: I, SrcReg: Reg, isKill: !IsLiveIn, FrameIndex: CS.getFrameIdx(), |
114 | RC, TRI, VReg: Register()); |
115 | |
116 | if (Indexes) { |
117 | assert(std::distance(MIS.begin(), I) == 1); |
118 | MachineInstr &Inst = *std::prev(x: I); |
119 | Indexes->insertMachineInstrInMaps(MI&: Inst); |
120 | } |
121 | |
122 | if (LIS) |
123 | LIS->removeAllRegUnitsForPhysReg(Reg); |
124 | } |
125 | } |
126 | } |
127 | |
128 | /// Insert restore code for the callee-saved registers used in the function. |
129 | static void insertCSRRestores(MachineBasicBlock &RestoreBlock, |
130 | MutableArrayRef<CalleeSavedInfo> CSI, |
131 | SlotIndexes *Indexes, LiveIntervals *LIS) { |
132 | MachineFunction &MF = *RestoreBlock.getParent(); |
133 | const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); |
134 | const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); |
135 | const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); |
136 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
137 | const SIRegisterInfo *RI = ST.getRegisterInfo(); |
138 | // Restore all registers immediately before the return and any |
139 | // terminators that precede it. |
140 | MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator(); |
141 | |
142 | // FIXME: Just emit the readlane/writelane directly |
143 | if (!TFI->restoreCalleeSavedRegisters(MBB&: RestoreBlock, MI: I, CSI, TRI)) { |
144 | for (const CalleeSavedInfo &CI : reverse(C&: CSI)) { |
145 | Register Reg = CI.getReg(); |
146 | const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( |
147 | Reg, VT: Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); |
148 | |
149 | TII.loadRegFromStackSlot(MBB&: RestoreBlock, MI: I, DestReg: Reg, FrameIndex: CI.getFrameIdx(), RC, TRI, |
150 | VReg: Register()); |
151 | assert(I != RestoreBlock.begin() && |
152 | "loadRegFromStackSlot didn't insert any code!" ); |
153 | // Insert in reverse order. loadRegFromStackSlot can insert |
154 | // multiple instructions. |
155 | |
156 | if (Indexes) { |
157 | MachineInstr &Inst = *std::prev(x: I); |
158 | Indexes->insertMachineInstrInMaps(MI&: Inst); |
159 | } |
160 | |
161 | if (LIS) |
162 | LIS->removeAllRegUnitsForPhysReg(Reg); |
163 | } |
164 | } |
165 | } |
166 | |
167 | /// Compute the sets of entry and return blocks for saving and restoring |
168 | /// callee-saved registers, and placing prolog and epilog code. |
169 | void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) { |
170 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
171 | |
172 | // Even when we do not change any CSR, we still want to insert the |
173 | // prologue and epilogue of the function. |
174 | // So set the save points for those. |
175 | |
176 | // Use the points found by shrink-wrapping, if any. |
177 | if (MFI.getSavePoint()) { |
178 | SaveBlocks.push_back(Elt: MFI.getSavePoint()); |
179 | assert(MFI.getRestorePoint() && "Both restore and save must be set" ); |
180 | MachineBasicBlock *RestoreBlock = MFI.getRestorePoint(); |
181 | // If RestoreBlock does not have any successor and is not a return block |
182 | // then the end point is unreachable and we do not need to insert any |
183 | // epilogue. |
184 | if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock()) |
185 | RestoreBlocks.push_back(Elt: RestoreBlock); |
186 | return; |
187 | } |
188 | |
189 | // Save refs to entry and return blocks. |
190 | SaveBlocks.push_back(Elt: &MF.front()); |
191 | for (MachineBasicBlock &MBB : MF) { |
192 | if (MBB.isEHFuncletEntry()) |
193 | SaveBlocks.push_back(Elt: &MBB); |
194 | if (MBB.isReturnBlock()) |
195 | RestoreBlocks.push_back(Elt: &MBB); |
196 | } |
197 | } |
198 | |
199 | // TODO: To support shrink wrapping, this would need to copy |
200 | // PrologEpilogInserter's updateLiveness. |
201 | static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) { |
202 | MachineBasicBlock &EntryBB = MF.front(); |
203 | |
204 | for (const CalleeSavedInfo &CSIReg : CSI) |
205 | EntryBB.addLiveIn(PhysReg: CSIReg.getReg()); |
206 | EntryBB.sortUniqueLiveIns(); |
207 | } |
208 | |
209 | bool SILowerSGPRSpills::spillCalleeSavedRegs( |
210 | MachineFunction &MF, SmallVectorImpl<int> &CalleeSavedFIs) { |
211 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
212 | const Function &F = MF.getFunction(); |
213 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
214 | const SIFrameLowering *TFI = ST.getFrameLowering(); |
215 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
216 | RegScavenger *RS = nullptr; |
217 | |
218 | // Determine which of the registers in the callee save list should be saved. |
219 | BitVector SavedRegs; |
220 | TFI->determineCalleeSavesSGPR(MF, SavedRegs, RS); |
221 | |
222 | // Add the code to save and restore the callee saved registers. |
223 | if (!F.hasFnAttribute(Kind: Attribute::Naked)) { |
224 | // FIXME: This is a lie. The CalleeSavedInfo is incomplete, but this is |
225 | // necessary for verifier liveness checks. |
226 | MFI.setCalleeSavedInfoValid(true); |
227 | |
228 | std::vector<CalleeSavedInfo> CSI; |
229 | const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); |
230 | |
231 | for (unsigned I = 0; CSRegs[I]; ++I) { |
232 | MCRegister Reg = CSRegs[I]; |
233 | |
234 | if (SavedRegs.test(Idx: Reg)) { |
235 | const TargetRegisterClass *RC = |
236 | TRI->getMinimalPhysRegClass(Reg, VT: MVT::i32); |
237 | int JunkFI = MFI.CreateStackObject(Size: TRI->getSpillSize(RC: *RC), |
238 | Alignment: TRI->getSpillAlign(RC: *RC), isSpillSlot: true); |
239 | |
240 | CSI.emplace_back(args&: Reg, args&: JunkFI); |
241 | CalleeSavedFIs.push_back(Elt: JunkFI); |
242 | } |
243 | } |
244 | |
245 | if (!CSI.empty()) { |
246 | for (MachineBasicBlock *SaveBlock : SaveBlocks) |
247 | insertCSRSaves(SaveBlock&: *SaveBlock, CSI, Indexes, LIS); |
248 | |
249 | // Add live ins to save blocks. |
250 | assert(SaveBlocks.size() == 1 && "shrink wrapping not fully implemented" ); |
251 | updateLiveness(MF, CSI); |
252 | |
253 | for (MachineBasicBlock *RestoreBlock : RestoreBlocks) |
254 | insertCSRRestores(RestoreBlock&: *RestoreBlock, CSI, Indexes, LIS); |
255 | return true; |
256 | } |
257 | } |
258 | |
259 | return false; |
260 | } |
261 | |
262 | void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF, |
263 | LiveIntervals *LIS) { |
264 | // TODO: This is a workaround to avoid the unmodelled liveness computed with |
265 | // whole-wave virtual registers when allocated together with the regular VGPR |
266 | // virtual registers. Presently, the liveness computed during the regalloc is |
267 | // only uniform (or single lane aware) and it doesn't take account of the |
268 | // divergent control flow that exists for our GPUs. Since the WWM registers |
269 | // can modify inactive lanes, the wave-aware liveness should be computed for |
270 | // the virtual registers to accurately plot their interferences. Without |
271 | // having the divergent CFG for the function, it is difficult to implement the |
272 | // wave-aware liveness info. Until then, we conservatively extend the liveness |
273 | // of the wwm registers into the entire function so that they won't be reused |
274 | // without first spilling/splitting their liveranges. |
275 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
276 | |
277 | // Insert the IMPLICIT_DEF for the wwm-registers in the entry blocks. |
278 | for (auto Reg : MFI->getSGPRSpillVGPRs()) { |
279 | for (MachineBasicBlock *SaveBlock : SaveBlocks) { |
280 | MachineBasicBlock::iterator InsertBefore = SaveBlock->begin(); |
281 | DebugLoc DL = SaveBlock->findDebugLoc(MBBI: InsertBefore); |
282 | auto MIB = BuildMI(BB&: *SaveBlock, I: InsertBefore, MIMD: DL, |
283 | MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Reg); |
284 | MFI->setFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG); |
285 | // Set SGPR_SPILL asm printer flag |
286 | MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL); |
287 | if (LIS) { |
288 | LIS->InsertMachineInstrInMaps(MI&: *MIB); |
289 | } |
290 | } |
291 | } |
292 | |
293 | // Insert the KILL in the return blocks to extend their liveness untill the |
294 | // end of function. Insert a separate KILL for each VGPR. |
295 | for (MachineBasicBlock *RestoreBlock : RestoreBlocks) { |
296 | MachineBasicBlock::iterator InsertBefore = |
297 | RestoreBlock->getFirstTerminator(); |
298 | DebugLoc DL = RestoreBlock->findDebugLoc(MBBI: InsertBefore); |
299 | for (auto Reg : MFI->getSGPRSpillVGPRs()) { |
300 | auto MIB = BuildMI(BB&: *RestoreBlock, I: InsertBefore, MIMD: DL, |
301 | MCID: TII->get(Opcode: TargetOpcode::KILL)); |
302 | MIB.addReg(RegNo: Reg); |
303 | if (LIS) |
304 | LIS->InsertMachineInstrInMaps(MI&: *MIB); |
305 | } |
306 | } |
307 | } |
308 | |
309 | bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { |
310 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
311 | TII = ST.getInstrInfo(); |
312 | TRI = &TII->getRegisterInfo(); |
313 | |
314 | auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>(); |
315 | LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; |
316 | auto *SIWrapper = getAnalysisIfAvailable<SlotIndexesWrapperPass>(); |
317 | Indexes = SIWrapper ? &SIWrapper->getSI() : nullptr; |
318 | |
319 | assert(SaveBlocks.empty() && RestoreBlocks.empty()); |
320 | |
321 | // First, expose any CSR SGPR spills. This is mostly the same as what PEI |
322 | // does, but somewhat simpler. |
323 | calculateSaveRestoreBlocks(MF); |
324 | SmallVector<int> CalleeSavedFIs; |
325 | bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs); |
326 | |
327 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
328 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
329 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
330 | |
331 | if (!MFI.hasStackObjects() && !HasCSRs) { |
332 | SaveBlocks.clear(); |
333 | RestoreBlocks.clear(); |
334 | return false; |
335 | } |
336 | |
337 | bool MadeChange = false; |
338 | bool SpilledToVirtVGPRLanes = false; |
339 | |
340 | // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be |
341 | // handled as SpilledToReg in regular PrologEpilogInserter. |
342 | const bool HasSGPRSpillToVGPR = TRI->spillSGPRToVGPR() && |
343 | (HasCSRs || FuncInfo->hasSpilledSGPRs()); |
344 | if (HasSGPRSpillToVGPR) { |
345 | // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs |
346 | // are spilled to VGPRs, in which case we can eliminate the stack usage. |
347 | // |
348 | // This operates under the assumption that only other SGPR spills are users |
349 | // of the frame index. |
350 | |
351 | // To track the spill frame indices handled in this pass. |
352 | BitVector SpillFIs(MFI.getObjectIndexEnd(), false); |
353 | |
354 | for (MachineBasicBlock &MBB : MF) { |
355 | for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) { |
356 | if (!TII->isSGPRSpill(MI)) |
357 | continue; |
358 | |
359 | int FI = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::addr)->getIndex(); |
360 | assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); |
361 | |
362 | bool IsCalleeSaveSGPRSpill = llvm::is_contained(Range&: CalleeSavedFIs, Element: FI); |
363 | if (IsCalleeSaveSGPRSpill) { |
364 | // Spill callee-saved SGPRs into physical VGPR lanes. |
365 | |
366 | // TODO: This is to ensure the CFIs are static for efficient frame |
367 | // unwinding in the debugger. Spilling them into virtual VGPR lanes |
368 | // involve regalloc to allocate the physical VGPRs and that might |
369 | // cause intermediate spill/split of such liveranges for successful |
370 | // allocation. This would result in broken CFI encoding unless the |
371 | // regalloc aware CFI generation to insert new CFIs along with the |
372 | // intermediate spills is implemented. There is no such support |
373 | // currently exist in the LLVM compiler. |
374 | if (FuncInfo->allocateSGPRSpillToVGPRLane( |
375 | MF, FI, /*SpillToPhysVGPRLane=*/true)) { |
376 | bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( |
377 | MI, FI, RS: nullptr, Indexes, LIS, SpillToPhysVGPRLane: true); |
378 | if (!Spilled) |
379 | llvm_unreachable( |
380 | "failed to spill SGPR to physical VGPR lane when allocated" ); |
381 | } |
382 | } else { |
383 | if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { |
384 | bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( |
385 | MI, FI, RS: nullptr, Indexes, LIS); |
386 | if (!Spilled) |
387 | llvm_unreachable( |
388 | "failed to spill SGPR to virtual VGPR lane when allocated" ); |
389 | SpillFIs.set(FI); |
390 | SpilledToVirtVGPRLanes = true; |
391 | } |
392 | } |
393 | } |
394 | } |
395 | |
396 | if (SpilledToVirtVGPRLanes) { |
397 | extendWWMVirtRegLiveness(MF, LIS); |
398 | if (LIS) { |
399 | // Compute the LiveInterval for the newly created virtual registers. |
400 | for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) |
401 | LIS->createAndComputeVirtRegInterval(Reg); |
402 | } |
403 | } |
404 | |
405 | for (MachineBasicBlock &MBB : MF) { |
406 | // FIXME: The dead frame indices are replaced with a null register from |
407 | // the debug value instructions. We should instead, update it with the |
408 | // correct register value. But not sure the register value alone is |
409 | // adequate to lower the DIExpression. It should be worked out later. |
410 | for (MachineInstr &MI : MBB) { |
411 | if (MI.isDebugValue() && MI.getOperand(i: 0).isFI() && |
412 | !MFI.isFixedObjectIndex(ObjectIdx: MI.getOperand(i: 0).getIndex()) && |
413 | SpillFIs[MI.getOperand(i: 0).getIndex()]) { |
414 | MI.getOperand(i: 0).ChangeToRegister(Reg: Register(), isDef: false /*isDef*/); |
415 | } |
416 | } |
417 | } |
418 | |
419 | // All those frame indices which are dead by now should be removed from the |
420 | // function frame. Otherwise, there is a side effect such as re-mapping of |
421 | // free frame index ids by the later pass(es) like "stack slot coloring" |
422 | // which in turn could mess-up with the book keeping of "frame index to VGPR |
423 | // lane". |
424 | FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false); |
425 | |
426 | MadeChange = true; |
427 | } |
428 | |
429 | if (SpilledToVirtVGPRLanes) { |
430 | const TargetRegisterClass *RC = TRI->getWaveMaskRegClass(); |
431 | // Shift back the reserved SGPR for EXEC copy into the lowest range. |
432 | // This SGPR is reserved to handle the whole-wave spill/copy operations |
433 | // that might get inserted during vgpr regalloc. |
434 | Register UnusedLowSGPR = TRI->findUnusedRegister(MRI, RC, MF); |
435 | if (UnusedLowSGPR && TRI->getHWRegIndex(Reg: UnusedLowSGPR) < |
436 | TRI->getHWRegIndex(Reg: FuncInfo->getSGPRForEXECCopy())) |
437 | FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR); |
438 | } else { |
439 | // No SGPR spills to virtual VGPR lanes and hence there won't be any WWM |
440 | // spills/copies. Reset the SGPR reserved for EXEC copy. |
441 | FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister); |
442 | } |
443 | |
444 | SaveBlocks.clear(); |
445 | RestoreBlocks.clear(); |
446 | |
447 | return MadeChange; |
448 | } |
449 | |