//===-- SILowerSGPRSpills.cpp ---------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Handle SGPR spills. This pass takes the place of PrologEpilogInserter for
// all SGPR spills, so it must insert CSR SGPR spills as well as expand them.
//
// This pass must never create new SGPR virtual registers.
//
// FIXME: Must stop RegScavenger spills in later passes.
//
//===----------------------------------------------------------------------===//

#include "SILowerSGPRSpills.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-sgpr-spills"

using MBBVector = SmallVector<MachineBasicBlock *, 4>;

namespace {

static cl::opt<unsigned> MaxNumVGPRsForWwmAllocation(
    "amdgpu-num-vgprs-for-wwm-alloc",
    cl::desc("Max num VGPRs for whole-wave register allocation."),
    cl::ReallyHidden, cl::init(10));

class SILowerSGPRSpills {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  LiveIntervals *LIS = nullptr;
  SlotIndexes *Indexes = nullptr;
  MachineDominatorTree *MDT = nullptr;

  // Save and Restore blocks of the current function. Typically there is a
  // single save block, unless Windows EH funclets are involved.
  MBBVector SaveBlocks;
  MBBVector RestoreBlocks;

public:
  SILowerSGPRSpills(LiveIntervals *LIS, SlotIndexes *Indexes,
                    MachineDominatorTree *MDT)
      : LIS(LIS), Indexes(Indexes), MDT(MDT) {}
  bool run(MachineFunction &MF);
  void calculateSaveRestoreBlocks(MachineFunction &MF);
  bool spillCalleeSavedRegs(MachineFunction &MF,
                            SmallVectorImpl<int> &CalleeSavedFIs);
  void updateLaneVGPRDomInstr(
      int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
      DenseMap<Register, MachineBasicBlock::iterator> &LaneVGPRDomInstr);
  void determineRegsForWWMAllocation(MachineFunction &MF, BitVector &RegMask);
};

class SILowerSGPRSpillsLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILowerSGPRSpillsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getClearedProperties() const override {
    // SILowerSGPRSpills introduces new virtual VGPRs for spilling SGPRs.
    return MachineFunctionProperties().setIsSSA().setNoVRegs();
  }
};

} // end anonymous namespace

char SILowerSGPRSpillsLegacy::ID = 0;

INITIALIZE_PASS_BEGIN(SILowerSGPRSpillsLegacy, DEBUG_TYPE,
                      "SI lower SGPR spill instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
INITIALIZE_PASS_END(SILowerSGPRSpillsLegacy, DEBUG_TYPE,
                    "SI lower SGPR spill instructions", false, false)

char &llvm::SILowerSGPRSpillsLegacyID = SILowerSGPRSpillsLegacy::ID;

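/// Check whether \p Reg or any of its register aliases is live into \p MBB.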
static bool isLiveIntoMBB(MCRegister Reg, MachineBasicBlock &MBB,
                          const TargetRegisterInfo *TRI) {
  for (MCRegAliasIterator R(Reg, TRI, true); R.isValid(); ++R) {
    if (MBB.isLiveIn(*R)) {
      return true;
    }
  }
  return false;
}

/// Insert spill code for the callee-saved registers used in the function.
static void insertCSRSaves(MachineBasicBlock &SaveBlock,
                           ArrayRef<CalleeSavedInfo> CSI, SlotIndexes *Indexes,
                           LiveIntervals *LIS) {
  MachineFunction &MF = *SaveBlock.getParent();
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();

  MachineBasicBlock::iterator I = SaveBlock.begin();
  if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
    for (const CalleeSavedInfo &CS : CSI) {
      // Insert the spill to the stack frame.
      MCRegister Reg = CS.getReg();

      MachineInstrSpan MIS(I, &SaveBlock);
      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(
          Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32);

      // If this value was already livein, we probably have a direct use of the
      // incoming register value, so don't kill at the spill point. This happens
      // since we pass some special inputs (workgroup IDs) in the callee saved
      // range.
      const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, TRI);
      TII.storeRegToStackSlot(SaveBlock, I, Reg, !IsLiveIn, CS.getFrameIdx(),
                              RC, TRI, Register());

      if (Indexes) {
        assert(std::distance(MIS.begin(), I) == 1);
        MachineInstr &Inst = *std::prev(I);
        Indexes->insertMachineInstrInMaps(Inst);
      }

      if (LIS)
        LIS->removeAllRegUnitsForPhysReg(Reg);
    }
  } else {
    // TFI doesn't update Indexes and LIS, so we have to do it separately.
    if (Indexes)
      Indexes->repairIndexesInRange(&SaveBlock, SaveBlock.begin(), I);

    if (LIS)
      for (const CalleeSavedInfo &CS : CSI)
        LIS->removeAllRegUnitsForPhysReg(CS.getReg());
  }
}

/// Insert restore code for the callee-saved registers used in the function.
static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
                              MutableArrayRef<CalleeSavedInfo> CSI,
                              SlotIndexes *Indexes, LiveIntervals *LIS) {
  MachineFunction &MF = *RestoreBlock.getParent();
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  // Restore all registers immediately before the return and any
  // terminators that precede it.
  MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator();
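  // Remember the point just before the restores, so the SlotIndexes repair
  // below can cover exactly the range of the inserted instructions.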
  const MachineBasicBlock::iterator BeforeRestoresI =
      I == RestoreBlock.begin() ? I : std::prev(I);

  // FIXME: Just emit the readlane/writelane directly
  if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
    for (const CalleeSavedInfo &CI : reverse(CSI)) {
      // Insert in reverse order. loadRegFromStackSlot can insert
      // multiple instructions.
      TFI->restoreCalleeSavedRegister(RestoreBlock, I, CI, &TII, TRI);

      if (Indexes) {
        MachineInstr &Inst = *std::prev(I);
        Indexes->insertMachineInstrInMaps(Inst);
      }

      if (LIS)
        LIS->removeAllRegUnitsForPhysReg(CI.getReg());
    }
  } else {
    // TFI doesn't update Indexes and LIS, so we have to do it separately.
    if (Indexes)
      Indexes->repairIndexesInRange(&RestoreBlock, BeforeRestoresI,
                                    RestoreBlock.getFirstTerminator());

    if (LIS)
      for (const CalleeSavedInfo &CS : CSI)
        LIS->removeAllRegUnitsForPhysReg(CS.getReg());
  }
}

/// Compute the sets of entry and return blocks for saving and restoring
/// callee-saved registers, and placing prolog and epilog code.
void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Even when we do not change any CSR, we still want to insert the
  // prologue and epilogue of the function.
  // So set the save points for those.

  // Use the points found by shrink-wrapping, if any.
  if (MFI.getSavePoint()) {
    SaveBlocks.push_back(MFI.getSavePoint());
    assert(MFI.getRestorePoint() && "Both restore and save must be set");
    MachineBasicBlock *RestoreBlock = MFI.getRestorePoint();
    // If RestoreBlock does not have any successor and is not a return block
    // then the end point is unreachable and we do not need to insert any
    // epilogue.
    if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock())
      RestoreBlocks.push_back(RestoreBlock);
    return;
  }

  // Save refs to entry and return blocks.
  SaveBlocks.push_back(&MF.front());
  for (MachineBasicBlock &MBB : MF) {
    if (MBB.isEHFuncletEntry())
      SaveBlocks.push_back(&MBB);
    if (MBB.isReturnBlock())
      RestoreBlocks.push_back(&MBB);
  }
}

// TODO: To support shrink wrapping, this would need to copy
// PrologEpilogInserter's updateLiveness.
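// Mark the callee-saved registers as live-in to the entry block, so the
// inserted save instructions read well-defined values.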
static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) {
  MachineBasicBlock &EntryBB = MF.front();

  for (const CalleeSavedInfo &CSIReg : CSI)
    EntryBB.addLiveIn(CSIReg.getReg());
  EntryBB.sortUniqueLiveIns();
}

bool SILowerSGPRSpills::spillCalleeSavedRegs(
    MachineFunction &MF, SmallVectorImpl<int> &CalleeSavedFIs) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIFrameLowering *TFI = ST.getFrameLowering();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  RegScavenger *RS = nullptr;

  // Determine which of the registers in the callee save list should be saved.
  BitVector SavedRegs;
  TFI->determineCalleeSavesSGPR(MF, SavedRegs, RS);

  // Add the code to save and restore the callee saved registers.
  if (!F.hasFnAttribute(Attribute::Naked)) {
    // FIXME: This is a lie. The CalleeSavedInfo is incomplete, but this is
    // necessary for verifier liveness checks.
    MFI.setCalleeSavedInfoValid(true);

    std::vector<CalleeSavedInfo> CSI;
    const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();

    for (unsigned I = 0; CSRegs[I]; ++I) {
      MCRegister Reg = CSRegs[I];

      if (SavedRegs.test(Reg)) {
        const TargetRegisterClass *RC =
            TRI->getMinimalPhysRegClass(Reg, MVT::i32);
        int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
                                           TRI->getSpillAlign(*RC), true);

        CSI.emplace_back(Reg, JunkFI);
        CalleeSavedFIs.push_back(JunkFI);
      }
    }

    if (!CSI.empty()) {
      for (MachineBasicBlock *SaveBlock : SaveBlocks)
        insertCSRSaves(*SaveBlock, CSI, Indexes, LIS);

      // Add live ins to save blocks.
      assert(SaveBlocks.size() == 1 && "shrink wrapping not fully implemented");
      updateLiveness(MF, CSI);

      for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
        insertCSRRestores(*RestoreBlock, CSI, Indexes, LIS);
      return true;
    }
  }

  return false;
}

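/// Track, for each lane VGPR, an instruction that dominates all the spills
/// using that VGPR, so an IMPLICIT_DEF can later be placed there.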
void SILowerSGPRSpills::updateLaneVGPRDomInstr(
    int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
    DenseMap<Register, MachineBasicBlock::iterator> &LaneVGPRDomInstr) {
  // For the def of a virtual LaneVGPR to dominate all of its uses, insert an
  // IMPLICIT_DEF before the dominating spill. Switching to a depth-first
  // order doesn't really help, since post-SSA the machine function can
  // contain unstructured control flow. Hence, for each virtual register,
  // find the common dominator to get either the dominating spill or a block
  // dominating all spills.
  SIMachineFunctionInfo *FuncInfo =
      MBB->getParent()->getInfo<SIMachineFunctionInfo>();
  ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills =
      FuncInfo->getSGPRSpillToVirtualVGPRLanes(FI);
  Register PrevLaneVGPR;
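  // Skip consecutive entries that reuse the previous lane VGPR; each lane
  // VGPR only needs a single insertion-point update per frame index.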
  for (auto &Spill : VGPRSpills) {
    if (PrevLaneVGPR == Spill.VGPR)
      continue;

    PrevLaneVGPR = Spill.VGPR;
    auto I = LaneVGPRDomInstr.find(Spill.VGPR);
    if (Spill.Lane == 0 && I == LaneVGPRDomInstr.end()) {
      // Initially add the spill instruction itself as the insertion point.
      LaneVGPRDomInstr[Spill.VGPR] = InsertPt;
    } else {
      assert(I != LaneVGPRDomInstr.end());
      auto PrevInsertPt = I->second;
      MachineBasicBlock *DomMBB = PrevInsertPt->getParent();
      if (DomMBB == MBB) {
        // The insertion point chosen earlier lies in the block whose spills
        // are currently being lowered. That earlier InsertPt would be the one
        // just before the block terminator, and it should be moved up if any
        // new spill is inserted ahead of it.
        if (MDT->dominates(&*InsertPt, &*PrevInsertPt))
          I->second = InsertPt;

        continue;
      }

      // Find the common dominator block between PrevInsertPt and the
      // current spill.
      DomMBB = MDT->findNearestCommonDominator(DomMBB, MBB);
      if (DomMBB == MBB)
        I->second = InsertPt;
      else if (DomMBB != PrevInsertPt->getParent())
        I->second = &(*DomMBB->getFirstTerminator());
    }
  }
}

void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
                                                      BitVector &RegMask) {
  // Determine an optimal number of VGPRs for WWM allocation. The complement
  // list will be available for allocating other VGPR virtual registers.
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  BitVector ReservedRegs = TRI->getReservedRegs(MF);
  BitVector NonWwmAllocMask(TRI->getNumRegs());

  // FIXME: MaxNumVGPRsForWwmAllocation might need to be adjusted in the future
  // to have a balanced allocation between WWM values and per-thread vector
  // register operands.
  unsigned NumRegs = MaxNumVGPRsForWwmAllocation;
  NumRegs =
      std::min(static_cast<unsigned>(MFI->getSGPRSpillVGPRs().size()), NumRegs);

  auto [MaxNumVGPRs, MaxNumAGPRs] = TRI->getMaxNumVectorRegs(MF);
  // Try to use the highest available registers for now. Later after
  // vgpr-regalloc, they can be shifted to the lowest range.
  unsigned I = 0;
  for (unsigned Reg = AMDGPU::VGPR0 + MaxNumVGPRs - 1;
       (I < NumRegs) && (Reg >= AMDGPU::VGPR0); --Reg) {
    if (!ReservedRegs.test(Reg) &&
        !MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/true)) {
      TRI->markSuperRegs(RegMask, Reg);
      ++I;
    }
  }

  if (I != NumRegs) {
    // Reserve an arbitrary register and report the error.
    TRI->markSuperRegs(RegMask, AMDGPU::VGPR0);
    MF.getFunction().getContext().emitError(
        "can't find enough VGPRs for wwm-regalloc");
  }
}

bool SILowerSGPRSpillsLegacy::runOnMachineFunction(MachineFunction &MF) {
  auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>();
  LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
  auto *SIWrapper = getAnalysisIfAvailable<SlotIndexesWrapperPass>();
  SlotIndexes *Indexes = SIWrapper ? &SIWrapper->getSI() : nullptr;
  MachineDominatorTree *MDT =
      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  return SILowerSGPRSpills(LIS, Indexes, MDT).run(MF);
}

bool SILowerSGPRSpills::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  assert(SaveBlocks.empty() && RestoreBlocks.empty());

  // First, expose any CSR SGPR spills. This is mostly the same as what PEI
  // does, but somewhat simpler.
  calculateSaveRestoreBlocks(MF);
  SmallVector<int> CalleeSavedFIs;
  bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

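  // With no stack objects and no CSRs there is nothing to lower; bail out
  // early without touching the function.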
  if (!MFI.hasStackObjects() && !HasCSRs) {
    SaveBlocks.clear();
    RestoreBlocks.clear();
    return false;
  }

  bool MadeChange = false;
  bool SpilledToVirtVGPRLanes = false;

  // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
  // handled as SpilledToReg in regular PrologEpilogInserter.
  const bool HasSGPRSpillToVGPR =
      TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs());
  if (HasSGPRSpillToVGPR) {
    // Process all SGPR spills before frame offsets are finalized. Ideally
    // SGPRs are spilled to VGPRs, in which case we can eliminate the stack
    // usage.
    //
    // This operates under the assumption that only other SGPR spills are users
    // of the frame index.

    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);

    // To track the IMPLICIT_DEF insertion point for the lane VGPRs.
    DenseMap<Register, MachineBasicBlock::iterator> LaneVGPRDomInstr;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
        if (!TII->isSGPRSpill(MI))
          continue;

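        // A spill of an undef SGPR is dead; delete the instruction rather
        // than expanding it.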
        if (MI.getOperand(0).isUndef()) {
          if (Indexes)
            Indexes->removeMachineInstrFromMaps(MI);
          MI.eraseFromParent();
          continue;
        }

        int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
        assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);

        bool IsCalleeSaveSGPRSpill = llvm::is_contained(CalleeSavedFIs, FI);
        if (IsCalleeSaveSGPRSpill) {
          // Spill callee-saved SGPRs into physical VGPR lanes.

          // TODO: This keeps the CFIs static, for efficient frame unwinding
          // in the debugger. Spilling into virtual VGPR lanes would instead
          // require regalloc to allocate the physical VGPRs, which might
          // spill or split those live ranges to succeed. That would break the
          // CFI encoding unless regalloc-aware CFI generation, inserting new
          // CFIs along with the intermediate spills, is implemented; no such
          // support currently exists in LLVM.
          if (FuncInfo->allocateSGPRSpillToVGPRLane(
                  MF, FI, /*SpillToPhysVGPRLane=*/true)) {
            bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
                MI, FI, nullptr, Indexes, LIS, /*SpillToPhysVGPRLane=*/true);
            if (!Spilled)
              llvm_unreachable(
                  "failed to spill SGPR to physical VGPR lane when allocated");
          }
        } else {
          MachineInstrSpan MIS(&MI, &MBB);
          if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
            bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
                MI, FI, nullptr, Indexes, LIS);
            if (!Spilled)
              llvm_unreachable(
                  "failed to spill SGPR to virtual VGPR lane when allocated");
            SpillFIs.set(FI);
            updateLaneVGPRDomInstr(FI, &MBB, MIS.begin(), LaneVGPRDomInstr);
            SpilledToVirtVGPRLanes = true;
          }
        }
      }
    }

    for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) {
      auto InsertPt = LaneVGPRDomInstr[Reg];
      // Insert the IMPLICIT_DEF at the identified points.
      MachineBasicBlock &Block = *InsertPt->getParent();
      DebugLoc DL = Block.findDebugLoc(InsertPt);
      auto MIB =
          BuildMI(Block, *InsertPt, DL, TII->get(AMDGPU::IMPLICIT_DEF), Reg);

      // Add WWM flag to the virtual register.
      FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);

      // Set SGPR_SPILL asm printer flag
      MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL);
      if (LIS) {
        LIS->InsertMachineInstrInMaps(*MIB);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
    }

    // Determine the registers for WWM allocation and also compute the register
    // mask for non-wwm VGPR allocation.
    if (!FuncInfo->getSGPRSpillVGPRs().empty()) {
      BitVector WwmRegMask(TRI->getNumRegs());

      determineRegsForWWMAllocation(MF, WwmRegMask);

      BitVector NonWwmRegMask(WwmRegMask);
      NonWwmRegMask.flip().clearBitsNotInMask(TRI->getAllVGPRRegMask());

      // The complement set will be the registers for non-wwm (per-thread) vgpr
      // allocation.
      FuncInfo->updateNonWWMRegMask(NonWwmRegMask);
    }

    for (MachineBasicBlock &MBB : MF) {
      // FIXME: Dead frame indices in debug value instructions are replaced
      // with a null register below. We should instead update them with the
      // correct register value, but it is unclear whether the register value
      // alone is adequate to lower the DIExpression; work this out later.
      for (MachineInstr &MI : MBB) {
        if (MI.isDebugValue()) {
          uint32_t StackOperandIdx = MI.isDebugValueList() ? 2 : 0;
          if (MI.getOperand(StackOperandIdx).isFI() &&
              !MFI.isFixedObjectIndex(
                  MI.getOperand(StackOperandIdx).getIndex()) &&
              SpillFIs[MI.getOperand(StackOperandIdx).getIndex()]) {
            MI.getOperand(StackOperandIdx)
                .ChangeToRegister(Register(), /*isDef=*/false);
          }
        }
      }
    }

    // Remove all frame indices that are dead by now from the function frame.
    // Otherwise, later passes such as "stack slot coloring" may remap the
    // freed frame index IDs, which would corrupt the frame-index-to-VGPR-lane
    // bookkeeping.
    FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false);

    MadeChange = true;
  }

  if (SpilledToVirtVGPRLanes) {
    const TargetRegisterClass *RC = TRI->getWaveMaskRegClass();
    // Shift back the reserved SGPR for EXEC copy into the lowest range.
    // This SGPR is reserved to handle the whole-wave spill/copy operations
    // that might get inserted during vgpr regalloc.
    Register UnusedLowSGPR = TRI->findUnusedRegister(MRI, RC, MF);
    if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) <
                             TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy()))
      FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR);
  } else {
    // No SGPR spills to virtual VGPR lanes and hence there won't be any WWM
    // spills/copies. Reset the SGPR reserved for EXEC copy.
    FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister);
  }

  SaveBlocks.clear();
  RestoreBlocks.clear();

  return MadeChange;
}

PreservedAnalyses
SILowerSGPRSpillsPass::run(MachineFunction &MF,
                           MachineFunctionAnalysisManager &MFAM) {
  MFPropsModifier _(*this, MF);
  auto *LIS = MFAM.getCachedResult<LiveIntervalsAnalysis>(MF);
  auto *Indexes = MFAM.getCachedResult<SlotIndexesAnalysis>(MF);
  MachineDominatorTree *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
  SILowerSGPRSpills(LIS, Indexes, MDT).run(MF);
  return PreservedAnalyses::all();
}