//===-- SILowerSGPRSpills.cpp ---------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Handle SGPR spills. This pass takes the place of PrologEpilogInserter for all
10// SGPR spills, so must insert CSR SGPR spills as well as expand them.
11//
12// This pass must never create new SGPR virtual registers.
13//
14// FIXME: Must stop RegScavenger spills in later passes.
15//
16//===----------------------------------------------------------------------===//
17
18#include "SILowerSGPRSpills.h"
19#include "AMDGPU.h"
20#include "GCNSubtarget.h"
21#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22#include "SIMachineFunctionInfo.h"
23#include "SISpillUtils.h"
24#include "llvm/CodeGen/LiveIntervals.h"
25#include "llvm/CodeGen/MachineCycleAnalysis.h"
26#include "llvm/CodeGen/MachineDominators.h"
27#include "llvm/CodeGen/MachineFrameInfo.h"
28#include "llvm/CodeGen/RegisterScavenging.h"
29#include "llvm/InitializePasses.h"
30
using namespace llvm;

// Name under which this pass is registered with the legacy pass manager (see
// the INITIALIZE_PASS_* macros below).
#define DEBUG_TYPE "si-lower-sgpr-spills"

// Small list of basic blocks; used for the save and restore block sets.
using MBBVector = SmallVector<MachineBasicBlock *, 4>;
36
37namespace {
38
/// Insertion point for the IMPLICIT_DEF of a lane VGPR.
///
/// The iterator may be MBB::end(), which can't be dereferenced to find the
/// block it belongs to, so the parent block is stored explicitly next to it.
struct LaneVGPRInsertPt {
  // Block that receives the IMPLICIT_DEF.
  MachineBasicBlock *MBB;
  // Position inside MBB to insert before; may be MBB->end().
  MachineBasicBlock::iterator It;
};
45
46static LaneVGPRInsertPt insertPt(MachineBasicBlock *MBB,
47 MachineBasicBlock::iterator It) {
48 return {.MBB: MBB, .It: It};
49}
50
51static cl::opt<unsigned> MaxNumVGPRsForWwmAllocation(
52 "amdgpu-num-vgprs-for-wwm-alloc",
53 cl::desc("Max num VGPRs for whole-wave register allocation."),
54 cl::ReallyHidden, cl::init(Val: 10));
55
/// Pass implementation shared by the legacy and the new pass-manager entry
/// points defined in this file.
class SILowerSGPRSpills {
private:
  // Target hooks; both are set from the subtarget at the start of run().
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  // Analyses supplied through the constructor. LIS and Indexes may be null
  // (the entry points only pass them when already available); MDT and MCI are
  // dereferenced unconditionally.
  LiveIntervals *LIS = nullptr;
  SlotIndexes *Indexes = nullptr;
  MachineDominatorTree *MDT = nullptr;
  MachineCycleInfo *MCI = nullptr;

  // Save and Restore blocks of the current function. Typically there is a
  // single save block, unless Windows EH funclets are involved.
  MBBVector SaveBlocks;
  MBBVector RestoreBlocks;

  // Returns a block dominating every entry of cycle C.
  MachineBasicBlock *getCycleDomBB(MachineCycle *C);

public:
  SILowerSGPRSpills(LiveIntervals *LIS, SlotIndexes *Indexes,
                    MachineDominatorTree *MDT, MachineCycleInfo *MCI)
      : LIS(LIS), Indexes(Indexes), MDT(MDT), MCI(MCI) {}
  // Runs the pass on MF; the result is the "made a change" flag.
  bool run(MachineFunction &MF);
  // Fills SaveBlocks / RestoreBlocks for MF.
  void calculateSaveRestoreBlocks(MachineFunction &MF);
  // Inserts callee-saved SGPR saves/restores and appends the frame indices
  // created for them to CalleeSavedFIs. Returns true if any code was inserted.
  bool spillCalleeSavedRegs(MachineFunction &MF,
                            SmallVectorImpl<int> &CalleeSavedFIs);
  // Updates, for every lane VGPR backing frame index FI, the recorded
  // IMPLICIT_DEF insertion point given a spill lowered at InsertPt in MBB.
  void updateLaneVGPRDomInstr(
      int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
      DenseMap<Register, LaneVGPRInsertPt> &LaneVGPRDomInstr);
  // Marks in RegMask the physical VGPRs chosen for WWM allocation.
  void determineRegsForWWMAllocation(MachineFunction &MF, BitVector &RegMask);
};
85
/// Legacy pass-manager wrapper around SILowerSGPRSpills.
class SILowerSGPRSpillsLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILowerSGPRSpillsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  // The dominator tree and cycle info are required; LiveIntervals and
  // SlotIndexes are not requested here and are only picked up in
  // runOnMachineFunction() if already available. All analyses are declared
  // preserved.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachineCycleInfoWrapperPass>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getClearedProperties() const override {
    // SILowerSGPRSpills introduces new Virtual VGPRs for spilling SGPRs.
    return MachineFunctionProperties().setIsSSA().setNoVRegs();
  }
};
106
107} // end anonymous namespace
108
// Definition of the legacy pass identifier.
char SILowerSGPRSpillsLegacy::ID = 0;

// Register the legacy pass under DEBUG_TYPE and list the analysis passes it
// depends on.
INITIALIZE_PASS_BEGIN(SILowerSGPRSpillsLegacy, DEBUG_TYPE,
                      "SI lower SGPR spill instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
INITIALIZE_PASS_END(SILowerSGPRSpillsLegacy, DEBUG_TYPE,
                    "SI lower SGPR spill instructions", false, false)

// Reference to the pass ID exported in the llvm namespace.
char &llvm::SILowerSGPRSpillsLegacyID = SILowerSGPRSpillsLegacy::ID;
121
122/// Insert spill code for the callee-saved registers used in the function.
123static void insertCSRSaves(const GCNSubtarget &ST, MachineBasicBlock &SaveBlock,
124 ArrayRef<CalleeSavedInfo> CSI, SlotIndexes *Indexes,
125 LiveIntervals *LIS) {
126 const TargetFrameLowering *TFI = ST.getFrameLowering();
127 const TargetRegisterInfo *TRI = ST.getRegisterInfo();
128 MachineBasicBlock::iterator I = SaveBlock.begin();
129 MachineInstrSpan MIS(I, &SaveBlock);
130 bool Success = TFI->spillCalleeSavedRegisters(MBB&: SaveBlock, MI: I, CSI, TRI);
131 assert(Success && "spillCalleeSavedRegisters should always succeed");
132 (void)Success;
133
134 // TFI doesn't update Indexes and LIS, so we have to do it separately.
135 if (Indexes)
136 Indexes->repairIndexesInRange(MBB: &SaveBlock, Begin: SaveBlock.begin(), End: I);
137
138 if (LIS)
139 for (const CalleeSavedInfo &CS : CSI)
140 LIS->removeAllRegUnitsForPhysReg(Reg: CS.getReg());
141}
142
143/// Insert restore code for the callee-saved registers used in the function.
144static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
145 MutableArrayRef<CalleeSavedInfo> CSI,
146 SlotIndexes *Indexes, LiveIntervals *LIS) {
147 MachineFunction &MF = *RestoreBlock.getParent();
148 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
149 const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
150 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
151 // Restore all registers immediately before the return and any
152 // terminators that precede it.
153 MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator();
154 const MachineBasicBlock::iterator BeforeRestoresI =
155 I == RestoreBlock.begin() ? I : std::prev(x: I);
156
157 // FIXME: Just emit the readlane/writelane directly
158 if (!TFI->restoreCalleeSavedRegisters(MBB&: RestoreBlock, MI: I, CSI, TRI)) {
159 for (const CalleeSavedInfo &CI : reverse(C&: CSI)) {
160 // Insert in reverse order. loadRegFromStackSlot can insert
161 // multiple instructions.
162 TFI->restoreCalleeSavedRegister(MBB&: RestoreBlock, MI: I, CS: CI, TII: &TII, TRI);
163
164 if (Indexes) {
165 MachineInstr &Inst = *std::prev(x: I);
166 Indexes->insertMachineInstrInMaps(MI&: Inst);
167 }
168
169 if (LIS)
170 LIS->removeAllRegUnitsForPhysReg(Reg: CI.getReg());
171 }
172 } else {
173 // TFI doesn't update Indexes and LIS, so we have to do it separately.
174 if (Indexes)
175 Indexes->repairIndexesInRange(MBB: &RestoreBlock, Begin: BeforeRestoresI,
176 End: RestoreBlock.getFirstTerminator());
177
178 if (LIS)
179 for (const CalleeSavedInfo &CS : CSI)
180 LIS->removeAllRegUnitsForPhysReg(Reg: CS.getReg());
181 }
182}
183
184/// Compute the sets of entry and return blocks for saving and restoring
185/// callee-saved registers, and placing prolog and epilog code.
186void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) {
187 const MachineFrameInfo &MFI = MF.getFrameInfo();
188
189 // Even when we do not change any CSR, we still want to insert the
190 // prologue and epilogue of the function.
191 // So set the save points for those.
192
193 // Use the points found by shrink-wrapping, if any.
194 if (!MFI.getSavePoints().empty()) {
195 assert(MFI.getSavePoints().size() == 1 &&
196 "Multiple save points not yet supported!");
197 const auto &SavePoint = *MFI.getSavePoints().begin();
198 SaveBlocks.push_back(Elt: SavePoint.first);
199 assert(MFI.getRestorePoints().size() == 1 &&
200 "Multiple restore points not yet supported!");
201 const auto &RestorePoint = *MFI.getRestorePoints().begin();
202 MachineBasicBlock *RestoreBlock = RestorePoint.first;
203 // If RestoreBlock does not have any successor and is not a return block
204 // then the end point is unreachable and we do not need to insert any
205 // epilogue.
206 if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock())
207 RestoreBlocks.push_back(Elt: RestoreBlock);
208 return;
209 }
210
211 // Save refs to entry and return blocks.
212 SaveBlocks.push_back(Elt: &MF.front());
213 for (MachineBasicBlock &MBB : MF) {
214 if (MBB.isEHFuncletEntry())
215 SaveBlocks.push_back(Elt: &MBB);
216 if (MBB.isReturnBlock())
217 RestoreBlocks.push_back(Elt: &MBB);
218 }
219}
220
221// TODO: To support shrink wrapping, this would need to copy
222// PrologEpilogInserter's updateLiveness.
223static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) {
224 MachineBasicBlock &EntryBB = MF.front();
225
226 for (const CalleeSavedInfo &CSIReg : CSI)
227 EntryBB.addLiveIn(PhysReg: CSIReg.getReg());
228 EntryBB.sortUniqueLiveIns();
229}
230
231bool SILowerSGPRSpills::spillCalleeSavedRegs(
232 MachineFunction &MF, SmallVectorImpl<int> &CalleeSavedFIs) {
233 MachineRegisterInfo &MRI = MF.getRegInfo();
234 const Function &F = MF.getFunction();
235 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
236 const SIFrameLowering *TFI = ST.getFrameLowering();
237 MachineFrameInfo &MFI = MF.getFrameInfo();
238 RegScavenger *RS = nullptr;
239
240 // Determine which of the registers in the callee save list should be saved.
241 BitVector SavedRegs;
242 TFI->determineCalleeSavesSGPR(MF, SavedRegs, RS);
243
244 // Add the code to save and restore the callee saved registers.
245 if (!F.hasFnAttribute(Kind: Attribute::Naked)) {
246 // FIXME: This is a lie. The CalleeSavedInfo is incomplete, but this is
247 // necessary for verifier liveness checks.
248 MFI.setCalleeSavedInfoValid(true);
249
250 std::vector<CalleeSavedInfo> CSI;
251 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
252 MCRegister RetAddrReg = TRI->getReturnAddressReg(MF);
253 MCRegister RetAddrRegSub0 = TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub0);
254 MCRegister RetAddrRegSub1 = TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub1);
255 bool SpillRetAddrReg = false;
256
257 for (unsigned I = 0; CSRegs[I]; ++I) {
258 MCRegister Reg = CSRegs[I];
259
260 if (SavedRegs.test(Idx: Reg)) {
261 if (Reg == RetAddrRegSub0 || Reg == RetAddrRegSub1) {
262 SpillRetAddrReg = true;
263 continue;
264 }
265
266 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
267 int JunkFI = MFI.CreateStackObject(Size: TRI->getSpillSize(RC: *RC),
268 Alignment: TRI->getSpillAlign(RC: *RC), isSpillSlot: true,
269 Alloca: nullptr, ID: TRI->getSpillStackID(RC: *RC));
270
271 CSI.emplace_back(args&: Reg, args&: JunkFI);
272 CalleeSavedFIs.push_back(Elt: JunkFI);
273 }
274 }
275
276 // Return address uses a register pair. Add the super register to the
277 // CSI list so that it's easier to identify the entire spill and CFI
278 // can be emitted appropriately.
279 if (SpillRetAddrReg) {
280 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg: RetAddrReg);
281 int JunkFI =
282 MFI.CreateStackObject(Size: TRI->getSpillSize(RC: *RC), Alignment: TRI->getSpillAlign(RC: *RC),
283 isSpillSlot: true, Alloca: nullptr, ID: TRI->getSpillStackID(RC: *RC));
284 CSI.push_back(x: CalleeSavedInfo(RetAddrReg, JunkFI));
285 CalleeSavedFIs.push_back(Elt: JunkFI);
286 }
287
288 if (!CSI.empty()) {
289 for (MachineBasicBlock *SaveBlock : SaveBlocks)
290 insertCSRSaves(ST, SaveBlock&: *SaveBlock, CSI, Indexes, LIS);
291
292 // Add live ins to save blocks.
293 assert(SaveBlocks.size() == 1 && "shrink wrapping not fully implemented");
294 updateLiveness(MF, CSI);
295
296 for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
297 insertCSRRestores(RestoreBlock&: *RestoreBlock, CSI, Indexes, LIS);
298 return true;
299 }
300 }
301
302 return false;
303}
304
305MachineBasicBlock *SILowerSGPRSpills::getCycleDomBB(MachineCycle *C) {
306 // If the insertion point lands on a cycle entry, move it to a block that
307 // dominates all entries.
308 if (C->isReducible()) {
309 if (auto *IDom = MDT->getNode(BB: C->getHeader())->getIDom())
310 return IDom->getBlock();
311 llvm_unreachable("Expected cycle to have an IDom.");
312 return nullptr;
313 }
314
315 const SmallVectorImpl<MachineBasicBlock *> &Entries = C->getEntries();
316 assert(!Entries.empty() && "Expected cycle to have at least one entry.");
317 MachineBasicBlock *EntryBB = Entries[0];
318 for (unsigned I = 1; I < Entries.size(); ++I)
319 EntryBB = MDT->findNearestCommonDominator(A: EntryBB, B: Entries[I]);
320 return EntryBB;
321}
322
323void SILowerSGPRSpills::updateLaneVGPRDomInstr(
324 int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
325 DenseMap<Register, LaneVGPRInsertPt> &LaneVGPRDomInstr) {
326 // For the Def of a virtual LaneVGPR to dominate all its uses, we should
327 // insert an IMPLICIT_DEF before the dominating spill. Switching to a
328 // depth first order doesn't really help since the machine function can be in
329 // the unstructured control flow post-SSA. For each virtual register, hence
330 // finding the common dominator to get either the dominating spill or a block
331 // dominating all spills.
332 SIMachineFunctionInfo *FuncInfo =
333 MBB->getParent()->getInfo<SIMachineFunctionInfo>();
334 ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills =
335 FuncInfo->getSGPRSpillToVirtualVGPRLanes(FrameIndex: FI);
336 Register PrevLaneVGPR;
337 for (auto &Spill : VGPRSpills) {
338 if (PrevLaneVGPR == Spill.VGPR)
339 continue;
340
341 PrevLaneVGPR = Spill.VGPR;
342 auto I = LaneVGPRDomInstr.find(Val: Spill.VGPR);
343 if (Spill.Lane == 0 && I == LaneVGPRDomInstr.end()) {
344 LaneVGPRDomInstr[Spill.VGPR] = insertPt(MBB, It: InsertPt);
345 } else {
346 assert(I != LaneVGPRDomInstr.end());
347 LaneVGPRInsertPt Prev = I->second;
348 MachineBasicBlock *PrevInsertMBB = Prev.MBB;
349 MachineBasicBlock::iterator PrevInsertPt = Prev.It;
350 MachineBasicBlock *DomMBB = PrevInsertMBB;
351 if (DomMBB == MBB) {
352 // The insertion point earlier selected in a predecessor block whose
353 // spills are currently being lowered. The earlier InsertPt would be
354 // the one just before the block terminator and it should be changed
355 // if we insert any new spill in it.
356 if (PrevInsertPt == MBB->end() ||
357 MDT->dominates(A: &*InsertPt, B: &*PrevInsertPt))
358 I->second = insertPt(MBB, It: InsertPt);
359
360 continue;
361 }
362
363 // Find the common dominator block between PrevInsertPt and the
364 // current spill.
365 DomMBB = MDT->findNearestCommonDominator(A: DomMBB, B: MBB);
366
367 if (DomMBB == MBB)
368 I->second = insertPt(MBB, It: InsertPt);
369 else if (DomMBB != PrevInsertMBB)
370 I->second = insertPt(MBB: DomMBB, It: DomMBB->getFirstTerminator());
371 }
372 }
373}
374
375void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
376 BitVector &RegMask) {
377 // Determine an optimal number of VGPRs for WWM allocation. The complement
378 // list will be available for allocating other VGPR virtual registers.
379 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
380 MachineRegisterInfo &MRI = MF.getRegInfo();
381 BitVector ReservedRegs = TRI->getReservedRegs(MF);
382 BitVector NonWwmAllocMask(TRI->getNumRegs());
383 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
384
385 // FIXME: MaxNumVGPRsForWwmAllocation might need to be adjusted in the future
386 // to have a balanced allocation between WWM values and per-thread vector
387 // register operands.
388 unsigned NumRegs = MaxNumVGPRsForWwmAllocation;
389 NumRegs =
390 std::min(a: static_cast<unsigned>(MFI->getSGPRSpillVGPRs().size()), b: NumRegs);
391
392 auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(F: MF.getFunction());
393 // Try to use the highest available registers for now. Later after
394 // vgpr-regalloc, they can be shifted to the lowest range.
395 unsigned I = 0;
396 for (unsigned Reg = AMDGPU::VGPR0 + MaxNumVGPRs - 1;
397 (I < NumRegs) && (Reg >= AMDGPU::VGPR0); --Reg) {
398 if (!ReservedRegs.test(Idx: Reg) &&
399 !MRI.isPhysRegUsed(PhysReg: Reg, /*SkipRegMaskTest=*/true)) {
400 TRI->markSuperRegs(RegisterSet&: RegMask, Reg);
401 ++I;
402 }
403 }
404
405 if (I != NumRegs) {
406 // Reserve an arbitrary register and report the error.
407 TRI->markSuperRegs(RegisterSet&: RegMask, Reg: AMDGPU::VGPR0);
408 MF.getFunction().getContext().emitError(
409 ErrorStr: "cannot find enough VGPRs for wwm-regalloc");
410 }
411}
412
413bool SILowerSGPRSpillsLegacy::runOnMachineFunction(MachineFunction &MF) {
414 auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>();
415 LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
416 auto *SIWrapper = getAnalysisIfAvailable<SlotIndexesWrapperPass>();
417 SlotIndexes *Indexes = SIWrapper ? &SIWrapper->getSI() : nullptr;
418 MachineDominatorTree *MDT =
419 &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
420 MachineCycleInfo *MCI =
421 &getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
422 return SILowerSGPRSpills(LIS, Indexes, MDT, MCI).run(MF);
423}
424
425bool SILowerSGPRSpills::run(MachineFunction &MF) {
426 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
427 TII = ST.getInstrInfo();
428 TRI = &TII->getRegisterInfo();
429
430 assert(SaveBlocks.empty() && RestoreBlocks.empty());
431
432 // First, expose any CSR SGPR spills. This is mostly the same as what PEI
433 // does, but somewhat simpler.
434 calculateSaveRestoreBlocks(MF);
435 SmallVector<int> CalleeSavedFIs;
436 bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs);
437
438 MachineFrameInfo &MFI = MF.getFrameInfo();
439 MachineRegisterInfo &MRI = MF.getRegInfo();
440 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
441
442 if (!MFI.hasStackObjects() && !HasCSRs) {
443 SaveBlocks.clear();
444 RestoreBlocks.clear();
445 return false;
446 }
447
448 bool MadeChange = false;
449 bool SpilledToVirtVGPRLanes = false;
450
451 // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
452 // handled as SpilledToReg in regular PrologEpilogInserter.
453 const bool HasSGPRSpillToVGPR = TRI->spillSGPRToVGPR() &&
454 (HasCSRs || FuncInfo->hasSpilledSGPRs());
455 if (HasSGPRSpillToVGPR) {
456 // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
457 // are spilled to VGPRs, in which case we can eliminate the stack usage.
458 //
459 // This operates under the assumption that only other SGPR spills are users
460 // of the frame index.
461
462 // To track the spill frame indices handled in this pass.
463 BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
464
465 // To track the IMPLICIT_DEF insertion point for the lane vgprs.
466 DenseMap<Register, LaneVGPRInsertPt> LaneVGPRDomInstr;
467
468 for (MachineBasicBlock &MBB : MF) {
469 for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) {
470 if (!TII->isSGPRSpill(MI))
471 continue;
472
473 if (MI.getOperand(i: 0).isUndef()) {
474 if (Indexes)
475 Indexes->removeMachineInstrFromMaps(MI);
476 MI.eraseFromParent();
477 continue;
478 }
479
480 int FI = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::addr)->getIndex();
481 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
482
483 bool IsCalleeSaveSGPRSpill = llvm::is_contained(Range&: CalleeSavedFIs, Element: FI);
484 if (IsCalleeSaveSGPRSpill) {
485 // Spill callee-saved SGPRs into physical VGPR lanes.
486
487 // TODO: This is to ensure the CFIs are static for efficient frame
488 // unwinding in the debugger. Spilling them into virtual VGPR lanes
489 // involve regalloc to allocate the physical VGPRs and that might
490 // cause intermediate spill/split of such liveranges for successful
491 // allocation. This would result in broken CFI encoding unless the
492 // regalloc aware CFI generation to insert new CFIs along with the
493 // intermediate spills is implemented. There is no such support
494 // currently exist in the LLVM compiler.
495 if (FuncInfo->allocateSGPRSpillToVGPRLane(
496 MF, FI, /*SpillToPhysVGPRLane=*/true)) {
497 bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
498 MI, FI, RS: nullptr, Indexes, LIS, SpillToPhysVGPRLane: true);
499 if (!Spilled)
500 llvm_unreachable(
501 "failed to spill SGPR to physical VGPR lane when allocated");
502 }
503 } else {
504 MachineInstrSpan MIS(&MI, &MBB);
505 if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
506 bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
507 MI, FI, RS: nullptr, Indexes, LIS);
508 if (!Spilled)
509 llvm_unreachable(
510 "failed to spill SGPR to virtual VGPR lane when allocated");
511 SpillFIs.set(FI);
512 updateLaneVGPRDomInstr(FI, MBB: &MBB, InsertPt: MIS.begin(), LaneVGPRDomInstr);
513 SpilledToVirtVGPRLanes = true;
514 }
515 }
516 }
517 }
518
519 for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) {
520 LaneVGPRInsertPt IP = LaneVGPRDomInstr[Reg];
521 if (MachineCycle *C = MCI->getTopLevelParentCycle(Block: IP.MBB)) {
522 MachineBasicBlock *AdjMBB = getCycleDomBB(C);
523 IP = insertPt(MBB: AdjMBB, It: AdjMBB->getFirstTerminator());
524 }
525 // Insert the IMPLICIT_DEF at the identified points.
526 MachineBasicBlock &Block = *IP.MBB;
527 DebugLoc DL = Block.findDebugLoc(MBBI: IP.It);
528 auto MIB = BuildMI(BB&: Block, I: IP.It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Reg);
529
530 // Add WWM flag to the virtual register.
531 FuncInfo->setFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG);
532
533 // Set SGPR_SPILL asm printer flag
534 MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL);
535 if (LIS) {
536 LIS->InsertMachineInstrInMaps(MI&: *MIB);
537 LIS->createAndComputeVirtRegInterval(Reg);
538 }
539 }
540
541 // Determine the registers for WWM allocation and also compute the register
542 // mask for non-wwm VGPR allocation.
543 if (FuncInfo->getSGPRSpillVGPRs().size()) {
544 BitVector WwmRegMask(TRI->getNumRegs());
545
546 determineRegsForWWMAllocation(MF, RegMask&: WwmRegMask);
547
548 BitVector NonWwmRegMask(WwmRegMask);
549 NonWwmRegMask.flip().clearBitsNotInMask(Mask: TRI->getAllVGPRRegMask());
550
551 // The complement set will be the registers for non-wwm (per-thread) vgpr
552 // allocation.
553 FuncInfo->updateNonWWMRegMask(RegMask&: NonWwmRegMask);
554 }
555
556 for (MachineBasicBlock &MBB : MF)
557 clearDebugInfoForSpillFIs(MFI, MBB, SpillFIs);
558
559 // All those frame indices which are dead by now should be removed from the
560 // function frame. Otherwise, there is a side effect such as re-mapping of
561 // free frame index ids by the later pass(es) like "stack slot coloring"
562 // which in turn could mess-up with the book keeping of "frame index to VGPR
563 // lane".
564 FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false);
565
566 MadeChange = true;
567 }
568
569 if (SpilledToVirtVGPRLanes) {
570 const TargetRegisterClass *RC = TRI->getWaveMaskRegClass();
571 // Shift back the reserved SGPR for EXEC copy into the lowest range.
572 // This SGPR is reserved to handle the whole-wave spill/copy operations
573 // that might get inserted during vgpr regalloc.
574 Register UnusedLowSGPR = TRI->findUnusedRegister(MRI, RC, MF);
575 if (UnusedLowSGPR && TRI->getHWRegIndex(Reg: UnusedLowSGPR) <
576 TRI->getHWRegIndex(Reg: FuncInfo->getSGPRForEXECCopy()))
577 FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR);
578 } else {
579 // No SGPR spills to virtual VGPR lanes and hence there won't be any WWM
580 // spills/copies. Reset the SGPR reserved for EXEC copy.
581 FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister);
582 }
583
584 SaveBlocks.clear();
585 RestoreBlocks.clear();
586
587 return MadeChange;
588}
589
590PreservedAnalyses
591SILowerSGPRSpillsPass::run(MachineFunction &MF,
592 MachineFunctionAnalysisManager &MFAM) {
593 MFPropsModifier _(*this, MF);
594 auto *LIS = MFAM.getCachedResult<LiveIntervalsAnalysis>(IR&: MF);
595 auto *Indexes = MFAM.getCachedResult<SlotIndexesAnalysis>(IR&: MF);
596 MachineDominatorTree *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(IR&: MF);
597 MachineCycleInfo &MCI = MFAM.getResult<MachineCycleAnalysis>(IR&: MF);
598 SILowerSGPRSpills(LIS, Indexes, MDT, &MCI).run(MF);
599 return PreservedAnalyses::all();
600}
601