1//===----------------------- SIFrameLowering.cpp --------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8
9#include "SIFrameLowering.h"
10#include "AMDGPU.h"
11#include "AMDGPULaneMaskUtils.h"
12#include "GCNSubtarget.h"
13#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
14#include "SIMachineFunctionInfo.h"
15#include "SISpillUtils.h"
16#include "llvm/BinaryFormat/Dwarf.h"
17#include "llvm/CodeGen/LiveRegUnits.h"
18#include "llvm/CodeGen/MachineFrameInfo.h"
19#include "llvm/CodeGen/MachineModuleInfo.h"
20#include "llvm/CodeGen/RegisterScavenging.h"
21#include "llvm/Support/LEB128.h"
22#include "llvm/Target/TargetMachine.h"
23
24using namespace llvm;
25
26#define DEBUG_TYPE "frame-info"
27
28static cl::opt<bool> EnableSpillVGPRToAGPR(
29 "amdgpu-spill-vgpr-to-agpr",
30 cl::desc("Enable spilling VGPRs to AGPRs"),
31 cl::ReallyHidden,
32 cl::init(Val: true));
33
// Width of one SGPR in bits.
static constexpr unsigned SGPRBitSize = 32;
// Width of one SGPR in bytes; used as the DW_OP_deref_size operand when the
// stack pointer SGPR is read in a DWARF expression.
static constexpr unsigned SGPRByteSize = SGPRBitSize / 8;
// Width in bits of a single VGPR lane (per the name; not referenced in the
// portion of the file reviewed here).
static constexpr unsigned VGPRLaneBitSize = 32;
37
38// Find a register matching \p RC from \p LiveUnits which is unused and
39// available throughout the function. On failure, returns AMDGPU::NoRegister.
40// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
41// MCRegisters. This should reduce the number of iterations and avoid redundant
42// checking.
43static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
44 const LiveRegUnits &LiveUnits,
45 const TargetRegisterClass &RC) {
46 for (MCRegister Reg : RC) {
47 if (!MRI.isPhysRegUsed(PhysReg: Reg) && LiveUnits.available(Reg) &&
48 !MRI.isReserved(PhysReg: Reg))
49 return Reg;
50 }
51 return MCRegister();
52}
53
54static void encodeDwarfRegisterLocation(int DwarfReg, raw_ostream &OS) {
55 assert(DwarfReg >= 0);
56 if (DwarfReg < 32) {
57 OS << uint8_t(dwarf::DW_OP_reg0 + DwarfReg);
58 } else {
59 OS << uint8_t(dwarf::DW_OP_regx);
60 encodeULEB128(Value: DwarfReg, OS);
61 }
62}
63
64static MCCFIInstruction
65createScaledCFAInPrivateWave(const GCNSubtarget &ST,
66 MCRegister DwarfStackPtrReg) {
67 assert(ST.enableFlatScratch());
68
69 // When flat scratch is enabled, the stack pointer is an address in the
70 // private_lane DWARF address space (i.e. swizzled), but in order to
71 // accurately and efficiently describe things like masked spills of vector
72 // registers we want to define the CFA to be an address in the private_wave
73 // DWARF address space (i.e. unswizzled). To achieve this we scale the stack
74 // pointer by the wavefront size, implemented as (SP << wave_size_log2).
75 const unsigned WavefrontSizeLog2 = ST.getWavefrontSizeLog2();
76 assert(WavefrontSizeLog2 < 32);
77
78 SmallString<20> Block;
79 raw_svector_ostream OSBlock(Block);
80 encodeDwarfRegisterLocation(DwarfReg: DwarfStackPtrReg, OS&: OSBlock);
81 OSBlock << uint8_t(dwarf::DW_OP_deref_size) << uint8_t(SGPRByteSize)
82 << uint8_t(dwarf::DW_OP_lit0 + WavefrontSizeLog2)
83 << uint8_t(dwarf::DW_OP_shl)
84 << uint8_t(dwarf::DW_OP_lit0 +
85 dwarf::DW_ASPACE_LLVM_AMDGPU_private_wave)
86 << uint8_t(dwarf::DW_OP_LLVM_user)
87 << uint8_t(dwarf::DW_OP_LLVM_form_aspace_address);
88
89 SmallString<20> CFIInst;
90 raw_svector_ostream OSCFIInst(CFIInst);
91 OSCFIInst << uint8_t(dwarf::DW_CFA_def_cfa_expression);
92 encodeULEB128(Value: Block.size(), OS&: OSCFIInst);
93 OSCFIInst << Block;
94
95 return MCCFIInstruction::createEscape(L: nullptr, Vals: OSCFIInst.str());
96}
97
98void SIFrameLowering::emitDefCFA(MachineBasicBlock &MBB,
99 MachineBasicBlock::iterator MBBI,
100 DebugLoc const &DL, MCRegister StackPtrReg,
101 bool AspaceAlreadyDefined,
102 MachineInstr::MIFlag Flags) const {
103 MachineFunction &MF = *MBB.getParent();
104 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
105 const SIRegisterInfo *TRI = ST.getRegisterInfo();
106
107 MCRegister DwarfStackPtrReg = TRI->getDwarfRegNum(Reg: StackPtrReg, isEH: false);
108 MCCFIInstruction CFIInst =
109 ST.enableFlatScratch()
110 ? createScaledCFAInPrivateWave(ST, DwarfStackPtrReg)
111 : (AspaceAlreadyDefined
112 ? MCCFIInstruction::createLLVMDefAspaceCfa(
113 L: nullptr, Register: DwarfStackPtrReg, Offset: 0,
114 AddressSpace: dwarf::DW_ASPACE_LLVM_AMDGPU_private_wave, Loc: SMLoc())
115 : MCCFIInstruction::createDefCfaRegister(L: nullptr,
116 Register: DwarfStackPtrReg));
117 buildCFI(MBB, MBBI, DL, CFIInst, flag: Flags);
118}
119
120// Find a scratch register that we can use in the prologue. We avoid using
121// callee-save registers since they may appear to be free when this is called
122// from canUseAsPrologue (during shrink wrapping), but then no longer be free
123// when this is called from emitPrologue.
124static MCRegister findScratchNonCalleeSaveRegister(
125 MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
126 const TargetRegisterClass &RC, bool Unused = false) {
127 // Mark callee saved registers as used so we will not choose them.
128 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
129 for (unsigned i = 0; CSRegs[i]; ++i)
130 LiveUnits.addReg(Reg: CSRegs[i]);
131
132 // We are looking for a register that can be used throughout the entire
133 // function, so any use is unacceptable.
134 if (Unused)
135 return findUnusedRegister(MRI, LiveUnits, RC);
136
137 for (MCRegister Reg : RC) {
138 if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg))
139 return Reg;
140 }
141
142 return MCRegister();
143}
144
145/// Query target location for spilling SGPRs
146/// \p IncludeScratchCopy : Also look for free scratch SGPRs
147static void getVGPRSpillLaneOrTempRegister(
148 MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
149 const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
150 bool IncludeScratchCopy = true) {
151 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
152 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
153
154 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
155 const SIRegisterInfo *TRI = ST.getRegisterInfo();
156 unsigned Size = TRI->getSpillSize(RC);
157 Align Alignment = TRI->getSpillAlign(RC);
158
159 // We need to save and restore the given SGPR.
160
161 Register ScratchSGPR;
162 // 1: Try to save the given register into an unused scratch SGPR. The
163 // LiveUnits should have all the callee saved registers marked as used. For
164 // certain cases we skip copy to scratch SGPR.
165 if (IncludeScratchCopy)
166 ScratchSGPR = findUnusedRegister(MRI&: MF.getRegInfo(), LiveUnits, RC);
167
168 if (!ScratchSGPR) {
169 int FI = FrameInfo.CreateStackObject(Size, Alignment, isSpillSlot: true, Alloca: nullptr,
170 ID: TargetStackID::SGPRSpill);
171
172 if (TRI->spillSGPRToVGPR() &&
173 MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
174 /*IsPrologEpilog=*/true)) {
175 // 2: There's no free lane to spill, and no free register to save the
176 // SGPR, so we're forced to take another VGPR to use for the spill.
177 MFI->addToPrologEpilogSGPRSpills(
178 Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo(
179 SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
180
181 LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
182 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
183 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
184 << '\n';);
185 } else {
186 // Remove dead <FI> index
187 MF.getFrameInfo().RemoveStackObject(ObjectIdx: FI);
188 // 3: If all else fails, spill the register to memory.
189 FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
190 MFI->addToPrologEpilogSGPRSpills(
191 Reg: SGPR,
192 SI: PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
193 LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
194 << printReg(SGPR, TRI) << '\n');
195 }
196 } else {
197 MFI->addToPrologEpilogSGPRSpills(
198 Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo(
199 SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
200 LiveUnits.addReg(Reg: ScratchSGPR);
201 LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
202 << printReg(ScratchSGPR, TRI) << '\n');
203 }
204}
205
206// We need to specially emit stack operations here because a different frame
207// register is used than in the rest of the function, as getFrameRegister would
208// use.
209static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
210 const SIMachineFunctionInfo &FuncInfo,
211 LiveRegUnits &LiveUnits, MachineFunction &MF,
212 MachineBasicBlock &MBB,
213 MachineBasicBlock::iterator I, const DebugLoc &DL,
214 Register SpillReg, int FI, Register FrameReg,
215 int64_t DwordOff = 0) {
216 unsigned Opc = ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
217 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
218
219 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
220 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
221 MachineMemOperand *MMO = MF.getMachineMemOperand(
222 PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FI),
223 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI));
224 LiveUnits.addReg(Reg: SpillReg);
225 bool IsKill = !MBB.isLiveIn(Reg: SpillReg);
226 TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: IsKill, ScratchOffsetReg: FrameReg,
227 InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits);
228 if (IsKill)
229 LiveUnits.removeReg(Reg: SpillReg);
230}
231
232static void buildEpilogRestore(const GCNSubtarget &ST,
233 const SIRegisterInfo &TRI,
234 const SIMachineFunctionInfo &FuncInfo,
235 LiveRegUnits &LiveUnits, MachineFunction &MF,
236 MachineBasicBlock &MBB,
237 MachineBasicBlock::iterator I,
238 const DebugLoc &DL, Register SpillReg, int FI,
239 Register FrameReg, int64_t DwordOff = 0) {
240 unsigned Opc = ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
241 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
242
243 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
244 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
245 MachineMemOperand *MMO = MF.getMachineMemOperand(
246 PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FI),
247 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI));
248 TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: false, ScratchOffsetReg: FrameReg,
249 InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits);
250}
251
252static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
253 const DebugLoc &DL, const SIInstrInfo *TII,
254 Register TargetReg) {
255 MachineFunction *MF = MBB.getParent();
256 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
257 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
258 const MCInstrDesc &SMovB32 = TII->get(Opcode: AMDGPU::S_MOV_B32);
259 Register TargetLo = TRI->getSubReg(Reg: TargetReg, Idx: AMDGPU::sub0);
260 Register TargetHi = TRI->getSubReg(Reg: TargetReg, Idx: AMDGPU::sub1);
261
262 if (MFI->getGITPtrHigh() != 0xffffffff) {
263 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetHi)
264 .addImm(Val: MFI->getGITPtrHigh())
265 .addReg(RegNo: TargetReg, Flags: RegState::ImplicitDefine);
266 } else {
267 const MCInstrDesc &GetPC64 = TII->get(Opcode: AMDGPU::S_GETPC_B64_pseudo);
268 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GetPC64, DestReg: TargetReg);
269 }
270 Register GitPtrLo = MFI->getGITPtrLoReg(MF: *MF);
271 MF->getRegInfo().addLiveIn(Reg: GitPtrLo);
272 MBB.addLiveIn(PhysReg: GitPtrLo);
273 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetLo)
274 .addReg(RegNo: GitPtrLo);
275}
276
277static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
278 const SIMachineFunctionInfo *FuncInfo,
279 MachineFunction &MF, MachineBasicBlock &MBB,
280 MachineBasicBlock::iterator MBBI, bool IsProlog) {
281 if (LiveUnits.empty()) {
282 LiveUnits.init(TRI);
283 if (IsProlog) {
284 LiveUnits.addLiveIns(MBB);
285 } else {
286 // In epilog.
287 LiveUnits.addLiveOuts(MBB);
288 LiveUnits.stepBackward(MI: *MBBI);
289 }
290 }
291}
292
293namespace llvm {
294
295// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
296// BP, etc. These spills are delayed until the current function's frame is
297// finalized. For a given register, the builder uses the
298// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
299class PrologEpilogSGPRSpillBuilder {
300 MachineBasicBlock::iterator MI;
301 MachineBasicBlock &MBB;
302 MachineFunction &MF;
303 const GCNSubtarget &ST;
304 MachineFrameInfo &MFI;
305 SIMachineFunctionInfo *FuncInfo;
306 const SIInstrInfo *TII;
307 const SIRegisterInfo &TRI;
308 const MCRegisterInfo *MCRI;
309 const SIFrameLowering *TFI;
310 Register SuperReg;
311 const PrologEpilogSGPRSaveRestoreInfo SI;
312 LiveRegUnits &LiveUnits;
313 const DebugLoc &DL;
314 Register FrameReg;
315 ArrayRef<int16_t> SplitParts;
316 unsigned NumSubRegs;
317 unsigned EltSize = 4;
318 bool IsFramePtrPrologSpill;
319 bool NeedsFrameMoves;
320
321 static bool isExec(Register Reg) {
322 return Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::EXEC;
323 }
324
325 /// If this builder requires SuperReg-based CFI, which is emitted after all
326 /// SubRegs are actually spilled, return the Register which should be used
327 /// as input to getDwarfRegNum. Otherwise, CFI should be generated per-SubReg.
328 ///
329 /// Note: Most spills handled by this builder generate CFI after each
330 /// SubReg spill, as each SubReg maps directly to a CFI register via
331 /// getDwarfRegNum(SubReg, false). All other cases currently currently
332 /// correspond to the SuperReg directly.
333 MCRegister getCFISuperReg() const {
334 if (IsFramePtrPrologSpill)
335 return FuncInfo->getFrameOffsetReg();
336 // FIXME: CFI for EXEC needs a fix by accurately computing the spill
337 // offset for both the low and high components.
338 if (isExec(Reg: SuperReg))
339 return AMDGPU::EXEC;
340 return {};
341 }
342
343 void saveToMemory(const int FI) const {
344 MachineRegisterInfo &MRI = MF.getRegInfo();
345 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
346 assert(!MFI.isDeadObjectIndex(FI));
347
348 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ true);
349
350 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
351 MRI, LiveUnits, RC: AMDGPU::VGPR_32RegClass);
352 if (!TmpVGPR)
353 report_fatal_error(reason: "failed to find free scratch register");
354
355 auto BuildCFI = [&](Register Reg) {
356 TFI->buildCFI(MBB, MBBI: MI, DL,
357 CFIInst: MCCFIInstruction::createOffset(
358 L: nullptr, Register: MCRI->getDwarfRegNum(Reg, isEH: false),
359 Offset: MFI.getObjectOffset(ObjectIdx: FI) * ST.getWavefrontSize()));
360 };
361 MCRegister CFISuperReg = getCFISuperReg();
362 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
363 Register SubReg = NumSubRegs == 1
364 ? SuperReg
365 : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));
366 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: TmpVGPR)
367 .addReg(RegNo: SubReg);
368
369 buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL, SpillReg: TmpVGPR,
370 FI, FrameReg, DwordOff);
371 if (NeedsFrameMoves && !CFISuperReg)
372 BuildCFI(SubReg);
373 DwordOff += 4;
374 }
375 if (NeedsFrameMoves && CFISuperReg)
376 BuildCFI(CFISuperReg);
377 }
378
379 void saveToVGPRLane(const int FI) const {
380 assert(!MFI.isDeadObjectIndex(FI));
381
382 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
383 ArrayRef<SIRegisterInfo::SpilledReg> Spill =
384 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI);
385 assert(Spill.size() == NumSubRegs);
386
387 MCRegister CFISuperReg = getCFISuperReg();
388 for (unsigned I = 0; I < NumSubRegs; ++I) {
389 Register SubReg = NumSubRegs == 1
390 ? SuperReg
391 : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));
392 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_SPILL_S32_TO_VGPR),
393 DestReg: Spill[I].VGPR)
394 .addReg(RegNo: SubReg)
395 .addImm(Val: Spill[I].Lane)
396 .addReg(RegNo: Spill[I].VGPR, Flags: RegState::Undef);
397 if (NeedsFrameMoves && !CFISuperReg)
398 TFI->buildCFIForSGPRToVGPRSpill(MBB, MBBI: MI, DL, SGPR: SubReg, VGPR: Spill[I].VGPR,
399 Lane: Spill[I].Lane);
400 }
401 if (NeedsFrameMoves && CFISuperReg)
402 TFI->buildCFIForSGPRToVGPRSpill(MBB, MBBI: MI, DL, SGPR: CFISuperReg, VGPRSpills: Spill);
403 }
404
405 void copyToScratchSGPR(Register DstReg) const {
406 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: DstReg)
407 .addReg(RegNo: SuperReg)
408 .setMIFlag(MachineInstr::FrameSetup);
409 if (NeedsFrameMoves) {
410 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg: DstReg);
411 ArrayRef<int16_t> DstSplitParts = TRI.getRegSplitParts(RC, EltSize);
412 assert(NumSubRegs == (DstSplitParts.empty() ? 1 : DstSplitParts.size()));
413 MCRegister CFISuperReg = getCFISuperReg();
414 if (NumSubRegs == 1) {
415 TFI->buildCFI(
416 MBB, MBBI: MI, DL,
417 CFIInst: MCCFIInstruction::createRegister(
418 L: nullptr,
419 Register1: MCRI->getDwarfRegNum(
420 Reg: CFISuperReg ? CFISuperReg : SuperReg.asMCReg(), isEH: false),
421 Register2: MCRI->getDwarfRegNum(Reg: DstReg, isEH: false)));
422 } else if (isExec(Reg: CFISuperReg)) {
423 assert(NumSubRegs == 2 && "EXEC larger than 64-bit");
424 TFI->buildCFIForRegToSGPRPairSpill(MBB, MBBI: MI, DL, Reg: CFISuperReg, SGPRPair: DstReg);
425 } else {
426 for (unsigned I = 0; I < NumSubRegs; ++I) {
427 MCRegister SrcSubReg = TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]);
428 MCRegister DstSubReg = TRI.getSubReg(Reg: DstReg, Idx: DstSplitParts[I]);
429 TFI->buildCFI(MBB, MBBI: MI, DL,
430 CFIInst: MCCFIInstruction::createRegister(
431 L: nullptr, Register1: MCRI->getDwarfRegNum(Reg: SrcSubReg, isEH: false),
432 Register2: MCRI->getDwarfRegNum(Reg: DstSubReg, isEH: false)));
433 }
434 }
435 }
436 }
437
438 void restoreFromMemory(const int FI) {
439 MachineRegisterInfo &MRI = MF.getRegInfo();
440 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
441
442 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ false);
443 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
444 MRI, LiveUnits, RC: AMDGPU::VGPR_32RegClass);
445 if (!TmpVGPR)
446 report_fatal_error(reason: "failed to find free scratch register");
447
448 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
449 MCRegister SubReg = NumSubRegs == 1
450 ? SuperReg.asMCReg()
451 : TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]);
452
453 buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL,
454 SpillReg: TmpVGPR, FI, FrameReg, DwordOff);
455 assert(SubReg.isPhysical());
456
457 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: SubReg)
458 .addReg(RegNo: TmpVGPR, Flags: RegState::Kill);
459 DwordOff += 4;
460 }
461 }
462
463 void restoreFromVGPRLane(const int FI) {
464 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
465 ArrayRef<SIRegisterInfo::SpilledReg> Spill =
466 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI);
467 assert(Spill.size() == NumSubRegs);
468
469 for (unsigned I = 0; I < NumSubRegs; ++I) {
470 MCRegister SubReg = NumSubRegs == 1
471 ? SuperReg.asMCReg()
472 : TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]);
473 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_RESTORE_S32_FROM_VGPR), DestReg: SubReg)
474 .addReg(RegNo: Spill[I].VGPR)
475 .addImm(Val: Spill[I].Lane);
476 }
477 }
478
479 void copyFromScratchSGPR(Register SrcReg) const {
480 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SuperReg)
481 .addReg(RegNo: SrcReg)
482 .setMIFlag(MachineInstr::FrameDestroy);
483 }
484
485public:
486 PrologEpilogSGPRSpillBuilder(Register Reg,
487 const PrologEpilogSGPRSaveRestoreInfo SI,
488 MachineBasicBlock &MBB,
489 MachineBasicBlock::iterator MI,
490 const DebugLoc &DL, const SIInstrInfo *TII,
491 const SIRegisterInfo &TRI,
492 LiveRegUnits &LiveUnits, Register FrameReg,
493 bool IsFramePtrPrologSpill = false)
494 : MI(MI), MBB(MBB), MF(*MBB.getParent()),
495 ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
496 FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
497 MCRI(MF.getContext().getRegisterInfo()), TFI(ST.getFrameLowering()),
498 SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL), FrameReg(FrameReg),
499 IsFramePtrPrologSpill(IsFramePtrPrologSpill),
500 NeedsFrameMoves(MF.needsFrameMoves()) {
501 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg: SuperReg);
502 SplitParts = TRI.getRegSplitParts(RC, EltSize);
503 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
504
505 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
506 }
507
508 void save() {
509 switch (SI.getKind()) {
510 case SGPRSaveKind::SPILL_TO_MEM:
511 return saveToMemory(FI: SI.getIndex());
512 case SGPRSaveKind::SPILL_TO_VGPR_LANE:
513 return saveToVGPRLane(FI: SI.getIndex());
514 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
515 return copyToScratchSGPR(DstReg: SI.getReg());
516 }
517 }
518
519 void restore() {
520 switch (SI.getKind()) {
521 case SGPRSaveKind::SPILL_TO_MEM:
522 return restoreFromMemory(FI: SI.getIndex());
523 case SGPRSaveKind::SPILL_TO_VGPR_LANE:
524 return restoreFromVGPRLane(FI: SI.getIndex());
525 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
526 return copyFromScratchSGPR(SrcReg: SI.getReg());
527 }
528 }
529};
530
531} // namespace llvm
532
533// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
534void SIFrameLowering::emitEntryFunctionFlatScratchInit(
535 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
536 const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
537 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
538 const SIInstrInfo *TII = ST.getInstrInfo();
539 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
540 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
541
542 // We don't need this if we only have spills since there is no user facing
543 // scratch.
544
545 // TODO: If we know we don't have flat instructions earlier, we can omit
546 // this from the input registers.
547 //
548 // TODO: We only need to know if we access scratch space through a flat
549 // pointer. Because we only detect if flat instructions are used at all,
550 // this will be used more often than necessary on VI.
551
552 Register FlatScrInitLo;
553 Register FlatScrInitHi;
554
555 if (ST.isAmdPalOS()) {
556 // Extract the scratch offset from the descriptor in the GIT
557 LiveRegUnits LiveUnits;
558 LiveUnits.init(TRI: *TRI);
559 LiveUnits.addLiveIns(MBB);
560
561 // Find unused reg to load flat scratch init into
562 MachineRegisterInfo &MRI = MF.getRegInfo();
563 Register FlatScrInit = AMDGPU::NoRegister;
564 ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
565 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
566 AllSGPR64s = AllSGPR64s.slice(
567 N: std::min(a: static_cast<unsigned>(AllSGPR64s.size()), b: NumPreloaded));
568 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
569 for (MCPhysReg Reg : AllSGPR64s) {
570 if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg) &&
571 MRI.isAllocatable(PhysReg: Reg) && !TRI->isSubRegisterEq(RegA: Reg, RegB: GITPtrLoReg)) {
572 FlatScrInit = Reg;
573 break;
574 }
575 }
576 assert(FlatScrInit && "Failed to find free register for scratch init");
577
578 FlatScrInitLo = TRI->getSubReg(Reg: FlatScrInit, Idx: AMDGPU::sub0);
579 FlatScrInitHi = TRI->getSubReg(Reg: FlatScrInit, Idx: AMDGPU::sub1);
580
581 buildGitPtr(MBB, I, DL, TII, TargetReg: FlatScrInit);
582
583 // We now have the GIT ptr - now get the scratch descriptor from the entry
584 // at offset 0 (or offset 16 for a compute shader).
585 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
586 const MCInstrDesc &LoadDwordX2 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX2_IMM);
587 auto *MMO = MF.getMachineMemOperand(
588 PtrInfo,
589 F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
590 MachineMemOperand::MODereferenceable,
591 Size: 8, BaseAlignment: Align(4));
592 unsigned Offset =
593 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
594 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
595 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST: Subtarget, ByteOffset: Offset);
596 BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: FlatScrInit)
597 .addReg(RegNo: FlatScrInit)
598 .addImm(Val: EncodedOffset) // offset
599 .addImm(Val: 0) // cpol
600 .addMemOperand(MMO);
601
602 // Mask the offset in [47:0] of the descriptor
603 const MCInstrDesc &SAndB32 = TII->get(Opcode: AMDGPU::S_AND_B32);
604 auto And = BuildMI(BB&: MBB, I, MIMD: DL, MCID: SAndB32, DestReg: FlatScrInitHi)
605 .addReg(RegNo: FlatScrInitHi)
606 .addImm(Val: 0xffff);
607 And->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
608 } else {
609 Register FlatScratchInitReg =
610 MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
611 assert(FlatScratchInitReg);
612
613 MachineRegisterInfo &MRI = MF.getRegInfo();
614 MRI.addLiveIn(Reg: FlatScratchInitReg);
615 MBB.addLiveIn(PhysReg: FlatScratchInitReg);
616
617 FlatScrInitLo = TRI->getSubReg(Reg: FlatScratchInitReg, Idx: AMDGPU::sub0);
618 FlatScrInitHi = TRI->getSubReg(Reg: FlatScratchInitReg, Idx: AMDGPU::sub1);
619 }
620
621 // Do a 64-bit pointer add.
622 if (ST.flatScratchIsPointer()) {
623 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
624 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: FlatScrInitLo)
625 .addReg(RegNo: FlatScrInitLo)
626 .addReg(RegNo: ScratchWaveOffsetReg);
627 auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32),
628 DestReg: FlatScrInitHi)
629 .addReg(RegNo: FlatScrInitHi)
630 .addImm(Val: 0);
631 Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
632
633 using namespace AMDGPU::Hwreg;
634 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_B32))
635 .addReg(RegNo: FlatScrInitLo)
636 .addImm(Val: int16_t(HwregEncoding::encode(Values: ID_FLAT_SCR_LO, Values: 0, Values: 32)));
637 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_B32))
638 .addReg(RegNo: FlatScrInitHi)
639 .addImm(Val: int16_t(HwregEncoding::encode(Values: ID_FLAT_SCR_HI, Values: 0, Values: 32)));
640 return;
641 }
642
643 // For GFX9.
644 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: AMDGPU::FLAT_SCR_LO)
645 .addReg(RegNo: FlatScrInitLo)
646 .addReg(RegNo: ScratchWaveOffsetReg);
647 auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32),
648 DestReg: AMDGPU::FLAT_SCR_HI)
649 .addReg(RegNo: FlatScrInitHi)
650 .addImm(Val: 0);
651 Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
652
653 return;
654 }
655
656 assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
657
658 // Copy the size in bytes.
659 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::FLAT_SCR_LO)
660 .addReg(RegNo: FlatScrInitHi, Flags: RegState::Kill);
661
662 // Add wave offset in bytes to private base offset.
663 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
664 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: FlatScrInitLo)
665 .addReg(RegNo: FlatScrInitLo)
666 .addReg(RegNo: ScratchWaveOffsetReg);
667
668 // Convert offset to 256-byte units.
669 auto LShr = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_LSHR_B32),
670 DestReg: AMDGPU::FLAT_SCR_HI)
671 .addReg(RegNo: FlatScrInitLo, Flags: RegState::Kill)
672 .addImm(Val: 8);
673 LShr->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
674}
675
676// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
677// memory. They should have been removed by now.
678static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
679 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
680 I != E; ++I) {
681 if (!MFI.isDeadObjectIndex(ObjectIdx: I))
682 return false;
683 }
684
685 return true;
686}
687
688// Shift down registers reserved for the scratch RSRC.
689Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
690 MachineFunction &MF) const {
691
692 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
693 const SIInstrInfo *TII = ST.getInstrInfo();
694 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
695 MachineRegisterInfo &MRI = MF.getRegInfo();
696 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
697
698 assert(MFI->isEntryFunction());
699
700 Register ScratchRsrcReg = MFI->getScratchRSrcReg();
701
702 if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(PhysReg: ScratchRsrcReg) &&
703 allStackObjectsAreDead(MFI: MF.getFrameInfo())))
704 return Register();
705
706 if (ST.hasSGPRInitBug() ||
707 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
708 return ScratchRsrcReg;
709
710 // We reserved the last registers for this. Shift it down to the end of those
711 // which were actually used.
712 //
713 // FIXME: It might be safer to use a pseudoregister before replacement.
714
715 // FIXME: We should be able to eliminate unused input registers. We only
716 // cannot do this for the resources required for scratch access. For now we
717 // skip over user SGPRs and may leave unused holes.
718
719 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
720 ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
721 AllSGPR128s = AllSGPR128s.slice(N: std::min(a: static_cast<unsigned>(AllSGPR128s.size()), b: NumPreloaded));
722
723 // Skip the last N reserved elements because they should have already been
724 // reserved for VCC etc.
725 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
726 for (MCPhysReg Reg : AllSGPR128s) {
727 // Pick the first unallocated one. Make sure we don't clobber the other
728 // reserved input we needed. Also for PAL, make sure we don't clobber
729 // the GIT pointer passed in SGPR0 or SGPR8.
730 if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) &&
731 (!GITPtrLoReg || !TRI->isSubRegisterEq(RegA: Reg, RegB: GITPtrLoReg))) {
732 MRI.replaceRegWith(FromReg: ScratchRsrcReg, ToReg: Reg);
733 MFI->setScratchRSrcReg(Reg);
734 MRI.reserveReg(PhysReg: Reg, TRI);
735 return Reg;
736 }
737 }
738
739 return ScratchRsrcReg;
740}
741
742static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
743 return ST.hasFlatScratchEnabled() ? 1 : ST.getWavefrontSize();
744}
745
/// Emit the prologue of an entry function into \p MBB.
///
/// \p MBB must be the first block of \p MF, and the function must be an entry
/// function according to SIMachineFunctionInfo (both are asserted below).
/// The steps, in emission order, are:
///  - when frame moves are needed, CFI that defines the CFA with a literal
///    expression and marks the PC as undefined;
///  - selecting the reserved scratch resource register (only when flat
///    scratch is not enabled) and making it live-in to every other block;
///  - copying the preloaded scratch wave offset to another SGPR when it
///    overlaps the selected scratch resource register;
///  - initializing FP/SP, with a separate path when scratch may be reserved
///    for CWSR;
///  - flat scratch init, scratch RSRC setup and, on subtargets reporting
///    hasWaitXcnt(), a write to the MODE register.
746void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
747 MachineBasicBlock &MBB) const {
748 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
749
750 // FIXME: If we only have SGPR spills, we won't actually be using scratch
751 // memory since these spill to VGPRs. We should be cleaning up these unused
752 // SGPR spill frame indices somewhere.
753
754 // FIXME: We still have implicit uses on SGPR spill instructions in case they
755 // need to spill to vector memory. It's likely that will not happen, but at
756 // this point it appears we need the setup. This part of the prolog should be
757 // emitted after frame indices are eliminated.
758
759 // FIXME: Remove all of the isPhysRegUsed checks
760
761 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
762 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
763 const SIInstrInfo *TII = ST.getInstrInfo();
764 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
765 MachineRegisterInfo &MRI = MF.getRegInfo();
766 const Function &F = MF.getFunction();
767 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
768
769 assert(MFI->isEntryFunction());
770
771 // Debug location must be unknown since the first debug location is used to
772 // determine the end of the prologue.
773 DebugLoc DL;
  // Everything below is inserted before I, i.e. at the top of the entry block.
774 MachineBasicBlock::iterator I = MBB.begin();
775
776 if (MF.needsFrameMoves()) {
777 // On entry the SP/FP are not set up, so we need to define the CFA in terms
778 // of a literal location expression.
779 static const char CFAEncodedInstUserOpsArr[] = {
780 dwarf::DW_CFA_def_cfa_expression,
781 4, // length
782 static_cast<char>(dwarf::DW_OP_lit0),
783 static_cast<char>(dwarf::DW_OP_lit0 +
784 dwarf::DW_ASPACE_LLVM_AMDGPU_private_wave),
785 static_cast<char>(dwarf::DW_OP_LLVM_user),
786 static_cast<char>(dwarf::DW_OP_LLVM_form_aspace_address)};
  // The StringRef is built with an explicit length, so the whole array is
  // covered rather than stopping at an embedded zero byte.
787 static StringRef CFAEncodedInstUserOps =
788 StringRef(CFAEncodedInstUserOpsArr, sizeof(CFAEncodedInstUserOpsArr));
789 buildCFI(MBB, MBBI: I, DL,
790 CFIInst: MCCFIInstruction::createEscape(L: nullptr, Vals: CFAEncodedInstUserOps,
791 Loc: SMLoc(),
792 Comment: "CFA is 0 in private_wave aspace"));
793 // Unwinding halts when the return address (PC) is undefined.
794 buildCFI(MBB, MBBI: I, DL,
795 CFIInst: MCCFIInstruction::createUndefined(
796 L: nullptr, Register: TRI->getDwarfRegNum(Reg: AMDGPU::PC_REG, isEH: false)));
797 }
798
799 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
800 Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
801
802 // We need to do the replacement of the private segment buffer register even
803 // if there are no stack objects. There could be stores to undef or a
804 // constant without an associated object.
805 //
806 // This will return `Register()` in cases where there are no actual
807 // uses of the SRSRC.
808 Register ScratchRsrcReg;
809 if (!ST.hasFlatScratchEnabled())
810 ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
811
812 // Make the selected register live throughout the function.
  // The entry block itself is skipped; only the remaining blocks get the
  // live-in.
813 if (ScratchRsrcReg) {
814 for (MachineBasicBlock &OtherBB : MF) {
815 if (&OtherBB != &MBB) {
816 OtherBB.addLiveIn(PhysReg: ScratchRsrcReg);
817 }
818 }
819 }
820
821 // Now that we have fixed the reserved SRSRC we need to locate the
822 // (potentially) preloaded SRSRC.
823 Register PreloadedScratchRsrcReg;
824 if (ST.isAmdHsaOrMesa(F)) {
825 PreloadedScratchRsrcReg =
826 MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
827 if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
828 // We added live-ins during argument lowering, but since they were not
829 // used they were deleted. We're adding the uses now, so add them back.
830 MRI.addLiveIn(Reg: PreloadedScratchRsrcReg);
831 MBB.addLiveIn(PhysReg: PreloadedScratchRsrcReg);
832 }
833 }
834
835 // We found the SRSRC first because it needs four registers and has an
836 // alignment requirement. If the SRSRC that we found is clobbering with
837 // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
838 // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
839 // wave offset to a free SGPR.
840 Register ScratchWaveOffsetReg;
841 if (PreloadedScratchWaveOffsetReg &&
842 TRI->isSubRegisterEq(RegA: ScratchRsrcReg, RegB: PreloadedScratchWaveOffsetReg)) {
843 ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
844 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
  // Drop the leading NumPreloaded entries (clamped to the list length) so
  // only the SGPRs after them are considered as candidates.
845 AllSGPRs = AllSGPRs.slice(
846 N: std::min(a: static_cast<unsigned>(AllSGPRs.size()), b: NumPreloaded));
847 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  // Take the first candidate that is unused, allocatable, does not overlap
  // the scratch resource register and is not the GIT pointer low register.
848 for (MCPhysReg Reg : AllSGPRs) {
849 if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) &&
850 !TRI->isSubRegisterEq(RegA: ScratchRsrcReg, RegB: Reg) && GITPtrLoReg != Reg) {
851 ScratchWaveOffsetReg = Reg;
852 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: ScratchWaveOffsetReg)
853 .addReg(RegNo: PreloadedScratchWaveOffsetReg, Flags: RegState::Kill);
854 break;
855 }
856 }
857
858 // FIXME: We can spill incoming arguments and restore at the end of the
859 // prolog.
860 if (!ScratchWaveOffsetReg)
861 report_fatal_error(
862 reason: "could not find temporary scratch offset register in prolog");
863 } else {
  // No overlap (or no preloaded offset at all): use the preloaded register
  // as is, which may be the null register.
864 ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
865 }
866 assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
867
  // Stack size scaled by getScratchScaleFactor(): 1 with flat scratch
  // enabled, the wavefront size otherwise.
868 unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST);
869 if (!mayReserveScratchForCWSR(MF)) {
  // Simple case: FP (if present) starts at 0 and SP (if referenced) starts
  // just past the scaled stack size.
870 if (hasFP(MF)) {
871 Register FPReg = MFI->getFrameOffsetReg();
872 assert(FPReg != AMDGPU::FP_REG);
873 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: FPReg).addImm(Val: 0);
874 }
875
876 if (requiresStackPointerReference(MF)) {
877 Register SPReg = MFI->getStackPtrOffsetReg();
878 assert(SPReg != AMDGPU::SP_REG);
879 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: SPReg).addImm(Val: Offset);
880 }
881 } else {
882 // We need to check if we're on a compute queue - if we are, then the CWSR
883 // trap handler may need to store some VGPRs on the stack. The first VGPR
884 // block is saved separately, so we only need to allocate space for any
885 // additional VGPR blocks used. For now, we will make sure there's enough
886 // room for the theoretical maximum number of VGPRs that can be allocated.
887 // FIXME: Figure out if the shader uses fewer VGPRs in practice.
888 assert(hasFP(MF));
889 Register FPReg = MFI->getFrameOffsetReg();
890 assert(FPReg != AMDGPU::FP_REG);
  // (addressable VGPRs minus one allocation granule) * 4, rounded up to the
  // frame's maximum alignment.
  // NOTE(review): the factor 4 is presumably the byte size of one 32-bit
  // VGPR lane - confirm.
891 unsigned VGPRSize = llvm::alignTo(
892 Size: (ST.getAddressableNumVGPRs(DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize()) -
893 AMDGPU::IsaInfo::getVGPRAllocGranule(STI: ST,
894 DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize())) *
895 4,
896 A: FrameInfo.getMaxAlign());
897 MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
898
899 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GET_STACK_BASE), DestReg: FPReg);
900 if (requiresStackPointerReference(MF)) {
901 Register SPReg = MFI->getStackPtrOffsetReg();
902 assert(SPReg != AMDGPU::SP_REG);
903
904 // If at least one of the constants can be inlined, then we can use
905 // s_cselect. Otherwise, use a mov and cmovk.
  // NOTE(review): both forms presumably key off SCC as left by
  // GET_STACK_BASE above - confirm against that pseudo's expansion.
906 if (AMDGPU::isInlinableLiteral32(Literal: Offset, HasInv2Pi: ST.hasInv2PiInlineImm()) ||
907 AMDGPU::isInlinableLiteral32(Literal: Offset + VGPRSize,
908 HasInv2Pi: ST.hasInv2PiInlineImm())) {
909 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: SPReg)
910 .addImm(Val: Offset + VGPRSize)
911 .addImm(Val: Offset);
912 } else {
913 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: SPReg).addImm(Val: Offset);
914 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMOVK_I32), DestReg: SPReg)
915 .addImm(Val: Offset + VGPRSize);
916 }
917 }
918 }
919
  // Flat scratch init is emitted only if the user SGPRs provide it and either
  // FLAT_SCR is used, the function has calls, or there are live stack objects
  // while flat scratch is enabled.
920 bool NeedsFlatScratchInit =
921 MFI->getUserSGPRInfo().hasFlatScratchInit() &&
922 (MRI.isPhysRegUsed(PhysReg: AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
923 (!allStackObjectsAreDead(MFI: FrameInfo) && ST.hasFlatScratchEnabled()));
924
  // The setup code below receives the wave offset (the preloaded register or
  // the copy made from it above), so re-add the preloaded register as a
  // live-in. This is skipped with architected flat scratch.
925 if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
926 PreloadedScratchWaveOffsetReg && !ST.hasArchitectedFlatScratch()) {
927 MRI.addLiveIn(Reg: PreloadedScratchWaveOffsetReg);
928 MBB.addLiveIn(PhysReg: PreloadedScratchWaveOffsetReg);
929 }
930
931 if (NeedsFlatScratchInit) {
932 emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
933 }
934
935 if (ScratchRsrcReg) {
936 emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
937 PreloadedPrivateBufferReg: PreloadedScratchRsrcReg,
938 ScratchRsrcReg, ScratchWaveOffsetReg);
939 }
940
941 if (ST.hasWaitXcnt()) {
942 // Set REPLAY_MODE (bit 25) in MODE register to enable multi-group XNACK
943 // replay. This aligns hardware behavior with the compiler's s_wait_xcnt
944 // insertion logic, which assumes multi-group mode by default.
  // The encoding selects a 1-bit field at offset 25 of MODE; the immediate 1
  // below is the value written to it.
945 unsigned RegEncoding =
946 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 25, Values: 1);
947 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_IMM32_B32))
948 .addImm(Val: 1)
949 .addImm(Val: RegEncoding);
950 }
951}
952
953// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
//
// All instructions are inserted before \p I in \p MBB. The descriptor in
// \p ScratchRsrcReg is materialized in one of three ways:
//  - AMDPAL: build the GIT pointer in sub0_sub1 and load the four-dword
//    descriptor through it (byte offset 16 for the AMDGPU_CS calling
//    convention, 0 otherwise); on wave32, bit 21 of sub3 is then cleared.
//  - Mesa graphics shaders, or no preloaded descriptor: dwords 0-1 come from
//    the implicit buffer pointer user SGPR (moved for compute calling
//    conventions, loaded through it otherwise) or, without such a pointer,
//    from the SCRATCH_RSRC_DWORD0/1 external symbols; dwords 2-3 come from
//    SIInstrInfo::getScratchRsrcWords23().
//  - AMDHSA/Mesa with a preloaded descriptor: copy \p PreloadedScratchRsrcReg
//    unless it already is \p ScratchRsrcReg.
// In every case \p ScratchWaveOffsetReg is finally added into dwords 0-1
// (sub0 with carry into sub1) of the descriptor.
954void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
955 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
956 const DebugLoc &DL, Register PreloadedScratchRsrcReg,
957 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
958
959 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
960 const SIInstrInfo *TII = ST.getInstrInfo();
961 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
962 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
963 const Function &Fn = MF.getFunction();
964
965 if (ST.isAmdPalOS()) {
966 // The pointer to the GIT is formed from the offset passed in and either
967 // the amdgpu-git-ptr-high function attribute or the top part of the PC
968 Register Rsrc01 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0_sub1);
969 Register Rsrc03 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub3);
970
971 buildGitPtr(MBB, I, DL, TII, TargetReg: Rsrc01);
972
973 // We now have the GIT ptr - now get the scratch descriptor from the entry
974 // at offset 0 (or offset 16 for a compute shader).
975 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
976 const MCInstrDesc &LoadDwordX4 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX4_IMM);
977 auto *MMO = MF.getMachineMemOperand(
978 PtrInfo,
979 F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
980 MachineMemOperand::MODereferenceable,
981 Size: 16, BaseAlignment: Align(4));
982 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
  // NOTE(review): Subtarget is obtained from the same call as ST above.
983 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  // The byte offset is converted to the units the SMRD immediate field uses
  // on this subtarget.
984 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST: Subtarget, ByteOffset: Offset);
  // The load reads its address from Rsrc01 and overwrites the whole
  // ScratchRsrcReg, Rsrc01 included.
985 BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX4, DestReg: ScratchRsrcReg)
986 .addReg(RegNo: Rsrc01)
987 .addImm(Val: EncodedOffset) // offset
988 .addImm(Val: 0) // cpol
989 .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine)
990 .addMemOperand(MMO);
991
992 // The driver will always set the SRD for wave 64 (bits 118:117 of
993 // descriptor / bits 22:21 of third sub-reg will be 0b11)
994 // If the shader is actually wave32 we have to modify the const_index_stride
995 // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
996 // reason the driver does this is that there can be cases where it presents
997 // 2 shaders with different wave size (e.g. VsFs).
998 // TODO: convert to using SCRATCH instructions or multiple SRD buffers
999 if (ST.isWave32()) {
1000 const MCInstrDesc &SBitsetB32 = TII->get(Opcode: AMDGPU::S_BITSET0_B32);
1001 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SBitsetB32, DestReg: Rsrc03)
1002 .addImm(Val: 21)
1003 .addReg(RegNo: Rsrc03);
1004 }
1005 } else if (ST.isMesaGfxShader(F: Fn) || !PreloadedScratchRsrcReg) {
1006 assert(!ST.isAmdHsaOrMesa(Fn));
1007 const MCInstrDesc &SMovB32 = TII->get(Opcode: AMDGPU::S_MOV_B32);
1008
1009 Register Rsrc2 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub2);
1010 Register Rsrc3 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub3);
1011
1012 // Use relocations to get the pointer, and setup the other bits manually.
1013 uint64_t Rsrc23 = TII->getScratchRsrcWords23();
1014
1015 if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
1016 Register Rsrc01 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0_sub1);
1017
  // Compute calling conventions use the implicit buffer pointer value
  // directly; all others load dwords 0-1 through it.
1018 if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) {
1019 const MCInstrDesc &Mov64 = TII->get(Opcode: AMDGPU::S_MOV_B64);
1020
1021 BuildMI(BB&: MBB, I, MIMD: DL, MCID: Mov64, DestReg: Rsrc01)
1022 .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR())
1023 .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);
1024 } else {
1025 const MCInstrDesc &LoadDwordX2 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX2_IMM);
1026
1027 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1028 auto *MMO = MF.getMachineMemOperand(
1029 PtrInfo,
1030 F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
1031 MachineMemOperand::MODereferenceable,
1032 Size: 8, BaseAlignment: Align(4));
1033 BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: Rsrc01)
1034 .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR())
1035 .addImm(Val: 0) // offset
1036 .addImm(Val: 0) // cpol
1037 .addMemOperand(MMO)
1038 .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);
1039
  // The load above reads the implicit buffer pointer SGPR, so record it as
  // live into the function and this block.
1040 MF.getRegInfo().addLiveIn(Reg: MFI->getImplicitBufferPtrUserSGPR());
1041 MBB.addLiveIn(PhysReg: MFI->getImplicitBufferPtrUserSGPR());
1042 }
1043 } else {
1044 Register Rsrc0 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0);
1045 Register Rsrc1 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub1);
1046
1047 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc0)
1048 .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD0")
1049 .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);
1050
1051 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc1)
1052 .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD1")
1053 .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);
1054 }
1055
  // Dwords 2 and 3 are the low and high halves of the 64-bit Rsrc23 constant.
1056 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc2)
1057 .addImm(Val: Lo_32(Value: Rsrc23))
1058 .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);
1059
1060 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc3)
1061 .addImm(Val: Hi_32(Value: Rsrc23))
1062 .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);
1063 } else if (ST.isAmdHsaOrMesa(F: Fn)) {
1064 assert(PreloadedScratchRsrcReg);
1065
1066 if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
1067 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: ScratchRsrcReg)
1068 .addReg(RegNo: PreloadedScratchRsrcReg, Flags: RegState::Kill);
1069 }
1070 }
1071
1072 // Add the scratch wave offset into the scratch RSRC.
1073 //
1074 // We only want to update the first 48 bits, which is the base address
1075 // pointer, without touching the adjacent 16 bits of flags. We know this add
1076 // cannot carry-out from bit 47, otherwise the scratch allocation would be
1077 // impossible to fit in the 48-bit global address space.
1078 //
1079 // TODO: Evaluate if it is better to just construct an SRD using the flat
1080 // scratch init and some constants rather than update the one we are passed.
1081 Register ScratchRsrcSub0 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0);
1082 Register ScratchRsrcSub1 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub1);
1083
1084 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
1085 // the kernel body via inreg arguments.
1086 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: ScratchRsrcSub0)
1087 .addReg(RegNo: ScratchRsrcSub0)
1088 .addReg(RegNo: ScratchWaveOffsetReg)
1089 .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);
  // Propagate the carry of the low add into sub1 by adding 0 with carry.
1090 auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32), DestReg: ScratchRsrcSub1)
1091 .addReg(RegNo: ScratchRsrcSub1)
1092 .addImm(Val: 0)
1093 .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);
1094 Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
1095}
1096
1097bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
1098 switch (ID) {
1099 case TargetStackID::Default:
1100 case TargetStackID::NoAlloc:
1101 case TargetStackID::SGPRSpill:
1102 return true;
1103 case TargetStackID::ScalableVector:
1104 case TargetStackID::ScalablePredicateVector:
1105 case TargetStackID::WasmLocal:
1106 return false;
1107 }
1108 llvm_unreachable("Invalid TargetStackID::Value");
1109}
1110
1111void SIFrameLowering::emitPrologueEntryCFI(MachineBasicBlock &MBB,
1112 MachineBasicBlock::iterator MBBI,
1113 const DebugLoc &DL) const {
1114 const MachineFunction &MF = *MBB.getParent();
1115 const MachineRegisterInfo &MRI = MF.getRegInfo();
1116 const MCRegisterInfo *MCRI = MF.getContext().getRegisterInfo();
1117 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1118 const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
1119 MCRegister StackPtrReg =
1120 MF.getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg();
1121
1122 emitDefCFA(MBB, MBBI, DL, StackPtrReg, /*AspaceAlreadyDefined=*/true,
1123 Flags: MachineInstr::FrameSetup);
1124
1125 buildCFIForRegToSGPRPairSpill(MBB, MBBI, DL, Reg: AMDGPU::PC_REG,
1126 SGPRPair: TRI.getReturnAddressReg(MF));
1127
1128 BitVector IsCalleeSaved(TRI.getNumRegs());
1129 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
1130 for (unsigned I = 0; CSRegs[I]; ++I) {
1131 IsCalleeSaved.set(CSRegs[I]);
1132 }
1133 auto ProcessReg = [&](MCPhysReg Reg) {
1134 // VCC is not preserved across calls.
1135 if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
1136 return;
1137 if (IsCalleeSaved.test(Idx: Reg) || !MRI.isPhysRegModified(PhysReg: Reg))
1138 return;
1139 MCRegister DwarfReg = MCRI->getDwarfRegNum(Reg, isEH: false);
1140 buildCFI(MBB, MBBI, DL,
1141 CFIInst: MCCFIInstruction::createUndefined(L: nullptr, Register: DwarfReg));
1142 };
1143
1144 // Emit CFI rules for caller saved Arch VGPRs which are clobbered
1145 unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
1146 for_each(Range: AMDGPU::VGPR_32RegClass.getRegisters().take_front(N: NumArchVGPRs),
1147 F: ProcessReg);
1148
1149 // Emit CFI rules for caller saved Accum VGPRs which are clobbered
1150 if (ST.hasMAIInsts()) {
1151 for_each(Range: AMDGPU::AGPR_32RegClass.getRegisters(), F: ProcessReg);
1152 }
1153
1154 // Emit CFI rules for caller saved SGPRs which are clobbered
1155 for_each(Range: AMDGPU::SGPR_32RegClass.getRegisters(), F: ProcessReg);
1156}
1157
1158// Activate only the inactive lanes when \p EnableInactiveLanes is true.
1159// Otherwise, activate all lanes. It returns the saved exec.
1160static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
1161 MachineFunction &MF,
1162 MachineBasicBlock &MBB,
1163 MachineBasicBlock::iterator MBBI,
1164 const DebugLoc &DL, bool IsProlog,
1165 bool EnableInactiveLanes) {
1166 Register ScratchExecCopy;
1167 MachineRegisterInfo &MRI = MF.getRegInfo();
1168 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1169 const SIInstrInfo *TII = ST.getInstrInfo();
1170 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1171 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1172
1173 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
1174
1175 if (FuncInfo->isWholeWaveFunction()) {
1176 // Whole wave functions already have a copy of the original EXEC mask that
1177 // we can use.
1178 assert(IsProlog && "Epilog should look at return, not setup");
1179 ScratchExecCopy =
1180 TII->getWholeWaveFunctionSetup(MF)->getOperand(i: 0).getReg();
1181 assert(ScratchExecCopy && "Couldn't find copy of EXEC");
1182 } else {
1183 ScratchExecCopy = findScratchNonCalleeSaveRegister(
1184 MRI, LiveUnits, RC: *TRI.getWaveMaskRegClass());
1185 }
1186
1187 if (!ScratchExecCopy)
1188 report_fatal_error(reason: "failed to find free scratch register");
1189
1190 LiveUnits.addReg(Reg: ScratchExecCopy);
1191
1192 const unsigned SaveExecOpc =
1193 ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
1194 : AMDGPU::S_OR_SAVEEXEC_B32)
1195 : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
1196 : AMDGPU::S_OR_SAVEEXEC_B64);
1197 auto SaveExec =
1198 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: SaveExecOpc), DestReg: ScratchExecCopy).addImm(Val: -1);
1199 SaveExec->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
1200
1201 return ScratchExecCopy;
1202}
1203
/// Emit the prologue stores for the whole-wave-mode (WWM) VGPR spills and the
/// prolog/epilog SGPR spills, inserted before \p MBBI in \p MBB and
/// addressing the spill slots relative to \p FrameReg.
///
/// \p FramePtrRegScratchCopy, when set, is saved in place of the frame
/// pointer for the frame pointer's own spill entry; when it is unset that
/// entry is skipped. When \p NeedsFrameMoves is true a cfi_offset is emitted
/// after each WWM VGPR store. \p LiveUnits is updated with the scratch
/// registers picked along the way.
1204void SIFrameLowering::emitCSRSpillStores(
1205 MachineFunction &MF, MachineBasicBlock &MBB,
1206 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
1207 LiveRegUnits &LiveUnits, Register FrameReg, Register FramePtrRegScratchCopy,
1208 const bool NeedsFrameMoves) const {
1209 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1210 MachineFrameInfo &MFI = MF.getFrameInfo();
1211 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1212 const SIInstrInfo *TII = ST.getInstrInfo();
1213 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1214 const MCRegisterInfo *MCRI = MF.getContext().getRegisterInfo();
1215 MachineRegisterInfo &MRI = MF.getRegInfo();
1216 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
1217
1218 // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
1219 // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
1220 // might end up flipping the EXEC bits twice.
1221 Register ScratchExecCopy;
1222 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
1223 FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs);
  // EXEC is saved and flipped to the inactive lanes only if there is at least
  // one WWM scratch register to store.
1224 if (!WWMScratchRegs.empty())
1225 ScratchExecCopy =
1226 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1227 /*IsProlog*/ true, /*EnableInactiveLanes*/ true);
1228
  // Stores every (VGPR, frame index) pair in the given list and, when frame
  // moves are needed, describes each store with a cfi_offset.
1229 auto StoreWWMRegisters =
1230 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
1231 for (const auto &Reg : WWMRegs) {
1232 Register VGPR = Reg.first;
1233 int FI = Reg.second;
1234 buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL,
1235 SpillReg: VGPR, FI, FrameReg);
1236 if (NeedsFrameMoves) {
1237 // We spill the entire VGPR, so we can get away with just cfi_offset
  // NOTE(review): the offset is scaled by the wavefront size
  // unconditionally, whereas getScratchScaleFactor() uses 1 when flat
  // scratch is enabled - confirm this is intended for CFI offsets.
1238 buildCFI(MBB, MBBI, DL,
1239 CFIInst: MCCFIInstruction::createOffset(
1240 L: nullptr, Register: MCRI->getDwarfRegNum(Reg: VGPR, isEH: false),
1241 Offset: MFI.getObjectOffset(ObjectIdx: FI) * ST.getWavefrontSize()));
1242 }
1243 }
1244 };
1245
  // WWM scratch registers that are not reserved are added as live-ins to the
  // function and to this block before they are stored.
1246 for (const Register Reg : make_first_range(c&: WWMScratchRegs)) {
1247 if (!MRI.isReserved(PhysReg: Reg)) {
1248 MRI.addLiveIn(Reg);
1249 MBB.addLiveIn(PhysReg: Reg);
1250 }
1251 }
1252 StoreWWMRegisters(WWMScratchRegs);
1253
  // Writes -1 to EXEC without saving the previous value.
1254 auto EnableAllLanes = [&]() {
1255 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg).addImm(Val: -1);
1256 };
1257
  // Callee-saved WWM registers are stored with all lanes enabled. If EXEC has
  // already been saved above, just switch to all lanes; otherwise save EXEC
  // now while enabling all lanes.
1258 if (!WWMCalleeSavedRegs.empty()) {
1259 if (ScratchExecCopy) {
1260 EnableAllLanes();
1261 } else {
1262 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1263 /*IsProlog*/ true,
1264 /*EnableInactiveLanes*/ false);
1265 }
1266 }
1267
1268 StoreWWMRegisters(WWMCalleeSavedRegs);
1269 if (FuncInfo->isWholeWaveFunction()) {
1270 // If we have already saved some WWM CSR registers, then the EXEC is already
1271 // -1 and we don't need to do anything else. Otherwise, set EXEC to -1 here.
1272 if (!ScratchExecCopy)
1273 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
1274 /*EnableInactiveLanes*/ true);
1275 else if (WWMCalleeSavedRegs.empty())
1276 EnableAllLanes();
1277 } else if (ScratchExecCopy) {
  // Not a whole wave function: put the saved EXEC value back.
1278 // FIXME: Split block and make terminator.
1279 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
1280 .addReg(RegNo: ScratchExecCopy, Flags: RegState::Kill);
1281 LiveUnits.addReg(Reg: ScratchExecCopy);
1282 }
1283
1284 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1285
1286 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
1287 // Special handle FP spill:
1288 // Skip if FP is saved to a scratch SGPR, the save has already been emitted.
1289 // Otherwise, FP has been moved to a temporary register and spill it
1290 // instead.
1291 bool IsFramePtrPrologSpill = Spill.first == FramePtrReg;
1292 Register Reg = IsFramePtrPrologSpill ? FramePtrRegScratchCopy : Spill.first;
1293 if (!Reg)
1294 continue;
1295
1296 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
1297 LiveUnits, FrameReg, IsFramePtrPrologSpill);
1298 SB.save();
1299 }
1300
1301 // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
1302 // such scratch registers live throughout the function.
1303 SmallVector<Register, 1> ScratchSGPRs;
1304 FuncInfo->getAllScratchSGPRCopyDstRegs(Regs&: ScratchSGPRs);
1305 if (!ScratchSGPRs.empty()) {
  // NOTE: this loop variable shadows the MBB parameter; every block of the
  // function is visited here, not only the prologue block.
1306 for (MachineBasicBlock &MBB : MF) {
1307 for (MCPhysReg Reg : ScratchSGPRs)
1308 MBB.addLiveIn(PhysReg: Reg);
1309
1310 MBB.sortUniqueLiveIns();
1311 }
  // LiveUnits is only updated when it is non-empty.
1312 if (!LiveUnits.empty()) {
1313 for (MCPhysReg Reg : ScratchSGPRs)
1314 LiveUnits.addReg(Reg);
1315 }
1316 }
1317
1318 // Remove the spill entry created for EXEC. It is needed only for CFISaves in
1319 // the prologue.
1320 if (TRI.isCFISavedRegsSpillEnabled())
1321 FuncInfo->removePrologEpilogSGPRSpillEntry(Reg: TRI.getExec());
1322}
1323
/// Emit the epilogue reloads for the prolog/epilog SGPR spills and the
/// whole-wave-mode (WWM) VGPR spills, inserted before \p MBBI in \p MBB and
/// addressing the spill slots relative to \p FrameReg.
///
/// \p FramePtrRegScratchCopy, when set, receives the restored value for the
/// frame pointer's spill entry instead of the frame pointer itself; when it
/// is unset that entry is skipped.
///
/// For whole wave functions this also restores the original EXEC from the
/// first operand of the block's last instruction, removes that operand and
/// rewrites the instruction to its plain return / tail-call opcode.
1324void SIFrameLowering::emitCSRSpillRestores(
1325 MachineFunction &MF, MachineBasicBlock &MBB,
1326 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
1327 LiveRegUnits &LiveUnits, Register FrameReg,
1328 Register FramePtrRegScratchCopy) const {
1329 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1330 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1331 const SIInstrInfo *TII = ST.getInstrInfo();
1332 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1333 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
1334 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1335
1336 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
1337 // Special handle FP restore:
1338 // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore
1339 // the FP value to a temporary register. The frame pointer should be
1340 // overwritten only at the end when all other spills are restored from
1341 // current frame.
1342 Register Reg =
1343 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
1344 if (!Reg)
1345 continue;
1346
1347 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
1348 LiveUnits, FrameReg);
1349 SB.restore();
1350 }
1351
1352 // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
1353 // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
1354 // this, we might end up flipping the EXEC bits twice.
1355 Register ScratchExecCopy;
1356 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
1357 FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs);
  // Reloads every (VGPR, frame index) pair in the given list.
1358 auto RestoreWWMRegisters =
1359 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
1360 for (const auto &Reg : WWMRegs) {
1361 Register VGPR = Reg.first;
1362 int FI = Reg.second;
1363 buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL,
1364 SpillReg: VGPR, FI, FrameReg);
1365 }
1366 };
1367
1368 if (FuncInfo->isWholeWaveFunction()) {
1369 // For whole wave functions, the EXEC is already -1 at this point.
1370 // Therefore, we can restore the CSR WWM registers right away.
1371 RestoreWWMRegisters(WWMCalleeSavedRegs);
1372
1373 // The original EXEC is the first operand of the return instruction.
  // The return is taken to be the last instruction of the block; any other
  // opcode there hits the llvm_unreachable below.
1374 MachineInstr &Return = MBB.instr_back();
1375 unsigned Opcode = Return.getOpcode();
1376 switch (Opcode) {
1377 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
1378 Opcode = AMDGPU::SI_RETURN;
1379 break;
1380 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
1381 Opcode = AMDGPU::SI_TCRETURN_GFX;
1382 break;
1383 default:
1384 llvm_unreachable("Unexpected return inst");
1385 }
1386 Register OrigExec = Return.getOperand(i: 0).getReg();
1387
1388 if (!WWMScratchRegs.empty()) {
  // EXEC = OrigExec ^ -1, i.e. the complement of the original mask, before
  // reloading the scratch registers.
1389 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.XorOpc), DestReg: LMC.ExecReg)
1390 .addReg(RegNo: OrigExec)
1391 .addImm(Val: -1);
1392 RestoreWWMRegisters(WWMScratchRegs);
1393 }
1394
1395 // Restore original EXEC.
1396 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg).addReg(RegNo: OrigExec);
1397
1398 // Drop the first operand and update the opcode.
1399 Return.removeOperand(OpNo: 0);
1400 Return.setDesc(TII->get(Opcode));
1401
1402 return;
1403 }
1404
  // Not a whole wave function: mirror the prologue. The scratch registers are
  // reloaded with only the inactive lanes enabled, the callee-saved ones with
  // all lanes enabled, and EXEC is restored at the end if it was saved.
1405 if (!WWMScratchRegs.empty()) {
1406 ScratchExecCopy =
1407 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1408 /*IsProlog=*/false, /*EnableInactiveLanes=*/true);
1409 }
1410 RestoreWWMRegisters(WWMScratchRegs);
1411 if (!WWMCalleeSavedRegs.empty()) {
1412 if (ScratchExecCopy) {
1413 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg).addImm(Val: -1);
1414 } else {
1415 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1416 /*IsProlog*/ false,
1417 /*EnableInactiveLanes*/ false);
1418 }
1419 }
1420
1421 RestoreWWMRegisters(WWMCalleeSavedRegs);
1422 if (ScratchExecCopy) {
1423 // FIXME: Split block and make terminator.
1424 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
1425 .addReg(RegNo: ScratchExecCopy, Flags: RegState::Kill);
1426 }
1427}
1428
1429void SIFrameLowering::emitPrologue(MachineFunction &MF,
1430 MachineBasicBlock &MBB) const {
1431 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1432 if (FuncInfo->isEntryFunction()) {
1433 emitEntryFunctionPrologue(MF, MBB);
1434 return;
1435 }
1436
1437 MachineFrameInfo &MFI = MF.getFrameInfo();
1438 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1439 const SIInstrInfo *TII = ST.getInstrInfo();
1440 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1441 MachineRegisterInfo &MRI = MF.getRegInfo();
1442
1443 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1444 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1445 Register BasePtrReg =
1446 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
1447 LiveRegUnits LiveUnits;
1448
1449 MachineBasicBlock::iterator MBBI = MBB.begin();
1450 // DebugLoc must be unknown since the first instruction with DebugLoc is used
1451 // to determine the end of the prologue.
1452 DebugLoc DL;
1453
1454 bool HasFP = false;
1455 bool HasBP = false;
1456 uint32_t NumBytes = MFI.getStackSize();
1457 uint32_t RoundedSize = NumBytes;
1458
1459 // Functions that never return don't need to save and restore the FP or BP.
1460 const Function &F = MF.getFunction();
1461 bool SavesStackRegs =
1462 !F.hasFnAttribute(Kind: Attribute::NoReturn) && !FuncInfo->isChainFunction();
1463
1464 const bool NeedsFrameMoves = MF.needsFrameMoves();
1465
1466 if (NeedsFrameMoves)
1467 emitPrologueEntryCFI(MBB, MBBI, DL);
1468
1469 if (TRI.hasStackRealignment(MF))
1470 HasFP = true;
1471
1472 Register FramePtrRegScratchCopy;
1473 if (!HasFP && !hasFP(MF)) {
1474 // Emit the CSR spill stores with SP base register.
1475 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: StackPtrReg,
1476 FramePtrRegScratchCopy, NeedsFrameMoves);
1477 } else if (SavesStackRegs) {
1478 // CSR spill stores will use FP as base register.
1479 Register SGPRForFPSaveRestoreCopy =
1480 FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1481
1482 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
1483 if (SGPRForFPSaveRestoreCopy) {
1484 // Copy FP to the scratch register now and emit the CFI entry. It avoids
1485 // the extra FP copy needed in the other two cases when FP is spilled to
1486 // memory or to a VGPR lane.
1487 PrologEpilogSGPRSpillBuilder SB(
1488 FramePtrReg,
1489 FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(Reg: FramePtrReg), MBB, MBBI,
1490 DL, TII, TRI, LiveUnits, FramePtrReg,
1491 /*IsFramePtrPrologSpill*/ true);
1492 SB.save();
1493 LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy);
1494 } else {
1495 // Copy FP into a new scratch register so that its previous value can be
1496 // spilled after setting up the new frame.
1497 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1498 MRI, LiveUnits, RC: AMDGPU::SReg_32_XM0_XEXECRegClass);
1499 if (!FramePtrRegScratchCopy)
1500 report_fatal_error(reason: "failed to find free scratch register");
1501
1502 LiveUnits.addReg(Reg: FramePtrRegScratchCopy);
1503 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrRegScratchCopy)
1504 .addReg(RegNo: FramePtrReg);
1505 }
1506 }
1507
1508 if (HasFP) {
1509 const unsigned Alignment = MFI.getMaxAlign().value();
1510
1511 RoundedSize += Alignment;
1512 if (LiveUnits.empty()) {
1513 LiveUnits.init(TRI);
1514 LiveUnits.addLiveIns(MBB);
1515 }
1516
1517 // s_add_i32 s33, s32, NumBytes
1518 // s_and_b32 s33, s33, 0b111...0000
1519 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: FramePtrReg)
1520 .addReg(RegNo: StackPtrReg)
1521 .addImm(Val: (Alignment - 1) * getScratchScaleFactor(ST))
1522 .setMIFlag(MachineInstr::FrameSetup);
1523 auto And = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_AND_B32), DestReg: FramePtrReg)
1524 .addReg(RegNo: FramePtrReg, Flags: RegState::Kill)
1525 .addImm(Val: -Alignment * getScratchScaleFactor(ST))
1526 .setMIFlag(MachineInstr::FrameSetup);
1527 And->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
1528 FuncInfo->setIsStackRealigned(true);
1529 } else if ((HasFP = hasFP(MF))) {
1530 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrReg)
1531 .addReg(RegNo: StackPtrReg)
1532 .setMIFlag(MachineInstr::FrameSetup);
1533 }
1534
1535 // If FP is used, emit the CSR spills with FP base register.
1536 if (HasFP) {
1537 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg,
1538 FramePtrRegScratchCopy, NeedsFrameMoves);
1539 if (FramePtrRegScratchCopy)
1540 LiveUnits.removeReg(Reg: FramePtrRegScratchCopy);
1541 }
1542
1543 // If we need a base pointer, set it up here. It's whatever the value of
1544 // the stack pointer is at this point. Any variable size objects will be
1545 // allocated after this, so we can still use the base pointer to reference
1546 // the incoming arguments.
1547 if ((HasBP = TRI.hasBasePointer(MF))) {
1548 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: BasePtrReg)
1549 .addReg(RegNo: StackPtrReg)
1550 .setMIFlag(MachineInstr::FrameSetup);
1551 }
1552
1553 if (HasFP) {
1554 if (NeedsFrameMoves)
1555 emitDefCFA(MBB, MBBI, DL, StackPtrReg: FramePtrReg, /*AspaceAlreadyDefined=*/false,
1556 Flags: MachineInstr::FrameSetup);
1557 }
1558
1559 if (HasFP && RoundedSize != 0) {
1560 auto Add = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: StackPtrReg)
1561 .addReg(RegNo: StackPtrReg)
1562 .addImm(Val: RoundedSize * getScratchScaleFactor(ST))
1563 .setMIFlag(MachineInstr::FrameSetup);
1564 Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
1565 }
1566
1567 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg);
1568 (void)FPSaved;
1569 assert((!HasFP || FPSaved || !SavesStackRegs) &&
1570 "Needed to save FP but didn't save it anywhere");
1571
1572 // If we allow spilling to AGPRs we may have saved FP but then spill
1573 // everything into AGPRs instead of the stack.
1574 assert((HasFP || !FPSaved || !SavesStackRegs || EnableSpillVGPRToAGPR) &&
1575 "Saved FP but didn't need it");
1576
1577 bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: BasePtrReg);
1578 (void)BPSaved;
1579 assert((!HasBP || BPSaved || !SavesStackRegs) &&
1580 "Needed to save BP but didn't save it anywhere");
1581
1582 assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
1583
1584 if (FuncInfo->isWholeWaveFunction()) {
1585 // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose.
1586 TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
1587 }
1588}
1589
1590void SIFrameLowering::emitEpilogue(MachineFunction &MF,
1591 MachineBasicBlock &MBB) const {
1592 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1593 if (FuncInfo->isEntryFunction())
1594 return;
1595
1596 const MachineFrameInfo &MFI = MF.getFrameInfo();
1597 if (FuncInfo->isChainFunction() && !MFI.hasTailCall())
1598 return;
1599
1600 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1601 const SIInstrInfo *TII = ST.getInstrInfo();
1602 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1603 MachineRegisterInfo &MRI = MF.getRegInfo();
1604 LiveRegUnits LiveUnits;
1605 // Get the insert location for the epilogue. If there were no terminators in
1606 // the block, get the last instruction.
1607 MachineBasicBlock::iterator MBBI = MBB.end();
1608 DebugLoc DL;
1609 if (!MBB.empty()) {
1610 MBBI = MBB.getLastNonDebugInstr();
1611 if (MBBI != MBB.end())
1612 DL = MBBI->getDebugLoc();
1613
1614 MBBI = MBB.getFirstTerminator();
1615 }
1616
1617 uint32_t NumBytes = MFI.getStackSize();
1618 uint32_t RoundedSize = FuncInfo->isStackRealigned()
1619 ? NumBytes + MFI.getMaxAlign().value()
1620 : NumBytes;
1621 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1622 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1623 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg);
1624
1625 if (RoundedSize != 0) {
1626 if (TRI.hasBasePointer(MF)) {
1627 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: StackPtrReg)
1628 .addReg(RegNo: TRI.getBaseRegister())
1629 .setMIFlag(MachineInstr::FrameDestroy);
1630 } else if (hasFP(MF)) {
1631 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: StackPtrReg)
1632 .addReg(RegNo: FramePtrReg)
1633 .setMIFlag(MachineInstr::FrameDestroy);
1634 }
1635 }
1636
1637 Register FramePtrRegScratchCopy;
1638 Register SGPRForFPSaveRestoreCopy =
1639 FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1640 if (FPSaved) {
1641 // CSR spill restores should use FP as base register. If
1642 // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
1643 // into a new scratch register and copy to FP later when other registers are
1644 // restored from the current stack frame.
1645 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
1646 if (SGPRForFPSaveRestoreCopy) {
1647 LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy);
1648 } else {
1649 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1650 MRI, LiveUnits, RC: AMDGPU::SReg_32_XM0_XEXECRegClass);
1651 if (!FramePtrRegScratchCopy)
1652 report_fatal_error(reason: "failed to find free scratch register");
1653
1654 LiveUnits.addReg(Reg: FramePtrRegScratchCopy);
1655 }
1656
1657 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg,
1658 FramePtrRegScratchCopy);
1659 }
1660
1661 if (hasFP(MF) && MF.needsFrameMoves()) {
1662 emitDefCFA(MBB, MBBI, DL, StackPtrReg, /*AspaceAlreadyDefined=*/false,
1663 Flags: MachineInstr::FrameDestroy);
1664 }
1665
1666 if (FPSaved) {
1667 // Insert the copy to restore FP.
1668 Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
1669 : FramePtrRegScratchCopy;
1670 MachineInstrBuilder MIB =
1671 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrReg)
1672 .addReg(RegNo: SrcReg);
1673 if (SGPRForFPSaveRestoreCopy)
1674 MIB.setMIFlag(MachineInstr::FrameDestroy);
1675 } else {
1676 // Insert the CSR spill restores with SP as the base register.
1677 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: StackPtrReg,
1678 FramePtrRegScratchCopy);
1679 }
1680}
1681
1682#ifndef NDEBUG
1683static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1684 const MachineFrameInfo &MFI = MF.getFrameInfo();
1685 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1686 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1687 I != E; ++I) {
1688 if (!MFI.isDeadObjectIndex(I) &&
1689 MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1690 !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
1691 return false;
1692 }
1693 }
1694
1695 return true;
1696}
1697#endif
1698
1699StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1700 int FI,
1701 Register &FrameReg) const {
1702 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1703
1704 FrameReg = RI->getFrameRegister(MF);
1705 return StackOffset::getFixed(Fixed: MF.getFrameInfo().getObjectOffset(ObjectIdx: FI));
1706}
1707
1708void SIFrameLowering::processFunctionBeforeFrameFinalized(
1709 MachineFunction &MF,
1710 RegScavenger *RS) const {
1711 MachineFrameInfo &MFI = MF.getFrameInfo();
1712
1713 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1714 const SIInstrInfo *TII = ST.getInstrInfo();
1715 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1716 MachineRegisterInfo &MRI = MF.getRegInfo();
1717 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1718
1719 const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
1720 && EnableSpillVGPRToAGPR;
1721
1722 if (SpillVGPRToAGPR) {
1723 // To track the spill frame indices handled in this pass.
1724 BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
1725 BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);
1726
1727 bool SeenDbgInstr = false;
1728
1729 for (MachineBasicBlock &MBB : MF) {
1730 for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) {
1731 int FrameIndex;
1732 if (MI.isDebugInstr())
1733 SeenDbgInstr = true;
1734
1735 if (TII->isVGPRSpill(MI)) {
1736 // Try to eliminate stack used by VGPR spills before frame
1737 // finalization.
1738 unsigned FIOp = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
1739 Name: AMDGPU::OpName::vaddr);
1740 int FI = MI.getOperand(i: FIOp).getIndex();
1741 Register VReg =
1742 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata)->getReg();
1743 if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
1744 isAGPRtoVGPR: TRI->isAGPR(MRI, Reg: VReg))) {
1745 assert(RS != nullptr);
1746 RS->enterBasicBlockEnd(MBB);
1747 RS->backward(I: std::next(x: MI.getIterator()));
1748 TRI->eliminateFrameIndex(MI, SPAdj: 0, FIOperandNum: FIOp, RS);
1749 SpillFIs.set(FI);
1750 continue;
1751 }
1752 } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
1753 TII->isLoadFromStackSlot(MI, FrameIndex))
1754 if (!MFI.isFixedObjectIndex(ObjectIdx: FrameIndex))
1755 NonVGPRSpillFIs.set(FrameIndex);
1756 }
1757 }
1758
1759 // Stack slot coloring may assign different objects to the same stack slot.
1760 // If not, then the VGPR to AGPR spill slot is dead.
1761 for (unsigned FI : SpillFIs.set_bits())
1762 if (!NonVGPRSpillFIs.test(Idx: FI))
1763 FuncInfo->setVGPRToAGPRSpillDead(FI);
1764
1765 for (MachineBasicBlock &MBB : MF) {
1766 for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
1767 MBB.addLiveIn(PhysReg: Reg);
1768
1769 for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
1770 MBB.addLiveIn(PhysReg: Reg);
1771
1772 MBB.sortUniqueLiveIns();
1773
1774 if (!SpillFIs.empty() && SeenDbgInstr)
1775 clearDebugInfoForSpillFIs(MFI, MBB, SpillFIs);
1776 }
1777 }
1778
1779 // At this point we've already allocated all spilled SGPRs to VGPRs if we
1780 // can. Any remaining SGPR spills will go to memory, so move them back to the
1781 // default stack.
1782 bool HaveSGPRToVMemSpill =
1783 FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
1784 assert(allSGPRSpillsAreDead(MF) &&
1785 "SGPR spill should have been removed in SILowerSGPRSpills");
1786
1787 // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1788 // but currently hasNonSpillStackObjects is set only from source
1789 // allocas. Stack temps produced from legalization are not counted currently.
1790 if (!allStackObjectsAreDead(MFI)) {
1791 assert(RS && "RegScavenger required if spilling");
1792
1793 // Add an emergency spill slot
1794 RS->addScavengingFrameIndex(FI: FuncInfo->getScavengeFI(MFI, TRI: *TRI));
1795
1796 // If we are spilling SGPRs to memory with a large frame, we may need a
1797 // second VGPR emergency frame index.
1798 if (HaveSGPRToVMemSpill &&
1799 allocateScavengingFrameIndexesNearIncomingSP(MF)) {
1800 RS->addScavengingFrameIndex(FI: MFI.CreateSpillStackObject(Size: 4, Alignment: Align(4)));
1801 }
1802 }
1803}
1804
1805void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
1806 MachineFunction &MF, RegScavenger *RS) const {
1807 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1808 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1809 MachineRegisterInfo &MRI = MF.getRegInfo();
1810 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1811
1812 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
1813 // On gfx908, we had initially reserved highest available VGPR for AGPR
1814 // copy. Now since we are done with RA, check if there exist an unused VGPR
1815 // which is lower than the eariler reserved VGPR before RA. If one exist,
1816 // use it for AGPR copy instead of one reserved before RA.
1817 Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
1818 Register UnusedLowVGPR =
1819 TRI->findUnusedRegister(MRI, RC: &AMDGPU::VGPR_32RegClass, MF);
1820 if (UnusedLowVGPR && (TRI->getHWRegIndex(Reg: UnusedLowVGPR) <
1821 TRI->getHWRegIndex(Reg: VGPRForAGPRCopy))) {
1822 // Reserve this newly identified VGPR (for AGPR copy)
1823 // reserved registers should already be frozen at this point
1824 // so we can avoid calling MRI.freezeReservedRegs and just use
1825 // MRI.reserveReg
1826 FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
1827 MRI.reserveReg(PhysReg: UnusedLowVGPR, TRI);
1828 }
1829 }
1830 // We initally reserved the highest available SGPR pair for long branches
1831 // now, after RA, we shift down to a lower unused one if one exists
1832 Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
1833 Register UnusedLowSGPR =
1834 TRI->findUnusedRegister(MRI, RC: &AMDGPU::SGPR_64RegClass, MF);
1835 // If LongBranchReservedReg is null then we didn't find a long branch
1836 // and never reserved a register to begin with so there is nothing to
1837 // shift down. Then if UnusedLowSGPR is null, there isn't available lower
1838 // register to use so just keep the original one we set.
1839 if (LongBranchReservedReg && UnusedLowSGPR) {
1840 FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
1841 MRI.reserveReg(PhysReg: UnusedLowSGPR, TRI);
1842 }
1843}
1844
1845// The special SGPR spills like the one needed for FP, BP or any reserved
1846// registers delayed until frame lowering.
1847void SIFrameLowering::determinePrologEpilogSGPRSaves(
1848 MachineFunction &MF, BitVector &SavedVGPRs,
1849 bool NeedExecCopyReservedReg) const {
1850 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1851 MachineRegisterInfo &MRI = MF.getRegInfo();
1852 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1853 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1854 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1855 LiveRegUnits LiveUnits;
1856 LiveUnits.init(TRI: *TRI);
1857 // Initially mark callee saved registers as used so we will not choose them
1858 // while looking for scratch SGPRs.
1859 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
1860 for (unsigned I = 0; CSRegs[I]; ++I)
1861 LiveUnits.addReg(Reg: CSRegs[I]);
1862
1863 const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
1864
1865 Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
1866 if (NeedExecCopyReservedReg ||
1867 (ReservedRegForExecCopy &&
1868 MRI.isPhysRegUsed(PhysReg: ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
1869 MRI.reserveReg(PhysReg: ReservedRegForExecCopy, TRI);
1870 Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
1871 if (UnusedScratchReg) {
1872 // If found any unused scratch SGPR, reserve the register itself for Exec
1873 // copy and there is no need for any spills in that case.
1874 MFI->setSGPRForEXECCopy(UnusedScratchReg);
1875 MRI.replaceRegWith(FromReg: ReservedRegForExecCopy, ToReg: UnusedScratchReg);
1876 LiveUnits.addReg(Reg: UnusedScratchReg);
1877 } else {
1878 // Needs spill.
1879 assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
1880 "Re-reserving spill slot for EXEC copy register");
1881 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: ReservedRegForExecCopy, RC,
1882 /*IncludeScratchCopy=*/false);
1883 }
1884 } else if (ReservedRegForExecCopy) {
1885 // Reset it at this point. There are no whole-wave copies and spills
1886 // encountered.
1887 MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
1888 }
1889
1890 if (TRI->isCFISavedRegsSpillEnabled()) {
1891 Register Exec = TRI->getExec();
1892 assert(!MFI->hasPrologEpilogSGPRSpillEntry(Exec) &&
1893 "Re-reserving spill slot for EXEC");
1894 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: Exec, RC);
1895 }
1896
1897 // Functions that don't return to the caller don't need to preserve
1898 // the FP and BP.
1899 const Function &F = MF.getFunction();
1900 if (F.hasFnAttribute(Kind: Attribute::NoReturn) ||
1901 AMDGPU::isChainCC(CC: F.getCallingConv()))
1902 return;
1903
1904 // hasFP only knows about stack objects that already exist. We're now
1905 // determining the stack slots that will be created, so we have to predict
1906 // them. Stack objects force FP usage with calls.
1907 //
1908 // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1909 // don't want to report it here.
1910 //
1911 // FIXME: Is this really hasReservedCallFrame?
1912 const bool WillHaveFP =
1913 FrameInfo.hasCalls() &&
1914 (SavedVGPRs.any() || !allStackObjectsAreDead(MFI: FrameInfo));
1915
1916 if (WillHaveFP || hasFP(MF)) {
1917 Register FramePtrReg = MFI->getFrameOffsetReg();
1918 assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
1919 "Re-reserving spill slot for FP");
1920 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: FramePtrReg);
1921 }
1922
1923 if (TRI->hasBasePointer(MF)) {
1924 Register BasePtrReg = TRI->getBaseRegister();
1925 assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
1926 "Re-reserving spill slot for BP");
1927 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: BasePtrReg);
1928 }
1929}
1930
1931// Only report VGPRs to generic code.
1932void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1933 BitVector &SavedVGPRs,
1934 RegScavenger *RS) const {
1935 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1936
1937 // If this is a function with the amdgpu_cs_chain[_preserve] calling
1938 // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
1939 // we don't need to save and restore anything.
1940 if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
1941 return;
1942
1943 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs&: SavedVGPRs, RS);
1944
1945 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1946 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1947 const SIInstrInfo *TII = ST.getInstrInfo();
1948 bool NeedExecCopyReservedReg = false;
1949
1950 MachineInstr *ReturnMI = nullptr;
1951 for (MachineBasicBlock &MBB : MF) {
1952 for (MachineInstr &MI : MBB) {
1953 // TODO: Walking through all MBBs here would be a bad heuristic. Better
1954 // handle them elsewhere.
1955 if (TII->isWWMRegSpillOpcode(Opcode: MI.getOpcode()))
1956 NeedExecCopyReservedReg = true;
1957 else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
1958 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1959 MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
1960 (MFI->isChainFunction() &&
1961 TII->isChainCallOpcode(Opcode: MI.getOpcode()))) {
1962 // We expect all return to be the same size.
1963 assert(!ReturnMI ||
1964 (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
1965 count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
1966 ReturnMI = &MI;
1967 }
1968 }
1969 }
1970
1971 SmallVector<Register> SortedWWMVGPRs;
1972 for (Register Reg : MFI->getWWMReservedRegs()) {
1973 // The shift-back is needed only for the VGPRs used for SGPR spills and they
1974 // are of 32-bit size. SIPreAllocateWWMRegs pass can add tuples into WWM
1975 // reserved registers.
1976 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1977 if (TRI->getRegSizeInBits(RC: *RC) != 32)
1978 continue;
1979 SortedWWMVGPRs.push_back(Elt: Reg);
1980 }
1981
1982 sort(C&: SortedWWMVGPRs, Comp: std::greater<Register>());
1983 MFI->shiftWwmVGPRsToLowestRange(MF, WWMVGPRs&: SortedWWMVGPRs, SavedVGPRs);
1984
1985 if (MFI->isEntryFunction())
1986 return;
1987
1988 if (MFI->isWholeWaveFunction()) {
1989 // In practice, all the VGPRs are WWM registers, and we will need to save at
1990 // least their inactive lanes. Add them to WWMReservedRegs.
1991 assert(!NeedExecCopyReservedReg &&
1992 "Whole wave functions can use the reg mapped for their i1 argument");
1993
1994 unsigned NumArchVGPRs = ST.getAddressableNumArchVGPRs();
1995 for (MCRegister Reg :
1996 AMDGPU::VGPR_32RegClass.getRegisters().take_front(N: NumArchVGPRs))
1997 if (MF.getRegInfo().isPhysRegModified(PhysReg: Reg)) {
1998 MFI->reserveWWMRegister(Reg);
1999 MF.begin()->addLiveIn(PhysReg: Reg);
2000 }
2001 MF.begin()->sortUniqueLiveIns();
2002 }
2003
2004 // Remove any VGPRs used in the return value because these do not need to be saved.
2005 // This prevents CSR restore from clobbering return VGPRs.
2006 if (ReturnMI) {
2007 for (auto &Op : ReturnMI->operands()) {
2008 if (Op.isReg())
2009 SavedVGPRs.reset(Idx: Op.getReg());
2010 }
2011 }
2012
2013 // Create the stack objects for WWM registers now.
2014 for (Register Reg : MFI->getWWMReservedRegs()) {
2015 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
2016 MFI->allocateWWMSpill(MF, VGPR: Reg, Size: TRI->getSpillSize(RC: *RC),
2017 Alignment: TRI->getSpillAlign(RC: *RC));
2018 }
2019
2020 // Ignore the SGPRs the default implementation found.
2021 SavedVGPRs.clearBitsNotInMask(Mask: TRI->getAllVectorRegMask());
2022
2023 // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
2024 // In gfx908 there was do AGPR loads and stores and thus spilling also
2025 // require a temporary VGPR.
2026 if (!ST.hasGFX90AInsts())
2027 SavedVGPRs.clearBitsInMask(Mask: TRI->getAllAGPRRegMask());
2028
2029 determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
2030
2031 // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
2032 // allow the default insertion to handle them.
2033 for (auto &Reg : MFI->getWWMSpills())
2034 SavedVGPRs.reset(Idx: Reg.first);
2035}
2036
2037void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
2038 BitVector &SavedRegs,
2039 RegScavenger *RS) const {
2040 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
2041 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2042 if (MFI->isEntryFunction())
2043 return;
2044
2045 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2046 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2047
2048 // The SP is specifically managed and we don't want extra spills of it.
2049 SavedRegs.reset(Idx: MFI->getStackPtrOffsetReg());
2050
2051 const BitVector AllSavedRegs = SavedRegs;
2052 SavedRegs.clearBitsInMask(Mask: TRI->getAllVectorRegMask());
2053
2054 // We have to anticipate introducing CSR VGPR spills or spill of caller
2055 // save VGPR reserved for SGPR spills as we now always create stack entry
2056 // for it, if we don't have any stack objects already, since we require a FP
2057 // if there is a call and stack. We will allocate a VGPR for SGPR spills if
2058 // there are any SGPR spills. Whether they are CSR spills or otherwise.
2059 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
2060 const bool WillHaveFP =
2061 FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
2062
2063 // FP will be specially managed like SP.
2064 if (WillHaveFP || hasFP(MF))
2065 SavedRegs.reset(Idx: MFI->getFrameOffsetReg());
2066
2067 // Return address use with return instruction is hidden through the SI_RETURN
2068 // pseudo. Given that and since the IPRA computes actual register usage and
2069 // does not use CSR list, the clobbering of return address by function calls
2070 // (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's register
2071 // usage collection. This will ensure save/restore of return address happens
2072 // in those scenarios.
2073 const MachineRegisterInfo &MRI = MF.getRegInfo();
2074 Register RetAddrReg = TRI->getReturnAddressReg(MF);
2075 if (!MFI->isEntryFunction() &&
2076 (FrameInfo.hasCalls() || MRI.isPhysRegModified(PhysReg: RetAddrReg))) {
2077 SavedRegs.set(TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub0));
2078 SavedRegs.set(TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub1));
2079 }
2080}
2081
2082static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
2083 const GCNSubtarget &ST,
2084 std::vector<CalleeSavedInfo> &CSI) {
2085 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2086 MachineFrameInfo &MFI = MF.getFrameInfo();
2087 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2088
2089 assert(
2090 llvm::is_sorted(CSI,
2091 [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
2092 return A.getReg() < B.getReg();
2093 }) &&
2094 "Callee saved registers not sorted");
2095
2096 auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
2097 return !CSI.isSpilledToReg() &&
2098 TRI->getPhysRegBaseClass(Reg: CSI.getReg()) == &AMDGPU::VGPR_32RegClass &&
2099 !FuncInfo->isWWMReservedRegister(Reg: CSI.getReg());
2100 };
2101
2102 auto CSEnd = CSI.end();
2103 for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) {
2104 Register Reg = CSIt->getReg();
2105 if (!CanUseBlockOps(*CSIt))
2106 continue;
2107
2108 // Find all the regs that will fit in a 32-bit mask starting at the current
2109 // reg and build said mask. It should have 1 for every register that's
2110 // included, with the current register as the least significant bit.
2111 uint32_t Mask = 1;
2112 CSEnd = std::remove_if(
2113 first: CSIt + 1, last: CSEnd, pred: [&](const CalleeSavedInfo &CSI) -> bool {
2114 if (CanUseBlockOps(CSI) && CSI.getReg() < Reg + 32) {
2115 Mask |= 1 << (CSI.getReg() - Reg);
2116 return true;
2117 } else {
2118 return false;
2119 }
2120 });
2121
2122 const TargetRegisterClass *BlockRegClass = TRI->getRegClassForBlockOp(MF);
2123 Register RegBlock =
2124 TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC: BlockRegClass);
2125 if (!RegBlock) {
2126 // We couldn't find a super register for the block. This can happen if
2127 // the register we started with is too high (e.g. v232 if the maximum is
2128 // v255). We therefore try to get the last register block and figure out
2129 // the mask from there.
2130 Register LastBlockStart =
2131 AMDGPU::VGPR0 + alignDown(Value: Reg - AMDGPU::VGPR0, Align: 32);
2132 RegBlock =
2133 TRI->getMatchingSuperReg(Reg: LastBlockStart, SubIdx: AMDGPU::sub0, RC: BlockRegClass);
2134 assert(RegBlock && TRI->isSubRegister(RegBlock, Reg) &&
2135 "Couldn't find super register");
2136 int RegDelta = Reg - LastBlockStart;
2137 assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta &&
2138 "Bad shift amount");
2139 Mask <<= RegDelta;
2140 }
2141
2142 FuncInfo->setMaskForVGPRBlockOps(RegisterBlock: RegBlock, Mask);
2143
2144 // The stack objects can be a bit smaller than the register block if we know
2145 // some of the high bits of Mask are 0. This may happen often with calling
2146 // conventions where the caller and callee-saved VGPRs are interleaved at
2147 // a small boundary (e.g. 8 or 16).
2148 int UnusedBits = llvm::countl_zero(Val: Mask);
2149 unsigned BlockSize = TRI->getSpillSize(RC: *BlockRegClass) - UnusedBits * 4;
2150 int FrameIdx =
2151 MFI.CreateStackObject(Size: BlockSize, Alignment: TRI->getSpillAlign(RC: *BlockRegClass),
2152 /*isSpillSlot=*/true);
2153 MFI.setIsCalleeSavedObjectIndex(ObjectIdx: FrameIdx, IsCalleeSaved: true);
2154
2155 CSIt->setFrameIdx(FrameIdx);
2156 CSIt->setReg(RegBlock);
2157 }
2158 CSI.erase(first: CSEnd, last: CSI.end());
2159}
2160
2161bool SIFrameLowering::assignCalleeSavedSpillSlots(
2162 MachineFunction &MF, const TargetRegisterInfo *TRI,
2163 std::vector<CalleeSavedInfo> &CSI) const {
2164 if (CSI.empty())
2165 return true; // Early exit if no callee saved registers are modified!
2166
2167 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2168 bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR();
2169
2170 if (UseVGPRBlocks)
2171 assignSlotsUsingVGPRBlocks(MF, ST, CSI);
2172
2173 return assignCalleeSavedSpillSlotsImpl(MF, TRI, CSI) || UseVGPRBlocks;
2174}
2175
2176bool SIFrameLowering::assignCalleeSavedSpillSlotsImpl(
2177 MachineFunction &MF, const TargetRegisterInfo *TRI,
2178 std::vector<CalleeSavedInfo> &CSI) const {
2179 if (CSI.empty())
2180 return true; // Early exit if no callee saved registers are modified!
2181
2182 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2183 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2184 const SIRegisterInfo *RI = ST.getRegisterInfo();
2185 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
2186 Register BasePtrReg = RI->getBaseRegister();
2187 Register SGPRForFPSaveRestoreCopy =
2188 FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
2189 Register SGPRForBPSaveRestoreCopy =
2190 FuncInfo->getScratchSGPRCopyDstReg(Reg: BasePtrReg);
2191 if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
2192 return false;
2193
2194 unsigned NumModifiedRegs = 0;
2195
2196 if (SGPRForFPSaveRestoreCopy)
2197 NumModifiedRegs++;
2198 if (SGPRForBPSaveRestoreCopy)
2199 NumModifiedRegs++;
2200
2201 for (auto &CS : CSI) {
2202 if (CS.getReg() == FramePtrReg.asMCReg() && SGPRForFPSaveRestoreCopy) {
2203 CS.setDstReg(SGPRForFPSaveRestoreCopy);
2204 if (--NumModifiedRegs)
2205 break;
2206 } else if (CS.getReg() == BasePtrReg.asMCReg() &&
2207 SGPRForBPSaveRestoreCopy) {
2208 CS.setDstReg(SGPRForBPSaveRestoreCopy);
2209 if (--NumModifiedRegs)
2210 break;
2211 }
2212 }
2213
2214 return false;
2215}
2216
2217bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
2218 const MachineFunction &MF) const {
2219
2220 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2221 const MachineFrameInfo &MFI = MF.getFrameInfo();
2222 const SIInstrInfo *TII = ST.getInstrInfo();
2223 uint64_t EstStackSize = MFI.estimateStackSize(MF);
2224 uint64_t MaxOffset = EstStackSize - 1;
2225
2226 // We need the emergency stack slots to be allocated in range of the
2227 // MUBUF/flat scratch immediate offset from the base register, so assign these
2228 // first at the incoming SP position.
2229 //
2230 // TODO: We could try sorting the objects to find a hole in the first bytes
2231 // rather than allocating as close to possible. This could save a lot of space
2232 // on frames with alignment requirements.
2233 if (ST.hasFlatScratchEnabled()) {
2234 if (TII->isLegalFLATOffset(Offset: MaxOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
2235 FlatVariant: AMDGPU::FlatAddrSpace::FlatScratch))
2236 return false;
2237 } else {
2238 if (TII->isLegalMUBUFImmOffset(Imm: MaxOffset))
2239 return false;
2240 }
2241
2242 return true;
2243}
2244
2245/// Return the set of all root registers of regunits live-in to @p MBB.
2246///
2247/// Intended to avoid using the expensive @c MCRegAliasIterator when deciding
2248/// if a register to be spilled is already live-in (see @c isAnyRootLiveIn).
2249static SparseBitVector<> buildLiveInRoots(const MachineBasicBlock &MBB,
2250 const SIRegisterInfo &TRI) {
2251 SparseBitVector<> LiveInRoots;
2252 for (const auto &LI : MBB.liveins()) {
2253 for (MCRegUnitMaskIterator MI(LI.PhysReg, &TRI); MI.isValid(); ++MI) {
2254 auto [Unit, UnitLaneMask] = *MI;
2255 if ((LI.LaneMask & UnitLaneMask).none())
2256 continue;
2257 for (MCRegUnitRootIterator RI(Unit, &TRI); RI.isValid(); ++RI)
2258 LiveInRoots.set(*RI);
2259 }
2260 }
2261 return LiveInRoots;
2262}
2263
2264/// Returns true iff any root of @p Reg is in @p LiveInRoots
2265/// (see @c buildLiveInRoots).
2266static bool isAnyRootLiveIn(const SparseBitVector<> &LiveInRoots,
2267 const SIRegisterInfo &TRI, MCRegister Reg) {
2268 for (MCRegUnitIterator UI(Reg, &TRI); UI.isValid(); ++UI) {
2269 for (MCRegUnitRootIterator RI(*UI, &TRI); RI.isValid(); ++RI) {
2270 if (LiveInRoots.test(Idx: *RI))
2271 return true;
2272 }
2273 }
2274 return false;
2275}
2276
2277void SIFrameLowering::spillCalleeSavedRegisterWithoutBlockOps(
2278 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2279 const CalleeSavedInfo &CS, const SIInstrInfo *TII,
2280 const SIRegisterInfo &TRI,
2281 const std::optional<SparseBitVector<>> &LiveInRoots) const {
2282 MCRegister Reg = CS.getReg();
2283
2284 // We assume a sortUniqueLiveIns later
2285 MBB.addLiveIn(PhysReg: Reg);
2286
2287 if (CS.isSpilledToReg()) {
2288 BuildMI(BB&: MBB, I: MI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: CS.getDstReg())
2289 .addReg(RegNo: Reg, Flags: getKillRegState(B: true));
2290 } else {
2291 const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
2292 bool IsKill = true;
2293 // If this value was already livein, we probably have a direct use of
2294 // the incoming register value, so don't kill at the spill point. This
2295 // happens since we pass some special inputs (workgroup IDs) in the
2296 // callee saved range.
2297 if (LiveInRoots)
2298 IsKill = !isAnyRootLiveIn(LiveInRoots: *LiveInRoots, TRI, Reg);
2299 TII->storeRegToStackSlotCFI(MBB, MI, SrcReg: Reg, isKill: IsKill, FrameIndex: CS.getFrameIdx(), RC);
2300 }
2301}
2302
2303bool SIFrameLowering::spillCalleeSavedRegisters(
2304 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2305 ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *OrigTRI) const {
2306 auto &TRI = *static_cast<const SIRegisterInfo *>(OrigTRI);
2307 MachineFunction *MF = MBB.getParent();
2308 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2309 const SIInstrInfo *TII = ST.getInstrInfo();
2310
2311 std::optional<SparseBitVector<>> LiveInRoots;
2312 if (MBB.getParent()->getRegInfo().tracksLiveness())
2313 LiveInRoots = buildLiveInRoots(MBB, TRI);
2314
2315 if (!ST.useVGPRBlockOpsForCSR()) {
2316 for (const CalleeSavedInfo &CS : CSI)
2317 spillCalleeSavedRegisterWithoutBlockOps(MBB, MI, CS, TII, TRI,
2318 LiveInRoots);
2319 if (LiveInRoots)
2320 MBB.sortUniqueLiveIns();
2321 return true;
2322 }
2323
2324 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2325 SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
2326
2327 const TargetRegisterClass *BlockRegClass = TRI.getRegClassForBlockOp(MF: *MF);
2328 for (const CalleeSavedInfo &CS : CSI) {
2329 Register Reg = CS.getReg();
2330 if (!BlockRegClass->contains(Reg) ||
2331 !FuncInfo->hasMaskForVGPRBlockOps(RegisterBlock: Reg)) {
2332 spillCalleeSavedRegisterWithoutBlockOps(MBB, MI, CS, TII, TRI,
2333 LiveInRoots);
2334 continue;
2335 }
2336
2337 // Build a scratch block store.
2338 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(RegisterBlock: Reg);
2339 int FrameIndex = CS.getFrameIdx();
2340 MachinePointerInfo PtrInfo =
2341 MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
2342 MachineMemOperand *MMO =
2343 MF->getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOStore,
2344 Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
2345 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
2346
2347 BuildMI(BB&: MBB, I: MI, MIMD: MI->getDebugLoc(),
2348 MCID: TII->get(Opcode: AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE))
2349 .addReg(RegNo: Reg, Flags: getKillRegState(B: false))
2350 .addFrameIndex(Idx: FrameIndex)
2351 .addReg(RegNo: FuncInfo->getStackPtrOffsetReg())
2352 .addImm(Val: 0)
2353 .addImm(Val: Mask)
2354 .addMemOperand(MMO);
2355
2356 FuncInfo->setHasSpilledVGPRs();
2357
2358 // Add the register to the liveins. This is necessary because if any of the
2359 // VGPRs in the register block is reserved (e.g. if it's a WWM register),
2360 // then the whole block will be marked as reserved and `updateLiveness` will
2361 // skip it.
2362 if (LiveInRoots)
2363 MBB.addLiveIn(PhysReg: Reg);
2364 }
2365 if (LiveInRoots)
2366 MBB.sortUniqueLiveIns();
2367
2368 return true;
2369}
2370
2371bool SIFrameLowering::restoreCalleeSavedRegisters(
2372 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2373 MutableArrayRef<CalleeSavedInfo> CSI,
2374 const TargetRegisterInfo *OrigTRI) const {
2375 auto &TRI = *static_cast<const SIRegisterInfo *>(OrigTRI);
2376 MachineFunction *MF = MBB.getParent();
2377 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2378 if (!ST.useVGPRBlockOpsForCSR())
2379 return false;
2380
2381 SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
2382 MachineFrameInfo &MFI = MF->getFrameInfo();
2383 const SIInstrInfo *TII = ST.getInstrInfo();
2384 const TargetRegisterClass *BlockRegClass = TRI.getRegClassForBlockOp(MF: *MF);
2385 for (const CalleeSavedInfo &CS : reverse(C&: CSI)) {
2386 Register Reg = CS.getReg();
2387 if (!BlockRegClass->contains(Reg) ||
2388 !FuncInfo->hasMaskForVGPRBlockOps(RegisterBlock: Reg)) {
2389 restoreCalleeSavedRegister(MBB, MI, CS, TII, TRI: &TRI);
2390 continue;
2391 }
2392
2393 // Build a scratch block load.
2394 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(RegisterBlock: Reg);
2395 int FrameIndex = CS.getFrameIdx();
2396 MachinePointerInfo PtrInfo =
2397 MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
2398 MachineMemOperand *MMO = MF->getMachineMemOperand(
2399 PtrInfo, F: MachineMemOperand::MOLoad, Size: MFI.getObjectSize(ObjectIdx: FrameIndex),
2400 BaseAlignment: MFI.getObjectAlign(ObjectIdx: FrameIndex));
2401
2402 auto MIB = BuildMI(BB&: MBB, I: MI, MIMD: MI->getDebugLoc(),
2403 MCID: TII->get(Opcode: AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), DestReg: Reg)
2404 .addFrameIndex(Idx: FrameIndex)
2405 .addReg(RegNo: FuncInfo->getStackPtrOffsetReg())
2406 .addImm(Val: 0)
2407 .addImm(Val: Mask)
2408 .addMemOperand(MMO);
2409 TRI.addImplicitUsesForBlockCSRLoad(MIB, BlockReg: Reg);
2410
2411 // Add the register to the liveins. This is necessary because if any of the
2412 // VGPRs in the register block is reserved (e.g. if it's a WWM register),
2413 // then the whole block will be marked as reserved and `updateLiveness` will
2414 // skip it.
2415 MBB.addLiveIn(PhysReg: Reg);
2416 }
2417
2418 MBB.sortUniqueLiveIns();
2419 return true;
2420}
2421
2422MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
2423 MachineFunction &MF,
2424 MachineBasicBlock &MBB,
2425 MachineBasicBlock::iterator I) const {
2426 int64_t Amount = I->getOperand(i: 0).getImm();
2427 if (Amount == 0)
2428 return MBB.erase(I);
2429
2430 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2431 const SIInstrInfo *TII = ST.getInstrInfo();
2432 const DebugLoc &DL = I->getDebugLoc();
2433 unsigned Opc = I->getOpcode();
2434 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
2435 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(i: 1).getImm() : 0;
2436
2437 if (!hasReservedCallFrame(MF)) {
2438 Amount = alignTo(Size: Amount, A: getStackAlign());
2439 assert(isUInt<32>(Amount) && "exceeded stack address space size");
2440 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2441 Register SPReg = MFI->getStackPtrOffsetReg();
2442
2443 Amount *= getScratchScaleFactor(ST);
2444 if (IsDestroy)
2445 Amount = -Amount;
2446 auto Add = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SPReg)
2447 .addReg(RegNo: SPReg)
2448 .addImm(Val: Amount);
2449 Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
2450 } else if (CalleePopAmount != 0) {
2451 llvm_unreachable("is this used?");
2452 }
2453
2454 return MBB.erase(I);
2455}
2456
2457/// Returns true if the frame will require a reference to the stack pointer.
2458///
2459/// This is the set of conditions common to setting up the stack pointer in a
2460/// kernel, and for using a frame pointer in a callable function.
2461///
2462/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
2463/// references SP.
2464static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
2465 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
2466}
2467
2468// The FP for kernels is always known 0, so we never really need to setup an
2469// explicit register for it. However, DisableFramePointerElim will force us to
2470// use a register for it.
2471bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
2472 const MachineFrameInfo &MFI = MF.getFrameInfo();
2473
2474 // For entry functions we can use an immediate offset in most cases,
2475 // so the presence of calls doesn't imply we need a distinct frame pointer.
2476 if (MFI.hasCalls() &&
2477 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
2478 // All offsets are unsigned, so need to be addressed in the same direction
2479 // as stack growth.
2480
2481 // FIXME: This function is pretty broken, since it can be called before the
2482 // frame layout is determined or CSR spills are inserted.
2483 return MFI.getStackSize() != 0;
2484 }
2485
2486 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
2487 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
2488 MF) ||
2489 mayReserveScratchForCWSR(MF) ||
2490 MF.getTarget().Options.DisableFramePointerElim(MF);
2491}
2492
2493bool SIFrameLowering::mayReserveScratchForCWSR(
2494 const MachineFunction &MF) const {
2495 return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
2496 AMDGPU::isEntryFunctionCC(CC: MF.getFunction().getCallingConv()) &&
2497 AMDGPU::isCompute(CC: MF.getFunction().getCallingConv());
2498}
2499
2500// This is essentially a reduced version of hasFP for entry functions. Since the
2501// stack pointer is known 0 on entry to kernels, we never really need an FP
2502// register. We may need to initialize the stack pointer depending on the frame
2503// properties, which logically overlaps many of the cases where an ordinary
2504// function would require an FP.
2505bool SIFrameLowering::requiresStackPointerReference(
2506 const MachineFunction &MF) const {
2507 // Callable functions always require a stack pointer reference.
2508 assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
2509 "only expected to call this for entry points functions");
2510
2511 const MachineFrameInfo &MFI = MF.getFrameInfo();
2512
2513 // Entry points ordinarily don't need to initialize SP. We have to set it up
2514 // for callees if there are any. Also note tail calls are only possible via
2515 // the `llvm.amdgcn.cs.chain` intrinsic.
2516 if (MFI.hasCalls() || MFI.hasTailCall())
2517 return true;
2518
2519 // We still need to initialize the SP if we're doing anything weird that
2520 // references the SP, like variable sized stack objects.
2521 return frameTriviallyRequiresSP(MFI);
2522}
2523
2524MachineInstr *SIFrameLowering::buildCFI(MachineBasicBlock &MBB,
2525 MachineBasicBlock::iterator MBBI,
2526 const DebugLoc &DL,
2527 const MCCFIInstruction &CFIInst,
2528 MachineInstr::MIFlag Flag) const {
2529 MachineFunction &MF = *MBB.getParent();
2530 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2531 return BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::CFI_INSTRUCTION))
2532 .addCFIIndex(CFIIndex: MF.addFrameInst(Inst: CFIInst))
2533 .setMIFlag(Flag);
2534}
2535
2536MachineInstr *SIFrameLowering::buildCFIForVRegToVRegSpill(
2537 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
2538 const DebugLoc &DL, const MCRegister Reg, const MCRegister RegCopy) const {
2539 MachineFunction &MF = *MBB.getParent();
2540 const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
2541 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2542
2543 MCRegister MaskReg = MCRI.getDwarfRegNum(
2544 Reg: ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC, isEH: false);
2545 auto CFIInst = MCCFIInstruction::createLLVMVectorRegisterMask(
2546 L: nullptr, Register: MCRI.getDwarfRegNum(Reg, isEH: false),
2547 SpillRegister: MCRI.getDwarfRegNum(Reg: RegCopy, isEH: false), SpillRegisterLaneSizeInBits: VGPRLaneBitSize, MaskRegister: MaskReg,
2548 MaskRegisterSizeInBits: ST.getWavefrontSize());
2549 return buildCFI(MBB, MBBI, DL, CFIInst: std::move(CFIInst));
2550}
2551
2552MachineInstr *SIFrameLowering::buildCFIForSGPRToVGPRSpill(
2553 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
2554 const DebugLoc &DL, const MCRegister SGPR, const MCRegister VGPR,
2555 const int Lane) const {
2556 const MachineFunction &MF = *MBB.getParent();
2557 const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
2558
2559 int DwarfSGPR = MCRI.getDwarfRegNum(Reg: SGPR, isEH: false);
2560 int DwarfVGPR = MCRI.getDwarfRegNum(Reg: VGPR, isEH: false);
2561 assert(DwarfSGPR != -1 && DwarfVGPR != -1);
2562 assert(Lane != -1 && "Expected a lane to be present");
2563
2564 // Build a CFI instruction that represents a SGPR spilled to a single lane of
2565 // a VGPR.
2566 MCCFIInstruction::VectorRegisterWithLane VR{.Register: unsigned(DwarfVGPR),
2567 .Lane: unsigned(Lane), .SizeInBits: VGPRLaneBitSize};
2568 auto CFIInst =
2569 MCCFIInstruction::createLLVMVectorRegisters(L: nullptr, Register: DwarfSGPR, VectorRegisters: {VR});
2570 return buildCFI(MBB, MBBI, DL, CFIInst: std::move(CFIInst));
2571}
2572
2573MachineInstr *SIFrameLowering::buildCFIForSGPRToVGPRSpill(
2574 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
2575 const DebugLoc &DL, MCRegister SGPR,
2576 ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills) const {
2577 if (VGPRSpills.size() == 1u)
2578 return buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, SGPR, VGPR: VGPRSpills[0].VGPR,
2579 Lane: VGPRSpills[0].Lane);
2580 const MachineFunction &MF = *MBB.getParent();
2581 const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
2582
2583 int DwarfSGPR = MCRI.getDwarfRegNum(Reg: SGPR, isEH: false);
2584 assert(DwarfSGPR != -1);
2585
2586 // Build a CFI instruction that represents a SGPR spilled to multiple lanes of
2587 // multiple VGPRs.
2588
2589 SmallVector<MCCFIInstruction::VectorRegisterWithLane> VGPRs;
2590 for (SIRegisterInfo::SpilledReg Spill : VGPRSpills) {
2591 int DwarfVGPR = MCRI.getDwarfRegNum(Reg: Spill.VGPR, isEH: false);
2592 assert(DwarfVGPR != -1);
2593 assert(Spill.hasLane() && "Expected a lane to be present");
2594 VGPRs.push_back(
2595 Elt: {.Register: unsigned(DwarfVGPR), .Lane: unsigned(Spill.Lane), .SizeInBits: VGPRLaneBitSize});
2596 }
2597
2598 auto CFIInst = MCCFIInstruction::createLLVMVectorRegisters(L: nullptr, Register: DwarfSGPR,
2599 VectorRegisters: std::move(VGPRs));
2600 return buildCFI(MBB, MBBI, DL, CFIInst: std::move(CFIInst));
2601}
2602
2603MachineInstr *SIFrameLowering::buildCFIForSGPRToVMEMSpill(
2604 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
2605 const DebugLoc &DL, MCRegister SGPR, int64_t Offset) const {
2606 MachineFunction &MF = *MBB.getParent();
2607 const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
2608 return buildCFI(MBB, MBBI, DL,
2609 CFIInst: llvm::MCCFIInstruction::createOffset(
2610 L: nullptr, Register: MCRI.getDwarfRegNum(Reg: SGPR, isEH: false), Offset));
2611}
2612
2613MachineInstr *SIFrameLowering::buildCFIForVGPRToVMEMSpill(
2614 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
2615 const DebugLoc &DL, MCRegister VGPR, int64_t Offset) const {
2616 const MachineFunction &MF = *MBB.getParent();
2617 const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
2618 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2619
2620 int DwarfVGPR = MCRI.getDwarfRegNum(Reg: VGPR, isEH: false);
2621 assert(DwarfVGPR != -1);
2622
2623 MCRegister MaskReg = MCRI.getDwarfRegNum(
2624 Reg: ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC, isEH: false);
2625 auto CFIInst = MCCFIInstruction::createLLVMVectorOffset(
2626 L: nullptr, Register: DwarfVGPR, RegisterSizeInBits: VGPRLaneBitSize, MaskRegister: MaskReg, MaskRegisterSizeInBits: ST.getWavefrontSize(),
2627 Offset);
2628 return buildCFI(MBB, MBBI, DL, CFIInst: std::move(CFIInst));
2629}
2630
2631MachineInstr *SIFrameLowering::buildCFIForRegToSGPRPairSpill(
2632 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
2633 const DebugLoc &DL, const MCRegister Reg, const MCRegister SGPRPair) const {
2634 const MachineFunction &MF = *MBB.getParent();
2635 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2636 const SIRegisterInfo &TRI = *ST.getRegisterInfo();
2637
2638 MCRegister SGPR0 = TRI.getSubReg(Reg: SGPRPair, Idx: AMDGPU::sub0);
2639 MCRegister SGPR1 = TRI.getSubReg(Reg: SGPRPair, Idx: AMDGPU::sub1);
2640
2641 int DwarfReg = TRI.getDwarfRegNum(Reg, isEH: false);
2642 int DwarfSGPR0 = TRI.getDwarfRegNum(Reg: SGPR0, isEH: false);
2643 int DwarfSGPR1 = TRI.getDwarfRegNum(Reg: SGPR1, isEH: false);
2644 assert(DwarfReg != -1 && DwarfSGPR0 != -1 && DwarfSGPR1 != -1);
2645
2646 auto CFIInst = MCCFIInstruction::createLLVMRegisterPair(
2647 L: nullptr, Register: DwarfReg, R1: DwarfSGPR0, R1SizeInBits: SGPRBitSize, R2: DwarfSGPR1, R2SizeInBits: SGPRBitSize);
2648 return buildCFI(MBB, MBBI, DL, CFIInst: std::move(CFIInst));
2649}
2650
2651MachineInstr *SIFrameLowering::buildCFIForSameValue(
2652 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
2653 const DebugLoc &DL, MCRegister Reg) const {
2654 const MachineFunction &MF = *MBB.getParent();
2655 const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
2656 int DwarfReg = MCRI.getDwarfRegNum(Reg, /*isEH=*/false);
2657 auto CFIInst = MCCFIInstruction::createSameValue(L: nullptr, Register: DwarfReg);
2658 return buildCFI(MBB, MBBI, DL, CFIInst: std::move(CFIInst));
2659}
2660