1//===----------------------- SIFrameLowering.cpp --------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8
9#include "SIFrameLowering.h"
10#include "AMDGPU.h"
11#include "AMDGPULaneMaskUtils.h"
12#include "GCNSubtarget.h"
13#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
14#include "SIMachineFunctionInfo.h"
15#include "SISpillUtils.h"
16#include "llvm/CodeGen/LiveRegUnits.h"
17#include "llvm/CodeGen/MachineFrameInfo.h"
18#include "llvm/CodeGen/RegisterScavenging.h"
19#include "llvm/Target/TargetMachine.h"
20
21using namespace llvm;
22
23#define DEBUG_TYPE "frame-info"
24
// Command-line toggle: allow spilling VGPRs into AGPRs instead of scratch
// memory. Hidden from normal -help output; enabled by default.
static cl::opt<bool> EnableSpillVGPRToAGPR(
    "amdgpu-spill-vgpr-to-agpr",
    cl::desc("Enable spilling VGPRs to AGPRs"),
    cl::ReallyHidden,
    cl::init(Val: true));
30
31// Find a register matching \p RC from \p LiveUnits which is unused and
32// available throughout the function. On failure, returns AMDGPU::NoRegister.
33// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
34// MCRegisters. This should reduce the number of iterations and avoid redundant
35// checking.
36static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
37 const LiveRegUnits &LiveUnits,
38 const TargetRegisterClass &RC) {
39 for (MCRegister Reg : RC) {
40 if (!MRI.isPhysRegUsed(PhysReg: Reg) && LiveUnits.available(Reg) &&
41 !MRI.isReserved(PhysReg: Reg))
42 return Reg;
43 }
44 return MCRegister();
45}
46
47// Find a scratch register that we can use in the prologue. We avoid using
48// callee-save registers since they may appear to be free when this is called
49// from canUseAsPrologue (during shrink wrapping), but then no longer be free
50// when this is called from emitPrologue.
51static MCRegister findScratchNonCalleeSaveRegister(
52 MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
53 const TargetRegisterClass &RC, bool Unused = false) {
54 // Mark callee saved registers as used so we will not choose them.
55 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
56 for (unsigned i = 0; CSRegs[i]; ++i)
57 LiveUnits.addReg(Reg: CSRegs[i]);
58
59 // We are looking for a register that can be used throughout the entire
60 // function, so any use is unacceptable.
61 if (Unused)
62 return findUnusedRegister(MRI, LiveUnits, RC);
63
64 for (MCRegister Reg : RC) {
65 if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg))
66 return Reg;
67 }
68
69 return MCRegister();
70}
71
/// Query target location for spilling SGPRs
/// \p IncludeScratchCopy : Also look for free scratch SGPRs
///
/// Chooses, in order of preference, how the prolog/epilog will preserve
/// \p SGPR, and records the decision in SIMachineFunctionInfo:
///   1. copy into an unused scratch SGPR (only if \p IncludeScratchCopy),
///   2. spill into a free physical-VGPR lane,
///   3. spill to a scratch-memory stack slot.
static void getVGPRSpillLaneOrTempRegister(
    MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
    const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
    bool IncludeScratchCopy = true) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);

  // We need to save and restore the given SGPR.

  Register ScratchSGPR;
  // 1: Try to save the given register into an unused scratch SGPR. The
  // LiveUnits should have all the callee saved registers marked as used. For
  // certain cases we skip copy to scratch SGPR.
  if (IncludeScratchCopy)
    ScratchSGPR = findUnusedRegister(MRI&: MF.getRegInfo(), LiveUnits, RC);

  if (!ScratchSGPR) {
    // No free SGPR: create an SGPRSpill-stack-ID frame index and try to
    // allocate a physical-VGPR lane for it.
    int FI = FrameInfo.CreateStackObject(Size, Alignment, isSpillSlot: true, Alloca: nullptr,
                                         ID: TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() &&
        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
                                         /*IsPrologEpilog=*/true)) {
      // 2: There's no free lane to spill, and no free register to save the
      // SGPR, so we're forced to take another VGPR to use for the spill.
      MFI->addToPrologEpilogSGPRSpills(
          Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo(
                    SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

      LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
                 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                        << '\n';);
    } else {
      // Remove dead <FI> index
      MF.getFrameInfo().RemoveStackObject(ObjectIdx: FI);
      // 3: If all else fails, spill the register to memory.
      FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
      MFI->addToPrologEpilogSGPRSpills(
          Reg: SGPR,
          SI: PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
                        << printReg(SGPR, TRI) << '\n');
    }
  } else {
    // Record the copy, and mark the scratch SGPR live so later queries during
    // this prolog cannot hand out the same register again.
    MFI->addToPrologEpilogSGPRSpills(
        Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo(
                  SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
    LiveUnits.addReg(Reg: ScratchSGPR);
    LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
                      << printReg(ScratchSGPR, TRI) << '\n');
  }
}
132
// We need to specially emit stack operations here because a different frame
// register is used than in the rest of the function, as getFrameRegister would
// use.
//
// Stores VGPR \p SpillReg into frame index \p FI, addressed via \p FrameReg
// plus \p DwordOff, using flat-scratch or buffer stores depending on the
// subtarget.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LiveRegUnits &LiveUnits, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             Register SpillReg, int FI, Register FrameReg,
                             int64_t DwordOff = 0) {
  unsigned Opc = ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                            : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FI),
      BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI));
  // Temporarily mark SpillReg live so buildSpillLoadStore cannot pick it when
  // it needs a temporary register of its own.
  LiveUnits.addReg(Reg: SpillReg);
  bool IsKill = !MBB.isLiveIn(Reg: SpillReg);
  TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: IsKill, ScratchOffsetReg: FrameReg,
                          InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits);
  // If the store killed the value, the register is free again afterwards.
  if (IsKill)
    LiveUnits.removeReg(Reg: SpillReg);
}
158
// Epilog counterpart of buildPrologSpill: reloads VGPR \p SpillReg from frame
// index \p FI, addressed via \p FrameReg plus \p DwordOff, using flat-scratch
// or buffer loads depending on the subtarget.
static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LiveRegUnits &LiveUnits, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, Register SpillReg, int FI,
                               Register FrameReg, int64_t DwordOff = 0) {
  unsigned Opc = ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                            : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FI),
      BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI));
  TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: false, ScratchOffsetReg: FrameReg,
                          InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits);
}
178
// Materialize the 64-bit global information table (GIT) pointer into
// \p TargetReg. The high half comes from the amdgpu-git-ptr-high value when it
// is set (anything but the 0xffffffff sentinel), otherwise from S_GETPC_B64;
// the low half is copied from the preloaded GIT-pointer-low SGPR, which is
// registered as a live-in.
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(Opcode: AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(Reg: TargetReg, Idx: AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(Reg: TargetReg, Idx: AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    // The implicit-def of the full TargetReg keeps liveness of the 64-bit
    // tuple correct while only the high half is written here.
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetHi)
        .addImm(Val: MFI->getGITPtrHigh())
        .addReg(RegNo: TargetReg, Flags: RegState::ImplicitDefine);
  } else {
    const MCInstrDesc &GetPC64 = TII->get(Opcode: AMDGPU::S_GETPC_B64_pseudo);
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: GetPC64, DestReg: TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(MF: *MF);
  MF->getRegInfo().addLiveIn(Reg: GitPtrLo);
  MBB.addLiveIn(PhysReg: GitPtrLo);
  BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetLo)
      .addReg(RegNo: GitPtrLo);
}
203
204static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
205 const SIMachineFunctionInfo *FuncInfo,
206 MachineFunction &MF, MachineBasicBlock &MBB,
207 MachineBasicBlock::iterator MBBI, bool IsProlog) {
208 if (LiveUnits.empty()) {
209 LiveUnits.init(TRI);
210 if (IsProlog) {
211 LiveUnits.addLiveIns(MBB);
212 } else {
213 // In epilog.
214 LiveUnits.addLiveOuts(MBB);
215 LiveUnits.stepBackward(MI: *MBBI);
216 }
217 }
218}
219
220namespace llvm {
221
// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
// BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
class PrologEpilogSGPRSpillBuilder {
  MachineBasicBlock::iterator MI; // Insertion point for emitted instructions.
  MachineBasicBlock &MBB;
  MachineFunction &MF;
  const GCNSubtarget &ST;
  MachineFrameInfo &MFI;
  SIMachineFunctionInfo *FuncInfo;
  const SIInstrInfo *TII;
  const SIRegisterInfo &TRI;
  Register SuperReg; // The SGPR (possibly a multi-dword tuple) being handled.
  const PrologEpilogSGPRSaveRestoreInfo SI; // Chosen save/restore method.
  LiveRegUnits &LiveUnits;
  const DebugLoc &DL;
  Register FrameReg;
  ArrayRef<int16_t> SplitParts; // 32-bit sub-register indices of SuperReg.
  unsigned NumSubRegs;          // 1 when SuperReg is a single 32-bit register.
  unsigned EltSize = 4;         // Bytes per spilled piece.

  // Save each 32-bit piece of SuperReg to memory at frame index FI by copying
  // it through a scratch VGPR and storing it to scratch.
  void saveToMemory(const int FI) const {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, RC: AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error(reason: "failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));
      BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: TmpVGPR)
          .addReg(RegNo: SubReg);

      buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL, SpillReg: TmpVGPR,
                       FI, FrameReg, DwordOff);
      DwordOff += 4;
    }
  }

  // Save each 32-bit piece of SuperReg into the VGPR lanes previously
  // allocated for frame index FI.
  void saveToVGPRLane(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));
      // The Undef use of the destination VGPR avoids a false dependency on
      // its previous contents.
      BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_SPILL_S32_TO_VGPR),
              DestReg: Spill[I].VGPR)
          .addReg(RegNo: SubReg)
          .addImm(Val: Spill[I].Lane)
          .addReg(RegNo: Spill[I].VGPR, Flags: RegState::Undef);
    }
  }

  // Save SuperReg with a plain copy into a free scratch SGPR.
  void copyToScratchSGPR(Register DstReg) const {
    BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: DstReg)
        .addReg(RegNo: SuperReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // Reload each 32-bit piece of SuperReg from memory at frame index FI via a
  // scratch VGPR and a readfirstlane back into the SGPR.
  void restoreFromMemory(const int FI) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ false);
    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, RC: AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error(reason: "failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));

      buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL,
                         SpillReg: TmpVGPR, FI, FrameReg, DwordOff);
      assert(SubReg.isPhysical());

      BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: SubReg)
          .addReg(RegNo: TmpVGPR, Flags: RegState::Kill);
      DwordOff += 4;
    }
  }

  // Reload each 32-bit piece of SuperReg from the VGPR lanes allocated for
  // frame index FI.
  void restoreFromVGPRLane(const int FI) {
    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));
      BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_RESTORE_S32_FROM_VGPR), DestReg: SubReg)
          .addReg(RegNo: Spill[I].VGPR)
          .addImm(Val: Spill[I].Lane);
    }
  }

  // Restore SuperReg with a plain copy from the scratch SGPR it was saved to.
  void copyFromScratchSGPR(Register SrcReg) const {
    BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SuperReg)
        .addReg(RegNo: SrcReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

public:
  PrologEpilogSGPRSpillBuilder(Register Reg,
                               const PrologEpilogSGPRSaveRestoreInfo SI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, const SIInstrInfo *TII,
                               const SIRegisterInfo &TRI,
                               LiveRegUnits &LiveUnits, Register FrameReg)
      : MI(MI), MBB(MBB), MF(*MBB.getParent()),
        ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
        FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
        FrameReg(FrameReg) {
    // Split the (possibly multi-dword) register into 32-bit pieces; an empty
    // split list means Reg is already a single 32-bit register.
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg: SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  }

  // Emit the save code selected by SI's kind.
  void save() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return saveToMemory(FI: SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return saveToVGPRLane(FI: SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyToScratchSGPR(DstReg: SI.getReg());
    }
  }

  // Emit the restore code matching the save method selected by SI's kind.
  void restore() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return restoreFromMemory(FI: SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return restoreFromVGPRLane(FI: SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyFromScratchSGPR(SrcReg: SI.getReg());
    }
  }
};
382
383} // namespace llvm
384
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
//
// Computes the flat-scratch base for this wave and installs it in the
// FLAT_SCR registers (or via S_SETREG on GFX10+). The (lo, hi) source pair
// is either loaded from the GIT scratch descriptor (PAL) or taken from the
// preloaded flat-scratch-init argument.
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LiveRegUnits LiveUnits;
    LiveUnits.init(TRI: *TRI);
    LiveUnits.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    // Skip over the SGPR pairs occupied by preloaded arguments.
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        N: std::min(a: static_cast<unsigned>(AllSGPR64s.size()), b: NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      // Must not overlap the GIT pointer low register, which is still needed.
      if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg) &&
          MRI.isAllocatable(PhysReg: Reg) && !TRI->isSubRegisterEq(RegA: Reg, RegB: GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(Reg: FlatScrInit, Idx: AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(Reg: FlatScrInit, Idx: AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, TargetReg: FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        Size: 8, BaseAlignment: Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST: Subtarget, ByteOffset: Offset);
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: FlatScrInit)
        .addReg(RegNo: FlatScrInit)
        .addImm(Val: EncodedOffset) // offset
        .addImm(Val: 0)             // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(Opcode: AMDGPU::S_AND_B32);
    auto And = BuildMI(BB&: MBB, I, MIMD: DL, MCID: SAndB32, DestReg: FlatScrInitHi)
                   .addReg(RegNo: FlatScrInitHi)
                   .addImm(Val: 0xffff);
    And->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
  } else {
    // Non-PAL: use the flat-scratch-init value preloaded in a kernel argument
    // SGPR pair.
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(Reg: FlatScratchInitReg);
    MBB.addLiveIn(PhysReg: FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(Reg: FlatScratchInitReg, Idx: AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(Reg: FlatScratchInitReg, Idx: AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      // GFX10+: FLAT_SCR is not addressable as a register pair; write it via
      // S_SETREG after adding the wave offset.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: FlatScrInitLo)
          .addReg(RegNo: FlatScrInitLo)
          .addReg(RegNo: ScratchWaveOffsetReg);
      auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32),
                          DestReg: FlatScrInitHi)
                      .addReg(RegNo: FlatScrInitHi)
                      .addImm(Val: 0);
      Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead.

      using namespace AMDGPU::Hwreg;
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_B32))
          .addReg(RegNo: FlatScrInitLo)
          .addImm(Val: int16_t(HwregEncoding::encode(Values: ID_FLAT_SCR_LO, Values: 0, Values: 32)));
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_B32))
          .addReg(RegNo: FlatScrInitHi)
          .addImm(Val: int16_t(HwregEncoding::encode(Values: ID_FLAT_SCR_HI, Values: 0, Values: 32)));
      return;
    }

    // For GFX9.
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: AMDGPU::FLAT_SCR_LO)
        .addReg(RegNo: FlatScrInitLo)
        .addReg(RegNo: ScratchWaveOffsetReg);
    auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32),
                        DestReg: AMDGPU::FLAT_SCR_HI)
                    .addReg(RegNo: FlatScrInitHi)
                    .addImm(Val: 0);
    Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead.

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::FLAT_SCR_LO)
      .addReg(RegNo: FlatScrInitHi, Flags: RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: FlatScrInitLo)
      .addReg(RegNo: FlatScrInitLo)
      .addReg(RegNo: ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  auto LShr = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_LSHR_B32),
                      DestReg: AMDGPU::FLAT_SCR_HI)
                  .addReg(RegNo: FlatScrInitLo, Flags: RegState::Kill)
                  .addImm(Val: 8);
  LShr->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
}
527
528// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
529// memory. They should have been removed by now.
530static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
531 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
532 I != E; ++I) {
533 if (!MFI.isDeadObjectIndex(ObjectIdx: I))
534 return false;
535 }
536
537 return true;
538}
539
// Shift down registers reserved for the scratch RSRC.
//
// Returns the SGPR128 that will hold the scratch resource descriptor for an
// entry function, or an invalid Register when no descriptor is needed. When
// the descriptor currently lives in the reserved top-of-file SGPRs, it is
// moved down next to the last actually-used preloaded SGPRs.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  // No descriptor needed if it is unset, or it is never used and there is no
  // live stack object.
  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(PhysReg: ScratchRsrcReg) &&
                          allStackObjectsAreDead(MFI: MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  // Skip over the SGPR128 tuples occupied by preloaded arguments.
  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(N: std::min(a: static_cast<unsigned>(AllSGPR128s.size()), b: NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) &&
        (!GITPtrLoReg || !TRI->isSubRegisterEq(RegA: Reg, RegB: GITPtrLoReg))) {
      MRI.replaceRegWith(FromReg: ScratchRsrcReg, ToReg: Reg);
      MFI->setScratchRSrcReg(Reg);
      MRI.reserveReg(PhysReg: Reg, TRI);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}
593
594static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
595 return ST.hasFlatScratchEnabled() ? 1 : ST.getWavefrontSize();
596}
597
// Emit the entry-function prologue: pin down the scratch resource descriptor
// and scratch wave offset registers, set up FP/SP, initialize flat scratch if
// needed, and build the resource descriptor itself.
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.hasFlatScratchEnabled())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(PhysReg: ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(Reg: PreloadedScratchRsrcReg);
      MBB.addLiveIn(PhysReg: PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found is clobbering with
  // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
  // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
  // wave offset to a free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(RegA: ScratchRsrcReg, RegB: PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        N: std::min(a: static_cast<unsigned>(AllSGPRs.size()), b: NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) &&
          !TRI->isSubRegisterEq(RegA: ScratchRsrcReg, RegB: Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: ScratchWaveOffsetReg)
            .addReg(RegNo: PreloadedScratchWaveOffsetReg, Flags: RegState::Kill);
        break;
      }
    }

    // FIXME: We can spill incoming arguments and restore at the end of the
    // prolog.
    if (!ScratchWaveOffsetReg)
      report_fatal_error(
          reason: "could not find temporary scratch offset register in prolog");
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST);
  if (!mayReserveScratchForCWSR(MF)) {
    // Plain case: FP (if any) is zero, SP (if any) starts past the static
    // frame.
    if (hasFP(MF)) {
      Register FPReg = MFI->getFrameOffsetReg();
      assert(FPReg != AMDGPU::FP_REG);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: FPReg).addImm(Val: 0);
    }

    if (requiresStackPointerReference(MF)) {
      Register SPReg = MFI->getStackPtrOffsetReg();
      assert(SPReg != AMDGPU::SP_REG);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: SPReg).addImm(Val: Offset);
    }
  } else {
    // We need to check if we're on a compute queue - if we are, then the CWSR
    // trap handler may need to store some VGPRs on the stack. The first VGPR
    // block is saved separately, so we only need to allocate space for any
    // additional VGPR blocks used. For now, we will make sure there's enough
    // room for the theoretical maximum number of VGPRs that can be allocated.
    // FIXME: Figure out if the shader uses fewer VGPRs in practice.
    assert(hasFP(MF));
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    unsigned VGPRSize = llvm::alignTo(
        Size: (ST.getAddressableNumVGPRs(DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize()) -
         AMDGPU::IsaInfo::getVGPRAllocGranule(STI: &ST,
                                              DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize())) *
            4,
        A: FrameInfo.getMaxAlign());
    MFI->setScratchReservedForDynamicVGPRs(VGPRSize);

    // GET_STACK_BASE yields the frame base depending on the queue; SP is then
    // chosen conditionally (SCC from GET_STACK_BASE) to include the reserved
    // CWSR VGPR save area when needed.
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GET_STACK_BASE), DestReg: FPReg);
    if (requiresStackPointerReference(MF)) {
      Register SPReg = MFI->getStackPtrOffsetReg();
      assert(SPReg != AMDGPU::SP_REG);

      // If at least one of the constants can be inlined, then we can use
      // s_cselect. Otherwise, use a mov and cmovk.
      if (AMDGPU::isInlinableLiteral32(Literal: Offset, HasInv2Pi: ST.hasInv2PiInlineImm()) ||
          AMDGPU::isInlinableLiteral32(Literal: Offset + VGPRSize,
                                       HasInv2Pi: ST.hasInv2PiInlineImm())) {
        BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: SPReg)
            .addImm(Val: Offset + VGPRSize)
            .addImm(Val: Offset);
      } else {
        BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: SPReg).addImm(Val: Offset);
        BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMOVK_I32), DestReg: SPReg)
            .addImm(Val: Offset + VGPRSize);
      }
    }
  }

  bool NeedsFlatScratchInit =
      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(PhysReg: AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(MFI: FrameInfo) && ST.hasFlatScratchEnabled()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.hasArchitectedFlatScratch()) {
    MRI.addLiveIn(Reg: PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PhysReg: PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedPrivateBufferReg: PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }

  if (ST.hasWaitXcnt()) {
    // Set REPLAY_MODE (bit 25) in MODE register to enable multi-group XNACK
    // replay. This aligns hardware behavior with the compiler's s_wait_xcnt
    // insertion logic, which assumes multi-group mode by default.
    unsigned RegEncoding =
        AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 25, Values: 1);
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_IMM32_B32))
        .addImm(Val: 1)
        .addImm(Val: RegEncoding);
  }
}
781
782// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
// Materialize the scratch resource descriptor (SRD) for an entry function.
// The descriptor comes from one of three places, selected below:
//   1. AMDPAL: loaded from the GIT (global information table),
//   2. Mesa GFX shader / no preloaded SRD: built from relocations plus the
//      hard-coded words 2-3,
//   3. AMD HSA/Mesa: copied from the preloaded SGPRs.
// In all cases the scratch wave offset is then added into the descriptor's
// 48-bit base address.
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    Register Rsrc01 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, TargetReg: Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX4_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        Size: 16, BaseAlignment: Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST: Subtarget, ByteOffset: Offset);
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX4, DestReg: ScratchRsrcReg)
        .addReg(RegNo: Rsrc01)
        .addImm(Val: EncodedOffset) // offset
        .addImm(Val: 0) // cpol
        .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine)
        .addMemOperand(MMO);

    // The driver will always set the SRD for wave 64 (bits 118:117 of
    // descriptor / bits 22:21 of third sub-reg will be 0b11)
    // If the shader is actually wave32 we have to modify the const_index_stride
    // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
    // reason the driver does this is that there can be cases where it presents
    // 2 shaders with different wave size (e.g. VsFs).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
    if (ST.isWave32()) {
      const MCInstrDesc &SBitsetB32 = TII->get(Opcode: AMDGPU::S_BITSET0_B32);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: SBitsetB32, DestReg: Rsrc03)
          .addImm(Val: 21)
          .addReg(RegNo: Rsrc03);
    }
  } else if (ST.isMesaGfxShader(F: Fn) || !PreloadedScratchRsrcReg) {
    // No descriptor was preloaded: construct all four dwords in place.
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(Opcode: AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
      // The base pointer (words 0-1) comes from the implicit buffer pointer
      // user SGPR: copied directly for compute, loaded through it otherwise.
      Register Rsrc01 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(Opcode: AMDGPU::S_MOV_B64);

        BuildMI(BB&: MBB, I, MIMD: DL, MCID: Mov64, DestReg: Rsrc01)
            .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR())
            .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto *MMO = MF.getMachineMemOperand(
            PtrInfo,
            F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            Size: 8, BaseAlignment: Align(4));
        BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: Rsrc01)
            .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR())
            .addImm(Val: 0) // offset
            .addImm(Val: 0) // cpol
            .addMemOperand(MMO)
            .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(Reg: MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(PhysReg: MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      // No implicit buffer pointer either; resolve words 0-1 via the
      // SCRATCH_RSRC_DWORD0/1 external symbols (filled in by relocation).
      Register Rsrc0 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub1);

      BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc0)
          .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD0")
          .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);

      BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc1)
          .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD1")
          .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);
    }

    // Words 2-3 are target-dependent constants.
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc2)
        .addImm(Val: Lo_32(Value: Rsrc23))
        .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);

    BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc3)
        .addImm(Val: Hi_32(Value: Rsrc23))
        .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(F: Fn)) {
    // The full descriptor was preloaded into SGPRs; copy it only if the
    // chosen SRD register differs from the preloaded one.
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: ScratchRsrcReg)
          .addReg(RegNo: PreloadedScratchRsrcReg, Flags: RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: ScratchRsrcSub0)
      .addReg(RegNo: ScratchRsrcSub0)
      .addReg(RegNo: ScratchWaveOffsetReg)
      .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);
  auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32), DestReg: ScratchRsrcSub1)
                  .addReg(RegNo: ScratchRsrcSub1)
                  .addImm(Val: 0)
                  .addReg(RegNo: ScratchRsrcReg, Flags: RegState::ImplicitDefine);
  Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
}
925
926bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
927 switch (ID) {
928 case TargetStackID::Default:
929 case TargetStackID::NoAlloc:
930 case TargetStackID::SGPRSpill:
931 return true;
932 case TargetStackID::ScalableVector:
933 case TargetStackID::ScalablePredicateVector:
934 case TargetStackID::WasmLocal:
935 return false;
936 }
937 llvm_unreachable("Invalid TargetStackID::Value");
938}
939
940// Activate only the inactive lanes when \p EnableInactiveLanes is true.
941// Otherwise, activate all lanes. It returns the saved exec.
// Save the current EXEC mask into a scratch SGPR pair (or reuse the copy a
// whole-wave function's setup instruction already made) and then either flip
// EXEC to the inactive lanes (s_xor_saveexec, when \p EnableInactiveLanes)
// or enable all lanes (s_or_saveexec with -1). Returns the register holding
// the saved EXEC value.
static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const DebugLoc &DL, bool IsProlog,
                                     bool EnableInactiveLanes) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  if (FuncInfo->isWholeWaveFunction()) {
    // Whole wave functions already have a copy of the original EXEC mask that
    // we can use.
    assert(IsProlog && "Epilog should look at return, not setup");
    ScratchExecCopy =
        TII->getWholeWaveFunctionSetup(MF)->getOperand(i: 0).getReg();
    assert(ScratchExecCopy && "Couldn't find copy of EXEC");
  } else {
    // Otherwise pick any wave-mask register that is free for the whole
    // function and not a callee-save.
    ScratchExecCopy = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, RC: *TRI.getWaveMaskRegClass());
  }

  if (!ScratchExecCopy)
    report_fatal_error(reason: "failed to find free scratch register");

  LiveUnits.addReg(Reg: ScratchExecCopy);

  // Pick the wave-size-appropriate save-exec opcode; XOR with -1 activates
  // only the previously inactive lanes, OR with -1 activates all lanes.
  const unsigned SaveExecOpc =
      ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
                                           : AMDGPU::S_OR_SAVEEXEC_B32)
                    : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
                                           : AMDGPU::S_OR_SAVEEXEC_B64);
  auto SaveExec =
      BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: SaveExecOpc), DestReg: ScratchExecCopy).addImm(Val: -1);
  SaveExec->getOperand(i: 3).setIsDead(); // Mark SCC as dead.

  return ScratchExecCopy;
}
985
// Emit the prologue stores for all callee-saved and WWM registers: first the
// WWM scratch VGPRs (inactive lanes only), then the WWM callee-saved VGPRs
// (all lanes), then the prolog/epilog SGPR spills, using \p FrameReg as the
// base. \p FramePtrRegScratchCopy, if set, holds the caller's FP value and is
// spilled in place of the FP register itself.
void SIFrameLowering::emitCSRSpillStores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);

  // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
  // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
  // might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ true, /*EnableInactiveLanes*/ true);

  // Store each (VGPR, frame index) pair to the stack at FrameReg.
  auto StoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL,
                           SpillReg: VGPR, FI, FrameReg);
        }
      };

  // Non-reserved WWM scratch registers carry values in from the caller's
  // inactive lanes, so mark them live-in.
  for (const Register Reg : make_first_range(c&: WWMScratchRegs)) {
    if (!MRI.isReserved(PhysReg: Reg)) {
      MRI.addLiveIn(Reg);
      MBB.addLiveIn(PhysReg: Reg);
    }
  }
  StoreWWMRegisters(WWMScratchRegs);

  auto EnableAllLanes = [&]() {
    BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg).addImm(Val: -1);
  };

  // Callee-saved WWM VGPRs are stored with all lanes enabled.
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      EnableAllLanes();
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ true,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  StoreWWMRegisters(WWMCalleeSavedRegs);
  if (FuncInfo->isWholeWaveFunction()) {
    // If we have already saved some WWM CSR registers, then the EXEC is already
    // -1 and we don't need to do anything else. Otherwise, set EXEC to -1 here.
    if (!ScratchExecCopy)
      buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
                           /*EnableInactiveLanes*/ true);
    else if (WWMCalleeSavedRegs.empty())
      EnableAllLanes();
  } else if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
        .addReg(RegNo: ScratchExecCopy, Flags: RegState::Kill);
    LiveUnits.addReg(Reg: ScratchExecCopy);
  }

  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handle FP spill:
    // Skip if FP is saved to a scratch SGPR, the save has already been emitted.
    // Otherwise, FP has been moved to a temporary register and spill it
    // instead.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.save();
  }

  // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
  // such scratch registers live throughout the function.
  SmallVector<Register, 1> ScratchSGPRs;
  FuncInfo->getAllScratchSGPRCopyDstRegs(Regs&: ScratchSGPRs);
  if (!ScratchSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : ScratchSGPRs)
        MBB.addLiveIn(PhysReg: Reg);

      MBB.sortUniqueLiveIns();
    }
    if (!LiveUnits.empty()) {
      for (MCPhysReg Reg : ScratchSGPRs)
        LiveUnits.addReg(Reg);
    }
  }
}
1090
// Emit the epilogue restores mirroring emitCSRSpillStores: first the
// prolog/epilog SGPR spills, then the WWM VGPRs (inactive lanes for scratch
// registers, all lanes for callee-saves). Whole-wave functions additionally
// rewrite their pseudo return instruction and restore the original EXEC that
// the return carried as its first operand.
void SIFrameLowering::emitCSRSpillRestores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handle FP restore:
    // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore
    // the FP value to a temporary register. The frame pointer should be
    // overwritten only at the end when all other spills are restored from
    // current frame.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.restore();
  }

  // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
  // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
  // this, we might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs);
  // Reload each (VGPR, frame index) pair from the stack at FrameReg.
  auto RestoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL,
                             SpillReg: VGPR, FI, FrameReg);
        }
      };

  if (FuncInfo->isWholeWaveFunction()) {
    // For whole wave functions, the EXEC is already -1 at this point.
    // Therefore, we can restore the CSR WWM registers right away.
    RestoreWWMRegisters(WWMCalleeSavedRegs);

    // The original EXEC is the first operand of the return instruction.
    MachineInstr &Return = MBB.instr_back();
    unsigned Opcode = Return.getOpcode();
    switch (Opcode) {
    case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
      Opcode = AMDGPU::SI_RETURN;
      break;
    case AMDGPU::SI_TCRETURN_GFX_WholeWave:
      Opcode = AMDGPU::SI_TCRETURN_GFX;
      break;
    default:
      llvm_unreachable("Unexpected return inst");
    }
    Register OrigExec = Return.getOperand(i: 0).getReg();

    if (!WWMScratchRegs.empty()) {
      // Flip to the lanes that were inactive on entry before reloading the
      // scratch registers' inactive-lane contents.
      BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.XorOpc), DestReg: LMC.ExecReg)
          .addReg(RegNo: OrigExec)
          .addImm(Val: -1);
      RestoreWWMRegisters(WWMScratchRegs);
    }

    // Restore original EXEC.
    BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg).addReg(RegNo: OrigExec);

    // Drop the first operand and update the opcode.
    Return.removeOperand(OpNo: 0);
    Return.setDesc(TII->get(Opcode));

    return;
  }

  if (!WWMScratchRegs.empty()) {
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog=*/false, /*EnableInactiveLanes=*/true);
  }
  RestoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    // Callee-saved WWM VGPRs are restored with all lanes enabled.
    if (ScratchExecCopy) {
      BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg).addImm(Val: -1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ false,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  RestoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
        .addReg(RegNo: ScratchExecCopy, Flags: RegState::Kill);
  }
}
1194
// Emit the prologue for a non-entry function: save the frame pointer (to a
// scratch SGPR or via a temporary copy), spill CSRs/WWM registers, realign
// the stack or establish FP, set up the base pointer, and bump SP by the
// (possibly alignment-padded) frame size. Entry functions are delegated to
// emitEntryFunctionPrologue.
void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LiveRegUnits LiveUnits;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  // DebugLoc must be unknown since the first instruction with DebugLoc is used
  // to determine the end of the prologue.
  DebugLoc DL;

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;

  // Chain functions never return, so there's no need to save and restore the FP
  // or BP.
  bool SavesStackRegs = !FuncInfo->isChainFunction();

  if (TRI.hasStackRealignment(MF))
    HasFP = true;

  Register FramePtrRegScratchCopy;
  if (!HasFP && !hasFP(MF)) {
    // Emit the CSR spill stores with SP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: StackPtrReg,
                       FramePtrRegScratchCopy);
  } else if (SavesStackRegs) {
    // CSR spill stores will use FP as base register.
    Register SGPRForFPSaveRestoreCopy =
        FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
    if (SGPRForFPSaveRestoreCopy) {
      // Copy FP to the scratch register now and emit the CFI entry. It avoids
      // the extra FP copy needed in the other two cases when FP is spilled to
      // memory or to a VGPR lane.
      PrologEpilogSGPRSpillBuilder SB(
          FramePtrReg,
          FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(Reg: FramePtrReg), MBB, MBBI,
          DL, TII, TRI, LiveUnits, FramePtrReg);
      SB.save();
      LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy);
    } else {
      // Copy FP into a new scratch register so that its previous value can be
      // spilled after setting up the new frame.
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, RC: AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error(reason: "failed to find free scratch register");

      LiveUnits.addReg(Reg: FramePtrRegScratchCopy);
      BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrRegScratchCopy)
          .addReg(RegNo: FramePtrReg);
    }
  }

  if (HasFP) {
    // Realign the stack: round SP up to the alignment boundary into FP, and
    // grow the frame by one extra alignment's worth of padding.
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveUnits.empty()) {
      LiveUnits.init(TRI);
      LiveUnits.addLiveIns(MBB);
    }

    // s_add_i32 s33, s32, NumBytes
    // s_and_b32 s33, s33, 0b111...0000
    BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: FramePtrReg)
        .addReg(RegNo: StackPtrReg)
        .addImm(Val: (Alignment - 1) * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    auto And = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_AND_B32), DestReg: FramePtrReg)
                   .addReg(RegNo: FramePtrReg, Flags: RegState::Kill)
                   .addImm(Val: -Alignment * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    And->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    // No realignment needed; FP is just a copy of the incoming SP.
    BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrReg)
        .addReg(RegNo: StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If FP is used, emit the CSR spills with FP base register.
  if (HasFP) {
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg,
                       FramePtrRegScratchCopy);
    if (FramePtrRegScratchCopy)
      LiveUnits.removeReg(Reg: FramePtrRegScratchCopy);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: BasePtrReg)
        .addReg(RegNo: StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    // Allocate the frame: advance SP by the rounded size (scaled to the
    // per-lane scratch units).
    auto Add = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: StackPtrReg)
                   .addReg(RegNo: StackPtrReg)
                   .addImm(Val: RoundedSize * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
  }

  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg);
  (void)FPSaved;
  assert((!HasFP || FPSaved || !SavesStackRegs) &&
         "Needed to save FP but didn't save it anywhere");

  // If we allow spilling to AGPRs we may have saved FP but then spill
  // everything into AGPRs instead of the stack.
  assert((HasFP || !FPSaved || !SavesStackRegs || EnableSpillVGPRToAGPR) &&
         "Saved FP but didn't need it");

  bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: BasePtrReg);
  (void)BPSaved;
  assert((!HasBP || BPSaved || !SavesStackRegs) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || !BPSaved) && "Saved BP but didn't need it");

  if (FuncInfo->isWholeWaveFunction()) {
    // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose.
    TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
  }
}
1342
// Emit the epilogue for a non-entry function: rewind SP (from BP or FP when
// the frame had a size), restore CSRs/WWM registers, and finally restore the
// caller's FP from wherever the prologue saved it. Entry functions and chain
// functions without a tail call need no epilogue.
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  if (FuncInfo->isChainFunction() && !MFI.hasTailCall())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LiveRegUnits LiveUnits;
  // Get the insert location for the epilogue. If there were no terminators in
  // the block, get the last instruction.
  MachineBasicBlock::iterator MBBI = MBB.end();
  DebugLoc DL;
  if (!MBB.empty()) {
    MBBI = MBB.getLastNonDebugInstr();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    MBBI = MBB.getFirstTerminator();
  }

  // Undo the prologue's alignment padding if the stack was realigned.
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg);

  if (RoundedSize != 0) {
    // Rewind SP to its value on entry: BP holds it if one exists, otherwise
    // FP does (prologue copied SP into FP before bumping SP).
    if (TRI.hasBasePointer(MF)) {
      BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: StackPtrReg)
          .addReg(RegNo: TRI.getBaseRegister())
          .setMIFlag(MachineInstr::FrameDestroy);
    } else if (hasFP(MF)) {
      BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: StackPtrReg)
          .addReg(RegNo: FramePtrReg)
          .setMIFlag(MachineInstr::FrameDestroy);
    }
  }

  Register FramePtrRegScratchCopy;
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
  if (FPSaved) {
    // CSR spill restores should use FP as base register. If
    // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
    // into a new scratch register and copy to FP later when other registers are
    // restored from the current stack frame.
    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
    if (SGPRForFPSaveRestoreCopy) {
      LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy);
    } else {
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, RC: AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error(reason: "failed to find free scratch register");

      LiveUnits.addReg(Reg: FramePtrRegScratchCopy);
    }

    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg,
                         FramePtrRegScratchCopy);
  }

  if (FPSaved) {
    // Insert the copy to restore FP.
    Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
                                               : FramePtrRegScratchCopy;
    MachineInstrBuilder MIB =
        BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrReg)
            .addReg(RegNo: SrcReg);
    if (SGPRForFPSaveRestoreCopy)
      MIB.setMIFlag(MachineInstr::FrameDestroy);
  } else {
    // Insert the CSR spill restores with SP as the base register.
    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: StackPtrReg,
                         FramePtrRegScratchCopy);
  }
}
1429
1430#ifndef NDEBUG
1431static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1432 const MachineFrameInfo &MFI = MF.getFrameInfo();
1433 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1434 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1435 I != E; ++I) {
1436 if (!MFI.isDeadObjectIndex(I) &&
1437 MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1438 !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
1439 return false;
1440 }
1441 }
1442
1443 return true;
1444}
1445#endif
1446
1447StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1448 int FI,
1449 Register &FrameReg) const {
1450 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1451
1452 FrameReg = RI->getFrameRegister(MF);
1453 return StackOffset::getFixed(Fixed: MF.getFrameInfo().getObjectOffset(ObjectIdx: FI));
1454}
1455
// Pre-frame-finalization hook: redirect VGPR spills to AGPRs where possible
// (freeing their stack slots), push any remaining SGPR spills back to the
// default stack, and reserve emergency scavenging slot(s) if real stack
// objects remain.
void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF,
    RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  // VGPR-to-AGPR spilling requires MAI instructions and is gated by the
  // amdgpu-spill-vgpr-to-agpr flag.
  const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
    && EnableSpillVGPRToAGPR;

  if (SpillVGPRToAGPR) {
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
    BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);

    bool SeenDbgInstr = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) {
        int FrameIndex;
        if (MI.isDebugInstr())
          SeenDbgInstr = true;

        if (TII->isVGPRSpill(MI)) {
          // Try to eliminate stack used by VGPR spills before frame
          // finalization.
          unsigned FIOp = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
                                                     Name: AMDGPU::OpName::vaddr);
          int FI = MI.getOperand(i: FIOp).getIndex();
          Register VReg =
            TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata)->getReg();
          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                isAGPRtoVGPR: TRI->isAGPR(MRI, Reg: VReg))) {
            // An AGPR was assigned; rewrite the frame index immediately so
            // the slot can be reclaimed below.
            assert(RS != nullptr);
            RS->enterBasicBlockEnd(MBB);
            RS->backward(I: std::next(x: MI.getIterator()));
            TRI->eliminateFrameIndex(MI, SPAdj: 0, FIOperandNum: FIOp, RS);
            SpillFIs.set(FI);
            continue;
          }
        } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
                   TII->isLoadFromStackSlot(MI, FrameIndex))
          if (!MFI.isFixedObjectIndex(ObjectIdx: FrameIndex))
            NonVGPRSpillFIs.set(FrameIndex);
      }
    }

    // Stack slot coloring may assign different objects to the same stack slot.
    // If not, then the VGPR to AGPR spill slot is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(Idx: FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    // The AGPRs (and the VGPRs used to shuttle AGPR values) now carry spill
    // contents across blocks, so mark them live-in everywhere.
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(PhysReg: Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(PhysReg: Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr)
        clearDebugInfoForSpillFIs(MFI, MBB, SpillFIs);
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
    FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot
    RS->addScavengingFrameIndex(FI: FuncInfo->getScavengeFI(MFI, TRI: *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(FI: MFI.CreateSpillStackObject(Size: 4, Alignment: Align(4)));
    }
  }
}
1552
// Post-RA, pre-frame-index-replacement hook: now that actual register usage is
// known, try to shift the gfx908 AGPR-copy VGPR and the long-branch reserved
// SGPR pair down to lower-numbered unused registers.
void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we had initially reserved the highest available VGPR for AGPR
    // copy. Now since we are done with RA, check if there exists an unused
    // VGPR which is lower than the earlier reserved VGPR before RA. If one
    // exists, use it for AGPR copy instead of the one reserved before RA.
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, RC: &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(Reg: UnusedLowVGPR) <
                          TRI->getHWRegIndex(Reg: VGPRForAGPRCopy))) {
      // Reserve this newly identified VGPR (for AGPR copy).
      // Reserved registers should already be frozen at this point, so we can
      // avoid calling MRI.freezeReservedRegs and just use MRI.reserveReg.
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.reserveReg(PhysReg: UnusedLowVGPR, TRI);
    }
  }
  // We initially reserved the highest available SGPR pair for long branches;
  // now, after RA, we shift down to a lower unused one if one exists.
  Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
  Register UnusedLowSGPR =
      TRI->findUnusedRegister(MRI, RC: &AMDGPU::SGPR_64RegClass, MF);
  // If LongBranchReservedReg is null then we didn't find a long branch
  // and never reserved a register to begin with so there is nothing to
  // shift down. Then if UnusedLowSGPR is null, there isn't an available lower
  // register to use so just keep the original one we set.
  if (LongBranchReservedReg && UnusedLowSGPR) {
    FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
    MRI.reserveReg(PhysReg: UnusedLowSGPR, TRI);
  }
}
1592
// The special SGPR spills like the ones needed for FP, BP or any reserved
// registers are delayed until frame lowering. This decides, for each such
// register, whether it can live in a scratch SGPR for the duration of the
// function or must be spilled (to a VGPR lane or memory).
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs,
    bool NeedExecCopyReservedReg) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  LiveRegUnits LiveUnits;
  LiveUnits.init(TRI: *TRI);
  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveUnits.addReg(Reg: CSRegs[I]);

  const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();

  // Handle the register reserved for EXEC copies: either keep it (swapping to
  // a lower unused SGPR when possible), spill it, or release it if unused.
  Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
  if (NeedExecCopyReservedReg ||
      (ReservedRegForExecCopy &&
       MRI.isPhysRegUsed(PhysReg: ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
    MRI.reserveReg(PhysReg: ReservedRegForExecCopy, TRI);
    Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
    if (UnusedScratchReg) {
      // If found any unused scratch SGPR, reserve the register itself for Exec
      // copy and there is no need for any spills in that case.
      MFI->setSGPRForEXECCopy(UnusedScratchReg);
      MRI.replaceRegWith(FromReg: ReservedRegForExecCopy, ToReg: UnusedScratchReg);
      LiveUnits.addReg(Reg: UnusedScratchReg);
    } else {
      // Needs spill.
      assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
             "Re-reserving spill slot for EXEC copy register");
      getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: ReservedRegForExecCopy, RC,
                                     /*IncludeScratchCopy=*/false);
    }
  } else if (ReservedRegForExecCopy) {
    // Reset it at this point. There are no whole-wave copies and spills
    // encountered.
    MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
  }

  // Chain functions don't return to the caller, so they don't need to preserve
  // the FP and BP.
  if (MFI->isChainFunction())
    return;

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(MFI: FrameInfo));

  // Reserve a spill slot (or a scratch SGPR) for the frame pointer if needed.
  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: FramePtrReg);
  }

  // Likewise for the base pointer, if one is required.
  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: BasePtrReg);
  }
}
1669
// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If this is a function with the amdgpu_cs_chain[_preserve] calling
  // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
  // we don't need to save and restore anything.
  if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
    return;

  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs&: SavedVGPRs, RS);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool NeedExecCopyReservedReg = false;

  // Scan the function for WWM register spills (which need an EXEC-copy
  // register) and locate a return instruction (whose register operands must
  // not be treated as callee saves below).
  MachineInstr *ReturnMI = nullptr;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // TODO: Walking through all MBBs here would be a bad heuristic. Better
      // handle them elsewhere.
      if (TII->isWWMRegSpillOpcode(Opcode: MI.getOpcode()))
        NeedExecCopyReservedReg = true;
      else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
               MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
               MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
               (MFI->isChainFunction() &&
                TII->isChainCallOpcode(Opcode: MI.getOpcode()))) {
        // We expect all return to be the same size.
        assert(!ReturnMI ||
               (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
                count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
        ReturnMI = &MI;
      }
    }
  }

  SmallVector<Register> SortedWWMVGPRs;
  for (Register Reg : MFI->getWWMReservedRegs()) {
    // The shift-back is needed only for the VGPRs used for SGPR spills and they
    // are of 32-bit size. SIPreAllocateWWMRegs pass can add tuples into WWM
    // reserved registers.
    const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
    if (TRI->getRegSizeInBits(RC: *RC) != 32)
      continue;
    SortedWWMVGPRs.push_back(Elt: Reg);
  }

  // Shift highest-numbered WWM VGPRs first.
  sort(C&: SortedWWMVGPRs, Comp: std::greater<Register>());
  MFI->shiftWwmVGPRsToLowestRange(MF, WWMVGPRs&: SortedWWMVGPRs, SavedVGPRs);

  if (MFI->isEntryFunction())
    return;

  if (MFI->isWholeWaveFunction()) {
    // In practice, all the VGPRs are WWM registers, and we will need to save at
    // least their inactive lanes. Add them to WWMReservedRegs.
    assert(!NeedExecCopyReservedReg &&
           "Whole wave functions can use the reg mapped for their i1 argument");

    unsigned NumArchVGPRs = ST.getAddressableNumArchVGPRs();
    for (MCRegister Reg :
         AMDGPU::VGPR_32RegClass.getRegisters().take_front(N: NumArchVGPRs))
      if (MF.getRegInfo().isPhysRegModified(PhysReg: Reg)) {
        MFI->reserveWWMRegister(Reg);
        MF.begin()->addLiveIn(PhysReg: Reg);
      }
    MF.begin()->sortUniqueLiveIns();
  }

  // Remove any VGPRs used in the return value because these do not need to be
  // saved. This prevents CSR restore from clobbering return VGPRs.
  if (ReturnMI) {
    for (auto &Op : ReturnMI->operands()) {
      if (Op.isReg())
        SavedVGPRs.reset(Idx: Op.getReg());
    }
  }

  // Create the stack objects for WWM registers now.
  for (Register Reg : MFI->getWWMReservedRegs()) {
    const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
    MFI->allocateWWMSpill(MF, VGPR: Reg, Size: TRI->getSpillSize(RC: *RC),
                          Alignment: TRI->getSpillAlign(RC: *RC));
  }

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(Mask: TRI->getAllVectorRegMask());

  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
  // On gfx908 there were no AGPR loads and stores, and thus spilling also
  // requires a temporary VGPR.
  if (!ST.hasGFX90AInsts())
    SavedVGPRs.clearBitsInMask(Mask: TRI->getAllAGPRRegMask());

  determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);

  // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
  // allow the default insertion to handle them.
  for (auto &Reg : MFI->getWWMSpills())
    SavedVGPRs.reset(Idx: Reg.first);
}
1775
// Determine the SGPR callee saves. Vector registers are filtered out here
// (determineCalleeSaves is the one that reports VGPRs to generic code), and
// the specially-managed SP/FP are excluded from spilling.
void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(Idx: MFI->getStackPtrOffsetReg());

  // Snapshot all saved registers (including vector registers) before the
  // vector registers are cleared; used for the FP prediction below.
  const BitVector AllSavedRegs = SavedRegs;
  SavedRegs.clearBitsInMask(Mask: TRI->getAllVectorRegMask());

  // We have to anticipate introducing CSR VGPR spills or spill of caller
  // save VGPR reserved for SGPR spills as we now always create stack entry
  // for it, if we don't have any stack objects already, since we require a FP
  // if there is a call and stack. We will allocate a VGPR for SGPR spills if
  // there are any SGPR spills. Whether they are CSR spills or otherwise.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const bool WillHaveFP =
      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

  // FP will be specially managed like SP.
  if (WillHaveFP || hasFP(MF))
    SavedRegs.reset(Idx: MFI->getFrameOffsetReg());

  // Return address use with return instruction is hidden through the SI_RETURN
  // pseudo. Given that and since the IPRA computes actual register usage and
  // does not use CSR list, the clobbering of return address by function calls
  // (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's register
  // usage collection. This will ensure save/restore of return address happens
  // in those scenarios.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  Register RetAddrReg = TRI->getReturnAddressReg(MF);
  if (!MFI->isEntryFunction() &&
      (FrameInfo.hasCalls() || MRI.isPhysRegModified(PhysReg: RetAddrReg))) {
    SavedRegs.set(TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub0));
    SavedRegs.set(TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub1));
  }
}
1820
1821static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
1822 const GCNSubtarget &ST,
1823 std::vector<CalleeSavedInfo> &CSI) {
1824 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1825 MachineFrameInfo &MFI = MF.getFrameInfo();
1826 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1827
1828 assert(
1829 llvm::is_sorted(CSI,
1830 [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
1831 return A.getReg() < B.getReg();
1832 }) &&
1833 "Callee saved registers not sorted");
1834
1835 auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
1836 return !CSI.isSpilledToReg() &&
1837 TRI->getPhysRegBaseClass(Reg: CSI.getReg()) == &AMDGPU::VGPR_32RegClass &&
1838 !FuncInfo->isWWMReservedRegister(Reg: CSI.getReg());
1839 };
1840
1841 auto CSEnd = CSI.end();
1842 for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) {
1843 Register Reg = CSIt->getReg();
1844 if (!CanUseBlockOps(*CSIt))
1845 continue;
1846
1847 // Find all the regs that will fit in a 32-bit mask starting at the current
1848 // reg and build said mask. It should have 1 for every register that's
1849 // included, with the current register as the least significant bit.
1850 uint32_t Mask = 1;
1851 CSEnd = std::remove_if(
1852 first: CSIt + 1, last: CSEnd, pred: [&](const CalleeSavedInfo &CSI) -> bool {
1853 if (CanUseBlockOps(CSI) && CSI.getReg() < Reg + 32) {
1854 Mask |= 1 << (CSI.getReg() - Reg);
1855 return true;
1856 } else {
1857 return false;
1858 }
1859 });
1860
1861 const TargetRegisterClass *BlockRegClass = TRI->getRegClassForBlockOp(MF);
1862 Register RegBlock =
1863 TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC: BlockRegClass);
1864 if (!RegBlock) {
1865 // We couldn't find a super register for the block. This can happen if
1866 // the register we started with is too high (e.g. v232 if the maximum is
1867 // v255). We therefore try to get the last register block and figure out
1868 // the mask from there.
1869 Register LastBlockStart =
1870 AMDGPU::VGPR0 + alignDown(Value: Reg - AMDGPU::VGPR0, Align: 32);
1871 RegBlock =
1872 TRI->getMatchingSuperReg(Reg: LastBlockStart, SubIdx: AMDGPU::sub0, RC: BlockRegClass);
1873 assert(RegBlock && TRI->isSubRegister(RegBlock, Reg) &&
1874 "Couldn't find super register");
1875 int RegDelta = Reg - LastBlockStart;
1876 assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta &&
1877 "Bad shift amount");
1878 Mask <<= RegDelta;
1879 }
1880
1881 FuncInfo->setMaskForVGPRBlockOps(RegisterBlock: RegBlock, Mask);
1882
1883 // The stack objects can be a bit smaller than the register block if we know
1884 // some of the high bits of Mask are 0. This may happen often with calling
1885 // conventions where the caller and callee-saved VGPRs are interleaved at
1886 // a small boundary (e.g. 8 or 16).
1887 int UnusedBits = llvm::countl_zero(Val: Mask);
1888 unsigned BlockSize = TRI->getSpillSize(RC: *BlockRegClass) - UnusedBits * 4;
1889 int FrameIdx =
1890 MFI.CreateStackObject(Size: BlockSize, Alignment: TRI->getSpillAlign(RC: *BlockRegClass),
1891 /*isSpillSlot=*/true);
1892 MFI.setIsCalleeSavedObjectIndex(ObjectIdx: FrameIdx, IsCalleeSaved: true);
1893
1894 CSIt->setFrameIdx(FrameIdx);
1895 CSIt->setReg(RegBlock);
1896 }
1897 CSI.erase(first: CSEnd, last: CSI.end());
1898}
1899
1900bool SIFrameLowering::assignCalleeSavedSpillSlots(
1901 MachineFunction &MF, const TargetRegisterInfo *TRI,
1902 std::vector<CalleeSavedInfo> &CSI) const {
1903 if (CSI.empty())
1904 return true; // Early exit if no callee saved registers are modified!
1905
1906 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1907 bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR();
1908
1909 if (UseVGPRBlocks)
1910 assignSlotsUsingVGPRBlocks(MF, ST, CSI);
1911
1912 return assignCalleeSavedSpillSlotsImpl(MF, TRI, CSI) || UseVGPRBlocks;
1913}
1914
1915bool SIFrameLowering::assignCalleeSavedSpillSlotsImpl(
1916 MachineFunction &MF, const TargetRegisterInfo *TRI,
1917 std::vector<CalleeSavedInfo> &CSI) const {
1918 if (CSI.empty())
1919 return true; // Early exit if no callee saved registers are modified!
1920
1921 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1922 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1923 const SIRegisterInfo *RI = ST.getRegisterInfo();
1924 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1925 Register BasePtrReg = RI->getBaseRegister();
1926 Register SGPRForFPSaveRestoreCopy =
1927 FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1928 Register SGPRForBPSaveRestoreCopy =
1929 FuncInfo->getScratchSGPRCopyDstReg(Reg: BasePtrReg);
1930 if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1931 return false;
1932
1933 unsigned NumModifiedRegs = 0;
1934
1935 if (SGPRForFPSaveRestoreCopy)
1936 NumModifiedRegs++;
1937 if (SGPRForBPSaveRestoreCopy)
1938 NumModifiedRegs++;
1939
1940 for (auto &CS : CSI) {
1941 if (CS.getReg() == FramePtrReg.asMCReg() && SGPRForFPSaveRestoreCopy) {
1942 CS.setDstReg(SGPRForFPSaveRestoreCopy);
1943 if (--NumModifiedRegs)
1944 break;
1945 } else if (CS.getReg() == BasePtrReg.asMCReg() &&
1946 SGPRForBPSaveRestoreCopy) {
1947 CS.setDstReg(SGPRForBPSaveRestoreCopy);
1948 if (--NumModifiedRegs)
1949 break;
1950 }
1951 }
1952
1953 return false;
1954}
1955
1956bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1957 const MachineFunction &MF) const {
1958
1959 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1960 const MachineFrameInfo &MFI = MF.getFrameInfo();
1961 const SIInstrInfo *TII = ST.getInstrInfo();
1962 uint64_t EstStackSize = MFI.estimateStackSize(MF);
1963 uint64_t MaxOffset = EstStackSize - 1;
1964
1965 // We need the emergency stack slots to be allocated in range of the
1966 // MUBUF/flat scratch immediate offset from the base register, so assign these
1967 // first at the incoming SP position.
1968 //
1969 // TODO: We could try sorting the objects to find a hole in the first bytes
1970 // rather than allocating as close to possible. This could save a lot of space
1971 // on frames with alignment requirements.
1972 if (ST.hasFlatScratchEnabled()) {
1973 if (TII->isLegalFLATOffset(Offset: MaxOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
1974 FlatVariant: SIInstrFlags::FlatScratch))
1975 return false;
1976 } else {
1977 if (TII->isLegalMUBUFImmOffset(Imm: MaxOffset))
1978 return false;
1979 }
1980
1981 return true;
1982}
1983
1984bool SIFrameLowering::spillCalleeSavedRegisters(
1985 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
1986 ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
1987 MachineFunction *MF = MBB.getParent();
1988 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1989 if (!ST.useVGPRBlockOpsForCSR())
1990 return false;
1991
1992 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1993 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1994 const SIInstrInfo *TII = ST.getInstrInfo();
1995 SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1996
1997 const TargetRegisterClass *BlockRegClass =
1998 static_cast<const SIRegisterInfo *>(TRI)->getRegClassForBlockOp(MF: *MF);
1999 for (const CalleeSavedInfo &CS : CSI) {
2000 Register Reg = CS.getReg();
2001 if (!BlockRegClass->contains(Reg) ||
2002 !FuncInfo->hasMaskForVGPRBlockOps(RegisterBlock: Reg)) {
2003 spillCalleeSavedRegister(SaveBlock&: MBB, MI, CS, TII, TRI);
2004 continue;
2005 }
2006
2007 // Build a scratch block store.
2008 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(RegisterBlock: Reg);
2009 int FrameIndex = CS.getFrameIdx();
2010 MachinePointerInfo PtrInfo =
2011 MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
2012 MachineMemOperand *MMO =
2013 MF->getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOStore,
2014 Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
2015 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
2016
2017 BuildMI(BB&: MBB, I: MI, MIMD: MI->getDebugLoc(),
2018 MCID: TII->get(Opcode: AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
2019 .addReg(RegNo: Reg, Flags: getKillRegState(B: false))
2020 .addFrameIndex(Idx: FrameIndex)
2021 .addReg(RegNo: MFI->getStackPtrOffsetReg())
2022 .addImm(Val: 0)
2023 .addImm(Val: Mask)
2024 .addMemOperand(MMO);
2025
2026 FuncInfo->setHasSpilledVGPRs();
2027
2028 // Add the register to the liveins. This is necessary because if any of the
2029 // VGPRs in the register block is reserved (e.g. if it's a WWM register),
2030 // then the whole block will be marked as reserved and `updateLiveness` will
2031 // skip it.
2032 MBB.addLiveIn(PhysReg: Reg);
2033 }
2034 MBB.sortUniqueLiveIns();
2035
2036 return true;
2037}
2038
// Emit callee-save restores, mirroring spillCalleeSavedRegisters: register
// groups get a single block load, everything else is restored individually.
// Returns false (deferring to the generic implementation) when the subtarget
// doesn't use VGPR block operations for CSRs.
bool SIFrameLowering::restoreCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  if (!ST.useVGPRBlockOpsForCSR())
    return false;

  SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *SITRI = static_cast<const SIRegisterInfo *>(TRI);
  const TargetRegisterClass *BlockRegClass = SITRI->getRegClassForBlockOp(MF: *MF);
  // Walk the CSI list in reverse order.
  for (const CalleeSavedInfo &CS : reverse(C&: CSI)) {
    Register Reg = CS.getReg();
    // Entries without a block mask were saved individually; restore them the
    // same way.
    if (!BlockRegClass->contains(Reg) ||
        !FuncInfo->hasMaskForVGPRBlockOps(RegisterBlock: Reg)) {
      restoreCalleeSavedRegister(MBB, MI, CS, TII, TRI);
      continue;
    }

    // Build a scratch block load.
    uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(RegisterBlock: Reg);
    int FrameIndex = CS.getFrameIdx();
    MachinePointerInfo PtrInfo =
        MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
    MachineMemOperand *MMO = MF->getMachineMemOperand(
        PtrInfo, F: MachineMemOperand::MOLoad, Size: MFI.getObjectSize(ObjectIdx: FrameIndex),
        BaseAlignment: MFI.getObjectAlign(ObjectIdx: FrameIndex));

    auto MIB = BuildMI(BB&: MBB, I: MI, MIMD: MI->getDebugLoc(),
                       MCID: TII->get(Opcode: AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), DestReg: Reg)
                   .addFrameIndex(Idx: FrameIndex)
                   .addReg(RegNo: FuncInfo->getStackPtrOffsetReg())
                   .addImm(Val: 0)
                   .addImm(Val: Mask)
                   .addMemOperand(MMO);
    SITRI->addImplicitUsesForBlockCSRLoad(MIB, BlockReg: Reg);

    // Add the register to the liveins. This is necessary because if any of the
    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
    // then the whole block will be marked as reserved and `updateLiveness` will
    // skip it.
    MBB.addLiveIn(PhysReg: Reg);
  }

  MBB.sortUniqueLiveIns();
  return true;
}
2088
// Lower the call-frame setup/destroy pseudos. With a reserved call frame the
// pseudo is simply deleted; otherwise the stack pointer is adjusted in place
// by the (scaled) call frame size. Returns the iterator following the erased
// instruction.
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF,
    MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(i: 0).getImm();
  // A zero-sized call frame needs no SP adjustment at all.
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(i: 1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Size: Amount, A: getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

    // Convert the byte amount into SP units (see getScratchScaleFactor), then
    // grow the frame on setup and shrink it on destroy.
    Amount *= getScratchScaleFactor(ST);
    if (IsDestroy)
      Amount = -Amount;
    auto Add = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SPReg)
                   .addReg(RegNo: SPReg)
                   .addImm(Val: Amount);
    Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}
2123
2124/// Returns true if the frame will require a reference to the stack pointer.
2125///
2126/// This is the set of conditions common to setting up the stack pointer in a
2127/// kernel, and for using a frame pointer in a callable function.
2128///
2129/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
2130/// references SP.
2131static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
2132 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
2133}
2134
2135// The FP for kernels is always known 0, so we never really need to setup an
2136// explicit register for it. However, DisableFramePointerElim will force us to
2137// use a register for it.
2138bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
2139 const MachineFrameInfo &MFI = MF.getFrameInfo();
2140
2141 // For entry functions we can use an immediate offset in most cases,
2142 // so the presence of calls doesn't imply we need a distinct frame pointer.
2143 if (MFI.hasCalls() &&
2144 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
2145 // All offsets are unsigned, so need to be addressed in the same direction
2146 // as stack growth.
2147
2148 // FIXME: This function is pretty broken, since it can be called before the
2149 // frame layout is determined or CSR spills are inserted.
2150 return MFI.getStackSize() != 0;
2151 }
2152
2153 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
2154 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
2155 MF) ||
2156 mayReserveScratchForCWSR(MF) ||
2157 MF.getTarget().Options.DisableFramePointerElim(MF);
2158}
2159
2160bool SIFrameLowering::mayReserveScratchForCWSR(
2161 const MachineFunction &MF) const {
2162 return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
2163 AMDGPU::isEntryFunctionCC(CC: MF.getFunction().getCallingConv()) &&
2164 AMDGPU::isCompute(CC: MF.getFunction().getCallingConv());
2165}
2166
2167// This is essentially a reduced version of hasFP for entry functions. Since the
2168// stack pointer is known 0 on entry to kernels, we never really need an FP
2169// register. We may need to initialize the stack pointer depending on the frame
2170// properties, which logically overlaps many of the cases where an ordinary
2171// function would require an FP.
2172bool SIFrameLowering::requiresStackPointerReference(
2173 const MachineFunction &MF) const {
2174 // Callable functions always require a stack pointer reference.
2175 assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
2176 "only expected to call this for entry points functions");
2177
2178 const MachineFrameInfo &MFI = MF.getFrameInfo();
2179
2180 // Entry points ordinarily don't need to initialize SP. We have to set it up
2181 // for callees if there are any. Also note tail calls are only possible via
2182 // the `llvm.amdgcn.cs.chain` intrinsic.
2183 if (MFI.hasCalls() || MFI.hasTailCall())
2184 return true;
2185
2186 // We still need to initialize the SP if we're doing anything weird that
2187 // references the SP, like variable sized stack objects.
2188 return frameTriviallyRequiresSP(MFI);
2189}
2190