1//===----------------------- SIFrameLowering.cpp --------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8
9#include "SIFrameLowering.h"
10#include "AMDGPU.h"
11#include "GCNSubtarget.h"
12#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
13#include "SIMachineFunctionInfo.h"
14#include "llvm/CodeGen/LiveRegUnits.h"
15#include "llvm/CodeGen/MachineFrameInfo.h"
16#include "llvm/CodeGen/RegisterScavenging.h"
17#include "llvm/Target/TargetMachine.h"
18
19using namespace llvm;
20
21#define DEBUG_TYPE "frame-info"
22
23static cl::opt<bool> EnableSpillVGPRToAGPR(
24 "amdgpu-spill-vgpr-to-agpr",
25 cl::desc("Enable spilling VGPRs to AGPRs"),
26 cl::ReallyHidden,
27 cl::init(Val: true));
28
29// Find a register matching \p RC from \p LiveUnits which is unused and
30// available throughout the function. On failure, returns AMDGPU::NoRegister.
31// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
32// MCRegisters. This should reduce the number of iterations and avoid redundant
33// checking.
34static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
35 const LiveRegUnits &LiveUnits,
36 const TargetRegisterClass &RC) {
37 for (MCRegister Reg : RC) {
38 if (!MRI.isPhysRegUsed(PhysReg: Reg) && LiveUnits.available(Reg) &&
39 !MRI.isReserved(PhysReg: Reg))
40 return Reg;
41 }
42 return MCRegister();
43}
44
45// Find a scratch register that we can use in the prologue. We avoid using
46// callee-save registers since they may appear to be free when this is called
47// from canUseAsPrologue (during shrink wrapping), but then no longer be free
48// when this is called from emitPrologue.
49static MCRegister findScratchNonCalleeSaveRegister(
50 MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
51 const TargetRegisterClass &RC, bool Unused = false) {
52 // Mark callee saved registers as used so we will not choose them.
53 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
54 for (unsigned i = 0; CSRegs[i]; ++i)
55 LiveUnits.addReg(Reg: CSRegs[i]);
56
57 // We are looking for a register that can be used throughout the entire
58 // function, so any use is unacceptable.
59 if (Unused)
60 return findUnusedRegister(MRI, LiveUnits, RC);
61
62 for (MCRegister Reg : RC) {
63 if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg))
64 return Reg;
65 }
66
67 return MCRegister();
68}
69
70/// Query target location for spilling SGPRs
71/// \p IncludeScratchCopy : Also look for free scratch SGPRs
72static void getVGPRSpillLaneOrTempRegister(
73 MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
74 const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
75 bool IncludeScratchCopy = true) {
76 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
77 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
78
79 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
80 const SIRegisterInfo *TRI = ST.getRegisterInfo();
81 unsigned Size = TRI->getSpillSize(RC);
82 Align Alignment = TRI->getSpillAlign(RC);
83
84 // We need to save and restore the given SGPR.
85
86 Register ScratchSGPR;
  // 1: Try to save the given register into an unused scratch SGPR. The
  // LiveUnits should have all the callee-saved registers marked as used. In
  // certain cases the copy to a scratch SGPR is skipped.
90 if (IncludeScratchCopy)
91 ScratchSGPR = findUnusedRegister(MRI&: MF.getRegInfo(), LiveUnits, RC);
92
93 if (!ScratchSGPR) {
94 int FI = FrameInfo.CreateStackObject(Size, Alignment, isSpillSlot: true, Alloca: nullptr,
95 ID: TargetStackID::SGPRSpill);
96
97 if (TRI->spillSGPRToVGPR() &&
98 MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
99 /*IsPrologEpilog=*/true)) {
      // 2: There was no free scratch SGPR to copy into, so spill the SGPR to a
      // VGPR lane instead (allocating another spill VGPR if no lane is free).
102 MFI->addToPrologEpilogSGPRSpills(
103 Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo(
104 SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
105
106 LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
107 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
108 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
109 << '\n';);
110 } else {
      // Remove the now-dead SGPR-spill frame index.
112 MF.getFrameInfo().RemoveStackObject(ObjectIdx: FI);
113 // 3: If all else fails, spill the register to memory.
114 FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
115 MFI->addToPrologEpilogSGPRSpills(
116 Reg: SGPR,
117 SI: PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
118 LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
119 << printReg(SGPR, TRI) << '\n');
120 }
121 } else {
122 MFI->addToPrologEpilogSGPRSpills(
123 Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo(
124 SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
125 LiveUnits.addReg(Reg: ScratchSGPR);
126 LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
127 << printReg(ScratchSGPR, TRI) << '\n');
128 }
129}
130
// We need to emit the stack operations specially here because the frame
// register used differs from the one getFrameRegister would return for the
// rest of the function.
134static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
135 const SIMachineFunctionInfo &FuncInfo,
136 LiveRegUnits &LiveUnits, MachineFunction &MF,
137 MachineBasicBlock &MBB,
138 MachineBasicBlock::iterator I, const DebugLoc &DL,
139 Register SpillReg, int FI, Register FrameReg,
140 int64_t DwordOff = 0) {
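  // Use a flat scratch store when available; otherwise fall back to a MUBUF
  // store addressed through the scratch RSRC.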
141 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
142 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
143
144 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
145 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
146 MachineMemOperand *MMO = MF.getMachineMemOperand(
147 PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FI),
148 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI));
149 LiveUnits.addReg(Reg: SpillReg);
150 bool IsKill = !MBB.isLiveIn(Reg: SpillReg);
151 TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: IsKill, ScratchOffsetReg: FrameReg,
152 InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits);
153 if (IsKill)
154 LiveUnits.removeReg(Reg: SpillReg);
155}
156
157static void buildEpilogRestore(const GCNSubtarget &ST,
158 const SIRegisterInfo &TRI,
159 const SIMachineFunctionInfo &FuncInfo,
160 LiveRegUnits &LiveUnits, MachineFunction &MF,
161 MachineBasicBlock &MBB,
162 MachineBasicBlock::iterator I,
163 const DebugLoc &DL, Register SpillReg, int FI,
164 Register FrameReg, int64_t DwordOff = 0) {
165 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
166 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
167
168 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
169 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
170 MachineMemOperand *MMO = MF.getMachineMemOperand(
171 PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FI),
172 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI));
173 TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: false, ScratchOffsetReg: FrameReg,
174 InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits);
175}
176
177static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
178 const DebugLoc &DL, const SIInstrInfo *TII,
179 Register TargetReg) {
180 MachineFunction *MF = MBB.getParent();
181 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
182 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
183 const MCInstrDesc &SMovB32 = TII->get(Opcode: AMDGPU::S_MOV_B32);
184 Register TargetLo = TRI->getSubReg(Reg: TargetReg, Idx: AMDGPU::sub0);
185 Register TargetHi = TRI->getSubReg(Reg: TargetReg, Idx: AMDGPU::sub1);
186
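  // The high 32 bits of the GIT pointer come either from the
  // amdgpu-git-ptr-high function attribute or from the high half of the PC.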
187 if (MFI->getGITPtrHigh() != 0xffffffff) {
188 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetHi)
189 .addImm(Val: MFI->getGITPtrHigh())
190 .addReg(RegNo: TargetReg, flags: RegState::ImplicitDefine);
191 } else {
192 const MCInstrDesc &GetPC64 = TII->get(Opcode: AMDGPU::S_GETPC_B64_pseudo);
193 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GetPC64, DestReg: TargetReg);
194 }
195 Register GitPtrLo = MFI->getGITPtrLoReg(MF: *MF);
196 MF->getRegInfo().addLiveIn(Reg: GitPtrLo);
197 MBB.addLiveIn(PhysReg: GitPtrLo);
198 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetLo)
199 .addReg(RegNo: GitPtrLo);
200}
201
202static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
203 const SIMachineFunctionInfo *FuncInfo,
204 MachineFunction &MF, MachineBasicBlock &MBB,
205 MachineBasicBlock::iterator MBBI, bool IsProlog) {
206 if (LiveUnits.empty()) {
207 LiveUnits.init(TRI);
208 if (IsProlog) {
209 LiveUnits.addLiveIns(MBB);
210 } else {
211 // In epilog.
212 LiveUnits.addLiveOuts(MBB);
213 LiveUnits.stepBackward(MI: *MBBI);
214 }
215 }
216}
217
218namespace llvm {
219
// SpillBuilder to save/restore special SGPR spills like the ones needed for
// FP, BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
224class PrologEpilogSGPRSpillBuilder {
225 MachineBasicBlock::iterator MI;
226 MachineBasicBlock &MBB;
227 MachineFunction &MF;
228 const GCNSubtarget &ST;
229 MachineFrameInfo &MFI;
230 SIMachineFunctionInfo *FuncInfo;
231 const SIInstrInfo *TII;
232 const SIRegisterInfo &TRI;
233 Register SuperReg;
234 const PrologEpilogSGPRSaveRestoreInfo SI;
235 LiveRegUnits &LiveUnits;
236 const DebugLoc &DL;
237 Register FrameReg;
238 ArrayRef<int16_t> SplitParts;
239 unsigned NumSubRegs;
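  // SGPRs are split into 32-bit (4-byte) elements for spilling.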
240 unsigned EltSize = 4;
241
242 void saveToMemory(const int FI) const {
243 MachineRegisterInfo &MRI = MF.getRegInfo();
244 assert(!MFI.isDeadObjectIndex(FI));
245
246 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ true);
247
248 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
249 MRI, LiveUnits, RC: AMDGPU::VGPR_32RegClass);
250 if (!TmpVGPR)
251 report_fatal_error(reason: "failed to find free scratch register");
252
253 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
254 Register SubReg = NumSubRegs == 1
255 ? SuperReg
256 : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));
257 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: TmpVGPR)
258 .addReg(RegNo: SubReg);
259
260 buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL, SpillReg: TmpVGPR,
261 FI, FrameReg, DwordOff);
262 DwordOff += 4;
263 }
264 }
265
266 void saveToVGPRLane(const int FI) const {
267 assert(!MFI.isDeadObjectIndex(FI));
268
269 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
270 ArrayRef<SIRegisterInfo::SpilledReg> Spill =
271 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI);
272 assert(Spill.size() == NumSubRegs);
273
274 for (unsigned I = 0; I < NumSubRegs; ++I) {
275 Register SubReg = NumSubRegs == 1
276 ? SuperReg
277 : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));
278 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_SPILL_S32_TO_VGPR),
279 DestReg: Spill[I].VGPR)
280 .addReg(RegNo: SubReg)
281 .addImm(Val: Spill[I].Lane)
282 .addReg(RegNo: Spill[I].VGPR, flags: RegState::Undef);
283 }
284 }
285
286 void copyToScratchSGPR(Register DstReg) const {
287 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: DstReg)
288 .addReg(RegNo: SuperReg)
289 .setMIFlag(MachineInstr::FrameSetup);
290 }
291
292 void restoreFromMemory(const int FI) {
293 MachineRegisterInfo &MRI = MF.getRegInfo();
294
295 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ false);
296 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
297 MRI, LiveUnits, RC: AMDGPU::VGPR_32RegClass);
298 if (!TmpVGPR)
299 report_fatal_error(reason: "failed to find free scratch register");
300
301 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
302 Register SubReg = NumSubRegs == 1
303 ? SuperReg
304 : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));
305
306 buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL,
307 SpillReg: TmpVGPR, FI, FrameReg, DwordOff);
308 MRI.constrainRegClass(Reg: SubReg, RC: &AMDGPU::SReg_32_XM0RegClass);
309 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: SubReg)
310 .addReg(RegNo: TmpVGPR, flags: RegState::Kill);
311 DwordOff += 4;
312 }
313 }
314
315 void restoreFromVGPRLane(const int FI) {
316 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
317 ArrayRef<SIRegisterInfo::SpilledReg> Spill =
318 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI);
319 assert(Spill.size() == NumSubRegs);
320
321 for (unsigned I = 0; I < NumSubRegs; ++I) {
322 Register SubReg = NumSubRegs == 1
323 ? SuperReg
324 : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));
325 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_RESTORE_S32_FROM_VGPR), DestReg: SubReg)
326 .addReg(RegNo: Spill[I].VGPR)
327 .addImm(Val: Spill[I].Lane);
328 }
329 }
330
331 void copyFromScratchSGPR(Register SrcReg) const {
332 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SuperReg)
333 .addReg(RegNo: SrcReg)
334 .setMIFlag(MachineInstr::FrameDestroy);
335 }
336
337public:
338 PrologEpilogSGPRSpillBuilder(Register Reg,
339 const PrologEpilogSGPRSaveRestoreInfo SI,
340 MachineBasicBlock &MBB,
341 MachineBasicBlock::iterator MI,
342 const DebugLoc &DL, const SIInstrInfo *TII,
343 const SIRegisterInfo &TRI,
344 LiveRegUnits &LiveUnits, Register FrameReg)
345 : MI(MI), MBB(MBB), MF(*MBB.getParent()),
346 ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
347 FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
348 SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
349 FrameReg(FrameReg) {
350 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg: SuperReg);
351 SplitParts = TRI.getRegSplitParts(RC, EltSize);
352 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
353
354 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
355 }
356
357 void save() {
358 switch (SI.getKind()) {
359 case SGPRSaveKind::SPILL_TO_MEM:
360 return saveToMemory(FI: SI.getIndex());
361 case SGPRSaveKind::SPILL_TO_VGPR_LANE:
362 return saveToVGPRLane(FI: SI.getIndex());
363 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
364 return copyToScratchSGPR(DstReg: SI.getReg());
365 }
366 }
367
368 void restore() {
369 switch (SI.getKind()) {
370 case SGPRSaveKind::SPILL_TO_MEM:
371 return restoreFromMemory(FI: SI.getIndex());
372 case SGPRSaveKind::SPILL_TO_VGPR_LANE:
373 return restoreFromVGPRLane(FI: SI.getIndex());
374 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
375 return copyFromScratchSGPR(SrcReg: SI.getReg());
376 }
377 }
378};
379
380} // namespace llvm
381
382// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
383void SIFrameLowering::emitEntryFunctionFlatScratchInit(
384 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
385 const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
386 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
387 const SIInstrInfo *TII = ST.getInstrInfo();
388 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
389 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
390
  // We don't need this if we only have spills, since there is no user-facing
  // scratch.
393
394 // TODO: If we know we don't have flat instructions earlier, we can omit
395 // this from the input registers.
396 //
397 // TODO: We only need to know if we access scratch space through a flat
398 // pointer. Because we only detect if flat instructions are used at all,
399 // this will be used more often than necessary on VI.
400
401 Register FlatScrInitLo;
402 Register FlatScrInitHi;
403
404 if (ST.isAmdPalOS()) {
405 // Extract the scratch offset from the descriptor in the GIT
406 LiveRegUnits LiveUnits;
407 LiveUnits.init(TRI: *TRI);
408 LiveUnits.addLiveIns(MBB);
409
410 // Find unused reg to load flat scratch init into
411 MachineRegisterInfo &MRI = MF.getRegInfo();
412 Register FlatScrInit = AMDGPU::NoRegister;
413 ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
414 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
415 AllSGPR64s = AllSGPR64s.slice(
416 N: std::min(a: static_cast<unsigned>(AllSGPR64s.size()), b: NumPreloaded));
417 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
418 for (MCPhysReg Reg : AllSGPR64s) {
419 if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg) &&
420 MRI.isAllocatable(PhysReg: Reg) && !TRI->isSubRegisterEq(RegA: Reg, RegB: GITPtrLoReg)) {
421 FlatScrInit = Reg;
422 break;
423 }
424 }
425 assert(FlatScrInit && "Failed to find free register for scratch init");
426
427 FlatScrInitLo = TRI->getSubReg(Reg: FlatScrInit, Idx: AMDGPU::sub0);
428 FlatScrInitHi = TRI->getSubReg(Reg: FlatScrInit, Idx: AMDGPU::sub1);
429
430 buildGitPtr(MBB, I, DL, TII, TargetReg: FlatScrInit);
431
    // Now that we have the GIT pointer, get the scratch descriptor from the
    // entry at offset 0 (or offset 16 for a compute shader).
434 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
435 const MCInstrDesc &LoadDwordX2 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX2_IMM);
436 auto *MMO = MF.getMachineMemOperand(
437 PtrInfo,
438 F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
439 MachineMemOperand::MODereferenceable,
440 Size: 8, BaseAlignment: Align(4));
441 unsigned Offset =
442 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
443 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
444 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST: Subtarget, ByteOffset: Offset);
445 BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: FlatScrInit)
446 .addReg(RegNo: FlatScrInit)
447 .addImm(Val: EncodedOffset) // offset
448 .addImm(Val: 0) // cpol
449 .addMemOperand(MMO);
450
451 // Mask the offset in [47:0] of the descriptor
452 const MCInstrDesc &SAndB32 = TII->get(Opcode: AMDGPU::S_AND_B32);
453 auto And = BuildMI(BB&: MBB, I, MIMD: DL, MCID: SAndB32, DestReg: FlatScrInitHi)
454 .addReg(RegNo: FlatScrInitHi)
455 .addImm(Val: 0xffff);
456 And->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
457 } else {
458 Register FlatScratchInitReg =
459 MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
460 assert(FlatScratchInitReg);
461
462 MachineRegisterInfo &MRI = MF.getRegInfo();
463 MRI.addLiveIn(Reg: FlatScratchInitReg);
464 MBB.addLiveIn(PhysReg: FlatScratchInitReg);
465
466 FlatScrInitLo = TRI->getSubReg(Reg: FlatScratchInitReg, Idx: AMDGPU::sub0);
467 FlatScrInitHi = TRI->getSubReg(Reg: FlatScratchInitReg, Idx: AMDGPU::sub1);
468 }
469
470 // Do a 64-bit pointer add.
471 if (ST.flatScratchIsPointer()) {
472 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
473 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: FlatScrInitLo)
474 .addReg(RegNo: FlatScrInitLo)
475 .addReg(RegNo: ScratchWaveOffsetReg);
476 auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32),
477 DestReg: FlatScrInitHi)
478 .addReg(RegNo: FlatScrInitHi)
479 .addImm(Val: 0);
480 Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
481
482 using namespace AMDGPU::Hwreg;
483 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_B32))
484 .addReg(RegNo: FlatScrInitLo)
485 .addImm(Val: int16_t(HwregEncoding::encode(Values: ID_FLAT_SCR_LO, Values: 0, Values: 32)));
486 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_B32))
487 .addReg(RegNo: FlatScrInitHi)
488 .addImm(Val: int16_t(HwregEncoding::encode(Values: ID_FLAT_SCR_HI, Values: 0, Values: 32)));
489 return;
490 }
491
492 // For GFX9.
493 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: AMDGPU::FLAT_SCR_LO)
494 .addReg(RegNo: FlatScrInitLo)
495 .addReg(RegNo: ScratchWaveOffsetReg);
496 auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32),
497 DestReg: AMDGPU::FLAT_SCR_HI)
498 .addReg(RegNo: FlatScrInitHi)
499 .addImm(Val: 0);
500 Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
501
502 return;
503 }
504
505 assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
506
507 // Copy the size in bytes.
508 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::FLAT_SCR_LO)
509 .addReg(RegNo: FlatScrInitHi, flags: RegState::Kill);
510
511 // Add wave offset in bytes to private base offset.
512 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
513 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: FlatScrInitLo)
514 .addReg(RegNo: FlatScrInitLo)
515 .addReg(RegNo: ScratchWaveOffsetReg);
516
517 // Convert offset to 256-byte units.
518 auto LShr = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_LSHR_B32),
519 DestReg: AMDGPU::FLAT_SCR_HI)
520 .addReg(RegNo: FlatScrInitLo, flags: RegState::Kill)
521 .addImm(Val: 8);
522 LShr->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
523}
524
525// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
526// memory. They should have been removed by now.
527static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
528 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
529 I != E; ++I) {
530 if (!MFI.isDeadObjectIndex(ObjectIdx: I))
531 return false;
532 }
533
534 return true;
535}
536
537// Shift down registers reserved for the scratch RSRC.
538Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
539 MachineFunction &MF) const {
540
541 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
542 const SIInstrInfo *TII = ST.getInstrInfo();
543 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
544 MachineRegisterInfo &MRI = MF.getRegInfo();
545 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
546
547 assert(MFI->isEntryFunction());
548
549 Register ScratchRsrcReg = MFI->getScratchRSrcReg();
550
551 if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(PhysReg: ScratchRsrcReg) &&
552 allStackObjectsAreDead(MFI: MF.getFrameInfo())))
553 return Register();
554
555 if (ST.hasSGPRInitBug() ||
556 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
557 return ScratchRsrcReg;
558
559 // We reserved the last registers for this. Shift it down to the end of those
560 // which were actually used.
561 //
562 // FIXME: It might be safer to use a pseudoregister before replacement.
563
564 // FIXME: We should be able to eliminate unused input registers. We only
565 // cannot do this for the resources required for scratch access. For now we
566 // skip over user SGPRs and may leave unused holes.
567
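  // Round the preloaded SGPR count up to whole 4-register tuples so the
  // candidate SGPR128s considered below do not overlap the preloaded registers.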
568 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
569 ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
570 AllSGPR128s = AllSGPR128s.slice(N: std::min(a: static_cast<unsigned>(AllSGPR128s.size()), b: NumPreloaded));
571
572 // Skip the last N reserved elements because they should have already been
573 // reserved for VCC etc.
574 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
575 for (MCPhysReg Reg : AllSGPR128s) {
576 // Pick the first unallocated one. Make sure we don't clobber the other
577 // reserved input we needed. Also for PAL, make sure we don't clobber
578 // the GIT pointer passed in SGPR0 or SGPR8.
579 if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) &&
580 (!GITPtrLoReg || !TRI->isSubRegisterEq(RegA: Reg, RegB: GITPtrLoReg))) {
581 MRI.replaceRegWith(FromReg: ScratchRsrcReg, ToReg: Reg);
582 MFI->setScratchRSrcReg(Reg);
583 MRI.reserveReg(PhysReg: Reg, TRI);
584 return Reg;
585 }
586 }
587
588 return ScratchRsrcReg;
589}
590
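// Frame and stack sizes are tracked per lane. With flat scratch the stack and
// frame pointers are plain per-lane byte offsets; with MUBUF scratch they are
// kept in per-wave units, so sizes must be scaled by the wavefront size.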
591static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
592 return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
593}
594
595void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
596 MachineBasicBlock &MBB) const {
597 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
598
599 // FIXME: If we only have SGPR spills, we won't actually be using scratch
600 // memory since these spill to VGPRs. We should be cleaning up these unused
601 // SGPR spill frame indices somewhere.
602
603 // FIXME: We still have implicit uses on SGPR spill instructions in case they
604 // need to spill to vector memory. It's likely that will not happen, but at
605 // this point it appears we need the setup. This part of the prolog should be
606 // emitted after frame indices are eliminated.
607
608 // FIXME: Remove all of the isPhysRegUsed checks
609
610 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
611 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
612 const SIInstrInfo *TII = ST.getInstrInfo();
613 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
614 MachineRegisterInfo &MRI = MF.getRegInfo();
615 const Function &F = MF.getFunction();
616 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
617
618 assert(MFI->isEntryFunction());
619
620 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
621 Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
622
623 // We need to do the replacement of the private segment buffer register even
624 // if there are no stack objects. There could be stores to undef or a
625 // constant without an associated object.
626 //
627 // This will return `Register()` in cases where there are no actual
628 // uses of the SRSRC.
629 Register ScratchRsrcReg;
630 if (!ST.enableFlatScratch())
631 ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
632
633 // Make the selected register live throughout the function.
634 if (ScratchRsrcReg) {
635 for (MachineBasicBlock &OtherBB : MF) {
636 if (&OtherBB != &MBB) {
637 OtherBB.addLiveIn(PhysReg: ScratchRsrcReg);
638 }
639 }
640 }
641
642 // Now that we have fixed the reserved SRSRC we need to locate the
643 // (potentially) preloaded SRSRC.
644 Register PreloadedScratchRsrcReg;
645 if (ST.isAmdHsaOrMesa(F)) {
646 PreloadedScratchRsrcReg =
647 MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
648 if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
649 // We added live-ins during argument lowering, but since they were not
650 // used they were deleted. We're adding the uses now, so add them back.
651 MRI.addLiveIn(Reg: PreloadedScratchRsrcReg);
652 MBB.addLiveIn(PhysReg: PreloadedScratchRsrcReg);
653 }
654 }
655
656 // Debug location must be unknown since the first debug location is used to
657 // determine the end of the prologue.
658 DebugLoc DL;
659 MachineBasicBlock::iterator I = MBB.begin();
660
  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC we found overlaps the scratch wave
  // offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
666 Register ScratchWaveOffsetReg;
667 if (PreloadedScratchWaveOffsetReg &&
668 TRI->isSubRegisterEq(RegA: ScratchRsrcReg, RegB: PreloadedScratchWaveOffsetReg)) {
669 ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
670 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
671 AllSGPRs = AllSGPRs.slice(
672 N: std::min(a: static_cast<unsigned>(AllSGPRs.size()), b: NumPreloaded));
673 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
674 for (MCPhysReg Reg : AllSGPRs) {
675 if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) &&
676 !TRI->isSubRegisterEq(RegA: ScratchRsrcReg, RegB: Reg) && GITPtrLoReg != Reg) {
677 ScratchWaveOffsetReg = Reg;
678 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: ScratchWaveOffsetReg)
679 .addReg(RegNo: PreloadedScratchWaveOffsetReg, flags: RegState::Kill);
680 break;
681 }
682 }
683
684 // FIXME: We can spill incoming arguments and restore at the end of the
685 // prolog.
686 if (!ScratchWaveOffsetReg)
687 report_fatal_error(
688 reason: "could not find temporary scratch offset register in prolog");
689 } else {
690 ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
691 }
692 assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
693
694 unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST);
695 if (!mayReserveScratchForCWSR(MF)) {
696 if (hasFP(MF)) {
697 Register FPReg = MFI->getFrameOffsetReg();
698 assert(FPReg != AMDGPU::FP_REG);
699 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: FPReg).addImm(Val: 0);
700 }
701
702 if (requiresStackPointerReference(MF)) {
703 Register SPReg = MFI->getStackPtrOffsetReg();
704 assert(SPReg != AMDGPU::SP_REG);
705 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: SPReg).addImm(Val: Offset);
706 }
707 } else {
708 // We need to check if we're on a compute queue - if we are, then the CWSR
709 // trap handler may need to store some VGPRs on the stack. The first VGPR
710 // block is saved separately, so we only need to allocate space for any
711 // additional VGPR blocks used. For now, we will make sure there's enough
712 // room for the theoretical maximum number of VGPRs that can be allocated.
713 // FIXME: Figure out if the shader uses fewer VGPRs in practice.
714 assert(hasFP(MF));
715 Register FPReg = MFI->getFrameOffsetReg();
716 assert(FPReg != AMDGPU::FP_REG);
717 unsigned VGPRSize = llvm::alignTo(
718 Size: (ST.getAddressableNumVGPRs(DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize()) -
719 AMDGPU::IsaInfo::getVGPRAllocGranule(STI: &ST,
720 DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize())) *
721 4,
722 A: FrameInfo.getMaxAlign());
723 MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
724
725 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: FPReg)
726 .addImm(Val: AMDGPU::Hwreg::HwregEncoding::encode(
727 Values: AMDGPU::Hwreg::ID_HW_ID2, Values: AMDGPU::Hwreg::OFFSET_ME_ID, Values: 2));
728 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
729 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
730 // SCC, so we need to check for 0 manually.
731 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32)).addImm(Val: 0).addReg(RegNo: FPReg);
732 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMOVK_I32), DestReg: FPReg).addImm(Val: VGPRSize);
733 if (requiresStackPointerReference(MF)) {
734 Register SPReg = MFI->getStackPtrOffsetReg();
735 assert(SPReg != AMDGPU::SP_REG);
736
737 // If at least one of the constants can be inlined, then we can use
738 // s_cselect. Otherwise, use a mov and cmovk.
739 if (AMDGPU::isInlinableLiteral32(Literal: Offset, HasInv2Pi: ST.hasInv2PiInlineImm()) ||
740 AMDGPU::isInlinableLiteral32(Literal: Offset + VGPRSize,
741 HasInv2Pi: ST.hasInv2PiInlineImm())) {
742 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: SPReg)
743 .addImm(Val: Offset + VGPRSize)
744 .addImm(Val: Offset);
745 } else {
746 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: SPReg).addImm(Val: Offset);
747 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMOVK_I32), DestReg: SPReg)
748 .addImm(Val: Offset + VGPRSize);
749 }
750 }
751 }
752
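  // Flat scratch needs to be initialized only if the kernel received the flat
  // scratch init user SGPRs and may actually address scratch: FLAT_SCR is
  // referenced directly, there are calls, or live stack objects are accessed
  // through flat scratch instructions.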
753 bool NeedsFlatScratchInit =
754 MFI->getUserSGPRInfo().hasFlatScratchInit() &&
755 (MRI.isPhysRegUsed(PhysReg: AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
756 (!allStackObjectsAreDead(MFI: FrameInfo) && ST.enableFlatScratch()));
757
758 if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
759 PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
760 MRI.addLiveIn(Reg: PreloadedScratchWaveOffsetReg);
761 MBB.addLiveIn(PhysReg: PreloadedScratchWaveOffsetReg);
762 }
763
764 if (NeedsFlatScratchInit) {
765 emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
766 }
767
768 if (ScratchRsrcReg) {
769 emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
770 PreloadedPrivateBufferReg: PreloadedScratchRsrcReg,
771 ScratchRsrcReg, ScratchWaveOffsetReg);
772 }
773}
774
775// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
776void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
777 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
778 const DebugLoc &DL, Register PreloadedScratchRsrcReg,
779 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
780
781 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
782 const SIInstrInfo *TII = ST.getInstrInfo();
783 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
784 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
785 const Function &Fn = MF.getFunction();
786
787 if (ST.isAmdPalOS()) {
788 // The pointer to the GIT is formed from the offset passed in and either
789 // the amdgpu-git-ptr-high function attribute or the top part of the PC
790 Register Rsrc01 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0_sub1);
791 Register Rsrc03 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub3);
792
793 buildGitPtr(MBB, I, DL, TII, TargetReg: Rsrc01);
794
    // Now that we have the GIT pointer, get the scratch descriptor from the
    // entry at offset 0 (or offset 16 for a compute shader).
797 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
798 const MCInstrDesc &LoadDwordX4 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX4_IMM);
799 auto *MMO = MF.getMachineMemOperand(
800 PtrInfo,
801 F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
802 MachineMemOperand::MODereferenceable,
803 Size: 16, BaseAlignment: Align(4));
804 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
805 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
806 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST: Subtarget, ByteOffset: Offset);
807 BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX4, DestReg: ScratchRsrcReg)
808 .addReg(RegNo: Rsrc01)
809 .addImm(Val: EncodedOffset) // offset
810 .addImm(Val: 0) // cpol
811 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine)
812 .addMemOperand(MMO);
813
    // The driver will always set the SRD for wave 64 (bits 118:117 of the
    // descriptor / bits 22:21 of the third sub-reg will be 0b11).
    // If the shader is actually wave32, we have to modify the
    // const_index_stride field of the descriptor's 3rd sub-reg (bits 22:21) to
    // 0b10 (stride=32). The driver does this because it can present two
    // shaders with different wave sizes (e.g. VsFs).
820 // TODO: convert to using SCRATCH instructions or multiple SRD buffers
821 if (ST.isWave32()) {
822 const MCInstrDesc &SBitsetB32 = TII->get(Opcode: AMDGPU::S_BITSET0_B32);
823 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SBitsetB32, DestReg: Rsrc03)
824 .addImm(Val: 21)
825 .addReg(RegNo: Rsrc03);
826 }
827 } else if (ST.isMesaGfxShader(F: Fn) || !PreloadedScratchRsrcReg) {
828 assert(!ST.isAmdHsaOrMesa(Fn));
829 const MCInstrDesc &SMovB32 = TII->get(Opcode: AMDGPU::S_MOV_B32);
830
831 Register Rsrc2 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub2);
832 Register Rsrc3 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub3);
833
834 // Use relocations to get the pointer, and setup the other bits manually.
835 uint64_t Rsrc23 = TII->getScratchRsrcWords23();
836
837 if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
838 Register Rsrc01 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0_sub1);
839
840 if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) {
841 const MCInstrDesc &Mov64 = TII->get(Opcode: AMDGPU::S_MOV_B64);
842
843 BuildMI(BB&: MBB, I, MIMD: DL, MCID: Mov64, DestReg: Rsrc01)
844 .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR())
845 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
846 } else {
847 const MCInstrDesc &LoadDwordX2 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX2_IMM);
848
849 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
850 auto *MMO = MF.getMachineMemOperand(
851 PtrInfo,
852 F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
853 MachineMemOperand::MODereferenceable,
854 Size: 8, BaseAlignment: Align(4));
855 BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: Rsrc01)
856 .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR())
857 .addImm(Val: 0) // offset
858 .addImm(Val: 0) // cpol
859 .addMemOperand(MMO)
860 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
861
862 MF.getRegInfo().addLiveIn(Reg: MFI->getImplicitBufferPtrUserSGPR());
863 MBB.addLiveIn(PhysReg: MFI->getImplicitBufferPtrUserSGPR());
864 }
865 } else {
866 Register Rsrc0 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0);
867 Register Rsrc1 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub1);
868
869 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc0)
870 .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD0")
871 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
872
873 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc1)
874 .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD1")
875 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
876 }
877
878 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc2)
879 .addImm(Val: Lo_32(Value: Rsrc23))
880 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
881
882 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc3)
883 .addImm(Val: Hi_32(Value: Rsrc23))
884 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
885 } else if (ST.isAmdHsaOrMesa(F: Fn)) {
886 assert(PreloadedScratchRsrcReg);
887
888 if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
889 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: ScratchRsrcReg)
890 .addReg(RegNo: PreloadedScratchRsrcReg, flags: RegState::Kill);
891 }
892 }
893
894 // Add the scratch wave offset into the scratch RSRC.
895 //
896 // We only want to update the first 48 bits, which is the base address
897 // pointer, without touching the adjacent 16 bits of flags. We know this add
898 // cannot carry-out from bit 47, otherwise the scratch allocation would be
899 // impossible to fit in the 48-bit global address space.
900 //
901 // TODO: Evaluate if it is better to just construct an SRD using the flat
902 // scratch init and some constants rather than update the one we are passed.
903 Register ScratchRsrcSub0 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0);
904 Register ScratchRsrcSub1 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub1);
905
906 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
907 // the kernel body via inreg arguments.
908 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: ScratchRsrcSub0)
909 .addReg(RegNo: ScratchRsrcSub0)
910 .addReg(RegNo: ScratchWaveOffsetReg)
911 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
912 auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32), DestReg: ScratchRsrcSub1)
913 .addReg(RegNo: ScratchRsrcSub1)
914 .addImm(Val: 0)
915 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
916 Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
917}
918
919bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
920 switch (ID) {
921 case TargetStackID::Default:
922 case TargetStackID::NoAlloc:
923 case TargetStackID::SGPRSpill:
924 return true;
925 case TargetStackID::ScalableVector:
926 case TargetStackID::WasmLocal:
927 return false;
928 }
929 llvm_unreachable("Invalid TargetStackID::Value");
930}
931
// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. Returns the saved EXEC mask.
934static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
935 MachineFunction &MF,
936 MachineBasicBlock &MBB,
937 MachineBasicBlock::iterator MBBI,
938 const DebugLoc &DL, bool IsProlog,
939 bool EnableInactiveLanes) {
940 Register ScratchExecCopy;
941 MachineRegisterInfo &MRI = MF.getRegInfo();
942 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
943 const SIInstrInfo *TII = ST.getInstrInfo();
944 const SIRegisterInfo &TRI = TII->getRegisterInfo();
945 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
946
947 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
948
949 ScratchExecCopy = findScratchNonCalleeSaveRegister(
950 MRI, LiveUnits, RC: *TRI.getWaveMaskRegClass());
951 if (!ScratchExecCopy)
952 report_fatal_error(reason: "failed to find free scratch register");
953
954 LiveUnits.addReg(Reg: ScratchExecCopy);
955
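  // With an all-ones operand, S_XOR_SAVEEXEC saves EXEC and enables only the
  // previously inactive lanes, while S_OR_SAVEEXEC enables all lanes.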
956 const unsigned SaveExecOpc =
957 ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
958 : AMDGPU::S_OR_SAVEEXEC_B32)
959 : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
960 : AMDGPU::S_OR_SAVEEXEC_B64);
961 auto SaveExec =
962 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: SaveExecOpc), DestReg: ScratchExecCopy).addImm(Val: -1);
963 SaveExec->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
964
965 return ScratchExecCopy;
966}
967
968void SIFrameLowering::emitCSRSpillStores(
969 MachineFunction &MF, MachineBasicBlock &MBB,
970 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
971 Register FrameReg, Register FramePtrRegScratchCopy) const {
972 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
973 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
974 const SIInstrInfo *TII = ST.getInstrInfo();
975 const SIRegisterInfo &TRI = TII->getRegisterInfo();
976
977 // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
978 // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
979 // might end up flipping the EXEC bits twice.
980 Register ScratchExecCopy;
981 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
982 FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs);
983 if (!WWMScratchRegs.empty())
984 ScratchExecCopy =
985 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
986 /*IsProlog*/ true, /*EnableInactiveLanes*/ true);
987
988 auto StoreWWMRegisters =
989 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
990 for (const auto &Reg : WWMRegs) {
991 Register VGPR = Reg.first;
992 int FI = Reg.second;
993 buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL,
994 SpillReg: VGPR, FI, FrameReg);
995 }
996 };
997
998 StoreWWMRegisters(WWMScratchRegs);
999 if (!WWMCalleeSavedRegs.empty()) {
1000 if (ScratchExecCopy) {
1001 unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1002 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: TRI.getExec()).addImm(Val: -1);
1003 } else {
1004 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1005 /*IsProlog*/ true,
1006 /*EnableInactiveLanes*/ false);
1007 }
1008 }
1009
1010 StoreWWMRegisters(WWMCalleeSavedRegs);
1011 if (ScratchExecCopy) {
1012 // FIXME: Split block and make terminator.
1013 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1014 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: ExecMov), DestReg: TRI.getExec())
1015 .addReg(RegNo: ScratchExecCopy, flags: RegState::Kill);
1016 LiveUnits.addReg(Reg: ScratchExecCopy);
1017 }
1018
1019 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1020
1021 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP spill:
    // Skip it if FP is saved to a scratch SGPR; that save has already been
    // emitted. Otherwise, FP has been copied into a temporary register, so
    // spill that register instead.
1026 Register Reg =
1027 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
1028 if (!Reg)
1029 continue;
1030
1031 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
1032 LiveUnits, FrameReg);
1033 SB.save();
1034 }
1035
1036 // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
1037 // such scratch registers live throughout the function.
1038 SmallVector<Register, 1> ScratchSGPRs;
1039 FuncInfo->getAllScratchSGPRCopyDstRegs(Regs&: ScratchSGPRs);
1040 if (!ScratchSGPRs.empty()) {
1041 for (MachineBasicBlock &MBB : MF) {
1042 for (MCPhysReg Reg : ScratchSGPRs)
1043 MBB.addLiveIn(PhysReg: Reg);
1044
1045 MBB.sortUniqueLiveIns();
1046 }
1047 if (!LiveUnits.empty()) {
1048 for (MCPhysReg Reg : ScratchSGPRs)
1049 LiveUnits.addReg(Reg);
1050 }
1051 }
1052}
1053
1054void SIFrameLowering::emitCSRSpillRestores(
1055 MachineFunction &MF, MachineBasicBlock &MBB,
1056 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
1057 Register FrameReg, Register FramePtrRegScratchCopy) const {
1058 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1059 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1060 const SIInstrInfo *TII = ST.getInstrInfo();
1061 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1062 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1063
1064 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP restore:
    // Skip it if FP is restored from the scratch SGPR copy. Otherwise, restore
    // the saved FP value into a temporary register; the frame pointer itself
    // is overwritten only at the end, once all other spills have been restored
    // from the current frame.
1070 Register Reg =
1071 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
1072 if (!Reg)
1073 continue;
1074
1075 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
1076 LiveUnits, FrameReg);
1077 SB.restore();
1078 }
1079
1080 // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
1081 // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
1082 // this, we might end up flipping the EXEC bits twice.
1083 Register ScratchExecCopy;
1084 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
1085 FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs);
1086 if (!WWMScratchRegs.empty())
1087 ScratchExecCopy =
1088 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1089 /*IsProlog*/ false, /*EnableInactiveLanes*/ true);
1090
1091 auto RestoreWWMRegisters =
1092 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
1093 for (const auto &Reg : WWMRegs) {
1094 Register VGPR = Reg.first;
1095 int FI = Reg.second;
1096 buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL,
1097 SpillReg: VGPR, FI, FrameReg);
1098 }
1099 };
1100
1101 RestoreWWMRegisters(WWMScratchRegs);
1102 if (!WWMCalleeSavedRegs.empty()) {
1103 if (ScratchExecCopy) {
1104 unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1105 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: TRI.getExec()).addImm(Val: -1);
1106 } else {
1107 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1108 /*IsProlog*/ false,
1109 /*EnableInactiveLanes*/ false);
1110 }
1111 }
1112
1113 RestoreWWMRegisters(WWMCalleeSavedRegs);
1114 if (ScratchExecCopy) {
1115 // FIXME: Split block and make terminator.
1116 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1117 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: ExecMov), DestReg: TRI.getExec())
1118 .addReg(RegNo: ScratchExecCopy, flags: RegState::Kill);
1119 }
1120}
1121
1122void SIFrameLowering::emitPrologue(MachineFunction &MF,
1123 MachineBasicBlock &MBB) const {
1124 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1125 if (FuncInfo->isEntryFunction()) {
1126 emitEntryFunctionPrologue(MF, MBB);
1127 return;
1128 }
1129
1130 MachineFrameInfo &MFI = MF.getFrameInfo();
1131 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1132 const SIInstrInfo *TII = ST.getInstrInfo();
1133 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1134 MachineRegisterInfo &MRI = MF.getRegInfo();
1135
1136 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1137 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1138 Register BasePtrReg =
1139 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
1140 LiveRegUnits LiveUnits;
1141
1142 MachineBasicBlock::iterator MBBI = MBB.begin();
1143 // DebugLoc must be unknown since the first instruction with DebugLoc is used
1144 // to determine the end of the prologue.
1145 DebugLoc DL;
1146
1147 if (FuncInfo->isChainFunction()) {
1148 // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
1149 // are free to set one up if they need it.
1150 bool UseSP = requiresStackPointerReference(MF);
1151 if (UseSP) {
1152 assert(StackPtrReg != AMDGPU::SP_REG);
1153
1154 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: StackPtrReg)
1155 .addImm(Val: MFI.getStackSize() * getScratchScaleFactor(ST));
1156 }
1157 }
1158
1159 bool HasFP = false;
1160 bool HasBP = false;
1161 uint32_t NumBytes = MFI.getStackSize();
1162 uint32_t RoundedSize = NumBytes;
1163
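  // Stack realignment requires a frame pointer; the realigned base address is
  // materialized in FP below.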
1164 if (TRI.hasStackRealignment(MF))
1165 HasFP = true;
1166
1167 Register FramePtrRegScratchCopy;
1168 if (!HasFP && !hasFP(MF)) {
1169 // Emit the CSR spill stores with SP base register.
1170 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
1171 FrameReg: FuncInfo->isChainFunction() ? Register() : StackPtrReg,
1172 FramePtrRegScratchCopy);
1173 } else {
1174 // CSR spill stores will use FP as base register.
1175 Register SGPRForFPSaveRestoreCopy =
1176 FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1177
1178 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
1179 if (SGPRForFPSaveRestoreCopy) {
1180 // Copy FP to the scratch register now and emit the CFI entry. It avoids
1181 // the extra FP copy needed in the other two cases when FP is spilled to
1182 // memory or to a VGPR lane.
1183 PrologEpilogSGPRSpillBuilder SB(
1184 FramePtrReg,
1185 FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(Reg: FramePtrReg), MBB, MBBI,
1186 DL, TII, TRI, LiveUnits, FramePtrReg);
1187 SB.save();
1188 LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy);
1189 } else {
1190 // Copy FP into a new scratch register so that its previous value can be
1191 // spilled after setting up the new frame.
1192 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1193 MRI, LiveUnits, RC: AMDGPU::SReg_32_XM0_XEXECRegClass);
1194 if (!FramePtrRegScratchCopy)
1195 report_fatal_error(reason: "failed to find free scratch register");
1196
1197 LiveUnits.addReg(Reg: FramePtrRegScratchCopy);
1198 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrRegScratchCopy)
1199 .addReg(RegNo: FramePtrReg);
1200 }
1201 }
1202
1203 if (HasFP) {
1204 const unsigned Alignment = MFI.getMaxAlign().value();
1205
1206 RoundedSize += Alignment;
1207 if (LiveUnits.empty()) {
1208 LiveUnits.init(TRI);
1209 LiveUnits.addLiveIns(MBB);
1210 }
1211
    // Realign the stack pointer into the frame pointer:
    //   s_add_i32 s33, s32, (Alignment - 1) * ScratchScale
    //   s_and_b32 s33, s33, -Alignment * ScratchScale
1214 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: FramePtrReg)
1215 .addReg(RegNo: StackPtrReg)
1216 .addImm(Val: (Alignment - 1) * getScratchScaleFactor(ST))
1217 .setMIFlag(MachineInstr::FrameSetup);
1218 auto And = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_AND_B32), DestReg: FramePtrReg)
1219 .addReg(RegNo: FramePtrReg, flags: RegState::Kill)
1220 .addImm(Val: -Alignment * getScratchScaleFactor(ST))
1221 .setMIFlag(MachineInstr::FrameSetup);
1222 And->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
1223 FuncInfo->setIsStackRealigned(true);
1224 } else if ((HasFP = hasFP(MF))) {
1225 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrReg)
1226 .addReg(RegNo: StackPtrReg)
1227 .setMIFlag(MachineInstr::FrameSetup);
1228 }
1229
1230 // If FP is used, emit the CSR spills with FP base register.
1231 if (HasFP) {
1232 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg,
1233 FramePtrRegScratchCopy);
1234 if (FramePtrRegScratchCopy)
1235 LiveUnits.removeReg(Reg: FramePtrRegScratchCopy);
1236 }
1237
1238 // If we need a base pointer, set it up here. It's whatever the value of
1239 // the stack pointer is at this point. Any variable size objects will be
1240 // allocated after this, so we can still use the base pointer to reference
1241 // the incoming arguments.
1242 if ((HasBP = TRI.hasBasePointer(MF))) {
1243 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: BasePtrReg)
1244 .addReg(RegNo: StackPtrReg)
1245 .setMIFlag(MachineInstr::FrameSetup);
1246 }
1247
1248 if (HasFP && RoundedSize != 0) {
1249 auto Add = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: StackPtrReg)
1250 .addReg(RegNo: StackPtrReg)
1251 .addImm(Val: RoundedSize * getScratchScaleFactor(ST))
1252 .setMIFlag(MachineInstr::FrameSetup);
1253 Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
1254 }
1255
1256 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg);
1257 (void)FPSaved;
1258 assert((!HasFP || FPSaved) &&
1259 "Needed to save FP but didn't save it anywhere");
1260
  // If we allow spilling to AGPRs, we may have saved FP but then spilled
  // everything into AGPRs instead of the stack.
1263 assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
1264 "Saved FP but didn't need it");
1265
1266 bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: BasePtrReg);
1267 (void)BPSaved;
1268 assert((!HasBP || BPSaved) &&
1269 "Needed to save BP but didn't save it anywhere");
1270
1271 assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
1272}
1273
1274void SIFrameLowering::emitEpilogue(MachineFunction &MF,
1275 MachineBasicBlock &MBB) const {
1276 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1277 if (FuncInfo->isEntryFunction())
1278 return;
1279
1280 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1281 const SIInstrInfo *TII = ST.getInstrInfo();
1282 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1283 MachineRegisterInfo &MRI = MF.getRegInfo();
1284 LiveRegUnits LiveUnits;
1285 // Get the insert location for the epilogue. If there were no terminators in
1286 // the block, get the last instruction.
1287 MachineBasicBlock::iterator MBBI = MBB.end();
1288 DebugLoc DL;
1289 if (!MBB.empty()) {
1290 MBBI = MBB.getLastNonDebugInstr();
1291 if (MBBI != MBB.end())
1292 DL = MBBI->getDebugLoc();
1293
1294 MBBI = MBB.getFirstTerminator();
1295 }
1296
1297 const MachineFrameInfo &MFI = MF.getFrameInfo();
1298 uint32_t NumBytes = MFI.getStackSize();
1299 uint32_t RoundedSize = FuncInfo->isStackRealigned()
1300 ? NumBytes + MFI.getMaxAlign().value()
1301 : NumBytes;
1302 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1303 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1304 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg);
1305
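  // Undo the stack allocation: restore SP from the base pointer if one was set
  // up, otherwise from the frame pointer.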
1306 if (RoundedSize != 0) {
1307 if (TRI.hasBasePointer(MF)) {
1308 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: StackPtrReg)
1309 .addReg(RegNo: TRI.getBaseRegister())
1310 .setMIFlag(MachineInstr::FrameDestroy);
1311 } else if (hasFP(MF)) {
1312 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: StackPtrReg)
1313 .addReg(RegNo: FramePtrReg)
1314 .setMIFlag(MachineInstr::FrameDestroy);
1315 }
1316 }
1317
1318 Register FramePtrRegScratchCopy;
1319 Register SGPRForFPSaveRestoreCopy =
1320 FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1321 if (FPSaved) {
    // CSR spill restores should use FP as the base register. If
    // SGPRForFPSaveRestoreCopy is not valid, restore the previous value of FP
    // into a new scratch register and copy it to FP later, once the other
    // registers have been restored from the current stack frame.
1326 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
1327 if (SGPRForFPSaveRestoreCopy) {
1328 LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy);
1329 } else {
1330 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1331 MRI, LiveUnits, RC: AMDGPU::SReg_32_XM0_XEXECRegClass);
1332 if (!FramePtrRegScratchCopy)
1333 report_fatal_error(reason: "failed to find free scratch register");
1334
1335 LiveUnits.addReg(Reg: FramePtrRegScratchCopy);
1336 }
1337
1338 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg,
1339 FramePtrRegScratchCopy);
1340 }
1341
1342 if (FPSaved) {
1343 // Insert the copy to restore FP.
1344 Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
1345 : FramePtrRegScratchCopy;
1346 MachineInstrBuilder MIB =
1347 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrReg)
1348 .addReg(RegNo: SrcReg);
1349 if (SGPRForFPSaveRestoreCopy)
1350 MIB.setMIFlag(MachineInstr::FrameDestroy);
1351 } else {
1352 // Insert the CSR spill restores with SP as the base register.
1353 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits,
1354 FrameReg: FuncInfo->isChainFunction() ? Register() : StackPtrReg,
1355 FramePtrRegScratchCopy);
1356 }
1357}
1358
1359#ifndef NDEBUG
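// Returns true if every SGPR-spill stack object is either dead or one of the
// prolog/epilog SGPR spills that frame lowering handles itself.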
1360static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1361 const MachineFrameInfo &MFI = MF.getFrameInfo();
1362 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1363 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1364 I != E; ++I) {
1365 if (!MFI.isDeadObjectIndex(I) &&
1366 MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1367 !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
1368 return false;
1369 }
1370 }
1371
1372 return true;
1373}
1374#endif
1375
1376StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1377 int FI,
1378 Register &FrameReg) const {
1379 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1380
1381 FrameReg = RI->getFrameRegister(MF);
1382 return StackOffset::getFixed(Fixed: MF.getFrameInfo().getObjectOffset(ObjectIdx: FI));
1383}
1384
1385void SIFrameLowering::processFunctionBeforeFrameFinalized(
1386 MachineFunction &MF,
1387 RegScavenger *RS) const {
1388 MachineFrameInfo &MFI = MF.getFrameInfo();
1389
1390 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1391 const SIInstrInfo *TII = ST.getInstrInfo();
1392 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1393 MachineRegisterInfo &MRI = MF.getRegInfo();
1394 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1395
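// Only try to replace VGPR spills with AGPR spills when the subtarget has
// AGPRs (MAI instructions), the function actually spilled VGPRs, and the
// amdgpu-spill-vgpr-to-agpr option is enabled.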
1396 const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
1397 && EnableSpillVGPRToAGPR;
1398
1399 if (SpillVGPRToAGPR) {
1400 // Track the spill frame indices handled in this pass.
1401 BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
1402 BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);
1403
1404 bool SeenDbgInstr = false;
1405
1406 for (MachineBasicBlock &MBB : MF) {
1407 for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) {
1408 int FrameIndex;
1409 if (MI.isDebugInstr())
1410 SeenDbgInstr = true;
1411
1412 if (TII->isVGPRSpill(MI)) {
1413 // Try to eliminate stack used by VGPR spills before frame
1414 // finalization.
1415 unsigned FIOp = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
1416 Name: AMDGPU::OpName::vaddr);
1417 int FI = MI.getOperand(i: FIOp).getIndex();
1418 Register VReg =
1419 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata)->getReg();
1420 if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
1421 isAGPRtoVGPR: TRI->isAGPR(MRI, Reg: VReg))) {
1422 assert(RS != nullptr);
1423 RS->enterBasicBlockEnd(MBB);
1424 RS->backward(I: std::next(x: MI.getIterator()));
1425 TRI->eliminateFrameIndex(MI, SPAdj: 0, FIOperandNum: FIOp, RS);
1426 SpillFIs.set(FI);
1427 continue;
1428 }
1429 } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
1430 TII->isLoadFromStackSlot(MI, FrameIndex))
1431 if (!MFI.isFixedObjectIndex(ObjectIdx: FrameIndex))
1432 NonVGPRSpillFIs.set(FrameIndex);
1433 }
1434 }
1435
1436 // Stack slot coloring may assign different objects to the same stack slot.
1437 // If no other object shares the slot, the VGPR-to-AGPR spill slot is dead.
1438 for (unsigned FI : SpillFIs.set_bits())
1439 if (!NonVGPRSpillFIs.test(Idx: FI))
1440 FuncInfo->setVGPRToAGPRSpillDead(FI);
1441
1442 for (MachineBasicBlock &MBB : MF) {
1443 for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
1444 MBB.addLiveIn(PhysReg: Reg);
1445
1446 for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
1447 MBB.addLiveIn(PhysReg: Reg);
1448
1449 MBB.sortUniqueLiveIns();
1450
1451 if (!SpillFIs.empty() && SeenDbgInstr) {
1452 // FIXME: The dead frame indices are replaced with a null register in the
1453 // debug value instructions. Ideally we would update them with the correct
1454 // register value, but it is unclear whether that alone describes the location.
1455 for (MachineInstr &MI : MBB) {
1456 if (MI.isDebugValue()) {
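// DBG_VALUE_LIST has its location operands starting at index 2, while a
// plain DBG_VALUE has its single location as operand 0.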
1457 uint32_t StackOperandIdx = MI.isDebugValueList() ? 2 : 0;
1458 if (MI.getOperand(i: StackOperandIdx).isFI() &&
1459 !MFI.isFixedObjectIndex(
1460 ObjectIdx: MI.getOperand(i: StackOperandIdx).getIndex()) &&
1461 SpillFIs[MI.getOperand(i: StackOperandIdx).getIndex()]) {
1462 MI.getOperand(i: StackOperandIdx)
1463 .ChangeToRegister(Reg: Register(), isDef: false /*isDef*/);
1464 }
1465 }
1466 }
1467 }
1468 }
1469 }
1470
1471 // At this point we've already allocated all spilled SGPRs to VGPRs if we
1472 // can. Any remaining SGPR spills will go to memory, so move them back to the
1473 // default stack.
1474 bool HaveSGPRToVMemSpill =
1475 FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
1476 assert(allSGPRSpillsAreDead(MF) &&
1477 "SGPR spill should have been removed in SILowerSGPRSpills");
1478
1479 // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1480 // but currently hasNonSpillStackObjects is set only from source
1481 // allocas. Stack temps produced from legalization are not counted currently.
1482 if (!allStackObjectsAreDead(MFI)) {
1483 assert(RS && "RegScavenger required if spilling");
1484
1485 // Add an emergency spill slot
1486 RS->addScavengingFrameIndex(FI: FuncInfo->getScavengeFI(MFI, TRI: *TRI));
1487
1488 // If we are spilling SGPRs to memory with a large frame, we may need a
1489 // second VGPR emergency frame index.
1490 if (HaveSGPRToVMemSpill &&
1491 allocateScavengingFrameIndexesNearIncomingSP(MF)) {
1492 RS->addScavengingFrameIndex(FI: MFI.CreateSpillStackObject(Size: 4, Alignment: Align(4)));
1493 }
1494 }
1495}
1496
1497void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
1498 MachineFunction &MF, RegScavenger *RS) const {
1499 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1500 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1501 MachineRegisterInfo &MRI = MF.getRegInfo();
1502 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1503
1504 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
1505 // On gfx908, we had initially reserved the highest available VGPR for the
1506 // AGPR copy. Now that RA is done, check whether there exists an unused VGPR
1507 // that is lower than the one reserved earlier. If one exists, use it for
1508 // the AGPR copy instead of the register reserved before RA.
1509 Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
1510 Register UnusedLowVGPR =
1511 TRI->findUnusedRegister(MRI, RC: &AMDGPU::VGPR_32RegClass, MF);
1512 if (UnusedLowVGPR && (TRI->getHWRegIndex(Reg: UnusedLowVGPR) <
1513 TRI->getHWRegIndex(Reg: VGPRForAGPRCopy))) {
1514 // Reserve this newly identified VGPR for the AGPR copy.
1515 // Reserved registers should already be frozen at this point,
1516 // so we can avoid calling MRI.freezeReservedRegs and just use
1517 // MRI.reserveReg.
1518 FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
1519 MRI.reserveReg(PhysReg: UnusedLowVGPR, TRI);
1520 }
1521 }
1522 // We initially reserved the highest available SGPR pair for long branches.
1523 // Now, after RA, shift down to a lower unused pair if one exists.
1524 Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
1525 Register UnusedLowSGPR =
1526 TRI->findUnusedRegister(MRI, RC: &AMDGPU::SGPR_64RegClass, MF);
1527 // If LongBranchReservedReg is null, then we didn't find a long branch
1528 // and never reserved a register to begin with, so there is nothing to
1529 // shift down. If UnusedLowSGPR is null, there is no available lower
1530 // register to use, so just keep the original one we set.
1531 if (LongBranchReservedReg && UnusedLowSGPR) {
1532 FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
1533 MRI.reserveReg(PhysReg: UnusedLowSGPR, TRI);
1534 }
1535}
1536
1537 // The special SGPR spills, such as the ones needed for FP, BP, or any
1538 // reserved registers, are delayed until frame lowering.
1539void SIFrameLowering::determinePrologEpilogSGPRSaves(
1540 MachineFunction &MF, BitVector &SavedVGPRs,
1541 bool NeedExecCopyReservedReg) const {
1542 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1543 MachineRegisterInfo &MRI = MF.getRegInfo();
1544 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1545 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1546 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1547 LiveRegUnits LiveUnits;
1548 LiveUnits.init(TRI: *TRI);
1549 // Initially mark callee saved registers as used so we will not choose them
1550 // while looking for scratch SGPRs.
1551 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
1552 for (unsigned I = 0; CSRegs[I]; ++I)
1553 LiveUnits.addReg(Reg: CSRegs[I]);
1554
1555 const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
1556
1557 Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
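// Preserve the EXEC copy register if it is actually needed: either a
// whole-wave spill opcode was seen, or the reserved register is already used.
// Prefer retargeting it to an unused scratch SGPR; otherwise spill it.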
1558 if (NeedExecCopyReservedReg ||
1559 (ReservedRegForExecCopy &&
1560 MRI.isPhysRegUsed(PhysReg: ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
1561 MRI.reserveReg(PhysReg: ReservedRegForExecCopy, TRI);
1562 Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
1563 if (UnusedScratchReg) {
1564 // If we found an unused scratch SGPR, reserve it for the EXEC copy;
1565 // there is no need for any spill in that case.
1566 MFI->setSGPRForEXECCopy(UnusedScratchReg);
1567 MRI.replaceRegWith(FromReg: ReservedRegForExecCopy, ToReg: UnusedScratchReg);
1568 LiveUnits.addReg(Reg: UnusedScratchReg);
1569 } else {
1570 // Needs spill.
1571 assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
1572 "Re-reserving spill slot for EXEC copy register");
1573 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: ReservedRegForExecCopy, RC,
1574 /*IncludeScratchCopy=*/false);
1575 }
1576 } else if (ReservedRegForExecCopy) {
1577 // Reset it at this point. No whole-wave copies or spills were
1578 // encountered.
1579 MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
1580 }
1581
1582 // hasFP only knows about stack objects that already exist. We're now
1583 // determining the stack slots that will be created, so we have to predict
1584 // them. Stack objects force FP usage with calls.
1585 //
1586 // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1587 // don't want to report it here.
1588 //
1589 // FIXME: Is this really hasReservedCallFrame?
1590 const bool WillHaveFP =
1591 FrameInfo.hasCalls() &&
1592 (SavedVGPRs.any() || !allStackObjectsAreDead(MFI: FrameInfo));
1593
1594 if (WillHaveFP || hasFP(MF)) {
1595 Register FramePtrReg = MFI->getFrameOffsetReg();
1596 assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
1597 "Re-reserving spill slot for FP");
1598 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: FramePtrReg);
1599 }
1600
1601 if (TRI->hasBasePointer(MF)) {
1602 Register BasePtrReg = TRI->getBaseRegister();
1603 assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
1604 "Re-reserving spill slot for BP");
1605 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: BasePtrReg);
1606 }
1607}
1608
1609// Only report VGPRs to generic code.
1610void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1611 BitVector &SavedVGPRs,
1612 RegScavenger *RS) const {
1613 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1614
1615 // If this is a function with the amdgpu_cs_chain[_preserve] calling
1616 // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
1617 // we don't need to save and restore anything.
1618 if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
1619 return;
1620
1621 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs&: SavedVGPRs, RS);
1622
1623 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1624 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1625 const SIInstrInfo *TII = ST.getInstrInfo();
1626 bool NeedExecCopyReservedReg = false;
1627
1628 MachineInstr *ReturnMI = nullptr;
1629 for (MachineBasicBlock &MBB : MF) {
1630 for (MachineInstr &MI : MBB) {
1631 // TODO: Walking through all MBBs here would be a bad heuristic. Better
1632 // handle them elsewhere.
1633 if (TII->isWWMRegSpillOpcode(Opcode: MI.getOpcode()))
1634 NeedExecCopyReservedReg = true;
1635 else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
1636 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1637 (MFI->isChainFunction() &&
1638 TII->isChainCallOpcode(Opcode: MI.getOpcode()))) {
1639 // We expect all returns to be the same size.
1640 assert(!ReturnMI ||
1641 (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
1642 count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
1643 ReturnMI = &MI;
1644 }
1645 }
1646 }
1647
1648 SmallVector<Register> SortedWWMVGPRs;
1649 for (Register Reg : MFI->getWWMReservedRegs()) {
1650 // The shift-back is needed only for the VGPRs used for SGPR spills, and
1651 // those are 32 bits wide. The SIPreAllocateWWMRegs pass can add tuples
1652 // to the WWM reserved registers.
1653 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1654 if (TRI->getRegSizeInBits(RC: *RC) != 32)
1655 continue;
1656 SortedWWMVGPRs.push_back(Elt: Reg);
1657 }
1658
1659 sort(C&: SortedWWMVGPRs, Comp: std::greater<Register>());
1660 MFI->shiftWwmVGPRsToLowestRange(MF, WWMVGPRs&: SortedWWMVGPRs, SavedVGPRs);
1661
1662 if (MFI->isEntryFunction())
1663 return;
1664
1665 // Remove any VGPRs used in the return value because these do not need to
1666 // be saved. This prevents the CSR restore from clobbering return VGPRs.
1667 if (ReturnMI) {
1668 for (auto &Op : ReturnMI->operands()) {
1669 if (Op.isReg())
1670 SavedVGPRs.reset(Idx: Op.getReg());
1671 }
1672 }
1673
1674 // Create the stack objects for WWM registers now.
1675 for (Register Reg : MFI->getWWMReservedRegs()) {
1676 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1677 MFI->allocateWWMSpill(MF, VGPR: Reg, Size: TRI->getSpillSize(RC: *RC),
1678 Alignment: TRI->getSpillAlign(RC: *RC));
1679 }
1680
1681 // Ignore the SGPRs the default implementation found.
1682 SavedVGPRs.clearBitsNotInMask(Mask: TRI->getAllVectorRegMask());
1683
1684 // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
1685 // In gfx908 there are no AGPR loads and stores, and thus spilling also
1686 // requires a temporary VGPR.
1687 if (!ST.hasGFX90AInsts())
1688 SavedVGPRs.clearBitsInMask(Mask: TRI->getAllAGPRRegMask());
1689
1690 determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
1691
1692 // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
1693 // allow the default insertion to handle them.
1694 for (auto &Reg : MFI->getWWMSpills())
1695 SavedVGPRs.reset(Idx: Reg.first);
1696}
1697
1698void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1699 BitVector &SavedRegs,
1700 RegScavenger *RS) const {
1701 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1702 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1703 if (MFI->isEntryFunction())
1704 return;
1705
1706 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1707 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1708
1709 // The SP is specifically managed and we don't want extra spills of it.
1710 SavedRegs.reset(Idx: MFI->getStackPtrOffsetReg());
1711
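// Snapshot the full save set before dropping the vector registers; the
// complete set is needed below to decide whether a frame pointer is required.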
1712 const BitVector AllSavedRegs = SavedRegs;
1713 SavedRegs.clearBitsInMask(Mask: TRI->getAllVectorRegMask());
1714
1715 // We have to anticipate introducing CSR VGPR spills, or a spill of the
1716 // caller-saved VGPR reserved for SGPR spills, since we now always create a
1717 // stack entry for it even when there are no other stack objects, and an FP
1718 // is required whenever there is a call and a stack. A VGPR is allocated for
1719 // SGPR spills whenever there are any SGPR spills, CSR or otherwise.
1720 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1721 const bool WillHaveFP =
1722 FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
1723
1724 // FP will be specially managed like SP.
1725 if (WillHaveFP || hasFP(MF))
1726 SavedRegs.reset(Idx: MFI->getFrameOffsetReg());
1727
1728 // The return-address use in the return instruction is hidden by the
1729 // SI_RETURN pseudo. Since IPRA computes actual register usage and does not
1730 // use the CSR list, clobbering of the return address by function calls
1731 // (D117243) or otherwise (D120922) is not seen by IPRA's register usage
1732 // collection. Explicitly marking it saved here ensures the return address
1733 // is saved and restored in those scenarios.
1734 const MachineRegisterInfo &MRI = MF.getRegInfo();
1735 Register RetAddrReg = TRI->getReturnAddressReg(MF);
1736 if (!MFI->isEntryFunction() &&
1737 (FrameInfo.hasCalls() || MRI.isPhysRegModified(PhysReg: RetAddrReg))) {
1738 SavedRegs.set(TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub0));
1739 SavedRegs.set(TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub1));
1740 }
1741}
1742
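// Group callee-saved VGPRs into 32-register blocks so that each block can be
// saved and restored with a single block spill instruction, and create one
// (mask-sized) stack object per block.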
1743static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
1744 const GCNSubtarget &ST,
1745 std::vector<CalleeSavedInfo> &CSI,
1746 unsigned &MinCSFrameIndex,
1747 unsigned &MaxCSFrameIndex) {
1748 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1749 MachineFrameInfo &MFI = MF.getFrameInfo();
1750 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1751
1752 assert(
1753 llvm::is_sorted(CSI,
1754 [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
1755 return A.getReg() < B.getReg();
1756 }) &&
1757 "Callee saved registers not sorted");
1758
1759 auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
1760 return !CSI.isSpilledToReg() &&
1761 TRI->getPhysRegBaseClass(Reg: CSI.getReg()) == &AMDGPU::VGPR_32RegClass &&
1762 !FuncInfo->isWWMReservedRegister(Reg: CSI.getReg());
1763 };
1764
1765 auto CSEnd = CSI.end();
1766 for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) {
1767 Register Reg = CSIt->getReg();
1768 if (!CanUseBlockOps(*CSIt))
1769 continue;
1770
1771 // Find all the regs that will fit in a 32-bit mask starting at the current
1772 // reg and build said mask. It should have a 1 for every register that's
1773 // included, with the current register as the least significant bit.
1774 uint32_t Mask = 1;
1775 CSEnd = std::remove_if(
1776 first: CSIt + 1, last: CSEnd, pred: [&](const CalleeSavedInfo &CSI) -> bool {
1777 if (CanUseBlockOps(CSI) && CSI.getReg() < Reg + 32) {
1778 Mask |= 1 << (CSI.getReg() - Reg);
1779 return true;
1780 } else {
1781 return false;
1782 }
1783 });
1784
1785 const TargetRegisterClass *BlockRegClass = TRI->getRegClassForBlockOp(MF);
1786 Register RegBlock =
1787 TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC: BlockRegClass);
1788 if (!RegBlock) {
1789 // We couldn't find a super register for the block. This can happen if
1790 // the register we started with is too high (e.g. v232 if the maximum is
1791 // v255). We therefore try to get the last register block and figure out
1792 // the mask from there.
1793 Register LastBlockStart =
1794 AMDGPU::VGPR0 + alignDown(Value: Reg - AMDGPU::VGPR0, Align: 32);
1795 RegBlock =
1796 TRI->getMatchingSuperReg(Reg: LastBlockStart, SubIdx: AMDGPU::sub0, RC: BlockRegClass);
1797 assert(RegBlock && TRI->isSubRegister(RegBlock, Reg) &&
1798 "Couldn't find super register");
1799 int RegDelta = Reg - LastBlockStart;
1800 assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta &&
1801 "Bad shift amount");
1802 Mask <<= RegDelta;
1803 }
1804
1805 FuncInfo->setMaskForVGPRBlockOps(RegisterBlock: RegBlock, Mask);
1806
1807 // The stack objects can be a bit smaller than the register block if we know
1808 // some of the high bits of Mask are 0. This may happen often with calling
1809 // conventions where the caller and callee-saved VGPRs are interleaved at
1810 // a small boundary (e.g. 8 or 16).
1811 int UnusedBits = llvm::countl_zero(Val: Mask);
1812 unsigned BlockSize = TRI->getSpillSize(RC: *BlockRegClass) - UnusedBits * 4;
1813 int FrameIdx =
1814 MFI.CreateStackObject(Size: BlockSize, Alignment: TRI->getSpillAlign(RC: *BlockRegClass),
1815 /*isSpillSlot=*/true);
1816 if ((unsigned)FrameIdx < MinCSFrameIndex)
1817 MinCSFrameIndex = FrameIdx;
1818 if ((unsigned)FrameIdx > MaxCSFrameIndex)
1819 MaxCSFrameIndex = FrameIdx;
1820
1821 CSIt->setFrameIdx(FrameIdx);
1822 CSIt->setReg(RegBlock);
1823 }
1824 CSI.erase(first: CSEnd, last: CSI.end());
1825}
1826
1827bool SIFrameLowering::assignCalleeSavedSpillSlots(
1828 MachineFunction &MF, const TargetRegisterInfo *TRI,
1829 std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
1830 unsigned &MaxCSFrameIndex) const {
1831 if (CSI.empty())
1832 return true; // Early exit if no callee saved registers are modified!
1833
1834 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1835 bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR();
1836
1837 if (UseVGPRBlocks)
1838 assignSlotsUsingVGPRBlocks(MF, ST, CSI, MinCSFrameIndex, MaxCSFrameIndex);
1839
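// If VGPR blocks were used, their stack slots have already been created
// above, so report the assignment as done and skip the default slot
// assignment; otherwise defer to the generic handling.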
1840 return assignCalleeSavedSpillSlots(MF, TRI, CSI) || UseVGPRBlocks;
1841}
1842
1843bool SIFrameLowering::assignCalleeSavedSpillSlots(
1844 MachineFunction &MF, const TargetRegisterInfo *TRI,
1845 std::vector<CalleeSavedInfo> &CSI) const {
1846 if (CSI.empty())
1847 return true; // Early exit if no callee saved registers are modified!
1848
1849 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1850 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1851 const SIRegisterInfo *RI = ST.getRegisterInfo();
1852 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1853 Register BasePtrReg = RI->getBaseRegister();
1854 Register SGPRForFPSaveRestoreCopy =
1855 FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1856 Register SGPRForBPSaveRestoreCopy =
1857 FuncInfo->getScratchSGPRCopyDstReg(Reg: BasePtrReg);
1858 if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1859 return false;
1860
1861 unsigned NumModifiedRegs = 0;
1862
1863 if (SGPRForFPSaveRestoreCopy)
1864 NumModifiedRegs++;
1865 if (SGPRForBPSaveRestoreCopy)
1866 NumModifiedRegs++;
1867
1868 for (auto &CS : CSI) {
1869 if (CS.getReg() == FramePtrReg.asMCReg() && SGPRForFPSaveRestoreCopy) {
1870 CS.setDstReg(SGPRForFPSaveRestoreCopy);
1871 if (--NumModifiedRegs)
1872 break;
1873 } else if (CS.getReg() == BasePtrReg.asMCReg() &&
1874 SGPRForBPSaveRestoreCopy) {
1875 CS.setDstReg(SGPRForBPSaveRestoreCopy);
1876 if (--NumModifiedRegs)
1877 break;
1878 }
1879 }
1880
1881 return false;
1882}
1883
1884bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1885 const MachineFunction &MF) const {
1886
1887 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1888 const MachineFrameInfo &MFI = MF.getFrameInfo();
1889 const SIInstrInfo *TII = ST.getInstrInfo();
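// The worst-case offset we may need to reach from the base register is the
// last byte of the estimated frame.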
1890 uint64_t EstStackSize = MFI.estimateStackSize(MF);
1891 uint64_t MaxOffset = EstStackSize - 1;
1892
1893 // We need the emergency stack slots to be allocated in range of the
1894 // MUBUF/flat scratch immediate offset from the base register, so assign these
1895 // first at the incoming SP position.
1896 //
1897 // TODO: We could try sorting the objects to find a hole in the first bytes
1898 // rather than allocating as close as possible. This could save a lot of
1899 // space on frames with alignment requirements.
1900 if (ST.enableFlatScratch()) {
1901 if (TII->isLegalFLATOffset(Offset: MaxOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
1902 FlatVariant: SIInstrFlags::FlatScratch))
1903 return false;
1904 } else {
1905 if (TII->isLegalMUBUFImmOffset(Imm: MaxOffset))
1906 return false;
1907 }
1908
1909 return true;
1910}
1911
1912bool SIFrameLowering::spillCalleeSavedRegisters(
1913 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
1914 ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
1915 MachineFunction *MF = MBB.getParent();
1916 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1917 if (!ST.useVGPRBlockOpsForCSR())
1918 return false;
1919
1920 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1921 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1922 const SIInstrInfo *TII = ST.getInstrInfo();
1923 SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1924
1925 const TargetRegisterClass *BlockRegClass =
1926 static_cast<const SIRegisterInfo *>(TRI)->getRegClassForBlockOp(MF: *MF);
1927 for (const CalleeSavedInfo &CS : CSI) {
1928 Register Reg = CS.getReg();
1929 if (!BlockRegClass->contains(Reg) ||
1930 !FuncInfo->hasMaskForVGPRBlockOps(RegisterBlock: Reg)) {
1931 spillCalleeSavedRegister(SaveBlock&: MBB, MI, CS, TII, TRI);
1932 continue;
1933 }
1934
1935 // Build a scratch block store.
1936 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(RegisterBlock: Reg);
1937 int FrameIndex = CS.getFrameIdx();
1938 MachinePointerInfo PtrInfo =
1939 MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1940 MachineMemOperand *MMO =
1941 MF->getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOStore,
1942 Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1943 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1944
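// The block-spill pseudo takes the VGPR block, the frame index, SP as the
// base, a zero offset, and the mask of registers within the block that
// actually need to be written.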
1945 BuildMI(BB&: MBB, I: MI, MIMD: MI->getDebugLoc(),
1946 MCID: TII->get(Opcode: AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
1947 .addReg(RegNo: Reg, flags: getKillRegState(B: false))
1948 .addFrameIndex(Idx: FrameIndex)
1949 .addReg(RegNo: MFI->getStackPtrOffsetReg())
1950 .addImm(Val: 0)
1951 .addImm(Val: Mask)
1952 .addMemOperand(MMO);
1953
1954 FuncInfo->setHasSpilledVGPRs();
1955
1956 // Add the register to the liveins. This is necessary because if any of the
1957 // VGPRs in the register block is reserved (e.g. if it's a WWM register),
1958 // then the whole block will be marked as reserved and `updateLiveness` will
1959 // skip it.
1960 MBB.addLiveIn(PhysReg: Reg);
1961 }
1962 MBB.sortUniqueLiveIns();
1963
1964 return true;
1965}
1966
1967bool SIFrameLowering::restoreCalleeSavedRegisters(
1968 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
1969 MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
1970 MachineFunction *MF = MBB.getParent();
1971 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1972 if (!ST.useVGPRBlockOpsForCSR())
1973 return false;
1974
1975 SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1976 MachineFrameInfo &MFI = MF->getFrameInfo();
1977 const SIInstrInfo *TII = ST.getInstrInfo();
1978 const SIRegisterInfo *SITRI = static_cast<const SIRegisterInfo *>(TRI);
1979 const TargetRegisterClass *BlockRegClass = SITRI->getRegClassForBlockOp(MF: *MF);
1980 for (const CalleeSavedInfo &CS : reverse(C&: CSI)) {
1981 Register Reg = CS.getReg();
1982 if (!BlockRegClass->contains(Reg) ||
1983 !FuncInfo->hasMaskForVGPRBlockOps(RegisterBlock: Reg)) {
1984 restoreCalleeSavedRegister(MBB, MI, CS, TII, TRI);
1985 continue;
1986 }
1987
1988 // Build a scratch block load.
1989 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(RegisterBlock: Reg);
1990 int FrameIndex = CS.getFrameIdx();
1991 MachinePointerInfo PtrInfo =
1992 MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1993 MachineMemOperand *MMO = MF->getMachineMemOperand(
1994 PtrInfo, F: MachineMemOperand::MOLoad, Size: MFI.getObjectSize(ObjectIdx: FrameIndex),
1995 BaseAlignment: MFI.getObjectAlign(ObjectIdx: FrameIndex));
1996
1997 auto MIB = BuildMI(BB&: MBB, I: MI, MIMD: MI->getDebugLoc(),
1998 MCID: TII->get(Opcode: AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), DestReg: Reg)
1999 .addFrameIndex(Idx: FrameIndex)
2000 .addReg(RegNo: FuncInfo->getStackPtrOffsetReg())
2001 .addImm(Val: 0)
2002 .addImm(Val: Mask)
2003 .addMemOperand(MMO);
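// Add the implicit operands needed when only part of the block is actually
// reloaded.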
2004 SITRI->addImplicitUsesForBlockCSRLoad(MIB, BlockReg: Reg);
2005
2006 // Add the register to the liveins. This is necessary because if any of the
2007 // VGPRs in the register block is reserved (e.g. if it's a WWM register),
2008 // then the whole block will be marked as reserved and `updateLiveness` will
2009 // skip it.
2010 MBB.addLiveIn(PhysReg: Reg);
2011 }
2012
2013 MBB.sortUniqueLiveIns();
2014 return true;
2015}
2016
2017MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
2018 MachineFunction &MF,
2019 MachineBasicBlock &MBB,
2020 MachineBasicBlock::iterator I) const {
2021 int64_t Amount = I->getOperand(i: 0).getImm();
2022 if (Amount == 0)
2023 return MBB.erase(I);
2024
2025 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2026 const SIInstrInfo *TII = ST.getInstrInfo();
2027 const DebugLoc &DL = I->getDebugLoc();
2028 unsigned Opc = I->getOpcode();
2029 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
2030 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(i: 1).getImm() : 0;
2031
2032 if (!hasReservedCallFrame(MF)) {
2033 Amount = alignTo(Size: Amount, A: getStackAlign());
2034 assert(isUInt<32>(Amount) && "exceeded stack address space size");
2035 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2036 Register SPReg = MFI->getStackPtrOffsetReg();
2037
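// The stack pointer is tracked in scratch units; scale the byte amount
// accordingly (by the wavefront size when flat scratch is not in use).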
2038 Amount *= getScratchScaleFactor(ST);
2039 if (IsDestroy)
2040 Amount = -Amount;
2041 auto Add = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SPReg)
2042 .addReg(RegNo: SPReg)
2043 .addImm(Val: Amount);
2044 Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
2045 } else if (CalleePopAmount != 0) {
2046 llvm_unreachable("is this used?");
2047 }
2048
2049 return MBB.erase(I);
2050}
2051
2052/// Returns true if the frame will require a reference to the stack pointer.
2053///
2054/// This is the set of conditions common to setting up the stack pointer in a
2055/// kernel, and for using a frame pointer in a callable function.
2056///
2057/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
2058/// references SP.
2059static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
2060 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
2061}
2062
2063 // The FP for kernels is always known to be 0, so we never really need to set
2064 // up an explicit register for it. However, DisableFramePointerElim will force
2065 // us to use a register for it.
2066bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
2067 const MachineFrameInfo &MFI = MF.getFrameInfo();
2068
2069 // For entry & chain functions we can use an immediate offset in most cases,
2070 // so the presence of calls doesn't imply we need a distinct frame pointer.
2071 if (MFI.hasCalls() &&
2072 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
2073 !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
2074 // All offsets are unsigned, so they need to be addressed in the same
2075 // direction as stack growth.
2076
2077 // FIXME: This function is pretty broken, since it can be called before the
2078 // frame layout is determined or CSR spills are inserted.
2079 return MFI.getStackSize() != 0;
2080 }
2081
2082 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
2083 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
2084 MF) ||
2085 mayReserveScratchForCWSR(MF) ||
2086 MF.getTarget().Options.DisableFramePointerElim(MF);
2087}
2088
2089bool SIFrameLowering::mayReserveScratchForCWSR(
2090 const MachineFunction &MF) const {
2091 return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
2092 AMDGPU::isEntryFunctionCC(CC: MF.getFunction().getCallingConv()) &&
2093 AMDGPU::isCompute(CC: MF.getFunction().getCallingConv());
2094}
2095
2096 // This is essentially a reduced version of hasFP for entry functions. Since
2097 // the stack pointer is known to be 0 on entry to kernels, we never really
2098 // need an FP register. We may need to initialize the stack pointer depending
2099 // on the frame properties, which logically overlaps many of the cases where
2100 // an ordinary function would require an FP.
2101// Also used for chain functions. While not technically entry functions, chain
2102// functions may need to set up a stack pointer in some situations.
2103bool SIFrameLowering::requiresStackPointerReference(
2104 const MachineFunction &MF) const {
2105 // Callable functions always require a stack pointer reference.
2106 assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
2107 MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
2108 "only expected to call this for entry points and chain functions");
2109
2110 const MachineFrameInfo &MFI = MF.getFrameInfo();
2111
2112 // Entry points ordinarily don't need to initialize SP. We have to set it up
2113 // for callees if there are any. Also note tail calls are impossible/don't
2114 // make any sense for kernels.
2115 if (MFI.hasCalls())
2116 return true;
2117
2118 // We still need to initialize the SP if we're doing anything weird that
2119 // references the SP, like variable sized stack objects.
2120 return frameTriviallyRequiresSP(MFI);
2121}
2122