1//===----------------------- SIFrameLowering.cpp --------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8
9#include "SIFrameLowering.h"
10#include "AMDGPU.h"
11#include "GCNSubtarget.h"
12#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
13#include "SIMachineFunctionInfo.h"
14#include "llvm/CodeGen/LiveRegUnits.h"
15#include "llvm/CodeGen/MachineFrameInfo.h"
16#include "llvm/CodeGen/RegisterScavenging.h"
17#include "llvm/Target/TargetMachine.h"
18
19using namespace llvm;
20
21#define DEBUG_TYPE "frame-info"
22
23static cl::opt<bool> EnableSpillVGPRToAGPR(
24 "amdgpu-spill-vgpr-to-agpr",
25 cl::desc("Enable spilling VGPRs to AGPRs"),
26 cl::ReallyHidden,
27 cl::init(Val: true));
28
29// Find a register matching \p RC from \p LiveUnits which is unused and
30// available throughout the function. On failure, returns AMDGPU::NoRegister.
31// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
32// MCRegisters. This should reduce the number of iterations and avoid redundant
33// checking.
34static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
35 const LiveRegUnits &LiveUnits,
36 const TargetRegisterClass &RC) {
37 for (MCRegister Reg : RC) {
38 if (!MRI.isPhysRegUsed(PhysReg: Reg) && LiveUnits.available(Reg) &&
39 !MRI.isReserved(PhysReg: Reg))
40 return Reg;
41 }
42 return MCRegister();
43}
44
45// Find a scratch register that we can use in the prologue. We avoid using
46// callee-save registers since they may appear to be free when this is called
47// from canUseAsPrologue (during shrink wrapping), but then no longer be free
48// when this is called from emitPrologue.
49static MCRegister findScratchNonCalleeSaveRegister(
50 MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
51 const TargetRegisterClass &RC, bool Unused = false) {
52 // Mark callee saved registers as used so we will not choose them.
53 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
54 for (unsigned i = 0; CSRegs[i]; ++i)
55 LiveUnits.addReg(Reg: CSRegs[i]);
56
57 // We are looking for a register that can be used throughout the entire
58 // function, so any use is unacceptable.
59 if (Unused)
60 return findUnusedRegister(MRI, LiveUnits, RC);
61
62 for (MCRegister Reg : RC) {
63 if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg))
64 return Reg;
65 }
66
67 return MCRegister();
68}
69
70/// Query target location for spilling SGPRs
71/// \p IncludeScratchCopy : Also look for free scratch SGPRs
72static void getVGPRSpillLaneOrTempRegister(
73 MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
74 const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
75 bool IncludeScratchCopy = true) {
76 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
77 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
78
79 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
80 const SIRegisterInfo *TRI = ST.getRegisterInfo();
81 unsigned Size = TRI->getSpillSize(RC);
82 Align Alignment = TRI->getSpillAlign(RC);
83
84 // We need to save and restore the given SGPR.
85
86 Register ScratchSGPR;
  // 1: Try to save the given register into an unused scratch SGPR. The
  // LiveUnits should have all the callee-saved registers marked as used. In
  // certain cases the copy to a scratch SGPR is skipped.
90 if (IncludeScratchCopy)
91 ScratchSGPR = findUnusedRegister(MRI&: MF.getRegInfo(), LiveUnits, RC);
92
93 if (!ScratchSGPR) {
94 int FI = FrameInfo.CreateStackObject(Size, Alignment, isSpillSlot: true, Alloca: nullptr,
95 ID: TargetStackID::SGPRSpill);
96
97 if (TRI->spillSGPRToVGPR() &&
98 MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
99 /*IsPrologEpilog=*/true)) {
      // 2: There was no free scratch SGPR to copy into, so spill the SGPR to a
      // VGPR lane instead (allocating another spill VGPR if no lane is free).
102 MFI->addToPrologEpilogSGPRSpills(
103 Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo(
104 SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
105
106 LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
107 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
108 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
109 << '\n';);
110 } else {
      // Remove the now-dead SGPR-spill frame index.
112 MF.getFrameInfo().RemoveStackObject(ObjectIdx: FI);
113 // 3: If all else fails, spill the register to memory.
114 FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
115 MFI->addToPrologEpilogSGPRSpills(
116 Reg: SGPR,
117 SI: PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
118 LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
119 << printReg(SGPR, TRI) << '\n');
120 }
121 } else {
122 MFI->addToPrologEpilogSGPRSpills(
123 Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo(
124 SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
125 LiveUnits.addReg(Reg: ScratchSGPR);
126 LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
127 << printReg(ScratchSGPR, TRI) << '\n');
128 }
129}
130
// We need to emit the stack operations specially here because the frame
// register used differs from the one getFrameRegister would return for the
// rest of the function.
134static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
135 const SIMachineFunctionInfo &FuncInfo,
136 LiveRegUnits &LiveUnits, MachineFunction &MF,
137 MachineBasicBlock &MBB,
138 MachineBasicBlock::iterator I, const DebugLoc &DL,
139 Register SpillReg, int FI, Register FrameReg,
140 int64_t DwordOff = 0) {
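  // Use a flat scratch store when available; otherwise fall back to a MUBUF
  // store addressed through the scratch RSRC.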
141 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
142 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
143
144 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
145 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
146 MachineMemOperand *MMO = MF.getMachineMemOperand(
147 PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FI),
148 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI));
149 LiveUnits.addReg(Reg: SpillReg);
150 bool IsKill = !MBB.isLiveIn(Reg: SpillReg);
151 TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: IsKill, ScratchOffsetReg: FrameReg,
152 InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits);
153 if (IsKill)
154 LiveUnits.removeReg(Reg: SpillReg);
155}
156
157static void buildEpilogRestore(const GCNSubtarget &ST,
158 const SIRegisterInfo &TRI,
159 const SIMachineFunctionInfo &FuncInfo,
160 LiveRegUnits &LiveUnits, MachineFunction &MF,
161 MachineBasicBlock &MBB,
162 MachineBasicBlock::iterator I,
163 const DebugLoc &DL, Register SpillReg, int FI,
164 Register FrameReg, int64_t DwordOff = 0) {
165 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
166 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
167
168 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
169 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
170 MachineMemOperand *MMO = MF.getMachineMemOperand(
171 PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FI),
172 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI));
173 TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: false, ScratchOffsetReg: FrameReg,
174 InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits);
175}
176
177static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
178 const DebugLoc &DL, const SIInstrInfo *TII,
179 Register TargetReg) {
180 MachineFunction *MF = MBB.getParent();
181 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
182 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
183 const MCInstrDesc &SMovB32 = TII->get(Opcode: AMDGPU::S_MOV_B32);
184 Register TargetLo = TRI->getSubReg(Reg: TargetReg, Idx: AMDGPU::sub0);
185 Register TargetHi = TRI->getSubReg(Reg: TargetReg, Idx: AMDGPU::sub1);
186
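  // The high 32 bits of the GIT pointer come either from the
  // amdgpu-git-ptr-high function attribute or from the high half of the PC.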
187 if (MFI->getGITPtrHigh() != 0xffffffff) {
188 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetHi)
189 .addImm(Val: MFI->getGITPtrHigh())
190 .addReg(RegNo: TargetReg, flags: RegState::ImplicitDefine);
191 } else {
192 const MCInstrDesc &GetPC64 = TII->get(Opcode: AMDGPU::S_GETPC_B64_pseudo);
193 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GetPC64, DestReg: TargetReg);
194 }
195 Register GitPtrLo = MFI->getGITPtrLoReg(MF: *MF);
196 MF->getRegInfo().addLiveIn(Reg: GitPtrLo);
197 MBB.addLiveIn(PhysReg: GitPtrLo);
198 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetLo)
199 .addReg(RegNo: GitPtrLo);
200}
201
202static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
203 const SIMachineFunctionInfo *FuncInfo,
204 MachineFunction &MF, MachineBasicBlock &MBB,
205 MachineBasicBlock::iterator MBBI, bool IsProlog) {
206 if (LiveUnits.empty()) {
207 LiveUnits.init(TRI);
208 if (IsProlog) {
209 LiveUnits.addLiveIns(MBB);
210 } else {
211 // In epilog.
212 LiveUnits.addLiveOuts(MBB);
213 LiveUnits.stepBackward(MI: *MBBI);
214 }
215 }
216}
217
218namespace llvm {
219
// SpillBuilder to save/restore special SGPR spills like the ones needed for
// FP, BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
224class PrologEpilogSGPRSpillBuilder {
225 MachineBasicBlock::iterator MI;
226 MachineBasicBlock &MBB;
227 MachineFunction &MF;
228 const GCNSubtarget &ST;
229 MachineFrameInfo &MFI;
230 SIMachineFunctionInfo *FuncInfo;
231 const SIInstrInfo *TII;
232 const SIRegisterInfo &TRI;
233 Register SuperReg;
234 const PrologEpilogSGPRSaveRestoreInfo SI;
235 LiveRegUnits &LiveUnits;
236 const DebugLoc &DL;
237 Register FrameReg;
238 ArrayRef<int16_t> SplitParts;
239 unsigned NumSubRegs;
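  // SGPRs are split into 32-bit (4-byte) elements for spilling.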
240 unsigned EltSize = 4;
241
242 void saveToMemory(const int FI) const {
243 MachineRegisterInfo &MRI = MF.getRegInfo();
244 assert(!MFI.isDeadObjectIndex(FI));
245
246 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ true);
247
248 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
249 MRI, LiveUnits, RC: AMDGPU::VGPR_32RegClass);
250 if (!TmpVGPR)
251 report_fatal_error(reason: "failed to find free scratch register");
252
253 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
254 Register SubReg = NumSubRegs == 1
255 ? SuperReg
256 : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));
257 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: TmpVGPR)
258 .addReg(RegNo: SubReg);
259
260 buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL, SpillReg: TmpVGPR,
261 FI, FrameReg, DwordOff);
262 DwordOff += 4;
263 }
264 }
265
266 void saveToVGPRLane(const int FI) const {
267 assert(!MFI.isDeadObjectIndex(FI));
268
269 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
270 ArrayRef<SIRegisterInfo::SpilledReg> Spill =
271 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI);
272 assert(Spill.size() == NumSubRegs);
273
274 for (unsigned I = 0; I < NumSubRegs; ++I) {
275 Register SubReg = NumSubRegs == 1
276 ? SuperReg
277 : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));
278 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_SPILL_S32_TO_VGPR),
279 DestReg: Spill[I].VGPR)
280 .addReg(RegNo: SubReg)
281 .addImm(Val: Spill[I].Lane)
282 .addReg(RegNo: Spill[I].VGPR, flags: RegState::Undef);
283 }
284 }
285
286 void copyToScratchSGPR(Register DstReg) const {
287 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: DstReg)
288 .addReg(RegNo: SuperReg)
289 .setMIFlag(MachineInstr::FrameSetup);
290 }
291
292 void restoreFromMemory(const int FI) {
293 MachineRegisterInfo &MRI = MF.getRegInfo();
294
295 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ false);
296 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
297 MRI, LiveUnits, RC: AMDGPU::VGPR_32RegClass);
298 if (!TmpVGPR)
299 report_fatal_error(reason: "failed to find free scratch register");
300
301 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
302 Register SubReg = NumSubRegs == 1
303 ? SuperReg
304 : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));
305
306 buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL,
307 SpillReg: TmpVGPR, FI, FrameReg, DwordOff);
308 MRI.constrainRegClass(Reg: SubReg, RC: &AMDGPU::SReg_32_XM0RegClass);
309 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: SubReg)
310 .addReg(RegNo: TmpVGPR, flags: RegState::Kill);
311 DwordOff += 4;
312 }
313 }
314
315 void restoreFromVGPRLane(const int FI) {
316 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
317 ArrayRef<SIRegisterInfo::SpilledReg> Spill =
318 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI);
319 assert(Spill.size() == NumSubRegs);
320
321 for (unsigned I = 0; I < NumSubRegs; ++I) {
322 Register SubReg = NumSubRegs == 1
323 ? SuperReg
324 : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I]));
325 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_RESTORE_S32_FROM_VGPR), DestReg: SubReg)
326 .addReg(RegNo: Spill[I].VGPR)
327 .addImm(Val: Spill[I].Lane);
328 }
329 }
330
331 void copyFromScratchSGPR(Register SrcReg) const {
332 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SuperReg)
333 .addReg(RegNo: SrcReg)
334 .setMIFlag(MachineInstr::FrameDestroy);
335 }
336
337public:
338 PrologEpilogSGPRSpillBuilder(Register Reg,
339 const PrologEpilogSGPRSaveRestoreInfo SI,
340 MachineBasicBlock &MBB,
341 MachineBasicBlock::iterator MI,
342 const DebugLoc &DL, const SIInstrInfo *TII,
343 const SIRegisterInfo &TRI,
344 LiveRegUnits &LiveUnits, Register FrameReg)
345 : MI(MI), MBB(MBB), MF(*MBB.getParent()),
346 ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
347 FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
348 SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
349 FrameReg(FrameReg) {
350 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg: SuperReg);
351 SplitParts = TRI.getRegSplitParts(RC, EltSize);
352 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
353
354 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
355 }
356
357 void save() {
358 switch (SI.getKind()) {
359 case SGPRSaveKind::SPILL_TO_MEM:
360 return saveToMemory(FI: SI.getIndex());
361 case SGPRSaveKind::SPILL_TO_VGPR_LANE:
362 return saveToVGPRLane(FI: SI.getIndex());
363 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
364 return copyToScratchSGPR(DstReg: SI.getReg());
365 }
366 }
367
368 void restore() {
369 switch (SI.getKind()) {
370 case SGPRSaveKind::SPILL_TO_MEM:
371 return restoreFromMemory(FI: SI.getIndex());
372 case SGPRSaveKind::SPILL_TO_VGPR_LANE:
373 return restoreFromVGPRLane(FI: SI.getIndex());
374 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
375 return copyFromScratchSGPR(SrcReg: SI.getReg());
376 }
377 }
378};
379
380} // namespace llvm
381
382// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
383void SIFrameLowering::emitEntryFunctionFlatScratchInit(
384 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
385 const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
386 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
387 const SIInstrInfo *TII = ST.getInstrInfo();
388 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
389 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
390
  // We don't need this if we only have spills, since there is no user-facing
  // scratch.
393
394 // TODO: If we know we don't have flat instructions earlier, we can omit
395 // this from the input registers.
396 //
397 // TODO: We only need to know if we access scratch space through a flat
398 // pointer. Because we only detect if flat instructions are used at all,
399 // this will be used more often than necessary on VI.
400
401 Register FlatScrInitLo;
402 Register FlatScrInitHi;
403
404 if (ST.isAmdPalOS()) {
405 // Extract the scratch offset from the descriptor in the GIT
406 LiveRegUnits LiveUnits;
407 LiveUnits.init(TRI: *TRI);
408 LiveUnits.addLiveIns(MBB);
409
410 // Find unused reg to load flat scratch init into
411 MachineRegisterInfo &MRI = MF.getRegInfo();
412 Register FlatScrInit = AMDGPU::NoRegister;
413 ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
414 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
415 AllSGPR64s = AllSGPR64s.slice(
416 N: std::min(a: static_cast<unsigned>(AllSGPR64s.size()), b: NumPreloaded));
417 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
418 for (MCPhysReg Reg : AllSGPR64s) {
419 if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg) &&
420 MRI.isAllocatable(PhysReg: Reg) && !TRI->isSubRegisterEq(RegA: Reg, RegB: GITPtrLoReg)) {
421 FlatScrInit = Reg;
422 break;
423 }
424 }
425 assert(FlatScrInit && "Failed to find free register for scratch init");
426
427 FlatScrInitLo = TRI->getSubReg(Reg: FlatScrInit, Idx: AMDGPU::sub0);
428 FlatScrInitHi = TRI->getSubReg(Reg: FlatScrInit, Idx: AMDGPU::sub1);
429
430 buildGitPtr(MBB, I, DL, TII, TargetReg: FlatScrInit);
431
    // Now that we have the GIT pointer, get the scratch descriptor from the
    // entry at offset 0 (or offset 16 for a compute shader).
434 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
435 const MCInstrDesc &LoadDwordX2 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX2_IMM);
436 auto *MMO = MF.getMachineMemOperand(
437 PtrInfo,
438 F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
439 MachineMemOperand::MODereferenceable,
440 Size: 8, BaseAlignment: Align(4));
441 unsigned Offset =
442 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
443 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
444 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST: Subtarget, ByteOffset: Offset);
445 BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: FlatScrInit)
446 .addReg(RegNo: FlatScrInit)
447 .addImm(Val: EncodedOffset) // offset
448 .addImm(Val: 0) // cpol
449 .addMemOperand(MMO);
450
451 // Mask the offset in [47:0] of the descriptor
452 const MCInstrDesc &SAndB32 = TII->get(Opcode: AMDGPU::S_AND_B32);
453 auto And = BuildMI(BB&: MBB, I, MIMD: DL, MCID: SAndB32, DestReg: FlatScrInitHi)
454 .addReg(RegNo: FlatScrInitHi)
455 .addImm(Val: 0xffff);
456 And->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
457 } else {
458 Register FlatScratchInitReg =
459 MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
460 assert(FlatScratchInitReg);
461
462 MachineRegisterInfo &MRI = MF.getRegInfo();
463 MRI.addLiveIn(Reg: FlatScratchInitReg);
464 MBB.addLiveIn(PhysReg: FlatScratchInitReg);
465
466 FlatScrInitLo = TRI->getSubReg(Reg: FlatScratchInitReg, Idx: AMDGPU::sub0);
467 FlatScrInitHi = TRI->getSubReg(Reg: FlatScratchInitReg, Idx: AMDGPU::sub1);
468 }
469
470 // Do a 64-bit pointer add.
471 if (ST.flatScratchIsPointer()) {
472 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
473 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: FlatScrInitLo)
474 .addReg(RegNo: FlatScrInitLo)
475 .addReg(RegNo: ScratchWaveOffsetReg);
476 auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32),
477 DestReg: FlatScrInitHi)
478 .addReg(RegNo: FlatScrInitHi)
479 .addImm(Val: 0);
480 Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
481
482 using namespace AMDGPU::Hwreg;
483 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_B32))
484 .addReg(RegNo: FlatScrInitLo)
485 .addImm(Val: int16_t(HwregEncoding::encode(Values: ID_FLAT_SCR_LO, Values: 0, Values: 32)));
486 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_B32))
487 .addReg(RegNo: FlatScrInitHi)
488 .addImm(Val: int16_t(HwregEncoding::encode(Values: ID_FLAT_SCR_HI, Values: 0, Values: 32)));
489 return;
490 }
491
492 // For GFX9.
493 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: AMDGPU::FLAT_SCR_LO)
494 .addReg(RegNo: FlatScrInitLo)
495 .addReg(RegNo: ScratchWaveOffsetReg);
496 auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32),
497 DestReg: AMDGPU::FLAT_SCR_HI)
498 .addReg(RegNo: FlatScrInitHi)
499 .addImm(Val: 0);
500 Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
501
502 return;
503 }
504
505 assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
506
507 // Copy the size in bytes.
508 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::FLAT_SCR_LO)
509 .addReg(RegNo: FlatScrInitHi, flags: RegState::Kill);
510
511 // Add wave offset in bytes to private base offset.
512 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
513 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: FlatScrInitLo)
514 .addReg(RegNo: FlatScrInitLo)
515 .addReg(RegNo: ScratchWaveOffsetReg);
516
517 // Convert offset to 256-byte units.
518 auto LShr = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_LSHR_B32),
519 DestReg: AMDGPU::FLAT_SCR_HI)
520 .addReg(RegNo: FlatScrInitLo, flags: RegState::Kill)
521 .addImm(Val: 8);
522 LShr->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
523}
524
525// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
526// memory. They should have been removed by now.
527static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
528 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
529 I != E; ++I) {
530 if (!MFI.isDeadObjectIndex(ObjectIdx: I))
531 return false;
532 }
533
534 return true;
535}
536
537// Shift down registers reserved for the scratch RSRC.
538Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
539 MachineFunction &MF) const {
540
541 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
542 const SIInstrInfo *TII = ST.getInstrInfo();
543 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
544 MachineRegisterInfo &MRI = MF.getRegInfo();
545 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
546
547 assert(MFI->isEntryFunction());
548
549 Register ScratchRsrcReg = MFI->getScratchRSrcReg();
550
551 if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(PhysReg: ScratchRsrcReg) &&
552 allStackObjectsAreDead(MFI: MF.getFrameInfo())))
553 return Register();
554
555 if (ST.hasSGPRInitBug() ||
556 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
557 return ScratchRsrcReg;
558
559 // We reserved the last registers for this. Shift it down to the end of those
560 // which were actually used.
561 //
562 // FIXME: It might be safer to use a pseudoregister before replacement.
563
564 // FIXME: We should be able to eliminate unused input registers. We only
565 // cannot do this for the resources required for scratch access. For now we
566 // skip over user SGPRs and may leave unused holes.
567
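  // Round the preloaded SGPR count up to whole 4-register tuples so the
  // candidate SGPR128s considered below do not overlap the preloaded registers.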
568 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
569 ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
570 AllSGPR128s = AllSGPR128s.slice(N: std::min(a: static_cast<unsigned>(AllSGPR128s.size()), b: NumPreloaded));
571
572 // Skip the last N reserved elements because they should have already been
573 // reserved for VCC etc.
574 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
575 for (MCPhysReg Reg : AllSGPR128s) {
576 // Pick the first unallocated one. Make sure we don't clobber the other
577 // reserved input we needed. Also for PAL, make sure we don't clobber
578 // the GIT pointer passed in SGPR0 or SGPR8.
579 if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) &&
580 (!GITPtrLoReg || !TRI->isSubRegisterEq(RegA: Reg, RegB: GITPtrLoReg))) {
581 MRI.replaceRegWith(FromReg: ScratchRsrcReg, ToReg: Reg);
582 MFI->setScratchRSrcReg(Reg);
583 MRI.reserveReg(PhysReg: Reg, TRI);
584 return Reg;
585 }
586 }
587
588 return ScratchRsrcReg;
589}
590
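// Frame and stack sizes are tracked per lane. With flat scratch the stack and
// frame pointers are plain per-lane byte offsets; with MUBUF scratch they are
// kept in per-wave units, so sizes must be scaled by the wavefront size.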
591static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
592 return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
593}
594
595void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
596 MachineBasicBlock &MBB) const {
597 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
598
599 // FIXME: If we only have SGPR spills, we won't actually be using scratch
600 // memory since these spill to VGPRs. We should be cleaning up these unused
601 // SGPR spill frame indices somewhere.
602
603 // FIXME: We still have implicit uses on SGPR spill instructions in case they
604 // need to spill to vector memory. It's likely that will not happen, but at
605 // this point it appears we need the setup. This part of the prolog should be
606 // emitted after frame indices are eliminated.
607
608 // FIXME: Remove all of the isPhysRegUsed checks
609
610 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
611 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
612 const SIInstrInfo *TII = ST.getInstrInfo();
613 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
614 MachineRegisterInfo &MRI = MF.getRegInfo();
615 const Function &F = MF.getFunction();
616 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
617
618 assert(MFI->isEntryFunction());
619
620 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
621 Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
622
623 // We need to do the replacement of the private segment buffer register even
624 // if there are no stack objects. There could be stores to undef or a
625 // constant without an associated object.
626 //
627 // This will return `Register()` in cases where there are no actual
628 // uses of the SRSRC.
629 Register ScratchRsrcReg;
630 if (!ST.enableFlatScratch())
631 ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
632
633 // Make the selected register live throughout the function.
634 if (ScratchRsrcReg) {
635 for (MachineBasicBlock &OtherBB : MF) {
636 if (&OtherBB != &MBB) {
637 OtherBB.addLiveIn(PhysReg: ScratchRsrcReg);
638 }
639 }
640 }
641
642 // Now that we have fixed the reserved SRSRC we need to locate the
643 // (potentially) preloaded SRSRC.
644 Register PreloadedScratchRsrcReg;
645 if (ST.isAmdHsaOrMesa(F)) {
646 PreloadedScratchRsrcReg =
647 MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
648 if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
649 // We added live-ins during argument lowering, but since they were not
650 // used they were deleted. We're adding the uses now, so add them back.
651 MRI.addLiveIn(Reg: PreloadedScratchRsrcReg);
652 MBB.addLiveIn(PhysReg: PreloadedScratchRsrcReg);
653 }
654 }
655
656 // Debug location must be unknown since the first debug location is used to
657 // determine the end of the prologue.
658 DebugLoc DL;
659 MachineBasicBlock::iterator I = MBB.begin();
660
  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC we found overlaps the scratch wave
  // offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
666 Register ScratchWaveOffsetReg;
667 if (PreloadedScratchWaveOffsetReg &&
668 TRI->isSubRegisterEq(RegA: ScratchRsrcReg, RegB: PreloadedScratchWaveOffsetReg)) {
669 ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
670 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
671 AllSGPRs = AllSGPRs.slice(
672 N: std::min(a: static_cast<unsigned>(AllSGPRs.size()), b: NumPreloaded));
673 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
674 for (MCPhysReg Reg : AllSGPRs) {
675 if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) &&
676 !TRI->isSubRegisterEq(RegA: ScratchRsrcReg, RegB: Reg) && GITPtrLoReg != Reg) {
677 ScratchWaveOffsetReg = Reg;
678 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: ScratchWaveOffsetReg)
679 .addReg(RegNo: PreloadedScratchWaveOffsetReg, flags: RegState::Kill);
680 break;
681 }
682 }
683
684 // FIXME: We can spill incoming arguments and restore at the end of the
685 // prolog.
686 if (!ScratchWaveOffsetReg)
687 report_fatal_error(
688 reason: "could not find temporary scratch offset register in prolog");
689 } else {
690 ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
691 }
692 assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
693
694 unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST);
695 if (!mayReserveScratchForCWSR(MF)) {
696 if (hasFP(MF)) {
697 Register FPReg = MFI->getFrameOffsetReg();
698 assert(FPReg != AMDGPU::FP_REG);
699 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: FPReg).addImm(Val: 0);
700 }
701
702 if (requiresStackPointerReference(MF)) {
703 Register SPReg = MFI->getStackPtrOffsetReg();
704 assert(SPReg != AMDGPU::SP_REG);
705 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: SPReg).addImm(Val: Offset);
706 }
707 } else {
708 // We need to check if we're on a compute queue - if we are, then the CWSR
709 // trap handler may need to store some VGPRs on the stack. The first VGPR
710 // block is saved separately, so we only need to allocate space for any
711 // additional VGPR blocks used. For now, we will make sure there's enough
712 // room for the theoretical maximum number of VGPRs that can be allocated.
713 // FIXME: Figure out if the shader uses fewer VGPRs in practice.
714 assert(hasFP(MF));
715 Register FPReg = MFI->getFrameOffsetReg();
716 assert(FPReg != AMDGPU::FP_REG);
717 unsigned VGPRSize = llvm::alignTo(
718 Size: (ST.getAddressableNumVGPRs(DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize()) -
719 AMDGPU::IsaInfo::getVGPRAllocGranule(STI: &ST,
720 DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize())) *
721 4,
722 A: FrameInfo.getMaxAlign());
723 MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
724
725 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: FPReg)
726 .addImm(Val: AMDGPU::Hwreg::HwregEncoding::encode(
727 Values: AMDGPU::Hwreg::ID_HW_ID2, Values: AMDGPU::Hwreg::OFFSET_ME_ID, Values: 2));
728 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
729 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
730 // SCC, so we need to check for 0 manually.
731 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32)).addImm(Val: 0).addReg(RegNo: FPReg);
732 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMOVK_I32), DestReg: FPReg).addImm(Val: VGPRSize);
733 if (requiresStackPointerReference(MF)) {
734 Register SPReg = MFI->getStackPtrOffsetReg();
735 assert(SPReg != AMDGPU::SP_REG);
736
737 // If at least one of the constants can be inlined, then we can use
738 // s_cselect. Otherwise, use a mov and cmovk.
739 if (AMDGPU::isInlinableLiteral32(Literal: Offset, HasInv2Pi: ST.hasInv2PiInlineImm()) ||
740 AMDGPU::isInlinableLiteral32(Literal: Offset + VGPRSize,
741 HasInv2Pi: ST.hasInv2PiInlineImm())) {
742 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: SPReg)
743 .addImm(Val: Offset + VGPRSize)
744 .addImm(Val: Offset);
745 } else {
746 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: SPReg).addImm(Val: Offset);
747 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMOVK_I32), DestReg: SPReg)
748 .addImm(Val: Offset + VGPRSize);
749 }
750 }
751 }
752
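  // Flat scratch needs to be initialized only if the kernel received the flat
  // scratch init user SGPRs and may actually address scratch: FLAT_SCR is
  // referenced directly, there are calls, or live stack objects are accessed
  // through flat scratch instructions.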
753 bool NeedsFlatScratchInit =
754 MFI->getUserSGPRInfo().hasFlatScratchInit() &&
755 (MRI.isPhysRegUsed(PhysReg: AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
756 (!allStackObjectsAreDead(MFI: FrameInfo) && ST.enableFlatScratch()));
757
758 if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
759 PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
760 MRI.addLiveIn(Reg: PreloadedScratchWaveOffsetReg);
761 MBB.addLiveIn(PhysReg: PreloadedScratchWaveOffsetReg);
762 }
763
764 if (NeedsFlatScratchInit) {
765 emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
766 }
767
768 if (ScratchRsrcReg) {
769 emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
770 PreloadedPrivateBufferReg: PreloadedScratchRsrcReg,
771 ScratchRsrcReg, ScratchWaveOffsetReg);
772 }
773}
774
775// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
776void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
777 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
778 const DebugLoc &DL, Register PreloadedScratchRsrcReg,
779 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
780
781 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
782 const SIInstrInfo *TII = ST.getInstrInfo();
783 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
784 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
785 const Function &Fn = MF.getFunction();
786
787 if (ST.isAmdPalOS()) {
788 // The pointer to the GIT is formed from the offset passed in and either
789 // the amdgpu-git-ptr-high function attribute or the top part of the PC
790 Register Rsrc01 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0_sub1);
791 Register Rsrc03 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub3);
792
793 buildGitPtr(MBB, I, DL, TII, TargetReg: Rsrc01);
794
    // Now that we have the GIT pointer, get the scratch descriptor from the
    // entry at offset 0 (or offset 16 for a compute shader).
797 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
798 const MCInstrDesc &LoadDwordX4 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX4_IMM);
799 auto *MMO = MF.getMachineMemOperand(
800 PtrInfo,
801 F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
802 MachineMemOperand::MODereferenceable,
803 Size: 16, BaseAlignment: Align(4));
804 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
805 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
806 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST: Subtarget, ByteOffset: Offset);
807 BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX4, DestReg: ScratchRsrcReg)
808 .addReg(RegNo: Rsrc01)
809 .addImm(Val: EncodedOffset) // offset
810 .addImm(Val: 0) // cpol
811 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine)
812 .addMemOperand(MMO);
813
    // The driver will always set the SRD for wave 64 (bits 118:117 of the
    // descriptor / bits 22:21 of the third sub-reg will be 0b11).
    // If the shader is actually wave32, we have to modify the
    // const_index_stride field of the descriptor's 3rd sub-reg (bits 22:21) to
    // 0b10 (stride=32). The driver does this because it can present two
    // shaders with different wave sizes (e.g. VsFs).
820 // TODO: convert to using SCRATCH instructions or multiple SRD buffers
821 if (ST.isWave32()) {
822 const MCInstrDesc &SBitsetB32 = TII->get(Opcode: AMDGPU::S_BITSET0_B32);
823 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SBitsetB32, DestReg: Rsrc03)
824 .addImm(Val: 21)
825 .addReg(RegNo: Rsrc03);
826 }
827 } else if (ST.isMesaGfxShader(F: Fn) || !PreloadedScratchRsrcReg) {
828 assert(!ST.isAmdHsaOrMesa(Fn));
829 const MCInstrDesc &SMovB32 = TII->get(Opcode: AMDGPU::S_MOV_B32);
830
831 Register Rsrc2 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub2);
832 Register Rsrc3 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub3);
833
834 // Use relocations to get the pointer, and setup the other bits manually.
835 uint64_t Rsrc23 = TII->getScratchRsrcWords23();
836
837 if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
838 Register Rsrc01 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0_sub1);
839
840 if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) {
841 const MCInstrDesc &Mov64 = TII->get(Opcode: AMDGPU::S_MOV_B64);
842
843 BuildMI(BB&: MBB, I, MIMD: DL, MCID: Mov64, DestReg: Rsrc01)
844 .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR())
845 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
846 } else {
847 const MCInstrDesc &LoadDwordX2 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX2_IMM);
848
849 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
850 auto *MMO = MF.getMachineMemOperand(
851 PtrInfo,
852 F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
853 MachineMemOperand::MODereferenceable,
854 Size: 8, BaseAlignment: Align(4));
855 BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: Rsrc01)
856 .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR())
857 .addImm(Val: 0) // offset
858 .addImm(Val: 0) // cpol
859 .addMemOperand(MMO)
860 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
861
862 MF.getRegInfo().addLiveIn(Reg: MFI->getImplicitBufferPtrUserSGPR());
863 MBB.addLiveIn(PhysReg: MFI->getImplicitBufferPtrUserSGPR());
864 }
865 } else {
866 Register Rsrc0 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0);
867 Register Rsrc1 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub1);
868
869 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc0)
870 .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD0")
871 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
872
873 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc1)
874 .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD1")
875 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
876 }
877
878 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc2)
879 .addImm(Val: Lo_32(Value: Rsrc23))
880 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
881
882 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc3)
883 .addImm(Val: Hi_32(Value: Rsrc23))
884 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
885 } else if (ST.isAmdHsaOrMesa(F: Fn)) {
886 assert(PreloadedScratchRsrcReg);
887
888 if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
889 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: ScratchRsrcReg)
890 .addReg(RegNo: PreloadedScratchRsrcReg, flags: RegState::Kill);
891 }
892 }
893
894 // Add the scratch wave offset into the scratch RSRC.
895 //
896 // We only want to update the first 48 bits, which is the base address
897 // pointer, without touching the adjacent 16 bits of flags. We know this add
898 // cannot carry-out from bit 47, otherwise the scratch allocation would be
899 // impossible to fit in the 48-bit global address space.
900 //
901 // TODO: Evaluate if it is better to just construct an SRD using the flat
902 // scratch init and some constants rather than update the one we are passed.
903 Register ScratchRsrcSub0 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0);
904 Register ScratchRsrcSub1 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub1);
905
906 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
907 // the kernel body via inreg arguments.
908 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: ScratchRsrcSub0)
909 .addReg(RegNo: ScratchRsrcSub0)
910 .addReg(RegNo: ScratchWaveOffsetReg)
911 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
912 auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32), DestReg: ScratchRsrcSub1)
913 .addReg(RegNo: ScratchRsrcSub1)
914 .addImm(Val: 0)
915 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
916 Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
917}
918
919bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
920 switch (ID) {
921 case TargetStackID::Default:
922 case TargetStackID::NoAlloc:
923 case TargetStackID::SGPRSpill:
924 return true;
925 case TargetStackID::ScalableVector:
926 case TargetStackID::WasmLocal:
927 return false;
928 }
929 llvm_unreachable("Invalid TargetStackID::Value");
930}
931
// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. Returns the saved EXEC mask.
934static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
935 MachineFunction &MF,
936 MachineBasicBlock &MBB,
937 MachineBasicBlock::iterator MBBI,
938 const DebugLoc &DL, bool IsProlog,
939 bool EnableInactiveLanes) {
940 Register ScratchExecCopy;
941 MachineRegisterInfo &MRI = MF.getRegInfo();
942 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
943 const SIInstrInfo *TII = ST.getInstrInfo();
944 const SIRegisterInfo &TRI = TII->getRegisterInfo();
945 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
946
947 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
948
949 ScratchExecCopy = findScratchNonCalleeSaveRegister(
950 MRI, LiveUnits, RC: *TRI.getWaveMaskRegClass());
951 if (!ScratchExecCopy)
952 report_fatal_error(reason: "failed to find free scratch register");
953
954 LiveUnits.addReg(Reg: ScratchExecCopy);
955
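  // With an all-ones operand, S_XOR_SAVEEXEC saves EXEC and enables only the
  // previously inactive lanes, while S_OR_SAVEEXEC enables all lanes.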
956 const unsigned SaveExecOpc =
957 ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
958 : AMDGPU::S_OR_SAVEEXEC_B32)
959 : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
960 : AMDGPU::S_OR_SAVEEXEC_B64);
961 auto SaveExec =
962 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: SaveExecOpc), DestReg: ScratchExecCopy).addImm(Val: -1);
963 SaveExec->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
964
965 return ScratchExecCopy;
966}
967
968void SIFrameLowering::emitCSRSpillStores(
969 MachineFunction &MF, MachineBasicBlock &MBB,
970 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
971 Register FrameReg, Register FramePtrRegScratchCopy) const {
972 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
973 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
974 const SIInstrInfo *TII = ST.getInstrInfo();
975 const SIRegisterInfo &TRI = TII->getRegisterInfo();
976
977 // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
978 // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
979 // might end up flipping the EXEC bits twice.
980 Register ScratchExecCopy;
981 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
982 FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs);
983 if (!WWMScratchRegs.empty())
984 ScratchExecCopy =
985 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
986 /*IsProlog*/ true, /*EnableInactiveLanes*/ true);
987
988 auto StoreWWMRegisters =
989 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
990 for (const auto &Reg : WWMRegs) {
991 Register VGPR = Reg.first;
992 int FI = Reg.second;
993 buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL,
994 SpillReg: VGPR, FI, FrameReg);
995 }
996 };
997
998 StoreWWMRegisters(WWMScratchRegs);
999 if (!WWMCalleeSavedRegs.empty()) {
1000 if (ScratchExecCopy) {
1001 unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1002 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: TRI.getExec()).addImm(Val: -1);
1003 } else {
1004 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1005 /*IsProlog*/ true,
1006 /*EnableInactiveLanes*/ false);
1007 }
1008 }
1009
1010 StoreWWMRegisters(WWMCalleeSavedRegs);
1011 if (ScratchExecCopy) {
1012 // FIXME: Split block and make terminator.
1013 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1014 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: ExecMov), DestReg: TRI.getExec())
1015 .addReg(RegNo: ScratchExecCopy, flags: RegState::Kill);
1016 LiveUnits.addReg(Reg: ScratchExecCopy);
1017 }
1018
1019 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1020
1021 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP spill:
    // Skip it if FP is saved to a scratch SGPR; that save has already been
    // emitted. Otherwise, FP has been copied into a temporary register, so
    // spill that register instead.
1026 Register Reg =
1027 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
1028 if (!Reg)
1029 continue;
1030
1031 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
1032 LiveUnits, FrameReg);
1033 SB.save();
1034 }
1035
1036 // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
1037 // such scratch registers live throughout the function.
1038 SmallVector<Register, 1> ScratchSGPRs;
1039 FuncInfo->getAllScratchSGPRCopyDstRegs(Regs&: ScratchSGPRs);
1040 if (!ScratchSGPRs.empty()) {
1041 for (MachineBasicBlock &MBB : MF) {
1042 for (MCPhysReg Reg : ScratchSGPRs)
1043 MBB.addLiveIn(PhysReg: Reg);
1044
1045 MBB.sortUniqueLiveIns();
1046 }
1047 if (!LiveUnits.empty()) {
1048 for (MCPhysReg Reg : ScratchSGPRs)
1049 LiveUnits.addReg(Reg);
1050 }
1051 }
1052}
1053
1054void SIFrameLowering::emitCSRSpillRestores(
1055 MachineFunction &MF, MachineBasicBlock &MBB,
1056 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
1057 Register FrameReg, Register FramePtrRegScratchCopy) const {
1058 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1059 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1060 const SIInstrInfo *TII = ST.getInstrInfo();
1061 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1062 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1063
1064 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP restore:
    // Skip it if FP is restored from the scratch SGPR copy. Otherwise, restore
    // the saved FP value into a temporary register; the frame pointer itself
    // is overwritten only at the end, once all other spills have been restored
    // from the current frame.
1070 Register Reg =
1071 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
1072 if (!Reg)
1073 continue;
1074
1075 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
1076 LiveUnits, FrameReg);
1077 SB.restore();
1078 }
1079
1080 // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
1081 // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
1082 // this, we might end up flipping the EXEC bits twice.
1083 Register ScratchExecCopy;
1084 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
1085 FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs);
1086 if (!WWMScratchRegs.empty())
1087 ScratchExecCopy =
1088 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1089 /*IsProlog*/ false, /*EnableInactiveLanes*/ true);
1090
1091 auto RestoreWWMRegisters =
1092 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
1093 for (const auto &Reg : WWMRegs) {
1094 Register VGPR = Reg.first;
1095 int FI = Reg.second;
1096 buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL,
1097 SpillReg: VGPR, FI, FrameReg);
1098 }
1099 };
1100
1101 RestoreWWMRegisters(WWMScratchRegs);
1102 if (!WWMCalleeSavedRegs.empty()) {
1103 if (ScratchExecCopy) {
1104 unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1105 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: TRI.getExec()).addImm(Val: -1);
1106 } else {
1107 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1108 /*IsProlog*/ false,
1109 /*EnableInactiveLanes*/ false);
1110 }
1111 }
1112
1113 RestoreWWMRegisters(WWMCalleeSavedRegs);
1114 if (ScratchExecCopy) {
1115 // FIXME: Split block and make terminator.
1116 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1117 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: ExecMov), DestReg: TRI.getExec())
1118 .addReg(RegNo: ScratchExecCopy, flags: RegState::Kill);
1119 }
1120}
1121
1122void SIFrameLowering::emitPrologue(MachineFunction &MF,
1123 MachineBasicBlock &MBB) const {
1124 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1125 if (FuncInfo->isEntryFunction()) {
1126 emitEntryFunctionPrologue(MF, MBB);
1127 return;
1128 }
1129
1130 MachineFrameInfo &MFI = MF.getFrameInfo();
1131 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1132 const SIInstrInfo *TII = ST.getInstrInfo();
1133 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1134 MachineRegisterInfo &MRI = MF.getRegInfo();
1135
1136 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1137 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1138 Register BasePtrReg =
1139 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
1140 LiveRegUnits LiveUnits;
1141
1142 MachineBasicBlock::iterator MBBI = MBB.begin();
1143 // DebugLoc must be unknown since the first instruction with DebugLoc is used
1144 // to determine the end of the prologue.
1145 DebugLoc DL;
1146
1147 if (FuncInfo->isChainFunction()) {
1148 // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
1149 // are free to set one up if they need it.
1150 bool UseSP = requiresStackPointerReference(MF);
1151 if (UseSP) {
1152 assert(StackPtrReg != AMDGPU::SP_REG);
1153
1154 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: StackPtrReg)
1155 .addImm(Val: MFI.getStackSize() * getScratchScaleFactor(ST));
1156 }
1157 }
1158
1159 bool HasFP = false;
1160 bool HasBP = false;
1161 uint32_t NumBytes = MFI.getStackSize();
1162 uint32_t RoundedSize = NumBytes;
1163
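  // Stack realignment requires a frame pointer; the realigned base address is
  // materialized in FP below.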
1164 if (TRI.hasStackRealignment(MF))
1165 HasFP = true;
1166
1167 Register FramePtrRegScratchCopy;
1168 if (!HasFP && !hasFP(MF)) {
1169 // Emit the CSR spill stores with SP base register.
1170 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
1171 FrameReg: FuncInfo->isChainFunction() ? Register() : StackPtrReg,
1172 FramePtrRegScratchCopy);
1173 } else {
1174 // CSR spill stores will use FP as base register.
1175 Register SGPRForFPSaveRestoreCopy =
1176 FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1177
1178 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
1179 if (SGPRForFPSaveRestoreCopy) {
1180 // Copy FP to the scratch register now and emit the CFI entry. It avoids
1181 // the extra FP copy needed in the other two cases when FP is spilled to
1182 // memory or to a VGPR lane.
1183 PrologEpilogSGPRSpillBuilder SB(
1184 FramePtrReg,
1185 FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(Reg: FramePtrReg), MBB, MBBI,
1186 DL, TII, TRI, LiveUnits, FramePtrReg);
1187 SB.save();
1188 LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy);
1189 } else {
1190 // Copy FP into a new scratch register so that its previous value can be
1191 // spilled after setting up the new frame.
1192 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1193 MRI, LiveUnits, RC: AMDGPU::SReg_32_XM0_XEXECRegClass);
1194 if (!FramePtrRegScratchCopy)
1195 report_fatal_error(reason: "failed to find free scratch register");
1196
1197 LiveUnits.addReg(Reg: FramePtrRegScratchCopy);
1198 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrRegScratchCopy)
1199 .addReg(RegNo: FramePtrReg);
1200 }
1201 }
1202
1203 if (HasFP) {
1204 const unsigned Alignment = MFI.getMaxAlign().value();
1205
1206 RoundedSize += Alignment;
1207 if (LiveUnits.empty()) {
1208 LiveUnits.init(TRI);
1209 LiveUnits.addLiveIns(MBB);
1210 }
1211
    // Realign the stack pointer into the frame pointer:
    //   s_add_i32 s33, s32, (Alignment - 1) * ScratchScale
    //   s_and_b32 s33, s33, -Alignment * ScratchScale
1214 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: FramePtrReg)
1215 .addReg(RegNo: StackPtrReg)
1216 .addImm(Val: (Alignment - 1) * getScratchScaleFactor(ST))
1217 .setMIFlag(MachineInstr::FrameSetup);
1218 auto And = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_AND_B32), DestReg: FramePtrReg)
1219 .addReg(RegNo: FramePtrReg, flags: RegState::Kill)
1220 .addImm(Val: -Alignment * getScratchScaleFactor(ST))
1221 .setMIFlag(MachineInstr::FrameSetup);
1222 And->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
1223 FuncInfo->setIsStackRealigned(true);
1224 } else if ((HasFP = hasFP(MF))) {
1225 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrReg)
1226 .addReg(RegNo: StackPtrReg)
1227 .setMIFlag(MachineInstr::FrameSetup);
1228 }
1229
1230 // If FP is used, emit the CSR spills with FP base register.
1231 if (HasFP) {
1232 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg,
1233 FramePtrRegScratchCopy);
1234 if (FramePtrRegScratchCopy)
1235 LiveUnits.removeReg(Reg: FramePtrRegScratchCopy);
1236 }
1237
1238 // If we need a base pointer, set it up here. It's whatever the value of
1239 // the stack pointer is at this point. Any variable size objects will be
1240 // allocated after this, so we can still use the base pointer to reference
1241 // the incoming arguments.
1242 if ((HasBP = TRI.hasBasePointer(MF))) {
1243 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: BasePtrReg)
1244 .addReg(RegNo: StackPtrReg)
1245 .setMIFlag(MachineInstr::FrameSetup);
1246 }
1247
1248 if (HasFP && RoundedSize != 0) {
1249 auto Add = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: StackPtrReg)
1250 .addReg(RegNo: StackPtrReg)
1251 .addImm(Val: RoundedSize * getScratchScaleFactor(ST))
1252 .setMIFlag(MachineInstr::FrameSetup);
1253 Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
1254 }
1255
1256 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg);
1257 (void)FPSaved;
1258 assert((!HasFP || FPSaved) &&
1259 "Needed to save FP but didn't save it anywhere");
1260
  // If we allow spilling to AGPRs, we may have saved FP but then spilled
  // everything into AGPRs instead of the stack.
1263 assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
1264 "Saved FP but didn't need it");
1265
1266 bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: BasePtrReg);
1267 (void)BPSaved;
1268 assert((!HasBP || BPSaved) &&
1269 "Needed to save BP but didn't save it anywhere");
1270
1271 assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
1272}
1273
1274void SIFrameLowering::emitEpilogue(MachineFunction &MF,
1275 MachineBasicBlock &MBB) const {
1276 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1277 if (FuncInfo->isEntryFunction())
1278 return;
1279
1280 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1281 const SIInstrInfo *TII = ST.getInstrInfo();
1282 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1283 MachineRegisterInfo &MRI = MF.getRegInfo();
1284 LiveRegUnits LiveUnits;
1285 // Get the insert location for the epilogue. If there were no terminators in
1286 // the block, get the last instruction.
1287 MachineBasicBlock::iterator MBBI = MBB.end();
1288 DebugLoc DL;
1289 if (!MBB.empty()) {
1290 MBBI = MBB.getLastNonDebugInstr();
1291 if (MBBI != MBB.end())
1292 DL = MBBI->getDebugLoc();
1293
1294 MBBI = MBB.getFirstTerminator();
1295 }
1296
1297 const MachineFrameInfo &MFI = MF.getFrameInfo();
1298 uint32_t NumBytes = MFI.getStackSize();
1299 uint32_t RoundedSize = FuncInfo->isStackRealigned()
1300 ? NumBytes + MFI.getMaxAlign().value()
1301 : NumBytes;
1302 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1303 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1304 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg);
1305
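  // Undo the stack allocation: restore SP from the base pointer if one was set
  // up, otherwise from the frame pointer.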
1306 if (RoundedSize != 0) {
1307 if (TRI.hasBasePointer(MF)) {
1308 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: StackPtrReg)
1309 .addReg(RegNo: TRI.getBaseRegister())
1310 .setMIFlag(MachineInstr::FrameDestroy);
1311 } else if (hasFP(MF)) {
1312 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: StackPtrReg)
1313 .addReg(RegNo: FramePtrReg)
1314 .setMIFlag(MachineInstr::FrameDestroy);
1315 }
1316 }
1317
1318 Register FramePtrRegScratchCopy;
1319 Register SGPRForFPSaveRestoreCopy =
1320 FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1321 if (FPSaved) {
    // CSR spill restores should use FP as the base register. If
    // SGPRForFPSaveRestoreCopy is not valid, restore the previous value of FP
    // into a new scratch register and copy it to FP later, once the other
    // registers have been restored from the current stack frame.
1326 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
1327 if (SGPRForFPSaveRestoreCopy) {
1328 LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy);
1329 } else {
1330 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1331 MRI, LiveUnits, RC: AMDGPU::SReg_32_XM0_XEXECRegClass);
1332 if (!FramePtrRegScratchCopy)
1333 report_fatal_error(reason: "failed to find free scratch register");
1334
1335 LiveUnits.addReg(Reg: FramePtrRegScratchCopy);
1336 }
1337
1338 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg,
1339 FramePtrRegScratchCopy);
1340 }
1341
1342 if (FPSaved) {
1343 // Insert the copy to restore FP.
1344 Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
1345 : FramePtrRegScratchCopy;
1346 MachineInstrBuilder MIB =
1347 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrReg)
1348 .addReg(RegNo: SrcReg);
1349 if (SGPRForFPSaveRestoreCopy)
1350 MIB.setMIFlag(MachineInstr::FrameDestroy);
1351 } else {
1352 // Insert the CSR spill restores with SP as the base register.
1353 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits,
1354 FrameReg: FuncInfo->isChainFunction() ? Register() : StackPtrReg,
1355 FramePtrRegScratchCopy);
1356 }
1357}
1358
1359#ifndef NDEBUG
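// Returns true if every SGPR-spill stack object is either dead or one of the
// prolog/epilog SGPR spills that frame lowering handles itself.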
1360static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1361 const MachineFrameInfo &MFI = MF.getFrameInfo();
1362 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1363 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1364 I != E; ++I) {
1365 if (!MFI.isDeadObjectIndex(I) &&
1366 MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1367 !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
1368 return false;
1369 }
1370 }
1371
1372 return true;
1373}
1374#endif
1375
1376StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1377 int FI,
1378 Register &FrameReg) const {
1379 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1380
1381 FrameReg = RI->getFrameRegister(MF);
1382 return StackOffset::getFixed(Fixed: MF.getFrameInfo().getObjectOffset(ObjectIdx: FI));
1383}
1384
1385void SIFrameLowering::processFunctionBeforeFrameFinalized(
1386 MachineFunction &MF,
1387 RegScavenger *RS) const {
1388 MachineFrameInfo &MFI = MF.getFrameInfo();
1389
1390 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1391 const SIInstrInfo *TII = ST.getInstrInfo();
1392 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1393 MachineRegisterInfo &MRI = MF.getRegInfo();
1394 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1395
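// Only try to replace VGPR spills with AGPR spills when the subtarget has
// AGPRs (MAI instructions), the function actually spilled VGPRs, and the
// amdgpu-spill-vgpr-to-agpr option is enabled.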
1396 const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
1397 && EnableSpillVGPRToAGPR;
1398
1399 if (SpillVGPRToAGPR) {
1400 // Track the spill frame indices handled in this pass.
1401 BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
1402 BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);
1403
1404 bool SeenDbgInstr = false;
1405
1406 for (MachineBasicBlock &MBB : MF) {
1407 for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) {
1408 int FrameIndex;
1409 if (MI.isDebugInstr())
1410 SeenDbgInstr = true;
1411
1412 if (TII->isVGPRSpill(MI)) {
1413 // Try to eliminate stack used by VGPR spills before frame
1414 // finalization.
1415 unsigned FIOp = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
1416 Name: AMDGPU::OpName::vaddr);
1417 int FI = MI.getOperand(i: FIOp).getIndex();
1418 Register VReg =
1419 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata)->getReg();
1420 if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
1421 isAGPRtoVGPR: TRI->isAGPR(MRI, Reg: VReg))) {
1422 assert(RS != nullptr);
1423 RS->enterBasicBlockEnd(MBB);
1424 RS->backward(I: std::next(x: MI.getIterator()));
1425 TRI->eliminateFrameIndex(MI, SPAdj: 0, FIOperandNum: FIOp, RS);
1426 SpillFIs.set(FI);
1427 continue;
1428 }
1429 } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
1430 TII->isLoadFromStackSlot(MI, FrameIndex))
1431 if (!MFI.isFixedObjectIndex(ObjectIdx: FrameIndex))
1432 NonVGPRSpillFIs.set(FrameIndex);
1433 }
1434 }
1435
1436 // Stack slot coloring may assign different objects to the same stack slot.
1437 // If no other object shares the slot, the VGPR-to-AGPR spill slot is dead.
1438 for (unsigned FI : SpillFIs.set_bits())
1439 if (!NonVGPRSpillFIs.test(Idx: FI))
1440 FuncInfo->setVGPRToAGPRSpillDead(FI);
1441
1442 for (MachineBasicBlock &MBB : MF) {
1443 for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
1444 MBB.addLiveIn(PhysReg: Reg);
1445
1446 for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
1447 MBB.addLiveIn(PhysReg: Reg);
1448
1449 MBB.sortUniqueLiveIns();
1450
1451 if (!SpillFIs.empty() && SeenDbgInstr) {
1452 // FIXME: The dead frame indices are replaced with a null register in the
1453 // debug value instructions. Ideally we would update them with the correct
1454 // register value, but it is unclear whether that alone describes the location.
1455 for (MachineInstr &MI : MBB) {
1456 if (MI.isDebugValue()) {
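// DBG_VALUE_LIST has its location operands starting at index 2, while a
// plain DBG_VALUE has its single location as operand 0.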
1457 uint32_t StackOperandIdx = MI.isDebugValueList() ? 2 : 0;
1458 if (MI.getOperand(i: StackOperandIdx).isFI() &&
1459 !MFI.isFixedObjectIndex(
1460 ObjectIdx: MI.getOperand(i: StackOperandIdx).getIndex()) &&
1461 SpillFIs[MI.getOperand(i: StackOperandIdx).getIndex()]) {
1462 MI.getOperand(i: StackOperandIdx)
1463 .ChangeToRegister(Reg: Register(), isDef: false /*isDef*/);
1464 }
1465 }
1466 }
1467 }
1468 }
1469 }
1470
1471 // At this point we've already allocated all spilled SGPRs to VGPRs if we
1472 // can. Any remaining SGPR spills will go to memory, so move them back to the
1473 // default stack.
1474 bool HaveSGPRToVMemSpill =
1475 FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
1476 assert(allSGPRSpillsAreDead(MF) &&
1477 "SGPR spill should have been removed in SILowerSGPRSpills");
1478
1479 // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1480 // but currently hasNonSpillStackObjects is set only from source
1481 // allocas. Stack temps produced from legalization are not counted currently.
1482 if (!allStackObjectsAreDead(MFI)) {
1483 assert(RS && "RegScavenger required if spilling");
1484
1485 // Add an emergency spill slot
1486 RS->addScavengingFrameIndex(FI: FuncInfo->getScavengeFI(MFI, TRI: *TRI));
1487
1488 // If we are spilling SGPRs to memory with a large frame, we may need a
1489 // second VGPR emergency frame index.
1490 if (HaveSGPRToVMemSpill &&
1491 allocateScavengingFrameIndexesNearIncomingSP(MF)) {
1492 RS->addScavengingFrameIndex(FI: MFI.CreateSpillStackObject(Size: 4, Alignment: Align(4)));
1493 }
1494 }
1495}
1496
1497void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
1498 MachineFunction &MF, RegScavenger *RS) const {
1499 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1500 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1501 MachineRegisterInfo &MRI = MF.getRegInfo();
1502 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1503
1504 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
1505 // On gfx908, we had initially reserved the highest available VGPR for the
1506 // AGPR copy. Now that RA is done, check whether there exists an unused VGPR
1507 // that is lower than the one reserved earlier. If one exists, use it for
1508 // the AGPR copy instead of the register reserved before RA.
1509 Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
1510 Register UnusedLowVGPR =
1511 TRI->findUnusedRegister(MRI, RC: &AMDGPU::VGPR_32RegClass, MF);
1512 if (UnusedLowVGPR && (TRI->getHWRegIndex(Reg: UnusedLowVGPR) <
1513 TRI->getHWRegIndex(Reg: VGPRForAGPRCopy))) {
1514 // Reserve this newly identified VGPR for the AGPR copy.
1515 // Reserved registers should already be frozen at this point,
1516 // so we can avoid calling MRI.freezeReservedRegs and just use
1517 // MRI.reserveReg.
1518 FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
1519 MRI.reserveReg(PhysReg: UnusedLowVGPR, TRI);
1520 }
1521 }
1522 // We initially reserved the highest available SGPR pair for long branches.
1523 // Now, after RA, shift down to a lower unused pair if one exists.
1524 Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
1525 Register UnusedLowSGPR =
1526 TRI->findUnusedRegister(MRI, RC: &AMDGPU::SGPR_64RegClass, MF);
1527 // If LongBranchReservedReg is null, then we didn't find a long branch
1528 // and never reserved a register to begin with, so there is nothing to
1529 // shift down. If UnusedLowSGPR is null, there is no available lower
1530 // register to use, so just keep the original one we set.
1531 if (LongBranchReservedReg && UnusedLowSGPR) {
1532 FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
1533 MRI.reserveReg(PhysReg: UnusedLowSGPR, TRI);
1534 }
1535}
1536
1537 // The special SGPR spills, such as the ones needed for FP, BP, or any
1538 // reserved registers, are delayed until frame lowering.
1539void SIFrameLowering::determinePrologEpilogSGPRSaves(
1540 MachineFunction &MF, BitVector &SavedVGPRs,
1541 bool NeedExecCopyReservedReg) const {
1542 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1543 MachineRegisterInfo &MRI = MF.getRegInfo();
1544 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1545 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1546 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1547 LiveRegUnits LiveUnits;
1548 LiveUnits.init(TRI: *TRI);
1549 // Initially mark callee saved registers as used so we will not choose them
1550 // while looking for scratch SGPRs.
1551 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
1552 for (unsigned I = 0; CSRegs[I]; ++I)
1553 LiveUnits.addReg(Reg: CSRegs[I]);
1554
1555 const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
1556
1557 Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
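// Preserve the EXEC copy register if it is actually needed: either a
// whole-wave spill opcode was seen, or the reserved register is already used.
// Prefer retargeting it to an unused scratch SGPR; otherwise spill it.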
1558 if (NeedExecCopyReservedReg ||
1559 (ReservedRegForExecCopy &&
1560 MRI.isPhysRegUsed(PhysReg: ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
1561 MRI.reserveReg(PhysReg: ReservedRegForExecCopy, TRI);
1562 Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
1563 if (UnusedScratchReg) {
1564 // If we found an unused scratch SGPR, reserve it for the EXEC copy;
1565 // there is no need for any spill in that case.
1566 MFI->setSGPRForEXECCopy(UnusedScratchReg);
1567 MRI.replaceRegWith(FromReg: ReservedRegForExecCopy, ToReg: UnusedScratchReg);
1568 LiveUnits.addReg(Reg: UnusedScratchReg);
1569 } else {
1570 // Needs spill.
1571 assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
1572 "Re-reserving spill slot for EXEC copy register");
1573 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: ReservedRegForExecCopy, RC,
1574 /*IncludeScratchCopy=*/false);
1575 }
1576 } else if (ReservedRegForExecCopy) {
1577 // Reset it at this point. No whole-wave copies or spills were
1578 // encountered.
1579 MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
1580 }
1581
1582 // hasFP only knows about stack objects that already exist. We're now
1583 // determining the stack slots that will be created, so we have to predict
1584 // them. Stack objects force FP usage with calls.
1585 //
1586 // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1587 // don't want to report it here.
1588 //
1589 // FIXME: Is this really hasReservedCallFrame?
1590 const bool WillHaveFP =
1591 FrameInfo.hasCalls() &&
1592 (SavedVGPRs.any() || !allStackObjectsAreDead(MFI: FrameInfo));
1593
1594 if (WillHaveFP || hasFP(MF)) {
1595 Register FramePtrReg = MFI->getFrameOffsetReg();
1596 assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
1597 "Re-reserving spill slot for FP");
1598 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: FramePtrReg);
1599 }
1600
1601 if (TRI->hasBasePointer(MF)) {
1602 Register BasePtrReg = TRI->getBaseRegister();
1603 assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
1604 "Re-reserving spill slot for BP");
1605 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: BasePtrReg);
1606 }
1607}
1608
1609// Only report VGPRs to generic code.
1610void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1611 BitVector &SavedVGPRs,
1612 RegScavenger *RS) const {
1613 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1614
1615 // If this is a function with the amdgpu_cs_chain[_preserve] calling
1616 // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
1617 // we don't need to save and restore anything.
1618 if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
1619 return;
1620
1621 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs&: SavedVGPRs, RS);
1622
1623 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1624 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1625 const SIInstrInfo *TII = ST.getInstrInfo();
1626 bool NeedExecCopyReservedReg = false;
1627
1628 MachineInstr *ReturnMI = nullptr;
1629 for (MachineBasicBlock &MBB : MF) {
1630 for (MachineInstr &MI : MBB) {
1631 // TODO: Walking through all MBBs here would be a bad heuristic. Better
1632 // handle them elsewhere.
1633 if (TII->isWWMRegSpillOpcode(Opcode: MI.getOpcode()))
1634 NeedExecCopyReservedReg = true;
1635 else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
1636 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1637 (MFI->isChainFunction() &&
1638 TII->isChainCallOpcode(Opcode: MI.getOpcode()))) {
1639 // We expect all returns to be the same size.
1640 assert(!ReturnMI ||
1641 (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
1642 count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
1643 ReturnMI = &MI;
1644 }
1645 }
1646 }
1647
1648 SmallVector<Register> SortedWWMVGPRs;
1649 for (Register Reg : MFI->getWWMReservedRegs()) {
1650 // The shift-back is needed only for the VGPRs used for SGPR spills, and
1651 // those are 32 bits wide. The SIPreAllocateWWMRegs pass can add tuples
1652 // to the WWM reserved registers.
1653 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1654 if (TRI->getRegSizeInBits(RC: *RC) != 32)
1655 continue;
1656 SortedWWMVGPRs.push_back(Elt: Reg);
1657 }
1658
1659 sort(C&: SortedWWMVGPRs, Comp: std::greater<Register>());
1660 MFI->shiftWwmVGPRsToLowestRange(MF, WWMVGPRs&: SortedWWMVGPRs, SavedVGPRs);
1661
1662 if (MFI->isEntryFunction())
1663 return;
1664
1665 // Remove any VGPRs used in the return value because these do not need to
1666 // be saved. This prevents the CSR restore from clobbering return VGPRs.
1667 if (ReturnMI) {
1668 for (auto &Op : ReturnMI->operands()) {
1669 if (Op.isReg())
1670 SavedVGPRs.reset(Idx: Op.getReg());
1671 }
1672 }
1673
1674 // Create the stack objects for WWM registers now.
1675 for (Register Reg : MFI->getWWMReservedRegs()) {
1676 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1677 MFI->allocateWWMSpill(MF, VGPR: Reg, Size: TRI->getSpillSize(RC: *RC),
1678 Alignment: TRI->getSpillAlign(RC: *RC));
1679 }
1680
1681 // Ignore the SGPRs the default implementation found.
1682 SavedVGPRs.clearBitsNotInMask(Mask: TRI->getAllVectorRegMask());
1683
1684 // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
1685 // In gfx908 there are no AGPR loads and stores, and thus spilling also
1686 // requires a temporary VGPR.
1687 if (!ST.hasGFX90AInsts())
1688 SavedVGPRs.clearBitsInMask(Mask: TRI->getAllAGPRRegMask());
1689
1690 determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
1691
1692 // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
1693 // allow the default insertion to handle them.
1694 for (auto &Reg : MFI->getWWMSpills())
1695 SavedVGPRs.reset(Idx: Reg.first);
1696}
1697
1698void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1699 BitVector &SavedRegs,
1700 RegScavenger *RS) const {
1701 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1702 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1703 if (MFI->isEntryFunction())
1704 return;
1705
1706 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1707 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1708
1709 // The SP is specifically managed and we don't want extra spills of it.
1710 SavedRegs.reset(Idx: MFI->getStackPtrOffsetReg());
1711
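// Snapshot the full save set before dropping the vector registers; the
// complete set is needed below to decide whether a frame pointer is required.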
1712 const BitVector AllSavedRegs = SavedRegs;
1713 SavedRegs.clearBitsInMask(Mask: TRI->getAllVectorRegMask());
1714
1715 // We have to anticipate introducing CSR VGPR spills, or a spill of the
1716 // caller-saved VGPR reserved for SGPR spills, since we now always create a
1717 // stack entry for it even when there are no other stack objects, and an FP
1718 // is required whenever there is a call and a stack. A VGPR is allocated for
1719 // SGPR spills whenever there are any SGPR spills, CSR or otherwise.
1720 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1721 const bool WillHaveFP =
1722 FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
1723
1724 // FP will be specially managed like SP.
1725 if (WillHaveFP || hasFP(MF))
1726 SavedRegs.reset(Idx: MFI->getFrameOffsetReg());
1727
1728 // The return-address use in the return instruction is hidden by the
1729 // SI_RETURN pseudo. Since IPRA computes actual register usage and does not
1730 // use the CSR list, clobbering of the return address by function calls
1731 // (D117243) or otherwise (D120922) is not seen by IPRA's register usage
1732 // collection. Explicitly marking it saved here ensures the return address
1733 // is saved and restored in those scenarios.
1734 const MachineRegisterInfo &MRI = MF.getRegInfo();
1735 Register RetAddrReg = TRI->getReturnAddressReg(MF);
1736 if (!MFI->isEntryFunction() &&
1737 (FrameInfo.hasCalls() || MRI.isPhysRegModified(PhysReg: RetAddrReg))) {
1738 SavedRegs.set(TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub0));
1739 SavedRegs.set(TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub1));
1740 }
1741}
1742
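// Group callee-saved VGPRs into 32-register blocks so that each block can be
// saved and restored with a single block spill instruction, and create one
// (mask-sized) stack object per block.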
1743static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
1744 const GCNSubtarget &ST,
1745 std::vector<CalleeSavedInfo> &CSI,
1746 unsigned &MinCSFrameIndex,
1747 unsigned &MaxCSFrameIndex) {
1748 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1749 MachineFrameInfo &MFI = MF.getFrameInfo();
1750 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1751
1752 assert(
1753 llvm::is_sorted(CSI,
1754 [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
1755 return A.getReg() < B.getReg();
1756 }) &&
1757 "Callee saved registers not sorted");
1758
1759 auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
1760 return !CSI.isSpilledToReg() &&
1761 TRI->getPhysRegBaseClass(Reg: CSI.getReg()) == &AMDGPU::VGPR_32RegClass &&
1762 !FuncInfo->isWWMReservedRegister(Reg: CSI.getReg());
1763 };
1764
1765 auto CSEnd = CSI.end();
1766 for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) {
1767 Register Reg = CSIt->getReg();
1768 if (!CanUseBlockOps(*CSIt))
1769 continue;
1770
1771 // Find all the regs that will fit in a 32-bit mask starting at the current
1772 // reg and build said mask. It should have a 1 for every register that's
1773 // included, with the current register as the least significant bit.
1774 uint32_t Mask = 1;
1775 CSEnd = std::remove_if(
1776 first: CSIt + 1, last: CSEnd, pred: [&](const CalleeSavedInfo &CSI) -> bool {
1777 if (CanUseBlockOps(CSI) && CSI.getReg() < Reg + 32) {
1778 Mask |= 1 << (CSI.getReg() - Reg);
1779 return true;
1780 } else {
1781 return false;
1782 }
1783 });
1784
1785 const TargetRegisterClass *BlockRegClass = TRI->getRegClassForBlockOp(MF);
1786 Register RegBlock =
1787 TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC: BlockRegClass);
1788 if (!RegBlock) {
1789 // We couldn't find a super register for the block. This can happen if
1790 // the register we started with is too high (e.g. v232 if the maximum is
1791 // v255). We therefore try to get the last register block and figure out
1792 // the mask from there.
1793 Register LastBlockStart =
1794 AMDGPU::VGPR0 + alignDown(Value: Reg - AMDGPU::VGPR0, Align: 32);
1795 RegBlock =
1796 TRI->getMatchingSuperReg(Reg: LastBlockStart, SubIdx: AMDGPU::sub0, RC: BlockRegClass);
1797 assert(RegBlock && TRI->isSubRegister(RegBlock, Reg) &&
1798 "Couldn't find super register");
1799 int RegDelta = Reg - LastBlockStart;
1800 assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta &&
1801 "Bad shift amount");
1802 Mask <<= RegDelta;
1803 }
1804
1805 FuncInfo->setMaskForVGPRBlockOps(RegisterBlock: RegBlock, Mask);
1806
1807 // The stack objects can be a bit smaller than the register block if we know
1808 // some of the high bits of Mask are 0. This may happen often with calling
1809 // conventions where the caller and callee-saved VGPRs are interleaved at
1810 // a small boundary (e.g. 8 or 16).
1811 int UnusedBits = llvm::countl_zero(Val: Mask);
1812 unsigned BlockSize = TRI->getSpillSize(RC: *BlockRegClass) - UnusedBits * 4;
1813 int FrameIdx =
1814 MFI.CreateStackObject(Size: BlockSize, Alignment: TRI->getSpillAlign(RC: *BlockRegClass),
1815 /*isSpillSlot=*/true);
1816 if ((unsigned)FrameIdx < MinCSFrameIndex)
1817 MinCSFrameIndex = FrameIdx;
1818 if ((unsigned)FrameIdx > MaxCSFrameIndex)
1819 MaxCSFrameIndex = FrameIdx;
1820
1821 CSIt->setFrameIdx(FrameIdx);
1822 CSIt->setReg(RegBlock);
1823 }
1824 CSI.erase(first: CSEnd, last: CSI.end());
1825}
1826
1827bool SIFrameLowering::assignCalleeSavedSpillSlots(
1828 MachineFunction &MF, const TargetRegisterInfo *TRI,
1829 std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
1830 unsigned &MaxCSFrameIndex) const {
1831 if (CSI.empty())
1832 return true; // Early exit if no callee saved registers are modified!
1833
1834 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1835 bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR();
1836
1837 if (UseVGPRBlocks)
1838 assignSlotsUsingVGPRBlocks(MF, ST, CSI, MinCSFrameIndex, MaxCSFrameIndex);
1839
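// If VGPR blocks were used, their stack slots have already been created
// above, so report the assignment as done and skip the default slot
// assignment; otherwise defer to the generic handling.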
1840 return assignCalleeSavedSpillSlots(MF, TRI, CSI) || UseVGPRBlocks;
1841}
1842
1843bool SIFrameLowering::assignCalleeSavedSpillSlots(
1844 MachineFunction &MF, const TargetRegisterInfo *TRI,
1845 std::vector<CalleeSavedInfo> &CSI) const {
1846 if (CSI.empty())
1847 return true; // Early exit if no callee saved registers are modified!
1848
1849 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1850 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1851 const SIRegisterInfo *RI = ST.getRegisterInfo();
1852 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1853 Register BasePtrReg = RI->getBaseRegister();
1854 Register SGPRForFPSaveRestoreCopy =
1855 FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1856 Register SGPRForBPSaveRestoreCopy =
1857 FuncInfo->getScratchSGPRCopyDstReg(Reg: BasePtrReg);
1858 if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1859 return false;
1860
1861 unsigned NumModifiedRegs = 0;
1862
1863 if (SGPRForFPSaveRestoreCopy)
1864 NumModifiedRegs++;
1865 if (SGPRForBPSaveRestoreCopy)
1866 NumModifiedRegs++;
1867
1868 for (auto &CS : CSI) {
1869 if (CS.getReg() == FramePtrReg.asMCReg() && SGPRForFPSaveRestoreCopy) {
1870 CS.setDstReg(SGPRForFPSaveRestoreCopy);
1871 if (--NumModifiedRegs)
1872 break;
1873 } else if (CS.getReg() == BasePtrReg.asMCReg() &&
1874 SGPRForBPSaveRestoreCopy) {
1875 CS.setDstReg(SGPRForBPSaveRestoreCopy);
1876 if (--NumModifiedRegs)
1877 break;
1878 }
1879 }
1880
1881 return false;
1882}
1883
1884bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1885 const MachineFunction &MF) const {
1886
1887 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1888 const MachineFrameInfo &MFI = MF.getFrameInfo();
1889 const SIInstrInfo *TII = ST.getInstrInfo();
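// The worst-case offset we may need to reach from the base register is the
// last byte of the estimated frame.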
1890 uint64_t EstStackSize = MFI.estimateStackSize(MF);
1891 uint64_t MaxOffset = EstStackSize - 1;
1892
1893 // We need the emergency stack slots to be allocated in range of the
1894 // MUBUF/flat scratch immediate offset from the base register, so assign these
1895 // first at the incoming SP position.
1896 //
1897 // TODO: We could try sorting the objects to find a hole in the first bytes
1898 // rather than allocating as close as possible. This could save a lot of
1899 // space on frames with alignment requirements.
1900 if (ST.enableFlatScratch()) {
1901 if (TII->isLegalFLATOffset(Offset: MaxOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
1902 FlatVariant: SIInstrFlags::FlatScratch))
1903 return false;
1904 } else {
1905 if (TII->isLegalMUBUFImmOffset(Imm: MaxOffset))
1906 return false;
1907 }
1908
1909 return true;
1910}
1911
1912bool SIFrameLowering::spillCalleeSavedRegisters(
1913 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
1914 ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
1915 MachineFunction *MF = MBB.getParent();
1916 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1917 if (!ST.useVGPRBlockOpsForCSR())
1918 return false;
1919
1920 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1921 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1922 const SIInstrInfo *TII = ST.getInstrInfo();
1923 SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1924
1925 const TargetRegisterClass *BlockRegClass =
1926 static_cast<const SIRegisterInfo *>(TRI)->getRegClassForBlockOp(MF: *MF);
1927 for (const CalleeSavedInfo &CS : CSI) {
1928 Register Reg = CS.getReg();
1929 if (!BlockRegClass->contains(Reg) ||
1930 !FuncInfo->hasMaskForVGPRBlockOps(RegisterBlock: Reg)) {
1931 spillCalleeSavedRegister(SaveBlock&: MBB, MI, CS, TII, TRI);
1932 continue;
1933 }
1934
1935 // Build a scratch block store.
1936 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(RegisterBlock: Reg);
1937 int FrameIndex = CS.getFrameIdx();
1938 MachinePointerInfo PtrInfo =
1939 MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1940 MachineMemOperand *MMO =
1941 MF->getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOStore,
1942 Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1943 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1944
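// The block-spill pseudo takes the VGPR block, the frame index, SP as the
// base, a zero offset, and the mask of registers within the block that
// actually need to be written.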
1945 BuildMI(BB&: MBB, I: MI, MIMD: MI->getDebugLoc(),
1946 MCID: TII->get(Opcode: AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
1947 .addReg(RegNo: Reg, flags: getKillRegState(B: false))
1948 .addFrameIndex(Idx: FrameIndex)
1949 .addReg(RegNo: MFI->getStackPtrOffsetReg())
1950 .addImm(Val: 0)
1951 .addImm(Val: Mask)
1952 .addMemOperand(MMO);
1953
1954 FuncInfo->setHasSpilledVGPRs();
1955
1956 // Add the register to the liveins. This is necessary because if any of the
1957 // VGPRs in the register block is reserved (e.g. if it's a WWM register),
1958 // then the whole block will be marked as reserved and `updateLiveness` will
1959 // skip it.
1960 MBB.addLiveIn(PhysReg: Reg);
1961 }
1962 MBB.sortUniqueLiveIns();
1963
1964 return true;
1965}
1966
1967bool SIFrameLowering::restoreCalleeSavedRegisters(
1968 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
1969 MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
1970 MachineFunction *MF = MBB.getParent();
1971 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1972 if (!ST.useVGPRBlockOpsForCSR())
1973 return false;
1974
1975 SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1976 MachineFrameInfo &MFI = MF->getFrameInfo();
1977 const SIInstrInfo *TII = ST.getInstrInfo();
1978 const SIRegisterInfo *SITRI = static_cast<const SIRegisterInfo *>(TRI);
1979 const TargetRegisterClass *BlockRegClass = SITRI->getRegClassForBlockOp(MF: *MF);
1980 for (const CalleeSavedInfo &CS : reverse(C&: CSI)) {
1981 Register Reg = CS.getReg();
1982 if (!BlockRegClass->contains(Reg) ||
1983 !FuncInfo->hasMaskForVGPRBlockOps(RegisterBlock: Reg)) {
1984 restoreCalleeSavedRegister(MBB, MI, CS, TII, TRI);
1985 continue;
1986 }
1987
1988 // Build a scratch block load.
1989 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(RegisterBlock: Reg);
1990 int FrameIndex = CS.getFrameIdx();
1991 MachinePointerInfo PtrInfo =
1992 MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1993 MachineMemOperand *MMO = MF->getMachineMemOperand(
1994 PtrInfo, F: MachineMemOperand::MOLoad, Size: MFI.getObjectSize(ObjectIdx: FrameIndex),
1995 BaseAlignment: MFI.getObjectAlign(ObjectIdx: FrameIndex));
1996
1997 auto MIB = BuildMI(BB&: MBB, I: MI, MIMD: MI->getDebugLoc(),
1998 MCID: TII->get(Opcode: AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), DestReg: Reg)
1999 .addFrameIndex(Idx: FrameIndex)
2000 .addReg(RegNo: FuncInfo->getStackPtrOffsetReg())
2001 .addImm(Val: 0)
2002 .addImm(Val: Mask)
2003 .addMemOperand(MMO);
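// Add the implicit operands needed when only part of the block is actually
// reloaded.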
2004 SITRI->addImplicitUsesForBlockCSRLoad(MIB, BlockReg: Reg);
2005
2006 // Add the register to the liveins. This is necessary because if any of the
2007 // VGPRs in the register block is reserved (e.g. if it's a WWM register),
2008 // then the whole block will be marked as reserved and `updateLiveness` will
2009 // skip it.
2010 MBB.addLiveIn(PhysReg: Reg);
2011 }
2012
2013 MBB.sortUniqueLiveIns();
2014 return true;
2015}
2016
2017MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
2018 MachineFunction &MF,
2019 MachineBasicBlock &MBB,
2020 MachineBasicBlock::iterator I) const {
2021 int64_t Amount = I->getOperand(i: 0).getImm();
2022 if (Amount == 0)
2023 return MBB.erase(I);
2024
2025 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2026 const SIInstrInfo *TII = ST.getInstrInfo();
2027 const DebugLoc &DL = I->getDebugLoc();
2028 unsigned Opc = I->getOpcode();
2029 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
2030 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(i: 1).getImm() : 0;
2031
2032 if (!hasReservedCallFrame(MF)) {
2033 Amount = alignTo(Size: Amount, A: getStackAlign());
2034 assert(isUInt<32>(Amount) && "exceeded stack address space size");
2035 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2036 Register SPReg = MFI->getStackPtrOffsetReg();
2037
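// The stack pointer is tracked in scratch units; scale the byte amount
// accordingly (by the wavefront size when flat scratch is not in use).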
2038 Amount *= getScratchScaleFactor(ST);
2039 if (IsDestroy)
2040 Amount = -Amount;
2041 auto Add = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SPReg)
2042 .addReg(RegNo: SPReg)
2043 .addImm(Val: Amount);
2044 Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
2045 } else if (CalleePopAmount != 0) {
2046 llvm_unreachable("is this used?");
2047 }
2048
2049 return MBB.erase(I);
2050}
2051
2052/// Returns true if the frame will require a reference to the stack pointer.
2053///
2054/// This is the set of conditions common to setting up the stack pointer in a
2055/// kernel, and for using a frame pointer in a callable function.
2056///
2057/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
2058/// references SP.
2059static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
2060 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
2061}
2062
2063 // The FP for kernels is always known to be 0, so we never really need to set
2064 // up an explicit register for it. However, DisableFramePointerElim will force
2065 // us to use a register for it.
2066bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
2067 const MachineFrameInfo &MFI = MF.getFrameInfo();
2068
2069 // For entry & chain functions we can use an immediate offset in most cases,
2070 // so the presence of calls doesn't imply we need a distinct frame pointer.
2071 if (MFI.hasCalls() &&
2072 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
2073 !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
2074 // All offsets are unsigned, so they need to be addressed in the same
2075 // direction as stack growth.
2076
2077 // FIXME: This function is pretty broken, since it can be called before the
2078 // frame layout is determined or CSR spills are inserted.
2079 return MFI.getStackSize() != 0;
2080 }
2081
2082 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
2083 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
2084 MF) ||
2085 mayReserveScratchForCWSR(MF) ||
2086 MF.getTarget().Options.DisableFramePointerElim(MF);
2087}
2088
2089bool SIFrameLowering::mayReserveScratchForCWSR(
2090 const MachineFunction &MF) const {
2091 return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
2092 AMDGPU::isEntryFunctionCC(CC: MF.getFunction().getCallingConv()) &&
2093 AMDGPU::isCompute(CC: MF.getFunction().getCallingConv());
2094}
2095
2096 // This is essentially a reduced version of hasFP for entry functions. Since
2097 // the stack pointer is known to be 0 on entry to kernels, we never really
2098 // need an FP register. We may need to initialize the stack pointer depending
2099 // on the frame properties, which logically overlaps many of the cases where
2100 // an ordinary function would require an FP.
2101// Also used for chain functions. While not technically entry functions, chain
2102// functions may need to set up a stack pointer in some situations.
2103bool SIFrameLowering::requiresStackPointerReference(
2104 const MachineFunction &MF) const {
2105 // Callable functions always require a stack pointer reference.
2106 assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
2107 MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
2108 "only expected to call this for entry points and chain functions");
2109
2110 const MachineFrameInfo &MFI = MF.getFrameInfo();
2111
2112 // Entry points ordinarily don't need to initialize SP. We have to set it up
2113 // for callees if there are any. Also note tail calls are impossible/don't
2114 // make any sense for kernels.
2115 if (MFI.hasCalls())
2116 return true;
2117
2118 // We still need to initialize the SP if we're doing anything weird that
2119 // references the SP, like variable sized stack objects.
2120 return frameTriviallyRequiresSP(MFI);
2121}
2122