1 | //===----------------------- SIFrameLowering.cpp --------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //==-----------------------------------------------------------------------===// |
8 | |
9 | #include "SIFrameLowering.h" |
10 | #include "AMDGPU.h" |
11 | #include "GCNSubtarget.h" |
12 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
13 | #include "SIMachineFunctionInfo.h" |
14 | #include "llvm/CodeGen/LiveRegUnits.h" |
15 | #include "llvm/CodeGen/MachineFrameInfo.h" |
16 | #include "llvm/CodeGen/RegisterScavenging.h" |
17 | #include "llvm/Target/TargetMachine.h" |
18 | |
19 | using namespace llvm; |
20 | |
21 | #define DEBUG_TYPE "frame-info" |
22 | |
23 | static cl::opt<bool> EnableSpillVGPRToAGPR( |
24 | "amdgpu-spill-vgpr-to-agpr" , |
25 | cl::desc("Enable spilling VGPRs to AGPRs" ), |
26 | cl::ReallyHidden, |
27 | cl::init(Val: true)); |
28 | |
29 | // Find a register matching \p RC from \p LiveUnits which is unused and |
30 | // available throughout the function. On failure, returns AMDGPU::NoRegister. |
31 | // TODO: Rewrite the loop here to iterate over MCRegUnits instead of |
32 | // MCRegisters. This should reduce the number of iterations and avoid redundant |
33 | // checking. |
34 | static MCRegister findUnusedRegister(MachineRegisterInfo &MRI, |
35 | const LiveRegUnits &LiveUnits, |
36 | const TargetRegisterClass &RC) { |
37 | for (MCRegister Reg : RC) { |
38 | if (!MRI.isPhysRegUsed(PhysReg: Reg) && LiveUnits.available(Reg) && |
39 | !MRI.isReserved(PhysReg: Reg)) |
40 | return Reg; |
41 | } |
42 | return MCRegister(); |
43 | } |
44 | |
45 | // Find a scratch register that we can use in the prologue. We avoid using |
46 | // callee-save registers since they may appear to be free when this is called |
47 | // from canUseAsPrologue (during shrink wrapping), but then no longer be free |
48 | // when this is called from emitPrologue. |
49 | static MCRegister findScratchNonCalleeSaveRegister( |
50 | MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits, |
51 | const TargetRegisterClass &RC, bool Unused = false) { |
52 | // Mark callee saved registers as used so we will not choose them. |
53 | const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); |
54 | for (unsigned i = 0; CSRegs[i]; ++i) |
55 | LiveUnits.addReg(Reg: CSRegs[i]); |
56 | |
57 | // We are looking for a register that can be used throughout the entire |
58 | // function, so any use is unacceptable. |
59 | if (Unused) |
60 | return findUnusedRegister(MRI, LiveUnits, RC); |
61 | |
62 | for (MCRegister Reg : RC) { |
63 | if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg)) |
64 | return Reg; |
65 | } |
66 | |
67 | return MCRegister(); |
68 | } |
69 | |
70 | /// Query target location for spilling SGPRs |
71 | /// \p IncludeScratchCopy : Also look for free scratch SGPRs |
72 | static void getVGPRSpillLaneOrTempRegister( |
73 | MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR, |
74 | const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass, |
75 | bool IncludeScratchCopy = true) { |
76 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
77 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
78 | |
79 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
80 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
81 | unsigned Size = TRI->getSpillSize(RC); |
82 | Align Alignment = TRI->getSpillAlign(RC); |
83 | |
84 | // We need to save and restore the given SGPR. |
85 | |
86 | Register ScratchSGPR; |
87 | // 1: Try to save the given register into an unused scratch SGPR. The |
88 | // LiveUnits should have all the callee saved registers marked as used. For |
89 | // certain cases we skip copy to scratch SGPR. |
90 | if (IncludeScratchCopy) |
91 | ScratchSGPR = findUnusedRegister(MRI&: MF.getRegInfo(), LiveUnits, RC); |
92 | |
93 | if (!ScratchSGPR) { |
94 | int FI = FrameInfo.CreateStackObject(Size, Alignment, isSpillSlot: true, Alloca: nullptr, |
95 | ID: TargetStackID::SGPRSpill); |
96 | |
97 | if (TRI->spillSGPRToVGPR() && |
98 | MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true, |
99 | /*IsPrologEpilog=*/true)) { |
100 | // 2: There's no free lane to spill, and no free register to save the |
101 | // SGPR, so we're forced to take another VGPR to use for the spill. |
102 | MFI->addToPrologEpilogSGPRSpills( |
103 | Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo( |
104 | SGPRSaveKind::SPILL_TO_VGPR_LANE, FI)); |
105 | |
106 | LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front(); |
107 | dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " |
108 | << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane |
109 | << '\n';); |
110 | } else { |
111 | // Remove dead <FI> index |
112 | MF.getFrameInfo().RemoveStackObject(ObjectIdx: FI); |
113 | // 3: If all else fails, spill the register to memory. |
114 | FI = FrameInfo.CreateSpillStackObject(Size, Alignment); |
115 | MFI->addToPrologEpilogSGPRSpills( |
116 | Reg: SGPR, |
117 | SI: PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI)); |
118 | LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling " |
119 | << printReg(SGPR, TRI) << '\n'); |
120 | } |
121 | } else { |
122 | MFI->addToPrologEpilogSGPRSpills( |
123 | Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo( |
124 | SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR)); |
125 | LiveUnits.addReg(Reg: ScratchSGPR); |
126 | LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to " |
127 | << printReg(ScratchSGPR, TRI) << '\n'); |
128 | } |
129 | } |
130 | |
131 | // We need to specially emit stack operations here because a different frame |
132 | // register is used than in the rest of the function, as getFrameRegister would |
133 | // use. |
134 | static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, |
135 | const SIMachineFunctionInfo &FuncInfo, |
136 | LiveRegUnits &LiveUnits, MachineFunction &MF, |
137 | MachineBasicBlock &MBB, |
138 | MachineBasicBlock::iterator I, const DebugLoc &DL, |
139 | Register SpillReg, int FI, Register FrameReg, |
140 | int64_t DwordOff = 0) { |
141 | unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR |
142 | : AMDGPU::BUFFER_STORE_DWORD_OFFSET; |
143 | |
144 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
145 | MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); |
146 | MachineMemOperand *MMO = MF.getMachineMemOperand( |
147 | PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FI), |
148 | BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI)); |
149 | LiveUnits.addReg(Reg: SpillReg); |
150 | bool IsKill = !MBB.isLiveIn(Reg: SpillReg); |
151 | TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: IsKill, ScratchOffsetReg: FrameReg, |
152 | InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits); |
153 | if (IsKill) |
154 | LiveUnits.removeReg(Reg: SpillReg); |
155 | } |
156 | |
157 | static void buildEpilogRestore(const GCNSubtarget &ST, |
158 | const SIRegisterInfo &TRI, |
159 | const SIMachineFunctionInfo &FuncInfo, |
160 | LiveRegUnits &LiveUnits, MachineFunction &MF, |
161 | MachineBasicBlock &MBB, |
162 | MachineBasicBlock::iterator I, |
163 | const DebugLoc &DL, Register SpillReg, int FI, |
164 | Register FrameReg, int64_t DwordOff = 0) { |
165 | unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR |
166 | : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; |
167 | |
168 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
169 | MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); |
170 | MachineMemOperand *MMO = MF.getMachineMemOperand( |
171 | PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FI), |
172 | BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI)); |
173 | TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: false, ScratchOffsetReg: FrameReg, |
174 | InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits); |
175 | } |
176 | |
177 | static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, |
178 | const DebugLoc &DL, const SIInstrInfo *TII, |
179 | Register TargetReg) { |
180 | MachineFunction *MF = MBB.getParent(); |
181 | const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
182 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
183 | const MCInstrDesc &SMovB32 = TII->get(Opcode: AMDGPU::S_MOV_B32); |
184 | Register TargetLo = TRI->getSubReg(Reg: TargetReg, Idx: AMDGPU::sub0); |
185 | Register TargetHi = TRI->getSubReg(Reg: TargetReg, Idx: AMDGPU::sub1); |
186 | |
187 | if (MFI->getGITPtrHigh() != 0xffffffff) { |
188 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetHi) |
189 | .addImm(Val: MFI->getGITPtrHigh()) |
190 | .addReg(RegNo: TargetReg, flags: RegState::ImplicitDefine); |
191 | } else { |
192 | const MCInstrDesc &GetPC64 = TII->get(Opcode: AMDGPU::S_GETPC_B64_pseudo); |
193 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: GetPC64, DestReg: TargetReg); |
194 | } |
195 | Register GitPtrLo = MFI->getGITPtrLoReg(MF: *MF); |
196 | MF->getRegInfo().addLiveIn(Reg: GitPtrLo); |
197 | MBB.addLiveIn(PhysReg: GitPtrLo); |
198 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetLo) |
199 | .addReg(RegNo: GitPtrLo); |
200 | } |
201 | |
202 | static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI, |
203 | const SIMachineFunctionInfo *FuncInfo, |
204 | MachineFunction &MF, MachineBasicBlock &MBB, |
205 | MachineBasicBlock::iterator MBBI, bool IsProlog) { |
206 | if (LiveUnits.empty()) { |
207 | LiveUnits.init(TRI); |
208 | if (IsProlog) { |
209 | LiveUnits.addLiveIns(MBB); |
210 | } else { |
211 | // In epilog. |
212 | LiveUnits.addLiveOuts(MBB); |
213 | LiveUnits.stepBackward(MI: *MBBI); |
214 | } |
215 | } |
216 | } |
217 | |
218 | namespace llvm { |
219 | |
220 | // SpillBuilder to save/restore special SGPR spills like the one needed for FP, |
221 | // BP, etc. These spills are delayed until the current function's frame is |
222 | // finalized. For a given register, the builder uses the |
223 | // PrologEpilogSGPRSaveRestoreInfo to decide the spill method. |
224 | class PrologEpilogSGPRSpillBuilder { |
225 | MachineBasicBlock::iterator MI; |
226 | MachineBasicBlock &MBB; |
227 | MachineFunction &MF; |
228 | const GCNSubtarget &ST; |
229 | MachineFrameInfo &MFI; |
230 | SIMachineFunctionInfo *FuncInfo; |
231 | const SIInstrInfo *TII; |
232 | const SIRegisterInfo &TRI; |
233 | Register SuperReg; |
234 | const PrologEpilogSGPRSaveRestoreInfo SI; |
235 | LiveRegUnits &LiveUnits; |
236 | const DebugLoc &DL; |
237 | Register FrameReg; |
238 | ArrayRef<int16_t> SplitParts; |
239 | unsigned NumSubRegs; |
240 | unsigned EltSize = 4; |
241 | |
242 | void saveToMemory(const int FI) const { |
243 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
244 | assert(!MFI.isDeadObjectIndex(FI)); |
245 | |
246 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ true); |
247 | |
248 | MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( |
249 | MRI, LiveUnits, RC: AMDGPU::VGPR_32RegClass); |
250 | if (!TmpVGPR) |
251 | report_fatal_error(reason: "failed to find free scratch register" ); |
252 | |
253 | for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { |
254 | Register SubReg = NumSubRegs == 1 |
255 | ? SuperReg |
256 | : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I])); |
257 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: TmpVGPR) |
258 | .addReg(RegNo: SubReg); |
259 | |
260 | buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL, SpillReg: TmpVGPR, |
261 | FI, FrameReg, DwordOff); |
262 | DwordOff += 4; |
263 | } |
264 | } |
265 | |
266 | void saveToVGPRLane(const int FI) const { |
267 | assert(!MFI.isDeadObjectIndex(FI)); |
268 | |
269 | assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); |
270 | ArrayRef<SIRegisterInfo::SpilledReg> Spill = |
271 | FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI); |
272 | assert(Spill.size() == NumSubRegs); |
273 | |
274 | for (unsigned I = 0; I < NumSubRegs; ++I) { |
275 | Register SubReg = NumSubRegs == 1 |
276 | ? SuperReg |
277 | : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I])); |
278 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_SPILL_S32_TO_VGPR), |
279 | DestReg: Spill[I].VGPR) |
280 | .addReg(RegNo: SubReg) |
281 | .addImm(Val: Spill[I].Lane) |
282 | .addReg(RegNo: Spill[I].VGPR, flags: RegState::Undef); |
283 | } |
284 | } |
285 | |
286 | void copyToScratchSGPR(Register DstReg) const { |
287 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: DstReg) |
288 | .addReg(RegNo: SuperReg) |
289 | .setMIFlag(MachineInstr::FrameSetup); |
290 | } |
291 | |
292 | void restoreFromMemory(const int FI) { |
293 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
294 | |
295 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ false); |
296 | MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( |
297 | MRI, LiveUnits, RC: AMDGPU::VGPR_32RegClass); |
298 | if (!TmpVGPR) |
299 | report_fatal_error(reason: "failed to find free scratch register" ); |
300 | |
301 | for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { |
302 | Register SubReg = NumSubRegs == 1 |
303 | ? SuperReg |
304 | : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I])); |
305 | |
306 | buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL, |
307 | SpillReg: TmpVGPR, FI, FrameReg, DwordOff); |
308 | MRI.constrainRegClass(Reg: SubReg, RC: &AMDGPU::SReg_32_XM0RegClass); |
309 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: SubReg) |
310 | .addReg(RegNo: TmpVGPR, flags: RegState::Kill); |
311 | DwordOff += 4; |
312 | } |
313 | } |
314 | |
315 | void restoreFromVGPRLane(const int FI) { |
316 | assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); |
317 | ArrayRef<SIRegisterInfo::SpilledReg> Spill = |
318 | FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI); |
319 | assert(Spill.size() == NumSubRegs); |
320 | |
321 | for (unsigned I = 0; I < NumSubRegs; ++I) { |
322 | Register SubReg = NumSubRegs == 1 |
323 | ? SuperReg |
324 | : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I])); |
325 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_RESTORE_S32_FROM_VGPR), DestReg: SubReg) |
326 | .addReg(RegNo: Spill[I].VGPR) |
327 | .addImm(Val: Spill[I].Lane); |
328 | } |
329 | } |
330 | |
331 | void copyFromScratchSGPR(Register SrcReg) const { |
332 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SuperReg) |
333 | .addReg(RegNo: SrcReg) |
334 | .setMIFlag(MachineInstr::FrameDestroy); |
335 | } |
336 | |
337 | public: |
338 | PrologEpilogSGPRSpillBuilder(Register Reg, |
339 | const PrologEpilogSGPRSaveRestoreInfo SI, |
340 | MachineBasicBlock &MBB, |
341 | MachineBasicBlock::iterator MI, |
342 | const DebugLoc &DL, const SIInstrInfo *TII, |
343 | const SIRegisterInfo &TRI, |
344 | LiveRegUnits &LiveUnits, Register FrameReg) |
345 | : MI(MI), MBB(MBB), MF(*MBB.getParent()), |
346 | ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()), |
347 | FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), |
348 | SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL), |
349 | FrameReg(FrameReg) { |
350 | const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg: SuperReg); |
351 | SplitParts = TRI.getRegSplitParts(RC, EltSize); |
352 | NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); |
353 | |
354 | assert(SuperReg != AMDGPU::M0 && "m0 should never spill" ); |
355 | } |
356 | |
357 | void save() { |
358 | switch (SI.getKind()) { |
359 | case SGPRSaveKind::SPILL_TO_MEM: |
360 | return saveToMemory(FI: SI.getIndex()); |
361 | case SGPRSaveKind::SPILL_TO_VGPR_LANE: |
362 | return saveToVGPRLane(FI: SI.getIndex()); |
363 | case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: |
364 | return copyToScratchSGPR(DstReg: SI.getReg()); |
365 | } |
366 | } |
367 | |
368 | void restore() { |
369 | switch (SI.getKind()) { |
370 | case SGPRSaveKind::SPILL_TO_MEM: |
371 | return restoreFromMemory(FI: SI.getIndex()); |
372 | case SGPRSaveKind::SPILL_TO_VGPR_LANE: |
373 | return restoreFromVGPRLane(FI: SI.getIndex()); |
374 | case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: |
375 | return copyFromScratchSGPR(SrcReg: SI.getReg()); |
376 | } |
377 | } |
378 | }; |
379 | |
380 | } // namespace llvm |
381 | |
382 | // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` |
383 | void SIFrameLowering::emitEntryFunctionFlatScratchInit( |
384 | MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, |
385 | const DebugLoc &DL, Register ScratchWaveOffsetReg) const { |
386 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
387 | const SIInstrInfo *TII = ST.getInstrInfo(); |
388 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
389 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
390 | |
391 | // We don't need this if we only have spills since there is no user facing |
392 | // scratch. |
393 | |
394 | // TODO: If we know we don't have flat instructions earlier, we can omit |
395 | // this from the input registers. |
396 | // |
397 | // TODO: We only need to know if we access scratch space through a flat |
398 | // pointer. Because we only detect if flat instructions are used at all, |
399 | // this will be used more often than necessary on VI. |
400 | |
401 | Register FlatScrInitLo; |
402 | Register FlatScrInitHi; |
403 | |
404 | if (ST.isAmdPalOS()) { |
405 | // Extract the scratch offset from the descriptor in the GIT |
406 | LiveRegUnits LiveUnits; |
407 | LiveUnits.init(TRI: *TRI); |
408 | LiveUnits.addLiveIns(MBB); |
409 | |
410 | // Find unused reg to load flat scratch init into |
411 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
412 | Register FlatScrInit = AMDGPU::NoRegister; |
413 | ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF); |
414 | unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2; |
415 | AllSGPR64s = AllSGPR64s.slice( |
416 | N: std::min(a: static_cast<unsigned>(AllSGPR64s.size()), b: NumPreloaded)); |
417 | Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); |
418 | for (MCPhysReg Reg : AllSGPR64s) { |
419 | if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg) && |
420 | MRI.isAllocatable(PhysReg: Reg) && !TRI->isSubRegisterEq(RegA: Reg, RegB: GITPtrLoReg)) { |
421 | FlatScrInit = Reg; |
422 | break; |
423 | } |
424 | } |
425 | assert(FlatScrInit && "Failed to find free register for scratch init" ); |
426 | |
427 | FlatScrInitLo = TRI->getSubReg(Reg: FlatScrInit, Idx: AMDGPU::sub0); |
428 | FlatScrInitHi = TRI->getSubReg(Reg: FlatScrInit, Idx: AMDGPU::sub1); |
429 | |
430 | buildGitPtr(MBB, I, DL, TII, TargetReg: FlatScrInit); |
431 | |
432 | // We now have the GIT ptr - now get the scratch descriptor from the entry |
433 | // at offset 0 (or offset 16 for a compute shader). |
434 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
435 | const MCInstrDesc &LoadDwordX2 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX2_IMM); |
436 | auto *MMO = MF.getMachineMemOperand( |
437 | PtrInfo, |
438 | F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | |
439 | MachineMemOperand::MODereferenceable, |
440 | Size: 8, BaseAlignment: Align(4)); |
441 | unsigned Offset = |
442 | MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; |
443 | const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); |
444 | unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST: Subtarget, ByteOffset: Offset); |
445 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: FlatScrInit) |
446 | .addReg(RegNo: FlatScrInit) |
447 | .addImm(Val: EncodedOffset) // offset |
448 | .addImm(Val: 0) // cpol |
449 | .addMemOperand(MMO); |
450 | |
451 | // Mask the offset in [47:0] of the descriptor |
452 | const MCInstrDesc &SAndB32 = TII->get(Opcode: AMDGPU::S_AND_B32); |
453 | auto And = BuildMI(BB&: MBB, I, MIMD: DL, MCID: SAndB32, DestReg: FlatScrInitHi) |
454 | .addReg(RegNo: FlatScrInitHi) |
455 | .addImm(Val: 0xffff); |
456 | And->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
457 | } else { |
458 | Register FlatScratchInitReg = |
459 | MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); |
460 | assert(FlatScratchInitReg); |
461 | |
462 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
463 | MRI.addLiveIn(Reg: FlatScratchInitReg); |
464 | MBB.addLiveIn(PhysReg: FlatScratchInitReg); |
465 | |
466 | FlatScrInitLo = TRI->getSubReg(Reg: FlatScratchInitReg, Idx: AMDGPU::sub0); |
467 | FlatScrInitHi = TRI->getSubReg(Reg: FlatScratchInitReg, Idx: AMDGPU::sub1); |
468 | } |
469 | |
470 | // Do a 64-bit pointer add. |
471 | if (ST.flatScratchIsPointer()) { |
472 | if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { |
473 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: FlatScrInitLo) |
474 | .addReg(RegNo: FlatScrInitLo) |
475 | .addReg(RegNo: ScratchWaveOffsetReg); |
476 | auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32), |
477 | DestReg: FlatScrInitHi) |
478 | .addReg(RegNo: FlatScrInitHi) |
479 | .addImm(Val: 0); |
480 | Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
481 | |
482 | using namespace AMDGPU::Hwreg; |
483 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_B32)) |
484 | .addReg(RegNo: FlatScrInitLo) |
485 | .addImm(Val: int16_t(HwregEncoding::encode(Values: ID_FLAT_SCR_LO, Values: 0, Values: 32))); |
486 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_B32)) |
487 | .addReg(RegNo: FlatScrInitHi) |
488 | .addImm(Val: int16_t(HwregEncoding::encode(Values: ID_FLAT_SCR_HI, Values: 0, Values: 32))); |
489 | return; |
490 | } |
491 | |
492 | // For GFX9. |
493 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: AMDGPU::FLAT_SCR_LO) |
494 | .addReg(RegNo: FlatScrInitLo) |
495 | .addReg(RegNo: ScratchWaveOffsetReg); |
496 | auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32), |
497 | DestReg: AMDGPU::FLAT_SCR_HI) |
498 | .addReg(RegNo: FlatScrInitHi) |
499 | .addImm(Val: 0); |
500 | Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
501 | |
502 | return; |
503 | } |
504 | |
505 | assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); |
506 | |
507 | // Copy the size in bytes. |
508 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::FLAT_SCR_LO) |
509 | .addReg(RegNo: FlatScrInitHi, flags: RegState::Kill); |
510 | |
511 | // Add wave offset in bytes to private base offset. |
512 | // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. |
513 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: FlatScrInitLo) |
514 | .addReg(RegNo: FlatScrInitLo) |
515 | .addReg(RegNo: ScratchWaveOffsetReg); |
516 | |
517 | // Convert offset to 256-byte units. |
518 | auto LShr = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_LSHR_B32), |
519 | DestReg: AMDGPU::FLAT_SCR_HI) |
520 | .addReg(RegNo: FlatScrInitLo, flags: RegState::Kill) |
521 | .addImm(Val: 8); |
522 | LShr->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
523 | } |
524 | |
525 | // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not |
526 | // memory. They should have been removed by now. |
527 | static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { |
528 | for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); |
529 | I != E; ++I) { |
530 | if (!MFI.isDeadObjectIndex(ObjectIdx: I)) |
531 | return false; |
532 | } |
533 | |
534 | return true; |
535 | } |
536 | |
537 | // Shift down registers reserved for the scratch RSRC. |
538 | Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( |
539 | MachineFunction &MF) const { |
540 | |
541 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
542 | const SIInstrInfo *TII = ST.getInstrInfo(); |
543 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
544 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
545 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
546 | |
547 | assert(MFI->isEntryFunction()); |
548 | |
549 | Register ScratchRsrcReg = MFI->getScratchRSrcReg(); |
550 | |
551 | if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(PhysReg: ScratchRsrcReg) && |
552 | allStackObjectsAreDead(MFI: MF.getFrameInfo()))) |
553 | return Register(); |
554 | |
555 | if (ST.hasSGPRInitBug() || |
556 | ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF)) |
557 | return ScratchRsrcReg; |
558 | |
559 | // We reserved the last registers for this. Shift it down to the end of those |
560 | // which were actually used. |
561 | // |
562 | // FIXME: It might be safer to use a pseudoregister before replacement. |
563 | |
564 | // FIXME: We should be able to eliminate unused input registers. We only |
565 | // cannot do this for the resources required for scratch access. For now we |
566 | // skip over user SGPRs and may leave unused holes. |
567 | |
568 | unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; |
569 | ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF); |
570 | AllSGPR128s = AllSGPR128s.slice(N: std::min(a: static_cast<unsigned>(AllSGPR128s.size()), b: NumPreloaded)); |
571 | |
572 | // Skip the last N reserved elements because they should have already been |
573 | // reserved for VCC etc. |
574 | Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); |
575 | for (MCPhysReg Reg : AllSGPR128s) { |
576 | // Pick the first unallocated one. Make sure we don't clobber the other |
577 | // reserved input we needed. Also for PAL, make sure we don't clobber |
578 | // the GIT pointer passed in SGPR0 or SGPR8. |
579 | if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) && |
580 | (!GITPtrLoReg || !TRI->isSubRegisterEq(RegA: Reg, RegB: GITPtrLoReg))) { |
581 | MRI.replaceRegWith(FromReg: ScratchRsrcReg, ToReg: Reg); |
582 | MFI->setScratchRSrcReg(Reg); |
583 | MRI.reserveReg(PhysReg: Reg, TRI); |
584 | return Reg; |
585 | } |
586 | } |
587 | |
588 | return ScratchRsrcReg; |
589 | } |
590 | |
591 | static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { |
592 | return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize(); |
593 | } |
594 | |
595 | void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, |
596 | MachineBasicBlock &MBB) const { |
597 | assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported" ); |
598 | |
599 | // FIXME: If we only have SGPR spills, we won't actually be using scratch |
600 | // memory since these spill to VGPRs. We should be cleaning up these unused |
601 | // SGPR spill frame indices somewhere. |
602 | |
603 | // FIXME: We still have implicit uses on SGPR spill instructions in case they |
604 | // need to spill to vector memory. It's likely that will not happen, but at |
605 | // this point it appears we need the setup. This part of the prolog should be |
606 | // emitted after frame indices are eliminated. |
607 | |
608 | // FIXME: Remove all of the isPhysRegUsed checks |
609 | |
610 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
611 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
612 | const SIInstrInfo *TII = ST.getInstrInfo(); |
613 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
614 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
615 | const Function &F = MF.getFunction(); |
616 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
617 | |
618 | assert(MFI->isEntryFunction()); |
619 | |
620 | Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( |
621 | Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); |
622 | |
623 | // We need to do the replacement of the private segment buffer register even |
624 | // if there are no stack objects. There could be stores to undef or a |
625 | // constant without an associated object. |
626 | // |
627 | // This will return `Register()` in cases where there are no actual |
628 | // uses of the SRSRC. |
629 | Register ScratchRsrcReg; |
630 | if (!ST.enableFlatScratch()) |
631 | ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); |
632 | |
633 | // Make the selected register live throughout the function. |
634 | if (ScratchRsrcReg) { |
635 | for (MachineBasicBlock &OtherBB : MF) { |
636 | if (&OtherBB != &MBB) { |
637 | OtherBB.addLiveIn(PhysReg: ScratchRsrcReg); |
638 | } |
639 | } |
640 | } |
641 | |
642 | // Now that we have fixed the reserved SRSRC we need to locate the |
643 | // (potentially) preloaded SRSRC. |
644 | Register PreloadedScratchRsrcReg; |
645 | if (ST.isAmdHsaOrMesa(F)) { |
646 | PreloadedScratchRsrcReg = |
647 | MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); |
648 | if (ScratchRsrcReg && PreloadedScratchRsrcReg) { |
649 | // We added live-ins during argument lowering, but since they were not |
650 | // used they were deleted. We're adding the uses now, so add them back. |
651 | MRI.addLiveIn(Reg: PreloadedScratchRsrcReg); |
652 | MBB.addLiveIn(PhysReg: PreloadedScratchRsrcReg); |
653 | } |
654 | } |
655 | |
656 | // Debug location must be unknown since the first debug location is used to |
657 | // determine the end of the prologue. |
658 | DebugLoc DL; |
659 | MachineBasicBlock::iterator I = MBB.begin(); |
660 | |
661 | // We found the SRSRC first because it needs four registers and has an |
662 | // alignment requirement. If the SRSRC that we found is clobbering with |
663 | // the scratch wave offset, which may be in a fixed SGPR or a free SGPR |
664 | // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch |
665 | // wave offset to a free SGPR. |
666 | Register ScratchWaveOffsetReg; |
667 | if (PreloadedScratchWaveOffsetReg && |
668 | TRI->isSubRegisterEq(RegA: ScratchRsrcReg, RegB: PreloadedScratchWaveOffsetReg)) { |
669 | ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF); |
670 | unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); |
671 | AllSGPRs = AllSGPRs.slice( |
672 | N: std::min(a: static_cast<unsigned>(AllSGPRs.size()), b: NumPreloaded)); |
673 | Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); |
674 | for (MCPhysReg Reg : AllSGPRs) { |
675 | if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) && |
676 | !TRI->isSubRegisterEq(RegA: ScratchRsrcReg, RegB: Reg) && GITPtrLoReg != Reg) { |
677 | ScratchWaveOffsetReg = Reg; |
678 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: ScratchWaveOffsetReg) |
679 | .addReg(RegNo: PreloadedScratchWaveOffsetReg, flags: RegState::Kill); |
680 | break; |
681 | } |
682 | } |
683 | |
684 | // FIXME: We can spill incoming arguments and restore at the end of the |
685 | // prolog. |
686 | if (!ScratchWaveOffsetReg) |
687 | report_fatal_error( |
688 | reason: "could not find temporary scratch offset register in prolog" ); |
689 | } else { |
690 | ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg; |
691 | } |
692 | assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg); |
693 | |
694 | unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST); |
695 | if (!mayReserveScratchForCWSR(MF)) { |
696 | if (hasFP(MF)) { |
697 | Register FPReg = MFI->getFrameOffsetReg(); |
698 | assert(FPReg != AMDGPU::FP_REG); |
699 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: FPReg).addImm(Val: 0); |
700 | } |
701 | |
702 | if (requiresStackPointerReference(MF)) { |
703 | Register SPReg = MFI->getStackPtrOffsetReg(); |
704 | assert(SPReg != AMDGPU::SP_REG); |
705 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: SPReg).addImm(Val: Offset); |
706 | } |
707 | } else { |
708 | // We need to check if we're on a compute queue - if we are, then the CWSR |
709 | // trap handler may need to store some VGPRs on the stack. The first VGPR |
710 | // block is saved separately, so we only need to allocate space for any |
711 | // additional VGPR blocks used. For now, we will make sure there's enough |
712 | // room for the theoretical maximum number of VGPRs that can be allocated. |
713 | // FIXME: Figure out if the shader uses fewer VGPRs in practice. |
714 | assert(hasFP(MF)); |
715 | Register FPReg = MFI->getFrameOffsetReg(); |
716 | assert(FPReg != AMDGPU::FP_REG); |
717 | unsigned VGPRSize = llvm::alignTo( |
718 | Size: (ST.getAddressableNumVGPRs(DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize()) - |
719 | AMDGPU::IsaInfo::getVGPRAllocGranule(STI: &ST, |
720 | DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize())) * |
721 | 4, |
722 | A: FrameInfo.getMaxAlign()); |
723 | MFI->setScratchReservedForDynamicVGPRs(VGPRSize); |
724 | |
725 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: FPReg) |
726 | .addImm(Val: AMDGPU::Hwreg::HwregEncoding::encode( |
727 | Values: AMDGPU::Hwreg::ID_HW_ID2, Values: AMDGPU::Hwreg::OFFSET_ME_ID, Values: 2)); |
728 | // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute |
729 | // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set |
730 | // SCC, so we need to check for 0 manually. |
731 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32)).addImm(Val: 0).addReg(RegNo: FPReg); |
732 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMOVK_I32), DestReg: FPReg).addImm(Val: VGPRSize); |
733 | if (requiresStackPointerReference(MF)) { |
734 | Register SPReg = MFI->getStackPtrOffsetReg(); |
735 | assert(SPReg != AMDGPU::SP_REG); |
736 | |
737 | // If at least one of the constants can be inlined, then we can use |
738 | // s_cselect. Otherwise, use a mov and cmovk. |
739 | if (AMDGPU::isInlinableLiteral32(Literal: Offset, HasInv2Pi: ST.hasInv2PiInlineImm()) || |
740 | AMDGPU::isInlinableLiteral32(Literal: Offset + VGPRSize, |
741 | HasInv2Pi: ST.hasInv2PiInlineImm())) { |
742 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: SPReg) |
743 | .addImm(Val: Offset + VGPRSize) |
744 | .addImm(Val: Offset); |
745 | } else { |
746 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: SPReg).addImm(Val: Offset); |
747 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMOVK_I32), DestReg: SPReg) |
748 | .addImm(Val: Offset + VGPRSize); |
749 | } |
750 | } |
751 | } |
752 | |
753 | bool NeedsFlatScratchInit = |
754 | MFI->getUserSGPRInfo().hasFlatScratchInit() && |
755 | (MRI.isPhysRegUsed(PhysReg: AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || |
756 | (!allStackObjectsAreDead(MFI: FrameInfo) && ST.enableFlatScratch())); |
757 | |
758 | if ((NeedsFlatScratchInit || ScratchRsrcReg) && |
759 | PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) { |
760 | MRI.addLiveIn(Reg: PreloadedScratchWaveOffsetReg); |
761 | MBB.addLiveIn(PhysReg: PreloadedScratchWaveOffsetReg); |
762 | } |
763 | |
764 | if (NeedsFlatScratchInit) { |
765 | emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); |
766 | } |
767 | |
768 | if (ScratchRsrcReg) { |
769 | emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, |
770 | PreloadedPrivateBufferReg: PreloadedScratchRsrcReg, |
771 | ScratchRsrcReg, ScratchWaveOffsetReg); |
772 | } |
773 | } |
774 | |
775 | // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` |
776 | void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( |
777 | MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, |
778 | const DebugLoc &DL, Register PreloadedScratchRsrcReg, |
779 | Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { |
780 | |
781 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
782 | const SIInstrInfo *TII = ST.getInstrInfo(); |
783 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
784 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
785 | const Function &Fn = MF.getFunction(); |
786 | |
787 | if (ST.isAmdPalOS()) { |
788 | // The pointer to the GIT is formed from the offset passed in and either |
789 | // the amdgpu-git-ptr-high function attribute or the top part of the PC |
790 | Register Rsrc01 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0_sub1); |
791 | Register Rsrc03 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub3); |
792 | |
793 | buildGitPtr(MBB, I, DL, TII, TargetReg: Rsrc01); |
794 | |
795 | // We now have the GIT ptr - now get the scratch descriptor from the entry |
796 | // at offset 0 (or offset 16 for a compute shader). |
797 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
798 | const MCInstrDesc &LoadDwordX4 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX4_IMM); |
799 | auto *MMO = MF.getMachineMemOperand( |
800 | PtrInfo, |
801 | F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | |
802 | MachineMemOperand::MODereferenceable, |
803 | Size: 16, BaseAlignment: Align(4)); |
804 | unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; |
805 | const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); |
806 | unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST: Subtarget, ByteOffset: Offset); |
807 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX4, DestReg: ScratchRsrcReg) |
808 | .addReg(RegNo: Rsrc01) |
809 | .addImm(Val: EncodedOffset) // offset |
810 | .addImm(Val: 0) // cpol |
811 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine) |
812 | .addMemOperand(MMO); |
813 | |
814 | // The driver will always set the SRD for wave 64 (bits 118:117 of |
815 | // descriptor / bits 22:21 of third sub-reg will be 0b11) |
816 | // If the shader is actually wave32 we have to modify the const_index_stride |
817 | // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The |
818 | // reason the driver does this is that there can be cases where it presents |
819 | // 2 shaders with different wave size (e.g. VsFs). |
820 | // TODO: convert to using SCRATCH instructions or multiple SRD buffers |
821 | if (ST.isWave32()) { |
822 | const MCInstrDesc &SBitsetB32 = TII->get(Opcode: AMDGPU::S_BITSET0_B32); |
823 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SBitsetB32, DestReg: Rsrc03) |
824 | .addImm(Val: 21) |
825 | .addReg(RegNo: Rsrc03); |
826 | } |
827 | } else if (ST.isMesaGfxShader(F: Fn) || !PreloadedScratchRsrcReg) { |
828 | assert(!ST.isAmdHsaOrMesa(Fn)); |
829 | const MCInstrDesc &SMovB32 = TII->get(Opcode: AMDGPU::S_MOV_B32); |
830 | |
831 | Register Rsrc2 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub2); |
832 | Register Rsrc3 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub3); |
833 | |
834 | // Use relocations to get the pointer, and setup the other bits manually. |
835 | uint64_t Rsrc23 = TII->getScratchRsrcWords23(); |
836 | |
837 | if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) { |
838 | Register Rsrc01 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0_sub1); |
839 | |
840 | if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) { |
841 | const MCInstrDesc &Mov64 = TII->get(Opcode: AMDGPU::S_MOV_B64); |
842 | |
843 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: Mov64, DestReg: Rsrc01) |
844 | .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR()) |
845 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
846 | } else { |
847 | const MCInstrDesc &LoadDwordX2 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX2_IMM); |
848 | |
849 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
850 | auto *MMO = MF.getMachineMemOperand( |
851 | PtrInfo, |
852 | F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | |
853 | MachineMemOperand::MODereferenceable, |
854 | Size: 8, BaseAlignment: Align(4)); |
855 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: Rsrc01) |
856 | .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR()) |
857 | .addImm(Val: 0) // offset |
858 | .addImm(Val: 0) // cpol |
859 | .addMemOperand(MMO) |
860 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
861 | |
862 | MF.getRegInfo().addLiveIn(Reg: MFI->getImplicitBufferPtrUserSGPR()); |
863 | MBB.addLiveIn(PhysReg: MFI->getImplicitBufferPtrUserSGPR()); |
864 | } |
865 | } else { |
866 | Register Rsrc0 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0); |
867 | Register Rsrc1 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub1); |
868 | |
869 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc0) |
870 | .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD0" ) |
871 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
872 | |
873 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc1) |
874 | .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD1" ) |
875 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
876 | } |
877 | |
878 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc2) |
879 | .addImm(Val: Lo_32(Value: Rsrc23)) |
880 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
881 | |
882 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc3) |
883 | .addImm(Val: Hi_32(Value: Rsrc23)) |
884 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
885 | } else if (ST.isAmdHsaOrMesa(F: Fn)) { |
886 | assert(PreloadedScratchRsrcReg); |
887 | |
888 | if (ScratchRsrcReg != PreloadedScratchRsrcReg) { |
889 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: ScratchRsrcReg) |
890 | .addReg(RegNo: PreloadedScratchRsrcReg, flags: RegState::Kill); |
891 | } |
892 | } |
893 | |
894 | // Add the scratch wave offset into the scratch RSRC. |
895 | // |
896 | // We only want to update the first 48 bits, which is the base address |
897 | // pointer, without touching the adjacent 16 bits of flags. We know this add |
898 | // cannot carry-out from bit 47, otherwise the scratch allocation would be |
899 | // impossible to fit in the 48-bit global address space. |
900 | // |
901 | // TODO: Evaluate if it is better to just construct an SRD using the flat |
902 | // scratch init and some constants rather than update the one we are passed. |
903 | Register ScratchRsrcSub0 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0); |
904 | Register ScratchRsrcSub1 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub1); |
905 | |
906 | // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in |
907 | // the kernel body via inreg arguments. |
908 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: ScratchRsrcSub0) |
909 | .addReg(RegNo: ScratchRsrcSub0) |
910 | .addReg(RegNo: ScratchWaveOffsetReg) |
911 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
912 | auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32), DestReg: ScratchRsrcSub1) |
913 | .addReg(RegNo: ScratchRsrcSub1) |
914 | .addImm(Val: 0) |
915 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
916 | Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
917 | } |
918 | |
919 | bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { |
920 | switch (ID) { |
921 | case TargetStackID::Default: |
922 | case TargetStackID::NoAlloc: |
923 | case TargetStackID::SGPRSpill: |
924 | return true; |
925 | case TargetStackID::ScalableVector: |
926 | case TargetStackID::WasmLocal: |
927 | return false; |
928 | } |
929 | llvm_unreachable("Invalid TargetStackID::Value" ); |
930 | } |
931 | |
932 | // Activate only the inactive lanes when \p EnableInactiveLanes is true. |
933 | // Otherwise, activate all lanes. It returns the saved exec. |
934 | static Register buildScratchExecCopy(LiveRegUnits &LiveUnits, |
935 | MachineFunction &MF, |
936 | MachineBasicBlock &MBB, |
937 | MachineBasicBlock::iterator MBBI, |
938 | const DebugLoc &DL, bool IsProlog, |
939 | bool EnableInactiveLanes) { |
940 | Register ScratchExecCopy; |
941 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
942 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
943 | const SIInstrInfo *TII = ST.getInstrInfo(); |
944 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
945 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
946 | |
947 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); |
948 | |
949 | ScratchExecCopy = findScratchNonCalleeSaveRegister( |
950 | MRI, LiveUnits, RC: *TRI.getWaveMaskRegClass()); |
951 | if (!ScratchExecCopy) |
952 | report_fatal_error(reason: "failed to find free scratch register" ); |
953 | |
954 | LiveUnits.addReg(Reg: ScratchExecCopy); |
955 | |
956 | const unsigned SaveExecOpc = |
957 | ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32 |
958 | : AMDGPU::S_OR_SAVEEXEC_B32) |
959 | : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64 |
960 | : AMDGPU::S_OR_SAVEEXEC_B64); |
961 | auto SaveExec = |
962 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: SaveExecOpc), DestReg: ScratchExecCopy).addImm(Val: -1); |
963 | SaveExec->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
964 | |
965 | return ScratchExecCopy; |
966 | } |
967 | |
968 | void SIFrameLowering::emitCSRSpillStores( |
969 | MachineFunction &MF, MachineBasicBlock &MBB, |
970 | MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, |
971 | Register FrameReg, Register FramePtrRegScratchCopy) const { |
972 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
973 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
974 | const SIInstrInfo *TII = ST.getInstrInfo(); |
975 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
976 | |
977 | // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch |
978 | // registers. However, save all lanes of callee-saved VGPRs. Due to this, we |
979 | // might end up flipping the EXEC bits twice. |
980 | Register ScratchExecCopy; |
981 | SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; |
982 | FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs); |
983 | if (!WWMScratchRegs.empty()) |
984 | ScratchExecCopy = |
985 | buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, |
986 | /*IsProlog*/ true, /*EnableInactiveLanes*/ true); |
987 | |
988 | auto StoreWWMRegisters = |
989 | [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { |
990 | for (const auto &Reg : WWMRegs) { |
991 | Register VGPR = Reg.first; |
992 | int FI = Reg.second; |
993 | buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL, |
994 | SpillReg: VGPR, FI, FrameReg); |
995 | } |
996 | }; |
997 | |
998 | StoreWWMRegisters(WWMScratchRegs); |
999 | if (!WWMCalleeSavedRegs.empty()) { |
1000 | if (ScratchExecCopy) { |
1001 | unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
1002 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: TRI.getExec()).addImm(Val: -1); |
1003 | } else { |
1004 | ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, |
1005 | /*IsProlog*/ true, |
1006 | /*EnableInactiveLanes*/ false); |
1007 | } |
1008 | } |
1009 | |
1010 | StoreWWMRegisters(WWMCalleeSavedRegs); |
1011 | if (ScratchExecCopy) { |
1012 | // FIXME: Split block and make terminator. |
1013 | unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
1014 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: ExecMov), DestReg: TRI.getExec()) |
1015 | .addReg(RegNo: ScratchExecCopy, flags: RegState::Kill); |
1016 | LiveUnits.addReg(Reg: ScratchExecCopy); |
1017 | } |
1018 | |
1019 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
1020 | |
1021 | for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { |
1022 | // Special handle FP spill: |
1023 | // Skip if FP is saved to a scratch SGPR, the save has already been emitted. |
1024 | // Otherwise, FP has been moved to a temporary register and spill it |
1025 | // instead. |
1026 | Register Reg = |
1027 | Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first; |
1028 | if (!Reg) |
1029 | continue; |
1030 | |
1031 | PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, |
1032 | LiveUnits, FrameReg); |
1033 | SB.save(); |
1034 | } |
1035 | |
1036 | // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make |
1037 | // such scratch registers live throughout the function. |
1038 | SmallVector<Register, 1> ScratchSGPRs; |
1039 | FuncInfo->getAllScratchSGPRCopyDstRegs(Regs&: ScratchSGPRs); |
1040 | if (!ScratchSGPRs.empty()) { |
1041 | for (MachineBasicBlock &MBB : MF) { |
1042 | for (MCPhysReg Reg : ScratchSGPRs) |
1043 | MBB.addLiveIn(PhysReg: Reg); |
1044 | |
1045 | MBB.sortUniqueLiveIns(); |
1046 | } |
1047 | if (!LiveUnits.empty()) { |
1048 | for (MCPhysReg Reg : ScratchSGPRs) |
1049 | LiveUnits.addReg(Reg); |
1050 | } |
1051 | } |
1052 | } |
1053 | |
1054 | void SIFrameLowering::emitCSRSpillRestores( |
1055 | MachineFunction &MF, MachineBasicBlock &MBB, |
1056 | MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, |
1057 | Register FrameReg, Register FramePtrRegScratchCopy) const { |
1058 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1059 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1060 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1061 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
1062 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
1063 | |
1064 | for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { |
1065 | // Special handle FP restore: |
1066 | // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore |
1067 | // the FP value to a temporary register. The frame pointer should be |
1068 | // overwritten only at the end when all other spills are restored from |
1069 | // current frame. |
1070 | Register Reg = |
1071 | Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first; |
1072 | if (!Reg) |
1073 | continue; |
1074 | |
1075 | PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, |
1076 | LiveUnits, FrameReg); |
1077 | SB.restore(); |
1078 | } |
1079 | |
1080 | // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the |
1081 | // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to |
1082 | // this, we might end up flipping the EXEC bits twice. |
1083 | Register ScratchExecCopy; |
1084 | SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; |
1085 | FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs); |
1086 | if (!WWMScratchRegs.empty()) |
1087 | ScratchExecCopy = |
1088 | buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, |
1089 | /*IsProlog*/ false, /*EnableInactiveLanes*/ true); |
1090 | |
1091 | auto RestoreWWMRegisters = |
1092 | [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { |
1093 | for (const auto &Reg : WWMRegs) { |
1094 | Register VGPR = Reg.first; |
1095 | int FI = Reg.second; |
1096 | buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL, |
1097 | SpillReg: VGPR, FI, FrameReg); |
1098 | } |
1099 | }; |
1100 | |
1101 | RestoreWWMRegisters(WWMScratchRegs); |
1102 | if (!WWMCalleeSavedRegs.empty()) { |
1103 | if (ScratchExecCopy) { |
1104 | unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
1105 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: TRI.getExec()).addImm(Val: -1); |
1106 | } else { |
1107 | ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, |
1108 | /*IsProlog*/ false, |
1109 | /*EnableInactiveLanes*/ false); |
1110 | } |
1111 | } |
1112 | |
1113 | RestoreWWMRegisters(WWMCalleeSavedRegs); |
1114 | if (ScratchExecCopy) { |
1115 | // FIXME: Split block and make terminator. |
1116 | unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
1117 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: ExecMov), DestReg: TRI.getExec()) |
1118 | .addReg(RegNo: ScratchExecCopy, flags: RegState::Kill); |
1119 | } |
1120 | } |
1121 | |
1122 | void SIFrameLowering::emitPrologue(MachineFunction &MF, |
1123 | MachineBasicBlock &MBB) const { |
1124 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1125 | if (FuncInfo->isEntryFunction()) { |
1126 | emitEntryFunctionPrologue(MF, MBB); |
1127 | return; |
1128 | } |
1129 | |
1130 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
1131 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1132 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1133 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
1134 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1135 | |
1136 | Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); |
1137 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
1138 | Register BasePtrReg = |
1139 | TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); |
1140 | LiveRegUnits LiveUnits; |
1141 | |
1142 | MachineBasicBlock::iterator MBBI = MBB.begin(); |
1143 | // DebugLoc must be unknown since the first instruction with DebugLoc is used |
1144 | // to determine the end of the prologue. |
1145 | DebugLoc DL; |
1146 | |
1147 | if (FuncInfo->isChainFunction()) { |
1148 | // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but |
1149 | // are free to set one up if they need it. |
1150 | bool UseSP = requiresStackPointerReference(MF); |
1151 | if (UseSP) { |
1152 | assert(StackPtrReg != AMDGPU::SP_REG); |
1153 | |
1154 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: StackPtrReg) |
1155 | .addImm(Val: MFI.getStackSize() * getScratchScaleFactor(ST)); |
1156 | } |
1157 | } |
1158 | |
1159 | bool HasFP = false; |
1160 | bool HasBP = false; |
1161 | uint32_t NumBytes = MFI.getStackSize(); |
1162 | uint32_t RoundedSize = NumBytes; |
1163 | |
1164 | if (TRI.hasStackRealignment(MF)) |
1165 | HasFP = true; |
1166 | |
1167 | Register FramePtrRegScratchCopy; |
1168 | if (!HasFP && !hasFP(MF)) { |
1169 | // Emit the CSR spill stores with SP base register. |
1170 | emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, |
1171 | FrameReg: FuncInfo->isChainFunction() ? Register() : StackPtrReg, |
1172 | FramePtrRegScratchCopy); |
1173 | } else { |
1174 | // CSR spill stores will use FP as base register. |
1175 | Register SGPRForFPSaveRestoreCopy = |
1176 | FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg); |
1177 | |
1178 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); |
1179 | if (SGPRForFPSaveRestoreCopy) { |
1180 | // Copy FP to the scratch register now and emit the CFI entry. It avoids |
1181 | // the extra FP copy needed in the other two cases when FP is spilled to |
1182 | // memory or to a VGPR lane. |
1183 | PrologEpilogSGPRSpillBuilder SB( |
1184 | FramePtrReg, |
1185 | FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(Reg: FramePtrReg), MBB, MBBI, |
1186 | DL, TII, TRI, LiveUnits, FramePtrReg); |
1187 | SB.save(); |
1188 | LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy); |
1189 | } else { |
1190 | // Copy FP into a new scratch register so that its previous value can be |
1191 | // spilled after setting up the new frame. |
1192 | FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( |
1193 | MRI, LiveUnits, RC: AMDGPU::SReg_32_XM0_XEXECRegClass); |
1194 | if (!FramePtrRegScratchCopy) |
1195 | report_fatal_error(reason: "failed to find free scratch register" ); |
1196 | |
1197 | LiveUnits.addReg(Reg: FramePtrRegScratchCopy); |
1198 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrRegScratchCopy) |
1199 | .addReg(RegNo: FramePtrReg); |
1200 | } |
1201 | } |
1202 | |
1203 | if (HasFP) { |
1204 | const unsigned Alignment = MFI.getMaxAlign().value(); |
1205 | |
1206 | RoundedSize += Alignment; |
1207 | if (LiveUnits.empty()) { |
1208 | LiveUnits.init(TRI); |
1209 | LiveUnits.addLiveIns(MBB); |
1210 | } |
1211 | |
1212 | // s_add_i32 s33, s32, NumBytes |
1213 | // s_and_b32 s33, s33, 0b111...0000 |
1214 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: FramePtrReg) |
1215 | .addReg(RegNo: StackPtrReg) |
1216 | .addImm(Val: (Alignment - 1) * getScratchScaleFactor(ST)) |
1217 | .setMIFlag(MachineInstr::FrameSetup); |
1218 | auto And = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_AND_B32), DestReg: FramePtrReg) |
1219 | .addReg(RegNo: FramePtrReg, flags: RegState::Kill) |
1220 | .addImm(Val: -Alignment * getScratchScaleFactor(ST)) |
1221 | .setMIFlag(MachineInstr::FrameSetup); |
1222 | And->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
1223 | FuncInfo->setIsStackRealigned(true); |
1224 | } else if ((HasFP = hasFP(MF))) { |
1225 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrReg) |
1226 | .addReg(RegNo: StackPtrReg) |
1227 | .setMIFlag(MachineInstr::FrameSetup); |
1228 | } |
1229 | |
1230 | // If FP is used, emit the CSR spills with FP base register. |
1231 | if (HasFP) { |
1232 | emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg, |
1233 | FramePtrRegScratchCopy); |
1234 | if (FramePtrRegScratchCopy) |
1235 | LiveUnits.removeReg(Reg: FramePtrRegScratchCopy); |
1236 | } |
1237 | |
1238 | // If we need a base pointer, set it up here. It's whatever the value of |
1239 | // the stack pointer is at this point. Any variable size objects will be |
1240 | // allocated after this, so we can still use the base pointer to reference |
1241 | // the incoming arguments. |
1242 | if ((HasBP = TRI.hasBasePointer(MF))) { |
1243 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: BasePtrReg) |
1244 | .addReg(RegNo: StackPtrReg) |
1245 | .setMIFlag(MachineInstr::FrameSetup); |
1246 | } |
1247 | |
1248 | if (HasFP && RoundedSize != 0) { |
1249 | auto Add = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: StackPtrReg) |
1250 | .addReg(RegNo: StackPtrReg) |
1251 | .addImm(Val: RoundedSize * getScratchScaleFactor(ST)) |
1252 | .setMIFlag(MachineInstr::FrameSetup); |
1253 | Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
1254 | } |
1255 | |
1256 | bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg); |
1257 | (void)FPSaved; |
1258 | assert((!HasFP || FPSaved) && |
1259 | "Needed to save FP but didn't save it anywhere" ); |
1260 | |
1261 | // If we allow spilling to AGPRs we may have saved FP but then spill |
1262 | // everything into AGPRs instead of the stack. |
1263 | assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) && |
1264 | "Saved FP but didn't need it" ); |
1265 | |
1266 | bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: BasePtrReg); |
1267 | (void)BPSaved; |
1268 | assert((!HasBP || BPSaved) && |
1269 | "Needed to save BP but didn't save it anywhere" ); |
1270 | |
1271 | assert((HasBP || !BPSaved) && "Saved BP but didn't need it" ); |
1272 | } |
1273 | |
1274 | void SIFrameLowering::emitEpilogue(MachineFunction &MF, |
1275 | MachineBasicBlock &MBB) const { |
1276 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1277 | if (FuncInfo->isEntryFunction()) |
1278 | return; |
1279 | |
1280 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1281 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1282 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
1283 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1284 | LiveRegUnits LiveUnits; |
1285 | // Get the insert location for the epilogue. If there were no terminators in |
1286 | // the block, get the last instruction. |
1287 | MachineBasicBlock::iterator MBBI = MBB.end(); |
1288 | DebugLoc DL; |
1289 | if (!MBB.empty()) { |
1290 | MBBI = MBB.getLastNonDebugInstr(); |
1291 | if (MBBI != MBB.end()) |
1292 | DL = MBBI->getDebugLoc(); |
1293 | |
1294 | MBBI = MBB.getFirstTerminator(); |
1295 | } |
1296 | |
1297 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1298 | uint32_t NumBytes = MFI.getStackSize(); |
1299 | uint32_t RoundedSize = FuncInfo->isStackRealigned() |
1300 | ? NumBytes + MFI.getMaxAlign().value() |
1301 | : NumBytes; |
1302 | const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); |
1303 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
1304 | bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg); |
1305 | |
1306 | if (RoundedSize != 0) { |
1307 | if (TRI.hasBasePointer(MF)) { |
1308 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: StackPtrReg) |
1309 | .addReg(RegNo: TRI.getBaseRegister()) |
1310 | .setMIFlag(MachineInstr::FrameDestroy); |
1311 | } else if (hasFP(MF)) { |
1312 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: StackPtrReg) |
1313 | .addReg(RegNo: FramePtrReg) |
1314 | .setMIFlag(MachineInstr::FrameDestroy); |
1315 | } |
1316 | } |
1317 | |
1318 | Register FramePtrRegScratchCopy; |
1319 | Register SGPRForFPSaveRestoreCopy = |
1320 | FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg); |
1321 | if (FPSaved) { |
1322 | // CSR spill restores should use FP as base register. If |
1323 | // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP |
1324 | // into a new scratch register and copy to FP later when other registers are |
1325 | // restored from the current stack frame. |
1326 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); |
1327 | if (SGPRForFPSaveRestoreCopy) { |
1328 | LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy); |
1329 | } else { |
1330 | FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( |
1331 | MRI, LiveUnits, RC: AMDGPU::SReg_32_XM0_XEXECRegClass); |
1332 | if (!FramePtrRegScratchCopy) |
1333 | report_fatal_error(reason: "failed to find free scratch register" ); |
1334 | |
1335 | LiveUnits.addReg(Reg: FramePtrRegScratchCopy); |
1336 | } |
1337 | |
1338 | emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg, |
1339 | FramePtrRegScratchCopy); |
1340 | } |
1341 | |
1342 | if (FPSaved) { |
1343 | // Insert the copy to restore FP. |
1344 | Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy |
1345 | : FramePtrRegScratchCopy; |
1346 | MachineInstrBuilder MIB = |
1347 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrReg) |
1348 | .addReg(RegNo: SrcReg); |
1349 | if (SGPRForFPSaveRestoreCopy) |
1350 | MIB.setMIFlag(MachineInstr::FrameDestroy); |
1351 | } else { |
1352 | // Insert the CSR spill restores with SP as the base register. |
1353 | emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, |
1354 | FrameReg: FuncInfo->isChainFunction() ? Register() : StackPtrReg, |
1355 | FramePtrRegScratchCopy); |
1356 | } |
1357 | } |
1358 | |
1359 | #ifndef NDEBUG |
1360 | static bool allSGPRSpillsAreDead(const MachineFunction &MF) { |
1361 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1362 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1363 | for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); |
1364 | I != E; ++I) { |
1365 | if (!MFI.isDeadObjectIndex(I) && |
1366 | MFI.getStackID(I) == TargetStackID::SGPRSpill && |
1367 | !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) { |
1368 | return false; |
1369 | } |
1370 | } |
1371 | |
1372 | return true; |
1373 | } |
1374 | #endif |
1375 | |
1376 | StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, |
1377 | int FI, |
1378 | Register &FrameReg) const { |
1379 | const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); |
1380 | |
1381 | FrameReg = RI->getFrameRegister(MF); |
1382 | return StackOffset::getFixed(Fixed: MF.getFrameInfo().getObjectOffset(ObjectIdx: FI)); |
1383 | } |
1384 | |
1385 | void SIFrameLowering::processFunctionBeforeFrameFinalized( |
1386 | MachineFunction &MF, |
1387 | RegScavenger *RS) const { |
1388 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
1389 | |
1390 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1391 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1392 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1393 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1394 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1395 | |
1396 | const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() |
1397 | && EnableSpillVGPRToAGPR; |
1398 | |
1399 | if (SpillVGPRToAGPR) { |
1400 | // To track the spill frame indices handled in this pass. |
1401 | BitVector SpillFIs(MFI.getObjectIndexEnd(), false); |
1402 | BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false); |
1403 | |
1404 | bool SeenDbgInstr = false; |
1405 | |
1406 | for (MachineBasicBlock &MBB : MF) { |
1407 | for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) { |
1408 | int FrameIndex; |
1409 | if (MI.isDebugInstr()) |
1410 | SeenDbgInstr = true; |
1411 | |
1412 | if (TII->isVGPRSpill(MI)) { |
1413 | // Try to eliminate stack used by VGPR spills before frame |
1414 | // finalization. |
1415 | unsigned FIOp = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), |
1416 | Name: AMDGPU::OpName::vaddr); |
1417 | int FI = MI.getOperand(i: FIOp).getIndex(); |
1418 | Register VReg = |
1419 | TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata)->getReg(); |
1420 | if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, |
1421 | isAGPRtoVGPR: TRI->isAGPR(MRI, Reg: VReg))) { |
1422 | assert(RS != nullptr); |
1423 | RS->enterBasicBlockEnd(MBB); |
1424 | RS->backward(I: std::next(x: MI.getIterator())); |
1425 | TRI->eliminateFrameIndex(MI, SPAdj: 0, FIOperandNum: FIOp, RS); |
1426 | SpillFIs.set(FI); |
1427 | continue; |
1428 | } |
1429 | } else if (TII->isStoreToStackSlot(MI, FrameIndex) || |
1430 | TII->isLoadFromStackSlot(MI, FrameIndex)) |
1431 | if (!MFI.isFixedObjectIndex(ObjectIdx: FrameIndex)) |
1432 | NonVGPRSpillFIs.set(FrameIndex); |
1433 | } |
1434 | } |
1435 | |
1436 | // Stack slot coloring may assign different objects to the same stack slot. |
1437 | // If not, then the VGPR to AGPR spill slot is dead. |
1438 | for (unsigned FI : SpillFIs.set_bits()) |
1439 | if (!NonVGPRSpillFIs.test(Idx: FI)) |
1440 | FuncInfo->setVGPRToAGPRSpillDead(FI); |
1441 | |
1442 | for (MachineBasicBlock &MBB : MF) { |
1443 | for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) |
1444 | MBB.addLiveIn(PhysReg: Reg); |
1445 | |
1446 | for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs()) |
1447 | MBB.addLiveIn(PhysReg: Reg); |
1448 | |
1449 | MBB.sortUniqueLiveIns(); |
1450 | |
1451 | if (!SpillFIs.empty() && SeenDbgInstr) { |
1452 | // FIXME: The dead frame indices are replaced with a null register from |
1453 | // the debug value instructions. We should instead, update it with the |
1454 | // correct register value. But not sure the register value alone is |
1455 | for (MachineInstr &MI : MBB) { |
1456 | if (MI.isDebugValue()) { |
1457 | uint32_t StackOperandIdx = MI.isDebugValueList() ? 2 : 0; |
1458 | if (MI.getOperand(i: StackOperandIdx).isFI() && |
1459 | !MFI.isFixedObjectIndex( |
1460 | ObjectIdx: MI.getOperand(i: StackOperandIdx).getIndex()) && |
1461 | SpillFIs[MI.getOperand(i: StackOperandIdx).getIndex()]) { |
1462 | MI.getOperand(i: StackOperandIdx) |
1463 | .ChangeToRegister(Reg: Register(), isDef: false /*isDef*/); |
1464 | } |
1465 | } |
1466 | } |
1467 | } |
1468 | } |
1469 | } |
1470 | |
1471 | // At this point we've already allocated all spilled SGPRs to VGPRs if we |
1472 | // can. Any remaining SGPR spills will go to memory, so move them back to the |
1473 | // default stack. |
1474 | bool HaveSGPRToVMemSpill = |
1475 | FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true); |
1476 | assert(allSGPRSpillsAreDead(MF) && |
1477 | "SGPR spill should have been removed in SILowerSGPRSpills" ); |
1478 | |
1479 | // FIXME: The other checks should be redundant with allStackObjectsAreDead, |
1480 | // but currently hasNonSpillStackObjects is set only from source |
1481 | // allocas. Stack temps produced from legalization are not counted currently. |
1482 | if (!allStackObjectsAreDead(MFI)) { |
1483 | assert(RS && "RegScavenger required if spilling" ); |
1484 | |
1485 | // Add an emergency spill slot |
1486 | RS->addScavengingFrameIndex(FI: FuncInfo->getScavengeFI(MFI, TRI: *TRI)); |
1487 | |
1488 | // If we are spilling SGPRs to memory with a large frame, we may need a |
1489 | // second VGPR emergency frame index. |
1490 | if (HaveSGPRToVMemSpill && |
1491 | allocateScavengingFrameIndexesNearIncomingSP(MF)) { |
1492 | RS->addScavengingFrameIndex(FI: MFI.CreateSpillStackObject(Size: 4, Alignment: Align(4))); |
1493 | } |
1494 | } |
1495 | } |
1496 | |
1497 | void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced( |
1498 | MachineFunction &MF, RegScavenger *RS) const { |
1499 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1500 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1501 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1502 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1503 | |
1504 | if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) { |
1505 | // On gfx908, we had initially reserved highest available VGPR for AGPR |
1506 | // copy. Now since we are done with RA, check if there exist an unused VGPR |
1507 | // which is lower than the eariler reserved VGPR before RA. If one exist, |
1508 | // use it for AGPR copy instead of one reserved before RA. |
1509 | Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy(); |
1510 | Register UnusedLowVGPR = |
1511 | TRI->findUnusedRegister(MRI, RC: &AMDGPU::VGPR_32RegClass, MF); |
1512 | if (UnusedLowVGPR && (TRI->getHWRegIndex(Reg: UnusedLowVGPR) < |
1513 | TRI->getHWRegIndex(Reg: VGPRForAGPRCopy))) { |
1514 | // Reserve this newly identified VGPR (for AGPR copy) |
1515 | // reserved registers should already be frozen at this point |
1516 | // so we can avoid calling MRI.freezeReservedRegs and just use |
1517 | // MRI.reserveReg |
1518 | FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR); |
1519 | MRI.reserveReg(PhysReg: UnusedLowVGPR, TRI); |
1520 | } |
1521 | } |
1522 | // We initally reserved the highest available SGPR pair for long branches |
1523 | // now, after RA, we shift down to a lower unused one if one exists |
1524 | Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg(); |
1525 | Register UnusedLowSGPR = |
1526 | TRI->findUnusedRegister(MRI, RC: &AMDGPU::SGPR_64RegClass, MF); |
1527 | // If LongBranchReservedReg is null then we didn't find a long branch |
1528 | // and never reserved a register to begin with so there is nothing to |
1529 | // shift down. Then if UnusedLowSGPR is null, there isn't available lower |
1530 | // register to use so just keep the original one we set. |
1531 | if (LongBranchReservedReg && UnusedLowSGPR) { |
1532 | FuncInfo->setLongBranchReservedReg(UnusedLowSGPR); |
1533 | MRI.reserveReg(PhysReg: UnusedLowSGPR, TRI); |
1534 | } |
1535 | } |
1536 | |
1537 | // The special SGPR spills like the one needed for FP, BP or any reserved |
1538 | // registers delayed until frame lowering. |
1539 | void SIFrameLowering::determinePrologEpilogSGPRSaves( |
1540 | MachineFunction &MF, BitVector &SavedVGPRs, |
1541 | bool NeedExecCopyReservedReg) const { |
1542 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
1543 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1544 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1545 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1546 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1547 | LiveRegUnits LiveUnits; |
1548 | LiveUnits.init(TRI: *TRI); |
1549 | // Initially mark callee saved registers as used so we will not choose them |
1550 | // while looking for scratch SGPRs. |
1551 | const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); |
1552 | for (unsigned I = 0; CSRegs[I]; ++I) |
1553 | LiveUnits.addReg(Reg: CSRegs[I]); |
1554 | |
1555 | const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass(); |
1556 | |
1557 | Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy(); |
1558 | if (NeedExecCopyReservedReg || |
1559 | (ReservedRegForExecCopy && |
1560 | MRI.isPhysRegUsed(PhysReg: ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) { |
1561 | MRI.reserveReg(PhysReg: ReservedRegForExecCopy, TRI); |
1562 | Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC); |
1563 | if (UnusedScratchReg) { |
1564 | // If found any unused scratch SGPR, reserve the register itself for Exec |
1565 | // copy and there is no need for any spills in that case. |
1566 | MFI->setSGPRForEXECCopy(UnusedScratchReg); |
1567 | MRI.replaceRegWith(FromReg: ReservedRegForExecCopy, ToReg: UnusedScratchReg); |
1568 | LiveUnits.addReg(Reg: UnusedScratchReg); |
1569 | } else { |
1570 | // Needs spill. |
1571 | assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) && |
1572 | "Re-reserving spill slot for EXEC copy register" ); |
1573 | getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: ReservedRegForExecCopy, RC, |
1574 | /*IncludeScratchCopy=*/false); |
1575 | } |
1576 | } else if (ReservedRegForExecCopy) { |
1577 | // Reset it at this point. There are no whole-wave copies and spills |
1578 | // encountered. |
1579 | MFI->setSGPRForEXECCopy(AMDGPU::NoRegister); |
1580 | } |
1581 | |
1582 | // hasFP only knows about stack objects that already exist. We're now |
1583 | // determining the stack slots that will be created, so we have to predict |
1584 | // them. Stack objects force FP usage with calls. |
1585 | // |
1586 | // Note a new VGPR CSR may be introduced if one is used for the spill, but we |
1587 | // don't want to report it here. |
1588 | // |
1589 | // FIXME: Is this really hasReservedCallFrame? |
1590 | const bool WillHaveFP = |
1591 | FrameInfo.hasCalls() && |
1592 | (SavedVGPRs.any() || !allStackObjectsAreDead(MFI: FrameInfo)); |
1593 | |
1594 | if (WillHaveFP || hasFP(MF)) { |
1595 | Register FramePtrReg = MFI->getFrameOffsetReg(); |
1596 | assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) && |
1597 | "Re-reserving spill slot for FP" ); |
1598 | getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: FramePtrReg); |
1599 | } |
1600 | |
1601 | if (TRI->hasBasePointer(MF)) { |
1602 | Register BasePtrReg = TRI->getBaseRegister(); |
1603 | assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) && |
1604 | "Re-reserving spill slot for BP" ); |
1605 | getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: BasePtrReg); |
1606 | } |
1607 | } |
1608 | |
1609 | // Only report VGPRs to generic code. |
1610 | void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, |
1611 | BitVector &SavedVGPRs, |
1612 | RegScavenger *RS) const { |
1613 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1614 | |
1615 | // If this is a function with the amdgpu_cs_chain[_preserve] calling |
1616 | // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then |
1617 | // we don't need to save and restore anything. |
1618 | if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall()) |
1619 | return; |
1620 | |
1621 | TargetFrameLowering::determineCalleeSaves(MF, SavedRegs&: SavedVGPRs, RS); |
1622 | |
1623 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1624 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1625 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1626 | bool NeedExecCopyReservedReg = false; |
1627 | |
1628 | MachineInstr *ReturnMI = nullptr; |
1629 | for (MachineBasicBlock &MBB : MF) { |
1630 | for (MachineInstr &MI : MBB) { |
1631 | // TODO: Walking through all MBBs here would be a bad heuristic. Better |
1632 | // handle them elsewhere. |
1633 | if (TII->isWWMRegSpillOpcode(Opcode: MI.getOpcode())) |
1634 | NeedExecCopyReservedReg = true; |
1635 | else if (MI.getOpcode() == AMDGPU::SI_RETURN || |
1636 | MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || |
1637 | (MFI->isChainFunction() && |
1638 | TII->isChainCallOpcode(Opcode: MI.getOpcode()))) { |
1639 | // We expect all return to be the same size. |
1640 | assert(!ReturnMI || |
1641 | (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) == |
1642 | count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); }))); |
1643 | ReturnMI = &MI; |
1644 | } |
1645 | } |
1646 | } |
1647 | |
1648 | SmallVector<Register> SortedWWMVGPRs; |
1649 | for (Register Reg : MFI->getWWMReservedRegs()) { |
1650 | // The shift-back is needed only for the VGPRs used for SGPR spills and they |
1651 | // are of 32-bit size. SIPreAllocateWWMRegs pass can add tuples into WWM |
1652 | // reserved registers. |
1653 | const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); |
1654 | if (TRI->getRegSizeInBits(RC: *RC) != 32) |
1655 | continue; |
1656 | SortedWWMVGPRs.push_back(Elt: Reg); |
1657 | } |
1658 | |
1659 | sort(C&: SortedWWMVGPRs, Comp: std::greater<Register>()); |
1660 | MFI->shiftWwmVGPRsToLowestRange(MF, WWMVGPRs&: SortedWWMVGPRs, SavedVGPRs); |
1661 | |
1662 | if (MFI->isEntryFunction()) |
1663 | return; |
1664 | |
1665 | // Remove any VGPRs used in the return value because these do not need to be saved. |
1666 | // This prevents CSR restore from clobbering return VGPRs. |
1667 | if (ReturnMI) { |
1668 | for (auto &Op : ReturnMI->operands()) { |
1669 | if (Op.isReg()) |
1670 | SavedVGPRs.reset(Idx: Op.getReg()); |
1671 | } |
1672 | } |
1673 | |
1674 | // Create the stack objects for WWM registers now. |
1675 | for (Register Reg : MFI->getWWMReservedRegs()) { |
1676 | const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); |
1677 | MFI->allocateWWMSpill(MF, VGPR: Reg, Size: TRI->getSpillSize(RC: *RC), |
1678 | Alignment: TRI->getSpillAlign(RC: *RC)); |
1679 | } |
1680 | |
1681 | // Ignore the SGPRs the default implementation found. |
1682 | SavedVGPRs.clearBitsNotInMask(Mask: TRI->getAllVectorRegMask()); |
1683 | |
1684 | // Do not save AGPRs prior to GFX90A because there was no easy way to do so. |
1685 | // In gfx908 there was do AGPR loads and stores and thus spilling also |
1686 | // require a temporary VGPR. |
1687 | if (!ST.hasGFX90AInsts()) |
1688 | SavedVGPRs.clearBitsInMask(Mask: TRI->getAllAGPRRegMask()); |
1689 | |
1690 | determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg); |
1691 | |
1692 | // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't |
1693 | // allow the default insertion to handle them. |
1694 | for (auto &Reg : MFI->getWWMSpills()) |
1695 | SavedVGPRs.reset(Idx: Reg.first); |
1696 | } |
1697 | |
1698 | void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, |
1699 | BitVector &SavedRegs, |
1700 | RegScavenger *RS) const { |
1701 | TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); |
1702 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1703 | if (MFI->isEntryFunction()) |
1704 | return; |
1705 | |
1706 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1707 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1708 | |
1709 | // The SP is specifically managed and we don't want extra spills of it. |
1710 | SavedRegs.reset(Idx: MFI->getStackPtrOffsetReg()); |
1711 | |
1712 | const BitVector AllSavedRegs = SavedRegs; |
1713 | SavedRegs.clearBitsInMask(Mask: TRI->getAllVectorRegMask()); |
1714 | |
1715 | // We have to anticipate introducing CSR VGPR spills or spill of caller |
1716 | // save VGPR reserved for SGPR spills as we now always create stack entry |
1717 | // for it, if we don't have any stack objects already, since we require a FP |
1718 | // if there is a call and stack. We will allocate a VGPR for SGPR spills if |
1719 | // there are any SGPR spills. Whether they are CSR spills or otherwise. |
1720 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
1721 | const bool WillHaveFP = |
1722 | FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs()); |
1723 | |
1724 | // FP will be specially managed like SP. |
1725 | if (WillHaveFP || hasFP(MF)) |
1726 | SavedRegs.reset(Idx: MFI->getFrameOffsetReg()); |
1727 | |
1728 | // Return address use with return instruction is hidden through the SI_RETURN |
1729 | // pseudo. Given that and since the IPRA computes actual register usage and |
1730 | // does not use CSR list, the clobbering of return address by function calls |
1731 | // (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's register |
1732 | // usage collection. This will ensure save/restore of return address happens |
1733 | // in those scenarios. |
1734 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1735 | Register RetAddrReg = TRI->getReturnAddressReg(MF); |
1736 | if (!MFI->isEntryFunction() && |
1737 | (FrameInfo.hasCalls() || MRI.isPhysRegModified(PhysReg: RetAddrReg))) { |
1738 | SavedRegs.set(TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub0)); |
1739 | SavedRegs.set(TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub1)); |
1740 | } |
1741 | } |
1742 | |
1743 | static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, |
1744 | const GCNSubtarget &ST, |
1745 | std::vector<CalleeSavedInfo> &CSI, |
1746 | unsigned &MinCSFrameIndex, |
1747 | unsigned &MaxCSFrameIndex) { |
1748 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1749 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
1750 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1751 | |
1752 | assert( |
1753 | llvm::is_sorted(CSI, |
1754 | [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) { |
1755 | return A.getReg() < B.getReg(); |
1756 | }) && |
1757 | "Callee saved registers not sorted" ); |
1758 | |
1759 | auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) { |
1760 | return !CSI.isSpilledToReg() && |
1761 | TRI->getPhysRegBaseClass(Reg: CSI.getReg()) == &AMDGPU::VGPR_32RegClass && |
1762 | !FuncInfo->isWWMReservedRegister(Reg: CSI.getReg()); |
1763 | }; |
1764 | |
1765 | auto CSEnd = CSI.end(); |
1766 | for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) { |
1767 | Register Reg = CSIt->getReg(); |
1768 | if (!CanUseBlockOps(*CSIt)) |
1769 | continue; |
1770 | |
1771 | // Find all the regs that will fit in a 32-bit mask starting at the current |
1772 | // reg and build said mask. It should have 1 for every register that's |
1773 | // included, with the current register as the least significant bit. |
1774 | uint32_t Mask = 1; |
1775 | CSEnd = std::remove_if( |
1776 | first: CSIt + 1, last: CSEnd, pred: [&](const CalleeSavedInfo &CSI) -> bool { |
1777 | if (CanUseBlockOps(CSI) && CSI.getReg() < Reg + 32) { |
1778 | Mask |= 1 << (CSI.getReg() - Reg); |
1779 | return true; |
1780 | } else { |
1781 | return false; |
1782 | } |
1783 | }); |
1784 | |
1785 | const TargetRegisterClass *BlockRegClass = TRI->getRegClassForBlockOp(MF); |
1786 | Register RegBlock = |
1787 | TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC: BlockRegClass); |
1788 | if (!RegBlock) { |
1789 | // We couldn't find a super register for the block. This can happen if |
1790 | // the register we started with is too high (e.g. v232 if the maximum is |
1791 | // v255). We therefore try to get the last register block and figure out |
1792 | // the mask from there. |
1793 | Register LastBlockStart = |
1794 | AMDGPU::VGPR0 + alignDown(Value: Reg - AMDGPU::VGPR0, Align: 32); |
1795 | RegBlock = |
1796 | TRI->getMatchingSuperReg(Reg: LastBlockStart, SubIdx: AMDGPU::sub0, RC: BlockRegClass); |
1797 | assert(RegBlock && TRI->isSubRegister(RegBlock, Reg) && |
1798 | "Couldn't find super register" ); |
1799 | int RegDelta = Reg - LastBlockStart; |
1800 | assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta && |
1801 | "Bad shift amount" ); |
1802 | Mask <<= RegDelta; |
1803 | } |
1804 | |
1805 | FuncInfo->setMaskForVGPRBlockOps(RegisterBlock: RegBlock, Mask); |
1806 | |
1807 | // The stack objects can be a bit smaller than the register block if we know |
1808 | // some of the high bits of Mask are 0. This may happen often with calling |
1809 | // conventions where the caller and callee-saved VGPRs are interleaved at |
1810 | // a small boundary (e.g. 8 or 16). |
1811 | int UnusedBits = llvm::countl_zero(Val: Mask); |
1812 | unsigned BlockSize = TRI->getSpillSize(RC: *BlockRegClass) - UnusedBits * 4; |
1813 | int FrameIdx = |
1814 | MFI.CreateStackObject(Size: BlockSize, Alignment: TRI->getSpillAlign(RC: *BlockRegClass), |
1815 | /*isSpillSlot=*/true); |
1816 | if ((unsigned)FrameIdx < MinCSFrameIndex) |
1817 | MinCSFrameIndex = FrameIdx; |
1818 | if ((unsigned)FrameIdx > MaxCSFrameIndex) |
1819 | MaxCSFrameIndex = FrameIdx; |
1820 | |
1821 | CSIt->setFrameIdx(FrameIdx); |
1822 | CSIt->setReg(RegBlock); |
1823 | } |
1824 | CSI.erase(first: CSEnd, last: CSI.end()); |
1825 | } |
1826 | |
1827 | bool SIFrameLowering::assignCalleeSavedSpillSlots( |
1828 | MachineFunction &MF, const TargetRegisterInfo *TRI, |
1829 | std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex, |
1830 | unsigned &MaxCSFrameIndex) const { |
1831 | if (CSI.empty()) |
1832 | return true; // Early exit if no callee saved registers are modified! |
1833 | |
1834 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1835 | bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR(); |
1836 | |
1837 | if (UseVGPRBlocks) |
1838 | assignSlotsUsingVGPRBlocks(MF, ST, CSI, MinCSFrameIndex, MaxCSFrameIndex); |
1839 | |
1840 | return assignCalleeSavedSpillSlots(MF, TRI, CSI) || UseVGPRBlocks; |
1841 | } |
1842 | |
1843 | bool SIFrameLowering::assignCalleeSavedSpillSlots( |
1844 | MachineFunction &MF, const TargetRegisterInfo *TRI, |
1845 | std::vector<CalleeSavedInfo> &CSI) const { |
1846 | if (CSI.empty()) |
1847 | return true; // Early exit if no callee saved registers are modified! |
1848 | |
1849 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1850 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1851 | const SIRegisterInfo *RI = ST.getRegisterInfo(); |
1852 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
1853 | Register BasePtrReg = RI->getBaseRegister(); |
1854 | Register SGPRForFPSaveRestoreCopy = |
1855 | FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg); |
1856 | Register SGPRForBPSaveRestoreCopy = |
1857 | FuncInfo->getScratchSGPRCopyDstReg(Reg: BasePtrReg); |
1858 | if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy) |
1859 | return false; |
1860 | |
1861 | unsigned NumModifiedRegs = 0; |
1862 | |
1863 | if (SGPRForFPSaveRestoreCopy) |
1864 | NumModifiedRegs++; |
1865 | if (SGPRForBPSaveRestoreCopy) |
1866 | NumModifiedRegs++; |
1867 | |
1868 | for (auto &CS : CSI) { |
1869 | if (CS.getReg() == FramePtrReg.asMCReg() && SGPRForFPSaveRestoreCopy) { |
1870 | CS.setDstReg(SGPRForFPSaveRestoreCopy); |
1871 | if (--NumModifiedRegs) |
1872 | break; |
1873 | } else if (CS.getReg() == BasePtrReg.asMCReg() && |
1874 | SGPRForBPSaveRestoreCopy) { |
1875 | CS.setDstReg(SGPRForBPSaveRestoreCopy); |
1876 | if (--NumModifiedRegs) |
1877 | break; |
1878 | } |
1879 | } |
1880 | |
1881 | return false; |
1882 | } |
1883 | |
1884 | bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( |
1885 | const MachineFunction &MF) const { |
1886 | |
1887 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1888 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1889 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1890 | uint64_t EstStackSize = MFI.estimateStackSize(MF); |
1891 | uint64_t MaxOffset = EstStackSize - 1; |
1892 | |
1893 | // We need the emergency stack slots to be allocated in range of the |
1894 | // MUBUF/flat scratch immediate offset from the base register, so assign these |
1895 | // first at the incoming SP position. |
1896 | // |
1897 | // TODO: We could try sorting the objects to find a hole in the first bytes |
1898 | // rather than allocating as close to possible. This could save a lot of space |
1899 | // on frames with alignment requirements. |
1900 | if (ST.enableFlatScratch()) { |
1901 | if (TII->isLegalFLATOffset(Offset: MaxOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, |
1902 | FlatVariant: SIInstrFlags::FlatScratch)) |
1903 | return false; |
1904 | } else { |
1905 | if (TII->isLegalMUBUFImmOffset(Imm: MaxOffset)) |
1906 | return false; |
1907 | } |
1908 | |
1909 | return true; |
1910 | } |
1911 | |
1912 | bool SIFrameLowering::spillCalleeSavedRegisters( |
1913 | MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, |
1914 | ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { |
1915 | MachineFunction *MF = MBB.getParent(); |
1916 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
1917 | if (!ST.useVGPRBlockOpsForCSR()) |
1918 | return false; |
1919 | |
1920 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); |
1921 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
1922 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1923 | SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); |
1924 | |
1925 | const TargetRegisterClass *BlockRegClass = |
1926 | static_cast<const SIRegisterInfo *>(TRI)->getRegClassForBlockOp(MF: *MF); |
1927 | for (const CalleeSavedInfo &CS : CSI) { |
1928 | Register Reg = CS.getReg(); |
1929 | if (!BlockRegClass->contains(Reg) || |
1930 | !FuncInfo->hasMaskForVGPRBlockOps(RegisterBlock: Reg)) { |
1931 | spillCalleeSavedRegister(SaveBlock&: MBB, MI, CS, TII, TRI); |
1932 | continue; |
1933 | } |
1934 | |
1935 | // Build a scratch block store. |
1936 | uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(RegisterBlock: Reg); |
1937 | int FrameIndex = CS.getFrameIdx(); |
1938 | MachinePointerInfo PtrInfo = |
1939 | MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex); |
1940 | MachineMemOperand *MMO = |
1941 | MF->getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOStore, |
1942 | Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex), |
1943 | BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex)); |
1944 | |
1945 | BuildMI(BB&: MBB, I: MI, MIMD: MI->getDebugLoc(), |
1946 | MCID: TII->get(Opcode: AMDGPU::SI_BLOCK_SPILL_V1024_SAVE)) |
1947 | .addReg(RegNo: Reg, flags: getKillRegState(B: false)) |
1948 | .addFrameIndex(Idx: FrameIndex) |
1949 | .addReg(RegNo: MFI->getStackPtrOffsetReg()) |
1950 | .addImm(Val: 0) |
1951 | .addImm(Val: Mask) |
1952 | .addMemOperand(MMO); |
1953 | |
1954 | FuncInfo->setHasSpilledVGPRs(); |
1955 | |
1956 | // Add the register to the liveins. This is necessary because if any of the |
1957 | // VGPRs in the register block is reserved (e.g. if it's a WWM register), |
1958 | // then the whole block will be marked as reserved and `updateLiveness` will |
1959 | // skip it. |
1960 | MBB.addLiveIn(PhysReg: Reg); |
1961 | } |
1962 | MBB.sortUniqueLiveIns(); |
1963 | |
1964 | return true; |
1965 | } |
1966 | |
1967 | bool SIFrameLowering::restoreCalleeSavedRegisters( |
1968 | MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, |
1969 | MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { |
1970 | MachineFunction *MF = MBB.getParent(); |
1971 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
1972 | if (!ST.useVGPRBlockOpsForCSR()) |
1973 | return false; |
1974 | |
1975 | SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); |
1976 | MachineFrameInfo &MFI = MF->getFrameInfo(); |
1977 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1978 | const SIRegisterInfo *SITRI = static_cast<const SIRegisterInfo *>(TRI); |
1979 | const TargetRegisterClass *BlockRegClass = SITRI->getRegClassForBlockOp(MF: *MF); |
1980 | for (const CalleeSavedInfo &CS : reverse(C&: CSI)) { |
1981 | Register Reg = CS.getReg(); |
1982 | if (!BlockRegClass->contains(Reg) || |
1983 | !FuncInfo->hasMaskForVGPRBlockOps(RegisterBlock: Reg)) { |
1984 | restoreCalleeSavedRegister(MBB, MI, CS, TII, TRI); |
1985 | continue; |
1986 | } |
1987 | |
1988 | // Build a scratch block load. |
1989 | uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(RegisterBlock: Reg); |
1990 | int FrameIndex = CS.getFrameIdx(); |
1991 | MachinePointerInfo PtrInfo = |
1992 | MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex); |
1993 | MachineMemOperand *MMO = MF->getMachineMemOperand( |
1994 | PtrInfo, F: MachineMemOperand::MOLoad, Size: MFI.getObjectSize(ObjectIdx: FrameIndex), |
1995 | BaseAlignment: MFI.getObjectAlign(ObjectIdx: FrameIndex)); |
1996 | |
1997 | auto MIB = BuildMI(BB&: MBB, I: MI, MIMD: MI->getDebugLoc(), |
1998 | MCID: TII->get(Opcode: AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), DestReg: Reg) |
1999 | .addFrameIndex(Idx: FrameIndex) |
2000 | .addReg(RegNo: FuncInfo->getStackPtrOffsetReg()) |
2001 | .addImm(Val: 0) |
2002 | .addImm(Val: Mask) |
2003 | .addMemOperand(MMO); |
2004 | SITRI->addImplicitUsesForBlockCSRLoad(MIB, BlockReg: Reg); |
2005 | |
2006 | // Add the register to the liveins. This is necessary because if any of the |
2007 | // VGPRs in the register block is reserved (e.g. if it's a WWM register), |
2008 | // then the whole block will be marked as reserved and `updateLiveness` will |
2009 | // skip it. |
2010 | MBB.addLiveIn(PhysReg: Reg); |
2011 | } |
2012 | |
2013 | MBB.sortUniqueLiveIns(); |
2014 | return true; |
2015 | } |
2016 | |
2017 | MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( |
2018 | MachineFunction &MF, |
2019 | MachineBasicBlock &MBB, |
2020 | MachineBasicBlock::iterator I) const { |
2021 | int64_t Amount = I->getOperand(i: 0).getImm(); |
2022 | if (Amount == 0) |
2023 | return MBB.erase(I); |
2024 | |
2025 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
2026 | const SIInstrInfo *TII = ST.getInstrInfo(); |
2027 | const DebugLoc &DL = I->getDebugLoc(); |
2028 | unsigned Opc = I->getOpcode(); |
2029 | bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); |
2030 | uint64_t CalleePopAmount = IsDestroy ? I->getOperand(i: 1).getImm() : 0; |
2031 | |
2032 | if (!hasReservedCallFrame(MF)) { |
2033 | Amount = alignTo(Size: Amount, A: getStackAlign()); |
2034 | assert(isUInt<32>(Amount) && "exceeded stack address space size" ); |
2035 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
2036 | Register SPReg = MFI->getStackPtrOffsetReg(); |
2037 | |
2038 | Amount *= getScratchScaleFactor(ST); |
2039 | if (IsDestroy) |
2040 | Amount = -Amount; |
2041 | auto Add = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SPReg) |
2042 | .addReg(RegNo: SPReg) |
2043 | .addImm(Val: Amount); |
2044 | Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
2045 | } else if (CalleePopAmount != 0) { |
2046 | llvm_unreachable("is this used?" ); |
2047 | } |
2048 | |
2049 | return MBB.erase(I); |
2050 | } |
2051 | |
2052 | /// Returns true if the frame will require a reference to the stack pointer. |
2053 | /// |
2054 | /// This is the set of conditions common to setting up the stack pointer in a |
2055 | /// kernel, and for using a frame pointer in a callable function. |
2056 | /// |
2057 | /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm |
2058 | /// references SP. |
2059 | static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) { |
2060 | return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint(); |
2061 | } |
2062 | |
2063 | // The FP for kernels is always known 0, so we never really need to setup an |
2064 | // explicit register for it. However, DisableFramePointerElim will force us to |
2065 | // use a register for it. |
2066 | bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { |
2067 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
2068 | |
2069 | // For entry & chain functions we can use an immediate offset in most cases, |
2070 | // so the presence of calls doesn't imply we need a distinct frame pointer. |
2071 | if (MFI.hasCalls() && |
2072 | !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() && |
2073 | !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) { |
2074 | // All offsets are unsigned, so need to be addressed in the same direction |
2075 | // as stack growth. |
2076 | |
2077 | // FIXME: This function is pretty broken, since it can be called before the |
2078 | // frame layout is determined or CSR spills are inserted. |
2079 | return MFI.getStackSize() != 0; |
2080 | } |
2081 | |
2082 | return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || |
2083 | MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( |
2084 | MF) || |
2085 | mayReserveScratchForCWSR(MF) || |
2086 | MF.getTarget().Options.DisableFramePointerElim(MF); |
2087 | } |
2088 | |
2089 | bool SIFrameLowering::mayReserveScratchForCWSR( |
2090 | const MachineFunction &MF) const { |
2091 | return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() && |
2092 | AMDGPU::isEntryFunctionCC(CC: MF.getFunction().getCallingConv()) && |
2093 | AMDGPU::isCompute(CC: MF.getFunction().getCallingConv()); |
2094 | } |
2095 | |
2096 | // This is essentially a reduced version of hasFP for entry functions. Since the |
2097 | // stack pointer is known 0 on entry to kernels, we never really need an FP |
2098 | // register. We may need to initialize the stack pointer depending on the frame |
2099 | // properties, which logically overlaps many of the cases where an ordinary |
2100 | // function would require an FP. |
2101 | // Also used for chain functions. While not technically entry functions, chain |
2102 | // functions may need to set up a stack pointer in some situations. |
2103 | bool SIFrameLowering::requiresStackPointerReference( |
2104 | const MachineFunction &MF) const { |
2105 | // Callable functions always require a stack pointer reference. |
2106 | assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() || |
2107 | MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) && |
2108 | "only expected to call this for entry points and chain functions" ); |
2109 | |
2110 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
2111 | |
2112 | // Entry points ordinarily don't need to initialize SP. We have to set it up |
2113 | // for callees if there are any. Also note tail calls are impossible/don't |
2114 | // make any sense for kernels. |
2115 | if (MFI.hasCalls()) |
2116 | return true; |
2117 | |
2118 | // We still need to initialize the SP if we're doing anything weird that |
2119 | // references the SP, like variable sized stack objects. |
2120 | return frameTriviallyRequiresSP(MFI); |
2121 | } |
2122 | |