1 | //===----------------------- SIFrameLowering.cpp --------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //==-----------------------------------------------------------------------===// |
8 | |
9 | #include "SIFrameLowering.h" |
10 | #include "AMDGPU.h" |
11 | #include "GCNSubtarget.h" |
12 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
13 | #include "SIMachineFunctionInfo.h" |
14 | #include "llvm/CodeGen/LiveRegUnits.h" |
15 | #include "llvm/CodeGen/MachineFrameInfo.h" |
16 | #include "llvm/CodeGen/RegisterScavenging.h" |
17 | #include "llvm/Target/TargetMachine.h" |
18 | |
19 | using namespace llvm; |
20 | |
21 | #define DEBUG_TYPE "frame-info" |
22 | |
23 | static cl::opt<bool> EnableSpillVGPRToAGPR( |
24 | "amdgpu-spill-vgpr-to-agpr" , |
25 | cl::desc("Enable spilling VGPRs to AGPRs" ), |
26 | cl::ReallyHidden, |
27 | cl::init(Val: true)); |
28 | |
29 | // Find a register matching \p RC from \p LiveUnits which is unused and |
30 | // available throughout the function. On failure, returns AMDGPU::NoRegister. |
31 | // TODO: Rewrite the loop here to iterate over MCRegUnits instead of |
32 | // MCRegisters. This should reduce the number of iterations and avoid redundant |
33 | // checking. |
34 | static MCRegister findUnusedRegister(MachineRegisterInfo &MRI, |
35 | const LiveRegUnits &LiveUnits, |
36 | const TargetRegisterClass &RC) { |
37 | for (MCRegister Reg : RC) { |
38 | if (!MRI.isPhysRegUsed(PhysReg: Reg) && LiveUnits.available(Reg) && |
39 | !MRI.isReserved(PhysReg: Reg)) |
40 | return Reg; |
41 | } |
42 | return MCRegister(); |
43 | } |
44 | |
45 | // Find a scratch register that we can use in the prologue. We avoid using |
46 | // callee-save registers since they may appear to be free when this is called |
47 | // from canUseAsPrologue (during shrink wrapping), but then no longer be free |
48 | // when this is called from emitPrologue. |
49 | static MCRegister findScratchNonCalleeSaveRegister( |
50 | MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits, |
51 | const TargetRegisterClass &RC, bool Unused = false) { |
52 | // Mark callee saved registers as used so we will not choose them. |
53 | const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); |
54 | for (unsigned i = 0; CSRegs[i]; ++i) |
55 | LiveUnits.addReg(Reg: CSRegs[i]); |
56 | |
57 | // We are looking for a register that can be used throughout the entire |
58 | // function, so any use is unacceptable. |
59 | if (Unused) |
60 | return findUnusedRegister(MRI, LiveUnits, RC); |
61 | |
62 | for (MCRegister Reg : RC) { |
63 | if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg)) |
64 | return Reg; |
65 | } |
66 | |
67 | return MCRegister(); |
68 | } |
69 | |
70 | /// Query target location for spilling SGPRs |
71 | /// \p IncludeScratchCopy : Also look for free scratch SGPRs |
72 | static void getVGPRSpillLaneOrTempRegister( |
73 | MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR, |
74 | const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass, |
75 | bool IncludeScratchCopy = true) { |
76 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
77 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
78 | |
79 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
80 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
81 | unsigned Size = TRI->getSpillSize(RC); |
82 | Align Alignment = TRI->getSpillAlign(RC); |
83 | |
84 | // We need to save and restore the given SGPR. |
85 | |
86 | Register ScratchSGPR; |
87 | // 1: Try to save the given register into an unused scratch SGPR. The |
88 | // LiveUnits should have all the callee saved registers marked as used. For |
89 | // certain cases we skip copy to scratch SGPR. |
90 | if (IncludeScratchCopy) |
91 | ScratchSGPR = findUnusedRegister(MRI&: MF.getRegInfo(), LiveUnits, RC); |
92 | |
93 | if (!ScratchSGPR) { |
94 | int FI = FrameInfo.CreateStackObject(Size, Alignment, isSpillSlot: true, Alloca: nullptr, |
95 | ID: TargetStackID::SGPRSpill); |
96 | |
97 | if (TRI->spillSGPRToVGPR() && |
98 | MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true, |
99 | /*IsPrologEpilog=*/true)) { |
100 | // 2: There's no free lane to spill, and no free register to save the |
101 | // SGPR, so we're forced to take another VGPR to use for the spill. |
102 | MFI->addToPrologEpilogSGPRSpills( |
103 | Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo( |
104 | SGPRSaveKind::SPILL_TO_VGPR_LANE, FI)); |
105 | |
106 | LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front(); |
107 | dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " |
108 | << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane |
109 | << '\n';); |
110 | } else { |
111 | // Remove dead <FI> index |
112 | MF.getFrameInfo().RemoveStackObject(ObjectIdx: FI); |
113 | // 3: If all else fails, spill the register to memory. |
114 | FI = FrameInfo.CreateSpillStackObject(Size, Alignment); |
115 | MFI->addToPrologEpilogSGPRSpills( |
116 | Reg: SGPR, |
117 | SI: PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI)); |
118 | LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling " |
119 | << printReg(SGPR, TRI) << '\n'); |
120 | } |
121 | } else { |
122 | MFI->addToPrologEpilogSGPRSpills( |
123 | Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo( |
124 | SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR)); |
125 | LiveUnits.addReg(Reg: ScratchSGPR); |
126 | LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to " |
127 | << printReg(ScratchSGPR, TRI) << '\n'); |
128 | } |
129 | } |
130 | |
131 | // We need to specially emit stack operations here because a different frame |
132 | // register is used than in the rest of the function, as getFrameRegister would |
133 | // use. |
134 | static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, |
135 | const SIMachineFunctionInfo &FuncInfo, |
136 | LiveRegUnits &LiveUnits, MachineFunction &MF, |
137 | MachineBasicBlock &MBB, |
138 | MachineBasicBlock::iterator I, const DebugLoc &DL, |
139 | Register SpillReg, int FI, Register FrameReg, |
140 | int64_t DwordOff = 0) { |
141 | unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR |
142 | : AMDGPU::BUFFER_STORE_DWORD_OFFSET; |
143 | |
144 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
145 | MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); |
146 | MachineMemOperand *MMO = MF.getMachineMemOperand( |
147 | PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FI), |
148 | BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI)); |
149 | LiveUnits.addReg(Reg: SpillReg); |
150 | bool IsKill = !MBB.isLiveIn(Reg: SpillReg); |
151 | TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: IsKill, ScratchOffsetReg: FrameReg, |
152 | InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits); |
153 | if (IsKill) |
154 | LiveUnits.removeReg(Reg: SpillReg); |
155 | } |
156 | |
157 | static void buildEpilogRestore(const GCNSubtarget &ST, |
158 | const SIRegisterInfo &TRI, |
159 | const SIMachineFunctionInfo &FuncInfo, |
160 | LiveRegUnits &LiveUnits, MachineFunction &MF, |
161 | MachineBasicBlock &MBB, |
162 | MachineBasicBlock::iterator I, |
163 | const DebugLoc &DL, Register SpillReg, int FI, |
164 | Register FrameReg, int64_t DwordOff = 0) { |
165 | unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR |
166 | : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; |
167 | |
168 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
169 | MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); |
170 | MachineMemOperand *MMO = MF.getMachineMemOperand( |
171 | PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FI), |
172 | BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI)); |
173 | TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: false, ScratchOffsetReg: FrameReg, |
174 | InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits); |
175 | } |
176 | |
177 | static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, |
178 | const DebugLoc &DL, const SIInstrInfo *TII, |
179 | Register TargetReg) { |
180 | MachineFunction *MF = MBB.getParent(); |
181 | const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
182 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
183 | const MCInstrDesc &SMovB32 = TII->get(Opcode: AMDGPU::S_MOV_B32); |
184 | Register TargetLo = TRI->getSubReg(Reg: TargetReg, Idx: AMDGPU::sub0); |
185 | Register TargetHi = TRI->getSubReg(Reg: TargetReg, Idx: AMDGPU::sub1); |
186 | |
187 | if (MFI->getGITPtrHigh() != 0xffffffff) { |
188 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetHi) |
189 | .addImm(Val: MFI->getGITPtrHigh()) |
190 | .addReg(RegNo: TargetReg, flags: RegState::ImplicitDefine); |
191 | } else { |
192 | const MCInstrDesc &GetPC64 = TII->get(Opcode: AMDGPU::S_GETPC_B64_pseudo); |
193 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: GetPC64, DestReg: TargetReg); |
194 | } |
195 | Register GitPtrLo = MFI->getGITPtrLoReg(MF: *MF); |
196 | MF->getRegInfo().addLiveIn(Reg: GitPtrLo); |
197 | MBB.addLiveIn(PhysReg: GitPtrLo); |
198 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetLo) |
199 | .addReg(RegNo: GitPtrLo); |
200 | } |
201 | |
202 | static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI, |
203 | const SIMachineFunctionInfo *FuncInfo, |
204 | MachineFunction &MF, MachineBasicBlock &MBB, |
205 | MachineBasicBlock::iterator MBBI, bool IsProlog) { |
206 | if (LiveUnits.empty()) { |
207 | LiveUnits.init(TRI); |
208 | if (IsProlog) { |
209 | LiveUnits.addLiveIns(MBB); |
210 | } else { |
211 | // In epilog. |
212 | LiveUnits.addLiveOuts(MBB); |
213 | LiveUnits.stepBackward(MI: *MBBI); |
214 | } |
215 | } |
216 | } |
217 | |
218 | namespace llvm { |
219 | |
220 | // SpillBuilder to save/restore special SGPR spills like the one needed for FP, |
221 | // BP, etc. These spills are delayed until the current function's frame is |
222 | // finalized. For a given register, the builder uses the |
223 | // PrologEpilogSGPRSaveRestoreInfo to decide the spill method. |
224 | class PrologEpilogSGPRSpillBuilder { |
225 | MachineBasicBlock::iterator MI; |
226 | MachineBasicBlock &MBB; |
227 | MachineFunction &MF; |
228 | const GCNSubtarget &ST; |
229 | MachineFrameInfo &MFI; |
230 | SIMachineFunctionInfo *FuncInfo; |
231 | const SIInstrInfo *TII; |
232 | const SIRegisterInfo &TRI; |
233 | Register SuperReg; |
234 | const PrologEpilogSGPRSaveRestoreInfo SI; |
235 | LiveRegUnits &LiveUnits; |
236 | const DebugLoc &DL; |
237 | Register FrameReg; |
238 | ArrayRef<int16_t> SplitParts; |
239 | unsigned NumSubRegs; |
240 | unsigned EltSize = 4; |
241 | |
242 | void saveToMemory(const int FI) const { |
243 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
244 | assert(!MFI.isDeadObjectIndex(FI)); |
245 | |
246 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ true); |
247 | |
248 | MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( |
249 | MRI, LiveUnits, RC: AMDGPU::VGPR_32RegClass); |
250 | if (!TmpVGPR) |
251 | report_fatal_error(reason: "failed to find free scratch register" ); |
252 | |
253 | for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { |
254 | Register SubReg = NumSubRegs == 1 |
255 | ? SuperReg |
256 | : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I])); |
257 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: TmpVGPR) |
258 | .addReg(RegNo: SubReg); |
259 | |
260 | buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL, SpillReg: TmpVGPR, |
261 | FI, FrameReg, DwordOff); |
262 | DwordOff += 4; |
263 | } |
264 | } |
265 | |
266 | void saveToVGPRLane(const int FI) const { |
267 | assert(!MFI.isDeadObjectIndex(FI)); |
268 | |
269 | assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); |
270 | ArrayRef<SIRegisterInfo::SpilledReg> Spill = |
271 | FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI); |
272 | assert(Spill.size() == NumSubRegs); |
273 | |
274 | for (unsigned I = 0; I < NumSubRegs; ++I) { |
275 | Register SubReg = NumSubRegs == 1 |
276 | ? SuperReg |
277 | : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I])); |
278 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_SPILL_S32_TO_VGPR), |
279 | DestReg: Spill[I].VGPR) |
280 | .addReg(RegNo: SubReg) |
281 | .addImm(Val: Spill[I].Lane) |
282 | .addReg(RegNo: Spill[I].VGPR, flags: RegState::Undef); |
283 | } |
284 | } |
285 | |
286 | void copyToScratchSGPR(Register DstReg) const { |
287 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: DstReg) |
288 | .addReg(RegNo: SuperReg) |
289 | .setMIFlag(MachineInstr::FrameSetup); |
290 | } |
291 | |
292 | void restoreFromMemory(const int FI) { |
293 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
294 | |
295 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ false); |
296 | MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( |
297 | MRI, LiveUnits, RC: AMDGPU::VGPR_32RegClass); |
298 | if (!TmpVGPR) |
299 | report_fatal_error(reason: "failed to find free scratch register" ); |
300 | |
301 | for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { |
302 | Register SubReg = NumSubRegs == 1 |
303 | ? SuperReg |
304 | : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I])); |
305 | |
306 | buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL, |
307 | SpillReg: TmpVGPR, FI, FrameReg, DwordOff); |
308 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: SubReg) |
309 | .addReg(RegNo: TmpVGPR, flags: RegState::Kill); |
310 | DwordOff += 4; |
311 | } |
312 | } |
313 | |
314 | void restoreFromVGPRLane(const int FI) { |
315 | assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); |
316 | ArrayRef<SIRegisterInfo::SpilledReg> Spill = |
317 | FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI); |
318 | assert(Spill.size() == NumSubRegs); |
319 | |
320 | for (unsigned I = 0; I < NumSubRegs; ++I) { |
321 | Register SubReg = NumSubRegs == 1 |
322 | ? SuperReg |
323 | : Register(TRI.getSubReg(Reg: SuperReg, Idx: SplitParts[I])); |
324 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_RESTORE_S32_FROM_VGPR), DestReg: SubReg) |
325 | .addReg(RegNo: Spill[I].VGPR) |
326 | .addImm(Val: Spill[I].Lane); |
327 | } |
328 | } |
329 | |
330 | void copyFromScratchSGPR(Register SrcReg) const { |
331 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SuperReg) |
332 | .addReg(RegNo: SrcReg) |
333 | .setMIFlag(MachineInstr::FrameDestroy); |
334 | } |
335 | |
336 | public: |
337 | PrologEpilogSGPRSpillBuilder(Register Reg, |
338 | const PrologEpilogSGPRSaveRestoreInfo SI, |
339 | MachineBasicBlock &MBB, |
340 | MachineBasicBlock::iterator MI, |
341 | const DebugLoc &DL, const SIInstrInfo *TII, |
342 | const SIRegisterInfo &TRI, |
343 | LiveRegUnits &LiveUnits, Register FrameReg) |
344 | : MI(MI), MBB(MBB), MF(*MBB.getParent()), |
345 | ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()), |
346 | FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), |
347 | SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL), |
348 | FrameReg(FrameReg) { |
349 | const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg: SuperReg); |
350 | SplitParts = TRI.getRegSplitParts(RC, EltSize); |
351 | NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); |
352 | |
353 | assert(SuperReg != AMDGPU::M0 && "m0 should never spill" ); |
354 | } |
355 | |
356 | void save() { |
357 | switch (SI.getKind()) { |
358 | case SGPRSaveKind::SPILL_TO_MEM: |
359 | return saveToMemory(FI: SI.getIndex()); |
360 | case SGPRSaveKind::SPILL_TO_VGPR_LANE: |
361 | return saveToVGPRLane(FI: SI.getIndex()); |
362 | case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: |
363 | return copyToScratchSGPR(DstReg: SI.getReg()); |
364 | } |
365 | } |
366 | |
367 | void restore() { |
368 | switch (SI.getKind()) { |
369 | case SGPRSaveKind::SPILL_TO_MEM: |
370 | return restoreFromMemory(FI: SI.getIndex()); |
371 | case SGPRSaveKind::SPILL_TO_VGPR_LANE: |
372 | return restoreFromVGPRLane(FI: SI.getIndex()); |
373 | case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: |
374 | return copyFromScratchSGPR(SrcReg: SI.getReg()); |
375 | } |
376 | } |
377 | }; |
378 | |
379 | } // namespace llvm |
380 | |
381 | // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` |
382 | void SIFrameLowering::emitEntryFunctionFlatScratchInit( |
383 | MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, |
384 | const DebugLoc &DL, Register ScratchWaveOffsetReg) const { |
385 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
386 | const SIInstrInfo *TII = ST.getInstrInfo(); |
387 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
388 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
389 | |
390 | // We don't need this if we only have spills since there is no user facing |
391 | // scratch. |
392 | |
393 | // TODO: If we know we don't have flat instructions earlier, we can omit |
394 | // this from the input registers. |
395 | // |
396 | // TODO: We only need to know if we access scratch space through a flat |
397 | // pointer. Because we only detect if flat instructions are used at all, |
398 | // this will be used more often than necessary on VI. |
399 | |
400 | Register FlatScrInitLo; |
401 | Register FlatScrInitHi; |
402 | |
403 | if (ST.isAmdPalOS()) { |
404 | // Extract the scratch offset from the descriptor in the GIT |
405 | LiveRegUnits LiveUnits; |
406 | LiveUnits.init(TRI: *TRI); |
407 | LiveUnits.addLiveIns(MBB); |
408 | |
409 | // Find unused reg to load flat scratch init into |
410 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
411 | Register FlatScrInit = AMDGPU::NoRegister; |
412 | ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF); |
413 | unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2; |
414 | AllSGPR64s = AllSGPR64s.slice( |
415 | N: std::min(a: static_cast<unsigned>(AllSGPR64s.size()), b: NumPreloaded)); |
416 | Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); |
417 | for (MCPhysReg Reg : AllSGPR64s) { |
418 | if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg) && |
419 | MRI.isAllocatable(PhysReg: Reg) && !TRI->isSubRegisterEq(RegA: Reg, RegB: GITPtrLoReg)) { |
420 | FlatScrInit = Reg; |
421 | break; |
422 | } |
423 | } |
424 | assert(FlatScrInit && "Failed to find free register for scratch init" ); |
425 | |
426 | FlatScrInitLo = TRI->getSubReg(Reg: FlatScrInit, Idx: AMDGPU::sub0); |
427 | FlatScrInitHi = TRI->getSubReg(Reg: FlatScrInit, Idx: AMDGPU::sub1); |
428 | |
429 | buildGitPtr(MBB, I, DL, TII, TargetReg: FlatScrInit); |
430 | |
431 | // We now have the GIT ptr - now get the scratch descriptor from the entry |
432 | // at offset 0 (or offset 16 for a compute shader). |
433 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
434 | const MCInstrDesc &LoadDwordX2 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX2_IMM); |
435 | auto *MMO = MF.getMachineMemOperand( |
436 | PtrInfo, |
437 | F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | |
438 | MachineMemOperand::MODereferenceable, |
439 | Size: 8, BaseAlignment: Align(4)); |
440 | unsigned Offset = |
441 | MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; |
442 | const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); |
443 | unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST: Subtarget, ByteOffset: Offset); |
444 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: FlatScrInit) |
445 | .addReg(RegNo: FlatScrInit) |
446 | .addImm(Val: EncodedOffset) // offset |
447 | .addImm(Val: 0) // cpol |
448 | .addMemOperand(MMO); |
449 | |
450 | // Mask the offset in [47:0] of the descriptor |
451 | const MCInstrDesc &SAndB32 = TII->get(Opcode: AMDGPU::S_AND_B32); |
452 | auto And = BuildMI(BB&: MBB, I, MIMD: DL, MCID: SAndB32, DestReg: FlatScrInitHi) |
453 | .addReg(RegNo: FlatScrInitHi) |
454 | .addImm(Val: 0xffff); |
455 | And->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
456 | } else { |
457 | Register FlatScratchInitReg = |
458 | MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); |
459 | assert(FlatScratchInitReg); |
460 | |
461 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
462 | MRI.addLiveIn(Reg: FlatScratchInitReg); |
463 | MBB.addLiveIn(PhysReg: FlatScratchInitReg); |
464 | |
465 | FlatScrInitLo = TRI->getSubReg(Reg: FlatScratchInitReg, Idx: AMDGPU::sub0); |
466 | FlatScrInitHi = TRI->getSubReg(Reg: FlatScratchInitReg, Idx: AMDGPU::sub1); |
467 | } |
468 | |
469 | // Do a 64-bit pointer add. |
470 | if (ST.flatScratchIsPointer()) { |
471 | if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { |
472 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: FlatScrInitLo) |
473 | .addReg(RegNo: FlatScrInitLo) |
474 | .addReg(RegNo: ScratchWaveOffsetReg); |
475 | auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32), |
476 | DestReg: FlatScrInitHi) |
477 | .addReg(RegNo: FlatScrInitHi) |
478 | .addImm(Val: 0); |
479 | Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
480 | |
481 | using namespace AMDGPU::Hwreg; |
482 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_B32)) |
483 | .addReg(RegNo: FlatScrInitLo) |
484 | .addImm(Val: int16_t(HwregEncoding::encode(Values: ID_FLAT_SCR_LO, Values: 0, Values: 32))); |
485 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_B32)) |
486 | .addReg(RegNo: FlatScrInitHi) |
487 | .addImm(Val: int16_t(HwregEncoding::encode(Values: ID_FLAT_SCR_HI, Values: 0, Values: 32))); |
488 | return; |
489 | } |
490 | |
491 | // For GFX9. |
492 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: AMDGPU::FLAT_SCR_LO) |
493 | .addReg(RegNo: FlatScrInitLo) |
494 | .addReg(RegNo: ScratchWaveOffsetReg); |
495 | auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32), |
496 | DestReg: AMDGPU::FLAT_SCR_HI) |
497 | .addReg(RegNo: FlatScrInitHi) |
498 | .addImm(Val: 0); |
499 | Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
500 | |
501 | return; |
502 | } |
503 | |
504 | assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); |
505 | |
506 | // Copy the size in bytes. |
507 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::FLAT_SCR_LO) |
508 | .addReg(RegNo: FlatScrInitHi, flags: RegState::Kill); |
509 | |
510 | // Add wave offset in bytes to private base offset. |
511 | // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. |
512 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: FlatScrInitLo) |
513 | .addReg(RegNo: FlatScrInitLo) |
514 | .addReg(RegNo: ScratchWaveOffsetReg); |
515 | |
516 | // Convert offset to 256-byte units. |
517 | auto LShr = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_LSHR_B32), |
518 | DestReg: AMDGPU::FLAT_SCR_HI) |
519 | .addReg(RegNo: FlatScrInitLo, flags: RegState::Kill) |
520 | .addImm(Val: 8); |
521 | LShr->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
522 | } |
523 | |
524 | // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not |
525 | // memory. They should have been removed by now. |
526 | static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { |
527 | for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); |
528 | I != E; ++I) { |
529 | if (!MFI.isDeadObjectIndex(ObjectIdx: I)) |
530 | return false; |
531 | } |
532 | |
533 | return true; |
534 | } |
535 | |
536 | // Shift down registers reserved for the scratch RSRC. |
537 | Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( |
538 | MachineFunction &MF) const { |
539 | |
540 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
541 | const SIInstrInfo *TII = ST.getInstrInfo(); |
542 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
543 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
544 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
545 | |
546 | assert(MFI->isEntryFunction()); |
547 | |
548 | Register ScratchRsrcReg = MFI->getScratchRSrcReg(); |
549 | |
550 | if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(PhysReg: ScratchRsrcReg) && |
551 | allStackObjectsAreDead(MFI: MF.getFrameInfo()))) |
552 | return Register(); |
553 | |
554 | if (ST.hasSGPRInitBug() || |
555 | ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF)) |
556 | return ScratchRsrcReg; |
557 | |
558 | // We reserved the last registers for this. Shift it down to the end of those |
559 | // which were actually used. |
560 | // |
561 | // FIXME: It might be safer to use a pseudoregister before replacement. |
562 | |
563 | // FIXME: We should be able to eliminate unused input registers. We only |
564 | // cannot do this for the resources required for scratch access. For now we |
565 | // skip over user SGPRs and may leave unused holes. |
566 | |
567 | unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; |
568 | ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF); |
569 | AllSGPR128s = AllSGPR128s.slice(N: std::min(a: static_cast<unsigned>(AllSGPR128s.size()), b: NumPreloaded)); |
570 | |
571 | // Skip the last N reserved elements because they should have already been |
572 | // reserved for VCC etc. |
573 | Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); |
574 | for (MCPhysReg Reg : AllSGPR128s) { |
575 | // Pick the first unallocated one. Make sure we don't clobber the other |
576 | // reserved input we needed. Also for PAL, make sure we don't clobber |
577 | // the GIT pointer passed in SGPR0 or SGPR8. |
578 | if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) && |
579 | (!GITPtrLoReg || !TRI->isSubRegisterEq(RegA: Reg, RegB: GITPtrLoReg))) { |
580 | MRI.replaceRegWith(FromReg: ScratchRsrcReg, ToReg: Reg); |
581 | MFI->setScratchRSrcReg(Reg); |
582 | MRI.reserveReg(PhysReg: Reg, TRI); |
583 | return Reg; |
584 | } |
585 | } |
586 | |
587 | return ScratchRsrcReg; |
588 | } |
589 | |
590 | static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { |
591 | return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize(); |
592 | } |
593 | |
594 | void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, |
595 | MachineBasicBlock &MBB) const { |
596 | assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported" ); |
597 | |
598 | // FIXME: If we only have SGPR spills, we won't actually be using scratch |
599 | // memory since these spill to VGPRs. We should be cleaning up these unused |
600 | // SGPR spill frame indices somewhere. |
601 | |
602 | // FIXME: We still have implicit uses on SGPR spill instructions in case they |
603 | // need to spill to vector memory. It's likely that will not happen, but at |
604 | // this point it appears we need the setup. This part of the prolog should be |
605 | // emitted after frame indices are eliminated. |
606 | |
607 | // FIXME: Remove all of the isPhysRegUsed checks |
608 | |
609 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
610 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
611 | const SIInstrInfo *TII = ST.getInstrInfo(); |
612 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
613 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
614 | const Function &F = MF.getFunction(); |
615 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
616 | |
617 | assert(MFI->isEntryFunction()); |
618 | |
619 | Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( |
620 | Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); |
621 | |
622 | // We need to do the replacement of the private segment buffer register even |
623 | // if there are no stack objects. There could be stores to undef or a |
624 | // constant without an associated object. |
625 | // |
626 | // This will return `Register()` in cases where there are no actual |
627 | // uses of the SRSRC. |
628 | Register ScratchRsrcReg; |
629 | if (!ST.enableFlatScratch()) |
630 | ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); |
631 | |
632 | // Make the selected register live throughout the function. |
633 | if (ScratchRsrcReg) { |
634 | for (MachineBasicBlock &OtherBB : MF) { |
635 | if (&OtherBB != &MBB) { |
636 | OtherBB.addLiveIn(PhysReg: ScratchRsrcReg); |
637 | } |
638 | } |
639 | } |
640 | |
641 | // Now that we have fixed the reserved SRSRC we need to locate the |
642 | // (potentially) preloaded SRSRC. |
643 | Register PreloadedScratchRsrcReg; |
644 | if (ST.isAmdHsaOrMesa(F)) { |
645 | PreloadedScratchRsrcReg = |
646 | MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); |
647 | if (ScratchRsrcReg && PreloadedScratchRsrcReg) { |
648 | // We added live-ins during argument lowering, but since they were not |
649 | // used they were deleted. We're adding the uses now, so add them back. |
650 | MRI.addLiveIn(Reg: PreloadedScratchRsrcReg); |
651 | MBB.addLiveIn(PhysReg: PreloadedScratchRsrcReg); |
652 | } |
653 | } |
654 | |
655 | // Debug location must be unknown since the first debug location is used to |
656 | // determine the end of the prologue. |
657 | DebugLoc DL; |
658 | MachineBasicBlock::iterator I = MBB.begin(); |
659 | |
660 | // We found the SRSRC first because it needs four registers and has an |
661 | // alignment requirement. If the SRSRC that we found is clobbering with |
662 | // the scratch wave offset, which may be in a fixed SGPR or a free SGPR |
663 | // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch |
664 | // wave offset to a free SGPR. |
665 | Register ScratchWaveOffsetReg; |
666 | if (PreloadedScratchWaveOffsetReg && |
667 | TRI->isSubRegisterEq(RegA: ScratchRsrcReg, RegB: PreloadedScratchWaveOffsetReg)) { |
668 | ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF); |
669 | unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); |
670 | AllSGPRs = AllSGPRs.slice( |
671 | N: std::min(a: static_cast<unsigned>(AllSGPRs.size()), b: NumPreloaded)); |
672 | Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); |
673 | for (MCPhysReg Reg : AllSGPRs) { |
674 | if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) && |
675 | !TRI->isSubRegisterEq(RegA: ScratchRsrcReg, RegB: Reg) && GITPtrLoReg != Reg) { |
676 | ScratchWaveOffsetReg = Reg; |
677 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: ScratchWaveOffsetReg) |
678 | .addReg(RegNo: PreloadedScratchWaveOffsetReg, flags: RegState::Kill); |
679 | break; |
680 | } |
681 | } |
682 | |
683 | // FIXME: We can spill incoming arguments and restore at the end of the |
684 | // prolog. |
685 | if (!ScratchWaveOffsetReg) |
686 | report_fatal_error( |
687 | reason: "could not find temporary scratch offset register in prolog" ); |
688 | } else { |
689 | ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg; |
690 | } |
691 | assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg); |
692 | |
693 | if (hasFP(MF)) { |
694 | Register FPReg = MFI->getFrameOffsetReg(); |
695 | assert(FPReg != AMDGPU::FP_REG); |
696 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: FPReg).addImm(Val: 0); |
697 | } |
698 | |
699 | if (requiresStackPointerReference(MF)) { |
700 | Register SPReg = MFI->getStackPtrOffsetReg(); |
701 | assert(SPReg != AMDGPU::SP_REG); |
702 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: SPReg) |
703 | .addImm(Val: FrameInfo.getStackSize() * getScratchScaleFactor(ST)); |
704 | } |
705 | |
706 | bool NeedsFlatScratchInit = |
707 | MFI->getUserSGPRInfo().hasFlatScratchInit() && |
708 | (MRI.isPhysRegUsed(PhysReg: AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || |
709 | (!allStackObjectsAreDead(MFI: FrameInfo) && ST.enableFlatScratch())); |
710 | |
711 | if ((NeedsFlatScratchInit || ScratchRsrcReg) && |
712 | PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) { |
713 | MRI.addLiveIn(Reg: PreloadedScratchWaveOffsetReg); |
714 | MBB.addLiveIn(PhysReg: PreloadedScratchWaveOffsetReg); |
715 | } |
716 | |
717 | if (NeedsFlatScratchInit) { |
718 | emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); |
719 | } |
720 | |
721 | if (ScratchRsrcReg) { |
722 | emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, |
723 | PreloadedPrivateBufferReg: PreloadedScratchRsrcReg, |
724 | ScratchRsrcReg, ScratchWaveOffsetReg); |
725 | } |
726 | } |
727 | |
728 | // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` |
//
// All instructions are inserted before \p I in \p MBB using \p DL. At most one
// of three setup sequences is emitted, chosen in this order:
//  * AMDPAL: form the GIT pointer in sub0_sub1 of \p ScratchRsrcReg, then load
//    the 4-dword descriptor through it into \p ScratchRsrcReg; on wave32
//    additionally clear bit 21 of sub3.
//  * Mesa graphics shader, or no \p PreloadedScratchRsrcReg: fill sub0/sub1
//    from the implicit buffer pointer user SGPR or from the
//    SCRATCH_RSRC_DWORD0/1 external symbols, and sub2/sub3 from the two halves
//    of getScratchRsrcWords23().
//  * HSA or Mesa: copy \p PreloadedScratchRsrcReg into \p ScratchRsrcReg when
//    the two registers differ.
// Afterwards, in every case, \p ScratchWaveOffsetReg is added into the value
// held in sub0/sub1 of \p ScratchRsrcReg with an S_ADD_U32 / S_ADDC_U32 pair.
729 | void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( |
730 | MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, |
731 | const DebugLoc &DL, Register PreloadedScratchRsrcReg, |
732 | Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { |
733 | |
734 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
735 | const SIInstrInfo *TII = ST.getInstrInfo(); |
736 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
737 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
738 | const Function &Fn = MF.getFunction(); |
739 | |
740 | if (ST.isAmdPalOS()) { |
741 | // The pointer to the GIT is formed from the offset passed in and either |
742 | // the amdgpu-git-ptr-high function attribute or the top part of the PC |
743 | Register Rsrc01 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0_sub1); |
744 | Register Rsrc03 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub3); |
745 | |
746 | buildGitPtr(MBB, I, DL, TII, TargetReg: Rsrc01); |
747 | |
748 | // We now have the GIT ptr - now get the scratch descriptor from the entry |
749 | // at offset 0 (or offset 16 for a compute shader). |
750 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
751 | const MCInstrDesc &LoadDwordX4 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX4_IMM); |
// The memory operand describes a 16-byte, 4-byte-aligned load that is marked
// invariant and dereferenceable.
752 | auto MMO = MF.getMachineMemOperand(PtrInfo, |
753 | F: MachineMemOperand::MOLoad | |
754 | MachineMemOperand::MOInvariant | |
755 | MachineMemOperand::MODereferenceable, |
756 | Size: 16, BaseAlignment: Align(4)); |
757 | unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; |
// NOTE(review): Subtarget is obtained with the same call as ST at the top of
// this function, so this second lookup looks redundant.
758 | const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); |
// The byte offset is converted before being used as the load's immediate;
// presumably into the SMRD offset units of this subtarget -- TODO confirm.
759 | unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST: Subtarget, ByteOffset: Offset); |
// The load reads its address from Rsrc01 and overwrites the whole of
// ScratchRsrcReg, which contains Rsrc01.
760 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX4, DestReg: ScratchRsrcReg) |
761 | .addReg(RegNo: Rsrc01) |
762 | .addImm(Val: EncodedOffset) // offset |
763 | .addImm(Val: 0) // cpol |
764 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine) |
765 | .addMemOperand(MMO); |
766 | |
767 | // The driver will always set the SRD for wave 64 (bits 118:117 of |
768 | // descriptor / bits 22:21 of third sub-reg will be 0b11) |
769 | // If the shader is actually wave32 we have to modify the const_index_stride |
770 | // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The |
771 | // reason the driver does this is that there can be cases where it presents |
772 | // 2 shaders with different wave size (e.g. VsFs). |
773 | // TODO: convert to using SCRATCH instructions or multiple SRD buffers |
774 | if (ST.isWave32()) { |
// S_BITSET0_B32 with immediate 21 clears bit 21 of sub3, turning 0b11 into
// 0b10 in bits 22:21.
775 | const MCInstrDesc &SBitsetB32 = TII->get(Opcode: AMDGPU::S_BITSET0_B32); |
776 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SBitsetB32, DestReg: Rsrc03) |
777 | .addImm(Val: 21) |
778 | .addReg(RegNo: Rsrc03); |
779 | } |
780 | } else if (ST.isMesaGfxShader(F: Fn) || !PreloadedScratchRsrcReg) { |
// This branch is not expected to be taken for HSA or Mesa kernels.
781 | assert(!ST.isAmdHsaOrMesa(Fn)); |
782 | const MCInstrDesc &SMovB32 = TII->get(Opcode: AMDGPU::S_MOV_B32); |
783 | |
784 | Register Rsrc2 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub2); |
785 | Register Rsrc3 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub3); |
786 | |
787 | // Use relocations to get the pointer, and setup the other bits manually. |
788 | uint64_t Rsrc23 = TII->getScratchRsrcWords23(); |
789 | |
790 | if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) { |
791 | Register Rsrc01 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0_sub1); |
792 | |
// Compute calling conventions move the 64-bit value of the implicit buffer
// pointer user SGPR straight into sub0_sub1; all other calling conventions
// use that SGPR as the address of an 8-byte load whose result goes there.
793 | if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) { |
794 | const MCInstrDesc &Mov64 = TII->get(Opcode: AMDGPU::S_MOV_B64); |
795 | |
796 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: Mov64, DestReg: Rsrc01) |
797 | .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR()) |
798 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
799 | } else { |
800 | const MCInstrDesc &LoadDwordX2 = TII->get(Opcode: AMDGPU::S_LOAD_DWORDX2_IMM); |
801 | |
802 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
803 | auto MMO = MF.getMachineMemOperand( |
804 | PtrInfo, |
805 | F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | |
806 | MachineMemOperand::MODereferenceable, |
807 | Size: 8, BaseAlignment: Align(4)); |
808 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: Rsrc01) |
809 | .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR()) |
810 | .addImm(Val: 0) // offset |
811 | .addImm(Val: 0) // cpol |
812 | .addMemOperand(MMO) |
813 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
814 | |
// Register the user SGPR as a live-in of the function and of this block.
// NOTE(review): this is done on the load path only, not on the S_MOV_B64
// path above -- confirm that is intended.
815 | MF.getRegInfo().addLiveIn(Reg: MFI->getImplicitBufferPtrUserSGPR()); |
816 | MBB.addLiveIn(PhysReg: MFI->getImplicitBufferPtrUserSGPR()); |
817 | } |
818 | } else { |
// Without an implicit buffer pointer, sub0 and sub1 are each set from an
// external symbol operand.
819 | Register Rsrc0 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0); |
820 | Register Rsrc1 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub1); |
821 | |
822 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc0) |
823 | .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD0" ) |
824 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
825 | |
826 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc1) |
827 | .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD1" ) |
828 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
829 | } |
830 | |
// The low 32 bits of Rsrc23 go to sub2 and the high 32 bits to sub3.
831 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc2) |
832 | .addImm(Val: Rsrc23 & 0xffffffff) |
833 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
834 | |
835 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc3) |
836 | .addImm(Val: Rsrc23 >> 32) |
837 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
838 | } else if (ST.isAmdHsaOrMesa(F: Fn)) { |
839 | assert(PreloadedScratchRsrcReg); |
840 | |
// The preloaded descriptor is used as is; a COPY is only needed when it
// does not already live in ScratchRsrcReg.
841 | if (ScratchRsrcReg != PreloadedScratchRsrcReg) { |
842 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: ScratchRsrcReg) |
843 | .addReg(RegNo: PreloadedScratchRsrcReg, flags: RegState::Kill); |
844 | } |
845 | } |
846 | |
847 | // Add the scratch wave offset into the scratch RSRC. |
848 | // |
849 | // We only want to update the first 48 bits, which is the base address |
850 | // pointer, without touching the adjacent 16 bits of flags. We know this add |
851 | // cannot carry-out from bit 47, otherwise the scratch allocation would be |
852 | // impossible to fit in the 48-bit global address space. |
853 | // |
854 | // TODO: Evaluate if it is better to just construct an SRD using the flat |
855 | // scratch init and some constants rather than update the one we are passed. |
856 | Register ScratchRsrcSub0 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub0); |
857 | Register ScratchRsrcSub1 = TRI->getSubReg(Reg: ScratchRsrcReg, Idx: AMDGPU::sub1); |
858 | |
859 | // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in |
860 | // the kernel body via inreg arguments. |
861 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: ScratchRsrcSub0) |
862 | .addReg(RegNo: ScratchRsrcSub0) |
863 | .addReg(RegNo: ScratchWaveOffsetReg) |
864 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
// Adding 0 with carry propagates the carry of the low add into sub1.
865 | auto Addc = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32), DestReg: ScratchRsrcSub1) |
866 | .addReg(RegNo: ScratchRsrcSub1) |
867 | .addImm(Val: 0) |
868 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
869 | Addc->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
870 | } |
871 | |
// Reports whether stack objects with stack ID \p ID are handled by this frame
// lowering: Default, NoAlloc and SGPRSpill are supported, ScalableVector and
// WasmLocal are not. The switch has no default label; a value that matches
// none of the listed enumerators falls through to llvm_unreachable.
872 | bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { |
873 | switch (ID) { |
874 | case TargetStackID::Default: |
875 | case TargetStackID::NoAlloc: |
876 | case TargetStackID::SGPRSpill: |
877 | return true; |
878 | case TargetStackID::ScalableVector: |
879 | case TargetStackID::WasmLocal: |
880 | return false; |
881 | } |
882 | llvm_unreachable("Invalid TargetStackID::Value" ); |
883 | } |
884 | |
885 | // Activate only the inactive lanes when \p EnableInactiveLanes is true. |
886 | // Otherwise, activate all lanes. It returns the saved exec. |
//
// \p LiveUnits is first handed to initLiveUnits (together with \p IsProlog);
// a free non-callee-saved register of the wave-mask register class is then
// searched for and, once chosen, added to \p LiveUnits. Compilation stops with
// a fatal error when no such register is found. The save-exec instruction is
// inserted before \p MBBI in \p MBB with debug location \p DL.
887 | static Register buildScratchExecCopy(LiveRegUnits &LiveUnits, |
888 | MachineFunction &MF, |
889 | MachineBasicBlock &MBB, |
890 | MachineBasicBlock::iterator MBBI, |
891 | const DebugLoc &DL, bool IsProlog, |
892 | bool EnableInactiveLanes) { |
893 | Register ScratchExecCopy; |
894 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
895 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
896 | const SIInstrInfo *TII = ST.getInstrInfo(); |
897 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
898 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
899 | |
900 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); |
901 | |
902 | ScratchExecCopy = findScratchNonCalleeSaveRegister( |
903 | MRI, LiveUnits, RC: *TRI.getWaveMaskRegClass()); |
904 | if (!ScratchExecCopy) |
905 | report_fatal_error(reason: "failed to find free scratch register" ); |
906 | |
// Mark the chosen register as occupied so later searches on the same
// LiveUnits do not return it again.
907 | LiveUnits.addReg(Reg: ScratchExecCopy); |
908 | |
// The opcode is picked by wave size (B32 for wave32, B64 otherwise) and by
// mode: the XOR form with operand -1 inverts EXEC so only the previously
// inactive lanes run, the OR form with operand -1 enables every lane. In
// both cases the previous EXEC value is written to ScratchExecCopy.
909 | const unsigned SaveExecOpc = |
910 | ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32 |
911 | : AMDGPU::S_OR_SAVEEXEC_B32) |
912 | : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64 |
913 | : AMDGPU::S_OR_SAVEEXEC_B64); |
914 | auto SaveExec = |
915 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: SaveExecOpc), DestReg: ScratchExecCopy).addImm(Val: -1); |
916 | SaveExec->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
917 | |
918 | return ScratchExecCopy; |
919 | } |
920 | |
// Emits the prolog-side saves before \p MBBI in \p MBB, in this order:
//  1. the whole-wave-mode (WWM) scratch VGPRs,
//  2. the WWM callee-saved VGPRs,
//  3. every entry of the prolog/epilog SGPR spill list.
// \p FrameReg is forwarded to the spill builders as the register the spill
// slots are addressed from. \p FramePtrRegScratchCopy replaces the frame
// pointer in step 3; when it is not set, the frame pointer entry is skipped.
// \p LiveUnits is updated with the registers this function picks. Finally the
// scratch SGPRs chosen as copy destinations are added as live-ins to every
// basic block of \p MF.
921 | void SIFrameLowering::emitCSRSpillStores( |
922 | MachineFunction &MF, MachineBasicBlock &MBB, |
923 | MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, |
924 | Register FrameReg, Register FramePtrRegScratchCopy) const { |
925 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
926 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
927 | const SIInstrInfo *TII = ST.getInstrInfo(); |
928 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
929 | |
930 | // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch |
931 | // registers. However, save all lanes of callee-saved VGPRs. Due to this, we |
932 | // might end up flipping the EXEC bits twice. |
933 | Register ScratchExecCopy; |
934 | SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; |
935 | FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs); |
// EXEC is only saved and inverted when there is at least one scratch WWM
// register to store.
936 | if (!WWMScratchRegs.empty()) |
937 | ScratchExecCopy = |
938 | buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, |
939 | /*IsProlog*/ true, /*EnableInactiveLanes*/ true); |
940 | |
// Helper: store every (VGPR, frame index) pair of the given list.
941 | auto StoreWWMRegisters = |
942 | [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { |
943 | for (const auto &Reg : WWMRegs) { |
944 | Register VGPR = Reg.first; |
945 | int FI = Reg.second; |
946 | buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL, |
947 | SpillReg: VGPR, FI, FrameReg); |
948 | } |
949 | }; |
950 | |
951 | StoreWWMRegisters(WWMScratchRegs); |
// Callee-saved WWM registers need all lanes. If EXEC has already been saved
// above, it is simply overwritten with -1; otherwise EXEC is saved now while
// all lanes are enabled.
952 | if (!WWMCalleeSavedRegs.empty()) { |
953 | if (ScratchExecCopy) { |
954 | unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
955 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: TRI.getExec()).addImm(Val: -1); |
956 | } else { |
957 | ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, |
958 | /*IsProlog*/ true, |
959 | /*EnableInactiveLanes*/ false); |
960 | } |
961 | } |
962 | |
963 | StoreWWMRegisters(WWMCalleeSavedRegs); |
// Write the saved value back to EXEC if EXEC was changed above.
964 | if (ScratchExecCopy) { |
965 | // FIXME: Split block and make terminator. |
966 | unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
967 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: ExecMov), DestReg: TRI.getExec()) |
968 | .addReg(RegNo: ScratchExecCopy, flags: RegState::Kill); |
// NOTE(review): the register is killed by the move above yet is still added
// to LiveUnits here; the reason is not visible in this function -- confirm.
969 | LiveUnits.addReg(Reg: ScratchExecCopy); |
970 | } |
971 | |
972 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
973 | |
974 | for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { |
975 | // Special handle FP spill: |
976 | // Skip if FP is saved to a scratch SGPR, the save has already been emitted. |
977 | // Otherwise, FP has been moved to a temporary register and spill it |
978 | // instead. |
979 | Register Reg = |
980 | Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first; |
981 | if (!Reg) |
982 | continue; |
983 | |
984 | PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, |
985 | LiveUnits, FrameReg); |
986 | SB.save(); |
987 | } |
988 | |
989 | // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make |
990 | // such scratch registers live throughout the function. |
991 | SmallVector<Register, 1> ScratchSGPRs; |
992 | FuncInfo->getAllScratchSGPRCopyDstRegs(Regs&: ScratchSGPRs); |
993 | if (!ScratchSGPRs.empty()) { |
// Note: the loop variable below shadows the MBB parameter; every block of
// the function is visited, not just the prologue block.
994 | for (MachineBasicBlock &MBB : MF) { |
995 | for (MCPhysReg Reg : ScratchSGPRs) |
996 | MBB.addLiveIn(PhysReg: Reg); |
997 | |
998 | MBB.sortUniqueLiveIns(); |
999 | } |
// LiveUnits only receives the scratch SGPRs when it is not empty.
1000 | if (!LiveUnits.empty()) { |
1001 | for (MCPhysReg Reg : ScratchSGPRs) |
1002 | LiveUnits.addReg(Reg); |
1003 | } |
1004 | } |
1005 | } |
1006 | |
// Emits the epilog-side restores before \p MBBI in \p MBB, in this order:
//  1. every entry of the prolog/epilog SGPR spill list,
//  2. the whole-wave-mode (WWM) scratch VGPRs,
//  3. the WWM callee-saved VGPRs.
// \p FrameReg is forwarded to the restore builders as the register the spill
// slots are addressed from. In step 1 the frame pointer entry is restored into
// \p FramePtrRegScratchCopy instead of the frame pointer itself; when that
// register is not set, the entry is skipped.
1007 | void SIFrameLowering::emitCSRSpillRestores( |
1008 | MachineFunction &MF, MachineBasicBlock &MBB, |
1009 | MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, |
1010 | Register FrameReg, Register FramePtrRegScratchCopy) const { |
1011 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1012 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1013 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1014 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
1015 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
1016 | |
1017 | for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { |
1018 | // Special handle FP restore: |
1019 | // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore |
1020 | // the FP value to a temporary register. The frame pointer should be |
1021 | // overwritten only at the end when all other spills are restored from |
1022 | // current frame. |
1023 | Register Reg = |
1024 | Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first; |
1025 | if (!Reg) |
1026 | continue; |
1027 | |
1028 | PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, |
1029 | LiveUnits, FrameReg); |
1030 | SB.restore(); |
1031 | } |
1032 | |
1033 | // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the |
1034 | // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to |
1035 | // this, we might end up flipping the EXEC bits twice. |
1036 | Register ScratchExecCopy; |
1037 | SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; |
1038 | FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs); |
// EXEC is only saved and inverted when there is at least one scratch WWM
// register to reload.
1039 | if (!WWMScratchRegs.empty()) |
1040 | ScratchExecCopy = |
1041 | buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, |
1042 | /*IsProlog*/ false, /*EnableInactiveLanes*/ true); |
1043 | |
// Helper: reload every (VGPR, frame index) pair of the given list.
1044 | auto RestoreWWMRegisters = |
1045 | [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { |
1046 | for (const auto &Reg : WWMRegs) { |
1047 | Register VGPR = Reg.first; |
1048 | int FI = Reg.second; |
1049 | buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL, |
1050 | SpillReg: VGPR, FI, FrameReg); |
1051 | } |
1052 | }; |
1053 | |
1054 | RestoreWWMRegisters(WWMScratchRegs); |
// Callee-saved WWM registers need all lanes. If EXEC has already been saved
// above, it is simply overwritten with -1; otherwise EXEC is saved now while
// all lanes are enabled.
1055 | if (!WWMCalleeSavedRegs.empty()) { |
1056 | if (ScratchExecCopy) { |
1057 | unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
1058 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: TRI.getExec()).addImm(Val: -1); |
1059 | } else { |
1060 | ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, |
1061 | /*IsProlog*/ false, |
1062 | /*EnableInactiveLanes*/ false); |
1063 | } |
1064 | } |
1065 | |
1066 | RestoreWWMRegisters(WWMCalleeSavedRegs); |
// Write the saved value back to EXEC if EXEC was changed above.
1067 | if (ScratchExecCopy) { |
1068 | // FIXME: Split block and make terminator. |
1069 | unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
1070 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: ExecMov), DestReg: TRI.getExec()) |
1071 | .addReg(RegNo: ScratchExecCopy, flags: RegState::Kill); |
1072 | } |
1073 | } |
1074 | |
// Emits the function prologue at the beginning of \p MBB. Entry functions are
// handed to emitEntryFunctionPrologue. For every other function this:
//  1. initializes SP for chain functions that need a stack pointer,
//  2. when no frame pointer is needed, emits the CSR spill stores addressed
//     from SP (from no register at all for chain functions),
//  3. otherwise preserves the incoming FP, either through its dedicated
//     scratch SGPR or through a temporary copy that is spilled later,
//  4. sets FP up from SP, aligning it when stack realignment is required,
//  5. emits the CSR spill stores addressed from FP when FP is used,
//  6. copies SP into the base pointer when one is required, and
//  7. advances SP by the (possibly alignment-padded) frame size when FP is
//     used and that size is non-zero.
// Everything is inserted with an empty DebugLoc.
1075 | void SIFrameLowering::emitPrologue(MachineFunction &MF, |
1076 | MachineBasicBlock &MBB) const { |
1077 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1078 | if (FuncInfo->isEntryFunction()) { |
1079 | emitEntryFunctionPrologue(MF, MBB); |
1080 | return; |
1081 | } |
1082 | |
1083 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
1084 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1085 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1086 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
1087 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1088 | |
1089 | Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); |
1090 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
// BasePtrReg stays unset when the function has no base pointer.
1091 | Register BasePtrReg = |
1092 | TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); |
1093 | LiveRegUnits LiveUnits; |
1094 | |
1095 | MachineBasicBlock::iterator MBBI = MBB.begin(); |
1096 | // DebugLoc must be unknown since the first instruction with DebugLoc is used |
1097 | // to determine the end of the prologue. |
1098 | DebugLoc DL; |
1099 | |
1100 | if (FuncInfo->isChainFunction()) { |
1101 | // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but |
1102 | // are free to set one up if they need it. |
1103 | bool UseSP = requiresStackPointerReference(MF); |
1104 | if (UseSP) { |
1105 | assert(StackPtrReg != AMDGPU::SP_REG); |
1106 | |
// SP starts at the stack size multiplied by the scratch scale factor.
1107 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: StackPtrReg) |
1108 | .addImm(Val: MFI.getStackSize() * getScratchScaleFactor(ST)); |
1109 | } |
1110 | } |
1111 | |
1112 | bool HasFP = false; |
1113 | bool HasBP = false; |
1114 | uint32_t NumBytes = MFI.getStackSize(); |
1115 | uint32_t RoundedSize = NumBytes; |
1116 | |
// Stack realignment always implies a frame pointer. HasFP therefore means
// "realignment needed" until the else-if further down may also set it from
// hasFP(MF).
1117 | if (TRI.hasStackRealignment(MF)) |
1118 | HasFP = true; |
1119 | |
1120 | Register FramePtrRegScratchCopy; |
1121 | if (!HasFP && !hasFP(MF)) { |
1122 | // Emit the CSR spill stores with SP base register. |
1123 | emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, |
1124 | FrameReg: FuncInfo->isChainFunction() ? Register() : StackPtrReg, |
1125 | FramePtrRegScratchCopy); |
1126 | } else { |
1127 | // CSR spill stores will use FP as base register. |
1128 | Register SGPRForFPSaveRestoreCopy = |
1129 | FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg); |
1130 | |
1131 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); |
1132 | if (SGPRForFPSaveRestoreCopy) { |
1133 | // Copy FP to the scratch register now and emit the CFI entry. It avoids |
1134 | // the extra FP copy needed in the other two cases when FP is spilled to |
1135 | // memory or to a VGPR lane. |
1136 | PrologEpilogSGPRSpillBuilder SB( |
1137 | FramePtrReg, |
1138 | FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(Reg: FramePtrReg), MBB, MBBI, |
1139 | DL, TII, TRI, LiveUnits, FramePtrReg); |
1140 | SB.save(); |
1141 | LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy); |
1142 | } else { |
1143 | // Copy FP into a new scratch register so that its previous value can be |
1144 | // spilled after setting up the new frame. |
1145 | FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( |
1146 | MRI, LiveUnits, RC: AMDGPU::SReg_32_XM0_XEXECRegClass); |
1147 | if (!FramePtrRegScratchCopy) |
1148 | report_fatal_error(reason: "failed to find free scratch register" ); |
1149 | |
1150 | LiveUnits.addReg(Reg: FramePtrRegScratchCopy); |
1151 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrRegScratchCopy) |
1152 | .addReg(RegNo: FramePtrReg); |
1153 | } |
1154 | } |
1155 | |
1156 | if (HasFP) { |
// Realignment path: FP = (SP + (Alignment - 1) * scale) & (-Alignment * scale).
1157 | const unsigned Alignment = MFI.getMaxAlign().value(); |
1158 | |
// Reserve Alignment extra bytes on top of the frame size for the SP bump.
1159 | RoundedSize += Alignment; |
1160 | if (LiveUnits.empty()) { |
1161 | LiveUnits.init(TRI); |
1162 | LiveUnits.addLiveIns(MBB); |
1163 | } |
1164 | |
1165 | // s_add_i32 s33, s32, (Alignment - 1) * ScaleFactor |
1166 | // s_and_b32 s33, s33, -Alignment * ScaleFactor |
1167 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: FramePtrReg) |
1168 | .addReg(RegNo: StackPtrReg) |
1169 | .addImm(Val: (Alignment - 1) * getScratchScaleFactor(ST)) |
1170 | .setMIFlag(MachineInstr::FrameSetup); |
// Alignment is unsigned, so -Alignment is formed with unsigned wraparound.
1171 | auto And = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_AND_B32), DestReg: FramePtrReg) |
1172 | .addReg(RegNo: FramePtrReg, flags: RegState::Kill) |
1173 | .addImm(Val: -Alignment * getScratchScaleFactor(ST)) |
1174 | .setMIFlag(MachineInstr::FrameSetup); |
1175 | And->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
// Recorded so that the epilogue undoes the same padded size.
1176 | FuncInfo->setIsStackRealigned(true); |
1177 | } else if ((HasFP = hasFP(MF))) { |
// No realignment: FP is a plain copy of the incoming SP.
1178 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrReg) |
1179 | .addReg(RegNo: StackPtrReg) |
1180 | .setMIFlag(MachineInstr::FrameSetup); |
1181 | } |
1182 | |
1183 | // If FP is used, emit the CSR spills with FP base register. |
1184 | if (HasFP) { |
1185 | emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg, |
1186 | FramePtrRegScratchCopy); |
// The temporary FP copy is no longer tracked once the spills are emitted.
1187 | if (FramePtrRegScratchCopy) |
1188 | LiveUnits.removeReg(Reg: FramePtrRegScratchCopy); |
1189 | } |
1190 | |
1191 | // If we need a base pointer, set it up here. It's whatever the value of |
1192 | // the stack pointer is at this point. Any variable size objects will be |
1193 | // allocated after this, so we can still use the base pointer to reference |
1194 | // the incoming arguments. |
1195 | if ((HasBP = TRI.hasBasePointer(MF))) { |
1196 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: BasePtrReg) |
1197 | .addReg(RegNo: StackPtrReg) |
1198 | .setMIFlag(MachineInstr::FrameSetup); |
1199 | } |
1200 | |
// SP is only advanced here when a frame pointer is in use.
1201 | if (HasFP && RoundedSize != 0) { |
1202 | auto Add = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: StackPtrReg) |
1203 | .addReg(RegNo: StackPtrReg) |
1204 | .addImm(Val: RoundedSize * getScratchScaleFactor(ST)) |
1205 | .setMIFlag(MachineInstr::FrameSetup); |
1206 | Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
1207 | } |
1208 | |
// The remaining statements are consistency checks only; FPSaved and BPSaved
// are cast to void so builds without assertions do not warn about them.
1209 | bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg); |
1210 | (void)FPSaved; |
1211 | assert((!HasFP || FPSaved) && |
1212 | "Needed to save FP but didn't save it anywhere" ); |
1213 | |
1214 | // If we allow spilling to AGPRs we may have saved FP but then spill |
1215 | // everything into AGPRs instead of the stack. |
1216 | assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) && |
1217 | "Saved FP but didn't need it" ); |
1218 | |
1219 | bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: BasePtrReg); |
1220 | (void)BPSaved; |
1221 | assert((!HasBP || BPSaved) && |
1222 | "Needed to save BP but didn't save it anywhere" ); |
1223 | |
1224 | assert((HasBP || !BPSaved) && "Saved BP but didn't need it" ); |
1225 | } |
1226 | |
// Emits the function epilogue into \p MBB; nothing is emitted for entry
// functions. The insertion point is MBB.getFirstTerminator(), or MBB.end() for
// an empty block, and the debug location is taken from the last non-debug
// instruction of the block when there is one.
//
// When the frame pointer has a prolog/epilog spill entry, the CSR restores are
// addressed from FP, SP is then moved back, and FP is restored last. Without
// such an entry, SP is moved back first (only if hasFP(MF) holds) and the CSR
// restores are addressed from SP.
1227 | void SIFrameLowering::emitEpilogue(MachineFunction &MF, |
1228 | MachineBasicBlock &MBB) const { |
1229 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1230 | if (FuncInfo->isEntryFunction()) |
1231 | return; |
1232 | |
1233 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1234 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1235 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
1236 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1237 | LiveRegUnits LiveUnits; |
1238 | // Get the insert location for the epilogue. If there were no terminators in |
1239 | // the block, get the last instruction. |
1240 | MachineBasicBlock::iterator MBBI = MBB.end(); |
1241 | DebugLoc DL; |
1242 | if (!MBB.empty()) { |
// The first lookup is only used to pick up a debug location; the insertion
// point itself is the first terminator.
1243 | MBBI = MBB.getLastNonDebugInstr(); |
1244 | if (MBBI != MBB.end()) |
1245 | DL = MBBI->getDebugLoc(); |
1246 | |
1247 | MBBI = MBB.getFirstTerminator(); |
1248 | } |
1249 | |
1250 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1251 | uint32_t NumBytes = MFI.getStackSize(); |
// Mirrors the prologue: a realigned stack was padded by the maximum
// alignment.
1252 | uint32_t RoundedSize = FuncInfo->isStackRealigned() |
1253 | ? NumBytes + MFI.getMaxAlign().value() |
1254 | : NumBytes; |
1255 | const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); |
1256 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
1257 | bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg); |
1258 | |
1259 | Register FramePtrRegScratchCopy; |
1260 | Register SGPRForFPSaveRestoreCopy = |
1261 | FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg); |
1262 | if (FPSaved) { |
1263 | // CSR spill restores should use FP as base register. If |
1264 | // SGPRForFPSaveRestoreCopy is not set, restore the previous value of FP |
1265 | // into a new scratch register and copy to FP later when other registers are |
1266 | // restored from the current stack frame. |
1267 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); |
1268 | if (SGPRForFPSaveRestoreCopy) { |
1269 | LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy); |
1270 | } else { |
1271 | FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( |
1272 | MRI, LiveUnits, RC: AMDGPU::SReg_32_XM0_XEXECRegClass); |
1273 | if (!FramePtrRegScratchCopy) |
1274 | report_fatal_error(reason: "failed to find free scratch register" ); |
1275 | |
1276 | LiveUnits.addReg(Reg: FramePtrRegScratchCopy); |
1277 | } |
1278 | |
1279 | emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg, |
1280 | FramePtrRegScratchCopy); |
1281 | } |
1282 | |
// Undo the prologue's SP bump by adding the negated, scaled frame size.
1283 | if (RoundedSize != 0 && hasFP(MF)) { |
1284 | auto Add = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: StackPtrReg) |
1285 | .addReg(RegNo: StackPtrReg) |
1286 | .addImm(Val: -static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST))) |
1287 | .setMIFlag(MachineInstr::FrameDestroy); |
1288 | Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
1289 | } |
1290 | |
1291 | if (FPSaved) { |
1292 | // Insert the copy to restore FP. |
// The source is the dedicated scratch SGPR when one exists, otherwise the
// temporary register the old FP was reloaded into above. Only the former
// copy is tagged FrameDestroy.
1293 | Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy |
1294 | : FramePtrRegScratchCopy; |
1295 | MachineInstrBuilder MIB = |
1296 | BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: FramePtrReg) |
1297 | .addReg(RegNo: SrcReg); |
1298 | if (SGPRForFPSaveRestoreCopy) |
1299 | MIB.setMIFlag(MachineInstr::FrameDestroy); |
1300 | } else { |
1301 | // Insert the CSR spill restores with SP as the base register. |
1302 | emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: StackPtrReg, |
1303 | FramePtrRegScratchCopy); |
1304 | } |
1305 | } |
1306 | |
1307 | #ifndef NDEBUG |
// Helper compiled only when NDEBUG is not defined. Returns false as soon as it
// finds a frame object that is not dead, has the SGPRSpill stack ID and is not
// one of the prolog/epilog SGPR spill indices; returns true otherwise.
1308 | static bool allSGPRSpillsAreDead(const MachineFunction &MF) { |
1309 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1310 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
// Walk every frame index from getObjectIndexBegin() up to, but not
// including, getObjectIndexEnd().
1311 | for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); |
1312 | I != E; ++I) { |
1313 | if (!MFI.isDeadObjectIndex(I) && |
1314 | MFI.getStackID(I) == TargetStackID::SGPRSpill && |
1315 | !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) { |
1316 | return false; |
1317 | } |
1318 | } |
1319 | |
1320 | return true; |
1321 | } |
1322 | #endif |
1323 | |
// Resolves frame index \p FI of \p MF to a base register plus offset.
// \p FrameReg is an output: it is set to the register returned by
// getFrameRegister(MF). The return value is a fixed StackOffset holding the
// object offset that the frame info records for \p FI.
1324 | StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, |
1325 | int FI, |
1326 | Register &FrameReg) const { |
1327 | const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); |
1328 | |
1329 | FrameReg = RI->getFrameRegister(MF); |
1330 | return StackOffset::getFixed(Fixed: MF.getFrameInfo().getObjectOffset(ObjectIdx: FI)); |
1331 | } |
1332 | |
1333 | void SIFrameLowering::processFunctionBeforeFrameFinalized( |
1334 | MachineFunction &MF, |
1335 | RegScavenger *RS) const { |
1336 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
1337 | |
1338 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1339 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1340 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1341 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1342 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1343 | |
1344 | // Allocate spill slots for WWM reserved VGPRs. |
1345 | // For chain functions, we only need to do this if we have calls to |
1346 | // llvm.amdgcn.cs.chain. |
1347 | bool IsChainWithoutCalls = |
1348 | FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall(); |
1349 | if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) { |
1350 | for (Register Reg : FuncInfo->getWWMReservedRegs()) { |
1351 | const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); |
1352 | FuncInfo->allocateWWMSpill(MF, VGPR: Reg, Size: TRI->getSpillSize(RC: *RC), |
1353 | Alignment: TRI->getSpillAlign(RC: *RC)); |
1354 | } |
1355 | } |
1356 | |
1357 | const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() |
1358 | && EnableSpillVGPRToAGPR; |
1359 | |
1360 | if (SpillVGPRToAGPR) { |
1361 | // To track the spill frame indices handled in this pass. |
1362 | BitVector SpillFIs(MFI.getObjectIndexEnd(), false); |
1363 | BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false); |
1364 | |
1365 | bool SeenDbgInstr = false; |
1366 | |
1367 | for (MachineBasicBlock &MBB : MF) { |
1368 | for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) { |
1369 | int FrameIndex; |
1370 | if (MI.isDebugInstr()) |
1371 | SeenDbgInstr = true; |
1372 | |
1373 | if (TII->isVGPRSpill(MI)) { |
1374 | // Try to eliminate stack used by VGPR spills before frame |
1375 | // finalization. |
1376 | unsigned FIOp = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), |
1377 | NamedIdx: AMDGPU::OpName::vaddr); |
1378 | int FI = MI.getOperand(i: FIOp).getIndex(); |
1379 | Register VReg = |
1380 | TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata)->getReg(); |
1381 | if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, |
1382 | isAGPRtoVGPR: TRI->isAGPR(MRI, Reg: VReg))) { |
1383 | assert(RS != nullptr); |
1384 | RS->enterBasicBlockEnd(MBB); |
1385 | RS->backward(I: std::next(x: MI.getIterator())); |
1386 | TRI->eliminateFrameIndex(MI, SPAdj: 0, FIOperandNum: FIOp, RS); |
1387 | SpillFIs.set(FI); |
1388 | continue; |
1389 | } |
1390 | } else if (TII->isStoreToStackSlot(MI, FrameIndex) || |
1391 | TII->isLoadFromStackSlot(MI, FrameIndex)) |
1392 | if (!MFI.isFixedObjectIndex(ObjectIdx: FrameIndex)) |
1393 | NonVGPRSpillFIs.set(FrameIndex); |
1394 | } |
1395 | } |
1396 | |
1397 | // Stack slot coloring may assign different objects to the same stack slot. |
1398 | // If not, then the VGPR to AGPR spill slot is dead. |
1399 | for (unsigned FI : SpillFIs.set_bits()) |
1400 | if (!NonVGPRSpillFIs.test(Idx: FI)) |
1401 | FuncInfo->setVGPRToAGPRSpillDead(FI); |
1402 | |
1403 | for (MachineBasicBlock &MBB : MF) { |
1404 | for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) |
1405 | MBB.addLiveIn(PhysReg: Reg); |
1406 | |
1407 | for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs()) |
1408 | MBB.addLiveIn(PhysReg: Reg); |
1409 | |
1410 | MBB.sortUniqueLiveIns(); |
1411 | |
1412 | if (!SpillFIs.empty() && SeenDbgInstr) { |
1413 | // FIXME: The dead frame indices are replaced with a null register in |
1414 | // the debug value instructions. We should instead update them with the |
1415 | // correct register value, but it is unclear whether that alone suffices. |
1416 | for (MachineInstr &MI : MBB) { |
1417 | if (MI.isDebugValue() && MI.getOperand(i: 0).isFI() && |
1418 | !MFI.isFixedObjectIndex(ObjectIdx: MI.getOperand(i: 0).getIndex()) && |
1419 | SpillFIs[MI.getOperand(i: 0).getIndex()]) { |
1420 | MI.getOperand(i: 0).ChangeToRegister(Reg: Register(), isDef: false /*isDef*/); |
1421 | } |
1422 | } |
1423 | } |
1424 | } |
1425 | } |
1426 | |
1427 | // At this point we've already allocated all spilled SGPRs to VGPRs if we |
1428 | // can. Any remaining SGPR spills will go to memory, so move them back to the |
1429 | // default stack. |
1430 | bool HaveSGPRToVMemSpill = |
1431 | FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true); |
1432 | assert(allSGPRSpillsAreDead(MF) && |
1433 | "SGPR spill should have been removed in SILowerSGPRSpills" ); |
1434 | |
1435 | // FIXME: The other checks should be redundant with allStackObjectsAreDead, |
1436 | // but currently hasNonSpillStackObjects is set only from source |
1437 | // allocas. Stack temps produced from legalization are not counted currently. |
1438 | if (!allStackObjectsAreDead(MFI)) { |
1439 | assert(RS && "RegScavenger required if spilling" ); |
1440 | |
1441 | // Add an emergency spill slot |
1442 | RS->addScavengingFrameIndex(FI: FuncInfo->getScavengeFI(MFI, TRI: *TRI)); |
1443 | |
1444 | // If we are spilling SGPRs to memory with a large frame, we may need a |
1445 | // second VGPR emergency frame index. |
1446 | if (HaveSGPRToVMemSpill && |
1447 | allocateScavengingFrameIndexesNearIncomingSP(MF)) { |
1448 | RS->addScavengingFrameIndex(FI: MFI.CreateStackObject(Size: 4, Alignment: Align(4), isSpillSlot: false)); |
1449 | } |
1450 | } |
1451 | } |
1452 | |
1453 | void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced( |
1454 | MachineFunction &MF, RegScavenger *RS) const { |
1455 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1456 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1457 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1458 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1459 | |
1460 | if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) { |
1461 | // On gfx908, we had initially reserved highest available VGPR for AGPR |
1462 | // copy. Now since we are done with RA, check if there exist an unused VGPR |
1463 | // which is lower than the eariler reserved VGPR before RA. If one exist, |
1464 | // use it for AGPR copy instead of one reserved before RA. |
1465 | Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy(); |
1466 | Register UnusedLowVGPR = |
1467 | TRI->findUnusedRegister(MRI, RC: &AMDGPU::VGPR_32RegClass, MF); |
1468 | if (UnusedLowVGPR && (TRI->getHWRegIndex(Reg: UnusedLowVGPR) < |
1469 | TRI->getHWRegIndex(Reg: VGPRForAGPRCopy))) { |
1470 | // Reserve this newly identified VGPR (for AGPR copy) |
1471 | // reserved registers should already be frozen at this point |
1472 | // so we can avoid calling MRI.freezeReservedRegs and just use |
1473 | // MRI.reserveReg |
1474 | FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR); |
1475 | MRI.reserveReg(PhysReg: UnusedLowVGPR, TRI); |
1476 | } |
1477 | } |
1478 | // We initally reserved the highest available SGPR pair for long branches |
1479 | // now, after RA, we shift down to a lower unused one if one exists |
1480 | Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg(); |
1481 | Register UnusedLowSGPR = |
1482 | TRI->findUnusedRegister(MRI, RC: &AMDGPU::SGPR_64RegClass, MF); |
1483 | // If LongBranchReservedReg is null then we didn't find a long branch |
1484 | // and never reserved a register to begin with so there is nothing to |
1485 | // shift down. Then if UnusedLowSGPR is null, there isn't available lower |
1486 | // register to use so just keep the original one we set. |
1487 | if (LongBranchReservedReg && UnusedLowSGPR) { |
1488 | FuncInfo->setLongBranchReservedReg(UnusedLowSGPR); |
1489 | MRI.reserveReg(PhysReg: UnusedLowSGPR, TRI); |
1490 | } |
1491 | } |
1492 | |
1493 | // The special SGPR spills like the one needed for FP, BP or any reserved |
1494 | // registers delayed until frame lowering. |
1495 | void SIFrameLowering::determinePrologEpilogSGPRSaves( |
1496 | MachineFunction &MF, BitVector &SavedVGPRs, |
1497 | bool NeedExecCopyReservedReg) const { |
1498 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
1499 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1500 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1501 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1502 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1503 | LiveRegUnits LiveUnits; |
1504 | LiveUnits.init(TRI: *TRI); |
1505 | // Initially mark callee saved registers as used so we will not choose them |
1506 | // while looking for scratch SGPRs. |
1507 | const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); |
1508 | for (unsigned I = 0; CSRegs[I]; ++I) |
1509 | LiveUnits.addReg(Reg: CSRegs[I]); |
1510 | |
1511 | const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass(); |
1512 | |
1513 | Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy(); |
1514 | if (NeedExecCopyReservedReg || |
1515 | (ReservedRegForExecCopy && |
1516 | MRI.isPhysRegUsed(PhysReg: ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) { |
1517 | MRI.reserveReg(PhysReg: ReservedRegForExecCopy, TRI); |
1518 | Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC); |
1519 | if (UnusedScratchReg) { |
1520 | // If found any unused scratch SGPR, reserve the register itself for Exec |
1521 | // copy and there is no need for any spills in that case. |
1522 | MFI->setSGPRForEXECCopy(UnusedScratchReg); |
1523 | MRI.replaceRegWith(FromReg: ReservedRegForExecCopy, ToReg: UnusedScratchReg); |
1524 | LiveUnits.addReg(Reg: UnusedScratchReg); |
1525 | } else { |
1526 | // Needs spill. |
1527 | assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) && |
1528 | "Re-reserving spill slot for EXEC copy register" ); |
1529 | getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: ReservedRegForExecCopy, RC, |
1530 | /*IncludeScratchCopy=*/false); |
1531 | } |
1532 | } else if (ReservedRegForExecCopy) { |
1533 | // Reset it at this point. There are no whole-wave copies and spills |
1534 | // encountered. |
1535 | MFI->setSGPRForEXECCopy(AMDGPU::NoRegister); |
1536 | } |
1537 | |
1538 | // hasFP only knows about stack objects that already exist. We're now |
1539 | // determining the stack slots that will be created, so we have to predict |
1540 | // them. Stack objects force FP usage with calls. |
1541 | // |
1542 | // Note a new VGPR CSR may be introduced if one is used for the spill, but we |
1543 | // don't want to report it here. |
1544 | // |
1545 | // FIXME: Is this really hasReservedCallFrame? |
1546 | const bool WillHaveFP = |
1547 | FrameInfo.hasCalls() && |
1548 | (SavedVGPRs.any() || !allStackObjectsAreDead(MFI: FrameInfo)); |
1549 | |
1550 | if (WillHaveFP || hasFP(MF)) { |
1551 | Register FramePtrReg = MFI->getFrameOffsetReg(); |
1552 | assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) && |
1553 | "Re-reserving spill slot for FP" ); |
1554 | getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: FramePtrReg); |
1555 | } |
1556 | |
1557 | if (TRI->hasBasePointer(MF)) { |
1558 | Register BasePtrReg = TRI->getBaseRegister(); |
1559 | assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) && |
1560 | "Re-reserving spill slot for BP" ); |
1561 | getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: BasePtrReg); |
1562 | } |
1563 | } |
1564 | |
1565 | // Only report VGPRs to generic code. |
1566 | void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, |
1567 | BitVector &SavedVGPRs, |
1568 | RegScavenger *RS) const { |
1569 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1570 | |
1571 | // If this is a function with the amdgpu_cs_chain[_preserve] calling |
1572 | // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then |
1573 | // we don't need to save and restore anything. |
1574 | if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall()) |
1575 | return; |
1576 | |
1577 | MFI->shiftSpillPhysVGPRsToLowestRange(MF); |
1578 | |
1579 | TargetFrameLowering::determineCalleeSaves(MF, SavedRegs&: SavedVGPRs, RS); |
1580 | if (MFI->isEntryFunction()) |
1581 | return; |
1582 | |
1583 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1584 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1585 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1586 | bool NeedExecCopyReservedReg = false; |
1587 | |
1588 | MachineInstr *ReturnMI = nullptr; |
1589 | for (MachineBasicBlock &MBB : MF) { |
1590 | for (MachineInstr &MI : MBB) { |
1591 | // WRITELANE instructions used for SGPR spills can overwrite the inactive |
1592 | // lanes of VGPRs and callee must spill and restore them even if they are |
1593 | // marked Caller-saved. |
1594 | |
1595 | // TODO: Handle this elsewhere at an early point. Walking through all MBBs |
1596 | // here would be a bad heuristic. A better way should be by calling |
1597 | // allocateWWMSpill during the regalloc pipeline whenever a physical |
1598 | // register is allocated for the intended virtual registers. |
1599 | if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) |
1600 | MFI->allocateWWMSpill(MF, VGPR: MI.getOperand(i: 0).getReg()); |
1601 | else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR) |
1602 | MFI->allocateWWMSpill(MF, VGPR: MI.getOperand(i: 1).getReg()); |
1603 | else if (TII->isWWMRegSpillOpcode(Opcode: MI.getOpcode())) |
1604 | NeedExecCopyReservedReg = true; |
1605 | else if (MI.getOpcode() == AMDGPU::SI_RETURN || |
1606 | MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || |
1607 | (MFI->isChainFunction() && |
1608 | TII->isChainCallOpcode(Opcode: MI.getOpcode()))) { |
1609 | // We expect all return to be the same size. |
1610 | assert(!ReturnMI || |
1611 | (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) == |
1612 | count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); }))); |
1613 | ReturnMI = &MI; |
1614 | } |
1615 | } |
1616 | } |
1617 | |
1618 | // Remove any VGPRs used in the return value because these do not need to be saved. |
1619 | // This prevents CSR restore from clobbering return VGPRs. |
1620 | if (ReturnMI) { |
1621 | for (auto &Op : ReturnMI->operands()) { |
1622 | if (Op.isReg()) |
1623 | SavedVGPRs.reset(Idx: Op.getReg()); |
1624 | } |
1625 | } |
1626 | |
1627 | // Ignore the SGPRs the default implementation found. |
1628 | SavedVGPRs.clearBitsNotInMask(Mask: TRI->getAllVectorRegMask()); |
1629 | |
1630 | // Do not save AGPRs prior to GFX90A because there was no easy way to do so. |
1631 | // In gfx908 there was do AGPR loads and stores and thus spilling also |
1632 | // require a temporary VGPR. |
1633 | if (!ST.hasGFX90AInsts()) |
1634 | SavedVGPRs.clearBitsInMask(Mask: TRI->getAllAGPRRegMask()); |
1635 | |
1636 | determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg); |
1637 | |
1638 | // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't |
1639 | // allow the default insertion to handle them. |
1640 | for (auto &Reg : MFI->getWWMSpills()) |
1641 | SavedVGPRs.reset(Idx: Reg.first); |
1642 | |
1643 | // Mark all lane VGPRs as BB LiveIns. |
1644 | for (MachineBasicBlock &MBB : MF) { |
1645 | for (auto &Reg : MFI->getWWMSpills()) |
1646 | MBB.addLiveIn(PhysReg: Reg.first); |
1647 | |
1648 | MBB.sortUniqueLiveIns(); |
1649 | } |
1650 | } |
1651 | |
1652 | void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, |
1653 | BitVector &SavedRegs, |
1654 | RegScavenger *RS) const { |
1655 | TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); |
1656 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1657 | if (MFI->isEntryFunction()) |
1658 | return; |
1659 | |
1660 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1661 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1662 | |
1663 | // The SP is specifically managed and we don't want extra spills of it. |
1664 | SavedRegs.reset(Idx: MFI->getStackPtrOffsetReg()); |
1665 | |
1666 | const BitVector AllSavedRegs = SavedRegs; |
1667 | SavedRegs.clearBitsInMask(Mask: TRI->getAllVectorRegMask()); |
1668 | |
1669 | // We have to anticipate introducing CSR VGPR spills or spill of caller |
1670 | // save VGPR reserved for SGPR spills as we now always create stack entry |
1671 | // for it, if we don't have any stack objects already, since we require a FP |
1672 | // if there is a call and stack. We will allocate a VGPR for SGPR spills if |
1673 | // there are any SGPR spills. Whether they are CSR spills or otherwise. |
1674 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
1675 | const bool WillHaveFP = |
1676 | FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs()); |
1677 | |
1678 | // FP will be specially managed like SP. |
1679 | if (WillHaveFP || hasFP(MF)) |
1680 | SavedRegs.reset(Idx: MFI->getFrameOffsetReg()); |
1681 | |
1682 | // Return address use with return instruction is hidden through the SI_RETURN |
1683 | // pseudo. Given that and since the IPRA computes actual register usage and |
1684 | // does not use CSR list, the clobbering of return address by function calls |
1685 | // (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's register |
1686 | // usage collection. This will ensure save/restore of return address happens |
1687 | // in those scenarios. |
1688 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1689 | Register RetAddrReg = TRI->getReturnAddressReg(MF); |
1690 | if (!MFI->isEntryFunction() && |
1691 | (FrameInfo.hasCalls() || MRI.isPhysRegModified(PhysReg: RetAddrReg))) { |
1692 | SavedRegs.set(TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub0)); |
1693 | SavedRegs.set(TRI->getSubReg(Reg: RetAddrReg, Idx: AMDGPU::sub1)); |
1694 | } |
1695 | } |
1696 | |
1697 | bool SIFrameLowering::assignCalleeSavedSpillSlots( |
1698 | MachineFunction &MF, const TargetRegisterInfo *TRI, |
1699 | std::vector<CalleeSavedInfo> &CSI) const { |
1700 | if (CSI.empty()) |
1701 | return true; // Early exit if no callee saved registers are modified! |
1702 | |
1703 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1704 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1705 | const SIRegisterInfo *RI = ST.getRegisterInfo(); |
1706 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
1707 | Register BasePtrReg = RI->getBaseRegister(); |
1708 | Register SGPRForFPSaveRestoreCopy = |
1709 | FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg); |
1710 | Register SGPRForBPSaveRestoreCopy = |
1711 | FuncInfo->getScratchSGPRCopyDstReg(Reg: BasePtrReg); |
1712 | if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy) |
1713 | return false; |
1714 | |
1715 | unsigned NumModifiedRegs = 0; |
1716 | |
1717 | if (SGPRForFPSaveRestoreCopy) |
1718 | NumModifiedRegs++; |
1719 | if (SGPRForBPSaveRestoreCopy) |
1720 | NumModifiedRegs++; |
1721 | |
1722 | for (auto &CS : CSI) { |
1723 | if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) { |
1724 | CS.setDstReg(SGPRForFPSaveRestoreCopy); |
1725 | if (--NumModifiedRegs) |
1726 | break; |
1727 | } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) { |
1728 | CS.setDstReg(SGPRForBPSaveRestoreCopy); |
1729 | if (--NumModifiedRegs) |
1730 | break; |
1731 | } |
1732 | } |
1733 | |
1734 | return false; |
1735 | } |
1736 | |
1737 | bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( |
1738 | const MachineFunction &MF) const { |
1739 | |
1740 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1741 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1742 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1743 | uint64_t EstStackSize = MFI.estimateStackSize(MF); |
1744 | uint64_t MaxOffset = EstStackSize - 1; |
1745 | |
1746 | // We need the emergency stack slots to be allocated in range of the |
1747 | // MUBUF/flat scratch immediate offset from the base register, so assign these |
1748 | // first at the incoming SP position. |
1749 | // |
1750 | // TODO: We could try sorting the objects to find a hole in the first bytes |
1751 | // rather than allocating as close to possible. This could save a lot of space |
1752 | // on frames with alignment requirements. |
1753 | if (ST.enableFlatScratch()) { |
1754 | if (TII->isLegalFLATOffset(Offset: MaxOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, |
1755 | FlatVariant: SIInstrFlags::FlatScratch)) |
1756 | return false; |
1757 | } else { |
1758 | if (TII->isLegalMUBUFImmOffset(Imm: MaxOffset)) |
1759 | return false; |
1760 | } |
1761 | |
1762 | return true; |
1763 | } |
1764 | |
1765 | MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( |
1766 | MachineFunction &MF, |
1767 | MachineBasicBlock &MBB, |
1768 | MachineBasicBlock::iterator I) const { |
1769 | int64_t Amount = I->getOperand(i: 0).getImm(); |
1770 | if (Amount == 0) |
1771 | return MBB.erase(I); |
1772 | |
1773 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1774 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1775 | const DebugLoc &DL = I->getDebugLoc(); |
1776 | unsigned Opc = I->getOpcode(); |
1777 | bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); |
1778 | uint64_t CalleePopAmount = IsDestroy ? I->getOperand(i: 1).getImm() : 0; |
1779 | |
1780 | if (!hasReservedCallFrame(MF)) { |
1781 | Amount = alignTo(Size: Amount, A: getStackAlign()); |
1782 | assert(isUInt<32>(Amount) && "exceeded stack address space size" ); |
1783 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1784 | Register SPReg = MFI->getStackPtrOffsetReg(); |
1785 | |
1786 | Amount *= getScratchScaleFactor(ST); |
1787 | if (IsDestroy) |
1788 | Amount = -Amount; |
1789 | auto Add = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SPReg) |
1790 | .addReg(RegNo: SPReg) |
1791 | .addImm(Val: Amount); |
1792 | Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
1793 | } else if (CalleePopAmount != 0) { |
1794 | llvm_unreachable("is this used?" ); |
1795 | } |
1796 | |
1797 | return MBB.erase(I); |
1798 | } |
1799 | |
1800 | /// Returns true if the frame will require a reference to the stack pointer. |
1801 | /// |
1802 | /// This is the set of conditions common to setting up the stack pointer in a |
1803 | /// kernel, and for using a frame pointer in a callable function. |
1804 | /// |
1805 | /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm |
1806 | /// references SP. |
1807 | static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) { |
1808 | return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint(); |
1809 | } |
1810 | |
1811 | // The FP for kernels is always known 0, so we never really need to setup an |
1812 | // explicit register for it. However, DisableFramePointerElim will force us to |
1813 | // use a register for it. |
1814 | bool SIFrameLowering::hasFP(const MachineFunction &MF) const { |
1815 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1816 | |
1817 | // For entry & chain functions we can use an immediate offset in most cases, |
1818 | // so the presence of calls doesn't imply we need a distinct frame pointer. |
1819 | if (MFI.hasCalls() && |
1820 | !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() && |
1821 | !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) { |
1822 | // All offsets are unsigned, so need to be addressed in the same direction |
1823 | // as stack growth. |
1824 | |
1825 | // FIXME: This function is pretty broken, since it can be called before the |
1826 | // frame layout is determined or CSR spills are inserted. |
1827 | return MFI.getStackSize() != 0; |
1828 | } |
1829 | |
1830 | return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || |
1831 | MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( |
1832 | MF) || |
1833 | MF.getTarget().Options.DisableFramePointerElim(MF); |
1834 | } |
1835 | |
1836 | // This is essentially a reduced version of hasFP for entry functions. Since the |
1837 | // stack pointer is known 0 on entry to kernels, we never really need an FP |
1838 | // register. We may need to initialize the stack pointer depending on the frame |
1839 | // properties, which logically overlaps many of the cases where an ordinary |
1840 | // function would require an FP. |
1841 | // Also used for chain functions. While not technically entry functions, chain |
1842 | // functions may need to set up a stack pointer in some situations. |
1843 | bool SIFrameLowering::requiresStackPointerReference( |
1844 | const MachineFunction &MF) const { |
1845 | // Callable functions always require a stack pointer reference. |
1846 | assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() || |
1847 | MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) && |
1848 | "only expected to call this for entry points and chain functions" ); |
1849 | |
1850 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1851 | |
1852 | // Entry points ordinarily don't need to initialize SP. We have to set it up |
1853 | // for callees if there are any. Also note tail calls are impossible/don't |
1854 | // make any sense for kernels. |
1855 | if (MFI.hasCalls()) |
1856 | return true; |
1857 | |
1858 | // We still need to initialize the SP if we're doing anything weird that |
1859 | // references the SP, like variable sized stack objects. |
1860 | return frameTriviallyRequiresSP(MFI); |
1861 | } |
1862 | |