1 | //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// SI implementation of the TargetRegisterInfo class. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "AMDGPU.h" |
15 | #include "AMDGPURegisterBankInfo.h" |
16 | #include "GCNSubtarget.h" |
17 | #include "MCTargetDesc/AMDGPUInstPrinter.h" |
18 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
19 | #include "SIMachineFunctionInfo.h" |
20 | #include "SIRegisterInfo.h" |
21 | #include "llvm/CodeGen/LiveIntervals.h" |
22 | #include "llvm/CodeGen/LiveRegUnits.h" |
23 | #include "llvm/CodeGen/MachineDominators.h" |
24 | #include "llvm/CodeGen/MachineFrameInfo.h" |
25 | #include "llvm/CodeGen/RegisterScavenging.h" |
26 | |
27 | using namespace llvm; |
28 | |
29 | #define GET_REGINFO_TARGET_DESC |
30 | #include "AMDGPUGenRegisterInfo.inc" |
31 | |
static cl::opt<bool> EnableSpillSGPRToVGPR(
    "amdgpu-spill-sgpr-to-vgpr",
    cl::desc("Enable spilling SGPRs to VGPRs"),
    cl::ReallyHidden,
    cl::init(true));
37 | |
38 | std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts; |
39 | std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable; |
40 | |
41 | // Map numbers of DWORDs to indexes in SubRegFromChannelTable. |
// Valid indexes are shifted by 1, such that a 0 mapping means unsupported.
43 | // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8, |
44 | // meaning index 7 in SubRegFromChannelTable. |
45 | static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = { |
46 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9}; |
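// For example, a 2-DWORD (64-bit) access maps to
// SubRegFromChannelTableWidthMap[2] == 2, i.e. row 1 of SubRegFromChannelTable,
// which getSubRegFromChannel() below indexes by the starting channel to recover
// sub-register indices such as sub0_sub1 or sub2_sub3.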
47 | |
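// Emit a DiagnosticInfoUnsupported for the function containing MI, attaching
// MI's debug location so the error points at the offending instruction.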
static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
                                 const Twine &ErrMsg) {
  Fn.getContext().diagnose(
      DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
}
53 | |
54 | namespace llvm { |
55 | |
56 | // A temporary struct to spill SGPRs. |
57 | // This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits |
58 | // just v_writelane and v_readlane. |
59 | // |
60 | // When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR |
61 | // is saved to scratch (or the other way around for loads). |
62 | // For this, a VGPR is required where the needed lanes can be clobbered. The |
63 | // RegScavenger can provide a VGPR where currently active lanes can be |
64 | // clobbered, but we still need to save inactive lanes. |
65 | // The high-level steps are: |
66 | // - Try to scavenge SGPR(s) to save exec |
67 | // - Try to scavenge VGPR |
68 | // - Save needed, all or inactive lanes of a TmpVGPR |
69 | // - Spill/Restore SGPRs using TmpVGPR |
70 | // - Restore TmpVGPR |
71 | // |
72 | // To save all lanes of TmpVGPR, exec needs to be saved and modified. If we |
73 | // cannot scavenge temporary SGPRs to save exec, we use the following code: |
74 | // buffer_store_dword TmpVGPR ; only if active lanes need to be saved |
75 | // s_not exec, exec |
76 | // buffer_store_dword TmpVGPR ; save inactive lanes |
77 | // s_not exec, exec |
78 | struct SGPRSpillBuilder { |
79 | struct PerVGPRData { |
80 | unsigned PerVGPR; |
81 | unsigned NumVGPRs; |
82 | int64_t VGPRLanes; |
83 | }; |
84 | |
85 | // The SGPR to save |
86 | Register SuperReg; |
87 | MachineBasicBlock::iterator MI; |
88 | ArrayRef<int16_t> SplitParts; |
89 | unsigned NumSubRegs; |
90 | bool IsKill; |
91 | const DebugLoc &DL; |
92 | |
93 | /* When spilling to stack */ |
94 | // The SGPRs are written into this VGPR, which is then written to scratch |
95 | // (or vice versa for loads). |
96 | Register TmpVGPR = AMDGPU::NoRegister; |
97 | // Temporary spill slot to save TmpVGPR to. |
98 | int TmpVGPRIndex = 0; |
99 | // If TmpVGPR is live before the spill or if it is scavenged. |
100 | bool TmpVGPRLive = false; |
101 | // Scavenged SGPR to save EXEC. |
102 | Register SavedExecReg = AMDGPU::NoRegister; |
103 | // Stack index to write the SGPRs to. |
104 | int Index; |
105 | unsigned EltSize = 4; |
106 | |
107 | RegScavenger *RS; |
108 | MachineBasicBlock *MBB; |
109 | MachineFunction &MF; |
110 | SIMachineFunctionInfo &MFI; |
111 | const SIInstrInfo &TII; |
112 | const SIRegisterInfo &TRI; |
113 | bool IsWave32; |
114 | Register ExecReg; |
115 | unsigned MovOpc; |
116 | unsigned NotOpc; |
117 | |
118 | SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, |
119 | bool IsWave32, MachineBasicBlock::iterator MI, int Index, |
120 | RegScavenger *RS) |
      : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
                         MI->getOperand(0).isKill(), Index, RS) {}
123 | |
124 | SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, |
125 | bool IsWave32, MachineBasicBlock::iterator MI, Register Reg, |
126 | bool IsKill, int Index, RegScavenger *RS) |
127 | : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()), |
128 | Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()), |
129 | MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), |
130 | IsWave32(IsWave32) { |
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
132 | SplitParts = TRI.getRegSplitParts(RC, EltSize); |
133 | NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); |
134 | |
135 | if (IsWave32) { |
136 | ExecReg = AMDGPU::EXEC_LO; |
137 | MovOpc = AMDGPU::S_MOV_B32; |
138 | NotOpc = AMDGPU::S_NOT_B32; |
139 | } else { |
140 | ExecReg = AMDGPU::EXEC; |
141 | MovOpc = AMDGPU::S_MOV_B64; |
142 | NotOpc = AMDGPU::S_NOT_B64; |
143 | } |
144 | |
    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
    assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
           SuperReg != AMDGPU::EXEC && "exec should never spill");
148 | } |
149 | |
150 | PerVGPRData getPerVGPRData() { |
151 | PerVGPRData Data; |
152 | Data.PerVGPR = IsWave32 ? 32 : 64; |
153 | Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR; |
    Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
155 | return Data; |
156 | } |
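  // For example, spilling a 16-DWORD SGPR tuple (e.g. an SGPR_512) in wave64
  // gives PerVGPR = 64, NumVGPRs = (16 + 63) / 64 = 1 and
  // VGPRLanes = (1 << 16) - 1 = 0xffff, i.e. only the low 16 lanes of the
  // temporary VGPR are touched.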
157 | |
158 | // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is |
159 | // free. |
160 | // Writes these instructions if an SGPR can be scavenged: |
161 | // s_mov_b64 s[6:7], exec ; Save exec |
162 | // s_mov_b64 exec, 3 ; Wanted lanemask |
163 | // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot |
164 | // |
165 | // Writes these instructions if no SGPR can be scavenged: |
166 | // buffer_store_dword v0 ; Only if no free VGPR was found |
167 | // s_not_b64 exec, exec |
168 | // buffer_store_dword v0 ; Save inactive lanes |
169 | // ; exec stays inverted, it is flipped back in |
170 | // ; restore. |
171 | void prepare() { |
    // Scavenged temporary VGPR to use. It must be scavenged once for any number
    // of spilled subregs.
    // FIXME: The liveness analysis is limited and does not tell if a register
    // is in use in lanes that are currently inactive. We can never be sure if
    // a register is actually in use in another lane, so we need to save all
    // used lanes of the chosen VGPR.
    assert(RS && "Cannot spill SGPR to memory without RegScavenger");
    TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
                                            0, false);
181 | |
182 | // Reserve temporary stack slot |
    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
184 | if (TmpVGPR) { |
185 | // Found a register that is dead in the currently active lanes, we only |
186 | // need to spill inactive lanes. |
187 | TmpVGPRLive = false; |
188 | } else { |
189 | // Pick v0 because it doesn't make a difference. |
190 | TmpVGPR = AMDGPU::VGPR0; |
191 | TmpVGPRLive = true; |
192 | } |
193 | |
194 | if (TmpVGPRLive) { |
195 | // We need to inform the scavenger that this index is already in use until |
196 | // we're done with the custom emergency spill. |
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
198 | } |
199 | |
200 | // We may end up recursively calling the scavenger, and don't want to re-use |
201 | // the same register. |
    RS->setRegUsed(TmpVGPR);
203 | |
204 | // Try to scavenge SGPRs to save exec |
    assert(!SavedExecReg && "Exec is already saved, refuse to save again");
    const TargetRegisterClass &RC =
        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    RS->setRegUsed(SuperReg);
    SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
210 | |
211 | int64_t VGPRLanes = getPerVGPRData().VGPRLanes; |
212 | |
    if (SavedExecReg) {
      RS->setRegUsed(SavedExecReg);
      // Set exec to needed lanes
      BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
      auto I =
          BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    } else {
      // The modify and restore of exec clobber SCC, which we would have to save
      // and restore. FIXME: We probably would need to reserve a register for
      // this.
      if (RS->isRegUsed(AMDGPU::SCC))
        emitUnsupportedError(MF.getFunction(), *MI,
                             "unhandled SGPR spill to memory");

      // Spill active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
                                    /*IsKill*/ false);
      // Spill inactive lanes
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    }
242 | } |
243 | |
244 | // Writes these instructions if an SGPR can be scavenged: |
  // buffer_load_dword v1   ; Reload scavenged VGPR from emergency slot
  // s_waitcnt vmcnt(0)     ; If a free VGPR was found
  // s_mov_b64 exec, s[6:7] ; Restore exec
248 | // |
249 | // Writes these instructions if no SGPR can be scavenged: |
250 | // buffer_load_dword v0 ; Restore inactive lanes |
251 | // s_waitcnt vmcnt(0) ; If a free VGPR was found |
252 | // s_not_b64 exec, exec |
253 | // buffer_load_dword v0 ; Only if no free VGPR was found |
254 | void restore() { |
255 | if (SavedExecReg) { |
256 | // Restore used lanes |
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      // Restore exec
      auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
                   .addReg(SavedExecReg, RegState::Kill);
      // Add an implicit use of the load so it is not dead.
      // FIXME: This inserts an unnecessary waitcnt
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
266 | } |
267 | } else { |
268 | // Restore inactive lanes |
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.

      // Restore active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
    }

    // Inform the scavenger where we're releasing our custom scavenged register.
    if (TmpVGPRLive) {
      MachineBasicBlock::iterator RestorePt = std::prev(MI);
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
285 | } |
286 | } |
287 | |
288 | // Write TmpVGPR to memory or read TmpVGPR from memory. |
289 | // Either using a single buffer_load/store if exec is set to the needed mask |
290 | // or using |
291 | // buffer_load |
292 | // s_not exec, exec |
293 | // buffer_load |
294 | // s_not exec, exec |
295 | void readWriteTmpVGPR(unsigned Offset, bool IsLoad) { |
    if (SavedExecReg) {
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
    } else {
      // The modify and restore of exec clobber SCC, which we would have to save
      // and restore. FIXME: We probably would need to reserve a register for
      // this.
      if (RS->isRegUsed(AMDGPU::SCC))
        emitUnsupportedError(MF.getFunction(), *MI,
                             "unhandled SGPR spill to memory");

      // Spill active lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
                                  /*IsKill*/ false);
      // Spill inactive lanes
      auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
      auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
316 | } |
317 | } |
318 | |
319 | void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) { |
320 | assert(MBB->getParent() == &MF); |
321 | MI = NewMI; |
322 | MBB = NewMBB; |
323 | } |
324 | }; |
325 | |
326 | } // namespace llvm |
327 | |
328 | SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) |
329 | : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(), |
330 | ST.getAMDGPUDwarfFlavour(), |
331 | /*PC=*/0, ST.getHwMode()), |
332 | ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { |
333 | |
  assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
         getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
         (getSubRegIndexLaneMask(AMDGPU::lo16) |
          getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
             getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
         "getNumCoveredRegs() will not work with generated subreg masks!");

  RegPressureIgnoredUnits.resize(getNumRegUnits());
  RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
  for (auto Reg : AMDGPU::VGPR_16RegClass) {
    if (AMDGPU::isHi16Reg(Reg, *this))
      RegPressureIgnoredUnits.set(*regunits(Reg).begin());
  }
346 | } |
347 | |
348 | // HACK: Until this is fully tablegen'd. |
349 | static llvm::once_flag InitializeRegSplitPartsFlag; |
350 | |
351 | static auto InitializeRegSplitPartsOnce = [this]() { |
352 | for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) { |
353 | unsigned Size = getSubRegIdxSize(Idx); |
354 | if (Size & 15) |
355 | continue; |
356 | std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1]; |
357 | unsigned Pos = getSubRegIdxOffset(Idx); |
358 | if (Pos % Size) |
359 | continue; |
360 | Pos /= Size; |
361 | if (Vec.empty()) { |
362 | unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits. |
        Vec.resize(MaxNumParts);
364 | } |
365 | Vec[Pos] = Idx; |
366 | } |
367 | }; |
368 | |
369 | static llvm::once_flag InitializeSubRegFromChannelTableFlag; |
370 | |
371 | static auto InitializeSubRegFromChannelTableOnce = [this]() { |
372 | for (auto &Row : SubRegFromChannelTable) |
      Row.fill(AMDGPU::NoSubRegister);
374 | for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) { |
375 | unsigned Width = getSubRegIdxSize(Idx) / 32; |
376 | unsigned Offset = getSubRegIdxOffset(Idx) / 32; |
377 | assert(Width < SubRegFromChannelTableWidthMap.size()); |
378 | Width = SubRegFromChannelTableWidthMap[Width]; |
379 | if (Width == 0) |
380 | continue; |
381 | unsigned TableIdx = Width - 1; |
382 | assert(TableIdx < SubRegFromChannelTable.size()); |
383 | assert(Offset < SubRegFromChannelTable[TableIdx].size()); |
384 | SubRegFromChannelTable[TableIdx][Offset] = Idx; |
385 | } |
386 | }; |
387 | |
  llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
  llvm::call_once(InitializeSubRegFromChannelTableFlag,
                  InitializeSubRegFromChannelTableOnce);
391 | } |
392 | |
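// Mark Reg and every register aliasing it (its sub-registers, super-registers
// and Reg itself) as reserved; e.g. reserving AMDGPU::EXEC also reserves
// EXEC_LO and EXEC_HI.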
393 | void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, |
394 | MCRegister Reg) const { |
395 | for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R) |
396 | Reserved.set(*R); |
397 | } |
398 | |
399 | // Forced to be here by one .inc |
400 | const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( |
401 | const MachineFunction *MF) const { |
402 | CallingConv::ID CC = MF->getFunction().getCallingConv(); |
403 | switch (CC) { |
404 | case CallingConv::C: |
405 | case CallingConv::Fast: |
406 | case CallingConv::Cold: |
407 | return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList |
408 | : CSR_AMDGPU_SaveList; |
409 | case CallingConv::AMDGPU_Gfx: |
410 | return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList |
411 | : CSR_AMDGPU_SI_Gfx_SaveList; |
412 | case CallingConv::AMDGPU_CS_ChainPreserve: |
413 | return CSR_AMDGPU_CS_ChainPreserve_SaveList; |
414 | default: { |
415 | // Dummy to not crash RegisterClassInfo. |
416 | static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; |
417 | return &NoCalleeSavedReg; |
418 | } |
419 | } |
420 | } |
421 | |
422 | const MCPhysReg * |
423 | SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { |
424 | return nullptr; |
425 | } |
426 | |
427 | const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, |
428 | CallingConv::ID CC) const { |
429 | switch (CC) { |
430 | case CallingConv::C: |
431 | case CallingConv::Fast: |
432 | case CallingConv::Cold: |
433 | return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask |
434 | : CSR_AMDGPU_RegMask; |
435 | case CallingConv::AMDGPU_Gfx: |
436 | return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask |
437 | : CSR_AMDGPU_SI_Gfx_RegMask; |
438 | case CallingConv::AMDGPU_CS_Chain: |
439 | case CallingConv::AMDGPU_CS_ChainPreserve: |
440 | // Calls to these functions never return, so we can pretend everything is |
441 | // preserved. |
442 | return AMDGPU_AllVGPRs_RegMask; |
443 | default: |
444 | return nullptr; |
445 | } |
446 | } |
447 | |
448 | const uint32_t *SIRegisterInfo::getNoPreservedMask() const { |
449 | return CSR_AMDGPU_NoRegs_RegMask; |
450 | } |
451 | |
452 | bool SIRegisterInfo::isChainScratchRegister(Register VGPR) { |
453 | return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8; |
454 | } |
455 | |
456 | const TargetRegisterClass * |
457 | SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, |
458 | const MachineFunction &MF) const { |
  // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
  // equivalent AV class. If such a helper were used here, the verifier would
  // crash after RegBankSelect in the GISel flow, since the aligned register
  // classes are not fully assigned until instruction selection.
463 | if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) { |
464 | if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass) |
465 | return &AMDGPU::AV_32RegClass; |
466 | if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass) |
467 | return &AMDGPU::AV_64RegClass; |
468 | if (RC == &AMDGPU::VReg_64_Align2RegClass || |
469 | RC == &AMDGPU::AReg_64_Align2RegClass) |
470 | return &AMDGPU::AV_64_Align2RegClass; |
471 | if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass) |
472 | return &AMDGPU::AV_96RegClass; |
473 | if (RC == &AMDGPU::VReg_96_Align2RegClass || |
474 | RC == &AMDGPU::AReg_96_Align2RegClass) |
475 | return &AMDGPU::AV_96_Align2RegClass; |
476 | if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass) |
477 | return &AMDGPU::AV_128RegClass; |
478 | if (RC == &AMDGPU::VReg_128_Align2RegClass || |
479 | RC == &AMDGPU::AReg_128_Align2RegClass) |
480 | return &AMDGPU::AV_128_Align2RegClass; |
481 | if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass) |
482 | return &AMDGPU::AV_160RegClass; |
483 | if (RC == &AMDGPU::VReg_160_Align2RegClass || |
484 | RC == &AMDGPU::AReg_160_Align2RegClass) |
485 | return &AMDGPU::AV_160_Align2RegClass; |
486 | if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass) |
487 | return &AMDGPU::AV_192RegClass; |
488 | if (RC == &AMDGPU::VReg_192_Align2RegClass || |
489 | RC == &AMDGPU::AReg_192_Align2RegClass) |
490 | return &AMDGPU::AV_192_Align2RegClass; |
491 | if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass) |
492 | return &AMDGPU::AV_256RegClass; |
493 | if (RC == &AMDGPU::VReg_256_Align2RegClass || |
494 | RC == &AMDGPU::AReg_256_Align2RegClass) |
495 | return &AMDGPU::AV_256_Align2RegClass; |
496 | if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass) |
497 | return &AMDGPU::AV_512RegClass; |
498 | if (RC == &AMDGPU::VReg_512_Align2RegClass || |
499 | RC == &AMDGPU::AReg_512_Align2RegClass) |
500 | return &AMDGPU::AV_512_Align2RegClass; |
501 | if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass) |
502 | return &AMDGPU::AV_1024RegClass; |
503 | if (RC == &AMDGPU::VReg_1024_Align2RegClass || |
504 | RC == &AMDGPU::AReg_1024_Align2RegClass) |
505 | return &AMDGPU::AV_1024_Align2RegClass; |
506 | } |
507 | |
508 | return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF); |
509 | } |
510 | |
511 | Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { |
512 | const SIFrameLowering *TFI = ST.getFrameLowering(); |
513 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
514 | |
515 | // During ISel lowering we always reserve the stack pointer in entry and chain |
516 | // functions, but never actually want to reference it when accessing our own |
517 | // frame. If we need a frame pointer we use it, but otherwise we can just use |
518 | // an immediate "0" which we represent by returning NoRegister. |
519 | if (FuncInfo->isBottomOfStack()) { |
520 | return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register(); |
521 | } |
522 | return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() |
523 | : FuncInfo->getStackPtrOffsetReg(); |
524 | } |
525 | |
526 | bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { |
527 | // When we need stack realignment, we can't reference off of the |
528 | // stack pointer, so we reserve a base pointer. |
529 | return shouldRealignStack(MF); |
530 | } |
531 | |
532 | Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } |
533 | |
534 | const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { |
535 | return AMDGPU_AllVGPRs_RegMask; |
536 | } |
537 | |
538 | const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const { |
539 | return AMDGPU_AllAGPRs_RegMask; |
540 | } |
541 | |
542 | const uint32_t *SIRegisterInfo::getAllVectorRegMask() const { |
543 | return AMDGPU_AllVectorRegs_RegMask; |
544 | } |
545 | |
546 | const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { |
547 | return AMDGPU_AllAllocatableSRegs_RegMask; |
548 | } |
549 | |
550 | unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel, |
551 | unsigned NumRegs) { |
552 | assert(NumRegs < SubRegFromChannelTableWidthMap.size()); |
553 | unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs]; |
  assert(NumRegIndex && "Not implemented");
555 | assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size()); |
556 | return SubRegFromChannelTable[NumRegIndex - 1][Channel]; |
557 | } |
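// Example uses of getSubRegFromChannel(): (0, 1) yields sub0, while (2, 2)
// yields the 64-bit sub2_sub3 index, since row 1 of SubRegFromChannelTable is
// indexed by the starting channel.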
558 | |
559 | MCRegister |
560 | SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF, |
561 | const unsigned Align, |
562 | const TargetRegisterClass *RC) const { |
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
  MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
566 | } |
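// For instance, with 102 addressable SGPRs and Align = 4 the base index is
// alignDown(102, 4) - 4 = 96, so asking for an SGPR_128 class returns the
// aligned high tuple s[96:99].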
567 | |
568 | MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( |
569 | const MachineFunction &MF) const { |
  return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
571 | } |
572 | |
573 | std::pair<unsigned, unsigned> |
574 | SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const { |
575 | const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF); |
576 | |
577 | unsigned MaxNumVGPRs = MaxVectorRegs; |
578 | unsigned MaxNumAGPRs = 0; |
579 | |
580 | // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, |
581 | // a wave may have up to 512 total vector registers combining together both |
582 | // VGPRs and AGPRs. Hence, in an entry function without calls and without |
583 | // AGPRs used within it, it is possible to use the whole vector register |
584 | // budget for VGPRs. |
585 | // |
586 | // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split |
587 | // register file accordingly. |
588 | if (ST.hasGFX90AInsts()) { |
589 | unsigned MinNumAGPRs = 0; |
590 | const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs(); |
591 | const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); |
592 | |
593 | const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u}; |
594 | |
595 | // TODO: Move this logic into subtarget on IR function |
596 | // |
597 | // TODO: The lower bound should probably force the number of required |
598 | // registers up, overriding amdgpu-waves-per-eu. |
    std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute(
        MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR,
        /*OnlyFirstRequired=*/true);
602 | |
603 | if (MinNumAGPRs == DefaultNumAGPR.first) { |
604 | // Default to splitting half the registers if AGPRs are required. |
605 | MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2; |
606 | } else { |
607 | // Align to accum_offset's allocation granularity. |
      MinNumAGPRs = alignTo(MinNumAGPRs, 4);

      MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
611 | } |
612 | |
613 | // Clamp values to be inbounds of our limits, and ensure min <= max. |
614 | |
    MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
    MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);

    MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
    MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
620 | |
    assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
           MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
           "invalid register counts");
624 | } else if (ST.hasMAIInsts()) { |
625 | // On gfx908 the number of AGPRs always equals the number of VGPRs. |
626 | MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs; |
627 | } |
628 | |
629 | return std::pair(MaxNumVGPRs, MaxNumAGPRs); |
630 | } |
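// Rough example, assuming a gfx90a-style target with a combined budget of 512
// vector registers: with no "amdgpu-agpr-alloc" attribute the split above
// defaults to 256 VGPRs and 256 AGPRs.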
631 | |
632 | BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { |
633 | BitVector Reserved(getNumRegs()); |
634 | Reserved.set(AMDGPU::MODE); |
635 | |
636 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
637 | |
638 | // Reserve special purpose registers. |
639 | // |
  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);
647 | |
648 | // Reserve src_vccz, src_execz, src_scc. |
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve async counters pseudo registers
  reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
  reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
686 | |
687 | // Reserve SGPRs. |
688 | // |
689 | unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); |
690 | unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); |
691 | for (const TargetRegisterClass *RC : regclasses()) { |
692 | if (RC->isBaseClass() && isSGPRClass(RC)) { |
      unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
694 | for (MCPhysReg Reg : *RC) { |
695 | unsigned Index = getHWRegIndex(Reg); |
696 | if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs) |
697 | Reserved.set(Reg); |
698 | } |
699 | } |
700 | } |
701 | |
702 | Register ScratchRSrcReg = MFI->getScratchRSrcReg(); |
703 | if (ScratchRSrcReg != AMDGPU::NoRegister) { |
704 | // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we |
705 | // need to spill. |
706 | // TODO: May need to reserve a VGPR if doing LDS spilling. |
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
708 | } |
709 | |
710 | Register LongBranchReservedReg = MFI->getLongBranchReservedReg(); |
711 | if (LongBranchReservedReg) |
    reserveRegisterTuples(Reserved, LongBranchReservedReg);
713 | |
714 | // We have to assume the SP is needed in case there are calls in the function, |
715 | // which is detected after the function is lowered. If we aren't really going |
716 | // to need SP, don't bother reserving it. |
717 | MCRegister StackPtrReg = MFI->getStackPtrOffsetReg(); |
718 | if (StackPtrReg) { |
    reserveRegisterTuples(Reserved, StackPtrReg);
720 | assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); |
721 | } |
722 | |
723 | MCRegister FrameReg = MFI->getFrameOffsetReg(); |
724 | if (FrameReg) { |
    reserveRegisterTuples(Reserved, FrameReg);
726 | assert(!isSubRegister(ScratchRSrcReg, FrameReg)); |
727 | } |
728 | |
729 | if (hasBasePointer(MF)) { |
730 | MCRegister BasePtrReg = getBaseRegister(); |
    reserveRegisterTuples(Reserved, BasePtrReg);
732 | assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); |
733 | } |
734 | |
735 | // FIXME: Use same reserved register introduced in D149775 |
736 | // SGPR used to preserve EXEC MASK around WWM spill/copy instructions. |
737 | Register ExecCopyReg = MFI->getSGPRForEXECCopy(); |
738 | if (ExecCopyReg) |
    reserveRegisterTuples(Reserved, ExecCopyReg);
740 | |
741 | // Reserve VGPRs/AGPRs. |
742 | // |
743 | auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF); |
744 | |
745 | for (const TargetRegisterClass *RC : regclasses()) { |
746 | if (RC->isBaseClass() && isVGPRClass(RC)) { |
      unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
748 | for (MCPhysReg Reg : *RC) { |
749 | unsigned Index = getHWRegIndex(Reg); |
750 | if (Index + NumRegs > MaxNumVGPRs) |
751 | Reserved.set(Reg); |
752 | } |
753 | } |
754 | } |
755 | |
  // Reserve all the AGPRs if there are no instructions to use them.
757 | if (!ST.hasMAIInsts()) |
758 | MaxNumAGPRs = 0; |
759 | for (const TargetRegisterClass *RC : regclasses()) { |
760 | if (RC->isBaseClass() && isAGPRClass(RC)) { |
      unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
762 | for (MCPhysReg Reg : *RC) { |
763 | unsigned Index = getHWRegIndex(Reg); |
764 | if (Index + NumRegs > MaxNumAGPRs) |
765 | Reserved.set(Reg); |
766 | } |
767 | } |
768 | } |
769 | |
770 | // On GFX908, in order to guarantee copying between AGPRs, we need a scratch |
771 | // VGPR available at all times. |
772 | if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) { |
    reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
774 | } |
775 | |
  // During wwm-regalloc, reserve the registers for per-lane VGPR allocation.
  // The MFI->getNonWWMRegMask() field will have a valid bitmask only during
  // wwm-regalloc and is empty otherwise.
779 | BitVector NonWWMRegMask = MFI->getNonWWMRegMask(); |
780 | if (!NonWWMRegMask.empty()) { |
781 | for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs; |
782 | RegI < RegE; ++RegI) { |
      if (NonWWMRegMask.test(RegI))
        reserveRegisterTuples(Reserved, RegI);
785 | } |
786 | } |
787 | |
788 | for (Register Reg : MFI->getWWMReservedRegs()) |
789 | reserveRegisterTuples(Reserved, Reg); |
790 | |
791 | // FIXME: Stop using reserved registers for this. |
792 | for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) |
793 | reserveRegisterTuples(Reserved, Reg); |
794 | |
795 | for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) |
796 | reserveRegisterTuples(Reserved, Reg); |
797 | |
798 | return Reserved; |
799 | } |
800 | |
801 | bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF, |
802 | MCRegister PhysReg) const { |
803 | return !MF.getRegInfo().isReserved(PhysReg); |
804 | } |
805 | |
806 | bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const { |
807 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
808 | // On entry or in chain functions, the base address is 0, so it can't possibly |
809 | // need any more alignment. |
810 | |
811 | // FIXME: Should be able to specify the entry frame alignment per calling |
812 | // convention instead. |
813 | if (Info->isBottomOfStack()) |
814 | return false; |
815 | |
816 | return TargetRegisterInfo::shouldRealignStack(MF); |
817 | } |
818 | |
819 | bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { |
820 | const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); |
821 | if (Info->isEntryFunction()) { |
822 | const MachineFrameInfo &MFI = Fn.getFrameInfo(); |
823 | return MFI.hasStackObjects() || MFI.hasCalls(); |
824 | } |
825 | |
826 | // May need scavenger for dealing with callee saved registers. |
827 | return true; |
828 | } |
829 | |
830 | bool SIRegisterInfo::requiresFrameIndexScavenging( |
831 | const MachineFunction &MF) const { |
832 | // Do not use frame virtual registers. They used to be used for SGPRs, but |
833 | // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the |
834 | // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a |
835 | // spill. |
836 | return false; |
837 | } |
838 | |
839 | bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( |
840 | const MachineFunction &MF) const { |
841 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
842 | return MFI.hasStackObjects(); |
843 | } |
844 | |
845 | bool SIRegisterInfo::requiresVirtualBaseRegisters( |
846 | const MachineFunction &) const { |
847 | // There are no special dedicated stack or frame pointers. |
848 | return true; |
849 | } |
850 | |
851 | int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const { |
852 | assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI)); |
853 | |
  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
857 | } |
858 | |
859 | int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, |
860 | int Idx) const { |
861 | switch (MI->getOpcode()) { |
862 | case AMDGPU::V_ADD_U32_e32: |
863 | case AMDGPU::V_ADD_U32_e64: |
864 | case AMDGPU::V_ADD_CO_U32_e32: { |
865 | int OtherIdx = Idx == 1 ? 2 : 1; |
    const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
867 | return OtherOp.isImm() ? OtherOp.getImm() : 0; |
868 | } |
869 | case AMDGPU::V_ADD_CO_U32_e64: { |
870 | int OtherIdx = Idx == 2 ? 3 : 2; |
    const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
872 | return OtherOp.isImm() ? OtherOp.getImm() : 0; |
873 | } |
874 | default: |
875 | break; |
876 | } |
877 | |
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
879 | return 0; |
880 | |
  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::vaddr) ||
          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                             AMDGPU::OpName::saddr))) &&
         "Should never see frame index on non-address operand");
886 | |
887 | return getScratchInstrOffset(MI); |
888 | } |
889 | |
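// Check whether MI is an add of a frame index and either an immediate or a
// VGPR; used below to decide if the frame index operand can be folded without
// violating constant-bus restrictions.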
890 | static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI, |
891 | const MachineInstr &MI) { |
892 | assert(MI.getDesc().isAdd()); |
  const MachineOperand &Src0 = MI.getOperand(1);
  const MachineOperand &Src1 = MI.getOperand(2);

  if (Src0.isFI()) {
    return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
                                                       Src1.getReg()));
  }

  if (Src1.isFI()) {
    return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
                                                       Src0.getReg()));
904 | } |
905 | |
906 | return false; |
907 | } |
908 | |
909 | bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { |
910 | // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes. |
911 | switch (MI->getOpcode()) { |
912 | case AMDGPU::V_ADD_U32_e32: { |
913 | // TODO: We could handle this but it requires work to avoid violating |
914 | // operand restrictions. |
    if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
        !isFIPlusImmOrVGPR(*this, *MI))
917 | return false; |
918 | [[fallthrough]]; |
919 | } |
920 | case AMDGPU::V_ADD_U32_e64: |
    // FIXME: This optimization is barely profitable with enableFlatScratch as-is.
922 | // |
923 | // Much of the benefit with the MUBUF handling is we avoid duplicating the |
924 | // shift of the frame register, which isn't needed with scratch. |
925 | // |
926 | // materializeFrameBaseRegister doesn't know the register classes of the |
927 | // uses, and unconditionally uses an s_add_i32, which will end up using a |
928 | // copy for the vector uses. |
929 | return !ST.enableFlatScratch(); |
930 | case AMDGPU::V_ADD_CO_U32_e32: |
    if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
        !isFIPlusImmOrVGPR(*this, *MI))
933 | return false; |
    // We can't deal with the case where the carry out has a use (though this
    // should never happen).
    return MI->getOperand(3).isDead();
937 | case AMDGPU::V_ADD_CO_U32_e64: |
938 | // TODO: Should we check use_empty instead? |
    return MI->getOperand(1).isDead();
940 | default: |
941 | break; |
942 | } |
943 | |
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
945 | return false; |
946 | |
947 | int64_t FullOffset = Offset + getScratchInstrOffset(MI); |
948 | |
949 | const SIInstrInfo *TII = ST.getInstrInfo(); |
  if (SIInstrInfo::isMUBUF(*MI))
    return !TII->isLegalMUBUFImmOffset(FullOffset);

  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                 SIInstrFlags::FlatScratch);
955 | } |
956 | |
957 | Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, |
958 | int FrameIdx, |
959 | int64_t Offset) const { |
960 | MachineBasicBlock::iterator Ins = MBB->begin(); |
961 | DebugLoc DL; // Defaults to "unknown" |
962 | |
963 | if (Ins != MBB->end()) |
964 | DL = Ins->getDebugLoc(); |
965 | |
966 | MachineFunction *MF = MBB->getParent(); |
967 | const SIInstrInfo *TII = ST.getInstrInfo(); |
968 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
969 | unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32 |
970 | : AMDGPU::V_MOV_B32_e32; |
971 | |
  Register BaseReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
                             : &AMDGPU::VGPR_32RegClass);

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
        .addFrameIndex(FrameIdx);
    return BaseReg;
  }

  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register FIReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
                             : &AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
      .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
      .addFrameIndex(FrameIdx);

  if (ST.enableFlatScratch()) {
    // FIXME: Make sure scc isn't live in.
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
        .addReg(OffsetReg, RegState::Kill)
        .addReg(FIReg)
        .setOperandDead(3); // scc
999 | return BaseReg; |
1000 | } |
1001 | |
  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
      .addReg(OffsetReg, RegState::Kill)
      .addReg(FIReg)
      .addImm(0); // clamp bit
1006 | |
1007 | return BaseReg; |
1008 | } |
1009 | |
1010 | void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, |
1011 | int64_t Offset) const { |
1012 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1013 | |
1014 | switch (MI.getOpcode()) { |
1015 | case AMDGPU::V_ADD_U32_e32: |
1016 | case AMDGPU::V_ADD_CO_U32_e32: { |
    MachineOperand *FIOp = &MI.getOperand(2);
    MachineOperand *ImmOp = &MI.getOperand(1);
    if (!FIOp->isFI())
      std::swap(FIOp, ImmOp);

    if (!ImmOp->isImm()) {
      assert(Offset == 0);
      FIOp->ChangeToRegister(BaseReg, false);
      TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
1026 | return; |
1027 | } |
1028 | |
    int64_t TotalOffset = ImmOp->getImm() + Offset;
    if (TotalOffset == 0) {
      MI.setDesc(TII->get(AMDGPU::COPY));
      for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
        MI.removeOperand(I);

      MI.getOperand(1).ChangeToRegister(BaseReg, false);
1036 | return; |
1037 | } |
1038 | |
    ImmOp->setImm(TotalOffset);
1040 | |
1041 | MachineBasicBlock *MBB = MI.getParent(); |
1042 | MachineFunction *MF = MBB->getParent(); |
1043 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
1044 | |
1045 | // FIXME: materializeFrameBaseRegister does not know the register class of |
1046 | // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit |
1047 | // a copy so we have a legal operand and hope the register coalescer can |
1048 | // clean it up. |
    if (isSGPRReg(MRI, BaseReg)) {
      Register BaseRegVGPR =
          MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
          .addReg(BaseReg);
      MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
    } else {
      MI.getOperand(2).ChangeToRegister(BaseReg, false);
1057 | } |
1058 | return; |
1059 | } |
1060 | case AMDGPU::V_ADD_U32_e64: |
1061 | case AMDGPU::V_ADD_CO_U32_e64: { |
    int Src0Idx = MI.getNumExplicitDefs();
    MachineOperand *FIOp = &MI.getOperand(Src0Idx);
    MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
    if (!FIOp->isFI())
      std::swap(FIOp, ImmOp);

    if (!ImmOp->isImm()) {
      FIOp->ChangeToRegister(BaseReg, false);
      TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1071 | return; |
1072 | } |
1073 | |
    int64_t TotalOffset = ImmOp->getImm() + Offset;
    if (TotalOffset == 0) {
      MI.setDesc(TII->get(AMDGPU::COPY));

      for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
        MI.removeOperand(I);

      MI.getOperand(1).ChangeToRegister(BaseReg, false);
    } else {
      FIOp->ChangeToRegister(BaseReg, false);
      ImmOp->setImm(TotalOffset);
1085 | } |
1086 | |
1087 | return; |
1088 | } |
1089 | default: |
1090 | break; |
1091 | } |
1092 | |
1093 | bool IsFlat = TII->isFLATScratch(MI); |
1094 | |
1095 | #ifndef NDEBUG |
1096 | // FIXME: Is it possible to be storing a frame index to itself? |
1097 | bool SeenFI = false; |
1098 | for (const MachineOperand &MO: MI.operands()) { |
1099 | if (MO.isFI()) { |
1100 | if (SeenFI) |
        llvm_unreachable("should not see multiple frame indices");
1102 | |
1103 | SeenFI = true; |
1104 | } |
1105 | } |
1106 | #endif |
1107 | |
  MachineOperand *FIOp =
      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
                                      : AMDGPU::OpName::vaddr);

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;
1114 | |
  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1116 | assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI)); |
1117 | |
1118 | if (IsFlat) { |
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                  SIInstrFlags::FlatScratch) &&
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
1124 | return; |
1125 | } |
1126 | |
1127 | #ifndef NDEBUG |
1128 | MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset); |
1129 | assert(SOffset->isImm() && SOffset->getImm() == 0); |
1130 | #endif |
1131 | |
  assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
1136 | } |
1137 | |
1138 | bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, |
1139 | Register BaseReg, |
1140 | int64_t Offset) const { |
1141 | |
1142 | switch (MI->getOpcode()) { |
1143 | case AMDGPU::V_ADD_U32_e32: |
1144 | case AMDGPU::V_ADD_CO_U32_e32: |
1145 | return true; |
1146 | case AMDGPU::V_ADD_U32_e64: |
1147 | case AMDGPU::V_ADD_CO_U32_e64: |
    return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
1149 | default: |
1150 | break; |
1151 | } |
1152 | |
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
1154 | return false; |
1155 | |
1156 | int64_t NewOffset = Offset + getScratchInstrOffset(MI); |
1157 | |
1158 | const SIInstrInfo *TII = ST.getInstrInfo(); |
  if (SIInstrInfo::isMUBUF(*MI))
    return TII->isLegalMUBUFImmOffset(NewOffset);

  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                SIInstrFlags::FlatScratch);
1164 | } |
1165 | |
1166 | const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( |
1167 | const MachineFunction &MF, unsigned Kind) const { |
1168 | // This is inaccurate. It depends on the instruction and address space. The |
1169 | // only place where we should hit this is for dealing with frame indexes / |
1170 | // private accesses, so this is correct in that case. |
1171 | return &AMDGPU::VGPR_32RegClass; |
1172 | } |
1173 | |
1174 | const TargetRegisterClass * |
1175 | SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { |
1176 | if (isAGPRClass(RC) && !ST.hasGFX90AInsts()) |
    return getEquivalentVGPRClass(RC);
1178 | if (RC == &AMDGPU::SCC_CLASSRegClass) |
1179 | return getWaveMaskRegClass(); |
1180 | |
1181 | return RC; |
1182 | } |
1183 | |
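// Returns how many 32-bit subregisters a spill pseudo covers, e.g. 8 for
// SI_SPILL_S256_SAVE; block spills derive the count from their lane mask
// operand instead.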
1184 | static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI, |
1185 | const SIInstrInfo *TII) { |
1186 | |
1187 | unsigned Op = MI.getOpcode(); |
1188 | switch (Op) { |
1189 | case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: |
1190 | case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: |
1191 | // FIXME: This assumes the mask is statically known and not computed at |
1192 | // runtime. However, some ABIs may want to compute the mask dynamically and |
1193 | // this will need to be updated. |
    return llvm::popcount(
        (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
1196 | case AMDGPU::SI_SPILL_S1024_SAVE: |
1197 | case AMDGPU::SI_SPILL_S1024_RESTORE: |
1198 | case AMDGPU::SI_SPILL_V1024_SAVE: |
1199 | case AMDGPU::SI_SPILL_V1024_RESTORE: |
1200 | case AMDGPU::SI_SPILL_A1024_SAVE: |
1201 | case AMDGPU::SI_SPILL_A1024_RESTORE: |
1202 | case AMDGPU::SI_SPILL_AV1024_SAVE: |
1203 | case AMDGPU::SI_SPILL_AV1024_RESTORE: |
1204 | return 32; |
1205 | case AMDGPU::SI_SPILL_S512_SAVE: |
1206 | case AMDGPU::SI_SPILL_S512_RESTORE: |
1207 | case AMDGPU::SI_SPILL_V512_SAVE: |
1208 | case AMDGPU::SI_SPILL_V512_RESTORE: |
1209 | case AMDGPU::SI_SPILL_A512_SAVE: |
1210 | case AMDGPU::SI_SPILL_A512_RESTORE: |
1211 | case AMDGPU::SI_SPILL_AV512_SAVE: |
1212 | case AMDGPU::SI_SPILL_AV512_RESTORE: |
1213 | return 16; |
1214 | case AMDGPU::SI_SPILL_S384_SAVE: |
1215 | case AMDGPU::SI_SPILL_S384_RESTORE: |
1216 | case AMDGPU::SI_SPILL_V384_SAVE: |
1217 | case AMDGPU::SI_SPILL_V384_RESTORE: |
1218 | case AMDGPU::SI_SPILL_A384_SAVE: |
1219 | case AMDGPU::SI_SPILL_A384_RESTORE: |
1220 | case AMDGPU::SI_SPILL_AV384_SAVE: |
1221 | case AMDGPU::SI_SPILL_AV384_RESTORE: |
1222 | return 12; |
1223 | case AMDGPU::SI_SPILL_S352_SAVE: |
1224 | case AMDGPU::SI_SPILL_S352_RESTORE: |
1225 | case AMDGPU::SI_SPILL_V352_SAVE: |
1226 | case AMDGPU::SI_SPILL_V352_RESTORE: |
1227 | case AMDGPU::SI_SPILL_A352_SAVE: |
1228 | case AMDGPU::SI_SPILL_A352_RESTORE: |
1229 | case AMDGPU::SI_SPILL_AV352_SAVE: |
1230 | case AMDGPU::SI_SPILL_AV352_RESTORE: |
1231 | return 11; |
1232 | case AMDGPU::SI_SPILL_S320_SAVE: |
1233 | case AMDGPU::SI_SPILL_S320_RESTORE: |
1234 | case AMDGPU::SI_SPILL_V320_SAVE: |
1235 | case AMDGPU::SI_SPILL_V320_RESTORE: |
1236 | case AMDGPU::SI_SPILL_A320_SAVE: |
1237 | case AMDGPU::SI_SPILL_A320_RESTORE: |
1238 | case AMDGPU::SI_SPILL_AV320_SAVE: |
1239 | case AMDGPU::SI_SPILL_AV320_RESTORE: |
1240 | return 10; |
1241 | case AMDGPU::SI_SPILL_S288_SAVE: |
1242 | case AMDGPU::SI_SPILL_S288_RESTORE: |
1243 | case AMDGPU::SI_SPILL_V288_SAVE: |
1244 | case AMDGPU::SI_SPILL_V288_RESTORE: |
1245 | case AMDGPU::SI_SPILL_A288_SAVE: |
1246 | case AMDGPU::SI_SPILL_A288_RESTORE: |
1247 | case AMDGPU::SI_SPILL_AV288_SAVE: |
1248 | case AMDGPU::SI_SPILL_AV288_RESTORE: |
1249 | return 9; |
1250 | case AMDGPU::SI_SPILL_S256_SAVE: |
1251 | case AMDGPU::SI_SPILL_S256_RESTORE: |
1252 | case AMDGPU::SI_SPILL_V256_SAVE: |
1253 | case AMDGPU::SI_SPILL_V256_RESTORE: |
1254 | case AMDGPU::SI_SPILL_A256_SAVE: |
1255 | case AMDGPU::SI_SPILL_A256_RESTORE: |
1256 | case AMDGPU::SI_SPILL_AV256_SAVE: |
1257 | case AMDGPU::SI_SPILL_AV256_RESTORE: |
1258 | return 8; |
1259 | case AMDGPU::SI_SPILL_S224_SAVE: |
1260 | case AMDGPU::SI_SPILL_S224_RESTORE: |
1261 | case AMDGPU::SI_SPILL_V224_SAVE: |
1262 | case AMDGPU::SI_SPILL_V224_RESTORE: |
1263 | case AMDGPU::SI_SPILL_A224_SAVE: |
1264 | case AMDGPU::SI_SPILL_A224_RESTORE: |
1265 | case AMDGPU::SI_SPILL_AV224_SAVE: |
1266 | case AMDGPU::SI_SPILL_AV224_RESTORE: |
1267 | return 7; |
1268 | case AMDGPU::SI_SPILL_S192_SAVE: |
1269 | case AMDGPU::SI_SPILL_S192_RESTORE: |
1270 | case AMDGPU::SI_SPILL_V192_SAVE: |
1271 | case AMDGPU::SI_SPILL_V192_RESTORE: |
1272 | case AMDGPU::SI_SPILL_A192_SAVE: |
1273 | case AMDGPU::SI_SPILL_A192_RESTORE: |
1274 | case AMDGPU::SI_SPILL_AV192_SAVE: |
1275 | case AMDGPU::SI_SPILL_AV192_RESTORE: |
1276 | return 6; |
1277 | case AMDGPU::SI_SPILL_S160_SAVE: |
1278 | case AMDGPU::SI_SPILL_S160_RESTORE: |
1279 | case AMDGPU::SI_SPILL_V160_SAVE: |
1280 | case AMDGPU::SI_SPILL_V160_RESTORE: |
1281 | case AMDGPU::SI_SPILL_A160_SAVE: |
1282 | case AMDGPU::SI_SPILL_A160_RESTORE: |
1283 | case AMDGPU::SI_SPILL_AV160_SAVE: |
1284 | case AMDGPU::SI_SPILL_AV160_RESTORE: |
1285 | return 5; |
1286 | case AMDGPU::SI_SPILL_S128_SAVE: |
1287 | case AMDGPU::SI_SPILL_S128_RESTORE: |
1288 | case AMDGPU::SI_SPILL_V128_SAVE: |
1289 | case AMDGPU::SI_SPILL_V128_RESTORE: |
1290 | case AMDGPU::SI_SPILL_A128_SAVE: |
1291 | case AMDGPU::SI_SPILL_A128_RESTORE: |
1292 | case AMDGPU::SI_SPILL_AV128_SAVE: |
1293 | case AMDGPU::SI_SPILL_AV128_RESTORE: |
1294 | return 4; |
1295 | case AMDGPU::SI_SPILL_S96_SAVE: |
1296 | case AMDGPU::SI_SPILL_S96_RESTORE: |
1297 | case AMDGPU::SI_SPILL_V96_SAVE: |
1298 | case AMDGPU::SI_SPILL_V96_RESTORE: |
1299 | case AMDGPU::SI_SPILL_A96_SAVE: |
1300 | case AMDGPU::SI_SPILL_A96_RESTORE: |
1301 | case AMDGPU::SI_SPILL_AV96_SAVE: |
1302 | case AMDGPU::SI_SPILL_AV96_RESTORE: |
1303 | return 3; |
1304 | case AMDGPU::SI_SPILL_S64_SAVE: |
1305 | case AMDGPU::SI_SPILL_S64_RESTORE: |
1306 | case AMDGPU::SI_SPILL_V64_SAVE: |
1307 | case AMDGPU::SI_SPILL_V64_RESTORE: |
1308 | case AMDGPU::SI_SPILL_A64_SAVE: |
1309 | case AMDGPU::SI_SPILL_A64_RESTORE: |
1310 | case AMDGPU::SI_SPILL_AV64_SAVE: |
1311 | case AMDGPU::SI_SPILL_AV64_RESTORE: |
1312 | return 2; |
1313 | case AMDGPU::SI_SPILL_S32_SAVE: |
1314 | case AMDGPU::SI_SPILL_S32_RESTORE: |
1315 | case AMDGPU::SI_SPILL_V32_SAVE: |
1316 | case AMDGPU::SI_SPILL_V32_RESTORE: |
1317 | case AMDGPU::SI_SPILL_A32_SAVE: |
1318 | case AMDGPU::SI_SPILL_A32_RESTORE: |
1319 | case AMDGPU::SI_SPILL_AV32_SAVE: |
1320 | case AMDGPU::SI_SPILL_AV32_RESTORE: |
1321 | case AMDGPU::SI_SPILL_WWM_V32_SAVE: |
1322 | case AMDGPU::SI_SPILL_WWM_V32_RESTORE: |
1323 | case AMDGPU::SI_SPILL_WWM_AV32_SAVE: |
1324 | case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: |
1325 | case AMDGPU::SI_SPILL_V16_SAVE: |
1326 | case AMDGPU::SI_SPILL_V16_RESTORE: |
1327 | return 1; |
  default: llvm_unreachable("Invalid spill opcode");
1329 | } |
1330 | } |
1331 | |
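// Map an OFFEN MUBUF store opcode (VGPR voffset) to its OFFSET form (constant
// offset only), returning -1 if no such variant exists. The *MUBUFLoad and
// getOffen* helpers below perform the analogous translations.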
1332 | static int getOffsetMUBUFStore(unsigned Opc) { |
1333 | switch (Opc) { |
1334 | case AMDGPU::BUFFER_STORE_DWORD_OFFEN: |
1335 | return AMDGPU::BUFFER_STORE_DWORD_OFFSET; |
1336 | case AMDGPU::BUFFER_STORE_BYTE_OFFEN: |
1337 | return AMDGPU::BUFFER_STORE_BYTE_OFFSET; |
1338 | case AMDGPU::BUFFER_STORE_SHORT_OFFEN: |
1339 | return AMDGPU::BUFFER_STORE_SHORT_OFFSET; |
1340 | case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: |
1341 | return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; |
1342 | case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN: |
1343 | return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET; |
1344 | case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: |
1345 | return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; |
1346 | case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: |
1347 | return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET; |
1348 | case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN: |
1349 | return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET; |
1350 | default: |
1351 | return -1; |
1352 | } |
1353 | } |
1354 | |
1355 | static int getOffsetMUBUFLoad(unsigned Opc) { |
1356 | switch (Opc) { |
1357 | case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: |
1358 | return AMDGPU::BUFFER_LOAD_DWORD_OFFSET; |
1359 | case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN: |
1360 | return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET; |
1361 | case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN: |
1362 | return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET; |
1363 | case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: |
1364 | return AMDGPU::BUFFER_LOAD_USHORT_OFFSET; |
1365 | case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN: |
1366 | return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; |
1367 | case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: |
1368 | return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; |
1369 | case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN: |
1370 | return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET; |
1371 | case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: |
1372 | return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; |
1373 | case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: |
1374 | return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET; |
1375 | case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN: |
1376 | return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET; |
1377 | case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN: |
1378 | return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET; |
1379 | case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: |
1380 | return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET; |
1381 | case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN: |
1382 | return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET; |
1383 | case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN: |
1384 | return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET; |
1385 | default: |
1386 | return -1; |
1387 | } |
1388 | } |
1389 | |
1390 | static int getOffenMUBUFStore(unsigned Opc) { |
1391 | switch (Opc) { |
1392 | case AMDGPU::BUFFER_STORE_DWORD_OFFSET: |
1393 | return AMDGPU::BUFFER_STORE_DWORD_OFFEN; |
1394 | case AMDGPU::BUFFER_STORE_BYTE_OFFSET: |
1395 | return AMDGPU::BUFFER_STORE_BYTE_OFFEN; |
1396 | case AMDGPU::BUFFER_STORE_SHORT_OFFSET: |
1397 | return AMDGPU::BUFFER_STORE_SHORT_OFFEN; |
1398 | case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: |
1399 | return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; |
1400 | case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET: |
1401 | return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN; |
1402 | case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET: |
1403 | return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; |
1404 | case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET: |
1405 | return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN; |
1406 | case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET: |
1407 | return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN; |
1408 | default: |
1409 | return -1; |
1410 | } |
1411 | } |
1412 | |
1413 | static int getOffenMUBUFLoad(unsigned Opc) { |
1414 | switch (Opc) { |
1415 | case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: |
1416 | return AMDGPU::BUFFER_LOAD_DWORD_OFFEN; |
1417 | case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET: |
1418 | return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN; |
1419 | case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET: |
1420 | return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN; |
1421 | case AMDGPU::BUFFER_LOAD_USHORT_OFFSET: |
1422 | return AMDGPU::BUFFER_LOAD_USHORT_OFFEN; |
1423 | case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET: |
1424 | return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN; |
1425 | case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET: |
1426 | return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; |
1427 | case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET: |
1428 | return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN; |
1429 | case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET: |
1430 | return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN; |
1431 | case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET: |
1432 | return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN; |
1433 | case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET: |
1434 | return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN; |
1435 | case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET: |
1436 | return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN; |
1437 | case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET: |
1438 | return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN; |
1439 | case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET: |
1440 | return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN; |
1441 | case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET: |
1442 | return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN; |
1443 | default: |
1444 | return -1; |
1445 | } |
1446 | } |
1447 | |
1448 | static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, |
1449 | MachineBasicBlock &MBB, |
1450 | MachineBasicBlock::iterator MI, |
1451 | int Index, unsigned Lane, |
1452 | unsigned ValueReg, bool IsKill) { |
1453 | MachineFunction *MF = MBB.getParent(); |
1454 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
1455 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1456 | |
1457 | MCPhysReg Reg = MFI->getVGPRToAGPRSpill(FrameIndex: Index, Lane); |
1458 | |
1459 | if (Reg == AMDGPU::NoRegister) |
1460 | return MachineInstrBuilder(); |
1461 | |
1462 | bool IsStore = MI->mayStore(); |
1463 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
1464 | auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); |
1465 | |
1466 | unsigned Dst = IsStore ? Reg : ValueReg; |
1467 | unsigned Src = IsStore ? ValueReg : Reg; |
1468 | bool IsVGPR = TRI->isVGPR(MRI, Reg); |
1469 | DebugLoc DL = MI->getDebugLoc(); |
1470 | if (IsVGPR == TRI->isVGPR(MRI, Reg: ValueReg)) { |
// The spiller during regalloc may restore a spilled register to its
// superclass. That can result in AGPR spills being restored to VGPRs or the
// other way around, leaving the source and destination with identical
// register classes at this point; a plain COPY is all that is needed then.
1475 | auto CopyMIB = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: Dst) |
1476 | .addReg(RegNo: Src, flags: getKillRegState(B: IsKill)); |
1477 | CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); |
1478 | return CopyMIB; |
1479 | } |
1480 | unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 |
1481 | : AMDGPU::V_ACCVGPR_READ_B32_e64; |
1482 | |
1483 | auto MIB = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dst) |
1484 | .addReg(RegNo: Src, flags: getKillRegState(B: IsKill)); |
1485 | MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); |
1486 | return MIB; |
1487 | } |
1488 | |
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling
// VGPRs.
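// Illustrative rewrite (operand lists abbreviated, not exact MIR): a spill
// that would otherwise be emitted as
//   BUFFER_STORE_DWORD_OFFEN $vdata, $vaddr, $srsrc, $soffset, ...
// is re-emitted here as
//   BUFFER_STORE_DWORD_OFFSET $vdata, $srsrc, $soffset, <imm offset>, ...
// once the frame-index offset is known, dropping the VGPR address.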
1491 | static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, |
1492 | MachineFrameInfo &MFI, |
1493 | MachineBasicBlock::iterator MI, |
1494 | int Index, |
1495 | int64_t Offset) { |
1496 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1497 | MachineBasicBlock *MBB = MI->getParent(); |
1498 | const DebugLoc &DL = MI->getDebugLoc(); |
1499 | bool IsStore = MI->mayStore(); |
1500 | |
1501 | unsigned Opc = MI->getOpcode(); |
1502 | int LoadStoreOp = IsStore ? |
1503 | getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc); |
1504 | if (LoadStoreOp == -1) |
1505 | return false; |
1506 | |
1507 | const MachineOperand *Reg = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdata); |
1508 | if (spillVGPRtoAGPR(ST, MBB&: *MBB, MI, Index, Lane: 0, ValueReg: Reg->getReg(), IsKill: false).getInstr()) |
1509 | return true; |
1510 | |
1511 | MachineInstrBuilder NewMI = |
1512 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: LoadStoreOp)) |
1513 | .add(MO: *Reg) |
1514 | .add(MO: *TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::srsrc)) |
1515 | .add(MO: *TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::soffset)) |
1516 | .addImm(Val: Offset) |
1517 | .addImm(Val: 0) // cpol |
1518 | .addImm(Val: 0) // swz |
1519 | .cloneMemRefs(OtherMI: *MI); |
1520 | |
1521 | const MachineOperand *VDataIn = TII->getNamedOperand(MI&: *MI, |
1522 | OperandName: AMDGPU::OpName::vdata_in); |
1523 | if (VDataIn) |
1524 | NewMI.add(MO: *VDataIn); |
1525 | return true; |
1526 | } |
1527 | |
1528 | static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, |
1529 | unsigned LoadStoreOp, |
1530 | unsigned EltSize) { |
1531 | bool IsStore = TII->get(Opcode: LoadStoreOp).mayStore(); |
1532 | bool HasVAddr = AMDGPU::hasNamedOperand(Opcode: LoadStoreOp, NamedIdx: AMDGPU::OpName::vaddr); |
1533 | bool UseST = |
1534 | !HasVAddr && !AMDGPU::hasNamedOperand(Opcode: LoadStoreOp, NamedIdx: AMDGPU::OpName::saddr); |
1535 | |
1536 | // Handle block load/store first. |
1537 | if (TII->isBlockLoadStore(Opcode: LoadStoreOp)) |
1538 | return LoadStoreOp; |
1539 | |
1540 | switch (EltSize) { |
1541 | case 4: |
1542 | LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR |
1543 | : AMDGPU::SCRATCH_LOAD_DWORD_SADDR; |
1544 | break; |
1545 | case 8: |
1546 | LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR |
1547 | : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR; |
1548 | break; |
1549 | case 12: |
1550 | LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR |
1551 | : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR; |
1552 | break; |
1553 | case 16: |
1554 | LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR |
1555 | : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR; |
1556 | break; |
1557 | default: |
1558 | llvm_unreachable("Unexpected spill load/store size!" ); |
1559 | } |
1560 | |
1561 | if (HasVAddr) |
1562 | LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(Opcode: LoadStoreOp); |
1563 | else if (UseST) |
1564 | LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(Opcode: LoadStoreOp); |
1565 | |
1566 | return LoadStoreOp; |
1567 | } |
1568 | |
1569 | void SIRegisterInfo::buildSpillLoadStore( |
1570 | MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, |
1571 | unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill, |
1572 | MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO, |
1573 | RegScavenger *RS, LiveRegUnits *LiveUnits) const { |
assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1575 | |
1576 | MachineFunction *MF = MBB.getParent(); |
1577 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1578 | const MachineFrameInfo &MFI = MF->getFrameInfo(); |
1579 | const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); |
1580 | |
1581 | const MCInstrDesc *Desc = &TII->get(Opcode: LoadStoreOp); |
1582 | bool IsStore = Desc->mayStore(); |
1583 | bool IsFlat = TII->isFLATScratch(Opcode: LoadStoreOp); |
1584 | bool IsBlock = TII->isBlockLoadStore(Opcode: LoadStoreOp); |
1585 | |
1586 | bool CanClobberSCC = false; |
1587 | bool Scavenged = false; |
1588 | MCRegister SOffset = ScratchOffsetReg; |
1589 | |
1590 | const TargetRegisterClass *RC = getRegClassForReg(MRI: MF->getRegInfo(), Reg: ValueReg); |
1591 | // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores. |
1592 | const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC); |
1593 | const unsigned RegWidth = AMDGPU::getRegBitWidth(RC: *RC) / 8; |
1594 | |
1595 | // Always use 4 byte operations for AGPRs because we need to scavenge |
1596 | // a temporary VGPR. |
1597 | // If we're using a block operation, the element should be the whole block. |
1598 | unsigned EltSize = IsBlock ? RegWidth |
1599 | : (IsFlat && !IsAGPR) ? std::min(a: RegWidth, b: 16u) |
1600 | : 4u; |
1601 | unsigned NumSubRegs = RegWidth / EltSize; |
1602 | unsigned Size = NumSubRegs * EltSize; |
1603 | unsigned RemSize = RegWidth - Size; |
1604 | unsigned NumRemSubRegs = RemSize ? 1 : 0; |
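// Worked example (illustrative, flat scratch, non-AGPR): a 224-bit value is
// 28 bytes, so EltSize = 16, NumSubRegs = 1, Size = 16 and RemSize = 12; the
// loop below then emits one 16-byte access followed by one 12-byte access.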
1605 | int64_t Offset = InstOffset + MFI.getObjectOffset(ObjectIdx: Index); |
1606 | int64_t MaterializedOffset = Offset; |
1607 | |
1608 | int64_t MaxOffset = Offset + Size + RemSize - EltSize; |
1609 | int64_t ScratchOffsetRegDelta = 0; |
1610 | |
1611 | if (IsFlat && EltSize > 4) { |
1612 | LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); |
1613 | Desc = &TII->get(Opcode: LoadStoreOp); |
1614 | } |
1615 | |
1616 | Align Alignment = MFI.getObjectAlign(ObjectIdx: Index); |
1617 | const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); |
1618 | |
1619 | assert((IsFlat || ((Offset % EltSize) == 0)) && |
1620 | "unexpected VGPR spill offset" ); |
1621 | |
1622 | // Track a VGPR to use for a constant offset we need to materialize. |
1623 | Register TmpOffsetVGPR; |
1624 | |
1625 | // Track a VGPR to use as an intermediate value. |
1626 | Register TmpIntermediateVGPR; |
1627 | bool UseVGPROffset = false; |
1628 | |
1629 | // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate |
1630 | // combination. |
1631 | auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR, |
1632 | int64_t VOffset) { |
1633 | // We are using a VGPR offset |
1634 | if (IsFlat && SGPRBase) { |
1635 | // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free |
1636 | // SGPR, so perform the add as vector. |
1637 | // We don't need a base SGPR in the kernel. |
1638 | |
1639 | if (ST.getConstantBusLimit(Opcode: AMDGPU::V_ADD_U32_e64) >= 2) { |
1640 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_ADD_U32_e64), DestReg: TmpVGPR) |
1641 | .addReg(RegNo: SGPRBase) |
1642 | .addImm(Val: VOffset) |
1643 | .addImm(Val: 0); // clamp |
1644 | } else { |
1645 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: TmpVGPR) |
1646 | .addReg(RegNo: SGPRBase); |
1647 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg: TmpVGPR) |
1648 | .addImm(Val: VOffset) |
1649 | .addReg(RegNo: TmpOffsetVGPR); |
1650 | } |
1651 | } else { |
1652 | assert(TmpOffsetVGPR); |
1653 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: TmpVGPR) |
1654 | .addImm(Val: VOffset); |
1655 | } |
1656 | }; |
1657 | |
1658 | bool IsOffsetLegal = |
1659 | IsFlat ? TII->isLegalFLATOffset(Offset: MaxOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, |
1660 | FlatVariant: SIInstrFlags::FlatScratch) |
1661 | : TII->isLegalMUBUFImmOffset(Imm: MaxOffset); |
1662 | if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) { |
1663 | SOffset = MCRegister(); |
1664 | |
// We don't have access to the register scavenger if this function is called
// during PEI::scavengeFrameVirtualRegs(), so use LiveUnits in this case.
1667 | // TODO: Clobbering SCC is not necessary for scratch instructions in the |
1668 | // entry. |
1669 | if (RS) { |
1670 | SOffset = RS->scavengeRegisterBackwards(RC: AMDGPU::SGPR_32RegClass, To: MI, RestoreAfter: false, SPAdj: 0, AllowSpill: false); |
1671 | |
// Piggy-back on the liveness scan we just did to see if SCC is dead.
1673 | CanClobberSCC = !RS->isRegUsed(Reg: AMDGPU::SCC); |
1674 | } else if (LiveUnits) { |
1675 | CanClobberSCC = LiveUnits->available(Reg: AMDGPU::SCC); |
1676 | for (MCRegister Reg : AMDGPU::SGPR_32RegClass) { |
1677 | if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(PhysReg: Reg)) { |
1678 | SOffset = Reg; |
1679 | break; |
1680 | } |
1681 | } |
1682 | } |
1683 | |
1684 | if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC) |
1685 | SOffset = Register(); |
1686 | |
1687 | if (!SOffset) { |
1688 | UseVGPROffset = true; |
1689 | |
1690 | if (RS) { |
1691 | TmpOffsetVGPR = RS->scavengeRegisterBackwards(RC: AMDGPU::VGPR_32RegClass, To: MI, RestoreAfter: false, SPAdj: 0); |
1692 | } else { |
1693 | assert(LiveUnits); |
1694 | for (MCRegister Reg : AMDGPU::VGPR_32RegClass) { |
1695 | if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(PhysReg: Reg)) { |
1696 | TmpOffsetVGPR = Reg; |
1697 | break; |
1698 | } |
1699 | } |
1700 | } |
1701 | |
1702 | assert(TmpOffsetVGPR); |
1703 | } else if (!SOffset && CanClobberSCC) { |
// There are no free SGPRs, and we are in the process of spilling VGPRs too.
// Since we need a VGPR in order to spill SGPRs (this is true on SI/CI, and
// on VI it remains true until we implement spilling using scalar stores),
// we have no way to free up an SGPR. Our solution here is to add the offset
// directly to the ScratchOffset or StackPtrOffset register, and then
// subtract the offset after the spill to return the register to its
// original value.
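// Roughly (illustrative; register and offset values made up):
//   s_add_i32 s32, s32, 0x1000     ; fold the offset into the scratch/stack ptr
//   <spill or reload using s32>
//   s_add_i32 s32, s32, -0x1000    ; undo the adjustment afterwards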
1711 | |
// TODO: If we don't have to do an emergency stack slot spill, converting
// to use the VGPR offset takes fewer instructions.
1714 | if (!ScratchOffsetReg) |
1715 | ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg(); |
1716 | SOffset = ScratchOffsetReg; |
1717 | ScratchOffsetRegDelta = Offset; |
1718 | } else { |
1719 | Scavenged = true; |
1720 | } |
1721 | |
1722 | // We currently only support spilling VGPRs to EltSize boundaries, meaning |
1723 | // we can simplify the adjustment of Offset here to just scale with |
1724 | // WavefrontSize. |
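// For example, with a 64-lane wave a per-lane byte offset of 16 becomes a
// swizzled scratch offset of 16 * 64 = 1024 bytes; the scaling is skipped
// for flat scratch or when a VGPR offset is used.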
1725 | if (!IsFlat && !UseVGPROffset) |
1726 | Offset *= ST.getWavefrontSize(); |
1727 | |
1728 | if (!UseVGPROffset && !SOffset) |
report_fatal_error(reason: "could not scavenge SGPR to spill in entry function");
1730 | |
1731 | if (UseVGPROffset) { |
1732 | // We are using a VGPR offset |
1733 | MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset); |
1734 | } else if (ScratchOffsetReg == AMDGPU::NoRegister) { |
1735 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: SOffset).addImm(Val: Offset); |
1736 | } else { |
1737 | assert(Offset != 0); |
1738 | auto Add = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SOffset) |
1739 | .addReg(RegNo: ScratchOffsetReg) |
1740 | .addImm(Val: Offset); |
1741 | Add->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
1742 | } |
1743 | |
1744 | Offset = 0; |
1745 | } |
1746 | |
1747 | if (IsFlat && SOffset == AMDGPU::NoRegister) { |
1748 | assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 |
1749 | && "Unexpected vaddr for flat scratch with a FI operand" ); |
1750 | |
1751 | if (UseVGPROffset) { |
1752 | LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(Opcode: LoadStoreOp); |
1753 | } else { |
1754 | assert(ST.hasFlatScratchSTMode()); |
assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1756 | LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(Opcode: LoadStoreOp); |
1757 | } |
1758 | |
1759 | Desc = &TII->get(Opcode: LoadStoreOp); |
1760 | } |
1761 | |
1762 | for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e; |
1763 | ++i, RegOffset += EltSize) { |
1764 | if (i == NumSubRegs) { |
1765 | EltSize = RemSize; |
1766 | LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); |
1767 | } |
1768 | Desc = &TII->get(Opcode: LoadStoreOp); |
1769 | |
1770 | if (!IsFlat && UseVGPROffset) { |
1771 | int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(Opc: LoadStoreOp) |
1772 | : getOffenMUBUFLoad(Opc: LoadStoreOp); |
1773 | Desc = &TII->get(Opcode: NewLoadStoreOp); |
1774 | } |
1775 | |
1776 | if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) { |
// If we are spilling an AGPR beyond the range of the memory instruction
// offset and need to use a VGPR offset, we ideally have at least 2 scratch
// VGPRs. If we don't have a second free VGPR without spilling, recycle the
// VGPR used for the offset, which requires resetting it after each
// subregister.
1782 | |
1783 | MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset); |
1784 | } |
1785 | |
1786 | unsigned NumRegs = EltSize / 4; |
1787 | Register SubReg = e == 1 |
1788 | ? ValueReg |
1789 | : Register(getSubReg(Reg: ValueReg, |
1790 | Idx: getSubRegFromChannel(Channel: RegOffset / 4, NumRegs))); |
1791 | |
1792 | unsigned SOffsetRegState = 0; |
1793 | unsigned SrcDstRegState = getDefRegState(B: !IsStore); |
1794 | const bool IsLastSubReg = i + 1 == e; |
1795 | const bool IsFirstSubReg = i == 0; |
1796 | if (IsLastSubReg) { |
1797 | SOffsetRegState |= getKillRegState(B: Scavenged); |
1798 | // The last implicit use carries the "Kill" flag. |
1799 | SrcDstRegState |= getKillRegState(B: IsKill); |
1800 | } |
1801 | |
1802 | // Make sure the whole register is defined if there are undef components by |
1803 | // adding an implicit def of the super-reg on the first instruction. |
1804 | bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg; |
1805 | bool NeedSuperRegImpOperand = e > 1; |
1806 | |
// Remaining element size to spill into memory after some parts of it have
// been spilled into either AGPRs or VGPRs.
1809 | unsigned RemEltSize = EltSize; |
1810 | |
// AGPRs used to spill VGPRs (and vice versa) are allocated in reverse order,
// starting from the last lane. If a register cannot be completely spilled
// into another register, this ensures its alignment does not change. For
// targets with a VGPR alignment requirement this is important when flat
// scratch is used, as we might otherwise get a scratch_load or scratch_store
// of an unaligned register.
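// E.g. (illustrative, register numbers made up): with one free AGPR and a
// 128-bit tuple v[4:7], taking lane 3 (v7) first leaves v[4:6] for the
// memory access, which still starts at an even-aligned VGPR; filling from
// lane 0 would instead leave the unaligned v[5:7].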
1817 | for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS, |
1818 | LaneE = RegOffset / 4; |
1819 | Lane >= LaneE; --Lane) { |
1820 | bool IsSubReg = e > 1 || EltSize > 4; |
1821 | Register Sub = IsSubReg |
1822 | ? Register(getSubReg(Reg: ValueReg, Idx: getSubRegFromChannel(Channel: Lane))) |
1823 | : ValueReg; |
1824 | auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, ValueReg: Sub, IsKill); |
1825 | if (!MIB.getInstr()) |
1826 | break; |
1827 | if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) { |
1828 | MIB.addReg(RegNo: ValueReg, flags: RegState::ImplicitDefine); |
1829 | NeedSuperRegDef = false; |
1830 | } |
1831 | if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) { |
1832 | NeedSuperRegImpOperand = true; |
1833 | unsigned State = SrcDstRegState; |
1834 | if (!IsLastSubReg || (Lane != LaneE)) |
1835 | State &= ~RegState::Kill; |
1836 | if (!IsFirstSubReg || (Lane != LaneS)) |
1837 | State &= ~RegState::Define; |
1838 | MIB.addReg(RegNo: ValueReg, flags: RegState::Implicit | State); |
1839 | } |
1840 | RemEltSize -= 4; |
1841 | } |
1842 | |
1843 | if (!RemEltSize) // Fully spilled into AGPRs. |
1844 | continue; |
1845 | |
1846 | if (RemEltSize != EltSize) { // Partially spilled to AGPRs |
1847 | assert(IsFlat && EltSize > 4); |
1848 | |
1849 | unsigned NumRegs = RemEltSize / 4; |
1850 | SubReg = Register(getSubReg(Reg: ValueReg, |
1851 | Idx: getSubRegFromChannel(Channel: RegOffset / 4, NumRegs))); |
1852 | unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize: RemEltSize); |
1853 | Desc = &TII->get(Opcode: Opc); |
1854 | } |
1855 | |
1856 | unsigned FinalReg = SubReg; |
1857 | |
1858 | if (IsAGPR) { |
1859 | assert(EltSize == 4); |
1860 | |
1861 | if (!TmpIntermediateVGPR) { |
1862 | TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy(); |
1863 | assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR)); |
1864 | } |
1865 | if (IsStore) { |
1866 | auto AccRead = BuildMI(BB&: MBB, I: MI, MIMD: DL, |
1867 | MCID: TII->get(Opcode: AMDGPU::V_ACCVGPR_READ_B32_e64), |
1868 | DestReg: TmpIntermediateVGPR) |
1869 | .addReg(RegNo: SubReg, flags: getKillRegState(B: IsKill)); |
1870 | if (NeedSuperRegDef) |
1871 | AccRead.addReg(RegNo: ValueReg, flags: RegState::ImplicitDefine); |
1872 | if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg)) |
1873 | AccRead.addReg(RegNo: ValueReg, flags: RegState::Implicit); |
1874 | AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); |
1875 | } |
1876 | SubReg = TmpIntermediateVGPR; |
1877 | } else if (UseVGPROffset) { |
1878 | if (!TmpOffsetVGPR) { |
1879 | TmpOffsetVGPR = RS->scavengeRegisterBackwards(RC: AMDGPU::VGPR_32RegClass, |
1880 | To: MI, RestoreAfter: false, SPAdj: 0); |
1881 | RS->setRegUsed(Reg: TmpOffsetVGPR); |
1882 | } |
1883 | } |
1884 | |
1885 | MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(O: RegOffset); |
1886 | MachineMemOperand *NewMMO = |
1887 | MF->getMachineMemOperand(PtrInfo: PInfo, F: MMO->getFlags(), Size: RemEltSize, |
1888 | BaseAlignment: commonAlignment(A: Alignment, Offset: RegOffset)); |
1889 | |
1890 | auto MIB = |
1891 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: *Desc) |
1892 | .addReg(RegNo: SubReg, flags: getDefRegState(B: !IsStore) | getKillRegState(B: IsKill)); |
1893 | |
1894 | if (UseVGPROffset) { |
1895 | // For an AGPR spill, we reuse the same temp VGPR for the offset and the |
1896 | // intermediate accvgpr_write. |
1897 | MIB.addReg(RegNo: TmpOffsetVGPR, flags: getKillRegState(B: IsLastSubReg && !IsAGPR)); |
1898 | } |
1899 | |
1900 | if (!IsFlat) |
1901 | MIB.addReg(RegNo: FuncInfo->getScratchRSrcReg()); |
1902 | |
1903 | if (SOffset == AMDGPU::NoRegister) { |
1904 | if (!IsFlat) { |
1905 | if (UseVGPROffset && ScratchOffsetReg) { |
1906 | MIB.addReg(RegNo: ScratchOffsetReg); |
1907 | } else { |
1908 | assert(FuncInfo->isBottomOfStack()); |
1909 | MIB.addImm(Val: 0); |
1910 | } |
1911 | } |
1912 | } else { |
1913 | MIB.addReg(RegNo: SOffset, flags: SOffsetRegState); |
1914 | } |
1915 | |
1916 | MIB.addImm(Val: Offset + RegOffset); |
1917 | |
1918 | bool LastUse = MMO->getFlags() & MOLastUse; |
1919 | MIB.addImm(Val: LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol |
1920 | |
1921 | if (!IsFlat) |
1922 | MIB.addImm(Val: 0); // swz |
1923 | MIB.addMemOperand(MMO: NewMMO); |
1924 | |
1925 | if (!IsAGPR && NeedSuperRegDef) |
1926 | MIB.addReg(RegNo: ValueReg, flags: RegState::ImplicitDefine); |
1927 | |
1928 | if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) { |
1929 | MIB = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), |
1930 | DestReg: FinalReg) |
1931 | .addReg(RegNo: TmpIntermediateVGPR, flags: RegState::Kill); |
1932 | MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); |
1933 | } |
1934 | |
1935 | bool IsSrcDstDef = SrcDstRegState & RegState::Define; |
1936 | if (NeedSuperRegImpOperand && |
1937 | (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) |
1938 | MIB.addReg(RegNo: ValueReg, flags: RegState::Implicit | SrcDstRegState); |
1939 | |
// The epilog restore of a wwm-scratch register can cause undesired
// optimization in the machine-cp pass that runs after PrologEpilogInserter
// if the same register was assigned for return-value ABI lowering with a
// COPY instruction. As shown below, with the epilog reload in place, the
// earlier COPY appears dead to machine-cp.
1945 | // ... |
1946 | // v0 in WWM operation, needs the WWM spill at prolog/epilog. |
1947 | // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0 |
1948 | // ... |
1949 | // Epilog block: |
1950 | // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0 |
1951 | // ... |
1952 | // WWM spill restore to preserve the inactive lanes of v0. |
1953 | // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1 |
1954 | // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0 |
1955 | // $exec = S_MOV_B64 killed $sgpr4_sgpr5 |
1956 | // ... |
1957 | // SI_RETURN implicit $vgpr0 |
1958 | // ... |
1959 | // To fix it, mark the same reg as a tied op for such restore instructions |
1960 | // so that it marks a usage for the preceding COPY. |
1961 | if (!IsStore && MI != MBB.end() && MI->isReturn() && |
1962 | MI->readsRegister(Reg: SubReg, TRI: this)) { |
1963 | MIB.addReg(RegNo: SubReg, flags: RegState::Implicit); |
1964 | MIB->tieOperands(DefIdx: 0, UseIdx: MIB->getNumOperands() - 1); |
1965 | } |
1966 | |
1967 | // If we're building a block load, we should add artificial uses for the |
1968 | // CSR VGPRs that are *not* being transferred. This is because liveness |
1969 | // analysis is not aware of the mask, so we need to somehow inform it that |
1970 | // those registers are not available before the load and they should not be |
1971 | // scavenged. |
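// For instance (illustrative), if bit 5 of the transfer mask is clear and the
// VGPR five registers above the block's base VGPR is callee-saved, an implicit
// use of that VGPR is attached to the block load so it stays live across it.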
1972 | if (!IsStore && TII->isBlockLoadStore(Opcode: LoadStoreOp)) |
1973 | addImplicitUsesForBlockCSRLoad(MIB, BlockReg: ValueReg); |
1974 | } |
1975 | |
1976 | if (ScratchOffsetRegDelta != 0) { |
1977 | // Subtract the offset we added to the ScratchOffset register. |
1978 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SOffset) |
1979 | .addReg(RegNo: SOffset) |
1980 | .addImm(Val: -ScratchOffsetRegDelta); |
1981 | } |
1982 | } |
1983 | |
1984 | void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, |
1985 | Register BlockReg) const { |
1986 | const MachineFunction *MF = MIB->getParent()->getParent(); |
1987 | const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); |
1988 | uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(RegisterBlock: BlockReg); |
1989 | Register BaseVGPR = getSubReg(Reg: BlockReg, Idx: AMDGPU::sub0); |
1990 | for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset) |
1991 | if (!(Mask & (1 << RegOffset)) && |
1992 | isCalleeSavedPhysReg(PhysReg: BaseVGPR + RegOffset, MF: *MF)) |
1993 | MIB.addUse(RegNo: BaseVGPR + RegOffset, Flags: RegState::Implicit); |
1994 | } |
1995 | |
1996 | void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, |
1997 | int Offset, bool IsLoad, |
1998 | bool IsKill) const { |
1999 | // Load/store VGPR |
2000 | MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo(); |
2001 | assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); |
2002 | |
2003 | Register FrameReg = |
2004 | FrameInfo.isFixedObjectIndex(ObjectIdx: Index) && hasBasePointer(MF: SB.MF) |
2005 | ? getBaseRegister() |
2006 | : getFrameRegister(MF: SB.MF); |
2007 | |
2008 | Align Alignment = FrameInfo.getObjectAlign(ObjectIdx: Index); |
2009 | MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF&: SB.MF, FI: Index); |
2010 | MachineMemOperand *MMO = SB.MF.getMachineMemOperand( |
2011 | PtrInfo, F: IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, |
2012 | Size: SB.EltSize, BaseAlignment: Alignment); |
2013 | |
2014 | if (IsLoad) { |
2015 | unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR |
2016 | : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; |
2017 | buildSpillLoadStore(MBB&: *SB.MBB, MI: SB.MI, DL: SB.DL, LoadStoreOp: Opc, Index, ValueReg: SB.TmpVGPR, IsKill: false, |
2018 | ScratchOffsetReg: FrameReg, InstOffset: (int64_t)Offset * SB.EltSize, MMO, RS: SB.RS); |
2019 | } else { |
2020 | unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR |
2021 | : AMDGPU::BUFFER_STORE_DWORD_OFFSET; |
2022 | buildSpillLoadStore(MBB&: *SB.MBB, MI: SB.MI, DL: SB.DL, LoadStoreOp: Opc, Index, ValueReg: SB.TmpVGPR, IsKill, |
2023 | ScratchOffsetReg: FrameReg, InstOffset: (int64_t)Offset * SB.EltSize, MMO, RS: SB.RS); |
2024 | // This only ever adds one VGPR spill |
2025 | SB.MFI.addToSpilledVGPRs(num: 1); |
2026 | } |
2027 | } |
2028 | |
2029 | bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, |
2030 | RegScavenger *RS, SlotIndexes *Indexes, |
2031 | LiveIntervals *LIS, bool OnlyToVGPR, |
2032 | bool SpillToPhysVGPRLane) const { |
2033 | assert(!MI->getOperand(0).isUndef() && |
2034 | "undef spill should have been deleted earlier" ); |
2035 | |
2036 | SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); |
2037 | |
2038 | ArrayRef<SpilledReg> VGPRSpills = |
2039 | SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(FrameIndex: Index) |
2040 | : SB.MFI.getSGPRSpillToVirtualVGPRLanes(FrameIndex: Index); |
2041 | bool SpillToVGPR = !VGPRSpills.empty(); |
2042 | if (OnlyToVGPR && !SpillToVGPR) |
2043 | return false; |
2044 | |
2045 | assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() && |
2046 | SB.SuperReg != SB.MFI.getFrameOffsetReg())); |
2047 | |
2048 | if (SpillToVGPR) { |
2049 | |
// Since the stack slot coloring pass tries to optimize SGPR spills, VGPR
// lanes (mapped from the spill stack slot) may be shared by SGPR spills of
// different sizes. The number of VGPR lanes allotted therefore matches the
// largest SGPR being spilled into them.
2054 | assert(SB.NumSubRegs <= VGPRSpills.size() && |
2055 | "Num of SGPRs spilled should be less than or equal to num of " |
2056 | "the VGPR lanes." ); |
2057 | |
2058 | for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { |
2059 | Register SubReg = |
2060 | SB.NumSubRegs == 1 |
2061 | ? SB.SuperReg |
2062 | : Register(getSubReg(Reg: SB.SuperReg, Idx: SB.SplitParts[i])); |
2063 | SpilledReg Spill = VGPRSpills[i]; |
2064 | |
2065 | bool IsFirstSubreg = i == 0; |
2066 | bool IsLastSubreg = i == SB.NumSubRegs - 1; |
2067 | bool UseKill = SB.IsKill && IsLastSubreg; |
2068 | |
2069 | |
2070 | // Mark the "old value of vgpr" input undef only if this is the first sgpr |
2071 | // spill to this specific vgpr in the first basic block. |
2072 | auto MIB = BuildMI(BB&: *SB.MBB, I: MI, MIMD: SB.DL, |
2073 | MCID: SB.TII.get(Opcode: AMDGPU::SI_SPILL_S32_TO_VGPR), DestReg: Spill.VGPR) |
2074 | .addReg(RegNo: SubReg, flags: getKillRegState(B: UseKill)) |
2075 | .addImm(Val: Spill.Lane) |
2076 | .addReg(RegNo: Spill.VGPR); |
2077 | if (Indexes) { |
2078 | if (IsFirstSubreg) |
2079 | Indexes->replaceMachineInstrInMaps(MI&: *MI, NewMI&: *MIB); |
2080 | else |
2081 | Indexes->insertMachineInstrInMaps(MI&: *MIB); |
2082 | } |
2083 | |
2084 | if (IsFirstSubreg && SB.NumSubRegs > 1) { |
2085 | // We may be spilling a super-register which is only partially defined, |
2086 | // and need to ensure later spills think the value is defined. |
2087 | MIB.addReg(RegNo: SB.SuperReg, flags: RegState::ImplicitDefine); |
2088 | } |
2089 | |
2090 | if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg)) |
2091 | MIB.addReg(RegNo: SB.SuperReg, flags: getKillRegState(B: UseKill) | RegState::Implicit); |
2092 | |
2093 | // FIXME: Since this spills to another register instead of an actual |
2094 | // frame index, we should delete the frame index when all references to |
2095 | // it are fixed. |
2096 | } |
2097 | } else { |
2098 | SB.prepare(); |
2099 | |
2100 | // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. |
2101 | unsigned SubKillState = getKillRegState(B: (SB.NumSubRegs == 1) && SB.IsKill); |
2102 | |
2103 | // Per VGPR helper data |
2104 | auto PVD = SB.getPerVGPRData(); |
2105 | |
2106 | for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { |
2107 | unsigned TmpVGPRFlags = RegState::Undef; |
2108 | |
2109 | // Write sub registers into the VGPR |
2110 | for (unsigned i = Offset * PVD.PerVGPR, |
2111 | e = std::min(a: (Offset + 1) * PVD.PerVGPR, b: SB.NumSubRegs); |
2112 | i < e; ++i) { |
2113 | Register SubReg = |
2114 | SB.NumSubRegs == 1 |
2115 | ? SB.SuperReg |
2116 | : Register(getSubReg(Reg: SB.SuperReg, Idx: SB.SplitParts[i])); |
2117 | |
2118 | MachineInstrBuilder WriteLane = |
2119 | BuildMI(BB&: *SB.MBB, I: MI, MIMD: SB.DL, |
2120 | MCID: SB.TII.get(Opcode: AMDGPU::SI_SPILL_S32_TO_VGPR), DestReg: SB.TmpVGPR) |
2121 | .addReg(RegNo: SubReg, flags: SubKillState) |
2122 | .addImm(Val: i % PVD.PerVGPR) |
2123 | .addReg(RegNo: SB.TmpVGPR, flags: TmpVGPRFlags); |
2124 | TmpVGPRFlags = 0; |
2125 | |
2126 | if (Indexes) { |
2127 | if (i == 0) |
2128 | Indexes->replaceMachineInstrInMaps(MI&: *MI, NewMI&: *WriteLane); |
2129 | else |
2130 | Indexes->insertMachineInstrInMaps(MI&: *WriteLane); |
2131 | } |
2132 | |
2133 | // There could be undef components of a spilled super register. |
2134 | // TODO: Can we detect this and skip the spill? |
2135 | if (SB.NumSubRegs > 1) { |
2136 | // The last implicit use of the SB.SuperReg carries the "Kill" flag. |
2137 | unsigned SuperKillState = 0; |
2138 | if (i + 1 == SB.NumSubRegs) |
2139 | SuperKillState |= getKillRegState(B: SB.IsKill); |
2140 | WriteLane.addReg(RegNo: SB.SuperReg, flags: RegState::Implicit | SuperKillState); |
2141 | } |
2142 | } |
2143 | |
2144 | // Write out VGPR |
2145 | SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false); |
2146 | } |
2147 | |
2148 | SB.restore(); |
2149 | } |
2150 | |
2151 | MI->eraseFromParent(); |
2152 | SB.MFI.addToSpilledSGPRs(num: SB.NumSubRegs); |
2153 | |
2154 | if (LIS) |
2155 | LIS->removeAllRegUnitsForPhysReg(Reg: SB.SuperReg); |
2156 | |
2157 | return true; |
2158 | } |
2159 | |
2160 | bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, |
2161 | RegScavenger *RS, SlotIndexes *Indexes, |
2162 | LiveIntervals *LIS, bool OnlyToVGPR, |
2163 | bool SpillToPhysVGPRLane) const { |
2164 | SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); |
2165 | |
2166 | ArrayRef<SpilledReg> VGPRSpills = |
2167 | SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(FrameIndex: Index) |
2168 | : SB.MFI.getSGPRSpillToVirtualVGPRLanes(FrameIndex: Index); |
2169 | bool SpillToVGPR = !VGPRSpills.empty(); |
2170 | if (OnlyToVGPR && !SpillToVGPR) |
2171 | return false; |
2172 | |
2173 | if (SpillToVGPR) { |
2174 | for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { |
2175 | Register SubReg = |
2176 | SB.NumSubRegs == 1 |
2177 | ? SB.SuperReg |
2178 | : Register(getSubReg(Reg: SB.SuperReg, Idx: SB.SplitParts[i])); |
2179 | |
2180 | SpilledReg Spill = VGPRSpills[i]; |
2181 | auto MIB = BuildMI(BB&: *SB.MBB, I: MI, MIMD: SB.DL, |
2182 | MCID: SB.TII.get(Opcode: AMDGPU::SI_RESTORE_S32_FROM_VGPR), DestReg: SubReg) |
2183 | .addReg(RegNo: Spill.VGPR) |
2184 | .addImm(Val: Spill.Lane); |
2185 | if (SB.NumSubRegs > 1 && i == 0) |
2186 | MIB.addReg(RegNo: SB.SuperReg, flags: RegState::ImplicitDefine); |
2187 | if (Indexes) { |
2188 | if (i == e - 1) |
2189 | Indexes->replaceMachineInstrInMaps(MI&: *MI, NewMI&: *MIB); |
2190 | else |
2191 | Indexes->insertMachineInstrInMaps(MI&: *MIB); |
2192 | } |
2193 | } |
2194 | } else { |
2195 | SB.prepare(); |
2196 | |
2197 | // Per VGPR helper data |
2198 | auto PVD = SB.getPerVGPRData(); |
2199 | |
2200 | for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { |
2201 | // Load in VGPR data |
2202 | SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true); |
2203 | |
2204 | // Unpack lanes |
2205 | for (unsigned i = Offset * PVD.PerVGPR, |
2206 | e = std::min(a: (Offset + 1) * PVD.PerVGPR, b: SB.NumSubRegs); |
2207 | i < e; ++i) { |
2208 | Register SubReg = |
2209 | SB.NumSubRegs == 1 |
2210 | ? SB.SuperReg |
2211 | : Register(getSubReg(Reg: SB.SuperReg, Idx: SB.SplitParts[i])); |
2212 | |
2213 | bool LastSubReg = (i + 1 == e); |
2214 | auto MIB = BuildMI(BB&: *SB.MBB, I: MI, MIMD: SB.DL, |
2215 | MCID: SB.TII.get(Opcode: AMDGPU::SI_RESTORE_S32_FROM_VGPR), DestReg: SubReg) |
2216 | .addReg(RegNo: SB.TmpVGPR, flags: getKillRegState(B: LastSubReg)) |
2217 | .addImm(Val: i); |
2218 | if (SB.NumSubRegs > 1 && i == 0) |
2219 | MIB.addReg(RegNo: SB.SuperReg, flags: RegState::ImplicitDefine); |
2220 | if (Indexes) { |
2221 | if (i == e - 1) |
2222 | Indexes->replaceMachineInstrInMaps(MI&: *MI, NewMI&: *MIB); |
2223 | else |
2224 | Indexes->insertMachineInstrInMaps(MI&: *MIB); |
2225 | } |
2226 | } |
2227 | } |
2228 | |
2229 | SB.restore(); |
2230 | } |
2231 | |
2232 | MI->eraseFromParent(); |
2233 | |
2234 | if (LIS) |
2235 | LIS->removeAllRegUnitsForPhysReg(Reg: SB.SuperReg); |
2236 | |
2237 | return true; |
2238 | } |
2239 | |
2240 | bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, |
2241 | MachineBasicBlock &RestoreMBB, |
2242 | Register SGPR, RegScavenger *RS) const { |
2243 | SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0, |
2244 | RS); |
2245 | SB.prepare(); |
2246 | // Generate the spill of SGPR to SB.TmpVGPR. |
2247 | unsigned SubKillState = getKillRegState(B: (SB.NumSubRegs == 1) && SB.IsKill); |
2248 | auto PVD = SB.getPerVGPRData(); |
2249 | for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { |
2250 | unsigned TmpVGPRFlags = RegState::Undef; |
2251 | // Write sub registers into the VGPR |
2252 | for (unsigned i = Offset * PVD.PerVGPR, |
2253 | e = std::min(a: (Offset + 1) * PVD.PerVGPR, b: SB.NumSubRegs); |
2254 | i < e; ++i) { |
2255 | Register SubReg = |
2256 | SB.NumSubRegs == 1 |
2257 | ? SB.SuperReg |
2258 | : Register(getSubReg(Reg: SB.SuperReg, Idx: SB.SplitParts[i])); |
2259 | |
2260 | MachineInstrBuilder WriteLane = |
2261 | BuildMI(BB&: *SB.MBB, I: MI, MIMD: SB.DL, MCID: SB.TII.get(Opcode: AMDGPU::V_WRITELANE_B32), |
2262 | DestReg: SB.TmpVGPR) |
2263 | .addReg(RegNo: SubReg, flags: SubKillState) |
2264 | .addImm(Val: i % PVD.PerVGPR) |
2265 | .addReg(RegNo: SB.TmpVGPR, flags: TmpVGPRFlags); |
2266 | TmpVGPRFlags = 0; |
2267 | // There could be undef components of a spilled super register. |
2268 | // TODO: Can we detect this and skip the spill? |
2269 | if (SB.NumSubRegs > 1) { |
2270 | // The last implicit use of the SB.SuperReg carries the "Kill" flag. |
2271 | unsigned SuperKillState = 0; |
2272 | if (i + 1 == SB.NumSubRegs) |
2273 | SuperKillState |= getKillRegState(B: SB.IsKill); |
2274 | WriteLane.addReg(RegNo: SB.SuperReg, flags: RegState::Implicit | SuperKillState); |
2275 | } |
2276 | } |
2277 | // Don't need to write VGPR out. |
2278 | } |
2279 | |
2280 | MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); |
2281 | |
2282 | // Restore clobbered registers in the specified restore block. |
2283 | MI = RestoreMBB.end(); |
2284 | SB.setMI(NewMBB: &RestoreMBB, NewMI: MI); |
2285 | // Generate the restore of SGPR from SB.TmpVGPR. |
2286 | for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { |
2287 | // Don't need to load VGPR in. |
2288 | // Unpack lanes |
2289 | for (unsigned i = Offset * PVD.PerVGPR, |
2290 | e = std::min(a: (Offset + 1) * PVD.PerVGPR, b: SB.NumSubRegs); |
2291 | i < e; ++i) { |
2292 | Register SubReg = |
2293 | SB.NumSubRegs == 1 |
2294 | ? SB.SuperReg |
2295 | : Register(getSubReg(Reg: SB.SuperReg, Idx: SB.SplitParts[i])); |
2296 | MRI.constrainRegClass(Reg: SubReg, RC: &AMDGPU::SReg_32_XM0RegClass); |
2297 | bool LastSubReg = (i + 1 == e); |
2298 | auto MIB = BuildMI(BB&: *SB.MBB, I: MI, MIMD: SB.DL, MCID: SB.TII.get(Opcode: AMDGPU::V_READLANE_B32), |
2299 | DestReg: SubReg) |
2300 | .addReg(RegNo: SB.TmpVGPR, flags: getKillRegState(B: LastSubReg)) |
2301 | .addImm(Val: i); |
2302 | if (SB.NumSubRegs > 1 && i == 0) |
2303 | MIB.addReg(RegNo: SB.SuperReg, flags: RegState::ImplicitDefine); |
2304 | } |
2305 | } |
2306 | SB.restore(); |
2307 | |
2308 | SB.MFI.addToSpilledSGPRs(num: SB.NumSubRegs); |
2309 | return false; |
2310 | } |
2311 | |
2312 | /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to |
2313 | /// a VGPR and the stack slot can be safely eliminated when all other users are |
2314 | /// handled. |
2315 | bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( |
2316 | MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, |
2317 | SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const { |
2318 | switch (MI->getOpcode()) { |
2319 | case AMDGPU::SI_SPILL_S1024_SAVE: |
2320 | case AMDGPU::SI_SPILL_S512_SAVE: |
2321 | case AMDGPU::SI_SPILL_S384_SAVE: |
2322 | case AMDGPU::SI_SPILL_S352_SAVE: |
2323 | case AMDGPU::SI_SPILL_S320_SAVE: |
2324 | case AMDGPU::SI_SPILL_S288_SAVE: |
2325 | case AMDGPU::SI_SPILL_S256_SAVE: |
2326 | case AMDGPU::SI_SPILL_S224_SAVE: |
2327 | case AMDGPU::SI_SPILL_S192_SAVE: |
2328 | case AMDGPU::SI_SPILL_S160_SAVE: |
2329 | case AMDGPU::SI_SPILL_S128_SAVE: |
2330 | case AMDGPU::SI_SPILL_S96_SAVE: |
2331 | case AMDGPU::SI_SPILL_S64_SAVE: |
2332 | case AMDGPU::SI_SPILL_S32_SAVE: |
2333 | return spillSGPR(MI, Index: FI, RS, Indexes, LIS, OnlyToVGPR: true, SpillToPhysVGPRLane); |
2334 | case AMDGPU::SI_SPILL_S1024_RESTORE: |
2335 | case AMDGPU::SI_SPILL_S512_RESTORE: |
2336 | case AMDGPU::SI_SPILL_S384_RESTORE: |
2337 | case AMDGPU::SI_SPILL_S352_RESTORE: |
2338 | case AMDGPU::SI_SPILL_S320_RESTORE: |
2339 | case AMDGPU::SI_SPILL_S288_RESTORE: |
2340 | case AMDGPU::SI_SPILL_S256_RESTORE: |
2341 | case AMDGPU::SI_SPILL_S224_RESTORE: |
2342 | case AMDGPU::SI_SPILL_S192_RESTORE: |
2343 | case AMDGPU::SI_SPILL_S160_RESTORE: |
2344 | case AMDGPU::SI_SPILL_S128_RESTORE: |
2345 | case AMDGPU::SI_SPILL_S96_RESTORE: |
2346 | case AMDGPU::SI_SPILL_S64_RESTORE: |
2347 | case AMDGPU::SI_SPILL_S32_RESTORE: |
2348 | return restoreSGPR(MI, Index: FI, RS, Indexes, LIS, OnlyToVGPR: true, SpillToPhysVGPRLane); |
2349 | default: |
2350 | llvm_unreachable("not an SGPR spill instruction" ); |
2351 | } |
2352 | } |
2353 | |
2354 | bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, |
2355 | int SPAdj, unsigned FIOperandNum, |
2356 | RegScavenger *RS) const { |
2357 | MachineFunction *MF = MI->getParent()->getParent(); |
2358 | MachineBasicBlock *MBB = MI->getParent(); |
2359 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
2360 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); |
2361 | const SIInstrInfo *TII = ST.getInstrInfo(); |
2362 | const DebugLoc &DL = MI->getDebugLoc(); |
2363 | |
assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2365 | |
2366 | assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) && |
2367 | "unreserved scratch RSRC register" ); |
2368 | |
2369 | MachineOperand *FIOp = &MI->getOperand(i: FIOperandNum); |
2370 | int Index = MI->getOperand(i: FIOperandNum).getIndex(); |
2371 | |
2372 | Register FrameReg = FrameInfo.isFixedObjectIndex(ObjectIdx: Index) && hasBasePointer(MF: *MF) |
2373 | ? getBaseRegister() |
2374 | : getFrameRegister(MF: *MF); |
2375 | |
2376 | switch (MI->getOpcode()) { |
2377 | // SGPR register spill |
2378 | case AMDGPU::SI_SPILL_S1024_SAVE: |
2379 | case AMDGPU::SI_SPILL_S512_SAVE: |
2380 | case AMDGPU::SI_SPILL_S384_SAVE: |
2381 | case AMDGPU::SI_SPILL_S352_SAVE: |
2382 | case AMDGPU::SI_SPILL_S320_SAVE: |
2383 | case AMDGPU::SI_SPILL_S288_SAVE: |
2384 | case AMDGPU::SI_SPILL_S256_SAVE: |
2385 | case AMDGPU::SI_SPILL_S224_SAVE: |
2386 | case AMDGPU::SI_SPILL_S192_SAVE: |
2387 | case AMDGPU::SI_SPILL_S160_SAVE: |
2388 | case AMDGPU::SI_SPILL_S128_SAVE: |
2389 | case AMDGPU::SI_SPILL_S96_SAVE: |
2390 | case AMDGPU::SI_SPILL_S64_SAVE: |
2391 | case AMDGPU::SI_SPILL_S32_SAVE: { |
2392 | return spillSGPR(MI, Index, RS); |
2393 | } |
2394 | |
2395 | // SGPR register restore |
2396 | case AMDGPU::SI_SPILL_S1024_RESTORE: |
2397 | case AMDGPU::SI_SPILL_S512_RESTORE: |
2398 | case AMDGPU::SI_SPILL_S384_RESTORE: |
2399 | case AMDGPU::SI_SPILL_S352_RESTORE: |
2400 | case AMDGPU::SI_SPILL_S320_RESTORE: |
2401 | case AMDGPU::SI_SPILL_S288_RESTORE: |
2402 | case AMDGPU::SI_SPILL_S256_RESTORE: |
2403 | case AMDGPU::SI_SPILL_S224_RESTORE: |
2404 | case AMDGPU::SI_SPILL_S192_RESTORE: |
2405 | case AMDGPU::SI_SPILL_S160_RESTORE: |
2406 | case AMDGPU::SI_SPILL_S128_RESTORE: |
2407 | case AMDGPU::SI_SPILL_S96_RESTORE: |
2408 | case AMDGPU::SI_SPILL_S64_RESTORE: |
2409 | case AMDGPU::SI_SPILL_S32_RESTORE: { |
2410 | return restoreSGPR(MI, Index, RS); |
2411 | } |
2412 | |
2413 | // VGPR register spill |
2414 | case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: { |
2415 | // Put mask into M0. |
2416 | BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), |
2417 | DestReg: AMDGPU::M0) |
2418 | .add(MO: *TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::mask)); |
2419 | LLVM_FALLTHROUGH; |
2420 | } |
2421 | case AMDGPU::SI_SPILL_V1024_SAVE: |
2422 | case AMDGPU::SI_SPILL_V512_SAVE: |
2423 | case AMDGPU::SI_SPILL_V384_SAVE: |
2424 | case AMDGPU::SI_SPILL_V352_SAVE: |
2425 | case AMDGPU::SI_SPILL_V320_SAVE: |
2426 | case AMDGPU::SI_SPILL_V288_SAVE: |
2427 | case AMDGPU::SI_SPILL_V256_SAVE: |
2428 | case AMDGPU::SI_SPILL_V224_SAVE: |
2429 | case AMDGPU::SI_SPILL_V192_SAVE: |
2430 | case AMDGPU::SI_SPILL_V160_SAVE: |
2431 | case AMDGPU::SI_SPILL_V128_SAVE: |
2432 | case AMDGPU::SI_SPILL_V96_SAVE: |
2433 | case AMDGPU::SI_SPILL_V64_SAVE: |
2434 | case AMDGPU::SI_SPILL_V32_SAVE: |
2435 | case AMDGPU::SI_SPILL_V16_SAVE: |
2436 | case AMDGPU::SI_SPILL_A1024_SAVE: |
2437 | case AMDGPU::SI_SPILL_A512_SAVE: |
2438 | case AMDGPU::SI_SPILL_A384_SAVE: |
2439 | case AMDGPU::SI_SPILL_A352_SAVE: |
2440 | case AMDGPU::SI_SPILL_A320_SAVE: |
2441 | case AMDGPU::SI_SPILL_A288_SAVE: |
2442 | case AMDGPU::SI_SPILL_A256_SAVE: |
2443 | case AMDGPU::SI_SPILL_A224_SAVE: |
2444 | case AMDGPU::SI_SPILL_A192_SAVE: |
2445 | case AMDGPU::SI_SPILL_A160_SAVE: |
2446 | case AMDGPU::SI_SPILL_A128_SAVE: |
2447 | case AMDGPU::SI_SPILL_A96_SAVE: |
2448 | case AMDGPU::SI_SPILL_A64_SAVE: |
2449 | case AMDGPU::SI_SPILL_A32_SAVE: |
2450 | case AMDGPU::SI_SPILL_AV1024_SAVE: |
2451 | case AMDGPU::SI_SPILL_AV512_SAVE: |
2452 | case AMDGPU::SI_SPILL_AV384_SAVE: |
2453 | case AMDGPU::SI_SPILL_AV352_SAVE: |
2454 | case AMDGPU::SI_SPILL_AV320_SAVE: |
2455 | case AMDGPU::SI_SPILL_AV288_SAVE: |
2456 | case AMDGPU::SI_SPILL_AV256_SAVE: |
2457 | case AMDGPU::SI_SPILL_AV224_SAVE: |
2458 | case AMDGPU::SI_SPILL_AV192_SAVE: |
2459 | case AMDGPU::SI_SPILL_AV160_SAVE: |
2460 | case AMDGPU::SI_SPILL_AV128_SAVE: |
2461 | case AMDGPU::SI_SPILL_AV96_SAVE: |
2462 | case AMDGPU::SI_SPILL_AV64_SAVE: |
2463 | case AMDGPU::SI_SPILL_AV32_SAVE: |
2464 | case AMDGPU::SI_SPILL_WWM_V32_SAVE: |
2465 | case AMDGPU::SI_SPILL_WWM_AV32_SAVE: { |
2466 | const MachineOperand *VData = TII->getNamedOperand(MI&: *MI, |
2467 | OperandName: AMDGPU::OpName::vdata); |
2468 | if (VData->isUndef()) { |
2469 | MI->eraseFromParent(); |
2470 | return true; |
2471 | } |
2472 | |
2473 | assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == |
2474 | MFI->getStackPtrOffsetReg()); |
2475 | |
2476 | unsigned Opc; |
2477 | if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) { |
assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
2479 | Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16; |
2480 | } else { |
2481 | Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE |
2482 | ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR |
2483 | : ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR |
2484 | : AMDGPU::BUFFER_STORE_DWORD_OFFSET; |
2485 | } |
2486 | |
2487 | auto *MBB = MI->getParent(); |
2488 | bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(Opcode: MI->getOpcode()); |
2489 | if (IsWWMRegSpill) { |
2490 | TII->insertScratchExecCopy(MF&: *MF, MBB&: *MBB, MBBI: MI, DL, Reg: MFI->getSGPRForEXECCopy(), |
2491 | IsSCCLive: RS->isRegUsed(Reg: AMDGPU::SCC)); |
2492 | } |
2493 | buildSpillLoadStore( |
2494 | MBB&: *MBB, MI, DL, LoadStoreOp: Opc, Index, ValueReg: VData->getReg(), IsKill: VData->isKill(), ScratchOffsetReg: FrameReg, |
2495 | InstOffset: TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset)->getImm(), |
2496 | MMO: *MI->memoperands_begin(), RS); |
2497 | MFI->addToSpilledVGPRs(num: getNumSubRegsForSpillOp(MI: *MI, TII)); |
2498 | if (IsWWMRegSpill) |
2499 | TII->restoreExec(MF&: *MF, MBB&: *MBB, MBBI: MI, DL, Reg: MFI->getSGPRForEXECCopy()); |
2500 | |
2501 | MI->eraseFromParent(); |
2502 | return true; |
2503 | } |
2504 | case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: { |
2505 | // Put mask into M0. |
2506 | BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), |
2507 | DestReg: AMDGPU::M0) |
2508 | .add(MO: *TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::mask)); |
2509 | LLVM_FALLTHROUGH; |
2510 | } |
2511 | case AMDGPU::SI_SPILL_V16_RESTORE: |
2512 | case AMDGPU::SI_SPILL_V32_RESTORE: |
2513 | case AMDGPU::SI_SPILL_V64_RESTORE: |
2514 | case AMDGPU::SI_SPILL_V96_RESTORE: |
2515 | case AMDGPU::SI_SPILL_V128_RESTORE: |
2516 | case AMDGPU::SI_SPILL_V160_RESTORE: |
2517 | case AMDGPU::SI_SPILL_V192_RESTORE: |
2518 | case AMDGPU::SI_SPILL_V224_RESTORE: |
2519 | case AMDGPU::SI_SPILL_V256_RESTORE: |
2520 | case AMDGPU::SI_SPILL_V288_RESTORE: |
2521 | case AMDGPU::SI_SPILL_V320_RESTORE: |
2522 | case AMDGPU::SI_SPILL_V352_RESTORE: |
2523 | case AMDGPU::SI_SPILL_V384_RESTORE: |
2524 | case AMDGPU::SI_SPILL_V512_RESTORE: |
2525 | case AMDGPU::SI_SPILL_V1024_RESTORE: |
2526 | case AMDGPU::SI_SPILL_A32_RESTORE: |
2527 | case AMDGPU::SI_SPILL_A64_RESTORE: |
2528 | case AMDGPU::SI_SPILL_A96_RESTORE: |
2529 | case AMDGPU::SI_SPILL_A128_RESTORE: |
2530 | case AMDGPU::SI_SPILL_A160_RESTORE: |
2531 | case AMDGPU::SI_SPILL_A192_RESTORE: |
2532 | case AMDGPU::SI_SPILL_A224_RESTORE: |
2533 | case AMDGPU::SI_SPILL_A256_RESTORE: |
2534 | case AMDGPU::SI_SPILL_A288_RESTORE: |
2535 | case AMDGPU::SI_SPILL_A320_RESTORE: |
2536 | case AMDGPU::SI_SPILL_A352_RESTORE: |
2537 | case AMDGPU::SI_SPILL_A384_RESTORE: |
2538 | case AMDGPU::SI_SPILL_A512_RESTORE: |
2539 | case AMDGPU::SI_SPILL_A1024_RESTORE: |
2540 | case AMDGPU::SI_SPILL_AV32_RESTORE: |
2541 | case AMDGPU::SI_SPILL_AV64_RESTORE: |
2542 | case AMDGPU::SI_SPILL_AV96_RESTORE: |
2543 | case AMDGPU::SI_SPILL_AV128_RESTORE: |
2544 | case AMDGPU::SI_SPILL_AV160_RESTORE: |
2545 | case AMDGPU::SI_SPILL_AV192_RESTORE: |
2546 | case AMDGPU::SI_SPILL_AV224_RESTORE: |
2547 | case AMDGPU::SI_SPILL_AV256_RESTORE: |
2548 | case AMDGPU::SI_SPILL_AV288_RESTORE: |
2549 | case AMDGPU::SI_SPILL_AV320_RESTORE: |
2550 | case AMDGPU::SI_SPILL_AV352_RESTORE: |
2551 | case AMDGPU::SI_SPILL_AV384_RESTORE: |
2552 | case AMDGPU::SI_SPILL_AV512_RESTORE: |
2553 | case AMDGPU::SI_SPILL_AV1024_RESTORE: |
2554 | case AMDGPU::SI_SPILL_WWM_V32_RESTORE: |
2555 | case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: { |
2556 | const MachineOperand *VData = TII->getNamedOperand(MI&: *MI, |
2557 | OperandName: AMDGPU::OpName::vdata); |
2558 | assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == |
2559 | MFI->getStackPtrOffsetReg()); |
2560 | |
2561 | unsigned Opc; |
2562 | if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) { |
assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
2564 | Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16; |
2565 | } else { |
2566 | Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE |
2567 | ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR |
2568 | : ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR |
2569 | : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; |
2570 | } |
2571 | |
2572 | auto *MBB = MI->getParent(); |
2573 | bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(Opcode: MI->getOpcode()); |
2574 | if (IsWWMRegSpill) { |
2575 | TII->insertScratchExecCopy(MF&: *MF, MBB&: *MBB, MBBI: MI, DL, Reg: MFI->getSGPRForEXECCopy(), |
2576 | IsSCCLive: RS->isRegUsed(Reg: AMDGPU::SCC)); |
2577 | } |
2578 | |
2579 | buildSpillLoadStore( |
2580 | MBB&: *MBB, MI, DL, LoadStoreOp: Opc, Index, ValueReg: VData->getReg(), IsKill: VData->isKill(), ScratchOffsetReg: FrameReg, |
2581 | InstOffset: TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset)->getImm(), |
2582 | MMO: *MI->memoperands_begin(), RS); |
2583 | |
2584 | if (IsWWMRegSpill) |
2585 | TII->restoreExec(MF&: *MF, MBB&: *MBB, MBBI: MI, DL, Reg: MFI->getSGPRForEXECCopy()); |
2586 | |
2587 | MI->eraseFromParent(); |
2588 | return true; |
2589 | } |
2590 | case AMDGPU::V_ADD_U32_e32: |
2591 | case AMDGPU::V_ADD_U32_e64: |
2592 | case AMDGPU::V_ADD_CO_U32_e32: |
2593 | case AMDGPU::V_ADD_CO_U32_e64: { |
2594 | // TODO: Handle sub, and, or. |
2595 | unsigned NumDefs = MI->getNumExplicitDefs(); |
2596 | unsigned Src0Idx = NumDefs; |
2597 | |
2598 | bool HasClamp = false; |
2599 | MachineOperand *VCCOp = nullptr; |
2600 | |
2601 | switch (MI->getOpcode()) { |
2602 | case AMDGPU::V_ADD_U32_e32: |
2603 | break; |
2604 | case AMDGPU::V_ADD_U32_e64: |
2605 | HasClamp = MI->getOperand(i: 3).getImm(); |
2606 | break; |
2607 | case AMDGPU::V_ADD_CO_U32_e32: |
2608 | VCCOp = &MI->getOperand(i: 3); |
2609 | break; |
2610 | case AMDGPU::V_ADD_CO_U32_e64: |
2611 | VCCOp = &MI->getOperand(i: 1); |
2612 | HasClamp = MI->getOperand(i: 4).getImm(); |
2613 | break; |
2614 | default: |
2615 | break; |
2616 | } |
2617 | bool DeadVCC = !VCCOp || VCCOp->isDead(); |
2618 | MachineOperand &DstOp = MI->getOperand(i: 0); |
2619 | Register DstReg = DstOp.getReg(); |
2620 | |
2621 | unsigned OtherOpIdx = |
2622 | FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx; |
2623 | MachineOperand *OtherOp = &MI->getOperand(i: OtherOpIdx); |
2624 | |
2625 | unsigned Src1Idx = Src0Idx + 1; |
2626 | Register MaterializedReg = FrameReg; |
2627 | Register ScavengedVGPR; |
2628 | |
2629 | int64_t Offset = FrameInfo.getObjectOffset(ObjectIdx: Index); |
2630 | // For the non-immediate case, we could fall through to the default |
2631 | // handling, but we do an in-place update of the result register here to |
2632 | // avoid scavenging another register. |
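     |     // Roughly (names illustrative): `%dst = V_ADD_U32_e32 %stack.0, 16` |
     |     // becomes an add of 16 + offset(%stack.0), with the frame-index operand |
     |     // rewritten below to the materialized base or to a plain immediate. |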
2633 | if (OtherOp->isImm()) { |
2634 | int64_t TotalOffset = OtherOp->getImm() + Offset; |
2635 | |
2636 | if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(MI: *MI) && |
2637 | !AMDGPU::isInlinableIntLiteral(Literal: TotalOffset)) { |
2638 | // If we can't support a VOP3 literal in the VALU instruction, we |
2639 | // can't specially fold into the add. |
2640 | // TODO: Handle VOP3->VOP2 shrink to support the fold. |
2641 | break; |
2642 | } |
2643 | |
2644 | OtherOp->setImm(TotalOffset); |
2645 | Offset = 0; |
2646 | } |
2647 | |
2648 | if (FrameReg && !ST.enableFlatScratch()) { |
2649 | // We should just do an in-place update of the result register. However, |
2650 | // the value there may also be used by the add, in which case we need a |
2651 | // temporary register. |
2652 | // |
2653 | // FIXME: The scavenger is not finding the result register in the |
2654 | // common case where the add does not read the register. |
2655 | |
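     |       // Without flat scratch the frame register holds a swizzled (wave-space) |
     |       // address; shift it right by log2(wave size) below to form the per-lane |
     |       // base the VALU add expects. |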
2656 | ScavengedVGPR = RS->scavengeRegisterBackwards( |
2657 | RC: AMDGPU::VGPR_32RegClass, To: MI, /*RestoreAfter=*/false, /*SPAdj=*/0); |
2658 | |
2659 | // TODO: If we have a free SGPR, it's sometimes better to use a scalar |
2660 | // shift. |
2661 | BuildMI(BB&: *MBB, I&: *MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_LSHRREV_B32_e64)) |
2662 | .addDef(RegNo: ScavengedVGPR, Flags: RegState::Renamable) |
2663 | .addImm(Val: ST.getWavefrontSizeLog2()) |
2664 | .addReg(RegNo: FrameReg); |
2665 | MaterializedReg = ScavengedVGPR; |
2666 | } |
2667 | |
2668 | if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) { |
2669 | if (ST.enableFlatScratch() && |
2670 | !TII->isOperandLegal(MI: *MI, OpIdx: Src1Idx, MO: OtherOp)) { |
2671 |         // We didn't need the shift above, so we have an SGPR for the frame |
2672 |         // register, but may have a VGPR-only operand. |
2673 | // |
2674 | // TODO: On gfx10+, we can easily change the opcode to the e64 version |
2675 | // and use the higher constant bus restriction to avoid this copy. |
2676 | |
2677 | if (!ScavengedVGPR) { |
2678 | ScavengedVGPR = RS->scavengeRegisterBackwards( |
2679 | RC: AMDGPU::VGPR_32RegClass, To: MI, /*RestoreAfter=*/false, |
2680 | /*SPAdj=*/0); |
2681 | } |
2682 | |
2683 | assert(ScavengedVGPR != DstReg); |
2684 | |
2685 | BuildMI(BB&: *MBB, I&: *MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ScavengedVGPR) |
2686 | .addReg(RegNo: MaterializedReg, |
2687 | flags: MaterializedReg != FrameReg ? RegState::Kill : 0); |
2688 | MaterializedReg = ScavengedVGPR; |
2689 | } |
2690 | |
2691 | // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC |
2692 | // is not live, we could use a scalar add + vector add instead of 2 |
2693 | // vector adds. |
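     |       // That alternative would look roughly like: |
     |       //   s_add_i32 %sgpr, FrameReg, <other SGPR operand> |
     |       //   v_add_u32 %dst, Offset, %sgpr |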
2694 | auto AddI32 = BuildMI(BB&: *MBB, I&: *MI, MIMD: DL, MCID: TII->get(Opcode: MI->getOpcode())) |
2695 | .addDef(RegNo: DstReg, Flags: RegState::Renamable); |
2696 | if (NumDefs == 2) |
2697 | AddI32.add(MO: MI->getOperand(i: 1)); |
2698 | |
2699 | unsigned MaterializedRegFlags = |
2700 | MaterializedReg != FrameReg ? RegState::Kill : 0; |
2701 | |
2702 | if (isVGPRClass(RC: getPhysRegBaseClass(Reg: MaterializedReg))) { |
2703 | // If we know we have a VGPR already, it's more likely the other |
2704 | // operand is a legal vsrc0. |
2705 | AddI32 |
2706 | .add(MO: *OtherOp) |
2707 | .addReg(RegNo: MaterializedReg, flags: MaterializedRegFlags); |
2708 | } else { |
2709 | // Commute operands to avoid violating VOP2 restrictions. This will |
2710 | // typically happen when using scratch. |
2711 | AddI32 |
2712 | .addReg(RegNo: MaterializedReg, flags: MaterializedRegFlags) |
2713 | .add(MO: *OtherOp); |
2714 | } |
2715 | |
2716 | if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 || |
2717 | MI->getOpcode() == AMDGPU::V_ADD_U32_e64) |
2718 | AddI32.addImm(Val: 0); // clamp |
2719 | |
2720 | if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32) |
2721 | AddI32.setOperandDead(3); // Dead vcc |
2722 | |
2723 | MaterializedReg = DstReg; |
2724 | |
2725 | OtherOp->ChangeToRegister(Reg: MaterializedReg, isDef: false); |
2726 | OtherOp->setIsKill(true); |
2727 | FIOp->ChangeToImmediate(ImmVal: Offset); |
2728 | Offset = 0; |
2729 | } else if (Offset != 0) { |
2730 | assert(!MaterializedReg); |
2731 | FIOp->ChangeToImmediate(ImmVal: Offset); |
2732 | Offset = 0; |
2733 | } else { |
2734 | if (DeadVCC && !HasClamp) { |
2735 | assert(Offset == 0); |
2736 | |
2737 | // TODO: Losing kills and implicit operands. Just mutate to copy and |
2738 | // let lowerCopy deal with it? |
2739 | if (OtherOp->isReg() && OtherOp->getReg() == DstReg) { |
2740 | // Folded to an identity copy. |
2741 | MI->eraseFromParent(); |
2742 | return true; |
2743 | } |
2744 | |
2745 |         // The immediate value should be in OtherOp. |
2746 | MI->setDesc(TII->get(Opcode: AMDGPU::V_MOV_B32_e32)); |
2747 | MI->removeOperand(OpNo: FIOperandNum); |
2748 | |
2749 | unsigned NumOps = MI->getNumOperands(); |
2750 | for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I) |
2751 | MI->removeOperand(OpNo: I); |
2752 | |
2753 | if (NumDefs == 2) |
2754 | MI->removeOperand(OpNo: 1); |
2755 | |
2756 | // The code below can't deal with a mov. |
2757 | return true; |
2758 | } |
2759 | |
2760 |       // This folded to a constant, but we have to keep the add around for |
2761 |       // its pointless implicit defs or clamp modifier. |
2762 | FIOp->ChangeToImmediate(ImmVal: 0); |
2763 | } |
2764 | |
2765 | // Try to improve legality by commuting. |
2766 | if (!TII->isOperandLegal(MI: *MI, OpIdx: Src1Idx) && TII->commuteInstruction(MI&: *MI)) { |
2767 | std::swap(a&: FIOp, b&: OtherOp); |
2768 | std::swap(a&: FIOperandNum, b&: OtherOpIdx); |
2769 | } |
2770 | |
2771 | // We need at most one mov to satisfy the operand constraints. Prefer to |
2772 | // move the FI operand first, as it may be a literal in a VOP3 |
2773 | // instruction. |
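     |     // e.g. if the folded frame offset is still an illegal literal here |
     |     // (registers illustrative): |
     |     //   v_mov_b32 %scavenged, <frame offset> |
     |     // and the operand is rewritten to read %scavenged instead. |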
2774 | for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) { |
2775 | if (!TII->isOperandLegal(MI: *MI, OpIdx: SrcIdx)) { |
2776 |         // If commuting didn't make the operands legal, we need to materialize |
2777 |         // the value in a register. |
2778 | // TODO: Can use SGPR on gfx10+ in some cases. |
2779 | if (!ScavengedVGPR) { |
2780 | ScavengedVGPR = RS->scavengeRegisterBackwards( |
2781 | RC: AMDGPU::VGPR_32RegClass, To: MI, /*RestoreAfter=*/false, |
2782 | /*SPAdj=*/0); |
2783 | } |
2784 | |
2785 | assert(ScavengedVGPR != DstReg); |
2786 | |
2787 | MachineOperand &Src = MI->getOperand(i: SrcIdx); |
2788 | BuildMI(BB&: *MBB, I&: *MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ScavengedVGPR) |
2789 | .add(MO: Src); |
2790 | |
2791 | Src.ChangeToRegister(Reg: ScavengedVGPR, isDef: false); |
2792 | Src.setIsKill(true); |
2793 | break; |
2794 | } |
2795 | } |
2796 | |
2797 |     // Fold out the add-of-0 case that can appear in kernels. |
2798 | if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) { |
2799 | if (OtherOp->isReg() && OtherOp->getReg() != DstReg) { |
2800 | BuildMI(BB&: *MBB, I&: *MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: DstReg).add(MO: *OtherOp); |
2801 | } |
2802 | |
2803 | MI->eraseFromParent(); |
2804 | } |
2805 | |
2806 | return true; |
2807 | } |
2808 | case AMDGPU::S_ADD_I32: |
2809 | case AMDGPU::S_ADD_U32: { |
2810 | // TODO: Handle s_or_b32, s_and_b32. |
2811 | unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1; |
2812 | MachineOperand &OtherOp = MI->getOperand(i: OtherOpIdx); |
2813 | |
2814 | assert(FrameReg || MFI->isBottomOfStack()); |
2815 | |
2816 | MachineOperand &DstOp = MI->getOperand(i: 0); |
2817 | const DebugLoc &DL = MI->getDebugLoc(); |
2818 | Register MaterializedReg = FrameReg; |
2819 | |
2820 |     // Defend against a live SCC, which should never happen in practice. |
2821 | bool DeadSCC = MI->getOperand(i: 3).isDead(); |
2822 | |
2823 | Register TmpReg; |
2824 | |
2825 | // FIXME: Scavenger should figure out that the result register is |
2826 | // available. Also should do this for the v_add case. |
2827 | if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg()) |
2828 | TmpReg = DstOp.getReg(); |
2829 | |
2830 | if (FrameReg && !ST.enableFlatScratch()) { |
2831 | // FIXME: In the common case where the add does not also read its result |
2832 | // (i.e. this isn't a reg += fi), it's not finding the dest reg as |
2833 | // available. |
2834 | if (!TmpReg) |
2835 | TmpReg = RS->scavengeRegisterBackwards(RC: AMDGPU::SReg_32_XM0RegClass, |
2836 | To: MI, /*RestoreAfter=*/false, SPAdj: 0, |
2837 | /*AllowSpill=*/false); |
2838 | if (TmpReg) { |
2839 | BuildMI(BB&: *MBB, I&: *MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_LSHR_B32)) |
2840 | .addDef(RegNo: TmpReg, Flags: RegState::Renamable) |
2841 | .addReg(RegNo: FrameReg) |
2842 | .addImm(Val: ST.getWavefrontSizeLog2()) |
2843 | .setOperandDead(3); // Set SCC dead |
2844 | } |
2845 | MaterializedReg = TmpReg; |
2846 | } |
2847 | |
2848 | int64_t Offset = FrameInfo.getObjectOffset(ObjectIdx: Index); |
2849 | |
2850 | // For the non-immediate case, we could fall through to the default |
2851 | // handling, but we do an in-place update of the result register here to |
2852 | // avoid scavenging another register. |
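     |     // Roughly (names illustrative): `%dst = S_ADD_I32 %stack.0, 16` becomes |
     |     //   s_lshr_b32 %tmp, FrameReg, log2(wave size)  ; only without flat scratch |
     |     //   s_add_i32  %dst, %tmp, 16 + offset(%stack.0) |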
2853 | if (OtherOp.isImm()) { |
2854 | OtherOp.setImm(OtherOp.getImm() + Offset); |
2855 | Offset = 0; |
2856 | |
2857 | if (MaterializedReg) |
2858 | FIOp->ChangeToRegister(Reg: MaterializedReg, isDef: false); |
2859 | else |
2860 | FIOp->ChangeToImmediate(ImmVal: 0); |
2861 | } else if (MaterializedReg) { |
2862 | // If we can't fold the other operand, do another increment. |
2863 | Register DstReg = DstOp.getReg(); |
2864 | |
2865 | if (!TmpReg && MaterializedReg == FrameReg) { |
2866 | TmpReg = RS->scavengeRegisterBackwards(RC: AMDGPU::SReg_32_XM0RegClass, |
2867 | To: MI, /*RestoreAfter=*/false, SPAdj: 0, |
2868 | /*AllowSpill=*/false); |
2869 | DstReg = TmpReg; |
2870 | } |
2871 | |
2872 | if (TmpReg) { |
2873 | auto AddI32 = BuildMI(BB&: *MBB, I&: *MI, MIMD: DL, MCID: MI->getDesc()) |
2874 | .addDef(RegNo: DstReg, Flags: RegState::Renamable) |
2875 | .addReg(RegNo: MaterializedReg, flags: RegState::Kill) |
2876 | .add(MO: OtherOp); |
2877 | if (DeadSCC) |
2878 | AddI32.setOperandDead(3); |
2879 | |
2880 | MaterializedReg = DstReg; |
2881 | |
2882 | OtherOp.ChangeToRegister(Reg: MaterializedReg, isDef: false); |
2883 | OtherOp.setIsKill(true); |
2884 | OtherOp.setIsRenamable(true); |
2885 | } |
2886 | FIOp->ChangeToImmediate(ImmVal: Offset); |
2887 | } else { |
2888 | // If we don't have any other offset to apply, we can just directly |
2889 | // interpret the frame index as the offset. |
2890 | FIOp->ChangeToImmediate(ImmVal: Offset); |
2891 | } |
2892 | |
2893 | if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) { |
2894 | assert(Offset == 0); |
2895 | MI->removeOperand(OpNo: 3); |
2896 | MI->removeOperand(OpNo: OtherOpIdx); |
2897 | MI->setDesc(TII->get(Opcode: FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); |
2898 | } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) { |
2899 | assert(Offset == 0); |
2900 | MI->removeOperand(OpNo: 3); |
2901 | MI->removeOperand(OpNo: FIOperandNum); |
2902 | MI->setDesc( |
2903 | TII->get(Opcode: OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); |
2904 | } |
2905 | |
2906 | assert(!FIOp->isFI()); |
2907 | return true; |
2908 | } |
2909 | default: { |
2910 | break; |
2911 | } |
2912 | } |
2913 | |
2914 | int64_t Offset = FrameInfo.getObjectOffset(ObjectIdx: Index); |
2915 | if (ST.enableFlatScratch()) { |
2916 | if (TII->isFLATScratch(MI: *MI)) { |
2917 | assert( |
2918 | (int16_t)FIOperandNum == |
2919 | AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr)); |
2920 | |
2921 |       // The offset is always swizzled, so just replace it. |
2922 | if (FrameReg) |
2923 | FIOp->ChangeToRegister(Reg: FrameReg, isDef: false); |
2924 | |
2925 | MachineOperand *OffsetOp = |
2926 | TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset); |
2927 | int64_t NewOffset = Offset + OffsetOp->getImm(); |
2928 | if (TII->isLegalFLATOffset(Offset: NewOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, |
2929 | FlatVariant: SIInstrFlags::FlatScratch)) { |
2930 | OffsetOp->setImm(NewOffset); |
2931 | if (FrameReg) |
2932 | return false; |
2933 | Offset = 0; |
2934 | } |
2935 | |
2936 | if (!Offset) { |
2937 | unsigned Opc = MI->getOpcode(); |
2938 | int NewOpc = -1; |
2939 | if (AMDGPU::hasNamedOperand(Opcode: Opc, NamedIdx: AMDGPU::OpName::vaddr)) { |
2940 | NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opcode: Opc); |
2941 | } else if (ST.hasFlatScratchSTMode()) { |
2942 |           // On GFX10 we have ST mode, which uses no registers for an address. |
2943 |           // Otherwise we need to materialize 0 into an SGPR. |
2944 | NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opcode: Opc); |
2945 | } |
2946 | |
2947 | if (NewOpc != -1) { |
2948 |           // removeOperand doesn't fix up tied operand indexes as it goes, so |
2949 |           // it asserts. Untie vdst_in for now and retie it afterwards. |
2950 | int VDstIn = |
2951 | AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst_in); |
2952 | bool TiedVDst = VDstIn != -1 && MI->getOperand(i: VDstIn).isReg() && |
2953 | MI->getOperand(i: VDstIn).isTied(); |
2954 | if (TiedVDst) |
2955 | MI->untieRegOperand(OpIdx: VDstIn); |
2956 | |
2957 | MI->removeOperand( |
2958 | OpNo: AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::saddr)); |
2959 | |
2960 | if (TiedVDst) { |
2961 | int NewVDst = |
2962 | AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::vdst); |
2963 | int NewVDstIn = |
2964 | AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::vdst_in); |
2965 | assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!" ); |
2966 | MI->tieOperands(DefIdx: NewVDst, UseIdx: NewVDstIn); |
2967 | } |
2968 | MI->setDesc(TII->get(Opcode: NewOpc)); |
2969 | return false; |
2970 | } |
2971 | } |
2972 | } |
2973 | |
2974 | if (!FrameReg) { |
2975 | FIOp->ChangeToImmediate(ImmVal: Offset); |
2976 | if (TII->isImmOperandLegal(MI: *MI, OpNo: FIOperandNum, MO: *FIOp)) |
2977 | return false; |
2978 | } |
2979 | |
2980 |     // We need to use a register here. Check whether we can use an SGPR or |
2981 |     // need a VGPR. |
2982 | FIOp->ChangeToRegister(Reg: AMDGPU::M0, isDef: false); |
2983 | bool UseSGPR = TII->isOperandLegal(MI: *MI, OpIdx: FIOperandNum, MO: FIOp); |
2984 | |
2985 | if (!Offset && FrameReg && UseSGPR) { |
2986 | FIOp->setReg(FrameReg); |
2987 | return false; |
2988 | } |
2989 | |
2990 | const TargetRegisterClass *RC = |
2991 | UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass; |
2992 | |
2993 | Register TmpReg = |
2994 | RS->scavengeRegisterBackwards(RC: *RC, To: MI, RestoreAfter: false, SPAdj: 0, AllowSpill: !UseSGPR); |
2995 | FIOp->setReg(TmpReg); |
2996 | FIOp->setIsKill(); |
2997 | |
2998 | if ((!FrameReg || !Offset) && TmpReg) { |
2999 | unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; |
3000 | auto MIB = BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: TmpReg); |
3001 | if (FrameReg) |
3002 | MIB.addReg(RegNo: FrameReg); |
3003 | else |
3004 | MIB.addImm(Val: Offset); |
3005 | |
3006 | return false; |
3007 | } |
3008 | |
3009 | bool NeedSaveSCC = RS->isRegUsed(Reg: AMDGPU::SCC) && |
3010 | !MI->definesRegister(Reg: AMDGPU::SCC, /*TRI=*/nullptr); |
3011 | |
3012 | Register TmpSReg = |
3013 | UseSGPR ? TmpReg |
3014 | : RS->scavengeRegisterBackwards(RC: AMDGPU::SReg_32_XM0RegClass, |
3015 | To: MI, RestoreAfter: false, SPAdj: 0, AllowSpill: !UseSGPR); |
3016 | |
3017 | // TODO: for flat scratch another attempt can be made with a VGPR index |
3018 | // if no SGPRs can be scavenged. |
3019 | if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) |
3020 | report_fatal_error(reason: "Cannot scavenge register in FI elimination!" ); |
3021 | |
3022 | if (!TmpSReg) { |
3023 | // Use frame register and restore it after. |
3024 | TmpSReg = FrameReg; |
3025 | FIOp->setReg(FrameReg); |
3026 | FIOp->setIsKill(false); |
3027 | } |
3028 | |
3029 | if (NeedSaveSCC) { |
3030 | assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!" ); |
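     |       // SCC is live: s_addc_u32 records the incoming SCC in bit 0 of the sum |
     |       // (both addends are aligned), s_bitcmp1_b32 restores SCC from that bit, |
     |       // and s_bitset0_b32 clears it so TmpSReg == FrameReg + Offset. |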
3031 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32), DestReg: TmpSReg) |
3032 | .addReg(RegNo: FrameReg) |
3033 | .addImm(Val: Offset); |
3034 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BITCMP1_B32)) |
3035 | .addReg(RegNo: TmpSReg) |
3036 | .addImm(Val: 0); |
3037 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BITSET0_B32), DestReg: TmpSReg) |
3038 | .addImm(Val: 0) |
3039 | .addReg(RegNo: TmpSReg); |
3040 | } else { |
3041 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: TmpSReg) |
3042 | .addReg(RegNo: FrameReg) |
3043 | .addImm(Val: Offset); |
3044 | } |
3045 | |
3046 | if (!UseSGPR) |
3047 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: TmpReg) |
3048 | .addReg(RegNo: TmpSReg, flags: RegState::Kill); |
3049 | |
3050 | if (TmpSReg == FrameReg) { |
3051 | // Undo frame register modification. |
3052 | if (NeedSaveSCC && |
3053 | !MI->registerDefIsDead(Reg: AMDGPU::SCC, /*TRI=*/nullptr)) { |
3054 | MachineBasicBlock::iterator I = |
3055 | BuildMI(BB&: *MBB, I: std::next(x: MI), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADDC_U32), |
3056 | DestReg: TmpSReg) |
3057 | .addReg(RegNo: FrameReg) |
3058 | .addImm(Val: -Offset); |
3059 | I = BuildMI(BB&: *MBB, I: std::next(x: I), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BITCMP1_B32)) |
3060 | .addReg(RegNo: TmpSReg) |
3061 | .addImm(Val: 0); |
3062 | BuildMI(BB&: *MBB, I: std::next(x: I), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BITSET0_B32), |
3063 | DestReg: TmpSReg) |
3064 | .addImm(Val: 0) |
3065 | .addReg(RegNo: TmpSReg); |
3066 | } else { |
3067 | BuildMI(BB&: *MBB, I: std::next(x: MI), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), |
3068 | DestReg: FrameReg) |
3069 | .addReg(RegNo: FrameReg) |
3070 | .addImm(Val: -Offset); |
3071 | } |
3072 | } |
3073 | |
3074 | return false; |
3075 | } |
3076 | |
3077 | bool IsMUBUF = TII->isMUBUF(MI: *MI); |
3078 | |
3079 | if (!IsMUBUF && !MFI->isBottomOfStack()) { |
3080 | // Convert to a swizzled stack address by scaling by the wave size. |
3081 | // In an entry function/kernel the offset is already swizzled. |
3082 | bool IsSALU = isSGPRClass(RC: TII->getOpRegClass(MI: *MI, OpNo: FIOperandNum)); |
3083 | bool LiveSCC = RS->isRegUsed(Reg: AMDGPU::SCC) && |
3084 | !MI->definesRegister(Reg: AMDGPU::SCC, /*TRI=*/nullptr); |
3085 | const TargetRegisterClass *RC = IsSALU && !LiveSCC |
3086 | ? &AMDGPU::SReg_32RegClass |
3087 | : &AMDGPU::VGPR_32RegClass; |
3088 | bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 || |
3089 | MI->getOpcode() == AMDGPU::V_MOV_B32_e64 || |
3090 | MI->getOpcode() == AMDGPU::S_MOV_B32; |
3091 | Register ResultReg = |
3092 | IsCopy ? MI->getOperand(i: 0).getReg() |
3093 | : RS->scavengeRegisterBackwards(RC: *RC, To: MI, RestoreAfter: false, SPAdj: 0); |
3094 | |
3095 | int64_t Offset = FrameInfo.getObjectOffset(ObjectIdx: Index); |
3096 | if (Offset == 0) { |
3097 | unsigned OpCode = |
3098 | IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64; |
3099 | Register TmpResultReg = ResultReg; |
3100 | if (IsSALU && LiveSCC) { |
3101 | TmpResultReg = RS->scavengeRegisterBackwards(RC: AMDGPU::VGPR_32RegClass, |
3102 | To: MI, RestoreAfter: false, SPAdj: 0); |
3103 | } |
3104 | |
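     |       // With SCC live we cannot use s_lshr_b32 directly, so do the shift in a |
     |       // scavenged VGPR and copy the uniform result back to an SGPR with |
     |       // v_readfirstlane_b32 below. |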
3105 | auto Shift = BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: OpCode), DestReg: TmpResultReg); |
3106 | if (OpCode == AMDGPU::V_LSHRREV_B32_e64) |
3107 | // For V_LSHRREV, the operands are reversed (the shift count goes |
3108 | // first). |
3109 | Shift.addImm(Val: ST.getWavefrontSizeLog2()).addReg(RegNo: FrameReg); |
3110 | else |
3111 | Shift.addReg(RegNo: FrameReg).addImm(Val: ST.getWavefrontSizeLog2()); |
3112 | if (IsSALU && !LiveSCC) |
3113 | Shift.getInstr()->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
3114 | if (IsSALU && LiveSCC) { |
3115 | Register NewDest; |
3116 | if (IsCopy) { |
3117 | MF->getRegInfo().constrainRegClass(Reg: ResultReg, |
3118 | RC: &AMDGPU::SReg_32_XM0RegClass); |
3119 | NewDest = ResultReg; |
3120 | } else { |
3121 | NewDest = RS->scavengeRegisterBackwards(RC: AMDGPU::SReg_32_XM0RegClass, |
3122 | To: Shift, RestoreAfter: false, SPAdj: 0); |
3123 | } |
3124 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: NewDest) |
3125 | .addReg(RegNo: TmpResultReg); |
3126 | ResultReg = NewDest; |
3127 | } |
3128 | } else { |
3129 | MachineInstrBuilder MIB; |
3130 | if (!IsSALU) { |
3131 | if ((MIB = TII->getAddNoCarry(MBB&: *MBB, I: MI, DL, DestReg: ResultReg, RS&: *RS)) != |
3132 | nullptr) { |
3133 |           // Reuse ResultReg in the intermediate step. |
3134 | Register ScaledReg = ResultReg; |
3135 | |
3136 | BuildMI(BB&: *MBB, I&: *MIB, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_LSHRREV_B32_e64), |
3137 | DestReg: ScaledReg) |
3138 | .addImm(Val: ST.getWavefrontSizeLog2()) |
3139 | .addReg(RegNo: FrameReg); |
3140 | |
3141 | const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; |
3142 | |
3143 |           // TODO: Fold if the use instruction is another add of a constant. |
3144 | if (IsVOP2 || |
3145 | AMDGPU::isInlinableLiteral32(Literal: Offset, HasInv2Pi: ST.hasInv2PiInlineImm())) { |
3146 | // FIXME: This can fail |
3147 | MIB.addImm(Val: Offset); |
3148 | MIB.addReg(RegNo: ScaledReg, flags: RegState::Kill); |
3149 | if (!IsVOP2) |
3150 | MIB.addImm(Val: 0); // clamp bit |
3151 | } else { |
3152 | assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && |
3153 | "Need to reuse carry out register" ); |
3154 | |
3155 |             // Use the scavenged, unused carry-out as the offset register. |
3156 | Register ConstOffsetReg; |
3157 | if (!isWave32) |
3158 | ConstOffsetReg = getSubReg(Reg: MIB.getReg(Idx: 1), Idx: AMDGPU::sub0); |
3159 | else |
3160 | ConstOffsetReg = MIB.getReg(Idx: 1); |
3161 | |
3162 | BuildMI(BB&: *MBB, I&: *MIB, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), |
3163 | DestReg: ConstOffsetReg) |
3164 | .addImm(Val: Offset); |
3165 | MIB.addReg(RegNo: ConstOffsetReg, flags: RegState::Kill); |
3166 | MIB.addReg(RegNo: ScaledReg, flags: RegState::Kill); |
3167 | MIB.addImm(Val: 0); // clamp bit |
3168 | } |
3169 | } |
3170 | } |
3171 | if (!MIB || IsSALU) { |
3172 | // We have to produce a carry out, and there isn't a free SGPR pair |
3173 | // for it. We can keep the whole computation on the SALU to avoid |
3174 | // clobbering an additional register at the cost of an extra mov. |
3175 | |
3176 | // We may have 1 free scratch SGPR even though a carry out is |
3177 | // unavailable. Only one additional mov is needed. |
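     |         // When SCC is dead and a scratch SGPR is free, this is roughly: |
     |         //   s_lshr_b32 %tmp, FrameReg, log2(wave size) |
     |         //   s_add_i32  %tmp, %tmp, Offset |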
3178 | Register TmpScaledReg = IsCopy && IsSALU |
3179 | ? ResultReg |
3180 | : RS->scavengeRegisterBackwards( |
3181 | RC: AMDGPU::SReg_32_XM0RegClass, To: MI, |
3182 | RestoreAfter: false, SPAdj: 0, /*AllowSpill=*/false); |
3183 | Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; |
3184 | Register TmpResultReg = ScaledReg; |
3185 | |
3186 | if (!LiveSCC) { |
3187 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_LSHR_B32), DestReg: TmpResultReg) |
3188 | .addReg(RegNo: FrameReg) |
3189 | .addImm(Val: ST.getWavefrontSizeLog2()); |
3190 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: TmpResultReg) |
3191 | .addReg(RegNo: TmpResultReg, flags: RegState::Kill) |
3192 | .addImm(Val: Offset); |
3193 | } else { |
3194 | TmpResultReg = RS->scavengeRegisterBackwards( |
3195 | RC: AMDGPU::VGPR_32RegClass, To: MI, RestoreAfter: false, SPAdj: 0, /*AllowSpill=*/true); |
3196 | |
3197 | MachineInstrBuilder Add; |
3198 | if ((Add = TII->getAddNoCarry(MBB&: *MBB, I: MI, DL, DestReg: TmpResultReg, RS&: *RS))) { |
3199 | BuildMI(BB&: *MBB, I&: *Add, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_LSHRREV_B32_e64), |
3200 | DestReg: TmpResultReg) |
3201 | .addImm(Val: ST.getWavefrontSizeLog2()) |
3202 | .addReg(RegNo: FrameReg); |
3203 | if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) { |
3204 | BuildMI(BB&: *MBB, I&: *Add, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: ResultReg) |
3205 | .addImm(Val: Offset); |
3206 | Add.addReg(RegNo: ResultReg, flags: RegState::Kill) |
3207 | .addReg(RegNo: TmpResultReg, flags: RegState::Kill) |
3208 | .addImm(Val: 0); |
3209 | } else |
3210 | Add.addImm(Val: Offset).addReg(RegNo: TmpResultReg, flags: RegState::Kill); |
3211 | } else { |
3212 | assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) && |
3213 | "offset is unsafe for v_mad_u32_u24" ); |
3214 | |
3215 |             // We start with a frame pointer holding a wave-space value and an |
3216 |             // offset in lane space, and we are materializing a lane-space |
3217 |             // value. We can either right shift the frame pointer to get to |
3218 |             // lane space, or left shift the offset to get to wave space and |
3219 |             // right shift after the computation to get back to the desired |
3220 |             // per-lane value. We use mad_u32_u24 here primarily as an add |
3221 |             // that does not clobber a carry out. |
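     |             // Roughly (register names illustrative): |
     |             //   v_mov_b32     %tmp, Offset            ; only if not inlinable |
     |             //   v_mad_u32_u24 %tmp, %tmp (or Offset), wave size, FrameReg |
     |             //   v_lshrrev_b32 %tmp, log2(wave size), %tmp |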
3222 | bool IsInlinableLiteral = |
3223 | AMDGPU::isInlinableLiteral32(Literal: Offset, HasInv2Pi: ST.hasInv2PiInlineImm()); |
3224 | if (!IsInlinableLiteral) { |
3225 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), |
3226 | DestReg: TmpResultReg) |
3227 | .addImm(Val: Offset); |
3228 | } |
3229 | |
3230 | Add = BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MAD_U32_U24_e64), |
3231 | DestReg: TmpResultReg); |
3232 | |
3233 | if (!IsInlinableLiteral) { |
3234 | Add.addReg(RegNo: TmpResultReg, flags: RegState::Kill); |
3235 | } else { |
3236 |               // We fold the offset into the mad itself if it's inlinable. |
3237 | Add.addImm(Val: Offset); |
3238 | } |
3239 | Add.addImm(Val: ST.getWavefrontSize()).addReg(RegNo: FrameReg).addImm(Val: 0); |
3240 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_LSHRREV_B32_e64), |
3241 | DestReg: TmpResultReg) |
3242 | .addImm(Val: ST.getWavefrontSizeLog2()) |
3243 | .addReg(RegNo: TmpResultReg); |
3244 | } |
3245 | |
3246 | Register NewDest; |
3247 | if (IsCopy) { |
3248 | MF->getRegInfo().constrainRegClass(Reg: ResultReg, |
3249 | RC: &AMDGPU::SReg_32_XM0RegClass); |
3250 | NewDest = ResultReg; |
3251 | } else { |
3252 | NewDest = RS->scavengeRegisterBackwards( |
3253 | RC: AMDGPU::SReg_32_XM0RegClass, To: *Add, RestoreAfter: false, SPAdj: 0, |
3254 | /*AllowSpill=*/true); |
3255 | } |
3256 | |
3257 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), |
3258 | DestReg: NewDest) |
3259 | .addReg(RegNo: TmpResultReg); |
3260 | ResultReg = NewDest; |
3261 | } |
3262 | if (!IsSALU) |
3263 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: ResultReg) |
3264 | .addReg(RegNo: TmpResultReg, flags: RegState::Kill); |
3265 | else |
3266 | ResultReg = TmpResultReg; |
3267 | // If there were truly no free SGPRs, we need to undo everything. |
3268 | if (!TmpScaledReg.isValid()) { |
3269 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: ScaledReg) |
3270 | .addReg(RegNo: ScaledReg, flags: RegState::Kill) |
3271 | .addImm(Val: -Offset); |
3272 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_LSHL_B32), DestReg: ScaledReg) |
3273 | .addReg(RegNo: FrameReg) |
3274 | .addImm(Val: ST.getWavefrontSizeLog2()); |
3275 | } |
3276 | } |
3277 | } |
3278 | |
3279 | // Don't introduce an extra copy if we're just materializing in a mov. |
3280 | if (IsCopy) { |
3281 | MI->eraseFromParent(); |
3282 | return true; |
3283 | } |
3284 | FIOp->ChangeToRegister(Reg: ResultReg, isDef: false, isImp: false, isKill: true); |
3285 | return false; |
3286 | } |
3287 | |
3288 | if (IsMUBUF) { |
3289 | // Disable offen so we don't need a 0 vgpr base. |
3290 | assert( |
3291 | static_cast<int>(FIOperandNum) == |
3292 | AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr)); |
3293 | |
3294 | auto &SOffset = *TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::soffset); |
3295 | assert((SOffset.isImm() && SOffset.getImm() == 0)); |
3296 | |
3297 | if (FrameReg != AMDGPU::NoRegister) |
3298 | SOffset.ChangeToRegister(Reg: FrameReg, isDef: false); |
3299 | |
3300 | int64_t Offset = FrameInfo.getObjectOffset(ObjectIdx: Index); |
3301 | int64_t OldImm = |
3302 | TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset)->getImm(); |
3303 | int64_t NewOffset = OldImm + Offset; |
3304 | |
3305 | if (TII->isLegalMUBUFImmOffset(Imm: NewOffset) && |
3306 | buildMUBUFOffsetLoadStore(ST, MFI&: FrameInfo, MI, Index, Offset: NewOffset)) { |
3307 | MI->eraseFromParent(); |
3308 | return true; |
3309 | } |
3310 | } |
3311 | |
3312 | // If the offset is simply too big, don't convert to a scratch wave offset |
3313 | // relative index. |
3314 | |
3315 | FIOp->ChangeToImmediate(ImmVal: Offset); |
3316 | if (!TII->isImmOperandLegal(MI: *MI, OpNo: FIOperandNum, MO: *FIOp)) { |
3317 | Register TmpReg = |
3318 | RS->scavengeRegisterBackwards(RC: AMDGPU::VGPR_32RegClass, To: MI, RestoreAfter: false, SPAdj: 0); |
3319 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: TmpReg) |
3320 | .addImm(Val: Offset); |
3321 | FIOp->ChangeToRegister(Reg: TmpReg, isDef: false, isImp: false, isKill: true); |
3322 | } |
3323 | |
3324 | return false; |
3325 | } |
3326 | |
3327 | StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { |
3328 | return AMDGPUInstPrinter::getRegisterName(Reg); |
3329 | } |
3330 | |
3331 | unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) { |
3332 | return getRegBitWidth(RCID: RC.getID()); |
3333 | } |
3334 | |
3335 | static const TargetRegisterClass * |
3336 | getAnyVGPRClassForBitWidth(unsigned BitWidth) { |
3337 | if (BitWidth == 64) |
3338 | return &AMDGPU::VReg_64RegClass; |
3339 | if (BitWidth == 96) |
3340 | return &AMDGPU::VReg_96RegClass; |
3341 | if (BitWidth == 128) |
3342 | return &AMDGPU::VReg_128RegClass; |
3343 | if (BitWidth == 160) |
3344 | return &AMDGPU::VReg_160RegClass; |
3345 | if (BitWidth == 192) |
3346 | return &AMDGPU::VReg_192RegClass; |
3347 | if (BitWidth == 224) |
3348 | return &AMDGPU::VReg_224RegClass; |
3349 | if (BitWidth == 256) |
3350 | return &AMDGPU::VReg_256RegClass; |
3351 | if (BitWidth == 288) |
3352 | return &AMDGPU::VReg_288RegClass; |
3353 | if (BitWidth == 320) |
3354 | return &AMDGPU::VReg_320RegClass; |
3355 | if (BitWidth == 352) |
3356 | return &AMDGPU::VReg_352RegClass; |
3357 | if (BitWidth == 384) |
3358 | return &AMDGPU::VReg_384RegClass; |
3359 | if (BitWidth == 512) |
3360 | return &AMDGPU::VReg_512RegClass; |
3361 | if (BitWidth == 1024) |
3362 | return &AMDGPU::VReg_1024RegClass; |
3363 | |
3364 | return nullptr; |
3365 | } |
3366 | |
3367 | static const TargetRegisterClass * |
3368 | getAlignedVGPRClassForBitWidth(unsigned BitWidth) { |
3369 | if (BitWidth == 64) |
3370 | return &AMDGPU::VReg_64_Align2RegClass; |
3371 | if (BitWidth == 96) |
3372 | return &AMDGPU::VReg_96_Align2RegClass; |
3373 | if (BitWidth == 128) |
3374 | return &AMDGPU::VReg_128_Align2RegClass; |
3375 | if (BitWidth == 160) |
3376 | return &AMDGPU::VReg_160_Align2RegClass; |
3377 | if (BitWidth == 192) |
3378 | return &AMDGPU::VReg_192_Align2RegClass; |
3379 | if (BitWidth == 224) |
3380 | return &AMDGPU::VReg_224_Align2RegClass; |
3381 | if (BitWidth == 256) |
3382 | return &AMDGPU::VReg_256_Align2RegClass; |
3383 | if (BitWidth == 288) |
3384 | return &AMDGPU::VReg_288_Align2RegClass; |
3385 | if (BitWidth == 320) |
3386 | return &AMDGPU::VReg_320_Align2RegClass; |
3387 | if (BitWidth == 352) |
3388 | return &AMDGPU::VReg_352_Align2RegClass; |
3389 | if (BitWidth == 384) |
3390 | return &AMDGPU::VReg_384_Align2RegClass; |
3391 | if (BitWidth == 512) |
3392 | return &AMDGPU::VReg_512_Align2RegClass; |
3393 | if (BitWidth == 1024) |
3394 | return &AMDGPU::VReg_1024_Align2RegClass; |
3395 | |
3396 | return nullptr; |
3397 | } |
3398 | |
3399 | const TargetRegisterClass * |
3400 | SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { |
3401 | if (BitWidth == 1) |
3402 | return &AMDGPU::VReg_1RegClass; |
3403 | if (BitWidth == 16) |
3404 | return &AMDGPU::VGPR_16RegClass; |
3405 | if (BitWidth == 32) |
3406 | return &AMDGPU::VGPR_32RegClass; |
3407 | return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth) |
3408 | : getAnyVGPRClassForBitWidth(BitWidth); |
3409 | } |
3410 | |
3411 | static const TargetRegisterClass * |
3412 | getAnyAGPRClassForBitWidth(unsigned BitWidth) { |
3413 | if (BitWidth == 64) |
3414 | return &AMDGPU::AReg_64RegClass; |
3415 | if (BitWidth == 96) |
3416 | return &AMDGPU::AReg_96RegClass; |
3417 | if (BitWidth == 128) |
3418 | return &AMDGPU::AReg_128RegClass; |
3419 | if (BitWidth == 160) |
3420 | return &AMDGPU::AReg_160RegClass; |
3421 | if (BitWidth == 192) |
3422 | return &AMDGPU::AReg_192RegClass; |
3423 | if (BitWidth == 224) |
3424 | return &AMDGPU::AReg_224RegClass; |
3425 | if (BitWidth == 256) |
3426 | return &AMDGPU::AReg_256RegClass; |
3427 | if (BitWidth == 288) |
3428 | return &AMDGPU::AReg_288RegClass; |
3429 | if (BitWidth == 320) |
3430 | return &AMDGPU::AReg_320RegClass; |
3431 | if (BitWidth == 352) |
3432 | return &AMDGPU::AReg_352RegClass; |
3433 | if (BitWidth == 384) |
3434 | return &AMDGPU::AReg_384RegClass; |
3435 | if (BitWidth == 512) |
3436 | return &AMDGPU::AReg_512RegClass; |
3437 | if (BitWidth == 1024) |
3438 | return &AMDGPU::AReg_1024RegClass; |
3439 | |
3440 | return nullptr; |
3441 | } |
3442 | |
3443 | static const TargetRegisterClass * |
3444 | getAlignedAGPRClassForBitWidth(unsigned BitWidth) { |
3445 | if (BitWidth == 64) |
3446 | return &AMDGPU::AReg_64_Align2RegClass; |
3447 | if (BitWidth == 96) |
3448 | return &AMDGPU::AReg_96_Align2RegClass; |
3449 | if (BitWidth == 128) |
3450 | return &AMDGPU::AReg_128_Align2RegClass; |
3451 | if (BitWidth == 160) |
3452 | return &AMDGPU::AReg_160_Align2RegClass; |
3453 | if (BitWidth == 192) |
3454 | return &AMDGPU::AReg_192_Align2RegClass; |
3455 | if (BitWidth == 224) |
3456 | return &AMDGPU::AReg_224_Align2RegClass; |
3457 | if (BitWidth == 256) |
3458 | return &AMDGPU::AReg_256_Align2RegClass; |
3459 | if (BitWidth == 288) |
3460 | return &AMDGPU::AReg_288_Align2RegClass; |
3461 | if (BitWidth == 320) |
3462 | return &AMDGPU::AReg_320_Align2RegClass; |
3463 | if (BitWidth == 352) |
3464 | return &AMDGPU::AReg_352_Align2RegClass; |
3465 | if (BitWidth == 384) |
3466 | return &AMDGPU::AReg_384_Align2RegClass; |
3467 | if (BitWidth == 512) |
3468 | return &AMDGPU::AReg_512_Align2RegClass; |
3469 | if (BitWidth == 1024) |
3470 | return &AMDGPU::AReg_1024_Align2RegClass; |
3471 | |
3472 | return nullptr; |
3473 | } |
3474 | |
3475 | const TargetRegisterClass * |
3476 | SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { |
3477 | if (BitWidth == 16) |
3478 | return &AMDGPU::AGPR_LO16RegClass; |
3479 | if (BitWidth == 32) |
3480 | return &AMDGPU::AGPR_32RegClass; |
3481 | return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth) |
3482 | : getAnyAGPRClassForBitWidth(BitWidth); |
3483 | } |
3484 | |
3485 | static const TargetRegisterClass * |
3486 | getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { |
3487 | if (BitWidth == 64) |
3488 | return &AMDGPU::AV_64RegClass; |
3489 | if (BitWidth == 96) |
3490 | return &AMDGPU::AV_96RegClass; |
3491 | if (BitWidth == 128) |
3492 | return &AMDGPU::AV_128RegClass; |
3493 | if (BitWidth == 160) |
3494 | return &AMDGPU::AV_160RegClass; |
3495 | if (BitWidth == 192) |
3496 | return &AMDGPU::AV_192RegClass; |
3497 | if (BitWidth == 224) |
3498 | return &AMDGPU::AV_224RegClass; |
3499 | if (BitWidth == 256) |
3500 | return &AMDGPU::AV_256RegClass; |
3501 | if (BitWidth == 288) |
3502 | return &AMDGPU::AV_288RegClass; |
3503 | if (BitWidth == 320) |
3504 | return &AMDGPU::AV_320RegClass; |
3505 | if (BitWidth == 352) |
3506 | return &AMDGPU::AV_352RegClass; |
3507 | if (BitWidth == 384) |
3508 | return &AMDGPU::AV_384RegClass; |
3509 | if (BitWidth == 512) |
3510 | return &AMDGPU::AV_512RegClass; |
3511 | if (BitWidth == 1024) |
3512 | return &AMDGPU::AV_1024RegClass; |
3513 | |
3514 | return nullptr; |
3515 | } |
3516 | |
3517 | static const TargetRegisterClass * |
3518 | getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { |
3519 | if (BitWidth == 64) |
3520 | return &AMDGPU::AV_64_Align2RegClass; |
3521 | if (BitWidth == 96) |
3522 | return &AMDGPU::AV_96_Align2RegClass; |
3523 | if (BitWidth == 128) |
3524 | return &AMDGPU::AV_128_Align2RegClass; |
3525 | if (BitWidth == 160) |
3526 | return &AMDGPU::AV_160_Align2RegClass; |
3527 | if (BitWidth == 192) |
3528 | return &AMDGPU::AV_192_Align2RegClass; |
3529 | if (BitWidth == 224) |
3530 | return &AMDGPU::AV_224_Align2RegClass; |
3531 | if (BitWidth == 256) |
3532 | return &AMDGPU::AV_256_Align2RegClass; |
3533 | if (BitWidth == 288) |
3534 | return &AMDGPU::AV_288_Align2RegClass; |
3535 | if (BitWidth == 320) |
3536 | return &AMDGPU::AV_320_Align2RegClass; |
3537 | if (BitWidth == 352) |
3538 | return &AMDGPU::AV_352_Align2RegClass; |
3539 | if (BitWidth == 384) |
3540 | return &AMDGPU::AV_384_Align2RegClass; |
3541 | if (BitWidth == 512) |
3542 | return &AMDGPU::AV_512_Align2RegClass; |
3543 | if (BitWidth == 1024) |
3544 | return &AMDGPU::AV_1024_Align2RegClass; |
3545 | |
3546 | return nullptr; |
3547 | } |
3548 | |
3549 | const TargetRegisterClass * |
3550 | SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { |
3551 | if (BitWidth == 32) |
3552 | return &AMDGPU::AV_32RegClass; |
3553 | return ST.needsAlignedVGPRs() |
3554 | ? getAlignedVectorSuperClassForBitWidth(BitWidth) |
3555 | : getAnyVectorSuperClassForBitWidth(BitWidth); |
3556 | } |
3557 | |
3558 | const TargetRegisterClass * |
3559 | SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { |
3560 | if (BitWidth == 16) |
3561 | return &AMDGPU::SGPR_LO16RegClass; |
3562 | if (BitWidth == 32) |
3563 | return &AMDGPU::SReg_32RegClass; |
3564 | if (BitWidth == 64) |
3565 | return &AMDGPU::SReg_64RegClass; |
3566 | if (BitWidth == 96) |
3567 | return &AMDGPU::SGPR_96RegClass; |
3568 | if (BitWidth == 128) |
3569 | return &AMDGPU::SGPR_128RegClass; |
3570 | if (BitWidth == 160) |
3571 | return &AMDGPU::SGPR_160RegClass; |
3572 | if (BitWidth == 192) |
3573 | return &AMDGPU::SGPR_192RegClass; |
3574 | if (BitWidth == 224) |
3575 | return &AMDGPU::SGPR_224RegClass; |
3576 | if (BitWidth == 256) |
3577 | return &AMDGPU::SGPR_256RegClass; |
3578 | if (BitWidth == 288) |
3579 | return &AMDGPU::SGPR_288RegClass; |
3580 | if (BitWidth == 320) |
3581 | return &AMDGPU::SGPR_320RegClass; |
3582 | if (BitWidth == 352) |
3583 | return &AMDGPU::SGPR_352RegClass; |
3584 | if (BitWidth == 384) |
3585 | return &AMDGPU::SGPR_384RegClass; |
3586 | if (BitWidth == 512) |
3587 | return &AMDGPU::SGPR_512RegClass; |
3588 | if (BitWidth == 1024) |
3589 | return &AMDGPU::SGPR_1024RegClass; |
3590 | |
3591 | return nullptr; |
3592 | } |
3593 | |
3594 | bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI, |
3595 | Register Reg) const { |
3596 | const TargetRegisterClass *RC; |
3597 | if (Reg.isVirtual()) |
3598 | RC = MRI.getRegClass(Reg); |
3599 | else |
3600 | RC = getPhysRegBaseClass(Reg); |
3601 | return RC && isSGPRClass(RC); |
3602 | } |
3603 | |
3604 | const TargetRegisterClass * |
3605 | SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { |
3606 | unsigned Size = getRegSizeInBits(RC: *SRC); |
3607 | const TargetRegisterClass *VRC = getVGPRClassForBitWidth(BitWidth: Size); |
3608 | assert(VRC && "Invalid register class size" ); |
3609 | return VRC; |
3610 | } |
3611 | |
3612 | const TargetRegisterClass * |
3613 | SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { |
3614 | unsigned Size = getRegSizeInBits(RC: *SRC); |
3615 | const TargetRegisterClass *ARC = getAGPRClassForBitWidth(BitWidth: Size); |
3616 | assert(ARC && "Invalid register class size" ); |
3617 | return ARC; |
3618 | } |
3619 | |
3620 | const TargetRegisterClass * |
3621 | SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { |
3622 | unsigned Size = getRegSizeInBits(RC: *VRC); |
3623 | if (Size == 32) |
3624 | return &AMDGPU::SGPR_32RegClass; |
3625 | const TargetRegisterClass *SRC = getSGPRClassForBitWidth(BitWidth: Size); |
3626 | assert(SRC && "Invalid register class size" ); |
3627 | return SRC; |
3628 | } |
3629 | |
3630 | const TargetRegisterClass * |
3631 | SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, |
3632 | const TargetRegisterClass *SubRC, |
3633 | unsigned SubIdx) const { |
3634 | // Ensure this subregister index is aligned in the super register. |
3635 | const TargetRegisterClass *MatchRC = |
3636 | getMatchingSuperRegClass(A: SuperRC, B: SubRC, Idx: SubIdx); |
3637 | return MatchRC && MatchRC->hasSubClassEq(RC: SuperRC) ? MatchRC : nullptr; |
3638 | } |
3639 | |
3640 | bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { |
3641 | if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && |
3642 | OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) |
3643 | return !ST.hasMFMAInlineLiteralBug(); |
3644 | |
3645 | return OpType >= AMDGPU::OPERAND_SRC_FIRST && |
3646 | OpType <= AMDGPU::OPERAND_SRC_LAST; |
3647 | } |
3648 | |
3649 | bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { |
3650 |   // TODO: 64-bit operands have extending behavior from a 32-bit literal. |
3651 | return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST && |
3652 | OpType <= AMDGPU::OPERAND_REG_IMM_LAST; |
3653 | } |
3654 | |
3655 | /// Returns the lowest register that is not used at any point in the function. |
3656 | /// If all registers are used, then this function will return |
3657 | /// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return the |
3658 | /// highest unused register. |
3659 | MCRegister SIRegisterInfo::findUnusedRegister( |
3660 | const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, |
3661 | const MachineFunction &MF, bool ReserveHighestRegister) const { |
3662 | if (ReserveHighestRegister) { |
3663 | for (MCRegister Reg : reverse(C: *RC)) |
3664 | if (MRI.isAllocatable(PhysReg: Reg) && !MRI.isPhysRegUsed(PhysReg: Reg)) |
3665 | return Reg; |
3666 | } else { |
3667 | for (MCRegister Reg : *RC) |
3668 | if (MRI.isAllocatable(PhysReg: Reg) && !MRI.isPhysRegUsed(PhysReg: Reg)) |
3669 | return Reg; |
3670 | } |
3671 | return MCRegister(); |
3672 | } |
3673 | |
3674 | bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI, |
3675 | const RegisterBankInfo &RBI, |
3676 | Register Reg) const { |
3677 | auto *RB = RBI.getRegBank(Reg, MRI, TRI: *MRI.getTargetRegisterInfo()); |
3678 | if (!RB) |
3679 | return false; |
3680 | |
3681 | return !RBI.isDivergentRegBank(RB); |
3682 | } |
3683 | |
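     | // Split a register class into EltSize-byte pieces and return the subregister |
     | // indices naming them; e.g. a 128-bit class with EltSize == 4 splits into the |
     | // four dword subregisters sub0..sub3. |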
3684 | ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, |
3685 | unsigned EltSize) const { |
3686 | const unsigned RegBitWidth = AMDGPU::getRegBitWidth(RC: *RC); |
3687 | assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2); |
3688 | |
3689 | const unsigned RegHalves = RegBitWidth / 16; |
3690 | const unsigned EltHalves = EltSize / 2; |
3691 | assert(RegSplitParts.size() + 1 >= EltHalves); |
3692 | |
3693 | const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1]; |
3694 | const unsigned NumParts = RegHalves / EltHalves; |
3695 | |
3696 | return ArrayRef(Parts.data(), NumParts); |
3697 | } |
3698 | |
3699 | const TargetRegisterClass* |
3700 | SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, |
3701 | Register Reg) const { |
3702 | return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg); |
3703 | } |
3704 | |
3705 | const TargetRegisterClass * |
3706 | SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI, |
3707 | const MachineOperand &MO) const { |
3708 | const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, Reg: MO.getReg()); |
3709 | return getSubRegisterClass(RC: SrcRC, Idx: MO.getSubReg()); |
3710 | } |
3711 | |
3712 | bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, |
3713 | Register Reg) const { |
3714 | const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); |
3715 | // Registers without classes are unaddressable, SGPR-like registers. |
3716 | return RC && isVGPRClass(RC); |
3717 | } |
3718 | |
3719 | bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, |
3720 | Register Reg) const { |
3721 | const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); |
3722 | |
3723 | // Registers without classes are unaddressable, SGPR-like registers. |
3724 | return RC && isAGPRClass(RC); |
3725 | } |
3726 | |
3727 | bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, |
3728 | const TargetRegisterClass *SrcRC, |
3729 | unsigned SubReg, |
3730 | const TargetRegisterClass *DstRC, |
3731 | unsigned DstSubReg, |
3732 | const TargetRegisterClass *NewRC, |
3733 | LiveIntervals &LIS) const { |
3734 | unsigned SrcSize = getRegSizeInBits(RC: *SrcRC); |
3735 | unsigned DstSize = getRegSizeInBits(RC: *DstRC); |
3736 | unsigned NewSize = getRegSizeInBits(RC: *NewRC); |
3737 | |
3738 |   // Do not increase the size of registers beyond a dword; we would need to |
3739 |   // allocate adjacent registers and constrain regalloc more than needed. |
3740 | |
3741 | // Always allow dword coalescing. |
3742 | if (SrcSize <= 32 || DstSize <= 32) |
3743 | return true; |
3744 | |
3745 | return NewSize <= DstSize || NewSize <= SrcSize; |
3746 | } |
3747 | |
3748 | unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, |
3749 | MachineFunction &MF) const { |
3750 | unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first; |
3751 | switch (RC->getID()) { |
3752 | default: |
3753 | return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); |
3754 | case AMDGPU::VGPR_32RegClassID: |
3755 | return std::min( |
3756 | a: ST.getMaxNumVGPRs( |
3757 | WavesPerEU: MinOcc, |
3758 | DynamicVGPRBlockSize: MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize()), |
3759 | b: ST.getMaxNumVGPRs(MF)); |
3760 | case AMDGPU::SGPR_32RegClassID: |
3761 | case AMDGPU::SGPR_LO16RegClassID: |
3762 | return std::min(a: ST.getMaxNumSGPRs(WavesPerEU: MinOcc, Addressable: true), b: ST.getMaxNumSGPRs(MF)); |
3763 | } |
3764 | } |
3765 | |
3766 | unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, |
3767 | unsigned Idx) const { |
3768 | if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || |
3769 | Idx == AMDGPU::RegisterPressureSets::AGPR_32) |
3770 | return getRegPressureLimit(RC: &AMDGPU::VGPR_32RegClass, |
3771 | MF&: const_cast<MachineFunction &>(MF)); |
3772 | |
3773 | if (Idx == AMDGPU::RegisterPressureSets::SReg_32) |
3774 | return getRegPressureLimit(RC: &AMDGPU::SGPR_32RegClass, |
3775 | MF&: const_cast<MachineFunction &>(MF)); |
3776 | |
3777 | llvm_unreachable("Unexpected register pressure set!" ); |
3778 | } |
3779 | |
3780 | const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { |
3781 | static const int Empty[] = { -1 }; |
3782 | |
3783 | if (RegPressureIgnoredUnits[RegUnit]) |
3784 | return Empty; |
3785 | |
3786 | return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); |
3787 | } |
3788 | |
3789 | bool SIRegisterInfo::getRegAllocationHints(Register VirtReg, |
3790 | ArrayRef<MCPhysReg> Order, |
3791 | SmallVectorImpl<MCPhysReg> &Hints, |
3792 | const MachineFunction &MF, |
3793 | const VirtRegMap *VRM, |
3794 | const LiveRegMatrix *Matrix) const { |
3795 | |
3796 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3797 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
3798 | |
3799 | std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VReg: VirtReg); |
3800 | |
3801 | switch (Hint.first) { |
3802 | case AMDGPURI::Size32: { |
3803 | Register Paired = Hint.second; |
3804 | assert(Paired); |
3805 | Register PairedPhys; |
3806 | if (Paired.isPhysical()) { |
3807 | PairedPhys = |
3808 | getMatchingSuperReg(Reg: Paired, SubIdx: AMDGPU::lo16, RC: &AMDGPU::VGPR_32RegClass); |
3809 | } else if (VRM && VRM->hasPhys(virtReg: Paired)) { |
3810 | PairedPhys = getMatchingSuperReg(Reg: VRM->getPhys(virtReg: Paired), SubIdx: AMDGPU::lo16, |
3811 | RC: &AMDGPU::VGPR_32RegClass); |
3812 | } |
3813 | |
3814 | // Prefer the paired physreg. |
3815 | if (PairedPhys) |
3816 | // isLo(Paired) is implicitly true here from the API of |
3817 | // getMatchingSuperReg. |
3818 | Hints.push_back(Elt: PairedPhys); |
3819 | return false; |
3820 | } |
3821 | case AMDGPURI::Size16: { |
3822 | Register Paired = Hint.second; |
3823 | assert(Paired); |
3824 | Register PairedPhys; |
3825 | if (Paired.isPhysical()) { |
3826 | PairedPhys = TRI->getSubReg(Reg: Paired, Idx: AMDGPU::lo16); |
3827 | } else if (VRM && VRM->hasPhys(virtReg: Paired)) { |
3828 | PairedPhys = TRI->getSubReg(Reg: VRM->getPhys(virtReg: Paired), Idx: AMDGPU::lo16); |
3829 | } |
3830 | |
3831 | // First prefer the paired physreg. |
3832 | if (PairedPhys) |
3833 | Hints.push_back(Elt: PairedPhys); |
3834 | else { |
3835 | // Add all the lo16 physregs. |
3836 |       // When the Paired operand has not yet been assigned a physreg, it is |
3837 |       // better to try putting VirtReg in a lo16 register, because Paired may |
3838 |       // later be assigned to the overlapping register and the COPY can then |
3839 |       // be eliminated. |
3840 | for (MCPhysReg PhysReg : Order) { |
3841 | if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(Reg: PhysReg, MRI: *this)) |
3842 | continue; |
3843 | if (AMDGPU::VGPR_16RegClass.contains(Reg: PhysReg) && |
3844 | !MRI.isReserved(PhysReg)) |
3845 | Hints.push_back(Elt: PhysReg); |
3846 | } |
3847 | } |
3848 | return false; |
3849 | } |
3850 | default: |
3851 | return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, |
3852 | VRM); |
3853 | } |
3854 | } |
3855 | |
3856 | MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { |
3857 |   // Not a callee-saved register. |
3858 | return AMDGPU::SGPR30_SGPR31; |
3859 | } |
3860 | |
3861 | const TargetRegisterClass * |
3862 | SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, |
3863 | const RegisterBank &RB) const { |
3864 | switch (RB.getID()) { |
3865 | case AMDGPU::VGPRRegBankID: |
3866 | return getVGPRClassForBitWidth( |
3867 | BitWidth: std::max(a: ST.useRealTrue16Insts() ? 16u : 32u, b: Size)); |
3868 | case AMDGPU::VCCRegBankID: |
3869 | assert(Size == 1); |
3870 | return getWaveMaskRegClass(); |
3871 | case AMDGPU::SGPRRegBankID: |
3872 | return getSGPRClassForBitWidth(BitWidth: std::max(a: 32u, b: Size)); |
3873 | case AMDGPU::AGPRRegBankID: |
3874 | return getAGPRClassForBitWidth(BitWidth: std::max(a: 32u, b: Size)); |
3875 | default: |
3876 | llvm_unreachable("unknown register bank" ); |
3877 | } |
3878 | } |
3879 | |
3880 | const TargetRegisterClass * |
3881 | SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, |
3882 | const MachineRegisterInfo &MRI) const { |
3883 | const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(Reg: MO.getReg()); |
3884 | if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(Val: RCOrRB)) |
3885 | return getRegClassForTypeOnBank(Ty: MRI.getType(Reg: MO.getReg()), Bank: *RB); |
3886 | |
3887 | if (const auto *RC = dyn_cast<const TargetRegisterClass *>(Val: RCOrRB)) |
3888 | return getAllocatableClass(RC); |
3889 | |
3890 | return nullptr; |
3891 | } |
3892 | |
3893 | MCRegister SIRegisterInfo::getVCC() const { |
3894 | return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; |
3895 | } |
3896 | |
3897 | MCRegister SIRegisterInfo::getExec() const { |
3898 | return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
3899 | } |
3900 | |
3901 | const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { |
3902 | // VGPR tuples have an alignment requirement on gfx90a variants. |
3903 | return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass |
3904 | : &AMDGPU::VReg_64RegClass; |
3905 | } |
3906 | |
3907 | const TargetRegisterClass * |
3908 | SIRegisterInfo::getRegClass(unsigned RCID) const { |
3909 | switch ((int)RCID) { |
3910 | case AMDGPU::SReg_1RegClassID: |
3911 | return getBoolRC(); |
3912 | case AMDGPU::SReg_1_XEXECRegClassID: |
3913 | return getWaveMaskRegClass(); |
3914 | case -1: |
3915 | return nullptr; |
3916 | default: |
3917 | return AMDGPUGenRegisterInfo::getRegClass(i: RCID); |
3918 | } |
3919 | } |
3920 | |
3921 | // Find the reaching register definition. |
3922 | MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, |
3923 | MachineInstr &Use, |
3924 | MachineRegisterInfo &MRI, |
3925 | LiveIntervals *LIS) const { |
3926 | auto &MDT = LIS->getDomTree(); |
3927 | SlotIndex UseIdx = LIS->getInstructionIndex(Instr: Use); |
3928 | SlotIndex DefIdx; |
3929 | |
3930 | if (Reg.isVirtual()) { |
3931 | if (!LIS->hasInterval(Reg)) |
3932 | return nullptr; |
3933 | LiveInterval &LI = LIS->getInterval(Reg); |
3934 | LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubIdx: SubReg) |
3935 | : MRI.getMaxLaneMaskForVReg(Reg); |
3936 | VNInfo *V = nullptr; |
3937 | if (LI.hasSubRanges()) { |
3938 | for (auto &S : LI.subranges()) { |
3939 | if ((S.LaneMask & SubLanes) == SubLanes) { |
3940 | V = S.getVNInfoAt(Idx: UseIdx); |
3941 | break; |
3942 | } |
3943 | } |
3944 | } else { |
3945 | V = LI.getVNInfoAt(Idx: UseIdx); |
3946 | } |
3947 | if (!V) |
3948 | return nullptr; |
3949 | DefIdx = V->def; |
3950 | } else { |
3951 | // Find last def. |
3952 | for (MCRegUnit Unit : regunits(Reg: Reg.asMCReg())) { |
3953 | LiveRange &LR = LIS->getRegUnit(Unit); |
3954 | if (VNInfo *V = LR.getVNInfoAt(Idx: UseIdx)) { |
3955 | if (!DefIdx.isValid() || |
3956 | MDT.dominates(A: LIS->getInstructionFromIndex(index: DefIdx), |
3957 | B: LIS->getInstructionFromIndex(index: V->def))) |
3958 | DefIdx = V->def; |
3959 | } else { |
3960 | return nullptr; |
3961 | } |
3962 | } |
3963 | } |
3964 | |
3965 | MachineInstr *Def = LIS->getInstructionFromIndex(index: DefIdx); |
3966 | |
3967 | if (!Def || !MDT.dominates(A: Def, B: &Use)) |
3968 | return nullptr; |
3969 | |
3970 | assert(Def->modifiesRegister(Reg, this)); |
3971 | |
3972 | return Def; |
3973 | } |
3974 | |
3975 | MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { |
3976 | assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32); |
3977 | |
3978 | for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, |
3979 | AMDGPU::SReg_32RegClass, |
3980 | AMDGPU::AGPR_32RegClass } ) { |
3981 | if (MCPhysReg Super = getMatchingSuperReg(Reg, SubIdx: AMDGPU::lo16, RC: &RC)) |
3982 | return Super; |
3983 | } |
3984 | if (MCPhysReg Super = getMatchingSuperReg(Reg, SubIdx: AMDGPU::hi16, |
3985 | RC: &AMDGPU::VGPR_32RegClass)) { |
3986 | return Super; |
3987 | } |
3988 | |
3989 | return AMDGPU::NoRegister; |
3990 | } |
3991 | |
3992 | bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { |
3993 | if (!ST.needsAlignedVGPRs()) |
3994 | return true; |
3995 | |
3996 | if (isVGPRClass(RC: &RC)) |
3997 | return RC.hasSuperClassEq(RC: getVGPRClassForBitWidth(BitWidth: getRegSizeInBits(RC))); |
3998 | if (isAGPRClass(RC: &RC)) |
3999 | return RC.hasSuperClassEq(RC: getAGPRClassForBitWidth(BitWidth: getRegSizeInBits(RC))); |
4000 | if (isVectorSuperClass(RC: &RC)) |
4001 | return RC.hasSuperClassEq( |
4002 | RC: getVectorSuperClassForBitWidth(BitWidth: getRegSizeInBits(RC))); |
4003 | |
4004 | return true; |
4005 | } |
4006 | |
4007 | const TargetRegisterClass * |
4008 | SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { |
4009 | if (!RC || !ST.needsAlignedVGPRs()) |
4010 | return RC; |
4011 | |
4012 | unsigned Size = getRegSizeInBits(RC: *RC); |
4013 | if (Size <= 32) |
4014 | return RC; |
4015 | |
4016 | if (isVGPRClass(RC)) |
4017 | return getAlignedVGPRClassForBitWidth(BitWidth: Size); |
4018 | if (isAGPRClass(RC)) |
4019 | return getAlignedAGPRClassForBitWidth(BitWidth: Size); |
4020 | if (isVectorSuperClass(RC)) |
4021 | return getAlignedVectorSuperClassForBitWidth(BitWidth: Size); |
4022 | |
4023 | return RC; |
4024 | } |
4025 | |
4026 | ArrayRef<MCPhysReg> |
4027 | SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { |
4028 | return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); |
4029 | } |
4030 | |
4031 | ArrayRef<MCPhysReg> |
4032 | SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { |
4033 | return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2); |
4034 | } |
4035 | |
4036 | ArrayRef<MCPhysReg> |
4037 | SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { |
4038 | return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); |
4039 | } |
4040 | |
4041 | unsigned |
4042 | SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, |
4043 | unsigned SubReg) const { |
4044 | switch (RC->TSFlags & SIRCFlags::RegKindMask) { |
4045 | case SIRCFlags::HasSGPR: |
4046 | return std::min(a: 128u, b: getSubRegIdxSize(Idx: SubReg)); |
4047 | case SIRCFlags::HasAGPR: |
4048 | case SIRCFlags::HasVGPR: |
4049 | case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR: |
4050 | return std::min(a: 32u, b: getSubRegIdxSize(Idx: SubReg)); |
4051 | default: |
4052 | break; |
4053 | } |
4054 | return 0; |
4055 | } |
4056 | |
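     | // Returns one past the highest hardware register index of RC that is used, |
     | // i.e. registers are counted densely from index 0 up to the last used one. |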
4057 | unsigned |
4058 | SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, |
4059 | const TargetRegisterClass &RC) const { |
4060 | for (MCPhysReg Reg : reverse(C: RC.getRegisters())) |
4061 | if (MRI.isPhysRegUsed(PhysReg: Reg)) |
4062 | return getHWRegIndex(Reg) + 1; |
4063 | return 0; |
4064 | } |
4065 | |
4066 | SmallVector<StringLiteral> |
4067 | SIRegisterInfo::getVRegFlagsOfReg(Register Reg, |
4068 | const MachineFunction &MF) const { |
4069 | SmallVector<StringLiteral> RegFlags; |
4070 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
4071 | if (FuncInfo->checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG)) |
4072 | RegFlags.push_back(Elt: "WWM_REG" ); |
4073 | return RegFlags; |
4074 | } |
4075 | |