//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
/// in NSA image instructions. The later SIShrinkInstructions pass will then
/// replace NSA encodings with sequential versions where possible.
///
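/// As a rough illustration (hypothetical register numbers, not taken from any
/// actual test), a GFX10 NSA form such as
///   image_sample v[0:3], [v4, v7, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
/// becomes shrinkable to the sequential form
///   image_sample v[0:3], v[4:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
/// once the three address components are reassigned to v4, v5 and v6.
///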
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-nsa-reassign"

STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");
namespace {

class GCNNSAReassign : public MachineFunctionPass {
public:
  static char ID;

  GCNNSAReassign() : MachineFunctionPass(ID) {
    initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN NSA Reassign"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervalsWrapperPass>();
    AU.addRequired<VirtRegMap>();
    AU.addRequired<LiveRegMatrix>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
private:
  using NSA_Status = enum {
    NOT_NSA,        // Not an NSA instruction
    FIXED,          // NSA which we cannot modify
    NON_CONTIGUOUS, // NSA with non-sequential address which we can try
                    // to optimize.
    CONTIGUOUS      // NSA with all sequential address registers
  };

  const GCNSubtarget *ST;

  const MachineRegisterInfo *MRI;

  const SIRegisterInfo *TRI;

  VirtRegMap *VRM;

  LiveRegMatrix *LRM;

  LiveIntervals *LIS;

  unsigned MaxNumVGPRs;

  const MCPhysReg *CSRegs;

  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                    false, false)

char GCNNSAReassign::ID = 0;

char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;

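// Try to move the address intervals onto the contiguous VGPR range starting at
// StartReg: first release any current physical assignments, then bail out on
// the first interference, and only commit the new assignments if every
// interval can take its slot. On failure the intervals are left unassigned;
// the caller is responsible for restoring the original allocation.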
bool
GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                                   unsigned StartReg) const {
  unsigned NumRegs = Intervals.size();

  for (unsigned N = 0; N < NumRegs; ++N)
    if (VRM->hasPhys(Intervals[N]->reg()))
      LRM->unassign(*Intervals[N]);

  for (unsigned N = 0; N < NumRegs; ++N)
    if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
      return false;

  for (unsigned N = 0; N < NumRegs; ++N)
    LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));

  return true;
}

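// Check that every physical register in [StartReg, StartReg + NumRegs) is
// allocatable and does not contain an unused callee-saved register, so that
// the reassignment cannot force an extra CSR spill in the prologue/epilogue.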
bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
  for (unsigned N = 0; N < NumRegs; ++N) {
    unsigned Reg = StartReg + N;
    if (!MRI->isAllocatable(Reg))
      return false;

    for (unsigned I = 0; CSRegs[I]; ++I)
      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
          !LRM->isPhysRegUsed(CSRegs[I]))
        return false;
  }

  return true;
}

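// Linearly scan the VGPR file for a contiguous range of Intervals.size()
// registers that all the address intervals can be moved into. Returns true and
// commits the assignment on the first range that works.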
bool
GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
  unsigned NumRegs = Intervals.size();

  if (NumRegs > MaxNumVGPRs)
    return false;
  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;

  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
    if (!canAssign(Reg, NumRegs))
      continue;

    if (tryAssignRegisters(Intervals, Reg))
      return true;
  }

  return false;
}

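// Classify MI: not an NSA instruction at all, an NSA form we must leave alone,
// or an NSA form whose address registers are already contiguous / could be
// made contiguous. With Fast set, mostly just the physical-register layout is
// checked and the more expensive safety checks are skipped.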
GCNNSAReassign::NSA_Status
GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return NSA_Status::NOT_NSA;

  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
  case AMDGPU::MIMGEncGfx11NSA:
    break;
  default:
    return NSA_Status::NOT_NSA;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

  unsigned VgprBase = 0;
  bool NSA = false;
  for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    Register Reg = Op.getReg();
    if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    Register PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // TODO: address the limitation below to handle GFX11 BVH instructions.
      // Bail if the address is not a VGPR32. It should be possible to extend
      // the optimization to work with subregisters of wider register tuples,
      // but the logic to find free registers would be much more complicated,
      // with much lower chances of success. It seems reasonable to assume that
      // in most cases a tuple is used because a vector variable contains
      // different parts of an address, which is either already consecutive or
      // cannot be reassigned if it is not. If needed, it is better to rely on
      // the register coalescer to process such address tuples.
      if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 || Op.getSubReg())
        return NSA_Status::FIXED;

      // InlineSpiller does not call LRM::assign() after an LI split, leaving
      // it in an inconsistent state, so we cannot call LRM::unassign().
      // See llvm bug #48911.
      // Skip the reassignment if a register has originated from such a split.
      // FIXME: Remove this workaround when bug #48911 is fixed.
      if (VRM->getPreSplitReg(Reg))
        return NSA_Status::FIXED;

      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      for (auto U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}

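// Entry point: gather all NSA candidates in instruction order, then try to
// rewrite each non-contiguous one onto a free contiguous VGPR range, backing
// out any change that breaks a candidate that was already contiguous.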
bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

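  // A Candidate pairs an NSA instruction with a flag that is true once its
  // address registers are known to be contiguous (either found that way or
  // successfully converted below).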
  using Candidate = std::pair<const MachineInstr*, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

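  // Second pass: attempt to move each non-contiguous candidate onto a
  // sequential VGPR range.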
  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // It already happens to be contiguous.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
        AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

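    // Collect the live interval of every address operand together with the
    // combined slot index range they span; a duplicated register cannot be
    // made sequential, so give up on such instructions early.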
    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<MCRegister, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::is_contained(Intervals, LI)) {
        // Same register used, unable to make sequential.
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      if (LI->empty()) {
        // The address input is undef, so it doesn't contribute to the relevant
        // range. Seed a reasonable index range if required.
        if (I == 0)
          MinInd = MaxInd = LIS->getInstructionIndex(*MI);
        continue;
      }
      MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI : Intervals)
                 dbgs() << " " << llvm::printReg(VRM->getPhys(LI->reg()), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
                                [this](const Candidate &C, SlotIndex I) {
                                  return LIS->getInstructionIndex(*C.first) < I;
                                });
      for (auto E = Candidates.end(); Success && I != E &&
           LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

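    // Roll back: release whatever scavengeRegs managed to assign and restore
    // the original physical registers.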
    if (!Success) {
      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        if (VRM->hasPhys(Intervals[I]->reg()))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(
        dbgs() << "\tNew allocation:\t\t ["
               << llvm::printReg(VRM->getPhys(Intervals.front()->reg()), TRI)
               << " : "
               << llvm::printReg(VRM->getPhys(Intervals.back()->reg()), TRI)
               << "]\n");
    Changed = true;
  }

  return Changed;
}