//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
/// in NSA image instructions. The SIShrinkInstructions pass will later replace
/// NSA encodings with sequential (non-NSA) versions where possible.
///
//===----------------------------------------------------------------------===//

#include "GCNNSAReassign.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-nsa-reassign"

STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");

namespace {
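/// Attempts to rewrite the physical registers that the allocator assigned to
/// the address operands of NSA image instructions so that they form one
/// contiguous VGPR range. Runs while the VirtRegMap is still available, i.e.
/// after allocation but before virtual registers are rewritten.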
class GCNNSAReassignImpl {
public:
  GCNNSAReassignImpl(VirtRegMap *VM, LiveRegMatrix *LM, LiveIntervals *LS)
      : VRM(VM), LRM(LM), LIS(LS) {}

  bool run(MachineFunction &MF);

private:
  using NSA_Status = enum {
    NOT_NSA,        // Not an NSA instruction.
    FIXED,          // NSA which we cannot modify.
    NON_CONTIGUOUS, // NSA with non-sequential address which we can try
                    // to optimize.
    CONTIGUOUS      // NSA with all sequential address registers.
  };

  const GCNSubtarget *ST;

  const MachineRegisterInfo *MRI;

  const SIRegisterInfo *TRI;

  VirtRegMap *VRM;

  LiveRegMatrix *LRM;

  LiveIntervals *LIS;

  unsigned MaxNumVGPRs;

  const MCPhysReg *CSRegs;

  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};

class GCNNSAReassignLegacy : public MachineFunctionPass {
public:
  static char ID;

  GCNNSAReassignLegacy() : MachineFunctionPass(ID) {
    initializeGCNNSAReassignLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN NSA Reassign"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervalsWrapperPass>();
    AU.addRequired<VirtRegMapWrapperLegacy>();
    AU.addRequired<LiveRegMatrixWrapperLegacy>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
INITIALIZE_PASS_END(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign", false,
                    false)

char GCNNSAReassignLegacy::ID = 0;

char &llvm::GCNNSAReassignID = GCNNSAReassignLegacy::ID;

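// Try to assign the intervals to the consecutive physical registers
// [StartReg, StartReg + Intervals.size()). Any existing assignments are
// released first. Returns false if some interval interferes with its target
// register; in that case the intervals are left unassigned and the caller
// must restore the original allocation.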
bool GCNNSAReassignImpl::tryAssignRegisters(
    SmallVectorImpl<LiveInterval *> &Intervals, unsigned StartReg) const {
  unsigned NumRegs = Intervals.size();

  for (unsigned N = 0; N < NumRegs; ++N)
    if (VRM->hasPhys(Intervals[N]->reg()))
      LRM->unassign(*Intervals[N]);

  for (unsigned N = 0; N < NumRegs; ++N)
    if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
      return false;

  for (unsigned N = 0; N < NumRegs; ++N)
    LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));

  return true;
}

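// Check that every register in [StartReg, StartReg + NumRegs) is allocatable
// and is not an otherwise unused callee-saved register, which we must not
// start clobbering just for this optimization.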
bool GCNNSAReassignImpl::canAssign(unsigned StartReg, unsigned NumRegs) const {
  for (unsigned N = 0; N < NumRegs; ++N) {
    unsigned Reg = StartReg + N;
    if (!MRI->isAllocatable(Reg))
      return false;

    for (unsigned I = 0; CSRegs[I]; ++I)
      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
          !LRM->isPhysRegUsed(CSRegs[I]))
        return false;
  }

  return true;
}

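// Linearly scan the VGPR file for a contiguous range that all of the intervals
// can be moved into.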
bool GCNNSAReassignImpl::scavengeRegs(
    SmallVectorImpl<LiveInterval *> &Intervals) const {
  unsigned NumRegs = Intervals.size();

  if (NumRegs > MaxNumVGPRs)
    return false;
  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;

  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
    if (!canAssign(Reg, NumRegs))
      continue;

    if (tryAssignRegisters(Intervals, Reg))
      return true;
  }

  return false;
}

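// Classify MI: not an NSA instruction at all, an NSA instruction whose
// operands must not be touched, or an NSA instruction whose address registers
// are either already contiguous or could potentially be made contiguous. With
// Fast set, skip the more expensive legality checks and only test contiguity.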
GCNNSAReassignImpl::NSA_Status
GCNNSAReassignImpl::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return NSA_Status::NOT_NSA;

  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
  case AMDGPU::MIMGEncGfx11NSA:
    break;
  default:
    return NSA_Status::NOT_NSA;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

  unsigned VgprBase = 0;
  bool NSA = false;
  for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    Register Reg = Op.getReg();
    if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    Register PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // TODO: address the limitation below to handle GFX11 BVH instructions.
      // Bail if the address is not a VGPR32. It should be possible to extend
      // the optimization to work with subregs of wider register tuples, but
      // the logic to find free registers would be much more complicated, with
      // much lower chances of success. It seems reasonable to assume that in
      // most cases a tuple is used because a vector variable contains the
      // different parts of an address, which is either already consecutive or
      // cannot be reassigned if not. If needed, it is better to rely on the
      // register coalescer to process such address tuples.
      if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 ||
          Op.getSubReg())
        return NSA_Status::FIXED;

      // InlineSpiller does not call LRM::assign() after an LI split, leaving
      // it in an inconsistent state, so we cannot call LRM::unassign().
      // See llvm bug #48911.
      // Skip the reassignment if a register has originated from such a split.
      // FIXME: Remove this workaround when bug #48911 is fixed.
      if (VRM->getPreSplitReg(Reg))
        return NSA_Status::FIXED;

      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      for (auto U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

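    // Remember the physical register of the first address operand; the
    // instruction is truly non-sequential if any later operand falls outside
    // the consecutive run starting there.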
    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}

bool GCNNSAReassignImpl::run(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(
      ST->getMaxNumVGPRs(MFI->getOccupancy(), MFI->getDynamicVGPRBlockSize()),
      MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

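  // Collect the candidates in one walk over the function: non-contiguous NSA
  // instructions are the ones to optimize, while already-contiguous ones are
  // remembered so that later reassignments do not break them.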
  using Candidate = std::pair<const MachineInstr *, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

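  // Try to rewrite each non-contiguous candidate into a contiguous VGPR range.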
  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // The allocation already happens to be contiguous.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
        AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<MCRegister, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
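    // Collect the live interval and current physical register of every address
    // operand, and compute the slot index range spanned by those intervals so
    // that only candidates inside it need to be rechecked afterwards.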
    for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::is_contained(Intervals, LI)) {
        // Same register used, unable to make sequential.
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      if (LI->empty()) {
        // The address input is undef, so it doesn't contribute to the relevant
        // range. Seed a reasonable index range if required.
        if (I == 0)
          MinInd = MaxInd = LIS->getInstructionIndex(*MI);
        continue;
      }
      MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI : Intervals)
                 dbgs() << " " << llvm::printReg(VRM->getPhys(LI->reg()), TRI);
               dbgs() << '\n');

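    // Attempt the move, then verify that no already-contiguous candidate
    // overlapping the affected slot index range was broken by it.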
    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      auto *I =
          std::lower_bound(Candidates.begin(), &C, MinInd,
                           [this](const Candidate &C, SlotIndex I) {
                             return LIS->getInstructionIndex(*C.first) < I;
                           });
      for (auto *E = Candidates.end();
           Success && I != E && LIS->getInstructionIndex(*I->first) < MaxInd;
           ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

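    // Reassignment failed or broke another candidate: restore the original
    // physical registers.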
    if (!Success) {
      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        if (VRM->hasPhys(Intervals[I]->reg()))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(
        dbgs() << "\tNew allocation:\t\t ["
               << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
               << " : "
               << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
               << "]\n");
    Changed = true;
  }

  return Changed;
}

bool GCNNSAReassignLegacy::runOnMachineFunction(MachineFunction &MF) {
  auto *VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
  auto *LRM = &getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
  auto *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();

  GCNNSAReassignImpl Impl(VRM, LRM, LIS);
  return Impl.run(MF);
}

PreservedAnalyses
GCNNSAReassignPass::run(MachineFunction &MF,
                        MachineFunctionAnalysisManager &MFAM) {
  auto &VRM = MFAM.getResult<VirtRegMapAnalysis>(MF);
  auto &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
  auto &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);

  GCNNSAReassignImpl Impl(&VRM, &LRM, &LIS);
  Impl.run(MF);
  return PreservedAnalyses::all();
}