//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Try to reassign registers on GFX10+ so that the addresses of NSA
/// image instructions become sequential. The later SIShrinkInstructions pass
/// will then replace the NSA form with the sequential (non-NSA) encoding
/// where possible.
///
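/// For example, an NSA form such as
///   image_sample v[0:3], [v4, v7, v2], s[0:7], s[8:11]
/// can be shrunk to the contiguous form
///   image_sample v[0:3], v[4:6], s[0:7], s[8:11]
/// once v4, v7 and v2 are remapped onto consecutive VGPRs (the register
/// numbers here are purely illustrative).
///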
//===----------------------------------------------------------------------===//

#include "GCNNSAReassign.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-nsa-reassign"

STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");
namespace {
class GCNNSAReassignImpl {
public:
  GCNNSAReassignImpl(VirtRegMap *VM, LiveRegMatrix *LM, LiveIntervals *LS)
      : VRM(VM), LRM(LM), LIS(LS) {}

  bool run(MachineFunction &MF);

private:
  using NSA_Status = enum {
    NOT_NSA,        // Not an NSA instruction
    FIXED,          // NSA which we cannot modify
    NON_CONTIGUOUS, // NSA with non-sequential address which we can try
                    // to optimize.
    CONTIGUOUS      // NSA with all sequential address registers
  };

  const GCNSubtarget *ST;
  const MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  VirtRegMap *VRM;
  LiveRegMatrix *LRM;
  LiveIntervals *LIS;
  unsigned MaxNumVGPRs;
  const MCPhysReg *CSRegs;

  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};

class GCNNSAReassignLegacy : public MachineFunctionPass {
public:
  static char ID;

  GCNNSAReassignLegacy() : MachineFunctionPass(ID) {
    initializeGCNNSAReassignLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN NSA Reassign"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervalsWrapperPass>();
    AU.addRequired<VirtRegMapWrapperLegacy>();
    AU.addRequired<LiveRegMatrixWrapperLegacy>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
INITIALIZE_PASS_END(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign", false,
                    false)

char GCNNSAReassignLegacy::ID = 0;

char &llvm::GCNNSAReassignID = GCNNSAReassignLegacy::ID;

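// Try to place all intervals into the contiguous physical range starting at
// StartReg. All current assignments are dropped first so the interference
// check sees sibling address registers as free; on a conflict the function
// returns early and the caller is responsible for restoring the original
// allocation.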
bool GCNNSAReassignImpl::tryAssignRegisters(
    SmallVectorImpl<LiveInterval *> &Intervals, unsigned StartReg) const {
  unsigned NumRegs = Intervals.size();

  for (unsigned N = 0; N < NumRegs; ++N)
    if (VRM->hasPhys(Intervals[N]->reg()))
      LRM->unassign(*Intervals[N]);

  for (unsigned N = 0; N < NumRegs; ++N)
    if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
      return false;

  for (unsigned N = 0; N < NumRegs; ++N)
    LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));

  return true;
}

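// Check that the physical range [StartReg, StartReg + NumRegs) is allocatable
// and does not touch a callee-saved register that is still unused: claiming
// an unused CSR would force an extra spill and reload in the prologue and
// epilogue.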
bool GCNNSAReassignImpl::canAssign(unsigned StartReg, unsigned NumRegs) const {
  for (unsigned N = 0; N < NumRegs; ++N) {
    unsigned Reg = StartReg + N;
    if (!MRI->isAllocatable(Reg))
      return false;

    for (unsigned I = 0; CSRegs[I]; ++I)
      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
          !LRM->isPhysRegUsed(CSRegs[I]))
        return false;
  }

  return true;
}

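// Linearly scan the VGPR file for the first contiguous block where every
// interval can be placed, starting from VGPR0 to keep register usage low.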
bool GCNNSAReassignImpl::scavengeRegs(
    SmallVectorImpl<LiveInterval *> &Intervals) const {
  unsigned NumRegs = Intervals.size();

  if (NumRegs > MaxNumVGPRs)
    return false;
  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;

  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
    if (!canAssign(Reg, NumRegs))
      continue;

    if (tryAssignRegisters(Intervals, Reg))
      return true;
  }

  return false;
}

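// Classify MI: not an NSA instruction, an NSA instruction that must not be
// touched, or an NSA instruction whose address registers are or are not
// physically contiguous. With Fast set, only the contiguity of the current
// assignment is checked; the full mode additionally rejects operands whose
// reassignment would be unsafe or unprofitable.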
GCNNSAReassignImpl::NSA_Status
GCNNSAReassignImpl::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return NSA_Status::NOT_NSA;

  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
  case AMDGPU::MIMGEncGfx11NSA:
    break;
  default:
    return NSA_Status::NOT_NSA;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

  unsigned VgprBase = 0;
  bool NSA = false;
  for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    Register Reg = Op.getReg();
    if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    Register PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // TODO: address the below limitation to handle GFX11 BVH instructions
      // Bail if the address is not a VGPR32. It should be possible to extend
      // the optimization to work with subregisters of wider register tuples,
      // but the logic to find free registers would be much more complicated,
      // with far less chance of success. It seems reasonable to assume that
      // in most cases a tuple is used because a vector variable contains
      // different parts of an address, which is either already consecutive
      // or cannot be reassigned if it is not. If needed, it is better to rely
      // on the register coalescer to process such address tuples.
      if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 ||
          Op.getSubReg())
        return NSA_Status::FIXED;

      // InlineSpiller does not call LRM::assign() after an LI split, leaving
      // it in an inconsistent state, so we cannot call LRM::unassign().
      // See llvm bug #48911.
      // Skip the reassignment if a register originated from such a split.
      // FIXME: Remove the workaround when bug #48911 is fixed.
      if (VRM->getPreSplitReg(Reg))
        return NSA_Status::FIXED;

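      // A copy between the virtual register and its currently assigned
      // physical register becomes a no-op after allocation; reassigning the
      // virtual register would turn it into a real move, so leave such
      // operands alone.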
      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      for (const MachineOperand &U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}

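// Entry point. Bail out unless the subtarget supports both the NSA and the
// non-NSA MIMG encodings: making addresses contiguous only pays off if
// SIShrinkInstructions can actually switch to the non-NSA form afterwards.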
bool GCNNSAReassignImpl::run(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(
      ST->getMaxNumVGPRs(MFI->getOccupancy(), MFI->getDynamicVGPRBlockSize()),
      MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

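  // First pass: collect every NSA instruction together with a flag recording
  // whether its address registers already ended up contiguous. Contiguous
  // candidates are kept so the second pass can verify that reassigning one
  // instruction does not break another that was already convertible.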
  using Candidate = std::pair<const MachineInstr *, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // It already happens to be contiguous.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
        AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

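    // Collect the live intervals of all address operands together with the
    // slot index range they span; any previously contiguous candidate inside
    // that range must be rechecked after the reassignment.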
    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<MCRegister, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::is_contained(Intervals, LI)) {
        // Same register used, unable to make sequential.
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      if (LI->empty()) {
        // The address input is undef, so it doesn't contribute to the relevant
        // range. Seed a reasonable index range if required.
        if (I == 0)
          MinInd = MaxInd = LIS->getInstructionIndex(*MI);
        continue;
      }
      MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI : Intervals)
                 dbgs() << " " << llvm::printReg(VRM->getPhys(LI->reg()), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      auto *I =
          std::lower_bound(Candidates.begin(), &C, MinInd,
                           [this](const Candidate &C, SlotIndex I) {
                             return LIS->getInstructionIndex(*C.first) < I;
                           });
      for (auto *E = Candidates.end();
           Success && I != E && LIS->getInstructionIndex(*I->first) < MaxInd;
           ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

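    // Roll back: drop whatever assignment tryAssignRegisters may have left
    // behind and restore the original physical registers recorded above.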
    if (!Success) {
      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        if (VRM->hasPhys(Intervals[I]->reg()))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(
        dbgs() << "\tNew allocation:\t\t ["
               << llvm::printReg(VRM->getPhys(Intervals.front()->reg()), TRI)
               << " : "
               << llvm::printReg(VRM->getPhys(Intervals.back()->reg()), TRI)
               << "]\n");
    Changed = true;
  }

  return Changed;
}

bool GCNNSAReassignLegacy::runOnMachineFunction(MachineFunction &MF) {
  auto *VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
  auto *LRM = &getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
  auto *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();

  GCNNSAReassignImpl Impl(VRM, LRM, LIS);
  return Impl.run(MF);
}

PreservedAnalyses
GCNNSAReassignPass::run(MachineFunction &MF,
                        MachineFunctionAnalysisManager &MFAM) {
  auto &VRM = MFAM.getResult<VirtRegMapAnalysis>(MF);
  auto &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
  auto &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);

  GCNNSAReassignImpl Impl(&VRM, &LRM, &LIS);
  Impl.run(MF);
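  // Reassignments are applied in place to the existing VirtRegMap,
  // LiveRegMatrix and LiveIntervals, so every analysis remains valid.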
  return PreservedAnalyses::all();
}