//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
/// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
/// with sequential versions where possible.
///
//===----------------------------------------------------------------------===//
15
#include "GCNNSAReassign.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
#include <algorithm>
#include <utility>
27
28using namespace llvm;
29
30#define DEBUG_TYPE "amdgpu-nsa-reassign"
31
32STATISTIC(NumNSAInstructions,
33 "Number of NSA instructions with non-sequential address found");
34STATISTIC(NumNSAConverted,
35 "Number of NSA instructions changed to sequential");
36
37namespace {
38class GCNNSAReassignImpl {
39public:
40 GCNNSAReassignImpl(VirtRegMap *VM, LiveRegMatrix *LM, LiveIntervals *LS)
41 : VRM(VM), LRM(LM), LIS(LS) {}
42
43 bool run(MachineFunction &MF);
44
45private:
46 enum NSA_Status {
47 NOT_NSA, // Not an NSA instruction
48 FIXED, // NSA which we cannot modify
49 NON_CONTIGUOUS, // NSA with non-sequential address which we can try
50 // to optimize.
51 CONTIGUOUS // NSA with all sequential address registers
52 };
53
54 const GCNSubtarget *ST;
55
56 const MachineRegisterInfo *MRI;
57
58 const SIRegisterInfo *TRI;
59
60 VirtRegMap *VRM;
61
62 LiveRegMatrix *LRM;
63
64 LiveIntervals *LIS;
65
66 unsigned MaxNumVGPRs;
67
68 const MCPhysReg *CSRegs;
69
70 NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
71
72 bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
73 unsigned StartReg) const;
74
75 bool canAssign(unsigned StartReg, unsigned NumRegs) const;
76
77 bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
78};
79
80class GCNNSAReassignLegacy : public MachineFunctionPass {
81public:
82 static char ID;
83
84 GCNNSAReassignLegacy() : MachineFunctionPass(ID) {}
85
86 bool runOnMachineFunction(MachineFunction &MF) override;
87
88 StringRef getPassName() const override { return "GCN NSA Reassign"; };
89
90 void getAnalysisUsage(AnalysisUsage &AU) const override {
91 AU.addRequired<LiveIntervalsWrapperPass>();
92 AU.addRequired<VirtRegMapWrapperLegacy>();
93 AU.addRequired<LiveRegMatrixWrapperLegacy>();
94 AU.setPreservesAll();
95 MachineFunctionPass::getAnalysisUsage(AU);
96 }
97};
98
99} // End anonymous namespace.
100
101INITIALIZE_PASS_BEGIN(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign",
102 false, false)
103INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
104INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
105INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
106INITIALIZE_PASS_END(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign", false,
107 false)
108
109char GCNNSAReassignLegacy::ID = 0;
110
111char &llvm::GCNNSAReassignID = GCNNSAReassignLegacy::ID;
112
113bool GCNNSAReassignImpl::tryAssignRegisters(
114 SmallVectorImpl<LiveInterval *> &Intervals, unsigned StartReg) const {
115 unsigned NumRegs = Intervals.size();
116
117 for (unsigned N = 0; N < NumRegs; ++N)
118 if (VRM->hasPhys(virtReg: Intervals[N]->reg()))
119 LRM->unassign(VirtReg: *Intervals[N]);
120
121 for (unsigned N = 0; N < NumRegs; ++N)
122 if (LRM->checkInterference(VirtReg: *Intervals[N], PhysReg: MCRegister::from(Val: StartReg + N)))
123 return false;
124
125 for (unsigned N = 0; N < NumRegs; ++N)
126 LRM->assign(VirtReg: *Intervals[N], PhysReg: MCRegister::from(Val: StartReg + N));
127
128 return true;
129}
130
131bool GCNNSAReassignImpl::canAssign(unsigned StartReg, unsigned NumRegs) const {
132 for (unsigned N = 0; N < NumRegs; ++N) {
133 unsigned Reg = StartReg + N;
134 if (!MRI->isAllocatable(PhysReg: Reg))
135 return false;
136
137 for (unsigned I = 0; CSRegs[I]; ++I)
138 if (TRI->isSubRegisterEq(RegA: Reg, RegB: CSRegs[I]) &&
139 !LRM->isPhysRegUsed(PhysReg: CSRegs[I]))
140 return false;
141 }
142
143 return true;
144}
145
146bool GCNNSAReassignImpl::scavengeRegs(
147 SmallVectorImpl<LiveInterval *> &Intervals) const {
148 unsigned NumRegs = Intervals.size();
149
150 if (NumRegs > MaxNumVGPRs)
151 return false;
152 unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
153
154 for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
155 if (!canAssign(StartReg: Reg, NumRegs))
156 continue;
157
158 if (tryAssignRegisters(Intervals, StartReg: Reg))
159 return true;
160 }
161
162 return false;
163}
164
165GCNNSAReassignImpl::NSA_Status
166GCNNSAReassignImpl::CheckNSA(const MachineInstr &MI, bool Fast) const {
167 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI.getOpcode());
168 if (!Info)
169 return NSA_Status::NOT_NSA;
170
171 switch (Info->MIMGEncoding) {
172 case AMDGPU::MIMGEncGfx10NSA:
173 case AMDGPU::MIMGEncGfx11NSA:
174 break;
175 default:
176 return NSA_Status::NOT_NSA;
177 }
178
179 int VAddr0Idx =
180 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vaddr0);
181
182 unsigned VgprBase = 0;
183 bool NSA = false;
184 for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
185 const MachineOperand &Op = MI.getOperand(i: VAddr0Idx + I);
186 Register Reg = Op.getReg();
187 if (Reg.isPhysical() || !VRM->isAssignedReg(virtReg: Reg))
188 return NSA_Status::FIXED;
189
190 Register PhysReg = VRM->getPhys(virtReg: Reg);
191
192 if (!Fast) {
193 if (!PhysReg)
194 return NSA_Status::FIXED;
195
196 // TODO: address the below limitation to handle GFX11 BVH instructions
197 // Bail if address is not a VGPR32. That should be possible to extend the
198 // optimization to work with subregs of a wider register tuples, but the
199 // logic to find free registers will be much more complicated with much
200 // less chances for success. That seems reasonable to assume that in most
201 // cases a tuple is used because a vector variable contains different
202 // parts of an address and it is either already consecutive or cannot
203 // be reassigned if not. If needed it is better to rely on register
204 // coalescer to process such address tuples.
205 if (TRI->getRegSizeInBits(RC: *MRI->getRegClass(Reg)) != 32 || Op.getSubReg())
206 return NSA_Status::FIXED;
207
208 // InlineSpiller does not call LRM::assign() after an LI split leaving
209 // it in an inconsistent state, so we cannot call LRM::unassign().
210 // See llvm bug #48911.
211 // Skip reassign if a register has originated from such split.
212 // FIXME: Remove the workaround when bug #48911 is fixed.
213 if (VRM->getPreSplitReg(virtReg: Reg))
214 return NSA_Status::FIXED;
215
216 const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
217
218 if (Def && Def->isCopy() && Def->getOperand(i: 1).getReg() == PhysReg)
219 return NSA_Status::FIXED;
220
221 for (auto U : MRI->use_nodbg_operands(Reg)) {
222 if (U.isImplicit())
223 return NSA_Status::FIXED;
224 const MachineInstr *UseInst = U.getParent();
225 if (UseInst->isCopy() && UseInst->getOperand(i: 0).getReg() == PhysReg)
226 return NSA_Status::FIXED;
227 }
228
229 if (!LIS->hasInterval(Reg))
230 return NSA_Status::FIXED;
231 }
232
233 if (I == 0)
234 VgprBase = PhysReg;
235 else if (VgprBase + I != PhysReg)
236 NSA = true;
237 }
238
239 return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
240}
241
242bool GCNNSAReassignImpl::run(MachineFunction &MF) {
243 ST = &MF.getSubtarget<GCNSubtarget>();
244 if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding())
245 return false;
246
247 MRI = &MF.getRegInfo();
248 TRI = ST->getRegisterInfo();
249
250 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
251 MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
252 MaxNumVGPRs = std::min(
253 a: ST->getMaxNumVGPRs(WavesPerEU: MFI->getOccupancy(), DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize()),
254 b: MaxNumVGPRs);
255 CSRegs = MRI->getCalleeSavedRegs();
256
257 using Candidate = std::pair<const MachineInstr*, bool>;
258 SmallVector<Candidate, 32> Candidates;
259 for (const MachineBasicBlock &MBB : MF) {
260 for (const MachineInstr &MI : MBB) {
261 switch (CheckNSA(MI)) {
262 default:
263 continue;
264 case NSA_Status::CONTIGUOUS:
265 Candidates.push_back(Elt: std::pair(&MI, true));
266 break;
267 case NSA_Status::NON_CONTIGUOUS:
268 Candidates.push_back(Elt: std::pair(&MI, false));
269 ++NumNSAInstructions;
270 break;
271 }
272 }
273 }
274
275 bool Changed = false;
276 for (auto &C : Candidates) {
277 if (C.second)
278 continue;
279
280 const MachineInstr *MI = C.first;
281 if (CheckNSA(MI: *MI, Fast: true) == NSA_Status::CONTIGUOUS) {
282 // Already happen to be fixed.
283 C.second = true;
284 ++NumNSAConverted;
285 continue;
286 }
287
288 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI->getOpcode());
289 int VAddr0Idx =
290 AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(), Name: AMDGPU::OpName::vaddr0);
291
292 SmallVector<LiveInterval *, 16> Intervals;
293 SmallVector<MCRegister, 16> OrigRegs;
294 SlotIndex MinInd, MaxInd;
295 for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
296 const MachineOperand &Op = MI->getOperand(i: VAddr0Idx + I);
297 Register Reg = Op.getReg();
298 LiveInterval *LI = &LIS->getInterval(Reg);
299 if (llvm::is_contained(Range&: Intervals, Element: LI)) {
300 // Same register used, unable to make sequential
301 Intervals.clear();
302 break;
303 }
304 Intervals.push_back(Elt: LI);
305 OrigRegs.push_back(Elt: VRM->getPhys(virtReg: Reg));
306 if (LI->empty()) {
307 // The address input is undef, so it doesn't contribute to the relevant
308 // range. Seed a reasonable index range if required.
309 if (I == 0)
310 MinInd = MaxInd = LIS->getInstructionIndex(Instr: *MI);
311 continue;
312 }
313 MinInd = I != 0 ? std::min(a: MinInd, b: LI->beginIndex()) : LI->beginIndex();
314 MaxInd = I != 0 ? std::max(a: MaxInd, b: LI->endIndex()) : LI->endIndex();
315 }
316
317 if (Intervals.empty())
318 continue;
319
320 LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
321 << "\tOriginal allocation:\t";
322 for (auto *LI
323 : Intervals) dbgs()
324 << " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
325 dbgs() << '\n');
326
327 bool Success = scavengeRegs(Intervals);
328 if (!Success) {
329 LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
330 if (VRM->hasPhys(virtReg: Intervals.back()->reg())) // Did not change allocation.
331 continue;
332 } else {
333 // Check we did not make it worse for other instructions.
334 auto *I =
335 std::lower_bound(first: Candidates.begin(), last: &C, val: MinInd,
336 comp: [this](const Candidate &C, SlotIndex I) {
337 return LIS->getInstructionIndex(Instr: *C.first) < I;
338 });
339 for (auto *E = Candidates.end();
340 Success && I != E && LIS->getInstructionIndex(Instr: *I->first) < MaxInd;
341 ++I) {
342 if (I->second && CheckNSA(MI: *I->first, Fast: true) < NSA_Status::CONTIGUOUS) {
343 Success = false;
344 LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
345 }
346 }
347 }
348
349 if (!Success) {
350 for (unsigned I = 0; I < Info->VAddrOperands; ++I)
351 if (VRM->hasPhys(virtReg: Intervals[I]->reg()))
352 LRM->unassign(VirtReg: *Intervals[I]);
353
354 for (unsigned I = 0; I < Info->VAddrOperands; ++I)
355 LRM->assign(VirtReg: *Intervals[I], PhysReg: OrigRegs[I]);
356
357 continue;
358 }
359
360 C.second = true;
361 ++NumNSAConverted;
362 LLVM_DEBUG(
363 dbgs() << "\tNew allocation:\t\t ["
364 << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
365 << " : "
366 << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
367 << "]\n");
368 Changed = true;
369 }
370
371 return Changed;
372}
373
374bool GCNNSAReassignLegacy::runOnMachineFunction(MachineFunction &MF) {
375 auto *VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
376 auto *LRM = &getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
377 auto *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
378
379 GCNNSAReassignImpl Impl(VRM, LRM, LIS);
380 return Impl.run(MF);
381}
382
383PreservedAnalyses
384GCNNSAReassignPass::run(MachineFunction &MF,
385 MachineFunctionAnalysisManager &MFAM) {
386 auto &VRM = MFAM.getResult<VirtRegMapAnalysis>(IR&: MF);
387 auto &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(IR&: MF);
388 auto &LIS = MFAM.getResult<LiveIntervalsAnalysis>(IR&: MF);
389
390 GCNNSAReassignImpl Impl(&VRM, &LRM, &LIS);
391 Impl.run(MF);
392 return PreservedAnalyses::all();
393}
394