//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Try to reassign registers on GFX10+ from non-sequential to
/// sequential in NSA image instructions. The SIShrinkInstructions pass will
/// later replace NSA encodings with sequential versions where possible.
///
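/// For illustration only (hypothetical machine IR, not from a real
/// compilation): an NSA image instruction whose address operands were
/// allocated to, say, $vgpr7, $vgpr3 and $vgpr5 may be reassigned to
/// $vgpr4, $vgpr5 and $vgpr6, after which SIShrinkInstructions can rewrite
/// it to the shorter sequential (non-NSA) encoding.
///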
//===----------------------------------------------------------------------===//

#include "GCNNSAReassign.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-nsa-reassign"

STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");

namespace {
class GCNNSAReassignImpl {
public:
  GCNNSAReassignImpl(VirtRegMap *VM, LiveRegMatrix *LM, LiveIntervals *LS)
      : VRM(VM), LRM(LM), LIS(LS) {}

  bool run(MachineFunction &MF);

private:
  using NSA_Status = enum {
    NOT_NSA,        // Not an NSA instruction
    FIXED,          // NSA which we cannot modify
    NON_CONTIGUOUS, // NSA with non-sequential address which we can try
                    // to optimize.
    CONTIGUOUS      // NSA with all sequential address registers
  };

  const GCNSubtarget *ST;

  const MachineRegisterInfo *MRI;

  const SIRegisterInfo *TRI;

  VirtRegMap *VRM;

  LiveRegMatrix *LRM;

  LiveIntervals *LIS;

  unsigned MaxNumVGPRs;

  const MCPhysReg *CSRegs;

  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};

class GCNNSAReassignLegacy : public MachineFunctionPass {
public:
  static char ID;

  GCNNSAReassignLegacy() : MachineFunctionPass(ID) {
    initializeGCNNSAReassignLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN NSA Reassign"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervalsWrapperPass>();
    AU.addRequired<VirtRegMapWrapperLegacy>();
    AU.addRequired<LiveRegMatrixWrapperLegacy>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
INITIALIZE_PASS_END(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign", false,
                    false)

char GCNNSAReassignLegacy::ID = 0;

char &llvm::GCNNSAReassignID = GCNNSAReassignLegacy::ID;

bool GCNNSAReassignImpl::tryAssignRegisters(
    SmallVectorImpl<LiveInterval *> &Intervals, unsigned StartReg) const {
  unsigned NumRegs = Intervals.size();

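  // Temporarily unassign all the intervals first, so that the interference
  // checks below are not confused by these intervals' own current locations.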
  for (unsigned N = 0; N < NumRegs; ++N)
    if (VRM->hasPhys(Intervals[N]->reg()))
      LRM->unassign(*Intervals[N]);

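  // Verify that the entire candidate sequential range is interference-free.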
  for (unsigned N = 0; N < NumRegs; ++N)
    if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
      return false;

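  // All registers in the range are free; commit the sequential assignment.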
  for (unsigned N = 0; N < NumRegs; ++N)
    LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));

  return true;
}

bool GCNNSAReassignImpl::canAssign(unsigned StartReg, unsigned NumRegs) const {
  for (unsigned N = 0; N < NumRegs; ++N) {
    unsigned Reg = StartReg + N;
    if (!MRI->isAllocatable(Reg))
      return false;

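    // Refuse to allocate into a callee-saved register (or any register
    // aliasing one) that is not already used in this function, since that
    // would create a new save/restore obligation.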
    for (unsigned I = 0; CSRegs[I]; ++I)
      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
          !LRM->isPhysRegUsed(CSRegs[I]))
        return false;
  }

  return true;
}

bool GCNNSAReassignImpl::scavengeRegs(
    SmallVectorImpl<LiveInterval *> &Intervals) const {
  unsigned NumRegs = Intervals.size();

  if (NumRegs > MaxNumVGPRs)
    return false;
  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;

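  // Linearly scan every possible sequential start position in the VGPR file.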
  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
    if (!canAssign(Reg, NumRegs))
      continue;

    if (tryAssignRegisters(Intervals, Reg))
      return true;
  }

  return false;
}

GCNNSAReassignImpl::NSA_Status
GCNNSAReassignImpl::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return NSA_Status::NOT_NSA;

  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
  case AMDGPU::MIMGEncGfx11NSA:
    break;
  default:
    return NSA_Status::NOT_NSA;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

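  // Walk the address operands, remembering the physical register of the
  // first one; any later operand not allocated at VgprBase + I makes the
  // address non-sequential.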
  unsigned VgprBase = 0;
  bool NSA = false;
  for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    Register Reg = Op.getReg();
    if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    Register PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // TODO: Address the limitation below to handle GFX11 BVH instructions.
      // Bail if the address is not a VGPR32. It should be possible to extend
      // the optimization to work with subregs of wider register tuples, but
      // the logic to find free registers would be much more complicated, with
      // much less chance of success. It seems reasonable to assume that in
      // most cases a tuple is used because a vector variable contains
      // different parts of an address, which is either already consecutive or
      // cannot be reassigned if it is not. If needed, it is better to rely on
      // the register coalescer to process such address tuples.
      if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 ||
          Op.getSubReg())
        return NSA_Status::FIXED;
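      // For illustration only (hypothetical example): an address operand like
      // %10.sub1, where %10 is a 64-bit register tuple, is rejected here;
      // only plain 32-bit virtual registers without a subregister index are
      // considered.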

      // InlineSpiller does not call LRM::assign() after an LI split, leaving
      // it in an inconsistent state, so we cannot call LRM::unassign().
      // See LLVM bug #48911.
      // Skip reassignment if a register originated from such a split.
      // FIXME: Remove the workaround when bug #48911 is fixed.
      if (VRM->getPreSplitReg(Reg))
        return NSA_Status::FIXED;

      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      for (auto U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}

bool GCNNSAReassignImpl::run(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(
      ST->getMaxNumVGPRs(MFI->getOccupancy(), MFI->getDynamicVGPRBlockSize()),
      MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

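  // First collect all NSA instructions in the function, recording for each
  // one whether its address registers are already contiguous.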
  using Candidate = std::pair<const MachineInstr *, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // The address already happens to be contiguous.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
        AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<MCRegister, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::is_contained(Intervals, LI)) {
        // The same register is used more than once; the addresses cannot be
        // made sequential.
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      if (LI->empty()) {
        // The address input is undef, so it doesn't contribute to the relevant
        // range. Seed a reasonable index range if required.
        if (I == 0)
          MinInd = MaxInd = LIS->getInstructionIndex(*MI);
        continue;
      }
      MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI : Intervals)
                 dbgs() << " " << llvm::printReg(VRM->getPhys(LI->reg()), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
        continue;
    } else {
      // Check that we did not make the allocation worse for other
      // instructions.
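      // Candidates were collected in program order, so their slot indexes
      // are sorted; binary-search for the first candidate that may overlap
      // the [MinInd, MaxInd) window of the intervals we just moved.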
      auto *I =
          std::lower_bound(Candidates.begin(), &C, MinInd,
                           [this](const Candidate &C, SlotIndex I) {
                             return LIS->getInstructionIndex(*C.first) < I;
                           });
      for (auto *E = Candidates.end();
           Success && I != E && LIS->getInstructionIndex(*I->first) < MaxInd;
           ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

    if (!Success) {
      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        if (VRM->hasPhys(Intervals[I]->reg()))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(
        dbgs() << "\tNew allocation:\t\t ["
               << llvm::printReg(VRM->getPhys(Intervals.front()->reg()), TRI)
               << " : "
               << llvm::printReg(VRM->getPhys(Intervals.back()->reg()), TRI)
               << "]\n");
    Changed = true;
  }

  return Changed;
}

bool GCNNSAReassignLegacy::runOnMachineFunction(MachineFunction &MF) {
  auto *VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
  auto *LRM = &getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
  auto *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();

  GCNNSAReassignImpl Impl(VRM, LRM, LIS);
  return Impl.run(MF);
}

PreservedAnalyses
GCNNSAReassignPass::run(MachineFunction &MF,
                        MachineFunctionAnalysisManager &MFAM) {
  auto &VRM = MFAM.getResult<VirtRegMapAnalysis>(MF);
  auto &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
  auto &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);

  GCNNSAReassignImpl Impl(&VRM, &LRM, &LIS);
  Impl.run(MF);
  return PreservedAnalyses::all();
}