| 1 | //===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file |
| 10 | /// Copies from VGPR to SGPR registers are illegal and the register coalescer |
| 11 | /// will sometimes generate these illegal copies in situations like this: |
| 12 | /// |
| 13 | /// Register Class <vsrc> is the union of <vgpr> and <sgpr> |
| 14 | /// |
| 15 | /// BB0: |
| 16 | /// %0 <sgpr> = SCALAR_INST |
| 17 | /// %1 <vsrc> = COPY %0 <sgpr> |
| 18 | /// ... |
| 19 | /// BRANCH %cond BB1, BB2 |
| 20 | /// BB1: |
| 21 | /// %2 <vgpr> = VECTOR_INST |
| 22 | /// %3 <vsrc> = COPY %2 <vgpr> |
| 23 | /// BB2: |
| 24 | /// %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1> |
| 25 | /// %5 <vgpr> = VECTOR_INST %4 <vsrc> |
| 26 | /// |
| 27 | /// |
| 28 | /// The coalescer will begin at BB0 and eliminate its copy, then the resulting |
| 29 | /// code will look like this: |
| 30 | /// |
| 31 | /// BB0: |
| 32 | /// %0 <sgpr> = SCALAR_INST |
| 33 | /// ... |
| 34 | /// BRANCH %cond BB1, BB2 |
| 35 | /// BB1: |
| 36 | /// %2 <vgpr> = VECTOR_INST |
| 37 | /// %3 <vsrc> = COPY %2 <vgpr> |
| 38 | /// BB2: |
| 39 | /// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1> |
| 40 | /// %5 <vgpr> = VECTOR_INST %4 <sgpr> |
| 41 | /// |
| 42 | /// Now that the result of the PHI instruction is an SGPR, the register |
| 43 | /// allocator is forced to constrain the register class of %3 to |
| 44 | /// <sgpr>, so we end up with final code like this: |
| 45 | /// |
| 46 | /// BB0: |
| 47 | /// %0 <sgpr> = SCALAR_INST |
| 48 | /// ... |
| 49 | /// BRANCH %cond BB1, BB2 |
| 50 | /// BB1: |
| 51 | /// %2 <vgpr> = VECTOR_INST |
| 52 | /// %3 <sgpr> = COPY %2 <vgpr> |
| 53 | /// BB2: |
| 54 | /// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1> |
| 55 | /// %5 <vgpr> = VECTOR_INST %4 <sgpr> |
| 56 | /// |
| 57 | /// Now this code contains an illegal copy from a VGPR to an SGPR. |
| 58 | /// |
| 59 | /// In order to avoid this problem, this pass searches for PHI instructions |
| 60 | /// which define a <vsrc> register and constrains their definition class to |
| 61 | /// <vgpr> if any user of the PHI's definition register is a vector instruction. |
| 62 | /// If the PHI's definition class is constrained to <vgpr>, then the coalescer |
| 63 | /// will be unable to perform the COPY removal from the above example, which |
| 64 | /// is what ultimately led to the creation of an illegal COPY. |
| 65 | //===----------------------------------------------------------------------===// |
| 66 | |
| 67 | #include "SIFixSGPRCopies.h" |
| 68 | #include "AMDGPU.h" |
| 69 | #include "GCNSubtarget.h" |
| 70 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| 71 | #include "llvm/CodeGen/MachineDominators.h" |
| 72 | #include "llvm/InitializePasses.h" |
| 73 | #include "llvm/Target/TargetMachine.h" |
| 74 | |
| 75 | using namespace llvm; |
| 76 | |
| 77 | #define DEBUG_TYPE "si-fix-sgpr-copies" |
| 78 | |
| 79 | static cl::opt<bool> EnableM0Merge( |
| 80 | "amdgpu-enable-merge-m0" , |
| 81 | cl::desc("Merge and hoist M0 initializations" ), |
| 82 | cl::init(Val: true)); |
| 83 | |
| 84 | namespace { |
| 85 | |
| 86 | class V2SCopyInfo { |
| 87 | public: |
| 88 | // VGPR to SGPR copy being processed |
| 89 | MachineInstr *Copy; |
| 90 | // All SALU instructions reachable from this copy in the SSA graph |
| 91 | SetVector<MachineInstr *> SChain; |
| 92 | // Number of SGPR to VGPR copies that are used to put the SALU computation |
| 93 | // results back to VALU. |
| 94 | unsigned NumSVCopies = 0; |
| 95 | |
| 96 | unsigned Score = 0; |
| 97 | // Actual count of v_readfirstlane_b32 instructions |
| 98 | // which need to be inserted to keep the SChain SALU |
| 99 | unsigned NumReadfirstlanes = 0; |
| 100 | // Current score state. Used to speed up the selection of V2SCopyInfos for processing. |
| 101 | bool NeedToBeConvertedToVALU = false; |
| 102 | // Unique ID. Used as a key for mapping to keep permanent order. |
| 103 | unsigned ID; |
| 104 | |
| 105 | // Count of other VGPR to SGPR copies that contribute to the |
| 106 | // current copy's SChain |
| 107 | unsigned SiblingPenalty = 0; |
| 108 | SetVector<unsigned> Siblings; |
| 109 | V2SCopyInfo() : Copy(nullptr), ID(0){}; |
| 110 | V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) |
| 111 | : Copy(C), NumReadfirstlanes(Width / 32), ID(Id){}; |
| 112 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 113 | void dump() { |
| 114 | dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() |
| 115 | << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty |
| 116 | << "\nScore: " << Score << "\n" ; |
| 117 | } |
| 118 | #endif |
| 119 | }; |
| 120 | |
| 121 | class SIFixSGPRCopies { |
| 122 | MachineDominatorTree *MDT; |
| 123 | SmallVector<MachineInstr*, 4> SCCCopies; |
| 124 | SmallVector<MachineInstr*, 4> RegSequences; |
| 125 | SmallVector<MachineInstr*, 4> PHINodes; |
| 126 | SmallVector<MachineInstr*, 4> S2VCopies; |
| 127 | unsigned NextVGPRToSGPRCopyID = 0; |
| 128 | MapVector<unsigned, V2SCopyInfo> V2SCopies; |
| 129 | DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty; |
| 130 | DenseSet<MachineInstr *> PHISources; |
| 131 | |
| 132 | public: |
| 133 | MachineRegisterInfo *MRI; |
| 134 | const SIRegisterInfo *TRI; |
| 135 | const SIInstrInfo *TII; |
| 136 | |
| 137 | SIFixSGPRCopies(MachineDominatorTree *MDT) : MDT(MDT) {} |
| 138 | |
| 139 | bool run(MachineFunction &MF); |
| 140 | void fixSCCCopies(MachineFunction &MF); |
| 141 | void prepareRegSequenceAndPHIs(MachineFunction &MF); |
| 142 | unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; } |
| 143 | bool needToBeConvertedToVALU(V2SCopyInfo *I); |
| 144 | void analyzeVGPRToSGPRCopy(MachineInstr *MI); |
| 145 | void lowerVGPR2SGPRCopies(MachineFunction &MF); |
| 146 | // Handles copies whose source register is: |
| 147 | // 1. A physical register |
| 148 | // 2. An AGPR |
| 149 | // 3. Defined by an instruction that merely moves an immediate |
| 150 | bool lowerSpecialCase(MachineInstr &MI, MachineBasicBlock::iterator &I); |
| 151 | |
| 152 | void processPHINode(MachineInstr &MI); |
| 153 | |
| 154 | // Check if MO is an immediate materialized into a VGPR, and if so replace it |
| 155 | // with an SGPR immediate. The VGPR immediate is also deleted if it does not |
| 156 | // have any other uses. |
| 157 | bool tryMoveVGPRConstToSGPR(MachineOperand &MO, Register NewDst, |
| 158 | MachineBasicBlock *BlockToInsertTo, |
| 159 | MachineBasicBlock::iterator PointToInsertTo, |
| 160 | const DebugLoc &DL); |
| 161 | }; |
| 162 | |
| 163 | class SIFixSGPRCopiesLegacy : public MachineFunctionPass { |
| 164 | public: |
| 165 | static char ID; |
| 166 | |
| 167 | SIFixSGPRCopiesLegacy() : MachineFunctionPass(ID) {} |
| 168 | |
| 169 | bool runOnMachineFunction(MachineFunction &MF) override { |
| 170 | MachineDominatorTree *MDT = |
| 171 | &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); |
| 172 | SIFixSGPRCopies Impl(MDT); |
| 173 | return Impl.run(MF); |
| 174 | } |
| 175 | |
| 176 | StringRef getPassName() const override { return "SI Fix SGPR copies"; } |
| 177 | |
| 178 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
| 179 | AU.addRequired<MachineDominatorTreeWrapperPass>(); |
| 180 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
| 181 | AU.setPreservesCFG(); |
| 182 | MachineFunctionPass::getAnalysisUsage(AU); |
| 183 | } |
| 184 | }; |
| 185 | |
| 186 | } // end anonymous namespace |
| 187 | |
| 188 | INITIALIZE_PASS_BEGIN(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies", |
| 189 | false, false) |
| 190 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) |
| 191 | INITIALIZE_PASS_END(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies", |
| 192 | false, false) |
| 193 | |
| 194 | char SIFixSGPRCopiesLegacy::ID = 0; |
| 195 | |
| 196 | char &llvm::SIFixSGPRCopiesLegacyID = SIFixSGPRCopiesLegacy::ID; |
| 197 | |
| 198 | FunctionPass *llvm::createSIFixSGPRCopiesLegacyPass() { |
| 199 | return new SIFixSGPRCopiesLegacy(); |
| 200 | } |
| 201 | |
| 202 | static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> |
| 203 | getCopyRegClasses(const MachineInstr &Copy, |
| 204 | const SIRegisterInfo &TRI, |
| 205 | const MachineRegisterInfo &MRI) { |
| 206 | Register DstReg = Copy.getOperand(i: 0).getReg(); |
| 207 | Register SrcReg = Copy.getOperand(i: 1).getReg(); |
| 208 | |
| 209 | const TargetRegisterClass *SrcRC = SrcReg.isVirtual() |
| 210 | ? MRI.getRegClass(Reg: SrcReg) |
| 211 | : TRI.getPhysRegBaseClass(Reg: SrcReg); |
| 212 | |
| 213 | // We don't really care about the subregister here. |
| 214 | // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); |
| 215 | |
| 216 | const TargetRegisterClass *DstRC = DstReg.isVirtual() |
| 217 | ? MRI.getRegClass(Reg: DstReg) |
| 218 | : TRI.getPhysRegBaseClass(Reg: DstReg); |
| 219 | |
| 220 | return std::pair(SrcRC, DstRC); |
| 221 | } |
| 222 | |
| 223 | static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, |
| 224 | const TargetRegisterClass *DstRC, |
| 225 | const SIRegisterInfo &TRI) { |
| 226 | return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(RC: DstRC) && |
| 227 | TRI.hasVectorRegisters(RC: SrcRC); |
| 228 | } |
| 229 | |
| 230 | static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, |
| 231 | const TargetRegisterClass *DstRC, |
| 232 | const SIRegisterInfo &TRI) { |
| 233 | return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(RC: SrcRC) && |
| 234 | TRI.hasVectorRegisters(RC: DstRC); |
| 235 | } |
| 236 | |
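// Try to turn an SGPR -> VGPR copy into an SGPR -> SGPR copy by constraining
// the destination register class to SGPR. This only succeeds when both
// registers are virtual and every other user of the destination is a target
// instruction in the same block that accepts the SGPR source as a legal
// operand.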
| 237 | static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, |
| 238 | const SIRegisterInfo *TRI, |
| 239 | const SIInstrInfo *TII) { |
| 240 | MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); |
| 241 | auto &Src = MI.getOperand(i: 1); |
| 242 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 243 | Register SrcReg = Src.getReg(); |
| 244 | if (!SrcReg.isVirtual() || !DstReg.isVirtual()) |
| 245 | return false; |
| 246 | |
| 247 | for (const auto &MO : MRI.reg_nodbg_operands(Reg: DstReg)) { |
| 248 | const auto *UseMI = MO.getParent(); |
| 249 | if (UseMI == &MI) |
| 250 | continue; |
| 251 | if (MO.isDef() || UseMI->getParent() != MI.getParent() || |
| 252 | UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) |
| 253 | return false; |
| 254 | |
| 255 | unsigned OpIdx = MO.getOperandNo(); |
| 256 | if (OpIdx >= UseMI->getDesc().getNumOperands() || |
| 257 | !TII->isOperandLegal(MI: *UseMI, OpIdx, MO: &Src)) |
| 258 | return false; |
| 259 | } |
| 260 | // Change VGPR to SGPR destination. |
| 261 | MRI.setRegClass(Reg: DstReg, RC: TRI->getEquivalentSGPRClass(VRC: MRI.getRegClass(Reg: DstReg))); |
| 262 | return true; |
| 263 | } |
| 264 | |
| 265 | // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. |
| 266 | // |
| 267 | // SGPRx = ... |
| 268 | // SGPRy = REG_SEQUENCE SGPRx, sub0 ... |
| 269 | // VGPRz = COPY SGPRy |
| 270 | // |
| 271 | // ==> |
| 272 | // |
| 273 | // VGPRx = COPY SGPRx |
| 274 | // VGPRz = REG_SEQUENCE VGPRx, sub0 |
| 275 | // |
| 276 | // This exposes immediate folding opportunities when materializing 64-bit |
| 277 | // immediates. |
| 278 | static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, |
| 279 | const SIRegisterInfo *TRI, |
| 280 | const SIInstrInfo *TII, |
| 281 | MachineRegisterInfo &MRI) { |
| 282 | assert(MI.isRegSequence()); |
| 283 | |
| 284 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 285 | if (!TRI->isSGPRClass(RC: MRI.getRegClass(Reg: DstReg))) |
| 286 | return false; |
| 287 | |
| 288 | if (!MRI.hasOneUse(RegNo: DstReg)) |
| 289 | return false; |
| 290 | |
| 291 | MachineInstr &CopyUse = *MRI.use_instr_begin(RegNo: DstReg); |
| 292 | if (!CopyUse.isCopy()) |
| 293 | return false; |
| 294 | |
| 295 | // It is illegal to have vreg inputs to a physreg defining reg_sequence. |
| 296 | if (CopyUse.getOperand(i: 0).getReg().isPhysical()) |
| 297 | return false; |
| 298 | |
| 299 | const TargetRegisterClass *SrcRC, *DstRC; |
| 300 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: CopyUse, TRI: *TRI, MRI); |
| 301 | |
| 302 | if (!isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
| 303 | return false; |
| 304 | |
| 305 | if (tryChangeVGPRtoSGPRinCopy(MI&: CopyUse, TRI, TII)) |
| 306 | return true; |
| 307 | |
| 308 | // TODO: Could have multiple extracts? |
| 309 | unsigned SubReg = CopyUse.getOperand(i: 1).getSubReg(); |
| 310 | if (SubReg != AMDGPU::NoSubRegister) |
| 311 | return false; |
| 312 | |
| 313 | MRI.setRegClass(Reg: DstReg, RC: DstRC); |
| 314 | |
| 315 | // SGPRx = ... |
| 316 | // SGPRy = REG_SEQUENCE SGPRx, sub0 ... |
| 317 | // VGPRz = COPY SGPRy |
| 318 | |
| 319 | // => |
| 320 | // VGPRx = COPY SGPRx |
| 321 | // VGPRz = REG_SEQUENCE VGPRx, sub0 |
| 322 | |
| 323 | MI.getOperand(i: 0).setReg(CopyUse.getOperand(i: 0).getReg()); |
| 324 | bool IsAGPR = TRI->isAGPRClass(RC: DstRC); |
| 325 | |
| 326 | for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { |
| 327 | const TargetRegisterClass *SrcRC = |
| 328 | TRI->getRegClassForOperandReg(MRI, MO: MI.getOperand(i: I)); |
| 329 | assert(TRI->isSGPRClass(SrcRC) && |
| 330 | "Expected SGPR REG_SEQUENCE to only have SGPR inputs" ); |
| 331 | const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SRC: SrcRC); |
| 332 | |
| 333 | Register TmpReg = MRI.createVirtualRegister(RegClass: NewSrcRC); |
| 334 | |
| 335 | BuildMI(BB&: *MI.getParent(), I: &MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), |
| 336 | DestReg: TmpReg) |
| 337 | .add(MO: MI.getOperand(i: I)); |
| 338 | |
| 339 | if (IsAGPR) { |
| 340 | const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SRC: SrcRC); |
| 341 | Register TmpAReg = MRI.createVirtualRegister(RegClass: NewSrcRC); |
| 342 | unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ? |
| 343 | AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY; |
| 344 | BuildMI(BB&: *MI.getParent(), I: &MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc), |
| 345 | DestReg: TmpAReg) |
| 346 | .addReg(RegNo: TmpReg, flags: RegState::Kill); |
| 347 | TmpReg = TmpAReg; |
| 348 | } |
| 349 | |
| 350 | MI.getOperand(i: I).setReg(TmpReg); |
| 351 | } |
| 352 | |
| 353 | CopyUse.eraseFromParent(); |
| 354 | return true; |
| 355 | } |
| 356 | |
| 357 | static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, |
| 358 | const MachineInstr *MoveImm, |
| 359 | const SIInstrInfo *TII, |
| 360 | unsigned &SMovOp, |
| 361 | int64_t &Imm) { |
| 362 | if (Copy->getOpcode() != AMDGPU::COPY) |
| 363 | return false; |
| 364 | |
| 365 | if (!MoveImm->isMoveImmediate()) |
| 366 | return false; |
| 367 | |
| 368 | const MachineOperand *ImmOp = |
| 369 | TII->getNamedOperand(MI: *MoveImm, OperandName: AMDGPU::OpName::src0); |
| 370 | if (!ImmOp->isImm()) |
| 371 | return false; |
| 372 | |
| 373 | // FIXME: Handle copies with sub-regs. |
| 374 | if (Copy->getOperand(i: 1).getSubReg()) |
| 375 | return false; |
| 376 | |
| 377 | switch (MoveImm->getOpcode()) { |
| 378 | default: |
| 379 | return false; |
| 380 | case AMDGPU::V_MOV_B32_e32: |
| 381 | SMovOp = AMDGPU::S_MOV_B32; |
| 382 | break; |
| 383 | case AMDGPU::V_MOV_B64_PSEUDO: |
| 384 | SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO; |
| 385 | break; |
| 386 | } |
| 387 | Imm = ImmOp->getImm(); |
| 388 | return true; |
| 389 | } |
| 390 | |
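// Walk the predecessors of MBB transitively, stopping at CutOff, and return
// true if any visited block satisfies Predicate.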
| 391 | template <class UnaryPredicate> |
| 392 | bool searchPredecessors(const MachineBasicBlock *MBB, |
| 393 | const MachineBasicBlock *CutOff, |
| 394 | UnaryPredicate Predicate) { |
| 395 | if (MBB == CutOff) |
| 396 | return false; |
| 397 | |
| 398 | DenseSet<const MachineBasicBlock *> Visited; |
| 399 | SmallVector<MachineBasicBlock *, 4> Worklist(MBB->predecessors()); |
| 400 | |
| 401 | while (!Worklist.empty()) { |
| 402 | MachineBasicBlock *MBB = Worklist.pop_back_val(); |
| 403 | |
| 404 | if (!Visited.insert(V: MBB).second) |
| 405 | continue; |
| 406 | if (MBB == CutOff) |
| 407 | continue; |
| 408 | if (Predicate(MBB)) |
| 409 | return true; |
| 410 | |
| 411 | Worklist.append(in_start: MBB->pred_begin(), in_end: MBB->pred_end()); |
| 412 | } |
| 413 | |
| 414 | return false; |
| 415 | } |
| 416 | |
| 417 | // Checks if there is a potential path from the From instruction to the To |
| 418 | // instruction. If CutOff is specified and sits on that path, we ignore the |
| 419 | // portion of the path above it and report the destination as not reachable. |
| 420 | static bool isReachable(const MachineInstr *From, |
| 421 | const MachineInstr *To, |
| 422 | const MachineBasicBlock *CutOff, |
| 423 | MachineDominatorTree &MDT) { |
| 424 | if (MDT.dominates(A: From, B: To)) |
| 425 | return true; |
| 426 | |
| 427 | const MachineBasicBlock *MBBFrom = From->getParent(); |
| 428 | const MachineBasicBlock *MBBTo = To->getParent(); |
| 429 | |
| 430 | // Do predecessor search. |
| 431 | // We should almost never get here since we do not usually produce M0 stores |
| 432 | // other than -1. |
| 433 | return searchPredecessors(MBB: MBBTo, CutOff, Predicate: [MBBFrom] |
| 434 | (const MachineBasicBlock *MBB) { return MBB == MBBFrom; }); |
| 435 | } |
| 436 | |
| 437 | // Return the first non-prologue instruction in the block. |
| 438 | static MachineBasicBlock::iterator |
| 439 | getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { |
| 440 | MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); |
| 441 | while (I != MBB->end() && TII->isBasicBlockPrologue(MI: *I)) |
| 442 | ++I; |
| 443 | |
| 444 | return I; |
| 445 | } |
| 446 | |
| 447 | // Hoist and merge identical SGPR initializations into a common predecessor. |
| 448 | // This is intended to combine M0 initializations, but can work with any |
| 449 | // SGPR. A VGPR cannot be processed since we cannot guarantee vector |
| 450 | // execution. |
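// For example (schematically), two identical initializations
//
//   bb.1:  $m0 = S_MOV_B32 -1
//   bb.2:  $m0 = S_MOV_B32 -1
//
// whose blocks share a common dominator bb.0 may be merged into a single
//   bb.0:  $m0 = S_MOV_B32 -1
// provided no clobbering definition of M0 interferes on the paths involved.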
| 451 | static bool hoistAndMergeSGPRInits(unsigned Reg, |
| 452 | const MachineRegisterInfo &MRI, |
| 453 | const TargetRegisterInfo *TRI, |
| 454 | MachineDominatorTree &MDT, |
| 455 | const TargetInstrInfo *TII) { |
| 456 | // List of inits by immediate value. |
| 457 | using InitListMap = std::map<unsigned, std::list<MachineInstr *>>; |
| 458 | InitListMap Inits; |
| 459 | // List of clobbering instructions. |
| 460 | SmallVector<MachineInstr*, 8> Clobbers; |
| 461 | // List of instructions marked for deletion. |
| 462 | SmallSet<MachineInstr*, 8> MergedInstrs; |
| 463 | |
| 464 | bool Changed = false; |
| 465 | |
| 466 | for (auto &MI : MRI.def_instructions(Reg)) { |
| 467 | MachineOperand *Imm = nullptr; |
| 468 | for (auto &MO : MI.operands()) { |
| 469 | if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || |
| 470 | (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { |
| 471 | Imm = nullptr; |
| 472 | break; |
| 473 | } |
| 474 | if (MO.isImm()) |
| 475 | Imm = &MO; |
| 476 | } |
| 477 | if (Imm) |
| 478 | Inits[Imm->getImm()].push_front(x: &MI); |
| 479 | else |
| 480 | Clobbers.push_back(Elt: &MI); |
| 481 | } |
| 482 | |
| 483 | for (auto &Init : Inits) { |
| 484 | auto &Defs = Init.second; |
| 485 | |
| 486 | for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) { |
| 487 | MachineInstr *MI1 = *I1; |
| 488 | |
| 489 | for (auto I2 = std::next(x: I1); I2 != E; ) { |
| 490 | MachineInstr *MI2 = *I2; |
| 491 | |
| 492 | // Check any possible interference |
| 493 | auto interferes = [&](MachineBasicBlock::iterator From, |
| 494 | MachineBasicBlock::iterator To) -> bool { |
| 495 | |
| 496 | assert(MDT.dominates(&*To, &*From)); |
| 497 | |
| 498 | auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool { |
| 499 | const MachineBasicBlock *MBBFrom = From->getParent(); |
| 500 | const MachineBasicBlock *MBBTo = To->getParent(); |
| 501 | bool MayClobberFrom = isReachable(From: Clobber, To: &*From, CutOff: MBBTo, MDT); |
| 502 | bool MayClobberTo = isReachable(From: Clobber, To: &*To, CutOff: MBBTo, MDT); |
| 503 | if (!MayClobberFrom && !MayClobberTo) |
| 504 | return false; |
| 505 | if ((MayClobberFrom && !MayClobberTo) || |
| 506 | (!MayClobberFrom && MayClobberTo)) |
| 507 | return true; |
| 508 | // Both can clobber. This is not an interference only if both are |
| 509 | // dominated by Clobber and belong to the same block, or if Clobber |
| 510 | // properly dominates To; given that To >> From, it then dominates |
| 511 | // both and is located in a common dominator. |
| 512 | return !((MBBFrom == MBBTo && |
| 513 | MDT.dominates(A: Clobber, B: &*From) && |
| 514 | MDT.dominates(A: Clobber, B: &*To)) || |
| 515 | MDT.properlyDominates(A: Clobber->getParent(), B: MBBTo)); |
| 516 | }; |
| 517 | |
| 518 | return (llvm::any_of(Range&: Clobbers, P: interferes)) || |
| 519 | (llvm::any_of(Range&: Inits, P: [&](InitListMap::value_type &C) { |
| 520 | return C.first != Init.first && |
| 521 | llvm::any_of(Range&: C.second, P: interferes); |
| 522 | })); |
| 523 | }; |
| 524 | |
| 525 | if (MDT.dominates(A: MI1, B: MI2)) { |
| 526 | if (!interferes(MI2, MI1)) { |
| 527 | LLVM_DEBUG(dbgs() |
| 528 | << "Erasing from " |
| 529 | << printMBBReference(*MI2->getParent()) << " " << *MI2); |
| 530 | MergedInstrs.insert(Ptr: MI2); |
| 531 | Changed = true; |
| 532 | ++I2; |
| 533 | continue; |
| 534 | } |
| 535 | } else if (MDT.dominates(A: MI2, B: MI1)) { |
| 536 | if (!interferes(MI1, MI2)) { |
| 537 | LLVM_DEBUG(dbgs() |
| 538 | << "Erasing from " |
| 539 | << printMBBReference(*MI1->getParent()) << " " << *MI1); |
| 540 | MergedInstrs.insert(Ptr: MI1); |
| 541 | Changed = true; |
| 542 | ++I1; |
| 543 | break; |
| 544 | } |
| 545 | } else { |
| 546 | auto *MBB = MDT.findNearestCommonDominator(A: MI1->getParent(), |
| 547 | B: MI2->getParent()); |
| 548 | if (!MBB) { |
| 549 | ++I2; |
| 550 | continue; |
| 551 | } |
| 552 | |
| 553 | MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII); |
| 554 | if (!interferes(MI1, I) && !interferes(MI2, I)) { |
| 555 | LLVM_DEBUG(dbgs() |
| 556 | << "Erasing from " |
| 557 | << printMBBReference(*MI1->getParent()) << " " << *MI1 |
| 558 | << "and moving from " |
| 559 | << printMBBReference(*MI2->getParent()) << " to " |
| 560 | << printMBBReference(*I->getParent()) << " " << *MI2); |
| 561 | I->getParent()->splice(Where: I, Other: MI2->getParent(), From: MI2); |
| 562 | MergedInstrs.insert(Ptr: MI1); |
| 563 | Changed = true; |
| 564 | ++I1; |
| 565 | break; |
| 566 | } |
| 567 | } |
| 568 | ++I2; |
| 569 | } |
| 570 | ++I1; |
| 571 | } |
| 572 | } |
| 573 | |
| 574 | // Remove initializations that were merged into another. |
| 575 | for (auto &Init : Inits) { |
| 576 | auto &Defs = Init.second; |
| 577 | auto I = Defs.begin(); |
| 578 | while (I != Defs.end()) { |
| 579 | if (MergedInstrs.count(Ptr: *I)) { |
| 580 | (*I)->eraseFromParent(); |
| 581 | I = Defs.erase(position: I); |
| 582 | } else |
| 583 | ++I; |
| 584 | } |
| 585 | } |
| 586 | |
| 587 | // Try to schedule SGPR initializations as early as possible in the MBB. |
| 588 | for (auto &Init : Inits) { |
| 589 | auto &Defs = Init.second; |
| 590 | for (auto *MI : Defs) { |
| 591 | auto *MBB = MI->getParent(); |
| 592 | MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII); |
| 593 | MachineBasicBlock::reverse_iterator B(BoundaryMI); |
| 594 | // Check if B should actually be a boundary. If not, set the previous |
| 595 | // instruction as the boundary instead. |
| 596 | if (!TII->isBasicBlockPrologue(MI: *B)) |
| 597 | B++; |
| 598 | |
| 599 | auto R = std::next(x: MI->getReverseIterator()); |
| 600 | const unsigned Threshold = 50; |
| 601 | // Search until B or Threshold for a place to insert the initialization. |
| 602 | for (unsigned I = 0; R != B && I < Threshold; ++R, ++I) |
| 603 | if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) || |
| 604 | TII->isSchedulingBoundary(MI: *R, MBB, MF: *MBB->getParent())) |
| 605 | break; |
| 606 | |
| 607 | // Move to directly after R. |
| 608 | if (&*--R != MI) |
| 609 | MBB->splice(Where: *R, Other: MBB, From: MI); |
| 610 | } |
| 611 | } |
| 612 | |
| 613 | if (Changed) |
| 614 | MRI.clearKillFlags(Reg); |
| 615 | |
| 616 | return Changed; |
| 617 | } |
| 618 | |
| 619 | bool SIFixSGPRCopies::run(MachineFunction &MF) { |
| 620 | // Only need to run this in SelectionDAG path. |
| 621 | if (MF.getProperties().hasSelected()) |
| 622 | return false; |
| 623 | |
| 624 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| 625 | MRI = &MF.getRegInfo(); |
| 626 | TRI = ST.getRegisterInfo(); |
| 627 | TII = ST.getInstrInfo(); |
| 628 | |
| 629 | for (MachineBasicBlock &MBB : MF) { |
| 630 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
| 631 | ++I) { |
| 632 | MachineInstr &MI = *I; |
| 633 | |
| 634 | switch (MI.getOpcode()) { |
| 635 | default: |
| 636 | continue; |
| 637 | case AMDGPU::COPY: { |
| 638 | const TargetRegisterClass *SrcRC, *DstRC; |
| 639 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: MI, TRI: *TRI, MRI: *MRI); |
| 640 | |
| 641 | if (isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) { |
| 642 | // Since SGPR to VGPR copies affect the VGPR to SGPR copy |
| 643 | // score and, hence, the lowering decision, let's try to get rid of |
| 644 | // them as early as possible. |
| 645 | if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII)) |
| 646 | continue; |
| 647 | |
| 648 | // Collect those that were not changed so we can retry them after VGPR to |
| 649 | // SGPR copy lowering, when there will be more opportunities. |
| 650 | S2VCopies.push_back(Elt: &MI); |
| 651 | } |
| 652 | if (!isVGPRToSGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
| 653 | continue; |
| 654 | if (lowerSpecialCase(MI, I)) |
| 655 | continue; |
| 656 | |
| 657 | analyzeVGPRToSGPRCopy(MI: &MI); |
| 658 | |
| 659 | break; |
| 660 | } |
| 661 | case AMDGPU::WQM: |
| 662 | case AMDGPU::STRICT_WQM: |
| 663 | case AMDGPU::SOFT_WQM: |
| 664 | case AMDGPU::STRICT_WWM: |
| 665 | case AMDGPU::INSERT_SUBREG: |
| 666 | case AMDGPU::PHI: |
| 667 | case AMDGPU::REG_SEQUENCE: { |
| 668 | if (TRI->isSGPRClass(RC: TII->getOpRegClass(MI, OpNo: 0))) { |
| 669 | for (MachineOperand &MO : MI.operands()) { |
| 670 | if (!MO.isReg() || !MO.getReg().isVirtual()) |
| 671 | continue; |
| 672 | const TargetRegisterClass *SrcRC = MRI->getRegClass(Reg: MO.getReg()); |
| 673 | if (SrcRC == &AMDGPU::VReg_1RegClass) |
| 674 | continue; |
| 675 | |
| 676 | if (TRI->hasVectorRegisters(RC: SrcRC)) { |
| 677 | const TargetRegisterClass *DestRC = |
| 678 | TRI->getEquivalentSGPRClass(VRC: SrcRC); |
| 679 | Register NewDst = MRI->createVirtualRegister(RegClass: DestRC); |
| 680 | MachineBasicBlock *BlockToInsertCopy = |
| 681 | MI.isPHI() ? MI.getOperand(i: MO.getOperandNo() + 1).getMBB() |
| 682 | : &MBB; |
| 683 | MachineBasicBlock::iterator PointToInsertCopy = |
| 684 | MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I; |
| 685 | |
| 686 | const DebugLoc &DL = MI.getDebugLoc(); |
| 687 | if (!tryMoveVGPRConstToSGPR(MO, NewDst, BlockToInsertTo: BlockToInsertCopy, |
| 688 | PointToInsertTo: PointToInsertCopy, DL)) { |
| 689 | MachineInstr *NewCopy = |
| 690 | BuildMI(BB&: *BlockToInsertCopy, I: PointToInsertCopy, MIMD: DL, |
| 691 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: NewDst) |
| 692 | .addReg(RegNo: MO.getReg()); |
| 693 | MO.setReg(NewDst); |
| 694 | analyzeVGPRToSGPRCopy(MI: NewCopy); |
| 695 | PHISources.insert(V: NewCopy); |
| 696 | } |
| 697 | } |
| 698 | } |
| 699 | } |
| 700 | |
| 701 | if (MI.isPHI()) |
| 702 | PHINodes.push_back(Elt: &MI); |
| 703 | else if (MI.isRegSequence()) |
| 704 | RegSequences.push_back(Elt: &MI); |
| 705 | |
| 706 | break; |
| 707 | } |
| 708 | case AMDGPU::V_WRITELANE_B32: { |
| 709 | // Some architectures allow more than one constant bus access without |
| 710 | // SGPR restriction |
| 711 | if (ST.getConstantBusLimit(Opcode: MI.getOpcode()) != 1) |
| 712 | break; |
| 713 | |
| 714 | // Writelane is special in that it can use SGPR and M0 (which would |
| 715 | // normally count as using the constant bus twice - but in this case it |
| 716 | // is allowed since the lane selector doesn't count as a use of the |
| 717 | // constant bus). However, it is still required to abide by the 1 SGPR |
| 718 | // rule. Apply a fix here as we might have multiple SGPRs after |
| 719 | // legalizing VGPRs to SGPRs |
| 720 | int Src0Idx = |
| 721 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src0); |
| 722 | int Src1Idx = |
| 723 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src1); |
| 724 | MachineOperand &Src0 = MI.getOperand(i: Src0Idx); |
| 725 | MachineOperand &Src1 = MI.getOperand(i: Src1Idx); |
| 726 | |
| 727 | // Check to see if the instruction violates the 1 SGPR rule |
| 728 | if ((Src0.isReg() && TRI->isSGPRReg(MRI: *MRI, Reg: Src0.getReg()) && |
| 729 | Src0.getReg() != AMDGPU::M0) && |
| 730 | (Src1.isReg() && TRI->isSGPRReg(MRI: *MRI, Reg: Src1.getReg()) && |
| 731 | Src1.getReg() != AMDGPU::M0)) { |
| 732 | |
| 733 | // Check for trivially easy constant propagation into one of the operands. |
| 734 | // If this is the case, then perform the propagation now to resolve the SGPR |
| 735 | // issue. If we don't do that here, we will always insert a mov to m0 |
| 736 | // that can't be resolved by the later operand-folding pass. |
| 737 | bool Resolved = false; |
| 738 | for (MachineOperand *MO : {&Src0, &Src1}) { |
| 739 | if (MO->getReg().isVirtual()) { |
| 740 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MO->getReg()); |
| 741 | if (DefMI && TII->isFoldableCopy(MI: *DefMI)) { |
| 742 | const MachineOperand &Def = DefMI->getOperand(i: 0); |
| 743 | if (Def.isReg() && |
| 744 | MO->getReg() == Def.getReg() && |
| 745 | MO->getSubReg() == Def.getSubReg()) { |
| 746 | const MachineOperand &Copied = DefMI->getOperand(i: 1); |
| 747 | if (Copied.isImm() && |
| 748 | TII->isInlineConstant(Imm: APInt(64, Copied.getImm(), true))) { |
| 749 | MO->ChangeToImmediate(ImmVal: Copied.getImm()); |
| 750 | Resolved = true; |
| 751 | break; |
| 752 | } |
| 753 | } |
| 754 | } |
| 755 | } |
| 756 | } |
| 757 | |
| 758 | if (!Resolved) { |
| 759 | // We haven't managed to resolve this by replacing an SGPR with an |
| 760 | // immediate, so move src1 into M0 instead. |
| 761 | BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), |
| 762 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0) |
| 763 | .add(MO: Src1); |
| 764 | Src1.ChangeToRegister(Reg: AMDGPU::M0, isDef: false); |
| 765 | } |
| 766 | } |
| 767 | break; |
| 768 | } |
| 769 | } |
| 770 | } |
| 771 | } |
| 772 | |
| 773 | lowerVGPR2SGPRCopies(MF); |
| 774 | // Postprocessing |
| 775 | fixSCCCopies(MF); |
| 776 | for (auto *MI : S2VCopies) { |
| 777 | // Check if it is still valid |
| 778 | if (MI->isCopy()) { |
| 779 | const TargetRegisterClass *SrcRC, *DstRC; |
| 780 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: *MI, TRI: *TRI, MRI: *MRI); |
| 781 | if (isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
| 782 | tryChangeVGPRtoSGPRinCopy(MI&: *MI, TRI, TII); |
| 783 | } |
| 784 | } |
| 785 | for (auto *MI : RegSequences) { |
| 786 | // Check if it is still valid |
| 787 | if (MI->isRegSequence()) |
| 788 | foldVGPRCopyIntoRegSequence(MI&: *MI, TRI, TII, MRI&: *MRI); |
| 789 | } |
| 790 | for (auto *MI : PHINodes) { |
| 791 | processPHINode(MI&: *MI); |
| 792 | } |
| 793 | if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge) |
| 794 | hoistAndMergeSGPRInits(Reg: AMDGPU::M0, MRI: *MRI, TRI, MDT&: *MDT, TII); |
| 795 | |
| 796 | SiblingPenalty.clear(); |
| 797 | V2SCopies.clear(); |
| 798 | SCCCopies.clear(); |
| 799 | RegSequences.clear(); |
| 800 | PHINodes.clear(); |
| 801 | S2VCopies.clear(); |
| 802 | PHISources.clear(); |
| 803 | |
| 804 | return true; |
| 805 | } |
| 806 | |
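// Constrain the register class of a PHI result based on its transitive uses:
// move the PHI to an AGPR class if every use is an AGPR (or a copy into one),
// and legalize its operands if the result is a vector register. The new
// register class is then propagated to PHI operands that are themselves PHIs.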
| 807 | void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { |
| 808 | bool AllAGPRUses = true; |
| 809 | SetVector<const MachineInstr *> worklist; |
| 810 | SmallSet<const MachineInstr *, 4> Visited; |
| 811 | SetVector<MachineInstr *> PHIOperands; |
| 812 | worklist.insert(X: &MI); |
| 813 | Visited.insert(Ptr: &MI); |
| 814 | // HACK to make MIR tests with no uses happy |
| 815 | bool HasUses = false; |
| 816 | while (!worklist.empty()) { |
| 817 | const MachineInstr *Instr = worklist.pop_back_val(); |
| 818 | Register Reg = Instr->getOperand(i: 0).getReg(); |
| 819 | for (const auto &Use : MRI->use_operands(Reg)) { |
| 820 | HasUses = true; |
| 821 | const MachineInstr *UseMI = Use.getParent(); |
| 822 | AllAGPRUses &= (UseMI->isCopy() && |
| 823 | TRI->isAGPR(MRI: *MRI, Reg: UseMI->getOperand(i: 0).getReg())) || |
| 824 | TRI->isAGPR(MRI: *MRI, Reg: Use.getReg()); |
| 825 | if (UseMI->isCopy() || UseMI->isRegSequence()) { |
| 826 | if (Visited.insert(Ptr: UseMI).second) |
| 827 | worklist.insert(X: UseMI); |
| 828 | |
| 829 | continue; |
| 830 | } |
| 831 | } |
| 832 | } |
| 833 | |
| 834 | Register PHIRes = MI.getOperand(i: 0).getReg(); |
| 835 | const TargetRegisterClass *RC0 = MRI->getRegClass(Reg: PHIRes); |
| 836 | if (HasUses && AllAGPRUses && !TRI->isAGPRClass(RC: RC0)) { |
| 837 | LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI); |
| 838 | MRI->setRegClass(Reg: PHIRes, RC: TRI->getEquivalentAGPRClass(SRC: RC0)); |
| 839 | for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { |
| 840 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MI.getOperand(i: I).getReg()); |
| 841 | if (DefMI && DefMI->isPHI()) |
| 842 | PHIOperands.insert(X: DefMI); |
| 843 | } |
| 844 | } |
| 845 | |
| 846 | if (TRI->isVectorRegister(MRI: *MRI, Reg: PHIRes) || |
| 847 | RC0 == &AMDGPU::VReg_1RegClass) { |
| 848 | LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); |
| 849 | TII->legalizeOperands(MI, MDT); |
| 850 | } |
| 851 | |
| 852 | // Propagate register class back to PHI operands which are PHI themselves. |
| 853 | while (!PHIOperands.empty()) { |
| 854 | processPHINode(MI&: *PHIOperands.pop_back_val()); |
| 855 | } |
| 856 | } |
| 857 | |
| 858 | bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR( |
| 859 | MachineOperand &MaybeVGPRConstMO, Register DstReg, |
| 860 | MachineBasicBlock *BlockToInsertTo, |
| 861 | MachineBasicBlock::iterator PointToInsertTo, const DebugLoc &DL) { |
| 862 | |
| 863 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MaybeVGPRConstMO.getReg()); |
| 864 | if (!DefMI || !DefMI->isMoveImmediate()) |
| 865 | return false; |
| 866 | |
| 867 | MachineOperand *SrcConst = TII->getNamedOperand(MI&: *DefMI, OperandName: AMDGPU::OpName::src0); |
| 868 | if (SrcConst->isReg()) |
| 869 | return false; |
| 870 | |
| 871 | const TargetRegisterClass *SrcRC = |
| 872 | MRI->getRegClass(Reg: MaybeVGPRConstMO.getReg()); |
| 873 | unsigned MoveSize = TRI->getRegSizeInBits(RC: *SrcRC); |
| 874 | unsigned MoveOp = MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; |
| 875 | BuildMI(BB&: *BlockToInsertTo, I: PointToInsertTo, MIMD: DL, MCID: TII->get(Opcode: MoveOp), DestReg: DstReg) |
| 876 | .add(MO: *SrcConst); |
| 877 | if (MRI->hasOneUse(RegNo: MaybeVGPRConstMO.getReg())) |
| 878 | DefMI->eraseFromParent(); |
| 879 | MaybeVGPRConstMO.setReg(DstReg); |
| 880 | return true; |
| 881 | } |
| 882 | |
| 883 | bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, |
| 884 | MachineBasicBlock::iterator &I) { |
| 885 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 886 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 887 | if (!DstReg.isVirtual()) { |
| 888 | // If the destination register is a physical register there isn't |
| 889 | // really much we can do to fix this. |
| 890 | // Some special instructions use M0 as an input. Some even only use |
| 891 | // the first lane. Insert a readfirstlane and hope for the best. |
| 892 | if (DstReg == AMDGPU::M0 && |
| 893 | TRI->hasVectorRegisters(RC: MRI->getRegClass(Reg: SrcReg))) { |
| 894 | Register TmpReg = |
| 895 | MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass); |
| 896 | BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), |
| 897 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: TmpReg) |
| 898 | .add(MO: MI.getOperand(i: 1)); |
| 899 | MI.getOperand(i: 1).setReg(TmpReg); |
| 900 | } else if (tryMoveVGPRConstToSGPR(MaybeVGPRConstMO&: MI.getOperand(i: 1), DstReg, BlockToInsertTo: MI.getParent(), |
| 901 | PointToInsertTo: MI, DL: MI.getDebugLoc())) { |
| 902 | I = std::next(x: I); |
| 903 | MI.eraseFromParent(); |
| 904 | } |
| 905 | return true; |
| 906 | } |
| 907 | if (!SrcReg.isVirtual() || TRI->isAGPR(MRI: *MRI, Reg: SrcReg)) { |
| 908 | SIInstrWorklist worklist; |
| 909 | worklist.insert(MI: &MI); |
| 910 | TII->moveToVALU(Worklist&: worklist, MDT); |
| 911 | return true; |
| 912 | } |
| 913 | |
| 914 | unsigned SMovOp; |
| 915 | int64_t Imm; |
| 916 | // If we are just copying an immediate, we can replace the copy with |
| 917 | // s_mov_b32. |
| 918 | if (isSafeToFoldImmIntoCopy(Copy: &MI, MoveImm: MRI->getVRegDef(Reg: SrcReg), TII, SMovOp, Imm)) { |
| 919 | MI.getOperand(i: 1).ChangeToImmediate(ImmVal: Imm); |
| 920 | MI.addImplicitDefUseOperands(MF&: *MI.getParent()->getParent()); |
| 921 | MI.setDesc(TII->get(Opcode: SMovOp)); |
| 922 | return true; |
| 923 | } |
| 924 | return false; |
| 925 | } |
| 926 | |
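// Walk the SSA graph reachable from a VGPR to SGPR copy and record it as a
// V2SCopyInfo: the chain of SALU instructions the copy feeds (SChain), the
// number of SGPR to VGPR copies moving results back to the VALU (NumSVCopies),
// and, via SiblingPenalty, which other V2S copies share parts of that chain.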
| 927 | void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { |
| 928 | if (PHISources.contains(V: MI)) |
| 929 | return; |
| 930 | Register DstReg = MI->getOperand(i: 0).getReg(); |
| 931 | const TargetRegisterClass *DstRC = MRI->getRegClass(Reg: DstReg); |
| 932 | |
| 933 | V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI, |
| 934 | TRI->getRegSizeInBits(RC: *DstRC)); |
| 935 | SmallVector<MachineInstr *, 8> AnalysisWorklist; |
| 936 | // Needed because the SSA is not a tree but a graph and may have |
| 937 | // forks and joins. We should not walk the same path twice. |
| 938 | DenseSet<MachineInstr *> Visited; |
| 939 | AnalysisWorklist.push_back(Elt: Info.Copy); |
| 940 | while (!AnalysisWorklist.empty()) { |
| 941 | |
| 942 | MachineInstr *Inst = AnalysisWorklist.pop_back_val(); |
| 943 | |
| 944 | if (!Visited.insert(V: Inst).second) |
| 945 | continue; |
| 946 | |
| 947 | // Copies and REG_SEQUENCEs do not contribute to the final assembly, |
| 948 | // so skip them, but take care of the SGPR to VGPR copy bookkeeping. |
| 949 | if (Inst->isCopy() || Inst->isRegSequence()) { |
| 950 | if (TRI->isVGPR(MRI: *MRI, Reg: Inst->getOperand(i: 0).getReg())) { |
| 951 | if (!Inst->isCopy() || |
| 952 | !tryChangeVGPRtoSGPRinCopy(MI&: *Inst, TRI, TII)) { |
| 953 | Info.NumSVCopies++; |
| 954 | continue; |
| 955 | } |
| 956 | } |
| 957 | } |
| 958 | |
| 959 | SiblingPenalty[Inst].insert(X: Info.ID); |
| 960 | |
| 961 | SmallVector<MachineInstr *, 4> Users; |
| 962 | if ((TII->isSALU(MI: *Inst) && Inst->isCompare()) || |
| 963 | (Inst->isCopy() && Inst->getOperand(i: 0).getReg() == AMDGPU::SCC)) { |
| 964 | auto I = Inst->getIterator(); |
| 965 | auto E = Inst->getParent()->end(); |
| 966 | while (++I != E && |
| 967 | !I->findRegisterDefOperand(Reg: AMDGPU::SCC, /*TRI=*/nullptr)) { |
| 968 | if (I->readsRegister(Reg: AMDGPU::SCC, /*TRI=*/nullptr)) |
| 969 | Users.push_back(Elt: &*I); |
| 970 | } |
| 971 | } else if (Inst->getNumExplicitDefs() != 0) { |
| 972 | Register Reg = Inst->getOperand(i: 0).getReg(); |
| 973 | if (Reg.isVirtual() && TRI->isSGPRReg(MRI: *MRI, Reg) && !TII->isVALU(MI: *Inst)) { |
| 974 | for (auto &U : MRI->use_instructions(Reg)) |
| 975 | Users.push_back(Elt: &U); |
| 976 | } |
| 977 | } |
| 978 | for (auto *U : Users) { |
| 979 | if (TII->isSALU(MI: *U)) |
| 980 | Info.SChain.insert(X: U); |
| 981 | AnalysisWorklist.push_back(Elt: U); |
| 982 | } |
| 983 | } |
| 984 | V2SCopies[Info.ID] = Info; |
| 985 | } |
| 986 | |
| 987 | // The main function that computes the VGPR to SGPR copy score and determines |
| 988 | // how the copy is further lowered: v_readfirstlane_b32 or moveToVALU. |
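// For example (with illustrative numbers): a copy whose SChain contains six
// SALU instructions, with one SGPR to VGPR copy feeding results back to the
// VALU, a sibling penalty of one and a 32-bit source (one
// v_readfirstlane_b32) scores 6 - (1 + 1 + 1) = 3 and is kept scalar;
// anything scoring below 3 is converted to VALU instead.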
| 989 | bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) { |
| 990 | if (Info->SChain.empty()) { |
| 991 | Info->Score = 0; |
| 992 | return true; |
| 993 | } |
| 994 | Info->Siblings = SiblingPenalty[*llvm::max_element( |
| 995 | Range&: Info->SChain, C: [&](MachineInstr *A, MachineInstr *B) -> bool { |
| 996 | return SiblingPenalty[A].size() < SiblingPenalty[B].size(); |
| 997 | })]; |
| 998 | Info->Siblings.remove_if(P: [&](unsigned ID) { return ID == Info->ID; }); |
| 999 | // The loop below computes the number of other VGPR to SGPR V2SCopies |
| 1000 | // which contribute to the current copy's SALU chain. We assume that all the |
| 1001 | // V2SCopies with the same source virtual register will be squashed into one |
| 1002 | // by regalloc. We also take care of the V2SCopies of the different subregs |
| 1003 | // of the same register. |
| 1004 | SmallSet<std::pair<Register, unsigned>, 4> SrcRegs; |
| 1005 | for (auto J : Info->Siblings) { |
| 1006 | auto *InfoIt = V2SCopies.find(Key: J); |
| 1007 | if (InfoIt != V2SCopies.end()) { |
| 1008 | MachineInstr *SiblingCopy = InfoIt->second.Copy; |
| 1009 | if (SiblingCopy->isImplicitDef()) |
| 1010 | // the COPY has already been MoveToVALUed |
| 1011 | continue; |
| 1012 | |
| 1013 | SrcRegs.insert(V: std::pair(SiblingCopy->getOperand(i: 1).getReg(), |
| 1014 | SiblingCopy->getOperand(i: 1).getSubReg())); |
| 1015 | } |
| 1016 | } |
| 1017 | Info->SiblingPenalty = SrcRegs.size(); |
| 1018 | |
| 1019 | unsigned Penalty = |
| 1020 | Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes; |
| 1021 | unsigned Profit = Info->SChain.size(); |
| 1022 | Info->Score = Penalty > Profit ? 0 : Profit - Penalty; |
| 1023 | Info->NeedToBeConvertedToVALU = Info->Score < 3; |
| 1024 | return Info->NeedToBeConvertedToVALU; |
| 1025 | } |
| 1026 | |
| 1027 | void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { |
| 1028 | |
| 1029 | SmallVector<unsigned, 8> LoweringWorklist; |
| 1030 | for (auto &C : V2SCopies) { |
| 1031 | if (needToBeConvertedToVALU(Info: &C.second)) |
| 1032 | LoweringWorklist.push_back(Elt: C.second.ID); |
| 1033 | } |
| 1034 | |
| 1035 | // Store all the V2S copy instructions that need to be moved to VALU |
| 1036 | // in the Copies worklist. |
| 1037 | SIInstrWorklist Copies; |
| 1038 | |
| 1039 | while (!LoweringWorklist.empty()) { |
| 1040 | unsigned CurID = LoweringWorklist.pop_back_val(); |
| 1041 | auto *CurInfoIt = V2SCopies.find(Key: CurID); |
| 1042 | if (CurInfoIt != V2SCopies.end()) { |
| 1043 | V2SCopyInfo C = CurInfoIt->second; |
| 1044 | LLVM_DEBUG(dbgs() << "Processing ...\n" ; C.dump()); |
| 1045 | for (auto S : C.Siblings) { |
| 1046 | auto *SibInfoIt = V2SCopies.find(Key: S); |
| 1047 | if (SibInfoIt != V2SCopies.end()) { |
| 1048 | V2SCopyInfo &SI = SibInfoIt->second; |
| 1049 | LLVM_DEBUG(dbgs() << "Sibling:\n" ; SI.dump()); |
| 1050 | if (!SI.NeedToBeConvertedToVALU) { |
| 1051 | SI.SChain.set_subtract(C.SChain); |
| 1052 | if (needToBeConvertedToVALU(Info: &SI)) |
| 1053 | LoweringWorklist.push_back(Elt: SI.ID); |
| 1054 | } |
| 1055 | SI.Siblings.remove_if(P: [&](unsigned ID) { return ID == C.ID; }); |
| 1056 | } |
| 1057 | } |
| 1058 | LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy |
| 1059 | << " is being turned to VALU\n" ); |
| 1060 | // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if |
| 1061 | // instead. |
| 1062 | V2SCopies.erase(Key: C.ID); |
| 1063 | Copies.insert(MI: C.Copy); |
| 1064 | } |
| 1065 | } |
| 1066 | |
| 1067 | TII->moveToVALU(Worklist&: Copies, MDT); |
| 1068 | Copies.clear(); |
| 1069 | |
| 1070 | // Now do actual lowering |
| 1071 | for (auto C : V2SCopies) { |
| 1072 | MachineInstr *MI = C.second.Copy; |
| 1073 | MachineBasicBlock *MBB = MI->getParent(); |
| 1074 | // We decided to turn this V2S copy into a v_readfirstlane_b32; |
| 1075 | // remove it from V2SCopies and from all of its siblings. |
| 1076 | LLVM_DEBUG(dbgs() << "V2S copy " << *MI |
| 1077 | << " is being turned to v_readfirstlane_b32" |
| 1078 | << " Score: " << C.second.Score << "\n"); |
| 1079 | Register DstReg = MI->getOperand(i: 0).getReg(); |
| 1080 | MRI->constrainRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32_XM0RegClass); |
| 1081 | |
| 1082 | Register SrcReg = MI->getOperand(i: 1).getReg(); |
| 1083 | unsigned SubReg = MI->getOperand(i: 1).getSubReg(); |
| 1084 | const TargetRegisterClass *SrcRC = |
| 1085 | TRI->getRegClassForOperandReg(MRI: *MRI, MO: MI->getOperand(i: 1)); |
| 1086 | size_t SrcSize = TRI->getRegSizeInBits(RC: *SrcRC); |
| 1087 | if (SrcSize == 16) { |
| 1088 | assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() && |
| 1089 | "We do not expect to see 16-bit copies from VGPR to SGPR unless " |
| 1090 | "we have 16-bit VGPRs" ); |
| 1091 | assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass || |
| 1092 | MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass || |
| 1093 | MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass); |
| 1094 | // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits |
| 1095 | MRI->setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32_XM0RegClass); |
| 1096 | Register VReg32 = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 1097 | const DebugLoc &DL = MI->getDebugLoc(); |
| 1098 | Register Undef = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass); |
| 1099 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef); |
| 1100 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: VReg32) |
| 1101 | .addReg(RegNo: SrcReg, flags: 0, SubReg) |
| 1102 | .addImm(Val: AMDGPU::lo16) |
| 1103 | .addReg(RegNo: Undef) |
| 1104 | .addImm(Val: AMDGPU::hi16); |
| 1105 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg) |
| 1106 | .addReg(RegNo: VReg32); |
| 1107 | } else if (SrcSize == 32) { |
| 1108 | auto MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), |
| 1109 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg); |
| 1110 | MIB.addReg(RegNo: SrcReg, flags: 0, SubReg); |
| 1111 | } else { |
| 1112 | auto Result = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), |
| 1113 | MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg); |
| 1114 | int N = TRI->getRegSizeInBits(RC: *SrcRC) / 32; |
| 1115 | for (int i = 0; i < N; i++) { |
| 1116 | Register PartialSrc = TII->buildExtractSubReg( |
| 1117 | MI: Result, MRI&: *MRI, SuperReg: MI->getOperand(i: 1), SuperRC: SrcRC, |
| 1118 | SubIdx: TRI->getSubRegFromChannel(Channel: i), SubRC: &AMDGPU::VGPR_32RegClass); |
| 1119 | Register PartialDst = |
| 1120 | MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass); |
| 1121 | BuildMI(BB&: *MBB, I&: *Result, MIMD: Result->getDebugLoc(), |
| 1122 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: PartialDst) |
| 1123 | .addReg(RegNo: PartialSrc); |
| 1124 | Result.addReg(RegNo: PartialDst).addImm(Val: TRI->getSubRegFromChannel(Channel: i)); |
| 1125 | } |
| 1126 | } |
| 1127 | MI->eraseFromParent(); |
| 1128 | } |
| 1129 | } |
| 1130 | |
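// Rewrite copies involving SCC: a copy from SCC is materialized as an
// S_CSELECT of -1/0 into a wave-mask register, and a copy to SCC becomes an
// S_AND of the source with EXEC, whose SCC def supplies the value.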
| 1131 | void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { |
| 1132 | bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32(); |
| 1133 | for (MachineBasicBlock &MBB : MF) { |
| 1134 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
| 1135 | ++I) { |
| 1136 | MachineInstr &MI = *I; |
| 1137 | // May already have been lowered. |
| 1138 | if (!MI.isCopy()) |
| 1139 | continue; |
| 1140 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 1141 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 1142 | if (SrcReg == AMDGPU::SCC) { |
| 1143 | Register SCCCopy = |
| 1144 | MRI->createVirtualRegister(RegClass: TRI->getWaveMaskRegClass()); |
| 1145 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: MachineBasicBlock::iterator(MI)), |
| 1146 | MIMD: MI.getDebugLoc(), |
| 1147 | MCID: TII->get(Opcode: IsWave32 ? AMDGPU::S_CSELECT_B32 |
| 1148 | : AMDGPU::S_CSELECT_B64), |
| 1149 | DestReg: SCCCopy) |
| 1150 | .addImm(Val: -1) |
| 1151 | .addImm(Val: 0); |
| 1152 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: I), MIMD: I->getDebugLoc(), |
| 1153 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: DstReg) |
| 1154 | .addReg(RegNo: SCCCopy); |
| 1155 | MI.eraseFromParent(); |
| 1156 | continue; |
| 1157 | } |
| 1158 | if (DstReg == AMDGPU::SCC) { |
| 1159 | unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; |
| 1160 | Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
| 1161 | Register Tmp = MRI->createVirtualRegister(RegClass: TRI->getBoolRC()); |
| 1162 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: MachineBasicBlock::iterator(MI)), |
| 1163 | MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode)) |
| 1164 | .addReg(RegNo: Tmp, flags: getDefRegState(B: true)) |
| 1165 | .addReg(RegNo: SrcReg) |
| 1166 | .addReg(RegNo: Exec); |
| 1167 | MI.eraseFromParent(); |
| 1168 | } |
| 1169 | } |
| 1170 | } |
| 1171 | } |
| 1172 | |
| 1173 | PreservedAnalyses |
| 1174 | SIFixSGPRCopiesPass::run(MachineFunction &MF, |
| 1175 | MachineFunctionAnalysisManager &MFAM) { |
| 1176 | MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(IR&: MF); |
| 1177 | SIFixSGPRCopies Impl(&MDT); |
| 1178 | bool Changed = Impl.run(MF); |
| 1179 | if (!Changed) |
| 1180 | return PreservedAnalyses::all(); |
| 1181 | |
| 1182 | // TODO: We could detect whether the CFG changed. |
| 1183 | auto PA = getMachineFunctionPassPreservedAnalyses(); |
| 1184 | return PA; |
| 1185 | } |
| 1186 | |