| 1 | //===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file |
| 10 | /// Copies from VGPR to SGPR registers are illegal and the register coalescer |
| 11 | /// will sometimes generate these illegal copies in situations like this: |
| 12 | /// |
| 13 | /// Register Class <vsrc> is the union of <vgpr> and <sgpr> |
| 14 | /// |
| 15 | /// BB0: |
| 16 | /// %0 <sgpr> = SCALAR_INST |
| 17 | /// %1 <vsrc> = COPY %0 <sgpr> |
| 18 | /// ... |
| 19 | /// BRANCH %cond BB1, BB2 |
| 20 | /// BB1: |
| 21 | /// %2 <vgpr> = VECTOR_INST |
| 22 | /// %3 <vsrc> = COPY %2 <vgpr> |
| 23 | /// BB2: |
| 24 | /// %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1> |
| 25 | /// %5 <vgpr> = VECTOR_INST %4 <vsrc> |
| 26 | /// |
| 27 | /// |
| 28 | /// The coalescer will begin at BB0 and eliminate its copy; the resulting |
| 29 | /// code will look like this: |
| 30 | /// |
| 31 | /// BB0: |
| 32 | /// %0 <sgpr> = SCALAR_INST |
| 33 | /// ... |
| 34 | /// BRANCH %cond BB1, BB2 |
| 35 | /// BB1: |
| 36 | /// %2 <vgpr> = VECTOR_INST |
| 37 | /// %3 <vsrc> = COPY %2 <vgpr> |
| 38 | /// BB2: |
| 39 | /// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1> |
| 40 | /// %5 <vgpr> = VECTOR_INST %4 <sgpr> |
| 41 | /// |
| 42 | /// Now that the result of the PHI instruction is an SGPR, the register |
| 43 | /// allocator is forced to constrain the register class of %3 to |
| 44 | /// <sgpr>, so we end up with final code like this: |
| 45 | /// |
| 46 | /// BB0: |
| 47 | /// %0 <sgpr> = SCALAR_INST |
| 48 | /// ... |
| 49 | /// BRANCH %cond BB1, BB2 |
| 50 | /// BB1: |
| 51 | /// %2 <vgpr> = VECTOR_INST |
| 52 | /// %3 <sgpr> = COPY %2 <vgpr> |
| 53 | /// BB2: |
| 54 | /// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1> |
| 55 | /// %5 <vgpr> = VECTOR_INST %4 <sgpr> |
| 56 | /// |
| 57 | /// Now this code contains an illegal copy from a VGPR to an SGPR. |
| 58 | /// |
| 59 | /// In order to avoid this problem, this pass searches for PHI instructions |
| 60 | /// which define a <vsrc> register and constrains their definition class to |
| 61 | /// <vgpr> if a user of the PHI's definition register is a vector instruction. |
| 62 | /// If the PHI's definition class is constrained to <vgpr> then the coalescer |
| 63 | /// will be unable to perform the COPY removal from the above example which |
| 64 | /// ultimately led to the creation of an illegal COPY. |
| 65 | //===----------------------------------------------------------------------===// |
| 66 | |
| 67 | #include "SIFixSGPRCopies.h" |
| 68 | #include "AMDGPU.h" |
| 69 | #include "AMDGPULaneMaskUtils.h" |
| 70 | #include "GCNSubtarget.h" |
| 71 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| 72 | #include "llvm/CodeGen/MachineDominators.h" |
| 73 | #include "llvm/InitializePasses.h" |
| 74 | #include "llvm/Target/TargetMachine.h" |
| 75 | |
| 76 | using namespace llvm; |
| 77 | |
| 78 | #define DEBUG_TYPE "si-fix-sgpr-copies" |
| 79 | |
| 80 | static cl::opt<bool> EnableM0Merge( |
| 81 | "amdgpu-enable-merge-m0" , |
| 82 | cl::desc("Merge and hoist M0 initializations" ), |
| 83 | cl::init(Val: true)); |
| 84 | |
| 85 | namespace { |
| 86 | |
| 87 | class V2SCopyInfo { |
| 88 | public: |
| 89 | // VGPR to SGPR copy being processed |
| 90 | MachineInstr *Copy; |
| 91 | // All SALU instructions reachable from this copy in SSA graph |
| 92 | SetVector<MachineInstr *> SChain; |
| 93 | // Number of SGPR to VGPR copies that are used to put the SALU computation |
| 94 | // results back to VALU. |
| 95 | unsigned NumSVCopies = 0; |
| 96 | |
| 97 | unsigned Score = 0; |
| 98 | // Actual count of v_readfirstlane_b32 instructions |
| 99 | // which need to be inserted to keep SChain SALU |
| 100 | unsigned NumReadfirstlanes = 0; |
| 101 | // Current score state. Cached to speed up selecting V2SCopyInfos for processing. |
| 102 | bool NeedToBeConvertedToVALU = false; |
| 103 | // Unique ID. Used as a map key to keep a stable processing order. |
| 104 | unsigned ID; |
| 105 | |
| 106 | // Count of other VGPR to SGPR copies that contribute to the |
| 107 | // current copy's SChain |
| 108 | unsigned SiblingPenalty = 0; |
| 109 | SetVector<unsigned> Siblings; |
| 110 | V2SCopyInfo() : Copy(nullptr), ID(0){}; |
| 111 | V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) |
| 112 | : Copy(C), NumReadfirstlanes(Width / 32), ID(Id){}; |
| 113 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 114 | void dump() const { |
| 115 | dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() |
| 116 | << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty |
| 117 | << "\nScore: " << Score << "\n" ; |
| 118 | } |
| 119 | #endif |
| 120 | }; |
| 121 | |
| 122 | class SIFixSGPRCopies { |
| 123 | MachineDominatorTree *MDT; |
| 124 | SmallVector<MachineInstr*, 4> SCCCopies; |
| 125 | SmallVector<MachineInstr*, 4> RegSequences; |
| 126 | SmallVector<MachineInstr*, 4> PHINodes; |
| 127 | SmallVector<MachineInstr*, 4> S2VCopies; |
| 128 | unsigned NextVGPRToSGPRCopyID = 0; |
| 129 | MapVector<unsigned, V2SCopyInfo> V2SCopies; |
| 130 | DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty; |
| 131 | DenseSet<MachineInstr *> PHISources; |
| 132 | |
| 133 | public: |
| 134 | MachineRegisterInfo *MRI; |
| 135 | const SIRegisterInfo *TRI; |
| 136 | const SIInstrInfo *TII; |
| 137 | |
| 138 | SIFixSGPRCopies(MachineDominatorTree *MDT) : MDT(MDT) {} |
| 139 | |
| 140 | bool run(MachineFunction &MF); |
| 141 | void fixSCCCopies(MachineFunction &MF); |
| 142 | void prepareRegSequenceAndPHIs(MachineFunction &MF); |
| 143 | unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; } |
| 144 | bool needToBeConvertedToVALU(V2SCopyInfo *I); |
| 145 | void analyzeVGPRToSGPRCopy(MachineInstr *MI); |
| 146 | void lowerVGPR2SGPRCopies(MachineFunction &MF); |
| 147 | // Handles copies whose source register is: |
| 148 | // 1. A physical register |
| 149 | // 2. An AGPR |
| 150 | // 3. Defined by an instruction that merely moves an immediate |
| 151 | bool lowerSpecialCase(MachineInstr &MI, MachineBasicBlock::iterator &I); |
| 152 | |
| 153 | void processPHINode(MachineInstr &MI); |
| 154 | |
| 155 | // Check if MO is an immediate materialized into a VGPR, and if so replace it |
| 156 | // with an SGPR immediate. The VGPR immediate is also deleted if it does not |
| 157 | // have any other uses. |
| 158 | bool tryMoveVGPRConstToSGPR(MachineOperand &MO, Register NewDst, |
| 159 | MachineBasicBlock *BlockToInsertTo, |
| 160 | MachineBasicBlock::iterator PointToInsertTo, |
| 161 | const DebugLoc &DL); |
| 162 | }; |
| 163 | |
| 164 | class SIFixSGPRCopiesLegacy : public MachineFunctionPass { |
| 165 | public: |
| 166 | static char ID; |
| 167 | |
| 168 | SIFixSGPRCopiesLegacy() : MachineFunctionPass(ID) {} |
| 169 | |
| 170 | bool runOnMachineFunction(MachineFunction &MF) override { |
| 171 | MachineDominatorTree *MDT = |
| 172 | &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); |
| 173 | SIFixSGPRCopies Impl(MDT); |
| 174 | return Impl.run(MF); |
| 175 | } |
| 176 | |
| 177 | StringRef getPassName() const override { return "SI Fix SGPR copies" ; } |
| 178 | |
| 179 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
| 180 | AU.addRequired<MachineDominatorTreeWrapperPass>(); |
| 181 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
| 182 | AU.setPreservesCFG(); |
| 183 | MachineFunctionPass::getAnalysisUsage(AU); |
| 184 | } |
| 185 | }; |
| 186 | |
| 187 | } // end anonymous namespace |
| 188 | |
| 189 | INITIALIZE_PASS_BEGIN(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies" , |
| 190 | false, false) |
| 191 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) |
| 192 | INITIALIZE_PASS_END(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies" , |
| 193 | false, false) |
| 194 | |
| 195 | char SIFixSGPRCopiesLegacy::ID = 0; |
| 196 | |
| 197 | char &llvm::SIFixSGPRCopiesLegacyID = SIFixSGPRCopiesLegacy::ID; |
| 198 | |
| 199 | FunctionPass *llvm::createSIFixSGPRCopiesLegacyPass() { |
| 200 | return new SIFixSGPRCopiesLegacy(); |
| 201 | } |
| 202 | |
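| | // Return the register classes of Copy's source and destination operands. |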
| 203 | static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> |
| 204 | getCopyRegClasses(const MachineInstr &Copy, |
| 205 | const SIRegisterInfo &TRI, |
| 206 | const MachineRegisterInfo &MRI) { |
| 207 | Register DstReg = Copy.getOperand(i: 0).getReg(); |
| 208 | Register SrcReg = Copy.getOperand(i: 1).getReg(); |
| 209 | |
| 210 | const TargetRegisterClass *SrcRC = SrcReg.isVirtual() |
| 211 | ? MRI.getRegClass(Reg: SrcReg) |
| 212 | : TRI.getPhysRegBaseClass(Reg: SrcReg); |
| 213 | |
| 214 | // We don't really care about the subregister here. |
| 215 | // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); |
| 216 | |
| 217 | const TargetRegisterClass *DstRC = DstReg.isVirtual() |
| 218 | ? MRI.getRegClass(Reg: DstReg) |
| 219 | : TRI.getPhysRegBaseClass(Reg: DstReg); |
| 220 | |
| 221 | return std::pair(SrcRC, DstRC); |
| 222 | } |
| 223 | |
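| | // Return true if SrcRC/DstRC describe a copy from a vector register class |
| | // to an SGPR class. VReg_1 sources are excluded. |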
| 224 | static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, |
| 225 | const TargetRegisterClass *DstRC, |
| 226 | const SIRegisterInfo &TRI) { |
| 227 | return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(RC: DstRC) && |
| 228 | TRI.hasVectorRegisters(RC: SrcRC); |
| 229 | } |
| 230 | |
| 231 | static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, |
| 232 | const TargetRegisterClass *DstRC, |
| 233 | const SIRegisterInfo &TRI) { |
| 234 | return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(RC: SrcRC) && |
| 235 | TRI.hasVectorRegisters(RC: DstRC); |
| 236 | } |
| 237 | |
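| | // Try to turn an SGPR-to-VGPR copy into an SGPR-to-SGPR copy by switching |
| | // its destination to the equivalent SGPR class. Only done when every use of |
| | // the destination is a target instruction in the same block that can legally |
| | // take the SGPR source. Returns true if the register class was changed. |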
| 238 | static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, |
| 239 | const SIRegisterInfo *TRI, |
| 240 | const SIInstrInfo *TII) { |
| 241 | MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); |
| 242 | auto &Src = MI.getOperand(i: 1); |
| 243 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 244 | Register SrcReg = Src.getReg(); |
| 245 | if (!SrcReg.isVirtual() || !DstReg.isVirtual()) |
| 246 | return false; |
| 247 | |
| 248 | for (const auto &MO : MRI.reg_nodbg_operands(Reg: DstReg)) { |
| 249 | const auto *UseMI = MO.getParent(); |
| 250 | if (UseMI == &MI) |
| 251 | continue; |
| 252 | if (MO.isDef() || UseMI->getParent() != MI.getParent() || |
| 253 | UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) |
| 254 | return false; |
| 255 | |
| 256 | unsigned OpIdx = MO.getOperandNo(); |
| 257 | if (OpIdx >= UseMI->getDesc().getNumOperands() || |
| 258 | !TII->isOperandLegal(MI: *UseMI, OpIdx, MO: &Src)) |
| 259 | return false; |
| 260 | } |
| 261 | // Change VGPR to SGPR destination. |
| 262 | MRI.setRegClass(Reg: DstReg, RC: TRI->getEquivalentSGPRClass(VRC: MRI.getRegClass(Reg: DstReg))); |
| 263 | return true; |
| 264 | } |
| 265 | |
| 266 | // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. |
| 267 | // |
| 268 | // SGPRx = ... |
| 269 | // SGPRy = REG_SEQUENCE SGPRx, sub0 ... |
| 270 | // VGPRz = COPY SGPRy |
| 271 | // |
| 272 | // ==> |
| 273 | // |
| 274 | // VGPRx = COPY SGPRx |
| 275 | // VGPRz = REG_SEQUENCE VGPRx, sub0 |
| 276 | // |
| 277 | // This exposes immediate folding opportunities when materializing 64-bit |
| 278 | // immediates. |
| 279 | static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, |
| 280 | const SIRegisterInfo *TRI, |
| 281 | const SIInstrInfo *TII, |
| 282 | MachineRegisterInfo &MRI) { |
| 283 | assert(MI.isRegSequence()); |
| 284 | |
| 285 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 286 | if (!TRI->isSGPRClass(RC: MRI.getRegClass(Reg: DstReg))) |
| 287 | return false; |
| 288 | |
| 289 | if (!MRI.hasOneUse(RegNo: DstReg)) |
| 290 | return false; |
| 291 | |
| 292 | MachineInstr &CopyUse = *MRI.use_instr_begin(RegNo: DstReg); |
| 293 | if (!CopyUse.isCopy()) |
| 294 | return false; |
| 295 | |
| 296 | // It is illegal to have vreg inputs to a physreg-defining reg_sequence. |
| 297 | if (CopyUse.getOperand(i: 0).getReg().isPhysical()) |
| 298 | return false; |
| 299 | |
| 300 | const TargetRegisterClass *SrcRC, *DstRC; |
| 301 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: CopyUse, TRI: *TRI, MRI); |
| 302 | |
| 303 | if (!isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
| 304 | return false; |
| 305 | |
| 306 | if (tryChangeVGPRtoSGPRinCopy(MI&: CopyUse, TRI, TII)) |
| 307 | return true; |
| 308 | |
| 309 | // TODO: Could have multiple extracts? |
| 310 | unsigned SubReg = CopyUse.getOperand(i: 1).getSubReg(); |
| 311 | if (SubReg != AMDGPU::NoSubRegister) |
| 312 | return false; |
| 313 | |
| 314 | MRI.setRegClass(Reg: DstReg, RC: DstRC); |
| 315 | |
| 316 | // SGPRx = ... |
| 317 | // SGPRy = REG_SEQUENCE SGPRx, sub0 ... |
| 318 | // VGPRz = COPY SGPRy |
| 319 | |
| 320 | // => |
| 321 | // VGPRx = COPY SGPRx |
| 322 | // VGPRz = REG_SEQUENCE VGPRx, sub0 |
| 323 | |
| 324 | MI.getOperand(i: 0).setReg(CopyUse.getOperand(i: 0).getReg()); |
| 325 | bool IsAGPR = TRI->isAGPRClass(RC: DstRC); |
| 326 | |
| 327 | for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { |
| 328 | const TargetRegisterClass *SrcRC = |
| 329 | TRI->getRegClassForOperandReg(MRI, MO: MI.getOperand(i: I)); |
| 330 | assert(TRI->isSGPRClass(SrcRC) && |
| 331 | "Expected SGPR REG_SEQUENCE to only have SGPR inputs" ); |
| 332 | const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SRC: SrcRC); |
| 333 | |
| 334 | Register TmpReg = MRI.createVirtualRegister(RegClass: NewSrcRC); |
| 335 | |
| 336 | BuildMI(BB&: *MI.getParent(), I: &MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), |
| 337 | DestReg: TmpReg) |
| 338 | .add(MO: MI.getOperand(i: I)); |
| 339 | |
| 340 | if (IsAGPR) { |
| 341 | const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SRC: SrcRC); |
| 342 | Register TmpAReg = MRI.createVirtualRegister(RegClass: NewSrcRC); |
| 343 | unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ? |
| 344 | AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY; |
| 345 | BuildMI(BB&: *MI.getParent(), I: &MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc), |
| 346 | DestReg: TmpAReg) |
| 347 | .addReg(RegNo: TmpReg, Flags: RegState::Kill); |
| 348 | TmpReg = TmpAReg; |
| 349 | } |
| 350 | |
| 351 | MI.getOperand(i: I).setReg(TmpReg); |
| 352 | } |
| 353 | |
| 354 | CopyUse.eraseFromParent(); |
| 355 | return true; |
| 356 | } |
| 357 | |
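| | // Check whether Copy copies the result of the immediate move MoveImm. If so, |
| | // return (via SMovOp and Imm) the scalar move opcode and immediate to use |
| | // instead. |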
| 358 | static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, |
| 359 | const MachineInstr *MoveImm, |
| 360 | const SIInstrInfo *TII, |
| 361 | unsigned &SMovOp, |
| 362 | int64_t &Imm) { |
| 363 | if (Copy->getOpcode() != AMDGPU::COPY) |
| 364 | return false; |
| 365 | |
| 366 | if (!MoveImm->isMoveImmediate()) |
| 367 | return false; |
| 368 | |
| 369 | const MachineOperand *ImmOp = |
| 370 | TII->getNamedOperand(MI: *MoveImm, OperandName: AMDGPU::OpName::src0); |
| 371 | if (!ImmOp->isImm()) |
| 372 | return false; |
| 373 | |
| 374 | // FIXME: Handle copies with sub-regs. |
| 375 | if (Copy->getOperand(i: 1).getSubReg()) |
| 376 | return false; |
| 377 | |
| 378 | switch (MoveImm->getOpcode()) { |
| 379 | default: |
| 380 | return false; |
| 381 | case AMDGPU::V_MOV_B32_e32: |
| 382 | case AMDGPU::AV_MOV_B32_IMM_PSEUDO: |
| 383 | SMovOp = AMDGPU::S_MOV_B32; |
| 384 | break; |
| 385 | case AMDGPU::V_MOV_B64_PSEUDO: |
| 386 | SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO; |
| 387 | break; |
| 388 | } |
| 389 | Imm = ImmOp->getImm(); |
| 390 | return true; |
| 391 | } |
| 392 | |
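| | // Walk the predecessors of MBB, stopping at CutOff, and return true if |
| | // Predicate holds for any visited block. |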
| 393 | template <class UnaryPredicate> |
| 394 | bool searchPredecessors(const MachineBasicBlock *MBB, |
| 395 | const MachineBasicBlock *CutOff, |
| 396 | UnaryPredicate Predicate) { |
| 397 | if (MBB == CutOff) |
| 398 | return false; |
| 399 | |
| 400 | DenseSet<const MachineBasicBlock *> Visited; |
| 401 | SmallVector<MachineBasicBlock *, 4> Worklist(MBB->predecessors()); |
| 402 | |
| 403 | while (!Worklist.empty()) { |
| 404 | MachineBasicBlock *MBB = Worklist.pop_back_val(); |
| 405 | |
| 406 | if (!Visited.insert(V: MBB).second) |
| 407 | continue; |
| 408 | if (MBB == CutOff) |
| 409 | continue; |
| 410 | if (Predicate(MBB)) |
| 411 | return true; |
| 412 | |
| 413 | Worklist.append(in_start: MBB->pred_begin(), in_end: MBB->pred_end()); |
| 414 | } |
| 415 | |
| 416 | return false; |
| 417 | } |
| 418 | |
| 419 | // Checks if there is a potential path from instruction From to instruction To. |
| 420 | // If CutOff is specified and sits on that path, we ignore the portion of the |
| 421 | // path above it and report To as not reachable. |
| 422 | static bool isReachable(const MachineInstr *From, |
| 423 | const MachineInstr *To, |
| 424 | const MachineBasicBlock *CutOff, |
| 425 | MachineDominatorTree &MDT) { |
| 426 | if (MDT.dominates(A: From, B: To)) |
| 427 | return true; |
| 428 | |
| 429 | const MachineBasicBlock *MBBFrom = From->getParent(); |
| 430 | const MachineBasicBlock *MBBTo = To->getParent(); |
| 431 | |
| 432 | // Do predecessor search. |
| 433 | // We should almost never get here since we do not usually produce M0 stores |
| 434 | // other than -1. |
| 435 | return searchPredecessors(MBB: MBBTo, CutOff, Predicate: [MBBFrom] |
| 436 | (const MachineBasicBlock *MBB) { return MBB == MBBFrom; }); |
| 437 | } |
| 438 | |
| 439 | // Return the first non-prologue instruction in the block. |
| 440 | static MachineBasicBlock::iterator |
| 441 | getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { |
| 442 | MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); |
| 443 | while (I != MBB->end() && TII->isBasicBlockPrologue(MI: *I)) |
| 444 | ++I; |
| 445 | |
| 446 | return I; |
| 447 | } |
| 448 | |
| 449 | // Hoist and merge identical SGPR initializations into a common predecessor. |
| 450 | // This is intended to combine M0 initializations, but can work with any |
| 451 | // SGPR. A VGPR cannot be processed since we cannot guarantee vector |
| 452 | // execution. |
| 453 | static bool hoistAndMergeSGPRInits(unsigned Reg, |
| 454 | const MachineRegisterInfo &MRI, |
| 455 | const TargetRegisterInfo *TRI, |
| 456 | MachineDominatorTree &MDT, |
| 457 | const TargetInstrInfo *TII) { |
| 458 | // List of inits by immediate value. |
| 459 | using InitListMap = std::map<unsigned, std::list<MachineInstr *>>; |
| 460 | InitListMap Inits; |
| 461 | // List of clobbering instructions. |
| 462 | SmallVector<MachineInstr*, 8> Clobbers; |
| 463 | // List of instructions marked for deletion. |
| 464 | SmallPtrSet<MachineInstr *, 8> MergedInstrs; |
| 465 | |
| 466 | bool Changed = false; |
| 467 | |
| 468 | for (auto &MI : MRI.def_instructions(Reg)) { |
| 469 | MachineOperand *Imm = nullptr; |
| 470 | for (auto &MO : MI.operands()) { |
| 471 | if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || |
| 472 | (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { |
| 473 | Imm = nullptr; |
| 474 | break; |
| 475 | } |
| 476 | if (MO.isImm()) |
| 477 | Imm = &MO; |
| 478 | } |
| 479 | if (Imm) |
| 480 | Inits[Imm->getImm()].push_front(x: &MI); |
| 481 | else |
| 482 | Clobbers.push_back(Elt: &MI); |
| 483 | } |
| 484 | |
| 485 | for (auto &Init : Inits) { |
| 486 | auto &Defs = Init.second; |
| 487 | |
| 488 | for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) { |
| 489 | MachineInstr *MI1 = *I1; |
| 490 | |
| 491 | for (auto I2 = std::next(x: I1); I2 != E; ) { |
| 492 | MachineInstr *MI2 = *I2; |
| 493 | |
| 494 | // Check any possible interference |
| 495 | auto interferes = [&](MachineBasicBlock::iterator From, |
| 496 | MachineBasicBlock::iterator To) -> bool { |
| 497 | |
| 498 | assert(MDT.dominates(&*To, &*From)); |
| 499 | |
| 500 | auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool { |
| 501 | const MachineBasicBlock *MBBFrom = From->getParent(); |
| 502 | const MachineBasicBlock *MBBTo = To->getParent(); |
| 503 | bool MayClobberFrom = isReachable(From: Clobber, To: &*From, CutOff: MBBTo, MDT); |
| 504 | bool MayClobberTo = isReachable(From: Clobber, To: &*To, CutOff: MBBTo, MDT); |
| 505 | if (!MayClobberFrom && !MayClobberTo) |
| 506 | return false; |
| 507 | if ((MayClobberFrom && !MayClobberTo) || |
| 508 | (!MayClobberFrom && MayClobberTo)) |
| 509 | return true; |
| 510 | // Both can clobber. This is not an interference only if both are |
| 511 | // dominated by Clobber and belong to the same block, or if Clobber |
| 512 | // properly dominates To; given that To >> From, Clobber then dominates |
| 513 | // both and is located in a common dominator. |
| 514 | return !((MBBFrom == MBBTo && |
| 515 | MDT.dominates(A: Clobber, B: &*From) && |
| 516 | MDT.dominates(A: Clobber, B: &*To)) || |
| 517 | MDT.properlyDominates(A: Clobber->getParent(), B: MBBTo)); |
| 518 | }; |
| 519 | |
| 520 | return (llvm::any_of(Range&: Clobbers, P: interferes)) || |
| 521 | (llvm::any_of(Range&: Inits, P: [&](InitListMap::value_type &C) { |
| 522 | return C.first != Init.first && |
| 523 | llvm::any_of(Range&: C.second, P: interferes); |
| 524 | })); |
| 525 | }; |
| 526 | |
| 527 | if (MDT.dominates(A: MI1, B: MI2)) { |
| 528 | if (!interferes(MI2, MI1)) { |
| 529 | LLVM_DEBUG(dbgs() |
| 530 | << "Erasing from " |
| 531 | << printMBBReference(*MI2->getParent()) << " " << *MI2); |
| 532 | MergedInstrs.insert(Ptr: MI2); |
| 533 | Changed = true; |
| 534 | ++I2; |
| 535 | continue; |
| 536 | } |
| 537 | } else if (MDT.dominates(A: MI2, B: MI1)) { |
| 538 | if (!interferes(MI1, MI2)) { |
| 539 | LLVM_DEBUG(dbgs() |
| 540 | << "Erasing from " |
| 541 | << printMBBReference(*MI1->getParent()) << " " << *MI1); |
| 542 | MergedInstrs.insert(Ptr: MI1); |
| 543 | Changed = true; |
| 544 | ++I1; |
| 545 | break; |
| 546 | } |
| 547 | } else { |
| 548 | auto *MBB = MDT.findNearestCommonDominator(A: MI1->getParent(), |
| 549 | B: MI2->getParent()); |
| 550 | if (!MBB) { |
| 551 | ++I2; |
| 552 | continue; |
| 553 | } |
| 554 | |
| 555 | MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII); |
| 556 | if (!interferes(MI1, I) && !interferes(MI2, I)) { |
| 557 | LLVM_DEBUG(dbgs() |
| 558 | << "Erasing from " |
| 559 | << printMBBReference(*MI1->getParent()) << " " << *MI1 |
| 560 | << "and moving from " |
| 561 | << printMBBReference(*MI2->getParent()) << " to " |
| 562 | << printMBBReference(*I->getParent()) << " " << *MI2); |
| 563 | I->getParent()->splice(Where: I, Other: MI2->getParent(), From: MI2); |
| 564 | MergedInstrs.insert(Ptr: MI1); |
| 565 | Changed = true; |
| 566 | ++I1; |
| 567 | break; |
| 568 | } |
| 569 | } |
| 570 | ++I2; |
| 571 | } |
| 572 | ++I1; |
| 573 | } |
| 574 | } |
| 575 | |
| 576 | // Remove initializations that were merged into another. |
| 577 | for (auto &Init : Inits) { |
| 578 | auto &Defs = Init.second; |
| 579 | auto I = Defs.begin(); |
| 580 | while (I != Defs.end()) { |
| 581 | if (MergedInstrs.count(Ptr: *I)) { |
| 582 | (*I)->eraseFromParent(); |
| 583 | I = Defs.erase(position: I); |
| 584 | } else |
| 585 | ++I; |
| 586 | } |
| 587 | } |
| 588 | |
| 589 | // Try to schedule SGPR initializations as early as possible in the MBB. |
| 590 | for (auto &Init : Inits) { |
| 591 | auto &Defs = Init.second; |
| 592 | for (auto *MI : Defs) { |
| 593 | auto *MBB = MI->getParent(); |
| 594 | MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII); |
| 595 | MachineBasicBlock::reverse_iterator B(BoundaryMI); |
| 596 | // Check if B should actually be a boundary. If not, set the previous |
| 597 | // instruction as the boundary instead. |
| 598 | if (!TII->isBasicBlockPrologue(MI: *B)) |
| 599 | B++; |
| 600 | |
| 601 | auto R = std::next(x: MI->getReverseIterator()); |
| 602 | const unsigned Threshold = 50; |
| 603 | // Search until B or Threshold for a place to insert the initialization. |
| 604 | for (unsigned I = 0; R != B && I < Threshold; ++R, ++I) |
| 605 | if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) || |
| 606 | TII->isSchedulingBoundary(MI: *R, MBB, MF: *MBB->getParent())) |
| 607 | break; |
| 608 | |
| 609 | // Move to directly after R. |
| 610 | if (&*--R != MI) |
| 611 | MBB->splice(Where: *R, Other: MBB, From: MI); |
| 612 | } |
| 613 | } |
| 614 | |
| 615 | if (Changed) |
| 616 | MRI.clearKillFlags(Reg); |
| 617 | |
| 618 | return Changed; |
| 619 | } |
| 620 | |
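| | // Scan the function for illegal VGPR-to-SGPR copies and other instructions |
| | // that may force vector values into scalar registers, then lower or |
| | // legalize them. |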
| 621 | bool SIFixSGPRCopies::run(MachineFunction &MF) { |
| 622 | // Only need to run this in SelectionDAG path. |
| 623 | if (MF.getProperties().hasSelected()) |
| 624 | return false; |
| 625 | |
| 626 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| 627 | MRI = &MF.getRegInfo(); |
| 628 | TRI = ST.getRegisterInfo(); |
| 629 | TII = ST.getInstrInfo(); |
| 630 | |
| 631 | // Instructions to re-legalize after changing register classes |
| 632 | SmallVector<MachineInstr *, 8> Relegalize; |
| 633 | |
| 634 | for (MachineBasicBlock &MBB : MF) { |
| 635 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
| 636 | ++I) { |
| 637 | MachineInstr &MI = *I; |
| 638 | |
| 639 | switch (MI.getOpcode()) { |
| 640 | default: |
| 641 | // scale_src has a register class restricted to the low 256 VGPRs; changing |
| 642 | // registers to VGPR may not take that into account. |
| 643 | if (TII->isWMMA(MI) && |
| 644 | AMDGPU::hasNamedOperand(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::scale_src0)) |
| 645 | Relegalize.push_back(Elt: &MI); |
| 646 | continue; |
| 647 | case AMDGPU::COPY: { |
| 648 | const TargetRegisterClass *SrcRC, *DstRC; |
| 649 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: MI, TRI: *TRI, MRI: *MRI); |
| 650 | |
| 651 | if (isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) { |
| 652 | // Since SGPR to VGPR copies affect the VGPR to SGPR copy |
| 653 | // score and hence the lowering decision, let's try to get rid of |
| 654 | // them as early as possible. |
| 655 | if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII)) |
| 656 | continue; |
| 657 | |
| 658 | // Collect those not changed to try them again after VGPR to SGPR copy |
| 659 | // lowering, as there will be more opportunities. |
| 660 | S2VCopies.push_back(Elt: &MI); |
| 661 | } |
| 662 | if (!isVGPRToSGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
| 663 | continue; |
| 664 | if (lowerSpecialCase(MI, I)) |
| 665 | continue; |
| 666 | |
| 667 | analyzeVGPRToSGPRCopy(MI: &MI); |
| 668 | |
| 669 | break; |
| 670 | } |
| 671 | case AMDGPU::WQM: |
| 672 | case AMDGPU::STRICT_WQM: |
| 673 | case AMDGPU::SOFT_WQM: |
| 674 | case AMDGPU::STRICT_WWM: |
| 675 | case AMDGPU::INSERT_SUBREG: |
| 676 | case AMDGPU::PHI: |
| 677 | case AMDGPU::REG_SEQUENCE: { |
| 678 | if (TRI->isSGPRClass(RC: TII->getOpRegClass(MI, OpNo: 0))) { |
| 679 | for (MachineOperand &MO : MI.operands()) { |
| 680 | if (!MO.isReg() || !MO.getReg().isVirtual()) |
| 681 | continue; |
| 682 | const TargetRegisterClass *SrcRC = MRI->getRegClass(Reg: MO.getReg()); |
| 683 | if (SrcRC == &AMDGPU::VReg_1RegClass) |
| 684 | continue; |
| 685 | |
| 686 | if (TRI->hasVectorRegisters(RC: SrcRC)) { |
| 687 | const TargetRegisterClass *DestRC = |
| 688 | TRI->getEquivalentSGPRClass(VRC: SrcRC); |
| 689 | Register NewDst = MRI->createVirtualRegister(RegClass: DestRC); |
| 690 | MachineBasicBlock *BlockToInsertCopy = |
| 691 | MI.isPHI() ? MI.getOperand(i: MO.getOperandNo() + 1).getMBB() |
| 692 | : &MBB; |
| 693 | MachineBasicBlock::iterator PointToInsertCopy = |
| 694 | MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I; |
| 695 | |
| 696 | const DebugLoc &DL = MI.getDebugLoc(); |
| 697 | if (!tryMoveVGPRConstToSGPR(MO, NewDst, BlockToInsertTo: BlockToInsertCopy, |
| 698 | PointToInsertTo: PointToInsertCopy, DL)) { |
| 699 | MachineInstr *NewCopy = |
| 700 | BuildMI(BB&: *BlockToInsertCopy, I: PointToInsertCopy, MIMD: DL, |
| 701 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: NewDst) |
| 702 | .addReg(RegNo: MO.getReg()); |
| 703 | MO.setReg(NewDst); |
| 704 | analyzeVGPRToSGPRCopy(MI: NewCopy); |
| 705 | PHISources.insert(V: NewCopy); |
| 706 | } |
| 707 | } |
| 708 | } |
| 709 | } |
| 710 | |
| 711 | if (MI.isPHI()) |
| 712 | PHINodes.push_back(Elt: &MI); |
| 713 | else if (MI.isRegSequence()) |
| 714 | RegSequences.push_back(Elt: &MI); |
| 715 | |
| 716 | break; |
| 717 | } |
| 718 | case AMDGPU::V_WRITELANE_B32: { |
| 719 | // Some architectures allow more than one constant bus access without |
| 720 | // SGPR restriction |
| 721 | if (ST.getConstantBusLimit(Opcode: MI.getOpcode()) != 1) |
| 722 | break; |
| 723 | |
| 724 | // Writelane is special in that it can use SGPR and M0 (which would |
| 725 | // normally count as using the constant bus twice - but in this case it |
| 726 | // is allowed since the lane selector doesn't count as a use of the |
| 727 | // constant bus). However, it is still required to abide by the 1 SGPR |
| 728 | // rule. Apply a fix here as we might have multiple SGPRs after |
| 729 | // legalizing VGPRs to SGPRs |
| 730 | int Src0Idx = |
| 731 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src0); |
| 732 | int Src1Idx = |
| 733 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src1); |
| 734 | MachineOperand &Src0 = MI.getOperand(i: Src0Idx); |
| 735 | MachineOperand &Src1 = MI.getOperand(i: Src1Idx); |
| 736 | |
| 737 | // Check to see if the instruction violates the 1 SGPR rule |
| 738 | if ((Src0.isReg() && TRI->isSGPRReg(MRI: *MRI, Reg: Src0.getReg()) && |
| 739 | Src0.getReg() != AMDGPU::M0) && |
| 740 | (Src1.isReg() && TRI->isSGPRReg(MRI: *MRI, Reg: Src1.getReg()) && |
| 741 | Src1.getReg() != AMDGPU::M0)) { |
| 742 | |
| 743 | // Check for trivially easy constant prop into one of the operands. |
| 744 | // If this is the case then perform the operation now to resolve the SGPR |
| 745 | // issue. If we don't do that here we will always insert a mov to m0 |
| 746 | // that can't be resolved in a later operand folding pass. |
| 747 | bool Resolved = false; |
| 748 | for (MachineOperand *MO : {&Src0, &Src1}) { |
| 749 | if (MO->getReg().isVirtual()) { |
| 750 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MO->getReg()); |
| 751 | if (DefMI && TII->isFoldableCopy(MI: *DefMI)) { |
| 752 | const MachineOperand &Def = DefMI->getOperand(i: 0); |
| 753 | if (Def.isReg() && |
| 754 | MO->getReg() == Def.getReg() && |
| 755 | MO->getSubReg() == Def.getSubReg()) { |
| 756 | const MachineOperand &Copied = DefMI->getOperand(i: 1); |
| 757 | if (Copied.isImm() && |
| 758 | TII->isInlineConstant(Imm: APInt(64, Copied.getImm(), true))) { |
| 759 | MO->ChangeToImmediate(ImmVal: Copied.getImm()); |
| 760 | Resolved = true; |
| 761 | break; |
| 762 | } |
| 763 | } |
| 764 | } |
| 765 | } |
| 766 | } |
| 767 | |
| 768 | if (!Resolved) { |
| 769 | // Haven't managed to resolve this by replacing an SGPR with an immediate. |
| 770 | // Move src1 into M0 instead. |
| 771 | BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), |
| 772 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0) |
| 773 | .add(MO: Src1); |
| 774 | Src1.ChangeToRegister(Reg: AMDGPU::M0, isDef: false); |
| 775 | } |
| 776 | } |
| 777 | break; |
| 778 | } |
| 779 | } |
| 780 | } |
| 781 | } |
| 782 | |
| 783 | lowerVGPR2SGPRCopies(MF); |
| 784 | // Postprocessing |
| 785 | fixSCCCopies(MF); |
| 786 | for (auto *MI : S2VCopies) { |
| 787 | // Check if it is still valid |
| 788 | if (MI->isCopy()) { |
| 789 | const TargetRegisterClass *SrcRC, *DstRC; |
| 790 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: *MI, TRI: *TRI, MRI: *MRI); |
| 791 | if (isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
| 792 | tryChangeVGPRtoSGPRinCopy(MI&: *MI, TRI, TII); |
| 793 | } |
| 794 | } |
| 795 | for (auto *MI : RegSequences) { |
| 796 | // Check if it is still valid |
| 797 | if (MI->isRegSequence()) |
| 798 | foldVGPRCopyIntoRegSequence(MI&: *MI, TRI, TII, MRI&: *MRI); |
| 799 | } |
| 800 | for (auto *MI : PHINodes) { |
| 801 | processPHINode(MI&: *MI); |
| 802 | } |
| 803 | while (!Relegalize.empty()) |
| 804 | TII->legalizeOperands(MI&: *Relegalize.pop_back_val(), MDT); |
| 805 | |
| 806 | if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge) |
| 807 | hoistAndMergeSGPRInits(Reg: AMDGPU::M0, MRI: *MRI, TRI, MDT&: *MDT, TII); |
| 808 | |
| 809 | SiblingPenalty.clear(); |
| 810 | V2SCopies.clear(); |
| 811 | SCCCopies.clear(); |
| 812 | RegSequences.clear(); |
| 813 | PHINodes.clear(); |
| 814 | S2VCopies.clear(); |
| 815 | PHISources.clear(); |
| 816 | |
| 817 | return true; |
| 818 | } |
| 819 | |
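| | // Legalize a PHI node: if every transitive use is an AGPR, move the result |
| | // to an AGPR class and recurse into operands that are themselves PHIs; if |
| | // the result uses a vector register class, legalize the PHI's operands. |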
| 820 | void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { |
| 821 | bool AllAGPRUses = true; |
| 822 | SetVector<const MachineInstr *> worklist; |
| 823 | SmallPtrSet<const MachineInstr *, 4> Visited; |
| 824 | SetVector<MachineInstr *> PHIOperands; |
| 825 | worklist.insert(X: &MI); |
| 826 | Visited.insert(Ptr: &MI); |
| 827 | // HACK to make MIR tests with no uses happy |
| 828 | bool HasUses = false; |
| 829 | while (!worklist.empty()) { |
| 830 | const MachineInstr *Instr = worklist.pop_back_val(); |
| 831 | Register Reg = Instr->getOperand(i: 0).getReg(); |
| 832 | for (const auto &Use : MRI->use_operands(Reg)) { |
| 833 | HasUses = true; |
| 834 | const MachineInstr *UseMI = Use.getParent(); |
| 835 | AllAGPRUses &= (UseMI->isCopy() && |
| 836 | TRI->isAGPR(MRI: *MRI, Reg: UseMI->getOperand(i: 0).getReg())) || |
| 837 | TRI->isAGPR(MRI: *MRI, Reg: Use.getReg()); |
| 838 | if (UseMI->isCopy() || UseMI->isRegSequence()) { |
| 839 | if (Visited.insert(Ptr: UseMI).second) |
| 840 | worklist.insert(X: UseMI); |
| 841 | |
| 842 | continue; |
| 843 | } |
| 844 | } |
| 845 | } |
| 846 | |
| 847 | Register PHIRes = MI.getOperand(i: 0).getReg(); |
| 848 | const TargetRegisterClass *RC0 = MRI->getRegClass(Reg: PHIRes); |
| 849 | if (HasUses && AllAGPRUses && !TRI->isAGPRClass(RC: RC0)) { |
| 850 | LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI); |
| 851 | MRI->setRegClass(Reg: PHIRes, RC: TRI->getEquivalentAGPRClass(SRC: RC0)); |
| 852 | for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { |
| 853 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MI.getOperand(i: I).getReg()); |
| 854 | if (DefMI && DefMI->isPHI()) |
| 855 | PHIOperands.insert(X: DefMI); |
| 856 | } |
| 857 | } |
| 858 | |
| 859 | if (TRI->hasVectorRegisters(RC: MRI->getRegClass(Reg: PHIRes)) || |
| 860 | RC0 == &AMDGPU::VReg_1RegClass) { |
| 861 | LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); |
| 862 | TII->legalizeOperands(MI, MDT); |
| 863 | } |
| 864 | |
| 865 | // Propagate register class back to PHI operands which are PHI themselves. |
| 866 | while (!PHIOperands.empty()) { |
| 867 | processPHINode(MI&: *PHIOperands.pop_back_val()); |
| 868 | } |
| 869 | } |
| 870 | |
| 871 | bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR( |
| 872 | MachineOperand &MaybeVGPRConstMO, Register DstReg, |
| 873 | MachineBasicBlock *BlockToInsertTo, |
| 874 | MachineBasicBlock::iterator PointToInsertTo, const DebugLoc &DL) { |
| 875 | |
| 876 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MaybeVGPRConstMO.getReg()); |
| 877 | if (!DefMI || !DefMI->isMoveImmediate()) |
| 878 | return false; |
| 879 | |
| 880 | MachineOperand *SrcConst = TII->getNamedOperand(MI&: *DefMI, OperandName: AMDGPU::OpName::src0); |
| 881 | if (SrcConst->isReg()) |
| 882 | return false; |
| 883 | |
| 884 | const TargetRegisterClass *SrcRC = |
| 885 | MRI->getRegClass(Reg: MaybeVGPRConstMO.getReg()); |
| 886 | unsigned MoveSize = TRI->getRegSizeInBits(RC: *SrcRC); |
| 887 | unsigned MoveOp = MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; |
| 888 | BuildMI(BB&: *BlockToInsertTo, I: PointToInsertTo, MIMD: DL, MCID: TII->get(Opcode: MoveOp), DestReg: DstReg) |
| 889 | .add(MO: *SrcConst); |
| 890 | if (MRI->hasOneUse(RegNo: MaybeVGPRConstMO.getReg())) |
| 891 | DefMI->eraseFromParent(); |
| 892 | MaybeVGPRConstMO.setReg(DstReg); |
| 893 | return true; |
| 894 | } |
| 895 | |
| 896 | bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, |
| 897 | MachineBasicBlock::iterator &I) { |
| 898 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 899 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 900 | if (!DstReg.isVirtual()) { |
| 901 | // If the destination register is a physical register there isn't |
| 902 | // really much we can do to fix this. |
| 903 | // Some special instructions use M0 as an input. Some even only use |
| 904 | // the first lane. Insert a readfirstlane and hope for the best. |
| 905 | const TargetRegisterClass *SrcRC = MRI->getRegClass(Reg: SrcReg); |
| 906 | if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(RC: SrcRC)) { |
| 907 | Register TmpReg = |
| 908 | MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass); |
| 909 | |
| 910 | const MCInstrDesc &ReadFirstLaneDesc = |
| 911 | TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32); |
| 912 | BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), MCID: ReadFirstLaneDesc, DestReg: TmpReg) |
| 913 | .add(MO: MI.getOperand(i: 1)); |
| 914 | |
| 915 | unsigned SubReg = MI.getOperand(i: 1).getSubReg(); |
| 916 | MI.getOperand(i: 1).setReg(TmpReg); |
| 917 | MI.getOperand(i: 1).setSubReg(AMDGPU::NoSubRegister); |
| 918 | |
| 919 | const TargetRegisterClass *OpRC = TII->getRegClass(MCID: ReadFirstLaneDesc, OpNum: 1); |
| 920 | const TargetRegisterClass *ConstrainRC = |
| 921 | SubReg == AMDGPU::NoSubRegister |
| 922 | ? OpRC |
| 923 | : TRI->getMatchingSuperRegClass(A: SrcRC, B: OpRC, Idx: SubReg); |
| 924 | |
| 925 | if (!MRI->constrainRegClass(Reg: SrcReg, RC: ConstrainRC)) |
| 926 | llvm_unreachable("failed to constrain register" ); |
| 927 | } else if (tryMoveVGPRConstToSGPR(MaybeVGPRConstMO&: MI.getOperand(i: 1), DstReg, BlockToInsertTo: MI.getParent(), |
| 928 | PointToInsertTo: MI, DL: MI.getDebugLoc())) { |
| 929 | I = std::next(x: I); |
| 930 | MI.eraseFromParent(); |
| 931 | } |
| 932 | return true; |
| 933 | } |
| 934 | if (!SrcReg.isVirtual() || TRI->isAGPR(MRI: *MRI, Reg: SrcReg)) { |
| 935 | SIInstrWorklist worklist; |
| 936 | worklist.insert(MI: &MI); |
| 937 | TII->moveToVALU(Worklist&: worklist, MDT); |
| 938 | return true; |
| 939 | } |
| 940 | |
| 941 | unsigned SMovOp; |
| 942 | int64_t Imm; |
| 943 | // If we are just copying an immediate, we can replace the copy with |
| 944 | // s_mov_b32. |
| 945 | if (isSafeToFoldImmIntoCopy(Copy: &MI, MoveImm: MRI->getVRegDef(Reg: SrcReg), TII, SMovOp, Imm)) { |
| 946 | MI.getOperand(i: 1).ChangeToImmediate(ImmVal: Imm); |
| 947 | MI.addImplicitDefUseOperands(MF&: *MI.getMF()); |
| 948 | MI.setDesc(TII->get(Opcode: SMovOp)); |
| 949 | return true; |
| 950 | } |
| 951 | return false; |
| 952 | } |
| 953 | |
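| | // Build a V2SCopyInfo for this VGPR-to-SGPR copy: walk the instructions |
| | // reachable from it in the SSA graph, record the SALU instructions it keeps |
| | // scalar, count the SGPR-to-VGPR copies that feed results back to the VALU, |
| | // and note which copies reach each instruction for the sibling penalty. |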
| 954 | void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { |
| 955 | if (PHISources.contains(V: MI)) |
| 956 | return; |
| 957 | Register DstReg = MI->getOperand(i: 0).getReg(); |
| 958 | const TargetRegisterClass *DstRC = MRI->getRegClass(Reg: DstReg); |
| 959 | |
| 960 | V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI, |
| 961 | TRI->getRegSizeInBits(RC: *DstRC)); |
| 962 | SmallVector<MachineInstr *, 8> AnalysisWorklist; |
| 963 | // Needed because the SSA is not a tree but a graph and may have |
| 964 | // forks and joins. We should not then walk the same way twice. |
| 965 | DenseSet<MachineInstr *> Visited; |
| 966 | AnalysisWorklist.push_back(Elt: Info.Copy); |
| 967 | while (!AnalysisWorklist.empty()) { |
| 968 | |
| 969 | MachineInstr *Inst = AnalysisWorklist.pop_back_val(); |
| 970 | |
| 971 | if (!Visited.insert(V: Inst).second) |
| 972 | continue; |
| 973 | |
| 974 | // Copies and REG_SEQUENCE do not contribute to the final assembly, |
| 975 | // so skip them but take care of the SGPR to VGPR copy bookkeeping. |
| 976 | if (Inst->isRegSequence() && |
| 977 | TRI->isVGPR(MRI: *MRI, Reg: Inst->getOperand(i: 0).getReg())) { |
| 978 | Info.NumSVCopies++; |
| 979 | continue; |
| 980 | } |
| 981 | if (Inst->isCopy()) { |
| 982 | const TargetRegisterClass *SrcRC, *DstRC; |
| 983 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: *Inst, TRI: *TRI, MRI: *MRI); |
| 984 | if (isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI) && |
| 985 | !tryChangeVGPRtoSGPRinCopy(MI&: *Inst, TRI, TII)) { |
| 986 | Info.NumSVCopies++; |
| 987 | continue; |
| 988 | } |
| 989 | } |
| 990 | |
| 991 | SiblingPenalty[Inst].insert(X: Info.ID); |
| 992 | |
| 993 | SmallVector<MachineInstr *, 4> Users; |
| 994 | if ((TII->isSALU(MI: *Inst) && Inst->isCompare()) || |
| 995 | (Inst->isCopy() && Inst->getOperand(i: 0).getReg() == AMDGPU::SCC)) { |
| 996 | auto I = Inst->getIterator(); |
| 997 | auto E = Inst->getParent()->end(); |
| 998 | while (++I != E && |
| 999 | !I->findRegisterDefOperand(Reg: AMDGPU::SCC, /*TRI=*/nullptr)) { |
| 1000 | if (I->readsRegister(Reg: AMDGPU::SCC, /*TRI=*/nullptr)) |
| 1001 | Users.push_back(Elt: &*I); |
| 1002 | } |
| 1003 | } else if (Inst->getNumExplicitDefs() != 0) { |
| 1004 | Register Reg = Inst->getOperand(i: 0).getReg(); |
| 1005 | if (Reg.isVirtual() && TRI->isSGPRReg(MRI: *MRI, Reg) && !TII->isVALU(MI: *Inst)) { |
| 1006 | for (auto &U : MRI->use_instructions(Reg)) |
| 1007 | Users.push_back(Elt: &U); |
| 1008 | } |
| 1009 | } |
| 1010 | for (auto *U : Users) { |
| 1011 | if (TII->isSALU(MI: *U)) |
| 1012 | Info.SChain.insert(X: U); |
| 1013 | AnalysisWorklist.push_back(Elt: U); |
| 1014 | } |
| 1015 | } |
| 1016 | V2SCopies[Info.ID] = std::move(Info); |
| 1017 | } |
| 1018 | |
| 1019 | // The main function that computes the VGPR to SGPR copy score |
| 1020 | // and determines how the copy will be further lowered: via v_readfirstlane_b32 or moveToVALU |
| 1021 | bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) { |
| 1022 | if (Info->SChain.empty()) { |
| 1023 | Info->Score = 0; |
| 1024 | return true; |
| 1025 | } |
| 1026 | Info->Siblings = SiblingPenalty[*llvm::max_element( |
| 1027 | Range&: Info->SChain, C: [&](MachineInstr *A, MachineInstr *B) -> bool { |
| 1028 | return SiblingPenalty[A].size() < SiblingPenalty[B].size(); |
| 1029 | })]; |
| 1030 | Info->Siblings.remove_if(P: [&](unsigned ID) { return ID == Info->ID; }); |
| 1031 | // The loop below computes the number of other VGPR to SGPR V2SCopies |
| 1032 | // which contribute to the current copy's SALU chain. We assume that all the |
| 1033 | // V2SCopies with the same source virtual register will be squashed to one |
| 1034 | // by regalloc. Also we take care of the V2SCopies of the different subregs |
| 1035 | // of the same register. |
| 1036 | SmallSet<std::pair<Register, unsigned>, 4> SrcRegs; |
| 1037 | for (auto J : Info->Siblings) { |
| 1038 | auto *InfoIt = V2SCopies.find(Key: J); |
| 1039 | if (InfoIt != V2SCopies.end()) { |
| 1040 | MachineInstr *SiblingCopy = InfoIt->second.Copy; |
| 1041 | if (SiblingCopy->isImplicitDef()) |
| 1042 | // the COPY has already been MoveToVALUed |
| 1043 | continue; |
| 1044 | |
| 1045 | SrcRegs.insert(V: std::pair(SiblingCopy->getOperand(i: 1).getReg(), |
| 1046 | SiblingCopy->getOperand(i: 1).getSubReg())); |
| 1047 | } |
| 1048 | } |
| 1049 | Info->SiblingPenalty = SrcRegs.size(); |
| 1050 | |
| 1051 | unsigned Penalty = |
| 1052 | Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes; |
| 1053 | unsigned Profit = Info->SChain.size(); |
| 1054 | Info->Score = Penalty > Profit ? 0 : Profit - Penalty; |
| 1055 | Info->NeedToBeConvertedToVALU = Info->Score < 3; |
| 1056 | return Info->NeedToBeConvertedToVALU; |
| 1057 | } |
| 1058 | |
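| | // Lower the analyzed VGPR-to-SGPR copies: copies whose score says conversion |
| | // is cheaper are moved to the VALU; the remaining copies are rewritten using |
| | // v_readfirstlane_b32. |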
| 1059 | void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { |
| 1060 | |
| 1061 | SmallVector<unsigned, 8> LoweringWorklist; |
| 1062 | for (auto &C : V2SCopies) { |
| 1063 | if (needToBeConvertedToVALU(Info: &C.second)) |
| 1064 | LoweringWorklist.push_back(Elt: C.second.ID); |
| 1065 | } |
| 1066 | |
| 1067 | // Store all the V2S copy instructions that need to be moved to VALU |
| 1068 | // in the Copies worklist. |
| 1069 | SIInstrWorklist Copies; |
| 1070 | |
| 1071 | while (!LoweringWorklist.empty()) { |
| 1072 | unsigned CurID = LoweringWorklist.pop_back_val(); |
| 1073 | auto *CurInfoIt = V2SCopies.find(Key: CurID); |
| 1074 | if (CurInfoIt != V2SCopies.end()) { |
| 1075 | const V2SCopyInfo &C = CurInfoIt->second; |
| 1076 | LLVM_DEBUG(dbgs() << "Processing ...\n" ; C.dump()); |
| 1077 | for (auto S : C.Siblings) { |
| 1078 | auto *SibInfoIt = V2SCopies.find(Key: S); |
| 1079 | if (SibInfoIt != V2SCopies.end()) { |
| 1080 | V2SCopyInfo &SI = SibInfoIt->second; |
| 1081 | LLVM_DEBUG(dbgs() << "Sibling:\n" ; SI.dump()); |
| 1082 | if (!SI.NeedToBeConvertedToVALU) { |
| 1083 | SI.SChain.set_subtract(C.SChain); |
| 1084 | if (needToBeConvertedToVALU(Info: &SI)) |
| 1085 | LoweringWorklist.push_back(Elt: SI.ID); |
| 1086 | } |
| 1087 | SI.Siblings.remove_if(P: [&](unsigned ID) { return ID == C.ID; }); |
| 1088 | } |
| 1089 | } |
| 1090 | LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy |
| 1091 | << " is being turned to VALU\n" ); |
| 1092 | Copies.insert(MI: C.Copy); |
| 1093 | // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if |
| 1094 | // instead. |
| 1095 | V2SCopies.erase(Key: C.ID); |
| 1096 | } |
| 1097 | } |
| 1098 | |
| 1099 | TII->moveToVALU(Worklist&: Copies, MDT); |
| 1100 | Copies.clear(); |
| 1101 | |
| 1102 | // Now do actual lowering |
| 1103 | for (auto C : V2SCopies) { |
| 1104 | MachineInstr *MI = C.second.Copy; |
| 1105 | MachineBasicBlock *MBB = MI->getParent(); |
| 1106 | // We decide to turn this V2S copy into v_readfirstlane_b32, |
| 1107 | // remove it from V2SCopies, and remove it from all its siblings. |
| 1108 | LLVM_DEBUG(dbgs() << "V2S copy " << *MI |
| 1109 | << " is being turned to v_readfirstlane_b32" |
| 1110 | << " Score: " << C.second.Score << "\n" ); |
| 1111 | Register DstReg = MI->getOperand(i: 0).getReg(); |
| 1112 | MRI->constrainRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32_XM0RegClass); |
| 1113 | |
| 1114 | Register SrcReg = MI->getOperand(i: 1).getReg(); |
| 1115 | unsigned SubReg = MI->getOperand(i: 1).getSubReg(); |
| 1116 | const TargetRegisterClass *SrcRC = |
| 1117 | TRI->getRegClassForOperandReg(MRI: *MRI, MO: MI->getOperand(i: 1)); |
| 1118 | size_t SrcSize = TRI->getRegSizeInBits(RC: *SrcRC); |
| 1119 | if (SrcSize == 16) { |
| 1120 | assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() && |
| 1121 | "We do not expect to see 16-bit copies from VGPR to SGPR unless " |
| 1122 | "we have 16-bit VGPRs" ); |
| 1123 | assert(MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass || |
| 1124 | MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass); |
| 1125 | // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits |
| 1126 | MRI->setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32_XM0RegClass); |
| 1127 | Register VReg32 = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 1128 | const DebugLoc &DL = MI->getDebugLoc(); |
| 1129 | Register Undef = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass); |
| 1130 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef); |
| 1131 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: VReg32) |
| 1132 | .addReg(RegNo: SrcReg, Flags: {}, SubReg) |
| 1133 | .addImm(Val: AMDGPU::lo16) |
| 1134 | .addReg(RegNo: Undef) |
| 1135 | .addImm(Val: AMDGPU::hi16); |
| 1136 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg) |
| 1137 | .addReg(RegNo: VReg32); |
| 1138 | } else if (SrcSize == 32) { |
| 1139 | const MCInstrDesc &ReadFirstLaneDesc = |
| 1140 | TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32); |
| 1141 | const TargetRegisterClass *OpRC = TII->getRegClass(MCID: ReadFirstLaneDesc, OpNum: 1); |
| 1142 | BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: ReadFirstLaneDesc, DestReg: DstReg) |
| 1143 | .addReg(RegNo: SrcReg, Flags: {}, SubReg); |
| 1144 | |
| 1145 | const TargetRegisterClass *ConstrainRC = |
| 1146 | SubReg == AMDGPU::NoSubRegister |
| 1147 | ? OpRC |
| 1148 | : TRI->getMatchingSuperRegClass(A: MRI->getRegClass(Reg: SrcReg), B: OpRC, |
| 1149 | Idx: SubReg); |
| 1150 | |
| 1151 | if (!MRI->constrainRegClass(Reg: SrcReg, RC: ConstrainRC)) |
| 1152 | llvm_unreachable("failed to constrain register" ); |
| 1153 | } else { |
| 1154 | auto Result = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), |
| 1155 | MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg); |
| 1156 | int N = TRI->getRegSizeInBits(RC: *SrcRC) / 32; |
| 1157 | for (int i = 0; i < N; i++) { |
| 1158 | Register PartialSrc = TII->buildExtractSubReg( |
| 1159 | MI: Result, MRI&: *MRI, SuperReg: MI->getOperand(i: 1), SuperRC: SrcRC, |
| 1160 | SubIdx: TRI->getSubRegFromChannel(Channel: i), SubRC: &AMDGPU::VGPR_32RegClass); |
| 1161 | Register PartialDst = |
| 1162 | MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass); |
| 1163 | BuildMI(BB&: *MBB, I&: *Result, MIMD: Result->getDebugLoc(), |
| 1164 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: PartialDst) |
| 1165 | .addReg(RegNo: PartialSrc); |
| 1166 | Result.addReg(RegNo: PartialDst).addImm(Val: TRI->getSubRegFromChannel(Channel: i)); |
| 1167 | } |
| 1168 | } |
| 1169 | MI->eraseFromParent(); |
| 1170 | } |
| 1171 | } |
| 1172 | |
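| | // Rewrite copies involving SCC: a copy from SCC becomes a scalar select of |
| | // -1/0, and a copy into SCC becomes an AND of the source with EXEC, which |
| | // defines SCC. |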
| 1173 | void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { |
| 1174 | const AMDGPU::LaneMaskConstants &LMC = |
| 1175 | AMDGPU::LaneMaskConstants::get(ST: MF.getSubtarget<GCNSubtarget>()); |
| 1176 | for (MachineBasicBlock &MBB : MF) { |
| 1177 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
| 1178 | ++I) { |
| 1179 | MachineInstr &MI = *I; |
| 1180 | // May already have been lowered. |
| 1181 | if (!MI.isCopy()) |
| 1182 | continue; |
| 1183 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 1184 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 1185 | if (SrcReg == AMDGPU::SCC) { |
| 1186 | Register SCCCopy = |
| 1187 | MRI->createVirtualRegister(RegClass: TRI->getWaveMaskRegClass()); |
| 1188 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: MachineBasicBlock::iterator(MI)), |
| 1189 | MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: LMC.CSelectOpc), DestReg: SCCCopy) |
| 1190 | .addImm(Val: -1) |
| 1191 | .addImm(Val: 0); |
| 1192 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: I), MIMD: I->getDebugLoc(), |
| 1193 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: DstReg) |
| 1194 | .addReg(RegNo: SCCCopy); |
| 1195 | MI.eraseFromParent(); |
| 1196 | continue; |
| 1197 | } |
| 1198 | if (DstReg == AMDGPU::SCC) { |
| 1199 | Register Tmp = MRI->createVirtualRegister(RegClass: TRI->getBoolRC()); |
| 1200 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: MachineBasicBlock::iterator(MI)), |
| 1201 | MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: LMC.AndOpc)) |
| 1202 | .addReg(RegNo: Tmp, Flags: getDefRegState(B: true)) |
| 1203 | .addReg(RegNo: SrcReg) |
| 1204 | .addReg(RegNo: LMC.ExecReg); |
| 1205 | MI.eraseFromParent(); |
| 1206 | } |
| 1207 | } |
| 1208 | } |
| 1209 | } |
| 1210 | |
| 1211 | PreservedAnalyses |
| 1212 | SIFixSGPRCopiesPass::run(MachineFunction &MF, |
| 1213 | MachineFunctionAnalysisManager &MFAM) { |
| 1214 | MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(IR&: MF); |
| 1215 | SIFixSGPRCopies Impl(&MDT); |
| 1216 | bool Changed = Impl.run(MF); |
| 1217 | if (!Changed) |
| 1218 | return PreservedAnalyses::all(); |
| 1219 | |
| 1220 | // TODO: We could detect CFG changed. |
| 1221 | auto PA = getMachineFunctionPassPreservedAnalyses(); |
| 1222 | return PA; |
| 1223 | } |
| 1224 | |