//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
/// Lane mask merging uses the same algorithm as SDAG in SILowerI1Copies.
/// Handles all cases of temporal divergence.
/// For divergent non-phi i1 values and for uniform i1 values used outside of
/// the cycle, this pass currently depends on LCSSA to insert phis with a
/// single incoming value.
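///
/// Illustrative sketch (simplified MIR, wave32; register names are invented
/// and not actual pass output): a divergent
///   %phi:_(s1) = G_PHI %a(s1), %bb.A, %b(s1), %bb.B
/// becomes a lane mask phi
///   %phi:sreg_32(s1) = G_PHI %a_mask(s1), %bb.A, %b_mask(s1), %bb.B
/// where the incoming lane masks %a_mask and %b_mask are merged with EXEC in
/// their incoming blocks (see buildMergeLaneMasks below).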
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "SILowerI1Copies.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"

using namespace llvm;

namespace {

class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU GlobalISel divergence lowering";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addRequired<MachineUniformityAnalysisPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class DivergenceLoweringHelper : public PhiLoweringHelper {
public:
  DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                           MachinePostDominatorTree *PDT,
                           MachineUniformityInfo *MUI);

private:
  MachineUniformityInfo *MUI = nullptr;
  MachineIRBuilder B;

  Register buildRegCopyToLaneMask(Register Reg);

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

  bool lowerTemporalDivergence();
  bool lowerTemporalDivergenceI1();
};

DivergenceLoweringHelper::DivergenceLoweringHelper(
    MachineFunction *MF, MachineDominatorTree *DT,
    MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
    : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}

// Mark DstReg as a lane mask register: _(s1) -> SReg_32/64(s1), using the
// target's boolean register class (SReg_32 for wave32, SReg_64 for wave64).
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
  assert(MRI->getType(DstReg) == LLT::scalar(1));

  if (MRI->getRegClassOrNull(DstReg)) {
    if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
      return;
    llvm_unreachable("Failed to constrain register class");
  }

  MRI->setRegClass(DstReg, ST->getBoolRC());
}

void DivergenceLoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  LLT S1 = LLT::scalar(1);

  // Add divergent i1 phis to the list.
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      Register Dst = MI.getOperand(0).getReg();
      if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
        Vreg1Phis.push_back(&MI);
    }
  }
}

void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    Incomings.emplace_back(MI->getOperand(i).getReg(),
                           MI->getOperand(i + 1).getMBB(), Register());
  }
}

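// Redefine the original phi result OldReg as a copy of the merged lane mask
// NewReg, inserted after the phis in MBB, so all existing uses of OldReg read
// the merged lane mask.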
void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                             MachineBasicBlock *MBB) {
  BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
      .addReg(NewReg);
}

// Copy Reg to a new lane mask register; insert the copy right after the
// instruction that defines Reg, skipping phis and labels if needed.
Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
  Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  MachineInstr *Instr = MRI->getVRegDef(Reg);
  MachineBasicBlock *MBB = Instr->getParent();
  B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));
  B.buildCopy(LaneMask, Reg);
  return LaneMask;
}

// bb.previous:
//   %PrevReg = ...
//
// bb.current:
//   %CurReg = ...
//
// %DstReg - not defined
//
// -> (wave32 example; new registers have the sreg_32 register class and an
//     S1 LLT)
//
// bb.previous:
//   %PrevReg = ...
//   %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
//
// bb.current:
//   %CurReg = ...
//   %CurRegCopy:sreg_32(s1) = COPY %CurReg
//   ...
//   ; clear the bits of the active lanes in the previous mask:
//   %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, %ExecReg
//   ; clear the bits of the inactive lanes in the current mask:
//   %CurMaskedReg:sreg_32(s1) = AND %ExecReg, %CurRegCopy
//   %DstReg:sreg_32(s1) = OR %PrevMaskedReg, %CurMaskedReg
//
// That is, DstReg takes the bit from CurReg for each active lane and keeps
// the bit from PrevReg for each inactive lane.
void DivergenceLoweringHelper::buildMergeLaneMasks(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
    Register DstReg, Register PrevReg, Register CurReg) {
  // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
  // TODO: check if inputs are constants or results of a compare.

  Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
  Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
  Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);

  B.setInsertPt(MBB, I);
  B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
  B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
  B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
}

// GlobalISel has to constrain an incoming S1 value that is taken as-is to
// the lane mask register class. Insert a copy of Incoming.Reg to a new lane
// mask register inside Incoming.Block and make Incoming.Reg that new lane
// mask.
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
  B.setInsertPt(*In.Block, In.Block->getFirstTerminator());

  auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
  MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
  In.Reg = Copy.getReg(0);
}

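// Replace all uses of Reg in the single instruction Inst with NewReg; uses in
// other instructions are left unchanged.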
void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
                                Register NewReg) {
  for (MachineOperand &Op : Inst->operands()) {
    if (Op.isReg() && Op.getReg() == Reg)
      Op.setReg(NewReg);
  }
}

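// Lower temporal divergence for non-i1 values: an SGPR value that is defined
// inside a cycle with divergent exits and used outside of it is copied to a
// VGPR right after its definition. The copy executes under EXEC on every
// iteration, so once a lane exits the cycle, its VGPR lane keeps the value
// from that lane's last active iteration; the implicit use of EXEC records
// this dependence. Repeated uses of the same register reuse the cached copy.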
bool DivergenceLoweringHelper::lowerTemporalDivergence() {
  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
  DenseMap<Register, Register> TDCache;
  bool Changed = false;

  for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
        ILMA.isS32S64LaneMask(Reg))
      continue;

    Changed = true;
    Register CachedTDCopy = TDCache.lookup(Reg);
    if (CachedTDCopy) {
      replaceUsesOfRegInInstWith(Reg, UseInst, CachedTDCopy);
      continue;
    }

    MachineInstr *Inst = MRI->getVRegDef(Reg);
    MachineBasicBlock *MBB = Inst->getParent();
    B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));

    Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
    B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
        .addUse(ExecReg, RegState::Implicit);

    replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
    TDCache[Reg] = VgprReg;
  }
  return Changed;
}

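// Lower temporal divergence for i1 values: build, with SSAUpdater, a lane
// mask that accumulates each lane's last value of Reg across the largest
// relevant cycle (merging with EXEC on every iteration), and rewrite the
// uses outside of the cycle to read the merged mask instead of Reg.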
bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
  initializeLaneMaskRegisterAttributes(BoolS1);
  MachineSSAUpdater SSAUpdater(*MF);

  // In case of a use outside multiple nested cycles, or of multiple uses, we
  // only need to merge the lane mask across the largest relevant cycle.
  SmallDenseMap<Register, std::pair<const MachineCycle *, Register>> LRCCache;
  for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    auto [LRCCacheIter, RegNotCached] = LRCCache.try_emplace(Reg);
    auto &CycleMergedMask = LRCCacheIter->getSecond();
    const MachineCycle *&CachedLRC = CycleMergedMask.first;
    if (RegNotCached || LRC->contains(CachedLRC))
      CachedLRC = LRC;
  }

  for (auto &LRCCacheEntry : LRCCache) {
    Register Reg = LRCCacheEntry.first;
    auto &CycleMergedMask = LRCCacheEntry.getSecond();
    const MachineCycle *Cycle = CycleMergedMask.first;

    Register MergedMask = MRI->createVirtualRegister(BoolS1);
    SSAUpdater.Initialize(MergedMask);

    MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
    SSAUpdater.AddAvailableValue(MBB, MergedMask);
    for (auto *Entry : Cycle->getEntries()) {
      for (MachineBasicBlock *Pred : Entry->predecessors()) {
        if (!Cycle->contains(Pred)) {
          B.setInsertPt(*Pred, Pred->getFirstTerminator());
          auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
          SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
        }
      }
    }

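    // At the end of the def block, merge Reg into the running mask: active
    // lanes take the bit from Reg, inactive lanes keep their previously
    // merged bit.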
    buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
                        SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);

    CycleMergedMask.second = MergedMask;
  }

  for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache.lookup(Reg).second);
  }

  return !LRCCache.empty();
}

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                      "AMDGPU GlobalISel divergence lowering", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                    "AMDGPU GlobalISel divergence lowering", false, false)

char AMDGPUGlobalISelDivergenceLowering::ID = 0;

char &llvm::AMDGPUGlobalISelDivergenceLoweringID =
    AMDGPUGlobalISelDivergenceLowering::ID;

FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
  return new AMDGPUGlobalISelDivergenceLowering();
}

bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
    MachineFunction &MF) {
  MachineDominatorTree &DT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &PDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  MachineUniformityInfo &MUI =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();

  DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);

  bool Changed = false;
  // Temporal divergence lowering needs to inspect the list, provided by
  // uniformity analysis, of instructions used outside of a cycle with a
  // divergent exit. Uniform instructions from that list require lowering, and
  // no instruction is deleted. It therefore has to run before lowerPhis,
  // which deletes the phis that require lowering and replaces them with new
  // instructions.

  // Non-i1 temporal divergence lowering.
  Changed |= Helper.lowerTemporalDivergence();
  // This covers both uniform and divergent i1 values. Lane masks are in
  // SGPRs and need to be updated on each iteration.
  Changed |= Helper.lowerTemporalDivergenceI1();
  // Temporal divergence lowering of a divergent i1 phi used outside of the
  // cycle could also be handled by lowerPhis, but we do it in
  // lowerTemporalDivergenceI1 since in some cases lowerPhis would do
  // unnecessary lane mask merging.
  Changed |= Helper.lowerPhis();
  return Changed;
}