//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
/// Lane mask merging uses the same algorithm as SDAG in SILowerI1Copies.
/// Handles all cases of temporal divergence.
/// For divergent non-phi i1 and for uniform i1 uses outside of the cycle, this
/// pass currently depends on LCSSA to insert phis with one incoming value.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "SILowerI1Copies.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"

using namespace llvm;

namespace {

class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU GlobalISel divergence lowering";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addRequired<MachineUniformityAnalysisPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class DivergenceLoweringHelper : public PhiLoweringHelper {
public:
  DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                           MachinePostDominatorTree *PDT,
                           MachineUniformityInfo *MUI);

private:
  MachineUniformityInfo *MUI = nullptr;
  MachineIRBuilder B;
  Register buildRegCopyToLaneMask(Register Reg);

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

  bool lowerTemporalDivergence();
  bool lowerTemporalDivergenceI1();
};

DivergenceLoweringHelper::DivergenceLoweringHelper(
    MachineFunction *MF, MachineDominatorTree *DT,
    MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
    : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}

// Give DstReg the lane mask register class (SReg_32/64) while keeping the s1
// LLT: _(s1) -> SReg_32/64(s1).
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
  assert(MRI->getType(DstReg) == LLT::scalar(1));

  if (MRI->getRegClassOrNull(DstReg)) {
    if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
      return;
    llvm_unreachable("Failed to constrain register class");
  }

  MRI->setRegClass(DstReg, ST->getBoolRC());
}

void DivergenceLoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  LLT S1 = LLT::scalar(1);

  // Add divergent i1 phis to the list.
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      Register Dst = MI.getOperand(0).getReg();
      if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
        Vreg1Phis.push_back(&MI);
    }
  }
}

void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  // PHI operands are (dst, val0, bb0, val1, bb1, ...); skip the destination
  // and collect each (value, block) pair.
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    Incomings.emplace_back(MI->getOperand(i).getReg(),
                           MI->getOperand(i + 1).getMBB(), Register());
  }
}

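// Assign NewReg's value to OldReg by inserting a COPY at the first non-phi
// point of MBB. The phi that originally defined OldReg is expected to be
// erased by the caller once lowering is done.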
void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                             MachineBasicBlock *MBB) {
  BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
      .addReg(NewReg);
}

// Copy Reg to a new lane mask register; insert the copy after the instruction
// that defines Reg, skipping phis if needed.
Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
  Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  MachineInstr *Instr = MRI->getVRegDef(Reg);
  MachineBasicBlock *MBB = Instr->getParent();
  B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));
  B.buildCopy(LaneMask, Reg);
  return LaneMask;
}

// bb.previous
// %PrevReg = ...
//
// bb.current
// %CurReg = ...
//
// %DstReg - not defined
//
// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
//
// bb.previous
// %PrevReg = ...
// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
//
// bb.current
// %CurReg = ...
// %CurRegCopy:sreg_32(s1) = COPY %CurReg
// ...
// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, %ExecReg - set active lanes to 0
// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, %CurRegCopy - set inactive lanes to 0
// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, %CurMaskedReg
//
// DstReg: for active lanes, replace the bit from PrevReg with the bit from
// CurReg; inactive lanes keep the bit from PrevReg.
void DivergenceLoweringHelper::buildMergeLaneMasks(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
    Register DstReg, Register PrevReg, Register CurReg) {
  // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
  // TODO: check if inputs are constants or results of a compare.

  Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
  Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
  Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);

  B.setInsertPt(MBB, I);
  B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
  B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
  B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
}

// GlobalISel has to constrain an S1 incoming value that is taken as-is with
// the lane mask register class. Insert a copy of Incoming.Reg to a new lane
// mask register inside Incoming.Block; Incoming.Reg becomes that new lane
// mask.
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
  B.setInsertPt(*In.Block, In.Block->getFirstTerminator());

  auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
  MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
  In.Reg = Copy.getReg(0);
}

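// Rewrite every operand of Inst that reads Reg to read NewReg instead.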
void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
                                Register NewReg) {
  for (MachineOperand &Op : Inst->operands()) {
    if (Op.isReg() && Op.getReg() == Reg)
      Op.setReg(NewReg);
  }
}

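// Non-i1 temporal divergence lowering: a register that is uniform inside a
// cycle but used after a divergent cycle exit has to be copied inside the
// cycle, so that the use outside reads the value from the iteration in which
// each lane left the cycle. The COPY is inserted right after the defining
// instruction with an implicit use of $exec, which makes it depend on the
// exec mask and keeps it from being moved across exec-mask changes. The use
// outside the cycle is rewritten to read the copy, and TDCache ensures each
// register is copied at most once. i1 values, already-divergent values, and
// lane masks produced by intrinsics are skipped here; i1 is handled by
// lowerTemporalDivergenceI1.
//
// Sketch of the rewrite:
//
// bb.inside.cycle:
//   %Reg = ...
//   %VgprReg = COPY %Reg, implicit $exec
//   ... divergent exit ...
//
// bb.after.cycle:
//   use %VgprReg instead of %Reg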
bool DivergenceLoweringHelper::lowerTemporalDivergence() {
  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
  DenseMap<Register, Register> TDCache;

  for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
        ILMA.isS32S64LaneMask(Reg))
      continue;

    Register CachedTDCopy = TDCache.lookup(Reg);
    if (CachedTDCopy) {
      replaceUsesOfRegInInstWith(Reg, UseInst, CachedTDCopy);
      continue;
    }

    MachineInstr *Inst = MRI->getVRegDef(Reg);
    MachineBasicBlock *MBB = Inst->getParent();
    B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));

    Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
    B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
        .addUse(ExecReg, RegState::Implicit);

    replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
    TDCache[Reg] = VgprReg;
  }
  return false;
}

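// i1 temporal divergence lowering: an i1 defined inside a cycle and used
// after a divergent exit becomes a lane mask that is re-merged on every
// iteration, so that each lane keeps the bit from the iteration in which it
// left the cycle. MachineSSAUpdater threads the merged mask around the
// largest relevant cycle, and uses outside the cycle are rewritten to read
// the merged mask.
//
// Sketch (wave32; the merge expands to ANDN2/AND/OR, see buildMergeLaneMasks):
//
// bb.inside.cycle:
//   %Reg:_(s1) = ...
//   %MergedMask:sreg_32(s1) = <merge lane masks>(%PrevMergedMask, %Reg)
//   ... divergent exit ...
//
// bb.after.cycle:
//   use %MergedMask instead of %Reg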
bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
  initializeLaneMaskRegisterAttributes(BoolS1);
  MachineSSAUpdater SSAUpdater(*MF);

  // In case of a use outside multiple nested cycles, or of multiple uses, we
  // only need to merge the lane mask across the largest relevant cycle.
  SmallDenseMap<Register, std::pair<const MachineCycle *, Register>> LRCCache;
  for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    auto [LRCCacheIter, RegNotCached] = LRCCache.try_emplace(Reg);
    auto &CycleMergedMask = LRCCacheIter->getSecond();
    const MachineCycle *&CachedLRC = CycleMergedMask.first;
    if (RegNotCached || LRC->contains(CachedLRC))
      CachedLRC = LRC;
  }

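  // For each cached register, build its merged mask at the end of the
  // defining block, merging the previous iteration's mask (obtained through
  // the SSA updater) with Reg.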
  for (auto &LRCCacheEntry : LRCCache) {
    Register Reg = LRCCacheEntry.first;
    auto &CycleMergedMask = LRCCacheEntry.getSecond();
    const MachineCycle *Cycle = CycleMergedMask.first;

    Register MergedMask = MRI->createVirtualRegister(BoolS1);
    SSAUpdater.Initialize(MergedMask);

    MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
    SSAUpdater.AddAvailableValue(MBB, MergedMask);

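    // Lanes entering the cycle from outside carry no previous merged value;
    // seed every out-of-cycle predecessor of a cycle entry with an
    // IMPLICIT_DEF so the SSA updater has a value available there.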
    for (auto *Entry : Cycle->getEntries()) {
      for (MachineBasicBlock *Pred : Entry->predecessors()) {
        if (!Cycle->contains(Pred)) {
          B.setInsertPt(*Pred, Pred->getFirstTerminator());
          auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
          SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
        }
      }
    }

    buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
                        SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);

    CycleMergedMask.second = MergedMask;
  }

  for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache.lookup(Reg).second);
  }

  return false;
}

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                      "AMDGPU GlobalISel divergence lowering", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                    "AMDGPU GlobalISel divergence lowering", false, false)

char AMDGPUGlobalISelDivergenceLowering::ID = 0;

char &llvm::AMDGPUGlobalISelDivergenceLoweringID =
    AMDGPUGlobalISelDivergenceLowering::ID;

FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
  return new AMDGPUGlobalISelDivergenceLowering();
}

bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
    MachineFunction &MF) {
  MachineDominatorTree &DT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &PDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  MachineUniformityInfo &MUI =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();

  DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);

  bool Changed = false;
  // Temporal divergence lowering needs to inspect the list, provided by the
  // uniformity analysis, of instructions used outside of a cycle with a
  // divergent exit. Uniform instructions from that list require lowering; no
  // instruction is deleted. It therefore has to run before lowerPhis, which
  // deletes the phis that require lowering and replaces them with new
  // instructions.

  // Non-i1 temporal divergence lowering.
  Changed |= Helper.lowerTemporalDivergence();
  // This covers both uniform and divergent i1s. Lane masks are in SGPRs and
  // need to be updated on each iteration.
  Changed |= Helper.lowerTemporalDivergenceI1();
  // Temporal divergence lowering of a divergent i1 phi used outside of the
  // cycle could also be handled by lowerPhis, but we do it in
  // lowerTemporalDivergenceI1 since in some cases lowerPhis does unnecessary
  // lane mask merging.
  Changed |= Helper.lowerPhis();
  return Changed;
}