1//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
11/// Lane mask merging uses same algorithm as SDAG in SILowerI1Copies.
12/// Handles all cases of temporal divergence.
13/// For divergent non-phi i1 and uniform i1 uses outside of the cycle this pass
14/// currently depends on LCSSA to insert phis with one incoming.
15//
16//===----------------------------------------------------------------------===//
17
18#include "AMDGPU.h"
19#include "AMDGPUGlobalISelUtils.h"
20#include "SILowerI1Copies.h"
21#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
22#include "llvm/CodeGen/MachineFunctionPass.h"
23#include "llvm/CodeGen/MachineUniformityAnalysis.h"
24#include "llvm/InitializePasses.h"
25
26#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"
27
28using namespace llvm;
29
30namespace {
31
32class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
33public:
34 static char ID;
35
36public:
37 AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {}
38
39 bool runOnMachineFunction(MachineFunction &MF) override;
40
41 StringRef getPassName() const override {
42 return "AMDGPU GlobalISel divergence lowering";
43 }
44
45 void getAnalysisUsage(AnalysisUsage &AU) const override {
46 AU.setPreservesCFG();
47 AU.addRequired<MachineDominatorTreeWrapperPass>();
48 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
49 AU.addRequired<MachineUniformityAnalysisPass>();
50 MachineFunctionPass::getAnalysisUsage(AU);
51 }
52};
53
54class DivergenceLoweringHelper : public AMDGPU::PhiLoweringHelper {
55public:
56 DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
57 MachinePostDominatorTree *PDT,
58 MachineUniformityInfo *MUI);
59
60private:
61 MachineUniformityInfo *MUI = nullptr;
62 MachineIRBuilder B;
63 Register buildRegCopyToLaneMask(Register Reg);
64
65public:
66 void markAsLaneMask(Register DstReg) const override;
67 void getCandidatesForLowering(
68 SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
69 void collectIncomingValuesFromPhi(
70 const MachineInstr *MI,
71 SmallVectorImpl<AMDGPU::Incoming> &Incomings) const override;
72 void replaceDstReg(Register NewReg, Register OldReg,
73 MachineBasicBlock *MBB) override;
74 void buildMergeLaneMasks(MachineBasicBlock &MBB,
75 MachineBasicBlock::iterator I, const DebugLoc &DL,
76 Register DstReg, Register PrevReg,
77 Register CurReg) override;
78 void constrainAsLaneMask(AMDGPU::Incoming &In) override;
79
80 bool lowerTemporalDivergence();
81 bool lowerTemporalDivergenceI1();
82};
83
84DivergenceLoweringHelper::DivergenceLoweringHelper(
85 MachineFunction *MF, MachineDominatorTree *DT,
86 MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
87 : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}
88
89// _(s1) -> SReg_32/64(s1)
90void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
91 assert(MRI->getType(DstReg) == LLT::scalar(1));
92
93 if (MRI->getRegClassOrNull(Reg: DstReg)) {
94 if (MRI->constrainRegClass(Reg: DstReg, RC: ST->getBoolRC()))
95 return;
96 llvm_unreachable("Failed to constrain register class");
97 }
98
99 MRI->setRegClass(Reg: DstReg, RC: ST->getBoolRC());
100}
101
102void DivergenceLoweringHelper::getCandidatesForLowering(
103 SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
104 LLT S1 = LLT::scalar(SizeInBits: 1);
105
106 // Add divergent i1 G_PHIs to the list. Only consider G_PHI instructions,
107 // not PHI instructions that may have been created by earlier lowering stages
108 // (e.g., lowerTemporalDivergenceI1).
109 for (MachineBasicBlock &MBB : *MF) {
110 for (MachineInstr &MI : MBB.phis()) {
111 if (MI.getOpcode() != TargetOpcode::G_PHI)
112 continue;
113 Register Dst = MI.getOperand(i: 0).getReg();
114 if (MRI->getType(Reg: Dst) == S1 && MUI->isDivergentAtDef(V: Dst))
115 Vreg1Phis.push_back(Elt: &MI);
116 }
117 }
118}
119
120void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
121 const MachineInstr *MI,
122 SmallVectorImpl<AMDGPU::Incoming> &Incomings) const {
123 for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
124 Incomings.emplace_back(Args: MI->getOperand(i).getReg(),
125 Args: MI->getOperand(i: i + 1).getMBB(), Args: Register());
126 }
127}
128
129void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
130 MachineBasicBlock *MBB) {
131 BuildMI(BB&: *MBB, I: MBB->getFirstNonPHI(), MIMD: {}, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: OldReg)
132 .addReg(RegNo: NewReg);
133}
134
135// Copy Reg to new lane mask register, insert a copy after instruction that
136// defines Reg while skipping phis if needed.
137Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
138 Register LaneMask = AMDGPU::createLaneMaskReg(MRI, LaneMaskRegAttrs);
139 MachineInstr *Instr = MRI->getVRegDef(Reg);
140 MachineBasicBlock *MBB = Instr->getParent();
141 B.setInsertPt(MBB&: *MBB, II: MBB->SkipPHIsAndLabels(I: std::next(x: Instr->getIterator())));
142 B.buildCopy(Res: LaneMask, Op: Reg);
143 return LaneMask;
144}
145
146// bb.previous
147// %PrevReg = ...
148//
149// bb.current
150// %CurReg = ...
151//
152// %DstReg - not defined
153//
154// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
155//
156// bb.previous
157// %PrevReg = ...
158// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
159//
160// bb.current
161// %CurReg = ...
162// %CurRegCopy:sreg_32(s1) = COPY %CurReg
163// ...
164// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0
165// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0
166// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg
167//
168// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg
169void DivergenceLoweringHelper::buildMergeLaneMasks(
170 MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
171 Register DstReg, Register PrevReg, Register CurReg) {
172 // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
173 // TODO: check if inputs are constants or results of a compare.
174
175 Register PrevRegCopy = buildRegCopyToLaneMask(Reg: PrevReg);
176 Register CurRegCopy = buildRegCopyToLaneMask(Reg: CurReg);
177 Register PrevMaskedReg = AMDGPU::createLaneMaskReg(MRI, LaneMaskRegAttrs);
178 Register CurMaskedReg = AMDGPU::createLaneMaskReg(MRI, LaneMaskRegAttrs);
179
180 B.setInsertPt(MBB, II: I);
181 B.buildInstr(Opc: LMC->AndN2Opc, DstOps: {PrevMaskedReg}, SrcOps: {PrevRegCopy, LMC->ExecReg});
182 B.buildInstr(Opc: LMC->AndOpc, DstOps: {CurMaskedReg}, SrcOps: {LMC->ExecReg, CurRegCopy});
183 B.buildInstr(Opc: LMC->OrOpc, DstOps: {DstReg}, SrcOps: {PrevMaskedReg, CurMaskedReg});
184}
185
186// GlobalISel has to constrain S1 incoming taken as-is with lane mask register
187// class. Insert a copy of Incoming.Reg to new lane mask inside Incoming.Block,
188// Incoming.Reg becomes that new lane mask.
189void DivergenceLoweringHelper::constrainAsLaneMask(AMDGPU::Incoming &In) {
190 B.setInsertPt(MBB&: *In.Block, II: In.Block->getFirstTerminator());
191
192 auto Copy = B.buildCopy(Res: LLT::scalar(SizeInBits: 1), Op: In.Reg);
193 MRI->setRegClass(Reg: Copy.getReg(Idx: 0), RC: ST->getBoolRC());
194 In.Reg = Copy.getReg(Idx: 0);
195}
196
197void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
198 Register NewReg) {
199 for (MachineOperand &Op : Inst->operands()) {
200 if (Op.isReg() && Op.getReg() == Reg)
201 Op.setReg(NewReg);
202 }
203}
204
205bool DivergenceLoweringHelper::lowerTemporalDivergence() {
206 AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
207 DenseMap<Register, Register> TDCache;
208
209 for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) {
210 if (MRI->getType(Reg) == LLT::scalar(SizeInBits: 1) || MUI->isDivergentAtDef(V: Reg) ||
211 ILMA.isS32S64LaneMask(Reg))
212 continue;
213
214 Register CachedTDCopy = TDCache.lookup(Val: Reg);
215 if (CachedTDCopy) {
216 replaceUsesOfRegInInstWith(Reg, Inst: UseInst, NewReg: CachedTDCopy);
217 continue;
218 }
219
220 MachineInstr *Inst = MRI->getVRegDef(Reg);
221 MachineBasicBlock *MBB = Inst->getParent();
222 B.setInsertPt(MBB&: *MBB, II: MBB->SkipPHIsAndLabels(I: std::next(x: Inst->getIterator())));
223
224 Register VgprReg = MRI->createGenericVirtualRegister(Ty: MRI->getType(Reg));
225 B.buildInstr(Opc: AMDGPU::COPY, DstOps: {VgprReg}, SrcOps: {Reg})
226 .addUse(RegNo: LMC->ExecReg, Flags: RegState::Implicit);
227
228 replaceUsesOfRegInInstWith(Reg, Inst: UseInst, NewReg: VgprReg);
229 TDCache[Reg] = VgprReg;
230 }
231 return false;
232}
233
234bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
235 MachineRegisterInfo::VRegAttrs BoolS1 = {.RCOrRB: ST->getBoolRC(), .Ty: LLT::scalar(SizeInBits: 1)};
236 initializeLaneMaskRegisterAttributes(Attrs: BoolS1);
237 MachineSSAUpdater SSAUpdater(*MF);
238
239 // In case of use outside muliple nested cycles or muliple uses we only need
240 // to merge lane mask across largest relevant cycle.
241 SmallDenseMap<Register, std::pair<const MachineCycle *, Register>> LRCCache;
242 for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
243 if (MRI->getType(Reg) != LLT::scalar(SizeInBits: 1))
244 continue;
245
246 auto [LRCCacheIter, RegNotCached] = LRCCache.try_emplace(Key: Reg);
247 auto &CycleMergedMask = LRCCacheIter->getSecond();
248 const MachineCycle *&CachedLRC = CycleMergedMask.first;
249 if (RegNotCached || LRC->contains(C: CachedLRC)) {
250 CachedLRC = LRC;
251 }
252 }
253
254 for (auto &LRCCacheEntry : LRCCache) {
255 Register Reg = LRCCacheEntry.first;
256 auto &CycleMergedMask = LRCCacheEntry.getSecond();
257 const MachineCycle *Cycle = CycleMergedMask.first;
258
259 Register MergedMask = MRI->createVirtualRegister(RegAttr: BoolS1);
260 SSAUpdater.Initialize(V: MergedMask);
261
262 MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
263 SSAUpdater.AddAvailableValue(BB: MBB, V: MergedMask);
264
265 for (auto Entry : Cycle->getEntries()) {
266 for (MachineBasicBlock *Pred : Entry->predecessors()) {
267 if (!Cycle->contains(Block: Pred)) {
268 B.setInsertPt(MBB&: *Pred, II: Pred->getFirstTerminator());
269 auto ImplDef = B.buildInstr(Opc: AMDGPU::IMPLICIT_DEF, DstOps: {BoolS1}, SrcOps: {});
270 SSAUpdater.AddAvailableValue(BB: Pred, V: ImplDef.getReg(Idx: 0));
271 }
272 }
273 }
274
275 buildMergeLaneMasks(MBB&: *MBB, I: MBB->getFirstTerminator(), DL: {}, DstReg: MergedMask,
276 PrevReg: SSAUpdater.GetValueInMiddleOfBlock(BB: MBB), CurReg: Reg);
277
278 CycleMergedMask.second = MergedMask;
279 }
280
281 for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
282 if (MRI->getType(Reg) != LLT::scalar(SizeInBits: 1))
283 continue;
284
285 replaceUsesOfRegInInstWith(Reg, Inst: UseInst, NewReg: LRCCache.lookup(Val: Reg).second);
286 }
287
288 return false;
289}
290
291} // End anonymous namespace.
292
293INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
294 "AMDGPU GlobalISel divergence lowering", false, false)
295INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
296INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
297INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
298INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
299 "AMDGPU GlobalISel divergence lowering", false, false)
300
301char AMDGPUGlobalISelDivergenceLowering::ID = 0;
302
303char &llvm::AMDGPUGlobalISelDivergenceLoweringID =
304 AMDGPUGlobalISelDivergenceLowering::ID;
305
306FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
307 return new AMDGPUGlobalISelDivergenceLowering();
308}
309
310bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
311 MachineFunction &MF) {
312 MachineDominatorTree &DT =
313 getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
314 MachinePostDominatorTree &PDT =
315 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
316 MachineUniformityInfo &MUI =
317 getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
318
319 DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
320
321 bool Changed = false;
322 // Temporal divergence lowering needs to inspect list of instructions used
323 // outside cycle with divergent exit provided by uniformity analysis. Uniform
324 // instructions from the list require lowering, no instruction is deleted.
325 // Thus it needs to be run before lowerPhis that deletes phis that require
326 // lowering and replaces them with new instructions.
327
328 // Non-i1 temporal divergence lowering.
329 Changed |= Helper.lowerTemporalDivergence();
330 // This covers both uniform and divergent i1s. Lane masks are in sgpr and need
331 // to be updated in each iteration.
332 Changed |= Helper.lowerTemporalDivergenceI1();
333 // Temporal divergence lowering of divergent i1 phi used outside of the cycle
334 // could also be handled by lowerPhis but we do it in lowerTempDivergenceI1
335 // since in some case lowerPhis does unnecessary lane mask merging.
336 Changed |= Helper.lowerPhis();
337 return Changed;
338}
339