//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
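// For example (illustrative, wave64), a cross-block i1 value
//
//   %phi:vreg_1 = PHI %a, %bb.1, %b, %bb.2
//
// becomes an SReg_64 lane mask whose incoming values are combined in the
// predecessor blocks as
//
//   %merged = (%prev & ~EXEC) | (%cur & EXEC)
//
// using S_ANDN2_B64 / S_AND_B64 / S_OR_B64 (see buildMergeLaneMasks).
//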
//===----------------------------------------------------------------------===//

#include "SILowerI1Copies.h"
#include "AMDGPU.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);

namespace {

class Vreg1LoweringHelper : public PhiLoweringHelper {
public:
  Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                      MachinePostDominatorTree *PDT);

private:
  DenseSet<Register> ConstrainRegs;

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

  bool lowerCopiesFromI1();
  bool lowerCopiesToI1();
  bool cleanConstrainRegs(bool Changed);
  bool isVreg1(Register Reg) const {
    return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }
};

Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
                                         MachineDominatorTree *DT,
                                         MachinePostDominatorTree *PDT)
    : PhiLoweringHelper(MF, DT, PDT) {}

bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
  assert(Changed || ConstrainRegs.empty());
  for (Register Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return Changed;
}

/// Helper class that analyzes the relationship between the incoming values of
/// a phi in the control flow graph to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
/// - Determine all basic blocks which, starting from the incoming blocks,
///   a wave may reach before entering the def block (the block containing the
///   phi).
/// - If an incoming block has no predecessors in this set, we can take the
///   incoming value as a scalar lane mask as-is.
/// -- A special case of this is when the def block has a self-loop.
/// - Otherwise, the incoming value needs to be merged with a previously
///   defined lane mask.
/// - If there is a path into the set of reachable blocks that does _not_ go
///   through an incoming block where we can take the scalar lane mask as-is,
///   we need to invent an available value for the SSAUpdater. Choices are
///   0 and undef, with differing consequences for how to merge values etc.
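///
/// Example (illustrative): for a phi in D with incoming blocks B and C, where
/// B ends in a divergent branch with successors C and D, a wave may execute C
/// after B before reaching D. C then has a predecessor (B) in the reachable
/// set, so its incoming value must be merged with the previously defined lane
/// mask, while the incoming value from B can be taken as-is.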
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
/// the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;
  const SIInstrInfo *TII;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  MapVector<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT, const SIInstrInfo *TII)
      : PDT(PDT), TII(TII) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock, ArrayRef<Incoming> Incomings) {
    assert(Stack.empty());
    ReachableMap.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);

    for (auto Incoming : Incomings) {
      MachineBasicBlock *MBB = Incoming.Block;
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      if (TII->hasDivergentBranch(MBB) && PDT.dominates(&DefBlock, MBB))
        append_range(Stack, MBB->successors());
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (ReachableMap.try_emplace(MBB, false).second)
        append_range(Stack, MBB->successors());
    }

    for (auto &[MBB, Reachable] : ReachableMap) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        Reachable = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (!llvm::is_contained(Predecessors, UnreachablePred))
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
/// relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
/// the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      MachineRegisterInfo &MRI,
                      MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs,
                      ArrayRef<Incoming> Incomings = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (auto &Incoming : Incomings)
      Dom = DT.findNearestCommonDominator(Dom, Incoming.Block);

    if (!inLoopLevel(*Dom, LoopLevel, Incomings)) {
      SSAUpdater.AddAvailableValue(
          Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Incomings))
          SSAUpdater.AddAvailableValue(
              Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<Incoming> Incomings) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    for (auto &Incoming : Incomings)
      if (Incoming.Block == &MBB)
        return true;

    return false;
  }

  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

Register
llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
                        MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  return MRI->createVirtualRegister(LaneMaskRegAttrs);
}

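/// Insert an IMPLICIT_DEF of a fresh lane mask register before the
/// terminators of \p MBB and return the defined register.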
static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  MachineFunction &MF = *MBB->getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  Register UndefReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

bool Vreg1LoweringHelper::lowerCopiesFromI1() {
  bool Changed = false;
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      Changed = true;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
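      // Per-lane select between the constants 0 and -1:
      // DstReg = SrcReg[lane] ? -1 : 0.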
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF,
                                     MachineDominatorTree *DT,
                                     MachinePostDominatorTree *PDT)
    : MF(MF), DT(DT), PDT(PDT) {
  MRI = &MF->getRegInfo();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }
}

bool PhiLoweringHelper::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT, TII);
  SmallVector<MachineInstr *, 4> Vreg1Phis;
  SmallVector<Incoming, 4> Incomings;

  getCandidatesForLowering(Vreg1Phis);
  if (Vreg1Phis.empty())
    return false;

  DT->updateDFSNumbers();
  MachineBasicBlock *PrevMBB = nullptr;
  for (MachineInstr *MI : Vreg1Phis) {
    MachineBasicBlock &MBB = *MI->getParent();
    if (&MBB != PrevMBB) {
      LF.initialize(MBB);
      PrevMBB = &MBB;
    }

    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

    Register DstReg = MI->getOperand(0).getReg();
    markAsLaneMask(DstReg);
    initializeLaneMaskRegisterAttributes(DstReg);

    collectIncomingValuesFromPhi(MI, Incomings);

    // Sort the incomings such that incoming values that dominate other
    // incoming values are sorted earlier. This allows us to do some amount of
    // on-the-fly constant folding. An incoming with a smaller DFSNumIn goes
    // first; DFSNumIn is 0 for the entry block.
    llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) {
      return DT->getNode(LHS.Block)->getDFSNumIn() <
             DT->getNode(RHS.Block)->getDFSNumIn();
    });

#ifndef NDEBUG
    PhiRegisters.insert(DstReg);
#endif

    // Phis in a loop that are observed outside the loop receive a simple but
    // conservatively correct treatment.
    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
    for (MachineInstr &Use : MRI->use_instructions(DstReg))
      DomBlocks.push_back(Use.getParent());

    MachineBasicBlock *PostDomBound =
        PDT->findNearestCommonDominator(DomBlocks);

    // FIXME: This fails to find irreducible cycles. If we have a def (other
    // than a constant) in a pair of blocks that end up looping back to each
    // other, it will be mishandled. Due to structurization this shouldn't
    // occur in practice.
    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

    SSAUpdater.Initialize(DstReg);

    if (FoundLoopLevel) {
      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs,
                        Incomings);

      for (auto &Incoming : Incomings) {
        Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
        SSAUpdater.AddAvailableValue(Incoming.Block, Incoming.UpdatedReg);
      }

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
      }
    } else {
      // The phi is not observed from outside a loop. Use a more accurate
      // lowering.
      PIA.analyze(MBB, Incomings);

      for (MachineBasicBlock *MBB : PIA.predecessors())
        SSAUpdater.AddAvailableValue(
            MBB, insertUndefLaneMask(MBB, MRI, LaneMaskRegAttrs));

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        if (PIA.isSource(IMBB)) {
          constrainAsLaneMask(Incoming);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
        } else {
          Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.UpdatedReg);
        }
      }

      for (auto &Incoming : Incomings) {
        if (!Incoming.UpdatedReg.isValid())
          continue;

        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
      }
    }

    Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
    if (NewReg != DstReg) {
      replaceDstReg(NewReg, DstReg, &MBB);
      MI->eraseFromParent();
    }

    Incomings.clear();
  }
  return true;
}

bool Vreg1LoweringHelper::lowerCopiesToI1() {
  bool Changed = false;
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      Changed = true;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      markAsLaneMask(DstReg);
      initializeLaneMaskRegisterAttributes(DstReg);

      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!SrcReg.isVirtual() ||
          (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
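        // The source is a plain 32-bit value; materialize the lane mask via a
        // compare: TmpReg = (SrcReg != 0) per lane.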
        Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      } else {
        // SrcReg needs to be live beyond the copy.
        MI.getOperand(1).setIsKill(false);
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

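/// Check whether \p Reg holds a constant lane mask, looking through copies of
/// lane mask registers. On success, \p Val is set to false for all-zeros and
/// true for all-ones; an IMPLICIT_DEF also counts as constant, with \p Val
/// left unchanged.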
bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
      return true;

    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Reg.isVirtual())
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
PhiLoweringHelper::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY.
  llvm_unreachable("SCC used by terminator but no def in block");
}

// VReg_1 -> SReg_32 or SReg_64
void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const {
  MRI->setRegClass(DstReg, ST->getBoolRC());
}

void Vreg1LoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      if (isVreg1(MI.getOperand(0).getReg()))
        Vreg1Phis.push_back(&MI);
    }
  }
}

void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    assert(i + 1 < MI->getNumOperands());
    Register IncomingReg = MI->getOperand(i).getReg();
    MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
    MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

    if (IncomingDef->getOpcode() == AMDGPU::COPY) {
      IncomingReg = IncomingDef->getOperand(1).getReg();
      assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
      assert(!IncomingDef->getOperand(1).getSubReg());
    } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
      continue;
    } else {
      assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
    }

    Incomings.emplace_back(IncomingReg, IncomingMBB, Register());
  }
}

void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                        MachineBasicBlock *MBB) {
  MRI->replaceRegWith(NewReg, OldReg);
}

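/// Merge lane mask values at a block boundary, constant-folding when either
/// input is known to be all-zeros or all-ones. The generic case computes
///
///   DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC)
///
/// so that lanes active in this block take their value from CurReg and all
/// other lanes keep PrevReg.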
void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                              MachineBasicBlock::iterator I,
                                              const DebugLoc &DL,
                                              Register DstReg, Register PrevReg,
                                              Register CurReg) {
  bool PrevVal = false;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal = false;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  Register PrevMaskedReg;
  Register CurMaskedReg;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}

void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, which can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
static bool runFixI1Copies(MachineFunction &MF, MachineDominatorTree &MDT,
                           MachinePostDominatorTree &MPDT) {
  // Only needed on the SelectionDAG path; skip if instruction selection has
  // already run.
  if (MF.getProperties().hasSelected())
    return false;

  Vreg1LoweringHelper Helper(&MF, &MDT, &MPDT);
  bool Changed = false;
  Changed |= Helper.lowerCopiesFromI1();
  Changed |= Helper.lowerPhis();
  Changed |= Helper.lowerCopiesToI1();
  return Helper.cleanConstrainRegs(Changed);
}

PreservedAnalyses
SILowerI1CopiesPass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
  MachinePostDominatorTree &MPDT =
      MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
  bool Changed = runFixI1Copies(MF, MDT, MPDT);
  if (!Changed)
    return PreservedAnalyses::all();

  // TODO: Probably preserves most.
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

class SILowerI1CopiesLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILowerI1CopiesLegacy() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

bool SILowerI1CopiesLegacy::runOnMachineFunction(MachineFunction &MF) {
  MachineDominatorTree &MDT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &MPDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  return runFixI1Copies(MF, MDT, MPDT);
}

INITIALIZE_PASS_BEGIN(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                    false, false)

char SILowerI1CopiesLegacy::ID = 0;

char &llvm::SILowerI1CopiesLegacyID = SILowerI1CopiesLegacy::ID;

FunctionPass *llvm::createSILowerI1CopiesLegacyPass() {
  return new SILowerI1CopiesLegacy();
}