1 | //===-- MVETPAndVPTOptimisationsPass.cpp ----------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file This pass does a few optimisations related to Tail predicated loops |
10 | /// and MVE VPT blocks before register allocation is performed. For VPT blocks |
11 | /// the goal is to maximize the sizes of the blocks that will be created by the |
12 | /// MVE VPT Block Insertion pass (which runs after register allocation). For |
13 | /// tail predicated loops we transform the loop into something that will |
14 | /// hopefully make the backend ARMLowOverheadLoops pass's job easier. |
15 | /// |
16 | //===----------------------------------------------------------------------===// |
17 | |
18 | #include "ARM.h" |
19 | #include "ARMSubtarget.h" |
20 | #include "MCTargetDesc/ARMBaseInfo.h" |
21 | #include "MVETailPredUtils.h" |
22 | #include "Thumb2InstrInfo.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/CodeGen/MachineBasicBlock.h" |
25 | #include "llvm/CodeGen/MachineDominators.h" |
26 | #include "llvm/CodeGen/MachineFunction.h" |
27 | #include "llvm/CodeGen/MachineFunctionPass.h" |
28 | #include "llvm/CodeGen/MachineInstr.h" |
29 | #include "llvm/CodeGen/MachineLoopInfo.h" |
30 | #include "llvm/InitializePasses.h" |
31 | #include "llvm/Support/Debug.h" |
32 | #include <cassert> |
33 | |
34 | using namespace llvm; |
35 | |
36 | #define DEBUG_TYPE "arm-mve-vpt-opts" |
37 | |
// Command-line override consulted by LowerWhileLoopStart and MergeLoopEnd:
// when false, t2LoopEnd/t2LoopDec are never merged into a t2LoopEndDec and
// t2WhileLoopSetup loops are reverted instead.
static cl::opt<bool>
MergeEndDec("arm-enable-merge-loopenddec" , cl::Hidden,
    cl::desc("Enable merging Loop End and Dec instructions." ),
    cl::init(Val: true));

// Command-line override consulted by ConvertTailPredLoop: when false, the
// vector-predicate operands of the loop's MVE instructions are left untouched
// instead of being rewritten to use the loop-iteration PHI value (LR).
static cl::opt<bool>
SetLRPredicate("arm-set-lr-predicate" , cl::Hidden,
    cl::desc("Enable setting lr as a predicate in tail predication regions." ),
    cl::init(Val: true));
47 | |
namespace {
// Pre-RA machine-function pass implementing the tail-predication and VPT
// optimisations described in the file comment. The work is done by the
// private per-loop / per-block helpers declared below.
class MVETPAndVPTOptimisations : public MachineFunctionPass {
public:
  static char ID;
  // Per-function cached pointers used by the helpers. Presumably initialised
  // in runOnMachineFunction, which is defined outside this chunk — confirm.
  const Thumb2InstrInfo *TII;
  MachineRegisterInfo *MRI;

  MVETPAndVPTOptimisations() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &Fn) override;

  // Machine loop info and the dominator tree are both required and preserved
  // by this pass.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addPreserved<MachineLoopInfoWrapperPass>();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return "ARM MVE TailPred and VPT Optimisation Pass" ;
  }

private:
  // Loop-level transformations (see the definitions below for details).
  bool LowerWhileLoopStart(MachineLoop *ML);
  bool MergeLoopEnd(MachineLoop *ML);
  bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT);
  // Block-level VPT/VCCR optimisations.
  MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
                                            MachineInstr &Instr,
                                            MachineOperand &User,
                                            Register Target);
  bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
  bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
  bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
  bool ConvertVPSEL(MachineBasicBlock &MBB);
  bool HintDoLoopStartReg(MachineBasicBlock &MBB);
  // First parameter is the loop preheader (name omitted in this declaration;
  // see the out-of-line definition).
  MachineInstr *CheckForLRUseInPredecessors(MachineBasicBlock *,
                                            MachineInstr *LoopStart);
};

char MVETPAndVPTOptimisations::ID = 0;

} // end anonymous namespace
91 | |
// Register the pass and its analysis dependencies with the legacy pass
// registry (the dependencies match getAnalysisUsage above).
INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE,
                      "ARM MVE TailPred and VPT Optimisations pass" , false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE,
                    "ARM MVE TailPred and VPT Optimisations pass" , false, false)
99 | |
100 | static MachineInstr *LookThroughCOPY(MachineInstr *MI, |
101 | MachineRegisterInfo *MRI) { |
102 | while (MI && MI->getOpcode() == TargetOpcode::COPY && |
103 | MI->getOperand(i: 1).getReg().isVirtual()) |
104 | MI = MRI->getVRegDef(Reg: MI->getOperand(i: 1).getReg()); |
105 | return MI; |
106 | } |
107 | |
108 | // Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and |
109 | // corresponding PHI that make up a low overhead loop. Only handles 'do' loops |
110 | // at the moment, returning a t2DoLoopStart in LoopStart. |
111 | static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI, |
112 | MachineInstr *&LoopStart, MachineInstr *&LoopPhi, |
113 | MachineInstr *&LoopDec, MachineInstr *&LoopEnd) { |
114 | MachineBasicBlock * = ML->getHeader(); |
115 | MachineBasicBlock *Latch = ML->getLoopLatch(); |
116 | if (!Header || !Latch) { |
117 | LLVM_DEBUG(dbgs() << " no Loop Latch or Header\n" ); |
118 | return false; |
119 | } |
120 | |
121 | // Find the loop end from the terminators. |
122 | LoopEnd = nullptr; |
123 | for (auto &T : Latch->terminators()) { |
124 | if (T.getOpcode() == ARM::t2LoopEnd && T.getOperand(i: 1).getMBB() == Header) { |
125 | LoopEnd = &T; |
126 | break; |
127 | } |
128 | if (T.getOpcode() == ARM::t2LoopEndDec && |
129 | T.getOperand(i: 2).getMBB() == Header) { |
130 | LoopEnd = &T; |
131 | break; |
132 | } |
133 | } |
134 | if (!LoopEnd) { |
135 | LLVM_DEBUG(dbgs() << " no LoopEnd\n" ); |
136 | return false; |
137 | } |
138 | LLVM_DEBUG(dbgs() << " found loop end: " << *LoopEnd); |
139 | |
140 | // Find the dec from the use of the end. There may be copies between |
141 | // instructions. We expect the loop to loop like: |
142 | // $vs = t2DoLoopStart ... |
143 | // loop: |
144 | // $vp = phi [ $vs ], [ $vd ] |
145 | // ... |
146 | // $vd = t2LoopDec $vp |
147 | // ... |
148 | // t2LoopEnd $vd, loop |
149 | if (LoopEnd->getOpcode() == ARM::t2LoopEndDec) |
150 | LoopDec = LoopEnd; |
151 | else { |
152 | LoopDec = |
153 | LookThroughCOPY(MI: MRI->getVRegDef(Reg: LoopEnd->getOperand(i: 0).getReg()), MRI); |
154 | if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) { |
155 | LLVM_DEBUG(dbgs() << " didn't find LoopDec where we expected!\n" ); |
156 | return false; |
157 | } |
158 | } |
159 | LLVM_DEBUG(dbgs() << " found loop dec: " << *LoopDec); |
160 | |
161 | LoopPhi = |
162 | LookThroughCOPY(MI: MRI->getVRegDef(Reg: LoopDec->getOperand(i: 1).getReg()), MRI); |
163 | if (!LoopPhi || LoopPhi->getOpcode() != TargetOpcode::PHI || |
164 | LoopPhi->getNumOperands() != 5 || |
165 | (LoopPhi->getOperand(i: 2).getMBB() != Latch && |
166 | LoopPhi->getOperand(i: 4).getMBB() != Latch)) { |
167 | LLVM_DEBUG(dbgs() << " didn't find PHI where we expected!\n" ); |
168 | return false; |
169 | } |
170 | LLVM_DEBUG(dbgs() << " found loop phi: " << *LoopPhi); |
171 | |
172 | Register StartReg = LoopPhi->getOperand(i: 2).getMBB() == Latch |
173 | ? LoopPhi->getOperand(i: 3).getReg() |
174 | : LoopPhi->getOperand(i: 1).getReg(); |
175 | LoopStart = LookThroughCOPY(MI: MRI->getVRegDef(Reg: StartReg), MRI); |
176 | if (!LoopStart || (LoopStart->getOpcode() != ARM::t2DoLoopStart && |
177 | LoopStart->getOpcode() != ARM::t2WhileLoopSetup && |
178 | LoopStart->getOpcode() != ARM::t2WhileLoopStartLR)) { |
179 | LLVM_DEBUG(dbgs() << " didn't find Start where we expected!\n" ); |
180 | return false; |
181 | } |
182 | LLVM_DEBUG(dbgs() << " found loop start: " << *LoopStart); |
183 | |
184 | return true; |
185 | } |
186 | |
187 | static void RevertWhileLoopSetup(MachineInstr *MI, const TargetInstrInfo *TII) { |
188 | MachineBasicBlock *MBB = MI->getParent(); |
189 | assert(MI->getOpcode() == ARM::t2WhileLoopSetup && |
190 | "Only expected a t2WhileLoopSetup in RevertWhileLoopStart!" ); |
191 | |
192 | // Subs |
193 | MachineInstrBuilder MIB = |
194 | BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::t2SUBri)); |
195 | MIB.add(MO: MI->getOperand(i: 0)); |
196 | MIB.add(MO: MI->getOperand(i: 1)); |
197 | MIB.addImm(Val: 0); |
198 | MIB.addImm(Val: ARMCC::AL); |
199 | MIB.addReg(RegNo: ARM::NoRegister); |
200 | MIB.addReg(RegNo: ARM::CPSR, flags: RegState::Define); |
201 | |
202 | // Attempt to find a t2WhileLoopStart and revert to a t2Bcc. |
203 | for (MachineInstr &I : MBB->terminators()) { |
204 | if (I.getOpcode() == ARM::t2WhileLoopStart) { |
205 | MachineInstrBuilder MIB = |
206 | BuildMI(BB&: *MBB, I: &I, MIMD: I.getDebugLoc(), MCID: TII->get(Opcode: ARM::t2Bcc)); |
207 | MIB.add(MO: MI->getOperand(i: 1)); // branch target |
208 | MIB.addImm(Val: ARMCC::EQ); |
209 | MIB.addReg(RegNo: ARM::CPSR); |
210 | I.eraseFromParent(); |
211 | break; |
212 | } |
213 | } |
214 | |
215 | MI->eraseFromParent(); |
216 | } |
217 | |
218 | // The Hardware Loop insertion and ISel Lowering produce the pseudos for the |
219 | // start of a while loop: |
220 | // %a:gprlr = t2WhileLoopSetup %Cnt |
221 | // t2WhileLoopStart %a, %BB |
222 | // We want to convert those to a single instruction which, like t2LoopEndDec and |
223 | // t2DoLoopStartTP is both a terminator and produces a value: |
224 | // %a:grplr: t2WhileLoopStartLR %Cnt, %BB |
225 | // |
226 | // Otherwise if we can't, we revert the loop. t2WhileLoopSetup and |
227 | // t2WhileLoopStart are not valid past regalloc. |
228 | bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop *ML) { |
229 | LLVM_DEBUG(dbgs() << "LowerWhileLoopStart on loop " |
230 | << ML->getHeader()->getName() << "\n" ); |
231 | |
232 | MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; |
233 | if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) |
234 | return false; |
235 | |
236 | if (LoopStart->getOpcode() != ARM::t2WhileLoopSetup) |
237 | return false; |
238 | |
239 | Register LR = LoopStart->getOperand(i: 0).getReg(); |
240 | auto WLSIt = find_if(Range: MRI->use_nodbg_instructions(Reg: LR), P: [](auto &MI) { |
241 | return MI.getOpcode() == ARM::t2WhileLoopStart; |
242 | }); |
243 | if (!MergeEndDec || WLSIt == MRI->use_instr_nodbg_end()) { |
244 | RevertWhileLoopSetup(MI: LoopStart, TII); |
245 | RevertLoopDec(MI: LoopStart, TII); |
246 | RevertLoopEnd(MI: LoopStart, TII); |
247 | return true; |
248 | } |
249 | |
250 | MachineInstrBuilder MI = |
251 | BuildMI(BB&: *WLSIt->getParent(), I&: *WLSIt, MIMD: WLSIt->getDebugLoc(), |
252 | MCID: TII->get(Opcode: ARM::t2WhileLoopStartLR), DestReg: LR) |
253 | .add(MO: LoopStart->getOperand(i: 1)) |
254 | .add(MO: WLSIt->getOperand(i: 1)); |
255 | (void)MI; |
256 | LLVM_DEBUG(dbgs() << "Lowered WhileLoopStart into: " << *MI.getInstr()); |
257 | |
258 | WLSIt->eraseFromParent(); |
259 | LoopStart->eraseFromParent(); |
260 | return true; |
261 | } |
262 | |
263 | // Return true if this instruction is invalid in a low overhead loop, usually |
264 | // because it clobbers LR. |
265 | static bool IsInvalidTPInstruction(MachineInstr &MI) { |
266 | return MI.isCall() || isLoopStart(MI); |
267 | } |
268 | |
269 | // Starting from PreHeader, search for invalid instructions back until the |
270 | // LoopStart block is reached. If invalid instructions are found, the loop start |
271 | // is reverted from a WhileLoopStart to a DoLoopStart on the same loop. Will |
272 | // return the new DLS LoopStart if updated. |
273 | MachineInstr *MVETPAndVPTOptimisations::CheckForLRUseInPredecessors( |
274 | MachineBasicBlock *, MachineInstr *LoopStart) { |
275 | SmallVector<MachineBasicBlock *> Worklist; |
276 | SmallPtrSet<MachineBasicBlock *, 4> Visited; |
277 | Worklist.push_back(Elt: PreHeader); |
278 | Visited.insert(Ptr: LoopStart->getParent()); |
279 | |
280 | while (!Worklist.empty()) { |
281 | MachineBasicBlock *MBB = Worklist.pop_back_val(); |
282 | if (Visited.count(Ptr: MBB)) |
283 | continue; |
284 | |
285 | for (MachineInstr &MI : *MBB) { |
286 | if (!IsInvalidTPInstruction(MI)) |
287 | continue; |
288 | |
289 | LLVM_DEBUG(dbgs() << "Found LR use in predecessors, reverting: " << MI); |
290 | |
291 | // Create a t2DoLoopStart at the end of the preheader. |
292 | MachineInstrBuilder MIB = |
293 | BuildMI(BB&: *PreHeader, I: PreHeader->getFirstTerminator(), |
294 | MIMD: LoopStart->getDebugLoc(), MCID: TII->get(Opcode: ARM::t2DoLoopStart)); |
295 | MIB.add(MO: LoopStart->getOperand(i: 0)); |
296 | MIB.add(MO: LoopStart->getOperand(i: 1)); |
297 | |
298 | // Make sure to remove the kill flags, to prevent them from being invalid. |
299 | LoopStart->getOperand(i: 1).setIsKill(false); |
300 | |
301 | // Revert the t2WhileLoopStartLR to a CMP and Br. |
302 | RevertWhileLoopStartLR(MI: LoopStart, TII, BrOpc: ARM::t2Bcc, UseCmp: true); |
303 | return MIB; |
304 | } |
305 | |
306 | Visited.insert(Ptr: MBB); |
307 | for (auto *Pred : MBB->predecessors()) |
308 | Worklist.push_back(Elt: Pred); |
309 | } |
310 | return LoopStart; |
311 | } |
312 | |
313 | // This function converts loops with t2LoopEnd and t2LoopEnd instructions into |
314 | // a single t2LoopEndDec instruction. To do that it needs to make sure that LR |
315 | // will be valid to be used for the low overhead loop, which means nothing else |
316 | // is using LR (especially calls) and there are no superfluous copies in the |
317 | // loop. The t2LoopEndDec is a branching terminator that produces a value (the |
318 | // decrement) around the loop edge, which means we need to be careful that they |
319 | // will be valid to allocate without any spilling. |
320 | bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) { |
321 | if (!MergeEndDec) |
322 | return false; |
323 | |
324 | LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName() |
325 | << "\n" ); |
326 | |
327 | MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; |
328 | if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) |
329 | return false; |
330 | |
331 | // Check if there is an illegal instruction (a call) in the low overhead loop |
332 | // and if so revert it now before we get any further. While loops also need to |
333 | // check the preheaders, but can be reverted to a DLS loop if needed. |
334 | auto * = ML->getLoopPreheader(); |
335 | if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader) |
336 | LoopStart = CheckForLRUseInPredecessors(PreHeader, LoopStart); |
337 | |
338 | for (MachineBasicBlock *MBB : ML->blocks()) { |
339 | for (MachineInstr &MI : *MBB) { |
340 | if (IsInvalidTPInstruction(MI)) { |
341 | LLVM_DEBUG(dbgs() << "Found LR use in loop, reverting: " << MI); |
342 | if (LoopStart->getOpcode() == ARM::t2DoLoopStart) |
343 | RevertDoLoopStart(MI: LoopStart, TII); |
344 | else |
345 | RevertWhileLoopStartLR(MI: LoopStart, TII); |
346 | RevertLoopDec(MI: LoopDec, TII); |
347 | RevertLoopEnd(MI: LoopEnd, TII); |
348 | return true; |
349 | } |
350 | } |
351 | } |
352 | |
353 | // Remove any copies from the loop, to ensure the phi that remains is both |
354 | // simpler and contains no extra uses. Because t2LoopEndDec is a terminator |
355 | // that cannot spill, we need to be careful what remains in the loop. |
356 | Register PhiReg = LoopPhi->getOperand(i: 0).getReg(); |
357 | Register DecReg = LoopDec->getOperand(i: 0).getReg(); |
358 | Register StartReg = LoopStart->getOperand(i: 0).getReg(); |
359 | // Ensure the uses are expected, and collect any copies we want to remove. |
360 | SmallVector<MachineInstr *, 4> Copies; |
361 | auto CheckUsers = [&Copies](Register BaseReg, |
362 | ArrayRef<MachineInstr *> ExpectedUsers, |
363 | MachineRegisterInfo *MRI) { |
364 | SmallVector<Register, 4> Worklist; |
365 | Worklist.push_back(Elt: BaseReg); |
366 | while (!Worklist.empty()) { |
367 | Register Reg = Worklist.pop_back_val(); |
368 | for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) { |
369 | if (llvm::is_contained(Range&: ExpectedUsers, Element: &MI)) |
370 | continue; |
371 | if (MI.getOpcode() != TargetOpcode::COPY || |
372 | !MI.getOperand(i: 0).getReg().isVirtual()) { |
373 | LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI); |
374 | return false; |
375 | } |
376 | Worklist.push_back(Elt: MI.getOperand(i: 0).getReg()); |
377 | Copies.push_back(Elt: &MI); |
378 | } |
379 | } |
380 | return true; |
381 | }; |
382 | if (!CheckUsers(PhiReg, {LoopDec}, MRI) || |
383 | !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) || |
384 | !CheckUsers(StartReg, {LoopPhi}, MRI)) { |
385 | // Don't leave a t2WhileLoopStartLR without the LoopDecEnd. |
386 | if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR) { |
387 | RevertWhileLoopStartLR(MI: LoopStart, TII); |
388 | RevertLoopDec(MI: LoopDec, TII); |
389 | RevertLoopEnd(MI: LoopEnd, TII); |
390 | return true; |
391 | } |
392 | return false; |
393 | } |
394 | |
395 | MRI->constrainRegClass(Reg: StartReg, RC: &ARM::GPRlrRegClass); |
396 | MRI->constrainRegClass(Reg: PhiReg, RC: &ARM::GPRlrRegClass); |
397 | MRI->constrainRegClass(Reg: DecReg, RC: &ARM::GPRlrRegClass); |
398 | |
399 | if (LoopPhi->getOperand(i: 2).getMBB() == ML->getLoopLatch()) { |
400 | LoopPhi->getOperand(i: 3).setReg(StartReg); |
401 | LoopPhi->getOperand(i: 1).setReg(DecReg); |
402 | } else { |
403 | LoopPhi->getOperand(i: 1).setReg(StartReg); |
404 | LoopPhi->getOperand(i: 3).setReg(DecReg); |
405 | } |
406 | |
407 | SmallVector<MachineOperand, 4> Cond; // For analyzeBranch. |
408 | MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. |
409 | if (!TII->analyzeBranch(MBB&: *LoopEnd->getParent(), TBB, FBB, Cond) && !FBB) { |
410 | // If the LoopEnd falls through, need to insert a t2B to the fall-through |
411 | // block so that the non-analyzable t2LoopEndDec doesn't fall through. |
412 | MachineFunction::iterator MBBI = ++LoopEnd->getParent()->getIterator(); |
413 | BuildMI(BB: LoopEnd->getParent(), MIMD: DebugLoc(), MCID: TII->get(Opcode: ARM::t2B)) |
414 | .addMBB(MBB: &*MBBI) |
415 | .add(MOs: predOps(Pred: ARMCC::AL)); |
416 | } |
417 | |
418 | // Replace the loop dec and loop end as a single instruction. |
419 | MachineInstrBuilder MI = |
420 | BuildMI(BB&: *LoopEnd->getParent(), I&: *LoopEnd, MIMD: LoopEnd->getDebugLoc(), |
421 | MCID: TII->get(Opcode: ARM::t2LoopEndDec), DestReg: DecReg) |
422 | .addReg(RegNo: PhiReg) |
423 | .add(MO: LoopEnd->getOperand(i: 1)); |
424 | (void)MI; |
425 | LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr()); |
426 | |
427 | LoopDec->eraseFromParent(); |
428 | LoopEnd->eraseFromParent(); |
429 | for (auto *MI : Copies) |
430 | MI->eraseFromParent(); |
431 | return true; |
432 | } |
433 | |
// Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP
// instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP
// instruction, making the backend ARMLowOverheadLoops passes job of finding the
// VCTP operand much simpler.
bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
                                                   MachineDominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop "
                    << ML->getHeader()->getName() << "\n");

  // Find some loop components including the LoopEnd/Dec/Start, and any VCTP's
  // in the loop.
  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;
  // Only the already-merged LoopEndDec form (LoopDec == LoopEnd) with a DLS
  // or WLS start is handled.
  if (LoopDec != LoopEnd || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
                             LoopStart->getOpcode() != ARM::t2WhileLoopStartLR))
    return false;

  // Collect the VCTPs and, separately, every instruction in the loop that has
  // a VPT predicate operand (used below for the SetLRPredicate rewrite).
  SmallVector<MachineInstr *, 4> VCTPs;
  SmallVector<MachineInstr *, 4> MVEInstrs;
  for (MachineBasicBlock *BB : ML->blocks()) {
    for (MachineInstr &MI : *BB)
      if (isVCTP(MI: &MI))
        VCTPs.push_back(Elt: &MI);
      else if (findFirstVPTPredOperandIdx(MI) != -1)
        MVEInstrs.push_back(Elt: &MI);
  }

  if (VCTPs.empty()) {
    LLVM_DEBUG(dbgs() << " no VCTPs\n");
    return false;
  }

  // Check all VCTPs are the same.
  MachineInstr *FirstVCTP = *VCTPs.begin();
  for (MachineInstr *VCTP : VCTPs) {
    LLVM_DEBUG(dbgs() << " with VCTP " << *VCTP);
    if (VCTP->getOpcode() != FirstVCTP->getOpcode() ||
        VCTP->getOperand(i: 0).getReg() != FirstVCTP->getOperand(i: 0).getReg()) {
      LLVM_DEBUG(dbgs() << " VCTP's are not identical\n");
      return false;
    }
  }

  // Check for the register being used can be setup before the loop. We expect
  // this to be:
  //   $vx = ...
  // loop:
  //   $vp = PHI [ $vx ], [ $vd ]
  //   ..
  //   $vpr = VCTP $vp
  //   ..
  //   $vd = t2SUBri $vp, #n
  //   ..
  Register CountReg = FirstVCTP->getOperand(i: 1).getReg();
  if (!CountReg.isVirtual()) {
    LLVM_DEBUG(dbgs() << " cannot determine VCTP PHI\n");
    return false;
  }
  MachineInstr *Phi = LookThroughCOPY(MI: MRI->getVRegDef(Reg: CountReg), MRI);
  if (!Phi || Phi->getOpcode() != TargetOpcode::PHI ||
      Phi->getNumOperands() != 5 ||
      (Phi->getOperand(i: 2).getMBB() != ML->getLoopLatch() &&
       Phi->getOperand(i: 4).getMBB() != ML->getLoopLatch())) {
    LLVM_DEBUG(dbgs() << " cannot determine VCTP Count\n");
    return false;
  }
  // Take the PHI's incoming value that is defined outside the loop (i.e. not
  // the one coming around from the latch).
  CountReg = Phi->getOperand(i: 2).getMBB() == ML->getLoopLatch()
                 ? Phi->getOperand(i: 3).getReg()
                 : Phi->getOperand(i: 1).getReg();

  // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of
  // the preheader and add the new CountReg to it. We attempt to place it late
  // in the preheader, but may need to move that earlier based on uses.
  MachineBasicBlock *MBB = LoopStart->getParent();
  MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
  // Bail out if moving the start to the terminator position would fail to
  // dominate some existing use of its value.
  for (MachineInstr &Use :
       MRI->use_instructions(Reg: LoopStart->getOperand(i: 0).getReg()))
    if ((InsertPt != MBB->end() && !DT->dominates(A: &*InsertPt, B: &Use)) ||
        !DT->dominates(A: ML->getHeader(), B: Use.getParent())) {
      LLVM_DEBUG(dbgs() << " InsertPt could not be a terminator!\n");
      return false;
    }

  unsigned NewOpc = LoopStart->getOpcode() == ARM::t2DoLoopStart
                        ? ARM::t2DoLoopStartTP
                        : ARM::t2WhileLoopStartTP;
  MachineInstrBuilder MI =
      BuildMI(BB&: *MBB, I: InsertPt, MIMD: LoopStart->getDebugLoc(), MCID: TII->get(Opcode: NewOpc))
          .add(MO: LoopStart->getOperand(i: 0))
          .add(MO: LoopStart->getOperand(i: 1))
          .addReg(RegNo: CountReg);
  // The while form additionally carries the branch target (operand 2).
  if (NewOpc == ARM::t2WhileLoopStartTP)
    MI.add(MO: LoopStart->getOperand(i: 2));
  LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << " with "
                    << *MI.getInstr());
  MRI->constrainRegClass(Reg: CountReg, RC: &ARM::rGPRRegClass);
  LoopStart->eraseFromParent();

  if (SetLRPredicate) {
    // Each instruction in the loop needs to be using LR as the predicate from
    // the Phi as the predicate.
    Register LR = LoopPhi->getOperand(i: 0).getReg();
    for (MachineInstr *MI : MVEInstrs) {
      int Idx = findFirstVPTPredOperandIdx(MI: *MI);
      MI->getOperand(i: Idx + 2).setReg(LR);
    }
  }

  return true;
}
545 | |
546 | // Returns true if Opcode is any VCMP Opcode. |
547 | static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; } |
548 | |
549 | // Returns true if a VCMP with this Opcode can have its operands swapped. |
550 | // There is 2 kind of VCMP that can't have their operands swapped: Float VCMPs, |
551 | // and VCMPr instructions (since the r is always on the right). |
552 | static bool CanHaveSwappedOperands(unsigned Opcode) { |
553 | switch (Opcode) { |
554 | default: |
555 | return true; |
556 | case ARM::MVE_VCMPf32: |
557 | case ARM::MVE_VCMPf16: |
558 | case ARM::MVE_VCMPf32r: |
559 | case ARM::MVE_VCMPf16r: |
560 | case ARM::MVE_VCMPi8r: |
561 | case ARM::MVE_VCMPi16r: |
562 | case ARM::MVE_VCMPi32r: |
563 | case ARM::MVE_VCMPu8r: |
564 | case ARM::MVE_VCMPu16r: |
565 | case ARM::MVE_VCMPu32r: |
566 | case ARM::MVE_VCMPs8r: |
567 | case ARM::MVE_VCMPs16r: |
568 | case ARM::MVE_VCMPs32r: |
569 | return false; |
570 | } |
571 | } |
572 | |
573 | // Returns the CondCode of a VCMP Instruction. |
574 | static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) { |
575 | assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP" ); |
576 | return ARMCC::CondCodes(Instr.getOperand(i: 3).getImm()); |
577 | } |
578 | |
579 | // Returns true if Cond is equivalent to a VPNOT instruction on the result of |
580 | // Prev. Cond and Prev must be VCMPs. |
581 | static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) { |
582 | assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode())); |
583 | |
584 | // Opcodes must match. |
585 | if (Cond.getOpcode() != Prev.getOpcode()) |
586 | return false; |
587 | |
588 | MachineOperand &CondOP1 = Cond.getOperand(i: 1), &CondOP2 = Cond.getOperand(i: 2); |
589 | MachineOperand &PrevOP1 = Prev.getOperand(i: 1), &PrevOP2 = Prev.getOperand(i: 2); |
590 | |
591 | // If the VCMP has the opposite condition with the same operands, we can |
592 | // replace it with a VPNOT |
593 | ARMCC::CondCodes ExpectedCode = GetCondCode(Instr&: Cond); |
594 | ExpectedCode = ARMCC::getOppositeCondition(CC: ExpectedCode); |
595 | if (ExpectedCode == GetCondCode(Instr&: Prev)) |
596 | if (CondOP1.isIdenticalTo(Other: PrevOP1) && CondOP2.isIdenticalTo(Other: PrevOP2)) |
597 | return true; |
598 | // Check again with operands swapped if possible |
599 | if (!CanHaveSwappedOperands(Opcode: Cond.getOpcode())) |
600 | return false; |
601 | ExpectedCode = ARMCC::getSwappedCondition(CC: ExpectedCode); |
602 | return ExpectedCode == GetCondCode(Instr&: Prev) && CondOP1.isIdenticalTo(Other: PrevOP2) && |
603 | CondOP2.isIdenticalTo(Other: PrevOP1); |
604 | } |
605 | |
606 | // Returns true if Instr writes to VCCR. |
607 | static bool IsWritingToVCCR(MachineInstr &Instr) { |
608 | if (Instr.getNumOperands() == 0) |
609 | return false; |
610 | MachineOperand &Dst = Instr.getOperand(i: 0); |
611 | if (!Dst.isReg()) |
612 | return false; |
613 | Register DstReg = Dst.getReg(); |
614 | if (!DstReg.isVirtual()) |
615 | return false; |
616 | MachineRegisterInfo &RegInfo = Instr.getMF()->getRegInfo(); |
617 | const TargetRegisterClass *RegClass = RegInfo.getRegClassOrNull(Reg: DstReg); |
618 | return RegClass && (RegClass->getID() == ARM::VCCRRegClassID); |
619 | } |
620 | |
621 | // Transforms |
622 | // <Instr that uses %A ('User' Operand)> |
623 | // Into |
624 | // %K = VPNOT %Target |
625 | // <Instr that uses %K ('User' Operand)> |
626 | // And returns the newly inserted VPNOT. |
627 | // This optimization is done in the hopes of preventing spills/reloads of VPR by |
628 | // reducing the number of VCCR values with overlapping lifetimes. |
629 | MachineInstr &MVETPAndVPTOptimisations::ReplaceRegisterUseWithVPNOT( |
630 | MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User, |
631 | Register Target) { |
632 | Register NewResult = MRI->createVirtualRegister(RegClass: MRI->getRegClass(Reg: Target)); |
633 | |
634 | MachineInstrBuilder MIBuilder = |
635 | BuildMI(BB&: MBB, I: &Instr, MIMD: Instr.getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VPNOT)) |
636 | .addDef(RegNo: NewResult) |
637 | .addReg(RegNo: Target); |
638 | addUnpredicatedMveVpredNOp(MIB&: MIBuilder); |
639 | |
640 | // Make the user use NewResult instead, and clear its kill flag. |
641 | User.setReg(NewResult); |
642 | User.setIsKill(false); |
643 | |
644 | LLVM_DEBUG(dbgs() << " Inserting VPNOT (for spill prevention): " ; |
645 | MIBuilder.getInstr()->dump()); |
646 | |
647 | return *MIBuilder.getInstr(); |
648 | } |
649 | |
// Moves a VPNOT before its first user if an instruction that uses Reg is found
// in-between the VPNOT and its user.
// Returns true if there is at least one user of the VPNOT in the block.
static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Iter,
                                     Register Reg) {
  assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!");
  assert(getVPTInstrPredicate(*Iter) == ARMVCC::None &&
         "The VPNOT cannot be predicated");

  MachineInstr &VPNOT = *Iter;
  Register VPNOTResult = VPNOT.getOperand(i: 0).getReg();
  Register VPNOTOperand = VPNOT.getOperand(i: 1).getReg();

  // Whether the VPNOT will need to be moved, and whether we found a user of the
  // VPNOT.
  bool MustMove = false, HasUser = false;
  MachineOperand *VPNOTOperandKiller = nullptr;
  // Scan forwards from the VPNOT to the end of the block.
  for (; Iter != MBB.end(); ++Iter) {
    if (MachineOperand *MO =
            Iter->findRegisterUseOperand(Reg: VPNOTOperand, /*TRI=*/nullptr,
                                         /*isKill*/ true)) {
      // If we find the operand that kills the VPNOTOperand's result, save it.
      // (Needed so the kill flag can be cleared if the VPNOT is later moved
      // below this point.)
      VPNOTOperandKiller = MO;
    }

    // A use of Reg after the VPNOT means the VPNOT will have to be moved
    // below it to keep Reg's last use before the VPNOT's first user.
    if (Iter->findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != -1) {
      MustMove = true;
      continue;
    }

    // Keep scanning until the first user of the VPNOT's result.
    if (Iter->findRegisterUseOperandIdx(Reg: VPNOTResult, /*TRI=*/nullptr) == -1)
      continue;

    HasUser = true;
    if (!MustMove)
      break;

    // Move the VPNOT right before Iter
    LLVM_DEBUG(dbgs() << "Moving: "; VPNOT.dump(); dbgs() << " Before: ";
               Iter->dump());
    MBB.splice(Where: Iter, Other: &MBB, From: VPNOT.getIterator());
    // If we move the instr, and its operand was killed earlier, remove the kill
    // flag.
    if (VPNOTOperandKiller)
      VPNOTOperandKiller->setIsKill(false);

    break;
  }
  return HasUser;
}
701 | |
702 | // This optimisation attempts to reduce the number of overlapping lifetimes of |
703 | // VCCR values by replacing uses of old VCCR values with VPNOTs. For example, |
704 | // this replaces |
705 | // %A:vccr = (something) |
706 | // %B:vccr = VPNOT %A |
707 | // %Foo = (some op that uses %B) |
708 | // %Bar = (some op that uses %A) |
709 | // With |
710 | // %A:vccr = (something) |
711 | // %B:vccr = VPNOT %A |
712 | // %Foo = (some op that uses %B) |
713 | // %TMP2:vccr = VPNOT %B |
714 | // %Bar = (some op that uses %A) |
bool MVETPAndVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end();
  // Instructions whose results have been fully replaced are collected here and
  // only erased after the scan, so the iteration below is never invalidated.
  SmallVector<MachineInstr *, 4> DeadInstructions;
  bool Modified = false;

  while (Iter != End) {
    Register VCCRValue, OppositeVCCRValue;
    // The first loop looks for 2 unpredicated instructions:
    //    %A:vccr = (instr) ; A is stored in VCCRValue
    //    %B:vccr = VPNOT %A ; B is stored in OppositeVCCRValue
    for (; Iter != End; ++Iter) {
      // We're only interested in unpredicated instructions that write to VCCR.
      if (!IsWritingToVCCR(Instr&: *Iter) ||
          getVPTInstrPredicate(MI: *Iter) != ARMVCC::None)
        continue;
      Register Dst = Iter->getOperand(i: 0).getReg();

      // If we already have a VCCRValue, and this is a VPNOT on VCCRValue, we've
      // found what we were looking for.
      if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT &&
          Iter->findRegisterUseOperandIdx(Reg: VCCRValue, /*TRI=*/nullptr) != -1) {
        // Move the VPNOT closer to its first user if needed, and ignore if it
        // has no users.
        if (!MoveVPNOTBeforeFirstUser(MBB, Iter, Reg: VCCRValue))
          continue;

        OppositeVCCRValue = Dst;
        ++Iter;
        break;
      }

      // Else, just set VCCRValue.
      VCCRValue = Dst;
    }

    // If the first inner loop didn't find anything, stop here.
    if (Iter == End)
      break;

    assert(VCCRValue && OppositeVCCRValue &&
           "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop "
           "stopped before the end of the block!");
    assert(VCCRValue != OppositeVCCRValue &&
           "VCCRValue should not be equal to OppositeVCCRValue!");

    // LastVPNOTResult always contains the same value as OppositeVCCRValue.
    Register LastVPNOTResult = OppositeVCCRValue;

    // This second loop tries to optimize the remaining instructions.
    for (; Iter != End; ++Iter) {
      bool IsInteresting = false;

      if (MachineOperand *MO =
              Iter->findRegisterUseOperand(Reg: VCCRValue, /*TRI=*/nullptr)) {
        IsInteresting = true;

        // - If the instruction is a VPNOT, it can be removed, and we can just
        //   replace its uses with LastVPNOTResult.
        // - Else, insert a new VPNOT on LastVPNOTResult to recompute VCCRValue.
        if (Iter->getOpcode() == ARM::MVE_VPNOT) {
          Register Result = Iter->getOperand(i: 0).getReg();

          MRI->replaceRegWith(FromReg: Result, ToReg: LastVPNOTResult);
          DeadInstructions.push_back(Elt: &*Iter);
          Modified = true;

          LLVM_DEBUG(dbgs()
                     << "Replacing all uses of '" << printReg(Result)
                     << "' with '" << printReg(LastVPNOTResult) << "'\n");
        } else {
          MachineInstr &VPNOT =
              ReplaceRegisterUseWithVPNOT(MBB, Instr&: *Iter, User&: *MO, Target: LastVPNOTResult);
          Modified = true;

          // The freshly inserted VPNOT recomputes the old VCCRValue from
          // LastVPNOTResult, so the two tracked registers swap roles.
          LastVPNOTResult = VPNOT.getOperand(i: 0).getReg();
          std::swap(a&: VCCRValue, b&: OppositeVCCRValue);

          LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue)
                            << "' with '" << printReg(LastVPNOTResult)
                            << "' in instr: " << *Iter);
        }
      } else {
        // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult
        // instead as they contain the same value.
        if (MachineOperand *MO = Iter->findRegisterUseOperand(
                Reg: OppositeVCCRValue, /*TRI=*/nullptr)) {
          IsInteresting = true;

          // This is pointless if LastVPNOTResult == OppositeVCCRValue.
          if (LastVPNOTResult != OppositeVCCRValue) {
            LLVM_DEBUG(dbgs() << "Replacing usage of '"
                              << printReg(OppositeVCCRValue) << "' with '"
                              << printReg(LastVPNOTResult) << " for instr: ";
                       Iter->dump());
            MO->setReg(LastVPNOTResult);
            Modified = true;
          }

          // The register is used again here, so an earlier kill flag (if any)
          // would now be stale — drop it on this operand.
          MO->setIsKill(false);
        }

        // If this is an unpredicated VPNOT on
        // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it.
        if (Iter->getOpcode() == ARM::MVE_VPNOT &&
            getVPTInstrPredicate(MI: *Iter) == ARMVCC::None) {
          Register VPNOTOperand = Iter->getOperand(i: 1).getReg();
          if (VPNOTOperand == LastVPNOTResult ||
              VPNOTOperand == OppositeVCCRValue) {
            IsInteresting = true;

            // Adopt this VPNOT's result as the current "opposite" value.
            std::swap(a&: VCCRValue, b&: OppositeVCCRValue);
            LastVPNOTResult = Iter->getOperand(i: 0).getReg();
          }
        }
      }

      // If this instruction was not interesting, and it writes to VCCR, stop.
      if (!IsInteresting && IsWritingToVCCR(Instr&: *Iter))
        break;
    }
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return Modified;
}
842 | |
// This optimisation replaces VCMPs with VPNOTs when they are equivalent.
bool MVETPAndVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
  // VCMPs made redundant; erased after the scan so iteration stays valid.
  SmallVector<MachineInstr *, 4> DeadInstructions;

  // The last VCMP that we have seen and that couldn't be replaced.
  // This is reset when an instruction that writes to VCCR/VPR is found, or when
  // a VCMP is replaced with a VPNOT.
  // We'll only replace VCMPs with VPNOTs when this is not null, and when the
  // current VCMP is the opposite of PrevVCMP.
  MachineInstr *PrevVCMP = nullptr;
  // If we find an instruction that kills the result of PrevVCMP, we save the
  // operand here to remove the kill flag in case we need to use PrevVCMP's
  // result.
  MachineOperand *PrevVCMPResultKiller = nullptr;

  for (MachineInstr &Instr : MBB.instrs()) {
    if (PrevVCMP) {
      if (MachineOperand *MO =
              Instr.findRegisterUseOperand(Reg: PrevVCMP->getOperand(i: 0).getReg(),
                                           /*TRI=*/nullptr, /*isKill*/ true)) {
        // If we come across the instr that kills PrevVCMP's result, record it
        // so we can remove the kill flag later if we need to.
        PrevVCMPResultKiller = MO;
      }
    }

    // Ignore predicated instructions.
    if (getVPTInstrPredicate(MI: Instr) != ARMVCC::None)
      continue;

    // Only look at VCMPs
    if (!IsVCMP(Opcode: Instr.getOpcode())) {
      // If the instruction writes to VCCR, forget the previous VCMP.
      if (IsWritingToVCCR(Instr))
        PrevVCMP = nullptr;
      continue;
    }

    // Keep scanning until we have a candidate VCMP that is the exact opposite
    // of a previously recorded one.
    if (!PrevVCMP || !IsVPNOTEquivalent(Cond&: Instr, Prev&: *PrevVCMP)) {
      PrevVCMP = &Instr;
      continue;
    }

    // The register containing the result of the VCMP that we're going to
    // replace.
    Register PrevVCMPResultReg = PrevVCMP->getOperand(i: 0).getReg();

    // Build a VPNOT to replace the VCMP, reusing its operands.
    MachineInstrBuilder MIBuilder =
        BuildMI(BB&: MBB, I: &Instr, MIMD: Instr.getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VPNOT))
            .add(MO: Instr.getOperand(i: 0))
            .addReg(RegNo: PrevVCMPResultReg);
    addUnpredicatedMveVpredNOp(MIB&: MIBuilder);
    LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): ";
               MIBuilder.getInstr()->dump(); dbgs() << " Removed VCMP: ";
               Instr.dump());

    // If we found an instruction that uses, and kills PrevVCMP's result,
    // remove the kill flag: the new VPNOT extends that value's live range.
    if (PrevVCMPResultKiller)
      PrevVCMPResultKiller->setIsKill(false);

    // Finally, mark the old VCMP for removal and reset
    // PrevVCMP/PrevVCMPResultKiller.
    DeadInstructions.push_back(Elt: &Instr);
    PrevVCMP = nullptr;
    PrevVCMPResultKiller = nullptr;
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return !DeadInstructions.empty();
}
917 | |
bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
                                                    MachineDominatorTree *DT) {
  // Scan through the block, looking for instructions that use constants moved
  // into VPR that are the negative of one another. These are expected to be
  // COPY's to VCCRRegClass, from a t2MOVi or t2MOVi16. The last seen constant
  // mask is kept; it, or VPNOT's of it, are added or reused as we scan through
  // the function.
  // NOTE(review): DT is not used in this body — presumably kept for interface
  // consistency with the other transforms; confirm before removing.
  unsigned LastVPTImm = 0;
  Register LastVPTReg = 0;
  SmallSet<MachineInstr *, 4> DeadInstructions;

  for (MachineInstr &Instr : MBB.instrs()) {
    // Look for predicated MVE instructions.
    int PIdx = llvm::findFirstVPTPredOperandIdx(MI: Instr);
    if (PIdx == -1)
      continue;
    // The predicate register is the operand immediately after the predicate
    // code operand at PIdx.
    Register VPR = Instr.getOperand(i: PIdx + 1).getReg();
    if (!VPR.isVirtual())
      continue;

    // From that we are looking for an instruction like %11:vccr = COPY %9:rgpr.
    MachineInstr *Copy = MRI->getVRegDef(Reg: VPR);
    if (!Copy || Copy->getOpcode() != TargetOpcode::COPY ||
        !Copy->getOperand(i: 1).getReg().isVirtual() ||
        MRI->getRegClass(Reg: Copy->getOperand(i: 1).getReg()) == &ARM::VCCRRegClass) {
      // Not the pattern we want; forget any previously tracked mask.
      LastVPTReg = 0;
      continue;
    }
    Register GPR = Copy->getOperand(i: 1).getReg();

    // Find the Immediate used by the copy.
    auto getImm = [&](Register GPR) -> unsigned {
      MachineInstr *Def = MRI->getVRegDef(Reg: GPR);
      if (Def && (Def->getOpcode() == ARM::t2MOVi ||
                  Def->getOpcode() == ARM::t2MOVi16))
        return Def->getOperand(i: 1).getImm();
      return -1U; // Sentinel: not a recognised constant move.
    };
    unsigned Imm = getImm(GPR);
    if (Imm == -1U) {
      LastVPTReg = 0;
      continue;
    }

    // Complement within 16 bits, matching the width of the predicate mask.
    unsigned NotImm = ~Imm & 0xffff;
    if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) {
      // Same mask as the last one seen: reuse the earlier VCCR register.
      // Clear kill flags first, since its live range is being extended.
      MRI->clearKillFlags(Reg: LastVPTReg);
      Instr.getOperand(i: PIdx + 1).setReg(LastVPTReg);
      if (MRI->use_empty(RegNo: VPR)) {
        DeadInstructions.insert(Ptr: Copy);
        if (MRI->hasOneUse(RegNo: GPR))
          DeadInstructions.insert(Ptr: MRI->getVRegDef(Reg: GPR));
      }
      LLVM_DEBUG(dbgs() << "Reusing predicate: in " << Instr);
      VPR = LastVPTReg;
    } else if (LastVPTReg != 0 && LastVPTImm == NotImm) {
      // We have found the not of a previous constant. Create a VPNot of the
      // earlier predicate reg and use it instead of the copy.
      Register NewVPR = MRI->createVirtualRegister(RegClass: &ARM::VCCRRegClass);
      auto VPNot = BuildMI(BB&: MBB, I: &Instr, MIMD: Instr.getDebugLoc(),
                           MCID: TII->get(Opcode: ARM::MVE_VPNOT), DestReg: NewVPR)
                       .addReg(RegNo: LastVPTReg);
      addUnpredicatedMveVpredNOp(MIB&: VPNot);

      // Use the new register and check if the def is now dead.
      Instr.getOperand(i: PIdx + 1).setReg(NewVPR);
      if (MRI->use_empty(RegNo: VPR)) {
        DeadInstructions.insert(Ptr: Copy);
        if (MRI->hasOneUse(RegNo: GPR))
          DeadInstructions.insert(Ptr: MRI->getVRegDef(Reg: GPR));
      }
      LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << " to replace use at "
                        << Instr);
      VPR = NewVPR;
    }

    // Remember this instruction's mask/register for the next iteration.
    LastVPTImm = Imm;
    LastVPTReg = VPR;
  }

  for (MachineInstr *DI : DeadInstructions)
    DI->eraseFromParent();

  return !DeadInstructions.empty();
}
1003 | |
1004 | // Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a |
1005 | // somewhat blunt approximation to allow tail predicated with vpsel |
1006 | // instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly |
1007 | // different semantics under tail predication. Until that is modelled we just |
1008 | // convert to a VMOVT (via a predicated VORR) instead. |
1009 | bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) { |
1010 | bool HasVCTP = false; |
1011 | SmallVector<MachineInstr *, 4> DeadInstructions; |
1012 | |
1013 | for (MachineInstr &MI : MBB.instrs()) { |
1014 | if (isVCTP(MI: &MI)) { |
1015 | HasVCTP = true; |
1016 | continue; |
1017 | } |
1018 | |
1019 | if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL) |
1020 | continue; |
1021 | |
1022 | MachineInstrBuilder MIBuilder = |
1023 | BuildMI(BB&: MBB, I: &MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VORR)) |
1024 | .add(MO: MI.getOperand(i: 0)) |
1025 | .add(MO: MI.getOperand(i: 1)) |
1026 | .add(MO: MI.getOperand(i: 1)) |
1027 | .addImm(Val: ARMVCC::Then) |
1028 | .add(MO: MI.getOperand(i: 4)) |
1029 | .add(MO: MI.getOperand(i: 5)) |
1030 | .add(MO: MI.getOperand(i: 2)); |
1031 | // Silence unused variable warning in release builds. |
1032 | (void)MIBuilder; |
1033 | LLVM_DEBUG(dbgs() << "Replacing VPSEL: " ; MI.dump(); |
1034 | dbgs() << " with VMOVT: " ; MIBuilder.getInstr()->dump()); |
1035 | DeadInstructions.push_back(Elt: &MI); |
1036 | } |
1037 | |
1038 | for (MachineInstr *DeadInstruction : DeadInstructions) |
1039 | DeadInstruction->eraseFromParent(); |
1040 | |
1041 | return !DeadInstructions.empty(); |
1042 | } |
1043 | |
1044 | // Add a registry allocation hint for t2DoLoopStart to hint it towards LR, as |
1045 | // the instruction may be removable as a noop. |
1046 | bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) { |
1047 | bool Changed = false; |
1048 | for (MachineInstr &MI : MBB.instrs()) { |
1049 | if (MI.getOpcode() != ARM::t2DoLoopStart) |
1050 | continue; |
1051 | Register R = MI.getOperand(i: 1).getReg(); |
1052 | MachineFunction *MF = MI.getParent()->getParent(); |
1053 | MF->getRegInfo().setRegAllocationHint(VReg: R, Type: ARMRI::RegLR, PrefReg: 0); |
1054 | Changed = true; |
1055 | } |
1056 | return Changed; |
1057 | } |
1058 | |
1059 | bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) { |
1060 | const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>(); |
1061 | |
1062 | if (!STI.isThumb2() || !STI.hasLOB()) |
1063 | return false; |
1064 | |
1065 | TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); |
1066 | MRI = &Fn.getRegInfo(); |
1067 | MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); |
1068 | MachineDominatorTree *DT = |
1069 | &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); |
1070 | |
1071 | LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n" |
1072 | << "********** Function: " << Fn.getName() << '\n'); |
1073 | |
1074 | bool Modified = false; |
1075 | for (MachineLoop *ML : MLI->getLoopsInPreorder()) { |
1076 | Modified |= LowerWhileLoopStart(ML); |
1077 | Modified |= MergeLoopEnd(ML); |
1078 | Modified |= ConvertTailPredLoop(ML, DT); |
1079 | } |
1080 | |
1081 | for (MachineBasicBlock &MBB : Fn) { |
1082 | Modified |= HintDoLoopStartReg(MBB); |
1083 | Modified |= ReplaceConstByVPNOTs(MBB, DT); |
1084 | Modified |= ReplaceVCMPsByVPNOTs(MBB); |
1085 | Modified |= ReduceOldVCCRValueUses(MBB); |
1086 | Modified |= ConvertVPSEL(MBB); |
1087 | } |
1088 | |
1089 | LLVM_DEBUG(dbgs() << "**************************************\n" ); |
1090 | return Modified; |
1091 | } |
1092 | |
/// createMVETPAndVPTOptimisationsPass - Returns a newly-allocated instance of
/// the MVE tail-predication and VPT optimisation pass; the caller owns it.
FunctionPass *llvm::createMVETPAndVPTOptimisationsPass() {
  return new MVETPAndVPTOptimisations();
}
1097 | |