1 | //===-- MVETPAndVPTOptimisationsPass.cpp ----------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file This pass does a few optimisations related to Tail predicated loops |
10 | /// and MVE VPT blocks before register allocation is performed. For VPT blocks |
11 | /// the goal is to maximize the sizes of the blocks that will be created by the |
12 | /// MVE VPT Block Insertion pass (which runs after register allocation). For |
13 | /// tail predicated loops we transform the loop into something that will |
14 | /// hopefully make the backend ARMLowOverheadLoops pass's job easier. |
15 | /// |
16 | //===----------------------------------------------------------------------===// |
17 | |
18 | #include "ARM.h" |
19 | #include "ARMSubtarget.h" |
20 | #include "MVETailPredUtils.h" |
21 | #include "Thumb2InstrInfo.h" |
22 | #include "llvm/ADT/SmallVector.h" |
23 | #include "llvm/CodeGen/MachineBasicBlock.h" |
24 | #include "llvm/CodeGen/MachineDominators.h" |
25 | #include "llvm/CodeGen/MachineFunction.h" |
26 | #include "llvm/CodeGen/MachineFunctionPass.h" |
27 | #include "llvm/CodeGen/MachineInstr.h" |
28 | #include "llvm/CodeGen/MachineLoopInfo.h" |
29 | #include "llvm/InitializePasses.h" |
30 | #include "llvm/Support/Debug.h" |
31 | #include <cassert> |
32 | |
33 | using namespace llvm; |
34 | |
35 | #define DEBUG_TYPE "arm-mve-vpt-opts" |
36 | |
37 | static cl::opt<bool> |
38 | MergeEndDec("arm-enable-merge-loopenddec" , cl::Hidden, |
39 | cl::desc("Enable merging Loop End and Dec instructions." ), |
40 | cl::init(Val: true)); |
41 | |
42 | static cl::opt<bool> |
43 | SetLRPredicate("arm-set-lr-predicate" , cl::Hidden, |
44 | cl::desc("Enable setting lr as a predicate in tail predication regions." ), |
45 | cl::init(Val: true)); |
46 | |
47 | namespace { |
48 | class MVETPAndVPTOptimisations : public MachineFunctionPass { |
49 | public: |
50 | static char ID; |
51 | const Thumb2InstrInfo *TII; |
52 | MachineRegisterInfo *MRI; |
53 | |
54 | MVETPAndVPTOptimisations() : MachineFunctionPass(ID) {} |
55 | |
56 | bool runOnMachineFunction(MachineFunction &Fn) override; |
57 | |
58 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
59 | AU.addRequired<MachineLoopInfoWrapperPass>(); |
60 | AU.addPreserved<MachineLoopInfoWrapperPass>(); |
61 | AU.addRequired<MachineDominatorTreeWrapperPass>(); |
62 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
63 | MachineFunctionPass::getAnalysisUsage(AU); |
64 | } |
65 | |
66 | StringRef getPassName() const override { |
67 | return "ARM MVE TailPred and VPT Optimisation Pass" ; |
68 | } |
69 | |
70 | private: |
71 | bool LowerWhileLoopStart(MachineLoop *ML); |
72 | bool MergeLoopEnd(MachineLoop *ML); |
73 | bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT); |
74 | MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB, |
75 | MachineInstr &Instr, |
76 | MachineOperand &User, |
77 | Register Target); |
78 | bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB); |
79 | bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB); |
80 | bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT); |
81 | bool ConvertVPSEL(MachineBasicBlock &MBB); |
82 | bool HintDoLoopStartReg(MachineBasicBlock &MBB); |
83 | MachineInstr *CheckForLRUseInPredecessors(MachineBasicBlock *, |
84 | MachineInstr *LoopStart); |
85 | }; |
86 | |
87 | char MVETPAndVPTOptimisations::ID = 0; |
88 | |
89 | } // end anonymous namespace |
90 | |
91 | INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE, |
92 | "ARM MVE TailPred and VPT Optimisations pass" , false, |
93 | false) |
94 | INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) |
95 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) |
96 | INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE, |
97 | "ARM MVE TailPred and VPT Optimisations pass" , false, false) |
98 | |
99 | static MachineInstr *LookThroughCOPY(MachineInstr *MI, |
100 | MachineRegisterInfo *MRI) { |
101 | while (MI && MI->getOpcode() == TargetOpcode::COPY && |
102 | MI->getOperand(i: 1).getReg().isVirtual()) |
103 | MI = MRI->getVRegDef(Reg: MI->getOperand(i: 1).getReg()); |
104 | return MI; |
105 | } |
106 | |
107 | // Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and |
108 | // corresponding PHI that make up a low overhead loop. Only handles 'do' loops |
109 | // at the moment, returning a t2DoLoopStart in LoopStart. |
110 | static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI, |
111 | MachineInstr *&LoopStart, MachineInstr *&LoopPhi, |
112 | MachineInstr *&LoopDec, MachineInstr *&LoopEnd) { |
113 | MachineBasicBlock * = ML->getHeader(); |
114 | MachineBasicBlock *Latch = ML->getLoopLatch(); |
115 | if (!Header || !Latch) { |
116 | LLVM_DEBUG(dbgs() << " no Loop Latch or Header\n" ); |
117 | return false; |
118 | } |
119 | |
120 | // Find the loop end from the terminators. |
121 | LoopEnd = nullptr; |
122 | for (auto &T : Latch->terminators()) { |
123 | if (T.getOpcode() == ARM::t2LoopEnd && T.getOperand(i: 1).getMBB() == Header) { |
124 | LoopEnd = &T; |
125 | break; |
126 | } |
127 | if (T.getOpcode() == ARM::t2LoopEndDec && |
128 | T.getOperand(i: 2).getMBB() == Header) { |
129 | LoopEnd = &T; |
130 | break; |
131 | } |
132 | } |
133 | if (!LoopEnd) { |
134 | LLVM_DEBUG(dbgs() << " no LoopEnd\n" ); |
135 | return false; |
136 | } |
137 | LLVM_DEBUG(dbgs() << " found loop end: " << *LoopEnd); |
138 | |
139 | // Find the dec from the use of the end. There may be copies between |
140 | // instructions. We expect the loop to loop like: |
141 | // $vs = t2DoLoopStart ... |
142 | // loop: |
143 | // $vp = phi [ $vs ], [ $vd ] |
144 | // ... |
145 | // $vd = t2LoopDec $vp |
146 | // ... |
147 | // t2LoopEnd $vd, loop |
148 | if (LoopEnd->getOpcode() == ARM::t2LoopEndDec) |
149 | LoopDec = LoopEnd; |
150 | else { |
151 | LoopDec = |
152 | LookThroughCOPY(MI: MRI->getVRegDef(Reg: LoopEnd->getOperand(i: 0).getReg()), MRI); |
153 | if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) { |
154 | LLVM_DEBUG(dbgs() << " didn't find LoopDec where we expected!\n" ); |
155 | return false; |
156 | } |
157 | } |
158 | LLVM_DEBUG(dbgs() << " found loop dec: " << *LoopDec); |
159 | |
160 | LoopPhi = |
161 | LookThroughCOPY(MI: MRI->getVRegDef(Reg: LoopDec->getOperand(i: 1).getReg()), MRI); |
162 | if (!LoopPhi || LoopPhi->getOpcode() != TargetOpcode::PHI || |
163 | LoopPhi->getNumOperands() != 5 || |
164 | (LoopPhi->getOperand(i: 2).getMBB() != Latch && |
165 | LoopPhi->getOperand(i: 4).getMBB() != Latch)) { |
166 | LLVM_DEBUG(dbgs() << " didn't find PHI where we expected!\n" ); |
167 | return false; |
168 | } |
169 | LLVM_DEBUG(dbgs() << " found loop phi: " << *LoopPhi); |
170 | |
171 | Register StartReg = LoopPhi->getOperand(i: 2).getMBB() == Latch |
172 | ? LoopPhi->getOperand(i: 3).getReg() |
173 | : LoopPhi->getOperand(i: 1).getReg(); |
174 | LoopStart = LookThroughCOPY(MI: MRI->getVRegDef(Reg: StartReg), MRI); |
175 | if (!LoopStart || (LoopStart->getOpcode() != ARM::t2DoLoopStart && |
176 | LoopStart->getOpcode() != ARM::t2WhileLoopSetup && |
177 | LoopStart->getOpcode() != ARM::t2WhileLoopStartLR)) { |
178 | LLVM_DEBUG(dbgs() << " didn't find Start where we expected!\n" ); |
179 | return false; |
180 | } |
181 | LLVM_DEBUG(dbgs() << " found loop start: " << *LoopStart); |
182 | |
183 | return true; |
184 | } |
185 | |
186 | static void RevertWhileLoopSetup(MachineInstr *MI, const TargetInstrInfo *TII) { |
187 | MachineBasicBlock *MBB = MI->getParent(); |
188 | assert(MI->getOpcode() == ARM::t2WhileLoopSetup && |
189 | "Only expected a t2WhileLoopSetup in RevertWhileLoopStart!" ); |
190 | |
191 | // Subs |
192 | MachineInstrBuilder MIB = |
193 | BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::t2SUBri)); |
194 | MIB.add(MO: MI->getOperand(i: 0)); |
195 | MIB.add(MO: MI->getOperand(i: 1)); |
196 | MIB.addImm(Val: 0); |
197 | MIB.addImm(Val: ARMCC::AL); |
198 | MIB.addReg(RegNo: ARM::NoRegister); |
199 | MIB.addReg(RegNo: ARM::CPSR, flags: RegState::Define); |
200 | |
201 | // Attempt to find a t2WhileLoopStart and revert to a t2Bcc. |
202 | for (MachineInstr &I : MBB->terminators()) { |
203 | if (I.getOpcode() == ARM::t2WhileLoopStart) { |
204 | MachineInstrBuilder MIB = |
205 | BuildMI(BB&: *MBB, I: &I, MIMD: I.getDebugLoc(), MCID: TII->get(Opcode: ARM::t2Bcc)); |
206 | MIB.add(MO: MI->getOperand(i: 1)); // branch target |
207 | MIB.addImm(Val: ARMCC::EQ); |
208 | MIB.addReg(RegNo: ARM::CPSR); |
209 | I.eraseFromParent(); |
210 | break; |
211 | } |
212 | } |
213 | |
214 | MI->eraseFromParent(); |
215 | } |
216 | |
217 | // The Hardware Loop insertion and ISel Lowering produce the pseudos for the |
218 | // start of a while loop: |
219 | // %a:gprlr = t2WhileLoopSetup %Cnt |
220 | // t2WhileLoopStart %a, %BB |
221 | // We want to convert those to a single instruction which, like t2LoopEndDec and |
222 | // t2DoLoopStartTP is both a terminator and produces a value: |
223 | // %a:grplr: t2WhileLoopStartLR %Cnt, %BB |
224 | // |
225 | // Otherwise if we can't, we revert the loop. t2WhileLoopSetup and |
226 | // t2WhileLoopStart are not valid past regalloc. |
227 | bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop *ML) { |
228 | LLVM_DEBUG(dbgs() << "LowerWhileLoopStart on loop " |
229 | << ML->getHeader()->getName() << "\n" ); |
230 | |
231 | MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; |
232 | if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) |
233 | return false; |
234 | |
235 | if (LoopStart->getOpcode() != ARM::t2WhileLoopSetup) |
236 | return false; |
237 | |
238 | Register LR = LoopStart->getOperand(i: 0).getReg(); |
239 | auto WLSIt = find_if(Range: MRI->use_nodbg_instructions(Reg: LR), P: [](auto &MI) { |
240 | return MI.getOpcode() == ARM::t2WhileLoopStart; |
241 | }); |
242 | if (!MergeEndDec || WLSIt == MRI->use_instr_nodbg_end()) { |
243 | RevertWhileLoopSetup(MI: LoopStart, TII); |
244 | RevertLoopDec(MI: LoopStart, TII); |
245 | RevertLoopEnd(MI: LoopStart, TII); |
246 | return true; |
247 | } |
248 | |
249 | MachineInstrBuilder MI = |
250 | BuildMI(BB&: *WLSIt->getParent(), I&: *WLSIt, MIMD: WLSIt->getDebugLoc(), |
251 | MCID: TII->get(Opcode: ARM::t2WhileLoopStartLR), DestReg: LR) |
252 | .add(MO: LoopStart->getOperand(i: 1)) |
253 | .add(MO: WLSIt->getOperand(i: 1)); |
254 | (void)MI; |
255 | LLVM_DEBUG(dbgs() << "Lowered WhileLoopStart into: " << *MI.getInstr()); |
256 | |
257 | WLSIt->eraseFromParent(); |
258 | LoopStart->eraseFromParent(); |
259 | return true; |
260 | } |
261 | |
262 | // Return true if this instruction is invalid in a low overhead loop, usually |
263 | // because it clobbers LR. |
264 | static bool IsInvalidTPInstruction(MachineInstr &MI) { |
265 | return MI.isCall() || isLoopStart(MI); |
266 | } |
267 | |
268 | // Starting from PreHeader, search for invalid instructions back until the |
269 | // LoopStart block is reached. If invalid instructions are found, the loop start |
270 | // is reverted from a WhileLoopStart to a DoLoopStart on the same loop. Will |
271 | // return the new DLS LoopStart if updated. |
272 | MachineInstr *MVETPAndVPTOptimisations::CheckForLRUseInPredecessors( |
273 | MachineBasicBlock *, MachineInstr *LoopStart) { |
274 | SmallVector<MachineBasicBlock *> Worklist; |
275 | SmallPtrSet<MachineBasicBlock *, 4> Visited; |
276 | Worklist.push_back(Elt: PreHeader); |
277 | Visited.insert(Ptr: LoopStart->getParent()); |
278 | |
279 | while (!Worklist.empty()) { |
280 | MachineBasicBlock *MBB = Worklist.pop_back_val(); |
281 | if (Visited.count(Ptr: MBB)) |
282 | continue; |
283 | |
284 | for (MachineInstr &MI : *MBB) { |
285 | if (!IsInvalidTPInstruction(MI)) |
286 | continue; |
287 | |
288 | LLVM_DEBUG(dbgs() << "Found LR use in predecessors, reverting: " << MI); |
289 | |
290 | // Create a t2DoLoopStart at the end of the preheader. |
291 | MachineInstrBuilder MIB = |
292 | BuildMI(BB&: *PreHeader, I: PreHeader->getFirstTerminator(), |
293 | MIMD: LoopStart->getDebugLoc(), MCID: TII->get(Opcode: ARM::t2DoLoopStart)); |
294 | MIB.add(MO: LoopStart->getOperand(i: 0)); |
295 | MIB.add(MO: LoopStart->getOperand(i: 1)); |
296 | |
297 | // Make sure to remove the kill flags, to prevent them from being invalid. |
298 | LoopStart->getOperand(i: 1).setIsKill(false); |
299 | |
300 | // Revert the t2WhileLoopStartLR to a CMP and Br. |
301 | RevertWhileLoopStartLR(MI: LoopStart, TII, BrOpc: ARM::t2Bcc, UseCmp: true); |
302 | return MIB; |
303 | } |
304 | |
305 | Visited.insert(Ptr: MBB); |
306 | llvm::append_range(C&: Worklist, R: MBB->predecessors()); |
307 | } |
308 | return LoopStart; |
309 | } |
310 | |
311 | // This function converts loops with t2LoopEnd and t2LoopEnd instructions into |
312 | // a single t2LoopEndDec instruction. To do that it needs to make sure that LR |
313 | // will be valid to be used for the low overhead loop, which means nothing else |
314 | // is using LR (especially calls) and there are no superfluous copies in the |
315 | // loop. The t2LoopEndDec is a branching terminator that produces a value (the |
316 | // decrement) around the loop edge, which means we need to be careful that they |
317 | // will be valid to allocate without any spilling. |
318 | bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) { |
319 | if (!MergeEndDec) |
320 | return false; |
321 | |
322 | LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName() |
323 | << "\n" ); |
324 | |
325 | MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; |
326 | if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) |
327 | return false; |
328 | |
329 | // Check if there is an illegal instruction (a call) in the low overhead loop |
330 | // and if so revert it now before we get any further. While loops also need to |
331 | // check the preheaders, but can be reverted to a DLS loop if needed. |
332 | auto * = ML->getLoopPreheader(); |
333 | if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader) |
334 | LoopStart = CheckForLRUseInPredecessors(PreHeader, LoopStart); |
335 | |
336 | for (MachineBasicBlock *MBB : ML->blocks()) { |
337 | for (MachineInstr &MI : *MBB) { |
338 | if (IsInvalidTPInstruction(MI)) { |
339 | LLVM_DEBUG(dbgs() << "Found LR use in loop, reverting: " << MI); |
340 | if (LoopStart->getOpcode() == ARM::t2DoLoopStart) |
341 | RevertDoLoopStart(MI: LoopStart, TII); |
342 | else |
343 | RevertWhileLoopStartLR(MI: LoopStart, TII); |
344 | RevertLoopDec(MI: LoopDec, TII); |
345 | RevertLoopEnd(MI: LoopEnd, TII); |
346 | return true; |
347 | } |
348 | } |
349 | } |
350 | |
351 | // Remove any copies from the loop, to ensure the phi that remains is both |
352 | // simpler and contains no extra uses. Because t2LoopEndDec is a terminator |
353 | // that cannot spill, we need to be careful what remains in the loop. |
354 | Register PhiReg = LoopPhi->getOperand(i: 0).getReg(); |
355 | Register DecReg = LoopDec->getOperand(i: 0).getReg(); |
356 | Register StartReg = LoopStart->getOperand(i: 0).getReg(); |
357 | // Ensure the uses are expected, and collect any copies we want to remove. |
358 | SmallVector<MachineInstr *, 4> Copies; |
359 | auto CheckUsers = [&Copies](Register BaseReg, |
360 | ArrayRef<MachineInstr *> ExpectedUsers, |
361 | MachineRegisterInfo *MRI) { |
362 | SmallVector<Register, 4> Worklist; |
363 | Worklist.push_back(Elt: BaseReg); |
364 | while (!Worklist.empty()) { |
365 | Register Reg = Worklist.pop_back_val(); |
366 | for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) { |
367 | if (llvm::is_contained(Range&: ExpectedUsers, Element: &MI)) |
368 | continue; |
369 | if (MI.getOpcode() != TargetOpcode::COPY || |
370 | !MI.getOperand(i: 0).getReg().isVirtual()) { |
371 | LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI); |
372 | return false; |
373 | } |
374 | Worklist.push_back(Elt: MI.getOperand(i: 0).getReg()); |
375 | Copies.push_back(Elt: &MI); |
376 | } |
377 | } |
378 | return true; |
379 | }; |
380 | if (!CheckUsers(PhiReg, {LoopDec}, MRI) || |
381 | !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) || |
382 | !CheckUsers(StartReg, {LoopPhi}, MRI)) { |
383 | // Don't leave a t2WhileLoopStartLR without the LoopDecEnd. |
384 | if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR) { |
385 | RevertWhileLoopStartLR(MI: LoopStart, TII); |
386 | RevertLoopDec(MI: LoopDec, TII); |
387 | RevertLoopEnd(MI: LoopEnd, TII); |
388 | return true; |
389 | } |
390 | return false; |
391 | } |
392 | |
393 | MRI->constrainRegClass(Reg: StartReg, RC: &ARM::GPRlrRegClass); |
394 | MRI->constrainRegClass(Reg: PhiReg, RC: &ARM::GPRlrRegClass); |
395 | MRI->constrainRegClass(Reg: DecReg, RC: &ARM::GPRlrRegClass); |
396 | |
397 | if (LoopPhi->getOperand(i: 2).getMBB() == ML->getLoopLatch()) { |
398 | LoopPhi->getOperand(i: 3).setReg(StartReg); |
399 | LoopPhi->getOperand(i: 1).setReg(DecReg); |
400 | } else { |
401 | LoopPhi->getOperand(i: 1).setReg(StartReg); |
402 | LoopPhi->getOperand(i: 3).setReg(DecReg); |
403 | } |
404 | |
405 | SmallVector<MachineOperand, 4> Cond; // For analyzeBranch. |
406 | MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. |
407 | if (!TII->analyzeBranch(MBB&: *LoopEnd->getParent(), TBB, FBB, Cond) && !FBB) { |
408 | // If the LoopEnd falls through, need to insert a t2B to the fall-through |
409 | // block so that the non-analyzable t2LoopEndDec doesn't fall through. |
410 | MachineFunction::iterator MBBI = ++LoopEnd->getParent()->getIterator(); |
411 | BuildMI(BB: LoopEnd->getParent(), MIMD: DebugLoc(), MCID: TII->get(Opcode: ARM::t2B)) |
412 | .addMBB(MBB: &*MBBI) |
413 | .add(MOs: predOps(Pred: ARMCC::AL)); |
414 | } |
415 | |
416 | // Replace the loop dec and loop end as a single instruction. |
417 | MachineInstrBuilder MI = |
418 | BuildMI(BB&: *LoopEnd->getParent(), I&: *LoopEnd, MIMD: LoopEnd->getDebugLoc(), |
419 | MCID: TII->get(Opcode: ARM::t2LoopEndDec), DestReg: DecReg) |
420 | .addReg(RegNo: PhiReg) |
421 | .add(MO: LoopEnd->getOperand(i: 1)); |
422 | (void)MI; |
423 | LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr()); |
424 | |
425 | LoopDec->eraseFromParent(); |
426 | LoopEnd->eraseFromParent(); |
427 | for (auto *MI : Copies) |
428 | MI->eraseFromParent(); |
429 | return true; |
430 | } |
431 | |
432 | // Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP |
433 | // instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP |
434 | // instruction, making the backend ARMLowOverheadLoops passes job of finding the |
435 | // VCTP operand much simpler. |
436 | bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML, |
437 | MachineDominatorTree *DT) { |
438 | LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop " |
439 | << ML->getHeader()->getName() << "\n" ); |
440 | |
441 | // Find some loop components including the LoopEnd/Dec/Start, and any VCTP's |
442 | // in the loop. |
443 | MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; |
444 | if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) |
445 | return false; |
446 | if (LoopDec != LoopEnd || (LoopStart->getOpcode() != ARM::t2DoLoopStart && |
447 | LoopStart->getOpcode() != ARM::t2WhileLoopStartLR)) |
448 | return false; |
449 | |
450 | SmallVector<MachineInstr *, 4> VCTPs; |
451 | SmallVector<MachineInstr *, 4> MVEInstrs; |
452 | for (MachineBasicBlock *BB : ML->blocks()) { |
453 | for (MachineInstr &MI : *BB) |
454 | if (isVCTP(MI: &MI)) |
455 | VCTPs.push_back(Elt: &MI); |
456 | else if (findFirstVPTPredOperandIdx(MI) != -1) |
457 | MVEInstrs.push_back(Elt: &MI); |
458 | } |
459 | |
460 | if (VCTPs.empty()) { |
461 | LLVM_DEBUG(dbgs() << " no VCTPs\n" ); |
462 | return false; |
463 | } |
464 | |
465 | // Check all VCTPs are the same. |
466 | MachineInstr *FirstVCTP = *VCTPs.begin(); |
467 | for (MachineInstr *VCTP : VCTPs) { |
468 | LLVM_DEBUG(dbgs() << " with VCTP " << *VCTP); |
469 | if (VCTP->getOpcode() != FirstVCTP->getOpcode() || |
470 | VCTP->getOperand(i: 0).getReg() != FirstVCTP->getOperand(i: 0).getReg()) { |
471 | LLVM_DEBUG(dbgs() << " VCTP's are not identical\n" ); |
472 | return false; |
473 | } |
474 | } |
475 | |
476 | // Check for the register being used can be setup before the loop. We expect |
477 | // this to be: |
478 | // $vx = ... |
479 | // loop: |
480 | // $vp = PHI [ $vx ], [ $vd ] |
481 | // .. |
482 | // $vpr = VCTP $vp |
483 | // .. |
484 | // $vd = t2SUBri $vp, #n |
485 | // .. |
486 | Register CountReg = FirstVCTP->getOperand(i: 1).getReg(); |
487 | if (!CountReg.isVirtual()) { |
488 | LLVM_DEBUG(dbgs() << " cannot determine VCTP PHI\n" ); |
489 | return false; |
490 | } |
491 | MachineInstr *Phi = LookThroughCOPY(MI: MRI->getVRegDef(Reg: CountReg), MRI); |
492 | if (!Phi || Phi->getOpcode() != TargetOpcode::PHI || |
493 | Phi->getNumOperands() != 5 || |
494 | (Phi->getOperand(i: 2).getMBB() != ML->getLoopLatch() && |
495 | Phi->getOperand(i: 4).getMBB() != ML->getLoopLatch())) { |
496 | LLVM_DEBUG(dbgs() << " cannot determine VCTP Count\n" ); |
497 | return false; |
498 | } |
499 | CountReg = Phi->getOperand(i: 2).getMBB() == ML->getLoopLatch() |
500 | ? Phi->getOperand(i: 3).getReg() |
501 | : Phi->getOperand(i: 1).getReg(); |
502 | |
503 | // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of |
504 | // the preheader and add the new CountReg to it. We attempt to place it late |
505 | // in the preheader, but may need to move that earlier based on uses. |
506 | MachineBasicBlock *MBB = LoopStart->getParent(); |
507 | MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator(); |
508 | for (MachineInstr &Use : |
509 | MRI->use_instructions(Reg: LoopStart->getOperand(i: 0).getReg())) |
510 | if ((InsertPt != MBB->end() && !DT->dominates(A: &*InsertPt, B: &Use)) || |
511 | !DT->dominates(A: ML->getHeader(), B: Use.getParent())) { |
512 | LLVM_DEBUG(dbgs() << " InsertPt could not be a terminator!\n" ); |
513 | return false; |
514 | } |
515 | |
516 | unsigned NewOpc = LoopStart->getOpcode() == ARM::t2DoLoopStart |
517 | ? ARM::t2DoLoopStartTP |
518 | : ARM::t2WhileLoopStartTP; |
519 | MachineInstrBuilder MI = |
520 | BuildMI(BB&: *MBB, I: InsertPt, MIMD: LoopStart->getDebugLoc(), MCID: TII->get(Opcode: NewOpc)) |
521 | .add(MO: LoopStart->getOperand(i: 0)) |
522 | .add(MO: LoopStart->getOperand(i: 1)) |
523 | .addReg(RegNo: CountReg); |
524 | if (NewOpc == ARM::t2WhileLoopStartTP) |
525 | MI.add(MO: LoopStart->getOperand(i: 2)); |
526 | LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << " with " |
527 | << *MI.getInstr()); |
528 | MRI->constrainRegClass(Reg: CountReg, RC: &ARM::rGPRRegClass); |
529 | LoopStart->eraseFromParent(); |
530 | |
531 | if (SetLRPredicate) { |
532 | // Each instruction in the loop needs to be using LR as the predicate from |
533 | // the Phi as the predicate. |
534 | Register LR = LoopPhi->getOperand(i: 0).getReg(); |
535 | for (MachineInstr *MI : MVEInstrs) { |
536 | int Idx = findFirstVPTPredOperandIdx(MI: *MI); |
537 | MI->getOperand(i: Idx + 2).setReg(LR); |
538 | } |
539 | } |
540 | |
541 | return true; |
542 | } |
543 | |
544 | // Returns true if Opcode is any VCMP Opcode. |
545 | static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; } |
546 | |
547 | // Returns true if a VCMP with this Opcode can have its operands swapped. |
548 | // There is 2 kind of VCMP that can't have their operands swapped: Float VCMPs, |
549 | // and VCMPr instructions (since the r is always on the right). |
550 | static bool CanHaveSwappedOperands(unsigned Opcode) { |
551 | switch (Opcode) { |
552 | default: |
553 | return true; |
554 | case ARM::MVE_VCMPf32: |
555 | case ARM::MVE_VCMPf16: |
556 | case ARM::MVE_VCMPf32r: |
557 | case ARM::MVE_VCMPf16r: |
558 | case ARM::MVE_VCMPi8r: |
559 | case ARM::MVE_VCMPi16r: |
560 | case ARM::MVE_VCMPi32r: |
561 | case ARM::MVE_VCMPu8r: |
562 | case ARM::MVE_VCMPu16r: |
563 | case ARM::MVE_VCMPu32r: |
564 | case ARM::MVE_VCMPs8r: |
565 | case ARM::MVE_VCMPs16r: |
566 | case ARM::MVE_VCMPs32r: |
567 | return false; |
568 | } |
569 | } |
570 | |
571 | // Returns the CondCode of a VCMP Instruction. |
572 | static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) { |
573 | assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP" ); |
574 | return ARMCC::CondCodes(Instr.getOperand(i: 3).getImm()); |
575 | } |
576 | |
577 | // Returns true if Cond is equivalent to a VPNOT instruction on the result of |
578 | // Prev. Cond and Prev must be VCMPs. |
579 | static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) { |
580 | assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode())); |
581 | |
582 | // Opcodes must match. |
583 | if (Cond.getOpcode() != Prev.getOpcode()) |
584 | return false; |
585 | |
586 | MachineOperand &CondOP1 = Cond.getOperand(i: 1), &CondOP2 = Cond.getOperand(i: 2); |
587 | MachineOperand &PrevOP1 = Prev.getOperand(i: 1), &PrevOP2 = Prev.getOperand(i: 2); |
588 | |
589 | // If the VCMP has the opposite condition with the same operands, we can |
590 | // replace it with a VPNOT |
591 | ARMCC::CondCodes ExpectedCode = GetCondCode(Instr&: Cond); |
592 | ExpectedCode = ARMCC::getOppositeCondition(CC: ExpectedCode); |
593 | if (ExpectedCode == GetCondCode(Instr&: Prev)) |
594 | if (CondOP1.isIdenticalTo(Other: PrevOP1) && CondOP2.isIdenticalTo(Other: PrevOP2)) |
595 | return true; |
596 | // Check again with operands swapped if possible |
597 | if (!CanHaveSwappedOperands(Opcode: Cond.getOpcode())) |
598 | return false; |
599 | ExpectedCode = ARMCC::getSwappedCondition(CC: ExpectedCode); |
600 | return ExpectedCode == GetCondCode(Instr&: Prev) && CondOP1.isIdenticalTo(Other: PrevOP2) && |
601 | CondOP2.isIdenticalTo(Other: PrevOP1); |
602 | } |
603 | |
604 | // Returns true if Instr writes to VCCR. |
605 | static bool IsWritingToVCCR(MachineInstr &Instr) { |
606 | if (Instr.getNumOperands() == 0) |
607 | return false; |
608 | MachineOperand &Dst = Instr.getOperand(i: 0); |
609 | if (!Dst.isReg()) |
610 | return false; |
611 | Register DstReg = Dst.getReg(); |
612 | if (!DstReg.isVirtual()) |
613 | return false; |
614 | MachineRegisterInfo &RegInfo = Instr.getMF()->getRegInfo(); |
615 | const TargetRegisterClass *RegClass = RegInfo.getRegClassOrNull(Reg: DstReg); |
616 | return RegClass && (RegClass->getID() == ARM::VCCRRegClassID); |
617 | } |
618 | |
619 | // Transforms |
620 | // <Instr that uses %A ('User' Operand)> |
621 | // Into |
622 | // %K = VPNOT %Target |
623 | // <Instr that uses %K ('User' Operand)> |
624 | // And returns the newly inserted VPNOT. |
625 | // This optimization is done in the hopes of preventing spills/reloads of VPR by |
626 | // reducing the number of VCCR values with overlapping lifetimes. |
627 | MachineInstr &MVETPAndVPTOptimisations::ReplaceRegisterUseWithVPNOT( |
628 | MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User, |
629 | Register Target) { |
630 | Register NewResult = MRI->createVirtualRegister(RegClass: MRI->getRegClass(Reg: Target)); |
631 | |
632 | MachineInstrBuilder MIBuilder = |
633 | BuildMI(BB&: MBB, I: &Instr, MIMD: Instr.getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VPNOT)) |
634 | .addDef(RegNo: NewResult) |
635 | .addReg(RegNo: Target); |
636 | addUnpredicatedMveVpredNOp(MIB&: MIBuilder); |
637 | |
638 | // Make the user use NewResult instead, and clear its kill flag. |
639 | User.setReg(NewResult); |
640 | User.setIsKill(false); |
641 | |
642 | LLVM_DEBUG(dbgs() << " Inserting VPNOT (for spill prevention): " ; |
643 | MIBuilder.getInstr()->dump()); |
644 | |
645 | return *MIBuilder.getInstr(); |
646 | } |
647 | |
648 | // Moves a VPNOT before its first user if an instruction that uses Reg is found |
649 | // in-between the VPNOT and its user. |
650 | // Returns true if there is at least one user of the VPNOT in the block. |
651 | static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB, |
652 | MachineBasicBlock::iterator Iter, |
653 | Register Reg) { |
654 | assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!" ); |
655 | assert(getVPTInstrPredicate(*Iter) == ARMVCC::None && |
656 | "The VPNOT cannot be predicated" ); |
657 | |
658 | MachineInstr &VPNOT = *Iter; |
659 | Register VPNOTResult = VPNOT.getOperand(i: 0).getReg(); |
660 | Register VPNOTOperand = VPNOT.getOperand(i: 1).getReg(); |
661 | |
662 | // Whether the VPNOT will need to be moved, and whether we found a user of the |
663 | // VPNOT. |
664 | bool MustMove = false, HasUser = false; |
665 | MachineOperand *VPNOTOperandKiller = nullptr; |
666 | for (; Iter != MBB.end(); ++Iter) { |
667 | if (MachineOperand *MO = |
668 | Iter->findRegisterUseOperand(Reg: VPNOTOperand, /*TRI=*/nullptr, |
669 | /*isKill*/ true)) { |
670 | // If we find the operand that kills the VPNOTOperand's result, save it. |
671 | VPNOTOperandKiller = MO; |
672 | } |
673 | |
674 | if (Iter->findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != -1) { |
675 | MustMove = true; |
676 | continue; |
677 | } |
678 | |
679 | if (Iter->findRegisterUseOperandIdx(Reg: VPNOTResult, /*TRI=*/nullptr) == -1) |
680 | continue; |
681 | |
682 | HasUser = true; |
683 | if (!MustMove) |
684 | break; |
685 | |
686 | // Move the VPNOT right before Iter |
687 | LLVM_DEBUG(dbgs() << "Moving: " ; VPNOT.dump(); dbgs() << " Before: " ; |
688 | Iter->dump()); |
689 | MBB.splice(Where: Iter, Other: &MBB, From: VPNOT.getIterator()); |
690 | // If we move the instr, and its operand was killed earlier, remove the kill |
691 | // flag. |
692 | if (VPNOTOperandKiller) |
693 | VPNOTOperandKiller->setIsKill(false); |
694 | |
695 | break; |
696 | } |
697 | return HasUser; |
698 | } |
699 | |
700 | // This optimisation attempts to reduce the number of overlapping lifetimes of |
701 | // VCCR values by replacing uses of old VCCR values with VPNOTs. For example, |
702 | // this replaces |
703 | // %A:vccr = (something) |
704 | // %B:vccr = VPNOT %A |
705 | // %Foo = (some op that uses %B) |
706 | // %Bar = (some op that uses %A) |
707 | // With |
708 | // %A:vccr = (something) |
709 | // %B:vccr = VPNOT %A |
710 | // %Foo = (some op that uses %B) |
711 | // %TMP2:vccr = VPNOT %B |
712 | // %Bar = (some op that uses %A) |
713 | bool MVETPAndVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) { |
714 | MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end(); |
715 | SmallVector<MachineInstr *, 4> DeadInstructions; |
716 | bool Modified = false; |
717 | |
718 | while (Iter != End) { |
719 | Register VCCRValue, OppositeVCCRValue; |
720 | // The first loop looks for 2 unpredicated instructions: |
721 | // %A:vccr = (instr) ; A is stored in VCCRValue |
722 | // %B:vccr = VPNOT %A ; B is stored in OppositeVCCRValue |
723 | for (; Iter != End; ++Iter) { |
724 | // We're only interested in unpredicated instructions that write to VCCR. |
725 | if (!IsWritingToVCCR(Instr&: *Iter) || |
726 | getVPTInstrPredicate(MI: *Iter) != ARMVCC::None) |
727 | continue; |
728 | Register Dst = Iter->getOperand(i: 0).getReg(); |
729 | |
730 | // If we already have a VCCRValue, and this is a VPNOT on VCCRValue, we've |
731 | // found what we were looking for. |
732 | if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT && |
733 | Iter->findRegisterUseOperandIdx(Reg: VCCRValue, /*TRI=*/nullptr) != -1) { |
734 | // Move the VPNOT closer to its first user if needed, and ignore if it |
735 | // has no users. |
736 | if (!MoveVPNOTBeforeFirstUser(MBB, Iter, Reg: VCCRValue)) |
737 | continue; |
738 | |
739 | OppositeVCCRValue = Dst; |
740 | ++Iter; |
741 | break; |
742 | } |
743 | |
744 | // Else, just set VCCRValue. |
745 | VCCRValue = Dst; |
746 | } |
747 | |
748 | // If the first inner loop didn't find anything, stop here. |
749 | if (Iter == End) |
750 | break; |
751 | |
752 | assert(VCCRValue && OppositeVCCRValue && |
753 | "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop " |
754 | "stopped before the end of the block!" ); |
755 | assert(VCCRValue != OppositeVCCRValue && |
756 | "VCCRValue should not be equal to OppositeVCCRValue!" ); |
757 | |
758 | // LastVPNOTResult always contains the same value as OppositeVCCRValue. |
759 | Register LastVPNOTResult = OppositeVCCRValue; |
760 | |
761 | // This second loop tries to optimize the remaining instructions. |
762 | for (; Iter != End; ++Iter) { |
763 | bool IsInteresting = false; |
764 | |
765 | if (MachineOperand *MO = |
766 | Iter->findRegisterUseOperand(Reg: VCCRValue, /*TRI=*/nullptr)) { |
767 | IsInteresting = true; |
768 | |
769 | // - If the instruction is a VPNOT, it can be removed, and we can just |
770 | // replace its uses with LastVPNOTResult. |
771 | // - Else, insert a new VPNOT on LastVPNOTResult to recompute VCCRValue. |
772 | if (Iter->getOpcode() == ARM::MVE_VPNOT) { |
773 | Register Result = Iter->getOperand(i: 0).getReg(); |
774 | |
775 | MRI->replaceRegWith(FromReg: Result, ToReg: LastVPNOTResult); |
776 | DeadInstructions.push_back(Elt: &*Iter); |
777 | Modified = true; |
778 | |
779 | LLVM_DEBUG(dbgs() |
780 | << "Replacing all uses of '" << printReg(Result) |
781 | << "' with '" << printReg(LastVPNOTResult) << "'\n" ); |
782 | } else { |
783 | MachineInstr &VPNOT = |
784 | ReplaceRegisterUseWithVPNOT(MBB, Instr&: *Iter, User&: *MO, Target: LastVPNOTResult); |
785 | Modified = true; |
786 | |
787 | LastVPNOTResult = VPNOT.getOperand(i: 0).getReg(); |
788 | std::swap(a&: VCCRValue, b&: OppositeVCCRValue); |
789 | |
790 | LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue) |
791 | << "' with '" << printReg(LastVPNOTResult) |
792 | << "' in instr: " << *Iter); |
793 | } |
794 | } else { |
795 | // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult |
796 | // instead as they contain the same value. |
797 | if (MachineOperand *MO = Iter->findRegisterUseOperand( |
798 | Reg: OppositeVCCRValue, /*TRI=*/nullptr)) { |
799 | IsInteresting = true; |
800 | |
801 | // This is pointless if LastVPNOTResult == OppositeVCCRValue. |
802 | if (LastVPNOTResult != OppositeVCCRValue) { |
803 | LLVM_DEBUG(dbgs() << "Replacing usage of '" |
804 | << printReg(OppositeVCCRValue) << "' with '" |
805 | << printReg(LastVPNOTResult) << " for instr: " ; |
806 | Iter->dump()); |
807 | MO->setReg(LastVPNOTResult); |
808 | Modified = true; |
809 | } |
810 | |
811 | MO->setIsKill(false); |
812 | } |
813 | |
814 | // If this is an unpredicated VPNOT on |
815 | // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it. |
816 | if (Iter->getOpcode() == ARM::MVE_VPNOT && |
817 | getVPTInstrPredicate(MI: *Iter) == ARMVCC::None) { |
818 | Register VPNOTOperand = Iter->getOperand(i: 1).getReg(); |
819 | if (VPNOTOperand == LastVPNOTResult || |
820 | VPNOTOperand == OppositeVCCRValue) { |
821 | IsInteresting = true; |
822 | |
823 | std::swap(a&: VCCRValue, b&: OppositeVCCRValue); |
824 | LastVPNOTResult = Iter->getOperand(i: 0).getReg(); |
825 | } |
826 | } |
827 | } |
828 | |
829 | // If this instruction was not interesting, and it writes to VCCR, stop. |
830 | if (!IsInteresting && IsWritingToVCCR(Instr&: *Iter)) |
831 | break; |
832 | } |
833 | } |
834 | |
835 | for (MachineInstr *DeadInstruction : DeadInstructions) |
836 | DeadInstruction->eraseFromParent(); |
837 | |
838 | return Modified; |
839 | } |
840 | |
841 | // This optimisation replaces VCMPs with VPNOTs when they are equivalent. |
842 | bool MVETPAndVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) { |
843 | SmallVector<MachineInstr *, 4> DeadInstructions; |
844 | |
845 | // The last VCMP that we have seen and that couldn't be replaced. |
846 | // This is reset when an instruction that writes to VCCR/VPR is found, or when |
847 | // a VCMP is replaced with a VPNOT. |
848 | // We'll only replace VCMPs with VPNOTs when this is not null, and when the |
849 | // current VCMP is the opposite of PrevVCMP. |
850 | MachineInstr *PrevVCMP = nullptr; |
851 | // If we find an instruction that kills the result of PrevVCMP, we save the |
852 | // operand here to remove the kill flag in case we need to use PrevVCMP's |
853 | // result. |
854 | MachineOperand *PrevVCMPResultKiller = nullptr; |
855 | |
856 | for (MachineInstr &Instr : MBB.instrs()) { |
857 | if (PrevVCMP) { |
858 | if (MachineOperand *MO = |
859 | Instr.findRegisterUseOperand(Reg: PrevVCMP->getOperand(i: 0).getReg(), |
860 | /*TRI=*/nullptr, /*isKill*/ true)) { |
861 | // If we come accross the instr that kills PrevVCMP's result, record it |
862 | // so we can remove the kill flag later if we need to. |
863 | PrevVCMPResultKiller = MO; |
864 | } |
865 | } |
866 | |
867 | // Ignore predicated instructions. |
868 | if (getVPTInstrPredicate(MI: Instr) != ARMVCC::None) |
869 | continue; |
870 | |
871 | // Only look at VCMPs |
872 | if (!IsVCMP(Opcode: Instr.getOpcode())) { |
873 | // If the instruction writes to VCCR, forget the previous VCMP. |
874 | if (IsWritingToVCCR(Instr)) |
875 | PrevVCMP = nullptr; |
876 | continue; |
877 | } |
878 | |
879 | if (!PrevVCMP || !IsVPNOTEquivalent(Cond&: Instr, Prev&: *PrevVCMP)) { |
880 | PrevVCMP = &Instr; |
881 | continue; |
882 | } |
883 | |
884 | // The register containing the result of the VCMP that we're going to |
885 | // replace. |
886 | Register PrevVCMPResultReg = PrevVCMP->getOperand(i: 0).getReg(); |
887 | |
888 | // Build a VPNOT to replace the VCMP, reusing its operands. |
889 | MachineInstrBuilder MIBuilder = |
890 | BuildMI(BB&: MBB, I: &Instr, MIMD: Instr.getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VPNOT)) |
891 | .add(MO: Instr.getOperand(i: 0)) |
892 | .addReg(RegNo: PrevVCMPResultReg); |
893 | addUnpredicatedMveVpredNOp(MIB&: MIBuilder); |
894 | LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): " ; |
895 | MIBuilder.getInstr()->dump(); dbgs() << " Removed VCMP: " ; |
896 | Instr.dump()); |
897 | |
898 | // If we found an instruction that uses, and kills PrevVCMP's result, |
899 | // remove the kill flag. |
900 | if (PrevVCMPResultKiller) |
901 | PrevVCMPResultKiller->setIsKill(false); |
902 | |
903 | // Finally, mark the old VCMP for removal and reset |
904 | // PrevVCMP/PrevVCMPResultKiller. |
905 | DeadInstructions.push_back(Elt: &Instr); |
906 | PrevVCMP = nullptr; |
907 | PrevVCMPResultKiller = nullptr; |
908 | } |
909 | |
910 | for (MachineInstr *DeadInstruction : DeadInstructions) |
911 | DeadInstruction->eraseFromParent(); |
912 | |
913 | return !DeadInstructions.empty(); |
914 | } |
915 | |
916 | bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB, |
917 | MachineDominatorTree *DT) { |
918 | // Scan through the block, looking for instructions that use constants moves |
919 | // into VPR that are the negative of one another. These are expected to be |
920 | // COPY's to VCCRRegClass, from a t2MOVi or t2MOVi16. The last seen constant |
921 | // mask is kept it or and VPNOT's of it are added or reused as we scan through |
922 | // the function. |
923 | unsigned LastVPTImm = 0; |
924 | Register LastVPTReg = 0; |
925 | SmallSet<MachineInstr *, 4> DeadInstructions; |
926 | |
927 | for (MachineInstr &Instr : MBB.instrs()) { |
928 | // Look for predicated MVE instructions. |
929 | int PIdx = llvm::findFirstVPTPredOperandIdx(MI: Instr); |
930 | if (PIdx == -1) |
931 | continue; |
932 | Register VPR = Instr.getOperand(i: PIdx + 1).getReg(); |
933 | if (!VPR.isVirtual()) |
934 | continue; |
935 | |
936 | // From that we are looking for an instruction like %11:vccr = COPY %9:rgpr. |
937 | MachineInstr *Copy = MRI->getVRegDef(Reg: VPR); |
938 | if (!Copy || Copy->getOpcode() != TargetOpcode::COPY || |
939 | !Copy->getOperand(i: 1).getReg().isVirtual() || |
940 | MRI->getRegClass(Reg: Copy->getOperand(i: 1).getReg()) == &ARM::VCCRRegClass) { |
941 | LastVPTReg = 0; |
942 | continue; |
943 | } |
944 | Register GPR = Copy->getOperand(i: 1).getReg(); |
945 | |
946 | // Find the Immediate used by the copy. |
947 | auto getImm = [&](Register GPR) -> unsigned { |
948 | MachineInstr *Def = MRI->getVRegDef(Reg: GPR); |
949 | if (Def && (Def->getOpcode() == ARM::t2MOVi || |
950 | Def->getOpcode() == ARM::t2MOVi16)) |
951 | return Def->getOperand(i: 1).getImm(); |
952 | return -1U; |
953 | }; |
954 | unsigned Imm = getImm(GPR); |
955 | if (Imm == -1U) { |
956 | LastVPTReg = 0; |
957 | continue; |
958 | } |
959 | |
960 | unsigned NotImm = ~Imm & 0xffff; |
961 | if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) { |
962 | MRI->clearKillFlags(Reg: LastVPTReg); |
963 | Instr.getOperand(i: PIdx + 1).setReg(LastVPTReg); |
964 | if (MRI->use_empty(RegNo: VPR)) { |
965 | DeadInstructions.insert(Ptr: Copy); |
966 | if (MRI->hasOneUse(RegNo: GPR)) |
967 | DeadInstructions.insert(Ptr: MRI->getVRegDef(Reg: GPR)); |
968 | } |
969 | LLVM_DEBUG(dbgs() << "Reusing predicate: in " << Instr); |
970 | VPR = LastVPTReg; |
971 | } else if (LastVPTReg != 0 && LastVPTImm == NotImm) { |
972 | // We have found the not of a previous constant. Create a VPNot of the |
973 | // earlier predicate reg and use it instead of the copy. |
974 | Register NewVPR = MRI->createVirtualRegister(RegClass: &ARM::VCCRRegClass); |
975 | auto VPNot = BuildMI(BB&: MBB, I: &Instr, MIMD: Instr.getDebugLoc(), |
976 | MCID: TII->get(Opcode: ARM::MVE_VPNOT), DestReg: NewVPR) |
977 | .addReg(RegNo: LastVPTReg); |
978 | addUnpredicatedMveVpredNOp(MIB&: VPNot); |
979 | |
980 | // Use the new register and check if the def is now dead. |
981 | Instr.getOperand(i: PIdx + 1).setReg(NewVPR); |
982 | if (MRI->use_empty(RegNo: VPR)) { |
983 | DeadInstructions.insert(Ptr: Copy); |
984 | if (MRI->hasOneUse(RegNo: GPR)) |
985 | DeadInstructions.insert(Ptr: MRI->getVRegDef(Reg: GPR)); |
986 | } |
987 | MRI->clearKillFlags(Reg: LastVPTReg); |
988 | LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << " to replace use at " |
989 | << Instr); |
990 | VPR = NewVPR; |
991 | } |
992 | |
993 | LastVPTImm = Imm; |
994 | LastVPTReg = VPR; |
995 | } |
996 | |
997 | for (MachineInstr *DI : DeadInstructions) |
998 | DI->eraseFromParent(); |
999 | |
1000 | return !DeadInstructions.empty(); |
1001 | } |
1002 | |
1003 | // Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a |
1004 | // somewhat blunt approximation to allow tail predicated with vpsel |
1005 | // instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly |
1006 | // different semantics under tail predication. Until that is modelled we just |
1007 | // convert to a VMOVT (via a predicated VORR) instead. |
1008 | bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) { |
1009 | bool HasVCTP = false; |
1010 | SmallVector<MachineInstr *, 4> DeadInstructions; |
1011 | |
1012 | for (MachineInstr &MI : MBB.instrs()) { |
1013 | if (isVCTP(MI: &MI)) { |
1014 | HasVCTP = true; |
1015 | continue; |
1016 | } |
1017 | |
1018 | if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL) |
1019 | continue; |
1020 | |
1021 | MachineInstrBuilder MIBuilder = |
1022 | BuildMI(BB&: MBB, I: &MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VORR)) |
1023 | .add(MO: MI.getOperand(i: 0)) |
1024 | .add(MO: MI.getOperand(i: 1)) |
1025 | .add(MO: MI.getOperand(i: 1)) |
1026 | .addImm(Val: ARMVCC::Then) |
1027 | .add(MO: MI.getOperand(i: 4)) |
1028 | .add(MO: MI.getOperand(i: 5)) |
1029 | .add(MO: MI.getOperand(i: 2)); |
1030 | // Silence unused variable warning in release builds. |
1031 | (void)MIBuilder; |
1032 | LLVM_DEBUG(dbgs() << "Replacing VPSEL: " ; MI.dump(); |
1033 | dbgs() << " with VMOVT: " ; MIBuilder.getInstr()->dump()); |
1034 | DeadInstructions.push_back(Elt: &MI); |
1035 | } |
1036 | |
1037 | for (MachineInstr *DeadInstruction : DeadInstructions) |
1038 | DeadInstruction->eraseFromParent(); |
1039 | |
1040 | return !DeadInstructions.empty(); |
1041 | } |
1042 | |
1043 | // Add a registry allocation hint for t2DoLoopStart to hint it towards LR, as |
1044 | // the instruction may be removable as a noop. |
1045 | bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) { |
1046 | bool Changed = false; |
1047 | for (MachineInstr &MI : MBB.instrs()) { |
1048 | if (MI.getOpcode() != ARM::t2DoLoopStart) |
1049 | continue; |
1050 | Register R = MI.getOperand(i: 1).getReg(); |
1051 | MachineFunction *MF = MI.getParent()->getParent(); |
1052 | MF->getRegInfo().setRegAllocationHint(VReg: R, Type: ARMRI::RegLR, PrefReg: 0); |
1053 | Changed = true; |
1054 | } |
1055 | return Changed; |
1056 | } |
1057 | |
1058 | bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) { |
1059 | const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>(); |
1060 | |
1061 | if (!STI.isThumb2() || !STI.hasLOB()) |
1062 | return false; |
1063 | |
1064 | TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); |
1065 | MRI = &Fn.getRegInfo(); |
1066 | MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); |
1067 | MachineDominatorTree *DT = |
1068 | &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); |
1069 | |
1070 | LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n" |
1071 | << "********** Function: " << Fn.getName() << '\n'); |
1072 | |
1073 | bool Modified = false; |
1074 | for (MachineLoop *ML : MLI->getLoopsInPreorder()) { |
1075 | Modified |= LowerWhileLoopStart(ML); |
1076 | Modified |= MergeLoopEnd(ML); |
1077 | Modified |= ConvertTailPredLoop(ML, DT); |
1078 | } |
1079 | |
1080 | for (MachineBasicBlock &MBB : Fn) { |
1081 | Modified |= HintDoLoopStartReg(MBB); |
1082 | Modified |= ReplaceConstByVPNOTs(MBB, DT); |
1083 | Modified |= ReplaceVCMPsByVPNOTs(MBB); |
1084 | Modified |= ReduceOldVCCRValueUses(MBB); |
1085 | Modified |= ConvertVPSEL(MBB); |
1086 | } |
1087 | |
1088 | LLVM_DEBUG(dbgs() << "**************************************\n" ); |
1089 | return Modified; |
1090 | } |
1091 | |
1092 | /// createMVETPAndVPTOptimisationsPass |
1093 | FunctionPass *llvm::createMVETPAndVPTOptimisationsPass() { |
1094 | return new MVETPAndVPTOptimisations(); |
1095 | } |
1096 | |