1 | //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// Finalize v8.1-m low-overhead loops by converting the associated pseudo |
10 | /// instructions into machine operations. |
11 | /// The expectation is that the loop contains three pseudo instructions: |
12 | /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop |
13 | /// form should be in the preheader, whereas the while form should be in the |
14 | /// preheaders only predecessor. |
15 | /// - t2LoopDec - placed within in the loop body. |
16 | /// - t2LoopEnd - the loop latch terminator. |
17 | /// |
18 | /// In addition to this, we also look for the presence of the VCTP instruction, |
19 | /// which determines whether we can generated the tail-predicated low-overhead |
20 | /// loop form. |
21 | /// |
22 | /// Assumptions and Dependencies: |
23 | /// Low-overhead loops are constructed and executed using a setup instruction: |
24 | /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP. |
25 | /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range |
26 | /// but fixed polarity: WLS can only branch forwards and LE can only branch |
27 | /// backwards. These restrictions mean that this pass is dependent upon block |
28 | /// layout and block sizes, which is why it's the last pass to run. The same is |
29 | /// true for ConstantIslands, but this pass does not increase the size of the |
30 | /// basic blocks, nor does it change the CFG. Instructions are mainly removed |
31 | /// during the transform and pseudo instructions are replaced by real ones. In |
32 | /// some cases, when we have to revert to a 'normal' loop, we have to introduce |
33 | /// multiple instructions for a single pseudo (see RevertWhile and |
34 | /// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd |
35 | /// are defined to be as large as this maximum sequence of replacement |
36 | /// instructions. |
37 | /// |
38 | /// A note on VPR.P0 (the lane mask): |
39 | /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a |
40 | /// "VPT Active" context (which includes low-overhead loops and vpt blocks). |
41 | /// They will simply "and" the result of their calculation with the current |
42 | /// value of VPR.P0. You can think of it like this: |
43 | /// \verbatim |
44 | /// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs |
45 | /// VPR.P0 &= Value |
46 | /// else |
47 | /// VPR.P0 = Value |
48 | /// \endverbatim |
49 | /// When we're inside the low-overhead loop (between DLSTP and LETP), we always |
50 | /// fall in the "VPT active" case, so we can consider that all VPR writes by |
51 | /// one of those instruction is actually a "and". |
52 | //===----------------------------------------------------------------------===// |
53 | |
54 | #include "ARM.h" |
55 | #include "ARMBaseInstrInfo.h" |
56 | #include "ARMBasicBlockInfo.h" |
57 | #include "ARMSubtarget.h" |
58 | #include "MVETailPredUtils.h" |
59 | #include "Thumb2InstrInfo.h" |
60 | #include "llvm/ADT/SetVector.h" |
61 | #include "llvm/CodeGen/LivePhysRegs.h" |
62 | #include "llvm/CodeGen/MachineFrameInfo.h" |
63 | #include "llvm/CodeGen/MachineFunctionPass.h" |
64 | #include "llvm/CodeGen/MachineLoopInfo.h" |
65 | #include "llvm/CodeGen/MachineLoopUtils.h" |
66 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
67 | #include "llvm/CodeGen/Passes.h" |
68 | #include "llvm/CodeGen/ReachingDefAnalysis.h" |
69 | #include "llvm/MC/MCInstrDesc.h" |
70 | |
71 | using namespace llvm; |
72 | |
73 | #define DEBUG_TYPE "arm-low-overhead-loops" |
74 | #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" |
75 | |
76 | static cl::opt<bool> |
77 | DisableTailPredication("arm-loloops-disable-tailpred" , cl::Hidden, |
78 | cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass" ), |
79 | cl::init(Val: false)); |
80 | |
81 | static cl::opt<bool> |
82 | DisableOmitDLS("arm-disable-omit-dls" , cl::Hidden, |
83 | cl::desc("Disable omitting 'dls lr, lr' instructions" ), |
84 | cl::init(Val: false)); |
85 | |
86 | static bool isVectorPredicated(MachineInstr *MI) { |
87 | int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI); |
88 | return PIdx != -1 && MI->getOperand(i: PIdx + 1).getReg() == ARM::VPR; |
89 | } |
90 | |
91 | static bool isVectorPredicate(MachineInstr *MI) { |
92 | return MI->findRegisterDefOperandIdx(Reg: ARM::VPR, /*TRI=*/nullptr) != -1; |
93 | } |
94 | |
95 | static bool hasVPRUse(MachineInstr &MI) { |
96 | return MI.findRegisterUseOperandIdx(Reg: ARM::VPR, /*TRI=*/nullptr) != -1; |
97 | } |
98 | |
99 | static bool isDomainMVE(MachineInstr *MI) { |
100 | uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask; |
101 | return Domain == ARMII::DomainMVE; |
102 | } |
103 | |
104 | static int getVecSize(const MachineInstr &MI) { |
105 | const MCInstrDesc &MCID = MI.getDesc(); |
106 | uint64_t Flags = MCID.TSFlags; |
107 | return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift; |
108 | } |
109 | |
110 | static bool shouldInspect(MachineInstr &MI) { |
111 | if (MI.isDebugInstr()) |
112 | return false; |
113 | return isDomainMVE(MI: &MI) || isVectorPredicate(MI: &MI) || hasVPRUse(MI); |
114 | } |
115 | |
116 | static bool isHorizontalReduction(const MachineInstr &MI) { |
117 | const MCInstrDesc &MCID = MI.getDesc(); |
118 | uint64_t Flags = MCID.TSFlags; |
119 | return (Flags & ARMII::HorizontalReduction) != 0; |
120 | } |
121 | |
122 | namespace { |
123 | |
124 | using InstSet = SmallPtrSetImpl<MachineInstr *>; |
125 | |
126 | class PostOrderLoopTraversal { |
127 | MachineLoop &ML; |
128 | MachineLoopInfo &MLI; |
129 | SmallPtrSet<MachineBasicBlock*, 4> Visited; |
130 | SmallVector<MachineBasicBlock*, 4> Order; |
131 | |
132 | public: |
133 | PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI) |
134 | : ML(ML), MLI(MLI) { } |
135 | |
136 | const SmallVectorImpl<MachineBasicBlock*> &getOrder() const { |
137 | return Order; |
138 | } |
139 | |
140 | // Visit all the blocks within the loop, as well as exit blocks and any |
141 | // blocks properly dominating the header. |
142 | void ProcessLoop() { |
143 | std::function<void(MachineBasicBlock *)> Search = |
144 | [this, &Search](MachineBasicBlock *MBB) -> void { |
145 | if (!Visited.insert(Ptr: MBB).second) |
146 | return; |
147 | |
148 | for (auto *Succ : MBB->successors()) { |
149 | if (!ML.contains(BB: Succ)) |
150 | continue; |
151 | Search(Succ); |
152 | } |
153 | Order.push_back(Elt: MBB); |
154 | }; |
155 | |
156 | // Insert exit blocks. |
157 | SmallVector<MachineBasicBlock*, 2> ExitBlocks; |
158 | ML.getExitBlocks(ExitBlocks); |
159 | append_range(C&: Order, R&: ExitBlocks); |
160 | |
161 | // Then add the loop body. |
162 | Search(ML.getHeader()); |
163 | |
164 | // Then try the preheader and its predecessors. |
165 | std::function<void(MachineBasicBlock*)> GetPredecessor = |
166 | [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void { |
167 | Order.push_back(Elt: MBB); |
168 | if (MBB->pred_size() == 1) |
169 | GetPredecessor(*MBB->pred_begin()); |
170 | }; |
171 | |
172 | if (auto * = ML.getLoopPreheader()) |
173 | GetPredecessor(Preheader); |
174 | else if (auto * = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true)) |
175 | GetPredecessor(Preheader); |
176 | } |
177 | }; |
178 | |
179 | class VPTBlock { |
180 | SmallVector<MachineInstr *, 4> Insts; |
181 | |
182 | public: |
183 | VPTBlock(MachineInstr *MI) { Insts.push_back(Elt: MI); } |
184 | |
185 | // Have we found an instruction within the block which defines the vpr? If |
186 | // so, not all the instructions in the block will have the same predicate. |
187 | bool hasUniformPredicate() { return getDivergent() == nullptr; } |
188 | |
189 | // If it exists, return the first internal instruction which modifies the |
190 | // VPR. |
191 | MachineInstr *getDivergent() { |
192 | SmallVectorImpl<MachineInstr *> &Insts = getInsts(); |
193 | for (unsigned i = 1; i < Insts.size(); ++i) { |
194 | MachineInstr *Next = Insts[i]; |
195 | if (isVectorPredicate(MI: Next)) |
196 | return Next; // Found an instruction altering the vpr. |
197 | } |
198 | return nullptr; |
199 | } |
200 | |
201 | void insert(MachineInstr *MI) { |
202 | Insts.push_back(Elt: MI); |
203 | // VPT/VPST + 4 predicated instructions. |
204 | assert(Insts.size() <= 5 && "Too many instructions in VPT block!" ); |
205 | } |
206 | |
207 | bool containsVCTP() const { return llvm::any_of(Range: Insts, P: isVCTP); } |
208 | |
209 | unsigned size() const { return Insts.size(); } |
210 | SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; } |
211 | }; |
212 | |
213 | // Represent the current state of the VPR and hold all instances which |
214 | // represent a VPT block, which is a list of instructions that begins with a |
215 | // VPT/VPST and has a maximum of four proceeding instructions. All |
216 | // instructions within the block are predicated upon the vpr and we allow |
217 | // instructions to define the vpr within in the block too. |
218 | class VPTState { |
219 | friend struct LowOverheadLoop; |
220 | |
221 | SmallVector<VPTBlock, 4> Blocks; |
222 | SetVector<MachineInstr *> CurrentPredicates; |
223 | std::map<MachineInstr *, SetVector<MachineInstr *>> PredicatedInsts; |
224 | |
225 | void CreateVPTBlock(MachineInstr *MI) { |
226 | assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR)) |
227 | && "Can't begin VPT without predicate" ); |
228 | Blocks.emplace_back(Args&: MI); |
229 | // The execution of MI is predicated upon the current set of instructions |
230 | // that are AND'ed together to form the VPR predicate value. In the case |
231 | // that MI is a VPT, CurrentPredicates will also just be MI. |
232 | PredicatedInsts[MI] = CurrentPredicates; |
233 | } |
234 | |
235 | void addInst(MachineInstr *MI) { |
236 | Blocks.back().insert(MI); |
237 | PredicatedInsts[MI] = CurrentPredicates; |
238 | } |
239 | |
240 | void addPredicate(MachineInstr *MI) { |
241 | LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI); |
242 | CurrentPredicates.insert(X: MI); |
243 | } |
244 | |
245 | void resetPredicate(MachineInstr *MI) { |
246 | LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI); |
247 | CurrentPredicates.clear(); |
248 | CurrentPredicates.insert(X: MI); |
249 | } |
250 | |
251 | public: |
252 | // Return whether the given instruction is predicated upon a VCTP. |
253 | bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) { |
254 | SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]; |
255 | if (Exclusive && Predicates.size() != 1) |
256 | return false; |
257 | // We do not know how to convert an else predicate of a VCTP. |
258 | if (getVPTInstrPredicate(MI: *MI) == ARMVCC::Else) |
259 | return false; |
260 | return llvm::any_of(Range&: Predicates, P: isVCTP); |
261 | } |
262 | |
263 | // Is the VPST, controlling the block entry, predicated upon a VCTP. |
264 | bool isEntryPredicatedOnVCTP(VPTBlock &Block, bool Exclusive = false) { |
265 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
266 | return isPredicatedOnVCTP(MI: Insts.front(), Exclusive); |
267 | } |
268 | |
269 | // If this block begins with a VPT, we can check whether it's using |
270 | // at least one predicated input(s), as well as possible loop invariant |
271 | // which would result in it being implicitly predicated. |
272 | bool hasImplicitlyValidVPT(VPTBlock &Block, ReachingDefAnalysis &RDA) { |
273 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
274 | MachineInstr *VPT = Insts.front(); |
275 | assert(isVPTOpcode(VPT->getOpcode()) && |
276 | "Expected VPT block to begin with VPT/VPST" ); |
277 | |
278 | if (VPT->getOpcode() == ARM::MVE_VPST) |
279 | return false; |
280 | |
281 | // If the VPT block does not define something that is an "output", then |
282 | // the tail-predicated version will just perform a subset of the original |
283 | // vpt block, where the last lanes should not be used. |
284 | if (isVPTOpcode(Opc: VPT->getOpcode()) && |
285 | all_of(Range&: Block.getInsts(), P: [](const MachineInstr *MI) { |
286 | return !MI->mayStore() && !MI->mayLoad() && |
287 | !isHorizontalReduction(MI: *MI) && !isVCTP(MI); |
288 | })) |
289 | return true; |
290 | |
291 | auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) { |
292 | MachineInstr *Op = RDA.getMIOperand(MI, MO&: MI->getOperand(i: Idx)); |
293 | return Op && PredicatedInsts.count(x: Op) && isPredicatedOnVCTP(MI: Op); |
294 | }; |
295 | |
296 | auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) { |
297 | MachineOperand &MO = MI->getOperand(i: Idx); |
298 | if (!MO.isReg() || !MO.getReg()) |
299 | return true; |
300 | |
301 | SmallPtrSet<MachineInstr *, 2> Defs; |
302 | RDA.getGlobalReachingDefs(MI, Reg: MO.getReg(), Defs); |
303 | if (Defs.empty()) |
304 | return true; |
305 | |
306 | for (auto *Def : Defs) |
307 | if (Def->getParent() == VPT->getParent()) |
308 | return false; |
309 | return true; |
310 | }; |
311 | |
312 | // Check that at least one of the operands is directly predicated on a |
313 | // vctp and allow an invariant value too. |
314 | return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) && |
315 | (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) && |
316 | (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2)); |
317 | } |
318 | |
319 | bool isValid(ReachingDefAnalysis &RDA) { |
320 | // All predication within the loop should be based on vctp. If the block |
321 | // isn't predicated on entry, check whether the vctp is within the block |
322 | // and that all other instructions are then predicated on it. |
323 | for (auto &Block : Blocks) { |
324 | if (isEntryPredicatedOnVCTP(Block, Exclusive: false) && |
325 | !any_of(Range: drop_begin(RangeOrContainer&: Block.getInsts()), P: [](const MachineInstr *MI) { |
326 | return getVPTInstrPredicate(MI: *MI) == ARMVCC::Else; |
327 | })) |
328 | continue; |
329 | if (hasImplicitlyValidVPT(Block, RDA)) |
330 | continue; |
331 | |
332 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
333 | // We don't know how to convert a block with just a VPT;VCTP into |
334 | // anything valid once we remove the VCTP. For now just bail out. |
335 | assert(isVPTOpcode(Insts.front()->getOpcode()) && |
336 | "Expected VPT block to start with a VPST or VPT!" ); |
337 | if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST && |
338 | isVCTP(MI: Insts.back())) |
339 | return false; |
340 | |
341 | for (auto *MI : Insts) { |
342 | // Check that any internal VCTPs are 'Then' predicated. |
343 | if (isVCTP(MI) && getVPTInstrPredicate(MI: *MI) != ARMVCC::Then) |
344 | return false; |
345 | // Skip other instructions that build up the predicate. |
346 | if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI)) |
347 | continue; |
348 | // Check that any other instructions are predicated upon a vctp. |
349 | // TODO: We could infer when VPTs are implicitly predicated on the |
350 | // vctp (when the operands are predicated). |
351 | if (!isPredicatedOnVCTP(MI)) { |
352 | LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI); |
353 | return false; |
354 | } |
355 | } |
356 | } |
357 | return true; |
358 | } |
359 | }; |
360 | |
361 | struct LowOverheadLoop { |
362 | |
363 | MachineLoop &ML; |
364 | MachineBasicBlock * = nullptr; |
365 | MachineLoopInfo &MLI; |
366 | ReachingDefAnalysis &RDA; |
367 | const TargetRegisterInfo &TRI; |
368 | const ARMBaseInstrInfo &TII; |
369 | MachineFunction *MF = nullptr; |
370 | MachineBasicBlock::iterator StartInsertPt; |
371 | MachineBasicBlock *StartInsertBB = nullptr; |
372 | MachineInstr *Start = nullptr; |
373 | MachineInstr *Dec = nullptr; |
374 | MachineInstr *End = nullptr; |
375 | MachineOperand TPNumElements; |
376 | SmallVector<MachineInstr *, 4> VCTPs; |
377 | SmallPtrSet<MachineInstr *, 4> ToRemove; |
378 | SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute; |
379 | SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs; |
380 | SmallPtrSet<MachineInstr *, 4> VMOVCopies; |
381 | bool Revert = false; |
382 | bool CannotTailPredicate = false; |
383 | VPTState VPTstate; |
384 | |
385 | LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI, |
386 | ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI, |
387 | const ARMBaseInstrInfo &TII) |
388 | : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII), |
389 | TPNumElements(MachineOperand::CreateImm(Val: 0)) { |
390 | MF = ML.getHeader()->getParent(); |
391 | if (auto *MBB = ML.getLoopPreheader()) |
392 | Preheader = MBB; |
393 | else if (auto *MBB = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true)) |
394 | Preheader = MBB; |
395 | } |
396 | |
397 | // If this is an MVE instruction, check that we know how to use tail |
398 | // predication with it. Record VPT blocks and return whether the |
399 | // instruction is valid for tail predication. |
400 | bool ValidateMVEInst(MachineInstr *MI); |
401 | |
402 | void AnalyseMVEInst(MachineInstr *MI) { |
403 | CannotTailPredicate = !ValidateMVEInst(MI); |
404 | } |
405 | |
406 | bool IsTailPredicationLegal() const { |
407 | // For now, let's keep things really simple and only support a single |
408 | // block for tail predication. |
409 | return !Revert && FoundAllComponents() && !VCTPs.empty() && |
410 | !CannotTailPredicate && ML.getNumBlocks() == 1; |
411 | } |
412 | |
413 | // Given that MI is a VCTP, check that is equivalent to any other VCTPs |
414 | // found. |
415 | bool AddVCTP(MachineInstr *MI); |
416 | |
417 | // Check that the predication in the loop will be equivalent once we |
418 | // perform the conversion. Also ensure that we can provide the number |
419 | // of elements to the loop start instruction. |
420 | bool ValidateTailPredicate(); |
421 | |
422 | // Check that any values available outside of the loop will be the same |
423 | // after tail predication conversion. |
424 | bool ValidateLiveOuts(); |
425 | |
426 | // Check the branch targets are within range and we satisfy our |
427 | // restrictions. |
428 | void Validate(ARMBasicBlockUtils *BBUtils); |
429 | |
430 | bool FoundAllComponents() const { |
431 | return Start && Dec && End; |
432 | } |
433 | |
434 | SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTstate.Blocks; } |
435 | |
436 | // Return the operand for the loop start instruction. This will be the loop |
437 | // iteration count, or the number of elements if we're tail predicating. |
438 | MachineOperand &getLoopStartOperand() { |
439 | if (IsTailPredicationLegal()) |
440 | return TPNumElements; |
441 | return Start->getOperand(i: 1); |
442 | } |
443 | |
444 | unsigned getStartOpcode() const { |
445 | bool IsDo = isDoLoopStart(MI: *Start); |
446 | if (!IsTailPredicationLegal()) |
447 | return IsDo ? ARM::t2DLS : ARM::t2WLS; |
448 | |
449 | return VCTPOpcodeToLSTP(Opcode: VCTPs.back()->getOpcode(), IsDoLoop: IsDo); |
450 | } |
451 | |
452 | void dump() const { |
453 | if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; |
454 | if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; |
455 | if (End) dbgs() << "ARM Loops: Found Loop End: " << *End; |
456 | if (!VCTPs.empty()) { |
457 | dbgs() << "ARM Loops: Found VCTP(s):\n" ; |
458 | for (auto *MI : VCTPs) |
459 | dbgs() << " - " << *MI; |
460 | } |
461 | if (!FoundAllComponents()) |
462 | dbgs() << "ARM Loops: Not a low-overhead loop.\n" ; |
463 | else if (!(Start && Dec && End)) |
464 | dbgs() << "ARM Loops: Failed to find all loop components.\n" ; |
465 | } |
466 | }; |
467 | |
468 | class ARMLowOverheadLoops : public MachineFunctionPass { |
469 | MachineFunction *MF = nullptr; |
470 | MachineLoopInfo *MLI = nullptr; |
471 | ReachingDefAnalysis *RDA = nullptr; |
472 | const ARMBaseInstrInfo *TII = nullptr; |
473 | MachineRegisterInfo *MRI = nullptr; |
474 | const TargetRegisterInfo *TRI = nullptr; |
475 | std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; |
476 | |
477 | public: |
478 | static char ID; |
479 | |
480 | ARMLowOverheadLoops() : MachineFunctionPass(ID) { } |
481 | |
482 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
483 | AU.setPreservesCFG(); |
484 | AU.addRequired<MachineLoopInfoWrapperPass>(); |
485 | AU.addRequired<ReachingDefAnalysis>(); |
486 | MachineFunctionPass::getAnalysisUsage(AU); |
487 | } |
488 | |
489 | bool runOnMachineFunction(MachineFunction &MF) override; |
490 | |
491 | MachineFunctionProperties getRequiredProperties() const override { |
492 | return MachineFunctionProperties().setNoVRegs().setTracksLiveness(); |
493 | } |
494 | |
495 | StringRef getPassName() const override { |
496 | return ARM_LOW_OVERHEAD_LOOPS_NAME; |
497 | } |
498 | |
499 | private: |
500 | bool ProcessLoop(MachineLoop *ML); |
501 | |
502 | bool RevertNonLoops(); |
503 | |
504 | void RevertWhile(MachineInstr *MI) const; |
505 | void RevertDo(MachineInstr *MI) const; |
506 | |
507 | bool RevertLoopDec(MachineInstr *MI) const; |
508 | |
509 | void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; |
510 | |
511 | void RevertLoopEndDec(MachineInstr *MI) const; |
512 | |
513 | void ConvertVPTBlocks(LowOverheadLoop &LoLoop); |
514 | |
515 | MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); |
516 | |
517 | void Expand(LowOverheadLoop &LoLoop); |
518 | |
519 | void IterationCountDCE(LowOverheadLoop &LoLoop); |
520 | }; |
521 | } |
522 | |
523 | char ARMLowOverheadLoops::ID = 0; |
524 | |
525 | INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, |
526 | false, false) |
527 | |
528 | static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA, |
529 | InstSet &ToRemove, InstSet &Ignore) { |
530 | |
531 | // Check that we can remove all of Killed without having to modify any IT |
532 | // blocks. |
533 | auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) { |
534 | // Collect the dead code and the MBBs in which they reside. |
535 | SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks; |
536 | for (auto *Dead : Killed) |
537 | BasicBlocks.insert(Ptr: Dead->getParent()); |
538 | |
539 | // Collect IT blocks in all affected basic blocks. |
540 | std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks; |
541 | for (auto *MBB : BasicBlocks) { |
542 | for (auto &IT : *MBB) { |
543 | if (IT.getOpcode() != ARM::t2IT) |
544 | continue; |
545 | RDA.getReachingLocalUses(MI: &IT, Reg: MCRegister::from(Val: ARM::ITSTATE), |
546 | Uses&: ITBlocks[&IT]); |
547 | } |
548 | } |
549 | |
550 | // If we're removing all of the instructions within an IT block, then |
551 | // also remove the IT instruction. |
552 | SmallPtrSet<MachineInstr *, 2> ModifiedITs; |
553 | SmallPtrSet<MachineInstr *, 2> RemoveITs; |
554 | for (auto *Dead : Killed) { |
555 | if (MachineOperand *MO = |
556 | Dead->findRegisterUseOperand(Reg: ARM::ITSTATE, /*TRI=*/nullptr)) { |
557 | MachineInstr *IT = RDA.getMIOperand(MI: Dead, MO&: *MO); |
558 | RemoveITs.insert(Ptr: IT); |
559 | auto &CurrentBlock = ITBlocks[IT]; |
560 | CurrentBlock.erase(Ptr: Dead); |
561 | if (CurrentBlock.empty()) |
562 | ModifiedITs.erase(Ptr: IT); |
563 | else |
564 | ModifiedITs.insert(Ptr: IT); |
565 | } |
566 | } |
567 | if (!ModifiedITs.empty()) |
568 | return false; |
569 | Killed.insert_range(R&: RemoveITs); |
570 | return true; |
571 | }; |
572 | |
573 | SmallPtrSet<MachineInstr *, 2> Uses; |
574 | if (!RDA.isSafeToRemove(MI, ToRemove&: Uses, Ignore)) |
575 | return false; |
576 | |
577 | if (WontCorruptITs(Uses, RDA)) { |
578 | ToRemove.insert_range(R&: Uses); |
579 | LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI |
580 | << " - can also remove:\n" ; |
581 | for (auto *Use : Uses) |
582 | dbgs() << " - " << *Use); |
583 | |
584 | SmallPtrSet<MachineInstr*, 4> Killed; |
585 | RDA.collectKilledOperands(MI, Dead&: Killed); |
586 | if (WontCorruptITs(Killed, RDA)) { |
587 | ToRemove.insert_range(R&: Killed); |
588 | LLVM_DEBUG(for (auto *Dead : Killed) |
589 | dbgs() << " - " << *Dead); |
590 | } |
591 | return true; |
592 | } |
593 | return false; |
594 | } |
595 | |
596 | bool LowOverheadLoop::ValidateTailPredicate() { |
597 | if (!IsTailPredicationLegal()) { |
598 | LLVM_DEBUG(if (VCTPs.empty()) |
599 | dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n" ; |
600 | dbgs() << "ARM Loops: Tail-predication is not valid.\n" ); |
601 | return false; |
602 | } |
603 | |
604 | assert(!VCTPs.empty() && "VCTP instruction expected but is not set" ); |
605 | assert(ML.getBlocks().size() == 1 && |
606 | "Shouldn't be processing a loop with more than one block" ); |
607 | |
608 | if (DisableTailPredication) { |
609 | LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n" ); |
610 | return false; |
611 | } |
612 | |
613 | if (!VPTstate.isValid(RDA)) { |
614 | LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n" ); |
615 | return false; |
616 | } |
617 | |
618 | if (!ValidateLiveOuts()) { |
619 | LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n" ); |
620 | return false; |
621 | } |
622 | |
623 | // For tail predication, we need to provide the number of elements, instead |
624 | // of the iteration count, to the loop start instruction. The number of |
625 | // elements is provided to the vctp instruction, so we need to check that |
626 | // we can use this register at InsertPt. |
627 | MachineInstr *VCTP = VCTPs.back(); |
628 | if (Start->getOpcode() == ARM::t2DoLoopStartTP || |
629 | Start->getOpcode() == ARM::t2WhileLoopStartTP) { |
630 | TPNumElements = Start->getOperand(i: 2); |
631 | StartInsertPt = Start; |
632 | StartInsertBB = Start->getParent(); |
633 | } else { |
634 | TPNumElements = VCTP->getOperand(i: 1); |
635 | MCRegister NumElements = TPNumElements.getReg().asMCReg(); |
636 | |
637 | // If the register is defined within loop, then we can't perform TP. |
638 | // TODO: Check whether this is just a mov of a register that would be |
639 | // available. |
640 | if (RDA.hasLocalDefBefore(MI: VCTP, Reg: NumElements)) { |
641 | LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n" ); |
642 | return false; |
643 | } |
644 | |
645 | // The element count register maybe defined after InsertPt, in which case we |
646 | // need to try to move either InsertPt or the def so that the [w|d]lstp can |
647 | // use the value. |
648 | |
649 | if (StartInsertPt != StartInsertBB->end() && |
650 | !RDA.isReachingDefLiveOut(MI: &*StartInsertPt, Reg: NumElements)) { |
651 | if (auto *ElemDef = |
652 | RDA.getLocalLiveOutMIDef(MBB: StartInsertBB, Reg: NumElements)) { |
653 | if (RDA.isSafeToMoveForwards(From: ElemDef, To: &*StartInsertPt)) { |
654 | ElemDef->removeFromParent(); |
655 | StartInsertBB->insert(I: StartInsertPt, MI: ElemDef); |
656 | LLVM_DEBUG(dbgs() |
657 | << "ARM Loops: Moved element count def: " << *ElemDef); |
658 | } else if (RDA.isSafeToMoveBackwards(From: &*StartInsertPt, To: ElemDef)) { |
659 | StartInsertPt->removeFromParent(); |
660 | StartInsertBB->insertAfter(I: MachineBasicBlock::iterator(ElemDef), |
661 | MI: &*StartInsertPt); |
662 | LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); |
663 | } else { |
664 | // If we fail to move an instruction and the element count is provided |
665 | // by a mov, use the mov operand if it will have the same value at the |
666 | // insertion point |
667 | MachineOperand Operand = ElemDef->getOperand(i: 1); |
668 | if (isMovRegOpcode(Opc: ElemDef->getOpcode()) && |
669 | RDA.getUniqueReachingMIDef(MI: ElemDef, Reg: Operand.getReg().asMCReg()) == |
670 | RDA.getUniqueReachingMIDef(MI: &*StartInsertPt, |
671 | Reg: Operand.getReg().asMCReg())) { |
672 | TPNumElements = Operand; |
673 | NumElements = TPNumElements.getReg(); |
674 | } else { |
675 | LLVM_DEBUG(dbgs() |
676 | << "ARM Loops: Unable to move element count to loop " |
677 | << "start instruction.\n" ); |
678 | return false; |
679 | } |
680 | } |
681 | } |
682 | } |
683 | |
684 | // Especially in the case of while loops, InsertBB may not be the |
685 | // preheader, so we need to check that the register isn't redefined |
686 | // before entering the loop. |
687 | auto CannotProvideElements = [this](MachineBasicBlock *MBB, |
688 | MCRegister NumElements) { |
689 | if (MBB->empty()) |
690 | return false; |
691 | // NumElements is redefined in this block. |
692 | if (RDA.hasLocalDefBefore(MI: &MBB->back(), Reg: NumElements)) |
693 | return true; |
694 | |
695 | // Don't continue searching up through multiple predecessors. |
696 | if (MBB->pred_size() > 1) |
697 | return true; |
698 | |
699 | return false; |
700 | }; |
701 | |
702 | // Search backwards for a def, until we get to InsertBB. |
703 | MachineBasicBlock *MBB = Preheader; |
704 | while (MBB && MBB != StartInsertBB) { |
705 | if (CannotProvideElements(MBB, NumElements)) { |
706 | LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n" ); |
707 | return false; |
708 | } |
709 | MBB = *MBB->pred_begin(); |
710 | } |
711 | } |
712 | |
713 | // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect |
714 | // world the [w|d]lstp instruction would be last instruction in the preheader |
715 | // and so it would only affect instructions within the loop body. But due to |
716 | // scheduling, and/or the logic in this pass (above), the insertion point can |
717 | // be moved earlier. So if the Loop Start isn't the last instruction in the |
718 | // preheader, and if the initial element count is smaller than the vector |
719 | // width, the Loop Start instruction will immediately generate one or more |
720 | // false lane mask which can, incorrectly, affect the proceeding MVE |
721 | // instructions in the preheader. |
722 | if (std::any_of(first: StartInsertPt, last: StartInsertBB->end(), pred: shouldInspect)) { |
723 | LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n" ); |
724 | return false; |
725 | } |
726 | |
727 | // For any DoubleWidthResultInstrs we found whilst scanning instructions, they |
728 | // need to compute an output size that is smaller than the VCTP mask operates |
729 | // on. The VecSize of the DoubleWidthResult is the larger vector size - the |
730 | // size it extends into, so any VCTP VecSize <= is valid. |
731 | unsigned VCTPVecSize = getVecSize(MI: *VCTP); |
732 | for (MachineInstr *MI : DoubleWidthResultInstrs) { |
733 | unsigned InstrVecSize = getVecSize(MI: *MI); |
734 | if (InstrVecSize > VCTPVecSize) { |
735 | LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP " |
736 | << "VecSize:\n" << *MI); |
737 | return false; |
738 | } |
739 | } |
740 | |
741 | // Check that the value change of the element count is what we expect and |
742 | // that the predication will be equivalent. For this we need: |
743 | // NumElements = NumElements - VectorWidth. The sub will be a sub immediate |
744 | // and we can also allow register copies within the chain too. |
745 | auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) { |
746 | return -getAddSubImmediate(MI&: *MI) == ExpectedVecWidth; |
747 | }; |
748 | |
749 | MachineBasicBlock *MBB = VCTP->getParent(); |
750 | // Remove modifications to the element count since they have no purpose in a |
751 | // tail predicated loop. Explicitly refer to the vctp operand no matter which |
752 | // register NumElements has been assigned to, since that is what the |
753 | // modifications will be using |
754 | if (auto *Def = RDA.getUniqueReachingMIDef( |
755 | MI: &MBB->back(), Reg: VCTP->getOperand(i: 1).getReg().asMCReg())) { |
756 | SmallPtrSet<MachineInstr*, 2> ElementChain; |
757 | SmallPtrSet<MachineInstr*, 2> Ignore; |
758 | unsigned ExpectedVectorWidth = getTailPredVectorWidth(Opcode: VCTP->getOpcode()); |
759 | |
760 | Ignore.insert_range(R&: VCTPs); |
761 | |
762 | if (TryRemove(MI: Def, RDA, ToRemove&: ElementChain, Ignore)) { |
763 | bool FoundSub = false; |
764 | |
765 | for (auto *MI : ElementChain) { |
766 | if (isMovRegOpcode(Opc: MI->getOpcode())) |
767 | continue; |
768 | |
769 | if (isSubImmOpcode(Opc: MI->getOpcode())) { |
770 | if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) { |
771 | LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" |
772 | " count: " << *MI); |
773 | return false; |
774 | } |
775 | FoundSub = true; |
776 | } else { |
777 | LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" |
778 | " count: " << *MI); |
779 | return false; |
780 | } |
781 | } |
782 | ToRemove.insert_range(R&: ElementChain); |
783 | } |
784 | } |
785 | |
786 | // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we |
787 | // can also remove any extra instructions in the preheader, which often |
788 | // includes a now unused MOV. |
789 | if ((Start->getOpcode() == ARM::t2DoLoopStartTP || |
790 | Start->getOpcode() == ARM::t2WhileLoopStartTP) && |
791 | Preheader && !Preheader->empty() && |
792 | !RDA.hasLocalDefBefore(MI: VCTP, Reg: VCTP->getOperand(i: 1).getReg())) { |
793 | if (auto *Def = RDA.getUniqueReachingMIDef( |
794 | MI: &Preheader->back(), Reg: VCTP->getOperand(i: 1).getReg().asMCReg())) { |
795 | SmallPtrSet<MachineInstr *, 2> Ignore(llvm::from_range, VCTPs); |
796 | TryRemove(MI: Def, RDA, ToRemove, Ignore); |
797 | } |
798 | } |
799 | |
800 | return true; |
801 | } |
802 | |
803 | static bool isRegInClass(const MachineOperand &MO, |
804 | const TargetRegisterClass *Class) { |
805 | return MO.isReg() && MO.getReg() && Class->contains(Reg: MO.getReg()); |
806 | } |
807 | |
808 | // MVE 'narrowing' operate on half a lane, reading from half and writing |
809 | // to half, which are referred to has the top and bottom half. The other |
810 | // half retains its previous value. |
811 | static bool retainsPreviousHalfElement(const MachineInstr &MI) { |
812 | const MCInstrDesc &MCID = MI.getDesc(); |
813 | uint64_t Flags = MCID.TSFlags; |
814 | return (Flags & ARMII::RetainsPreviousHalfElement) != 0; |
815 | } |
816 | |
817 | // Some MVE instructions read from the top/bottom halves of their operand(s) |
818 | // and generate a vector result with result elements that are double the |
819 | // width of the input. |
820 | static bool producesDoubleWidthResult(const MachineInstr &MI) { |
821 | const MCInstrDesc &MCID = MI.getDesc(); |
822 | uint64_t Flags = MCID.TSFlags; |
823 | return (Flags & ARMII::DoubleWidthResult) != 0; |
824 | } |
825 | |
826 | // Can this instruction generate a non-zero result when given only zeroed |
827 | // operands? This allows us to know that, given operands with false bytes |
828 | // zeroed by masked loads, that the result will also contain zeros in those |
829 | // bytes. |
830 | static bool canGenerateNonZeros(const MachineInstr &MI) { |
831 | |
832 | // Check for instructions which can write into a larger element size, |
833 | // possibly writing into a previous zero'd lane. |
834 | if (producesDoubleWidthResult(MI)) |
835 | return true; |
836 | |
837 | switch (MI.getOpcode()) { |
838 | default: |
839 | break; |
840 | // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow |
841 | // fp16 -> fp32 vector conversions. |
842 | // Instructions that perform a NOT will generate 1s from 0s. |
843 | case ARM::MVE_VMVN: |
844 | case ARM::MVE_VORN: |
845 | // Count leading zeros will do just that! |
846 | case ARM::MVE_VCLZs8: |
847 | case ARM::MVE_VCLZs16: |
848 | case ARM::MVE_VCLZs32: |
849 | return true; |
850 | } |
851 | return false; |
852 | } |
853 | |
854 | // Look at its register uses to see if it only can only receive zeros |
855 | // into its false lanes which would then produce zeros. Also check that |
856 | // the output register is also defined by an FalseLanesZero instruction |
857 | // so that if tail-predication happens, the lanes that aren't updated will |
858 | // still be zeros. |
859 | static bool producesFalseLanesZero(MachineInstr &MI, |
860 | const TargetRegisterClass *QPRs, |
861 | const ReachingDefAnalysis &RDA, |
862 | InstSet &FalseLanesZero) { |
863 | if (canGenerateNonZeros(MI)) |
864 | return false; |
865 | |
866 | bool isPredicated = isVectorPredicated(MI: &MI); |
867 | // Predicated loads will write zeros to the falsely predicated bytes of the |
868 | // destination register. |
869 | if (MI.mayLoad()) |
870 | return isPredicated; |
871 | |
872 | auto IsZeroInit = [](MachineInstr *Def) { |
873 | return !isVectorPredicated(MI: Def) && |
874 | Def->getOpcode() == ARM::MVE_VMOVimmi32 && |
875 | Def->getOperand(i: 1).getImm() == 0; |
876 | }; |
877 | |
878 | bool AllowScalars = isHorizontalReduction(MI); |
879 | for (auto &MO : MI.operands()) { |
880 | if (!MO.isReg() || !MO.getReg()) |
881 | continue; |
882 | if (!isRegInClass(MO, Class: QPRs) && AllowScalars) |
883 | continue; |
884 | // Skip the lr predicate reg |
885 | int PIdx = llvm::findFirstVPTPredOperandIdx(MI); |
886 | if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2) |
887 | continue; |
888 | |
889 | // Check that this instruction will produce zeros in its false lanes: |
890 | // - If it only consumes false lanes zero or constant 0 (vmov #0) |
891 | // - If it's predicated, it only matters that it's def register already has |
892 | // false lane zeros, so we can ignore the uses. |
893 | SmallPtrSet<MachineInstr *, 2> Defs; |
894 | RDA.getGlobalReachingDefs(MI: &MI, Reg: MO.getReg(), Defs); |
895 | if (Defs.empty()) |
896 | return false; |
897 | for (auto *Def : Defs) { |
898 | if (Def == &MI || FalseLanesZero.count(Ptr: Def) || IsZeroInit(Def)) |
899 | continue; |
900 | if (MO.isUse() && isPredicated) |
901 | continue; |
902 | return false; |
903 | } |
904 | } |
905 | LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI); |
906 | return true; |
907 | } |
908 | |
909 | bool LowOverheadLoop::ValidateLiveOuts() { |
910 | // We want to find out if the tail-predicated version of this loop will |
911 | // produce the same values as the loop in its original form. For this to |
912 | // be true, the newly inserted implicit predication must not change the |
913 | // the (observable) results. |
914 | // We're doing this because many instructions in the loop will not be |
915 | // predicated and so the conversion from VPT predication to tail-predication |
916 | // can result in different values being produced; due to the tail-predication |
917 | // preventing many instructions from updating their falsely predicated |
918 | // lanes. This analysis assumes that all the instructions perform lane-wise |
919 | // operations and don't perform any exchanges. |
920 | // A masked load, whether through VPT or tail predication, will write zeros |
921 | // to any of the falsely predicated bytes. So, from the loads, we know that |
922 | // the false lanes are zeroed and here we're trying to track that those false |
923 | // lanes remain zero, or where they change, the differences are masked away |
924 | // by their user(s). |
925 | // All MVE stores have to be predicated, so we know that any predicate load |
926 | // operands, or stored results are equivalent already. Other explicitly |
927 | // predicated instructions will perform the same operation in the original |
928 | // loop and the tail-predicated form too. Because of this, we can insert |
929 | // loads, stores and other predicated instructions into our Predicated |
930 | // set and build from there. |
931 | const TargetRegisterClass *QPRs = TRI.getRegClass(i: ARM::MQPRRegClassID); |
932 | SetVector<MachineInstr *> FalseLanesUnknown; |
933 | SmallPtrSet<MachineInstr *, 4> FalseLanesZero; |
934 | SmallPtrSet<MachineInstr *, 4> Predicated; |
935 | MachineBasicBlock * = ML.getHeader(); |
936 | |
937 | LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n" ); |
938 | |
939 | for (auto &MI : *Header) { |
940 | if (!shouldInspect(MI)) |
941 | continue; |
942 | |
943 | if (isVCTP(MI: &MI) || isVPTOpcode(Opc: MI.getOpcode())) |
944 | continue; |
945 | |
946 | bool isPredicated = isVectorPredicated(MI: &MI); |
947 | bool retainsOrReduces = |
948 | retainsPreviousHalfElement(MI) || isHorizontalReduction(MI); |
949 | |
950 | if (isPredicated) |
951 | Predicated.insert(Ptr: &MI); |
952 | if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) |
953 | FalseLanesZero.insert(Ptr: &MI); |
954 | else if (MI.getNumDefs() == 0) |
955 | continue; |
956 | else if (!isPredicated && retainsOrReduces) { |
957 | LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI); |
958 | return false; |
959 | } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy) |
960 | FalseLanesUnknown.insert(X: &MI); |
961 | } |
962 | |
963 | LLVM_DEBUG({ |
964 | dbgs() << " Predicated:\n" ; |
965 | for (auto *I : Predicated) |
966 | dbgs() << " " << *I; |
967 | dbgs() << " FalseLanesZero:\n" ; |
968 | for (auto *I : FalseLanesZero) |
969 | dbgs() << " " << *I; |
970 | dbgs() << " FalseLanesUnknown:\n" ; |
971 | for (auto *I : FalseLanesUnknown) |
972 | dbgs() << " " << *I; |
973 | }); |
974 | |
975 | auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO, |
976 | SmallPtrSetImpl<MachineInstr *> &Predicated) { |
977 | SmallPtrSet<MachineInstr *, 2> Uses; |
978 | RDA.getGlobalUses(MI, Reg: MO.getReg().asMCReg(), Uses); |
979 | for (auto *Use : Uses) { |
980 | if (Use != MI && !Predicated.count(Ptr: Use)) |
981 | return false; |
982 | } |
983 | return true; |
984 | }; |
985 | |
986 | // Visit the unknowns in reverse so that we can start at the values being |
987 | // stored and then we can work towards the leaves, hopefully adding more |
988 | // instructions to Predicated. Successfully terminating the loop means that |
989 | // all the unknown values have to found to be masked by predicated user(s). |
990 | // For any unpredicated values, we store them in NonPredicated so that we |
991 | // can later check whether these form a reduction. |
992 | SmallPtrSet<MachineInstr*, 2> NonPredicated; |
993 | for (auto *MI : reverse(C&: FalseLanesUnknown)) { |
994 | for (auto &MO : MI->operands()) { |
995 | if (!isRegInClass(MO, Class: QPRs) || !MO.isDef()) |
996 | continue; |
997 | if (!HasPredicatedUsers(MI, MO, Predicated)) { |
998 | LLVM_DEBUG(dbgs() << " Found an unknown def of : " |
999 | << TRI.getRegAsmName(MO.getReg()) << " at " << *MI); |
1000 | NonPredicated.insert(Ptr: MI); |
1001 | break; |
1002 | } |
1003 | } |
1004 | // Any unknown false lanes have been masked away by the user(s). |
1005 | if (!NonPredicated.contains(Ptr: MI)) |
1006 | Predicated.insert(Ptr: MI); |
1007 | } |
1008 | |
1009 | SmallPtrSet<MachineInstr *, 2> LiveOutMIs; |
1010 | SmallVector<MachineBasicBlock *, 2> ExitBlocks; |
1011 | ML.getExitBlocks(ExitBlocks); |
1012 | assert(ML.getNumBlocks() == 1 && "Expected single block loop!" ); |
1013 | assert(ExitBlocks.size() == 1 && "Expected a single exit block" ); |
1014 | MachineBasicBlock *ExitBB = ExitBlocks.front(); |
1015 | for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) { |
1016 | // TODO: Instead of blocking predication, we could move the vctp to the exit |
1017 | // block and calculate it's operand there in or the preheader. |
1018 | if (RegMask.PhysReg == ARM::VPR) { |
1019 | LLVM_DEBUG(dbgs() << " VPR is live in to the exit block." ); |
1020 | return false; |
1021 | } |
1022 | // Check Q-regs that are live in the exit blocks. We don't collect scalars |
1023 | // because they won't be affected by lane predication. |
1024 | if (QPRs->contains(Reg: RegMask.PhysReg)) |
1025 | if (auto *MI = RDA.getLocalLiveOutMIDef(MBB: Header, Reg: RegMask.PhysReg)) |
1026 | LiveOutMIs.insert(Ptr: MI); |
1027 | } |
1028 | |
1029 | // We've already validated that any VPT predication within the loop will be |
1030 | // equivalent when we perform the predication transformation; so we know that |
1031 | // any VPT predicated instruction is predicated upon VCTP. Any live-out |
1032 | // instruction needs to be predicated, so check this here. The instructions |
1033 | // in NonPredicated have been found to be a reduction that we can ensure its |
1034 | // legality. Any MQPRCopy found will need to validate its input as if it was |
1035 | // live out. |
1036 | SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end()); |
1037 | while (!Worklist.empty()) { |
1038 | MachineInstr *MI = Worklist.pop_back_val(); |
1039 | if (MI->getOpcode() == ARM::MQPRCopy) { |
1040 | VMOVCopies.insert(Ptr: MI); |
1041 | MachineInstr *CopySrc = |
1042 | RDA.getUniqueReachingMIDef(MI, Reg: MI->getOperand(i: 1).getReg()); |
1043 | if (CopySrc) |
1044 | Worklist.push_back(Elt: CopySrc); |
1045 | } else if (NonPredicated.count(Ptr: MI) && FalseLanesUnknown.contains(key: MI)) { |
1046 | LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI); |
1047 | VMOVCopies.clear(); |
1048 | return false; |
1049 | } |
1050 | } |
1051 | |
1052 | return true; |
1053 | } |
1054 | |
1055 | void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) { |
1056 | if (Revert) |
1057 | return; |
1058 | |
1059 | // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP] |
1060 | // can only jump back. |
1061 | auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End, |
1062 | ARMBasicBlockUtils *BBUtils, MachineLoop &ML) { |
1063 | MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd |
1064 | ? End->getOperand(i: 1).getMBB() |
1065 | : End->getOperand(i: 2).getMBB(); |
1066 | // TODO Maybe there's cases where the target doesn't have to be the header, |
1067 | // but for now be safe and revert. |
1068 | if (TgtBB != ML.getHeader()) { |
1069 | LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n" ); |
1070 | return false; |
1071 | } |
1072 | |
1073 | // The WLS and LE instructions have 12-bits for the label offset. WLS |
1074 | // requires a positive offset, while LE uses negative. |
1075 | if (BBUtils->getOffsetOf(MI: End) < BBUtils->getOffsetOf(MBB: ML.getHeader()) || |
1076 | !BBUtils->isBBInRange(MI: End, DestBB: ML.getHeader(), MaxDisp: 4094)) { |
1077 | LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n" ); |
1078 | return false; |
1079 | } |
1080 | |
1081 | if (isWhileLoopStart(MI: *Start)) { |
1082 | MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(MI: *Start); |
1083 | if (BBUtils->getOffsetOf(MI: Start) > BBUtils->getOffsetOf(MBB: TargetBB) || |
1084 | !BBUtils->isBBInRange(MI: Start, DestBB: TargetBB, MaxDisp: 4094)) { |
1085 | LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n" ); |
1086 | return false; |
1087 | } |
1088 | } |
1089 | return true; |
1090 | }; |
1091 | |
1092 | StartInsertPt = MachineBasicBlock::iterator(Start); |
1093 | StartInsertBB = Start->getParent(); |
1094 | LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at " |
1095 | << *StartInsertPt); |
1096 | |
1097 | Revert = !ValidateRanges(Start, End, BBUtils, ML); |
1098 | CannotTailPredicate = !ValidateTailPredicate(); |
1099 | } |
1100 | |
1101 | bool LowOverheadLoop::AddVCTP(MachineInstr *MI) { |
1102 | LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI); |
1103 | if (VCTPs.empty()) { |
1104 | VCTPs.push_back(Elt: MI); |
1105 | return true; |
1106 | } |
1107 | |
1108 | // If we find another VCTP, check whether it uses the same value as the main VCTP. |
1109 | // If it does, store it in the VCTPs set, else refuse it. |
1110 | MachineInstr *Prev = VCTPs.back(); |
1111 | if (!Prev->getOperand(i: 1).isIdenticalTo(Other: MI->getOperand(i: 1)) || |
1112 | !RDA.hasSameReachingDef(A: Prev, B: MI, Reg: MI->getOperand(i: 1).getReg().asMCReg())) { |
1113 | LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching " |
1114 | "definition from the main VCTP" ); |
1115 | return false; |
1116 | } |
1117 | VCTPs.push_back(Elt: MI); |
1118 | return true; |
1119 | } |
1120 | |
1121 | static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) { |
1122 | |
1123 | auto GetFrameIndex = [](MachineMemOperand *Operand) { |
1124 | const PseudoSourceValue *PseudoValue = Operand->getPseudoValue(); |
1125 | if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) { |
1126 | if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(Val: PseudoValue)) { |
1127 | return FS->getFrameIndex(); |
1128 | } |
1129 | } |
1130 | return -1; |
1131 | }; |
1132 | |
1133 | auto IsStackOp = [GetFrameIndex](MachineInstr *I) { |
1134 | switch (I->getOpcode()) { |
1135 | case ARM::MVE_VSTRWU32: |
1136 | case ARM::MVE_VLDRWU32: { |
1137 | return I->getOperand(i: 1).getReg() == ARM::SP && |
1138 | I->memoperands().size() == 1 && |
1139 | GetFrameIndex(I->memoperands().front()) >= 0; |
1140 | } |
1141 | default: |
1142 | return false; |
1143 | } |
1144 | }; |
1145 | |
1146 | // An unpredicated vector register spill is allowed if all of the uses of the |
1147 | // stack slot are within the loop |
1148 | if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI)) |
1149 | return false; |
1150 | |
1151 | // Search all blocks after the loop for accesses to the same stack slot. |
1152 | // ReachingDefAnalysis doesn't work for sp as it relies on registers being |
1153 | // live-out (which sp never is) to know what blocks to look in |
1154 | if (MI->memoperands().size() == 0) |
1155 | return false; |
1156 | int FI = GetFrameIndex(MI->memoperands().front()); |
1157 | |
1158 | auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo(); |
1159 | if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(ObjectIdx: FI)) |
1160 | return false; |
1161 | |
1162 | SmallVector<MachineBasicBlock *> Frontier; |
1163 | ML->getExitBlocks(ExitBlocks&: Frontier); |
1164 | SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()}; |
1165 | unsigned Idx = 0; |
1166 | while (Idx < Frontier.size()) { |
1167 | MachineBasicBlock *BB = Frontier[Idx]; |
1168 | bool LookAtSuccessors = true; |
1169 | for (auto &I : *BB) { |
1170 | if (!IsStackOp(&I) || I.memoperands().size() == 0) |
1171 | continue; |
1172 | if (GetFrameIndex(I.memoperands().front()) != FI) |
1173 | continue; |
1174 | // If this block has a store to the stack slot before any loads then we |
1175 | // can ignore the block |
1176 | if (I.getOpcode() == ARM::MVE_VSTRWU32) { |
1177 | LookAtSuccessors = false; |
1178 | break; |
1179 | } |
1180 | // If the store and the load are using the same stack slot then the |
1181 | // store isn't valid for tail predication |
1182 | if (I.getOpcode() == ARM::MVE_VLDRWU32) |
1183 | return false; |
1184 | } |
1185 | |
1186 | if (LookAtSuccessors) { |
1187 | for (auto *Succ : BB->successors()) { |
1188 | if (!Visited.contains(Ptr: Succ) && !is_contained(Range&: Frontier, Element: Succ)) |
1189 | Frontier.push_back(Elt: Succ); |
1190 | } |
1191 | } |
1192 | Visited.insert(Ptr: BB); |
1193 | Idx++; |
1194 | } |
1195 | |
1196 | return true; |
1197 | } |
1198 | |
1199 | bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) { |
1200 | if (CannotTailPredicate) |
1201 | return false; |
1202 | |
1203 | if (!shouldInspect(MI&: *MI)) |
1204 | return true; |
1205 | |
1206 | if (MI->getOpcode() == ARM::MVE_VPSEL || |
1207 | MI->getOpcode() == ARM::MVE_VPNOT) { |
1208 | // TODO: Allow VPSEL and VPNOT, we currently cannot because: |
1209 | // 1) It will use the VPR as a predicate operand, but doesn't have to be |
1210 | // instead a VPT block, which means we can assert while building up |
1211 | // the VPT block because we don't find another VPT or VPST to being a new |
1212 | // one. |
1213 | // 2) VPSEL still requires a VPR operand even after tail predicating, |
1214 | // which means we can't remove it unless there is another |
1215 | // instruction, such as vcmp, that can provide the VPR def. |
1216 | return false; |
1217 | } |
1218 | |
1219 | // Record all VCTPs and check that they're equivalent to one another. |
1220 | if (isVCTP(MI) && !AddVCTP(MI)) |
1221 | return false; |
1222 | |
1223 | // Inspect uses first so that any instructions that alter the VPR don't |
1224 | // alter the predicate upon themselves. |
1225 | const MCInstrDesc &MCID = MI->getDesc(); |
1226 | bool IsUse = false; |
1227 | unsigned LastOpIdx = MI->getNumOperands() - 1; |
1228 | for (const auto &Op : enumerate(First: reverse(C: MCID.operands()))) { |
1229 | const MachineOperand &MO = MI->getOperand(i: LastOpIdx - Op.index()); |
1230 | if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR) |
1231 | continue; |
1232 | |
1233 | if (ARM::isVpred(op: Op.value().OperandType)) { |
1234 | VPTstate.addInst(MI); |
1235 | IsUse = true; |
1236 | } else if (MI->getOpcode() != ARM::MVE_VPST) { |
1237 | LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI); |
1238 | return false; |
1239 | } |
1240 | } |
1241 | |
1242 | // If we find an instruction that has been marked as not valid for tail |
1243 | // predication, only allow the instruction if it's contained within a valid |
1244 | // VPT block. |
1245 | bool RequiresExplicitPredication = |
1246 | (MCID.TSFlags & ARMII::ValidForTailPredication) == 0; |
1247 | if (isDomainMVE(MI) && RequiresExplicitPredication) { |
1248 | if (MI->getOpcode() == ARM::MQPRCopy) |
1249 | return true; |
1250 | if (!IsUse && producesDoubleWidthResult(MI: *MI)) { |
1251 | DoubleWidthResultInstrs.insert(Ptr: MI); |
1252 | return true; |
1253 | } |
1254 | |
1255 | LLVM_DEBUG(if (!IsUse) dbgs() |
1256 | << "ARM Loops: Can't tail predicate: " << *MI); |
1257 | return IsUse; |
1258 | } |
1259 | |
1260 | // If the instruction is already explicitly predicated, then the conversion |
1261 | // will be fine, but ensure that all store operations are predicated. |
1262 | if (MI->mayStore() && !ValidateMVEStore(MI, ML: &ML)) |
1263 | return IsUse; |
1264 | |
1265 | // If this instruction defines the VPR, update the predicate for the |
1266 | // proceeding instructions. |
1267 | if (isVectorPredicate(MI)) { |
1268 | // Clear the existing predicate when we're not in VPT Active state, |
1269 | // otherwise we add to it. |
1270 | if (!isVectorPredicated(MI)) |
1271 | VPTstate.resetPredicate(MI); |
1272 | else |
1273 | VPTstate.addPredicate(MI); |
1274 | } |
1275 | |
1276 | // Finally once the predicate has been modified, we can start a new VPT |
1277 | // block if necessary. |
1278 | if (isVPTOpcode(Opc: MI->getOpcode())) |
1279 | VPTstate.CreateVPTBlock(MI); |
1280 | |
1281 | return true; |
1282 | } |
1283 | |
1284 | bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { |
1285 | const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>(); |
1286 | if (!ST.hasLOB()) |
1287 | return false; |
1288 | |
1289 | MF = &mf; |
1290 | LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n" ); |
1291 | |
1292 | MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); |
1293 | RDA = &getAnalysis<ReachingDefAnalysis>(); |
1294 | MF->getProperties().setTracksLiveness(); |
1295 | MRI = &MF->getRegInfo(); |
1296 | TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); |
1297 | TRI = ST.getRegisterInfo(); |
1298 | BBUtils = std::make_unique<ARMBasicBlockUtils>(args&: *MF); |
1299 | BBUtils->computeAllBlockSizes(); |
1300 | BBUtils->adjustBBOffsetsAfter(MBB: &MF->front()); |
1301 | |
1302 | bool Changed = false; |
1303 | for (auto *ML : *MLI) { |
1304 | if (ML->isOutermost()) |
1305 | Changed |= ProcessLoop(ML); |
1306 | } |
1307 | Changed |= RevertNonLoops(); |
1308 | return Changed; |
1309 | } |
1310 | |
1311 | bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { |
1312 | bool Changed = false; |
1313 | |
1314 | // Process inner loops first. |
1315 | for (MachineLoop *L : *ML) |
1316 | Changed |= ProcessLoop(ML: L); |
1317 | |
1318 | LLVM_DEBUG({ |
1319 | dbgs() << "ARM Loops: Processing loop containing:\n" ; |
1320 | if (auto *Preheader = ML->getLoopPreheader()) |
1321 | dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n" ; |
1322 | else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true)) |
1323 | dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n" ; |
1324 | for (auto *MBB : ML->getBlocks()) |
1325 | dbgs() << " - Block: " << printMBBReference(*MBB) << "\n" ; |
1326 | }); |
1327 | |
1328 | // Search the given block for a loop start instruction. If one isn't found, |
1329 | // and there's only one predecessor block, search that one too. |
1330 | std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = |
1331 | [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { |
1332 | for (auto &MI : *MBB) { |
1333 | if (isLoopStart(MI)) |
1334 | return &MI; |
1335 | } |
1336 | if (MBB->pred_size() == 1) |
1337 | return SearchForStart(*MBB->pred_begin()); |
1338 | return nullptr; |
1339 | }; |
1340 | |
1341 | LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII); |
1342 | // Search the preheader for the start intrinsic. |
1343 | // FIXME: I don't see why we shouldn't be supporting multiple predecessors |
1344 | // with potentially multiple set.loop.iterations, so we need to enable this. |
1345 | if (LoLoop.Preheader) |
1346 | LoLoop.Start = SearchForStart(LoLoop.Preheader); |
1347 | else |
1348 | return Changed; |
1349 | |
1350 | // Find the low-overhead loop components and decide whether or not to fall |
1351 | // back to a normal loop. Also look for a vctp instructions and decide |
1352 | // whether we can convert that predicate using tail predication. |
1353 | for (auto *MBB : reverse(C: ML->getBlocks())) { |
1354 | for (auto &MI : *MBB) { |
1355 | if (MI.isDebugValue()) |
1356 | continue; |
1357 | else if (MI.getOpcode() == ARM::t2LoopDec) |
1358 | LoLoop.Dec = &MI; |
1359 | else if (MI.getOpcode() == ARM::t2LoopEnd) |
1360 | LoLoop.End = &MI; |
1361 | else if (MI.getOpcode() == ARM::t2LoopEndDec) |
1362 | LoLoop.End = LoLoop.Dec = &MI; |
1363 | else if (isLoopStart(MI)) |
1364 | LoLoop.Start = &MI; |
1365 | else if (MI.getDesc().isCall()) { |
1366 | // TODO: Though the call will require LE to execute again, does this |
1367 | // mean we should revert? Always executing LE hopefully should be |
1368 | // faster than performing a sub,cmp,br or even subs,br. |
1369 | LoLoop.Revert = true; |
1370 | LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n" ); |
1371 | } else { |
1372 | // Record VPR defs and build up their corresponding vpt blocks. |
1373 | // Check we know how to tail predicate any mve instructions. |
1374 | LoLoop.AnalyseMVEInst(MI: &MI); |
1375 | } |
1376 | } |
1377 | } |
1378 | |
1379 | LLVM_DEBUG(LoLoop.dump()); |
1380 | if (!LoLoop.FoundAllComponents()) { |
1381 | LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n" ); |
1382 | return Changed; |
1383 | } |
1384 | |
1385 | assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart && |
1386 | "Expected t2WhileLoopStart to be removed before regalloc!" ); |
1387 | |
1388 | // Check that the only instruction using LoopDec is LoopEnd. This can only |
1389 | // happen when the Dec and End are separate, not a single t2LoopEndDec. |
1390 | // TODO: Check for copy chains that really have no effect. |
1391 | if (LoLoop.Dec != LoLoop.End) { |
1392 | SmallPtrSet<MachineInstr *, 2> Uses; |
1393 | RDA->getReachingLocalUses(MI: LoLoop.Dec, Reg: MCRegister::from(Val: ARM::LR), Uses); |
1394 | if (Uses.size() > 1 || !Uses.count(Ptr: LoLoop.End)) { |
1395 | LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n" ); |
1396 | LoLoop.Revert = true; |
1397 | } |
1398 | } |
1399 | LoLoop.Validate(BBUtils: BBUtils.get()); |
1400 | Expand(LoLoop); |
1401 | return true; |
1402 | } |
1403 | |
1404 | // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a |
1405 | // beq that branches to the exit branch. |
1406 | // TODO: We could also try to generate a cbz if the value in LR is also in |
1407 | // another low register. |
1408 | void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { |
1409 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); |
1410 | MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(MI: *MI); |
1411 | unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, MaxDisp: 254) ? |
1412 | ARM::tBcc : ARM::t2Bcc; |
1413 | |
1414 | RevertWhileLoopStartLR(MI, TII, BrOpc); |
1415 | } |
1416 | |
1417 | void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const { |
1418 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI); |
1419 | RevertDoLoopStart(MI, TII); |
1420 | } |
1421 | |
1422 | bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { |
1423 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); |
1424 | MachineBasicBlock *MBB = MI->getParent(); |
1425 | SmallPtrSet<MachineInstr*, 1> Ignore; |
1426 | for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) { |
1427 | if (I->getOpcode() == ARM::t2LoopEnd) { |
1428 | Ignore.insert(Ptr: &*I); |
1429 | break; |
1430 | } |
1431 | } |
1432 | |
1433 | // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS. |
1434 | bool SetFlags = |
1435 | RDA->isSafeToDefRegAt(MI, Reg: MCRegister::from(Val: ARM::CPSR), Ignore); |
1436 | |
1437 | llvm::RevertLoopDec(MI, TII, SetFlags); |
1438 | return SetFlags; |
1439 | } |
1440 | |
1441 | // Generate a subs, or sub and cmp, and a branch instead of an LE. |
1442 | void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { |
1443 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); |
1444 | |
1445 | MachineBasicBlock *DestBB = MI->getOperand(i: 1).getMBB(); |
1446 | unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, MaxDisp: 254) ? |
1447 | ARM::tBcc : ARM::t2Bcc; |
1448 | |
1449 | llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp); |
1450 | } |
1451 | |
1452 | // Generate a subs, or sub and cmp, and a branch instead of an LE. |
1453 | void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const { |
1454 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI); |
1455 | assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!" ); |
1456 | MachineBasicBlock *MBB = MI->getParent(); |
1457 | |
1458 | MachineInstrBuilder MIB = |
1459 | BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::t2SUBri)); |
1460 | MIB.addDef(RegNo: ARM::LR); |
1461 | MIB.add(MO: MI->getOperand(i: 1)); |
1462 | MIB.addImm(Val: 1); |
1463 | MIB.addImm(Val: ARMCC::AL); |
1464 | MIB.addReg(RegNo: ARM::NoRegister); |
1465 | MIB.addReg(RegNo: ARM::CPSR); |
1466 | MIB->getOperand(i: 5).setIsDef(true); |
1467 | |
1468 | MachineBasicBlock *DestBB = MI->getOperand(i: 2).getMBB(); |
1469 | unsigned BrOpc = |
1470 | BBUtils->isBBInRange(MI, DestBB, MaxDisp: 254) ? ARM::tBcc : ARM::t2Bcc; |
1471 | |
1472 | // Create bne |
1473 | MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: BrOpc)); |
1474 | MIB.add(MO: MI->getOperand(i: 2)); // branch target |
1475 | MIB.addImm(Val: ARMCC::NE); // condition code |
1476 | MIB.addReg(RegNo: ARM::CPSR); |
1477 | |
1478 | MI->eraseFromParent(); |
1479 | } |
1480 | |
1481 | // Perform dead code elimation on the loop iteration count setup expression. |
1482 | // If we are tail-predicating, the number of elements to be processed is the |
1483 | // operand of the VCTP instruction in the vector body, see getCount(), which is |
1484 | // register $r3 in this example: |
1485 | // |
1486 | // $lr = big-itercount-expression |
1487 | // .. |
1488 | // $lr = t2DoLoopStart renamable $lr |
1489 | // vector.body: |
1490 | // .. |
1491 | // $vpr = MVE_VCTP32 renamable $r3 |
1492 | // renamable $lr = t2LoopDec killed renamable $lr, 1 |
1493 | // t2LoopEnd renamable $lr, %vector.body |
1494 | // tB %end |
1495 | // |
1496 | // What we would like achieve here is to replace the do-loop start pseudo |
1497 | // instruction t2DoLoopStart with: |
1498 | // |
1499 | // $lr = MVE_DLSTP_32 killed renamable $r3 |
1500 | // |
1501 | // Thus, $r3 which defines the number of elements, is written to $lr, |
1502 | // and then we want to delete the whole chain that used to define $lr, |
1503 | // see the comment below how this chain could look like. |
1504 | // |
1505 | void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) { |
1506 | if (!LoLoop.IsTailPredicationLegal()) |
1507 | return; |
1508 | |
1509 | LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n" ); |
1510 | |
1511 | MachineInstr *Def = RDA->getMIOperand(MI: LoLoop.Start, Idx: 1); |
1512 | if (!Def) { |
1513 | LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n" ); |
1514 | return; |
1515 | } |
1516 | |
1517 | // Collect and remove the users of iteration count. |
1518 | SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec, |
1519 | LoLoop.End }; |
1520 | if (!TryRemove(MI: Def, RDA&: *RDA, ToRemove&: LoLoop.ToRemove, Ignore&: Killed)) |
1521 | LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n" ); |
1522 | } |
1523 | |
1524 | MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { |
1525 | LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n" ); |
1526 | // When using tail-predication, try to delete the dead code that was used to |
1527 | // calculate the number of loop iterations. |
1528 | IterationCountDCE(LoLoop); |
1529 | |
1530 | MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt; |
1531 | MachineInstr *Start = LoLoop.Start; |
1532 | MachineBasicBlock *MBB = LoLoop.StartInsertBB; |
1533 | unsigned Opc = LoLoop.getStartOpcode(); |
1534 | MachineOperand &Count = LoLoop.getLoopStartOperand(); |
1535 | |
1536 | // A DLS lr, lr we needn't emit |
1537 | MachineInstr* NewStart; |
1538 | if (!DisableOmitDLS && Opc == ARM::t2DLS && Count.isReg() && |
1539 | Count.getReg() == ARM::LR) { |
1540 | LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr" ); |
1541 | NewStart = nullptr; |
1542 | } else { |
1543 | MachineInstrBuilder MIB = |
1544 | BuildMI(BB&: *MBB, I: InsertPt, MIMD: Start->getDebugLoc(), MCID: TII->get(Opcode: Opc)); |
1545 | |
1546 | MIB.addDef(RegNo: ARM::LR); |
1547 | MIB.add(MO: Count); |
1548 | if (isWhileLoopStart(MI: *Start)) |
1549 | MIB.addMBB(MBB: getWhileLoopStartTargetBB(MI: *Start)); |
1550 | |
1551 | LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); |
1552 | NewStart = &*MIB; |
1553 | } |
1554 | |
1555 | LoLoop.ToRemove.insert(Ptr: Start); |
1556 | return NewStart; |
1557 | } |
1558 | |
1559 | void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { |
1560 | auto RemovePredicate = [](MachineInstr *MI) { |
1561 | if (MI->isDebugInstr()) |
1562 | return; |
1563 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); |
1564 | int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI); |
1565 | assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction" ); |
1566 | assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then && |
1567 | "Expected Then predicate!" ); |
1568 | MI->getOperand(i: PIdx).setImm(ARMVCC::None); |
1569 | MI->getOperand(i: PIdx + 1).setReg(0); |
1570 | }; |
1571 | |
1572 | for (auto &Block : LoLoop.getVPTBlocks()) { |
1573 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
1574 | |
1575 | auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) { |
1576 | assert(TheVCMP && "Replacing a removed or non-existent VCMP" ); |
1577 | // Replace the VCMP with a VPT |
1578 | MachineInstrBuilder MIB = |
1579 | BuildMI(BB&: *At->getParent(), I: At, MIMD: At->getDebugLoc(), |
1580 | MCID: TII->get(Opcode: VCMPOpcodeToVPT(Opcode: TheVCMP->getOpcode()))); |
1581 | MIB.addImm(Val: ARMVCC::Then); |
1582 | // Register one |
1583 | MIB.add(MO: TheVCMP->getOperand(i: 1)); |
1584 | // Register two |
1585 | MIB.add(MO: TheVCMP->getOperand(i: 2)); |
1586 | // The comparison code, e.g. ge, eq, lt |
1587 | MIB.add(MO: TheVCMP->getOperand(i: 3)); |
1588 | LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB); |
1589 | LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr()); |
1590 | LoLoop.ToRemove.insert(Ptr: TheVCMP); |
1591 | TheVCMP = nullptr; |
1592 | }; |
1593 | |
1594 | if (LoLoop.VPTstate.isEntryPredicatedOnVCTP(Block, /*exclusive*/ Exclusive: true)) { |
1595 | MachineInstr *VPST = Insts.front(); |
1596 | if (Block.hasUniformPredicate()) { |
1597 | // A vpt block starting with VPST, is only predicated upon vctp and has no |
1598 | // internal vpr defs: |
1599 | // - Remove vpst. |
1600 | // - Unpredicate the remaining instructions. |
1601 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); |
1602 | for (unsigned i = 1; i < Insts.size(); ++i) |
1603 | RemovePredicate(Insts[i]); |
1604 | } else { |
1605 | // The VPT block has a non-uniform predicate but it uses a vpst and its |
1606 | // entry is guarded only by a vctp, which means we: |
1607 | // - Need to remove the original vpst. |
1608 | // - Then need to unpredicate any following instructions, until |
1609 | // we come across the divergent vpr def. |
1610 | // - Insert a new vpst to predicate the instruction(s) that following |
1611 | // the divergent vpr def. |
1612 | MachineInstr *Divergent = Block.getDivergent(); |
1613 | MachineBasicBlock *MBB = Divergent->getParent(); |
1614 | auto DivergentNext = ++MachineBasicBlock::iterator(Divergent); |
1615 | while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr()) |
1616 | ++DivergentNext; |
1617 | |
1618 | bool DivergentNextIsPredicated = |
1619 | DivergentNext != MBB->end() && |
1620 | getVPTInstrPredicate(MI: *DivergentNext) != ARMVCC::None; |
1621 | |
1622 | for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext; |
1623 | I != E; ++I) |
1624 | RemovePredicate(&*I); |
1625 | |
1626 | // Check if the instruction defining vpr is a vcmp so it can be combined |
1627 | // with the VPST This should be the divergent instruction |
1628 | MachineInstr *VCMP = |
1629 | VCMPOpcodeToVPT(Opcode: Divergent->getOpcode()) != 0 ? Divergent : nullptr; |
1630 | |
1631 | if (DivergentNextIsPredicated) { |
1632 | // Insert a VPST at the divergent only if the next instruction |
1633 | // would actually use it. A VCMP following a VPST can be |
1634 | // merged into a VPT so do that instead if the VCMP exists. |
1635 | if (!VCMP) { |
1636 | // Create a VPST (with a null mask for now, we'll recompute it |
1637 | // later) |
1638 | MachineInstrBuilder MIB = |
1639 | BuildMI(BB&: *Divergent->getParent(), I: Divergent, |
1640 | MIMD: Divergent->getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VPST)); |
1641 | MIB.addImm(Val: 0); |
1642 | LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB); |
1643 | LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr()); |
1644 | } else { |
1645 | // No RDA checks are necessary here since the VPST would have been |
1646 | // directly after the VCMP |
1647 | ReplaceVCMPWithVPT(VCMP, VCMP); |
1648 | } |
1649 | } |
1650 | } |
1651 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); |
1652 | LoLoop.ToRemove.insert(Ptr: VPST); |
1653 | } else if (Block.containsVCTP()) { |
1654 | // The vctp will be removed, so either the entire block will be dead or |
1655 | // the block mask of the vp(s)t will need to be recomputed. |
1656 | MachineInstr *VPST = Insts.front(); |
1657 | if (Block.size() == 2) { |
1658 | assert(VPST->getOpcode() == ARM::MVE_VPST && |
1659 | "Found a VPST in an otherwise empty vpt block" ); |
1660 | LoLoop.ToRemove.insert(Ptr: VPST); |
1661 | } else |
1662 | LoLoop.BlockMasksToRecompute.insert(Ptr: VPST); |
1663 | } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) { |
1664 | // If this block starts with a VPST then attempt to merge it with the |
1665 | // preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT |
1666 | // block that no longer exists |
1667 | MachineInstr *VPST = Insts.front(); |
1668 | auto Next = ++MachineBasicBlock::iterator(VPST); |
1669 | assert(getVPTInstrPredicate(*Next) != ARMVCC::None && |
1670 | "The instruction after a VPST must be predicated" ); |
1671 | (void)Next; |
1672 | MachineInstr *VprDef = RDA->getUniqueReachingMIDef(MI: VPST, Reg: ARM::VPR); |
1673 | if (VprDef && VCMPOpcodeToVPT(Opcode: VprDef->getOpcode()) && |
1674 | !LoLoop.ToRemove.contains(Ptr: VprDef)) { |
1675 | MachineInstr *VCMP = VprDef; |
1676 | // The VCMP and VPST can only be merged if the VCMP's operands will have |
1677 | // the same values at the VPST. |
1678 | // If any of the instructions between the VCMP and VPST are predicated |
1679 | // then a different code path is expected to have merged the VCMP and |
1680 | // VPST already. |
1681 | if (std::none_of(first: ++MachineBasicBlock::iterator(VCMP), |
1682 | last: MachineBasicBlock::iterator(VPST), pred: hasVPRUse) && |
1683 | RDA->hasSameReachingDef(A: VCMP, B: VPST, Reg: VCMP->getOperand(i: 1).getReg()) && |
1684 | RDA->hasSameReachingDef(A: VCMP, B: VPST, Reg: VCMP->getOperand(i: 2).getReg())) { |
1685 | ReplaceVCMPWithVPT(VCMP, VPST); |
1686 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); |
1687 | LoLoop.ToRemove.insert(Ptr: VPST); |
1688 | } |
1689 | } |
1690 | } |
1691 | } |
1692 | |
1693 | LoLoop.ToRemove.insert_range(R&: LoLoop.VCTPs); |
1694 | } |
1695 | |
1696 | void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { |
1697 | |
1698 | // Combine the LoopDec and LoopEnd instructions into LE(TP). |
1699 | auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) { |
1700 | MachineInstr *End = LoLoop.End; |
1701 | MachineBasicBlock *MBB = End->getParent(); |
1702 | unsigned Opc = LoLoop.IsTailPredicationLegal() ? |
1703 | ARM::MVE_LETP : ARM::t2LEUpdate; |
1704 | MachineInstrBuilder MIB = BuildMI(BB&: *MBB, I: End, MIMD: End->getDebugLoc(), |
1705 | MCID: TII->get(Opcode: Opc)); |
1706 | MIB.addDef(RegNo: ARM::LR); |
1707 | unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0; |
1708 | MIB.add(MO: End->getOperand(i: Off + 0)); |
1709 | MIB.add(MO: End->getOperand(i: Off + 1)); |
1710 | LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); |
1711 | LoLoop.ToRemove.insert(Ptr: LoLoop.Dec); |
1712 | LoLoop.ToRemove.insert(Ptr: End); |
1713 | return &*MIB; |
1714 | }; |
1715 | |
1716 | // TODO: We should be able to automatically remove these branches before we |
1717 | // get here - probably by teaching analyzeBranch about the pseudo |
1718 | // instructions. |
1719 | // If there is an unconditional branch, after I, that just branches to the |
1720 | // next block, remove it. |
1721 | auto RemoveDeadBranch = [](MachineInstr *I) { |
1722 | MachineBasicBlock *BB = I->getParent(); |
1723 | MachineInstr *Terminator = &BB->instr_back(); |
1724 | if (Terminator->isUnconditionalBranch() && I != Terminator) { |
1725 | MachineBasicBlock *Succ = Terminator->getOperand(i: 0).getMBB(); |
1726 | if (BB->isLayoutSuccessor(MBB: Succ)) { |
1727 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator); |
1728 | Terminator->eraseFromParent(); |
1729 | } |
1730 | } |
1731 | }; |
1732 | |
1733 | // And VMOVCopies need to become 2xVMOVD for tail predication to be valid. |
1734 | // Anything other MQPRCopy can be converted to MVE_VORR later on. |
1735 | auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) { |
1736 | for (auto *MI : VMOVCopies) { |
1737 | LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI); |
1738 | assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!" ); |
1739 | MachineBasicBlock *MBB = MI->getParent(); |
1740 | Register Dst = MI->getOperand(i: 0).getReg(); |
1741 | Register Src = MI->getOperand(i: 1).getReg(); |
1742 | auto MIB1 = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::VMOVD), |
1743 | DestReg: ARM::D0 + (Dst - ARM::Q0) * 2) |
1744 | .addReg(RegNo: ARM::D0 + (Src - ARM::Q0) * 2) |
1745 | .add(MOs: predOps(Pred: ARMCC::AL)); |
1746 | (void)MIB1; |
1747 | LLVM_DEBUG(dbgs() << " into " << *MIB1); |
1748 | auto MIB2 = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::VMOVD), |
1749 | DestReg: ARM::D0 + (Dst - ARM::Q0) * 2 + 1) |
1750 | .addReg(RegNo: ARM::D0 + (Src - ARM::Q0) * 2 + 1) |
1751 | .add(MOs: predOps(Pred: ARMCC::AL)); |
1752 | LLVM_DEBUG(dbgs() << " and " << *MIB2); |
1753 | (void)MIB2; |
1754 | MI->eraseFromParent(); |
1755 | } |
1756 | }; |
1757 | |
1758 | if (LoLoop.Revert) { |
1759 | if (isWhileLoopStart(MI: *LoLoop.Start)) |
1760 | RevertWhile(MI: LoLoop.Start); |
1761 | else |
1762 | RevertDo(MI: LoLoop.Start); |
1763 | if (LoLoop.Dec == LoLoop.End) |
1764 | RevertLoopEndDec(MI: LoLoop.End); |
1765 | else |
1766 | RevertLoopEnd(MI: LoLoop.End, SkipCmp: RevertLoopDec(MI: LoLoop.Dec)); |
1767 | } else { |
1768 | ExpandVMOVCopies(LoLoop.VMOVCopies); |
1769 | LoLoop.Start = ExpandLoopStart(LoLoop); |
1770 | if (LoLoop.Start) |
1771 | RemoveDeadBranch(LoLoop.Start); |
1772 | LoLoop.End = ExpandLoopEnd(LoLoop); |
1773 | RemoveDeadBranch(LoLoop.End); |
1774 | if (LoLoop.IsTailPredicationLegal()) |
1775 | ConvertVPTBlocks(LoLoop); |
1776 | for (auto *I : LoLoop.ToRemove) { |
1777 | LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I); |
1778 | I->eraseFromParent(); |
1779 | } |
1780 | for (auto *I : LoLoop.BlockMasksToRecompute) { |
1781 | LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I); |
1782 | recomputeVPTBlockMask(Instr&: *I); |
1783 | LLVM_DEBUG(dbgs() << " ... done: " << *I); |
1784 | } |
1785 | } |
1786 | |
1787 | PostOrderLoopTraversal DFS(LoLoop.ML, *MLI); |
1788 | DFS.ProcessLoop(); |
1789 | const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder(); |
1790 | fullyRecomputeLiveIns(MBBs: PostOrder); |
1791 | |
1792 | for (auto *MBB : reverse(C: PostOrder)) |
1793 | recomputeLivenessFlags(MBB&: *MBB); |
1794 | |
1795 | // We've moved, removed and inserted new instructions, so update RDA. |
1796 | RDA->reset(); |
1797 | } |
1798 | |
1799 | bool ARMLowOverheadLoops::RevertNonLoops() { |
1800 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n" ); |
1801 | bool Changed = false; |
1802 | |
1803 | for (auto &MBB : *MF) { |
1804 | SmallVector<MachineInstr*, 4> Starts; |
1805 | SmallVector<MachineInstr*, 4> Decs; |
1806 | SmallVector<MachineInstr*, 4> Ends; |
1807 | SmallVector<MachineInstr *, 4> EndDecs; |
1808 | SmallVector<MachineInstr *, 4> MQPRCopies; |
1809 | |
1810 | for (auto &I : MBB) { |
1811 | if (isLoopStart(MI: I)) |
1812 | Starts.push_back(Elt: &I); |
1813 | else if (I.getOpcode() == ARM::t2LoopDec) |
1814 | Decs.push_back(Elt: &I); |
1815 | else if (I.getOpcode() == ARM::t2LoopEnd) |
1816 | Ends.push_back(Elt: &I); |
1817 | else if (I.getOpcode() == ARM::t2LoopEndDec) |
1818 | EndDecs.push_back(Elt: &I); |
1819 | else if (I.getOpcode() == ARM::MQPRCopy) |
1820 | MQPRCopies.push_back(Elt: &I); |
1821 | } |
1822 | |
1823 | if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() && |
1824 | MQPRCopies.empty()) |
1825 | continue; |
1826 | |
1827 | Changed = true; |
1828 | |
1829 | for (auto *Start : Starts) { |
1830 | if (isWhileLoopStart(MI: *Start)) |
1831 | RevertWhile(MI: Start); |
1832 | else |
1833 | RevertDo(MI: Start); |
1834 | } |
1835 | for (auto *Dec : Decs) |
1836 | RevertLoopDec(MI: Dec); |
1837 | |
1838 | for (auto *End : Ends) |
1839 | RevertLoopEnd(MI: End); |
1840 | for (auto *End : EndDecs) |
1841 | RevertLoopEndDec(MI: End); |
1842 | for (auto *MI : MQPRCopies) { |
1843 | LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI); |
1844 | assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!" ); |
1845 | MachineBasicBlock *MBB = MI->getParent(); |
1846 | auto MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VORR), |
1847 | DestReg: MI->getOperand(i: 0).getReg()) |
1848 | .add(MO: MI->getOperand(i: 1)) |
1849 | .add(MO: MI->getOperand(i: 1)); |
1850 | addUnpredicatedMveVpredROp(MIB, DestReg: MI->getOperand(i: 0).getReg()); |
1851 | MI->eraseFromParent(); |
1852 | } |
1853 | } |
1854 | return Changed; |
1855 | } |
1856 | |
1857 | FunctionPass *llvm::createARMLowOverheadLoopsPass() { |
1858 | return new ARMLowOverheadLoops(); |
1859 | } |
1860 | |