1 | //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// Finalize v8.1-m low-overhead loops by converting the associated pseudo |
10 | /// instructions into machine operations. |
/// The expectation is that the loop contains three pseudo instructions:
/// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
///   form should be in the preheader, whereas the while form should be in the
///   preheader's only predecessor.
/// - t2LoopDec - placed within the loop body.
/// - t2LoopEnd - the loop latch terminator.
17 | /// |
/// In addition to this, we also look for the presence of the VCTP instruction,
/// which determines whether we can generate the tail-predicated low-overhead
/// loop form.
21 | /// |
22 | /// Assumptions and Dependencies: |
23 | /// Low-overhead loops are constructed and executed using a setup instruction: |
24 | /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP. |
25 | /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range |
26 | /// but fixed polarity: WLS can only branch forwards and LE can only branch |
27 | /// backwards. These restrictions mean that this pass is dependent upon block |
28 | /// layout and block sizes, which is why it's the last pass to run. The same is |
29 | /// true for ConstantIslands, but this pass does not increase the size of the |
30 | /// basic blocks, nor does it change the CFG. Instructions are mainly removed |
31 | /// during the transform and pseudo instructions are replaced by real ones. In |
32 | /// some cases, when we have to revert to a 'normal' loop, we have to introduce |
33 | /// multiple instructions for a single pseudo (see RevertWhile and |
34 | /// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd |
35 | /// are defined to be as large as this maximum sequence of replacement |
36 | /// instructions. |
37 | /// |
38 | /// A note on VPR.P0 (the lane mask): |
39 | /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a |
40 | /// "VPT Active" context (which includes low-overhead loops and vpt blocks). |
41 | /// They will simply "and" the result of their calculation with the current |
42 | /// value of VPR.P0. You can think of it like this: |
43 | /// \verbatim |
44 | /// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs |
45 | /// VPR.P0 &= Value |
46 | /// else |
47 | /// VPR.P0 = Value |
48 | /// \endverbatim |
/// When we're inside the low-overhead loop (between DLSTP and LETP), we always
/// fall in the "VPT active" case, so we can consider that any VPR write by
/// one of those instructions is actually an "and".
52 | //===----------------------------------------------------------------------===// |
53 | |
54 | #include "ARM.h" |
55 | #include "ARMBaseInstrInfo.h" |
56 | #include "ARMBaseRegisterInfo.h" |
57 | #include "ARMBasicBlockInfo.h" |
58 | #include "ARMSubtarget.h" |
59 | #include "MVETailPredUtils.h" |
60 | #include "Thumb2InstrInfo.h" |
61 | #include "llvm/ADT/SetOperations.h" |
62 | #include "llvm/ADT/SetVector.h" |
63 | #include "llvm/CodeGen/LivePhysRegs.h" |
64 | #include "llvm/CodeGen/MachineFrameInfo.h" |
65 | #include "llvm/CodeGen/MachineFunctionPass.h" |
66 | #include "llvm/CodeGen/MachineLoopInfo.h" |
67 | #include "llvm/CodeGen/MachineLoopUtils.h" |
68 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
69 | #include "llvm/CodeGen/Passes.h" |
70 | #include "llvm/CodeGen/ReachingDefAnalysis.h" |
71 | #include "llvm/MC/MCInstrDesc.h" |
72 | |
73 | using namespace llvm; |
74 | |
75 | #define DEBUG_TYPE "arm-low-overhead-loops" |
76 | #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" |
77 | |
78 | static cl::opt<bool> |
79 | DisableTailPredication("arm-loloops-disable-tailpred" , cl::Hidden, |
80 | cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass" ), |
81 | cl::init(Val: false)); |
82 | |
83 | static cl::opt<bool> |
84 | DisableOmitDLS("arm-disable-omit-dls" , cl::Hidden, |
85 | cl::desc("Disable omitting 'dls lr, lr' instructions" ), |
86 | cl::init(Val: false)); |
87 | |
88 | static bool isVectorPredicated(MachineInstr *MI) { |
89 | int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI); |
90 | return PIdx != -1 && MI->getOperand(i: PIdx + 1).getReg() == ARM::VPR; |
91 | } |
92 | |
93 | static bool isVectorPredicate(MachineInstr *MI) { |
94 | return MI->findRegisterDefOperandIdx(Reg: ARM::VPR, /*TRI=*/nullptr) != -1; |
95 | } |
96 | |
97 | static bool hasVPRUse(MachineInstr &MI) { |
98 | return MI.findRegisterUseOperandIdx(Reg: ARM::VPR, /*TRI=*/nullptr) != -1; |
99 | } |
100 | |
101 | static bool isDomainMVE(MachineInstr *MI) { |
102 | uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask; |
103 | return Domain == ARMII::DomainMVE; |
104 | } |
105 | |
106 | static int getVecSize(const MachineInstr &MI) { |
107 | const MCInstrDesc &MCID = MI.getDesc(); |
108 | uint64_t Flags = MCID.TSFlags; |
109 | return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift; |
110 | } |
111 | |
112 | static bool shouldInspect(MachineInstr &MI) { |
113 | if (MI.isDebugInstr()) |
114 | return false; |
115 | return isDomainMVE(MI: &MI) || isVectorPredicate(MI: &MI) || hasVPRUse(MI); |
116 | } |
117 | |
118 | static bool isHorizontalReduction(const MachineInstr &MI) { |
119 | const MCInstrDesc &MCID = MI.getDesc(); |
120 | uint64_t Flags = MCID.TSFlags; |
121 | return (Flags & ARMII::HorizontalReduction) != 0; |
122 | } |
123 | |
124 | namespace { |
125 | |
126 | using InstSet = SmallPtrSetImpl<MachineInstr *>; |
127 | |
128 | class PostOrderLoopTraversal { |
129 | MachineLoop &ML; |
130 | MachineLoopInfo &MLI; |
131 | SmallPtrSet<MachineBasicBlock*, 4> Visited; |
132 | SmallVector<MachineBasicBlock*, 4> Order; |
133 | |
134 | public: |
135 | PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI) |
136 | : ML(ML), MLI(MLI) { } |
137 | |
138 | const SmallVectorImpl<MachineBasicBlock*> &getOrder() const { |
139 | return Order; |
140 | } |
141 | |
142 | // Visit all the blocks within the loop, as well as exit blocks and any |
143 | // blocks properly dominating the header. |
144 | void ProcessLoop() { |
145 | std::function<void(MachineBasicBlock*)> Search = [this, &Search] |
146 | (MachineBasicBlock *MBB) -> void { |
147 | if (Visited.count(Ptr: MBB)) |
148 | return; |
149 | |
150 | Visited.insert(Ptr: MBB); |
151 | for (auto *Succ : MBB->successors()) { |
152 | if (!ML.contains(BB: Succ)) |
153 | continue; |
154 | Search(Succ); |
155 | } |
156 | Order.push_back(Elt: MBB); |
157 | }; |
158 | |
159 | // Insert exit blocks. |
160 | SmallVector<MachineBasicBlock*, 2> ExitBlocks; |
161 | ML.getExitBlocks(ExitBlocks); |
162 | append_range(C&: Order, R&: ExitBlocks); |
163 | |
164 | // Then add the loop body. |
165 | Search(ML.getHeader()); |
166 | |
167 | // Then try the preheader and its predecessors. |
168 | std::function<void(MachineBasicBlock*)> GetPredecessor = |
169 | [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void { |
170 | Order.push_back(Elt: MBB); |
171 | if (MBB->pred_size() == 1) |
172 | GetPredecessor(*MBB->pred_begin()); |
173 | }; |
174 | |
175 | if (auto * = ML.getLoopPreheader()) |
176 | GetPredecessor(Preheader); |
177 | else if (auto * = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true)) |
178 | GetPredecessor(Preheader); |
179 | } |
180 | }; |
181 | |
182 | class VPTBlock { |
183 | SmallVector<MachineInstr *, 4> Insts; |
184 | |
185 | public: |
186 | VPTBlock(MachineInstr *MI) { Insts.push_back(Elt: MI); } |
187 | |
188 | // Have we found an instruction within the block which defines the vpr? If |
189 | // so, not all the instructions in the block will have the same predicate. |
190 | bool hasUniformPredicate() { return getDivergent() == nullptr; } |
191 | |
192 | // If it exists, return the first internal instruction which modifies the |
193 | // VPR. |
194 | MachineInstr *getDivergent() { |
195 | SmallVectorImpl<MachineInstr *> &Insts = getInsts(); |
196 | for (unsigned i = 1; i < Insts.size(); ++i) { |
197 | MachineInstr *Next = Insts[i]; |
198 | if (isVectorPredicate(MI: Next)) |
199 | return Next; // Found an instruction altering the vpr. |
200 | } |
201 | return nullptr; |
202 | } |
203 | |
204 | void insert(MachineInstr *MI) { |
205 | Insts.push_back(Elt: MI); |
206 | // VPT/VPST + 4 predicated instructions. |
207 | assert(Insts.size() <= 5 && "Too many instructions in VPT block!" ); |
208 | } |
209 | |
210 | bool containsVCTP() const { return llvm::any_of(Range: Insts, P: isVCTP); } |
211 | |
212 | unsigned size() const { return Insts.size(); } |
213 | SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; } |
214 | }; |
215 | |
216 | // Represent the current state of the VPR and hold all instances which |
217 | // represent a VPT block, which is a list of instructions that begins with a |
218 | // VPT/VPST and has a maximum of four proceeding instructions. All |
219 | // instructions within the block are predicated upon the vpr and we allow |
220 | // instructions to define the vpr within in the block too. |
221 | class VPTState { |
222 | friend struct LowOverheadLoop; |
223 | |
224 | SmallVector<VPTBlock, 4> Blocks; |
225 | SetVector<MachineInstr *> CurrentPredicates; |
226 | std::map<MachineInstr *, SetVector<MachineInstr *>> PredicatedInsts; |
227 | |
228 | void CreateVPTBlock(MachineInstr *MI) { |
229 | assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR)) |
230 | && "Can't begin VPT without predicate" ); |
231 | Blocks.emplace_back(Args&: MI); |
232 | // The execution of MI is predicated upon the current set of instructions |
233 | // that are AND'ed together to form the VPR predicate value. In the case |
234 | // that MI is a VPT, CurrentPredicates will also just be MI. |
235 | PredicatedInsts[MI] = CurrentPredicates; |
236 | } |
237 | |
238 | void addInst(MachineInstr *MI) { |
239 | Blocks.back().insert(MI); |
240 | PredicatedInsts[MI] = CurrentPredicates; |
241 | } |
242 | |
243 | void addPredicate(MachineInstr *MI) { |
244 | LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI); |
245 | CurrentPredicates.insert(X: MI); |
246 | } |
247 | |
248 | void resetPredicate(MachineInstr *MI) { |
249 | LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI); |
250 | CurrentPredicates.clear(); |
251 | CurrentPredicates.insert(X: MI); |
252 | } |
253 | |
254 | public: |
255 | // Return whether the given instruction is predicated upon a VCTP. |
256 | bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) { |
257 | SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]; |
258 | if (Exclusive && Predicates.size() != 1) |
259 | return false; |
260 | // We do not know how to convert an else predicate of a VCTP. |
261 | if (getVPTInstrPredicate(MI: *MI) == ARMVCC::Else) |
262 | return false; |
263 | return llvm::any_of(Range&: Predicates, P: isVCTP); |
264 | } |
265 | |
266 | // Is the VPST, controlling the block entry, predicated upon a VCTP. |
267 | bool isEntryPredicatedOnVCTP(VPTBlock &Block, bool Exclusive = false) { |
268 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
269 | return isPredicatedOnVCTP(MI: Insts.front(), Exclusive); |
270 | } |
271 | |
272 | // If this block begins with a VPT, we can check whether it's using |
273 | // at least one predicated input(s), as well as possible loop invariant |
274 | // which would result in it being implicitly predicated. |
275 | bool hasImplicitlyValidVPT(VPTBlock &Block, ReachingDefAnalysis &RDA) { |
276 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
277 | MachineInstr *VPT = Insts.front(); |
278 | assert(isVPTOpcode(VPT->getOpcode()) && |
279 | "Expected VPT block to begin with VPT/VPST" ); |
280 | |
281 | if (VPT->getOpcode() == ARM::MVE_VPST) |
282 | return false; |
283 | |
284 | // If the VPT block does not define something that is an "output", then |
285 | // the tail-predicated version will just perform a subset of the original |
286 | // vpt block, where the last lanes should not be used. |
287 | if (isVPTOpcode(Opc: VPT->getOpcode()) && |
288 | all_of(Range&: Block.getInsts(), P: [](const MachineInstr *MI) { |
289 | return !MI->mayStore() && !MI->mayLoad() && |
290 | !isHorizontalReduction(MI: *MI) && !isVCTP(MI); |
291 | })) |
292 | return true; |
293 | |
294 | auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) { |
295 | MachineInstr *Op = RDA.getMIOperand(MI, MO&: MI->getOperand(i: Idx)); |
296 | return Op && PredicatedInsts.count(x: Op) && isPredicatedOnVCTP(MI: Op); |
297 | }; |
298 | |
299 | auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) { |
300 | MachineOperand &MO = MI->getOperand(i: Idx); |
301 | if (!MO.isReg() || !MO.getReg()) |
302 | return true; |
303 | |
304 | SmallPtrSet<MachineInstr *, 2> Defs; |
305 | RDA.getGlobalReachingDefs(MI, PhysReg: MO.getReg(), Defs); |
306 | if (Defs.empty()) |
307 | return true; |
308 | |
309 | for (auto *Def : Defs) |
310 | if (Def->getParent() == VPT->getParent()) |
311 | return false; |
312 | return true; |
313 | }; |
314 | |
315 | // Check that at least one of the operands is directly predicated on a |
316 | // vctp and allow an invariant value too. |
317 | return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) && |
318 | (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) && |
319 | (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2)); |
320 | } |
321 | |
322 | bool isValid(ReachingDefAnalysis &RDA) { |
323 | // All predication within the loop should be based on vctp. If the block |
324 | // isn't predicated on entry, check whether the vctp is within the block |
325 | // and that all other instructions are then predicated on it. |
326 | for (auto &Block : Blocks) { |
327 | if (isEntryPredicatedOnVCTP(Block, Exclusive: false) && |
328 | !any_of(Range: drop_begin(RangeOrContainer&: Block.getInsts()), P: [](const MachineInstr *MI) { |
329 | return getVPTInstrPredicate(MI: *MI) == ARMVCC::Else; |
330 | })) |
331 | continue; |
332 | if (hasImplicitlyValidVPT(Block, RDA)) |
333 | continue; |
334 | |
335 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
336 | // We don't know how to convert a block with just a VPT;VCTP into |
337 | // anything valid once we remove the VCTP. For now just bail out. |
338 | assert(isVPTOpcode(Insts.front()->getOpcode()) && |
339 | "Expected VPT block to start with a VPST or VPT!" ); |
340 | if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST && |
341 | isVCTP(MI: Insts.back())) |
342 | return false; |
343 | |
344 | for (auto *MI : Insts) { |
345 | // Check that any internal VCTPs are 'Then' predicated. |
346 | if (isVCTP(MI) && getVPTInstrPredicate(MI: *MI) != ARMVCC::Then) |
347 | return false; |
348 | // Skip other instructions that build up the predicate. |
349 | if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI)) |
350 | continue; |
351 | // Check that any other instructions are predicated upon a vctp. |
352 | // TODO: We could infer when VPTs are implicitly predicated on the |
353 | // vctp (when the operands are predicated). |
354 | if (!isPredicatedOnVCTP(MI)) { |
355 | LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI); |
356 | return false; |
357 | } |
358 | } |
359 | } |
360 | return true; |
361 | } |
362 | }; |
363 | |
364 | struct LowOverheadLoop { |
365 | |
366 | MachineLoop &ML; |
367 | MachineBasicBlock * = nullptr; |
368 | MachineLoopInfo &MLI; |
369 | ReachingDefAnalysis &RDA; |
370 | const TargetRegisterInfo &TRI; |
371 | const ARMBaseInstrInfo &TII; |
372 | MachineFunction *MF = nullptr; |
373 | MachineBasicBlock::iterator StartInsertPt; |
374 | MachineBasicBlock *StartInsertBB = nullptr; |
375 | MachineInstr *Start = nullptr; |
376 | MachineInstr *Dec = nullptr; |
377 | MachineInstr *End = nullptr; |
378 | MachineOperand TPNumElements; |
379 | SmallVector<MachineInstr *, 4> VCTPs; |
380 | SmallPtrSet<MachineInstr *, 4> ToRemove; |
381 | SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute; |
382 | SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs; |
383 | SmallPtrSet<MachineInstr *, 4> VMOVCopies; |
384 | bool Revert = false; |
385 | bool CannotTailPredicate = false; |
386 | VPTState VPTstate; |
387 | |
388 | LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI, |
389 | ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI, |
390 | const ARMBaseInstrInfo &TII) |
391 | : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII), |
392 | TPNumElements(MachineOperand::CreateImm(Val: 0)) { |
393 | MF = ML.getHeader()->getParent(); |
394 | if (auto *MBB = ML.getLoopPreheader()) |
395 | Preheader = MBB; |
396 | else if (auto *MBB = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true)) |
397 | Preheader = MBB; |
398 | } |
399 | |
400 | // If this is an MVE instruction, check that we know how to use tail |
401 | // predication with it. Record VPT blocks and return whether the |
402 | // instruction is valid for tail predication. |
403 | bool ValidateMVEInst(MachineInstr *MI); |
404 | |
405 | void AnalyseMVEInst(MachineInstr *MI) { |
406 | CannotTailPredicate = !ValidateMVEInst(MI); |
407 | } |
408 | |
409 | bool IsTailPredicationLegal() const { |
410 | // For now, let's keep things really simple and only support a single |
411 | // block for tail predication. |
412 | return !Revert && FoundAllComponents() && !VCTPs.empty() && |
413 | !CannotTailPredicate && ML.getNumBlocks() == 1; |
414 | } |
415 | |
416 | // Given that MI is a VCTP, check that is equivalent to any other VCTPs |
417 | // found. |
418 | bool AddVCTP(MachineInstr *MI); |
419 | |
420 | // Check that the predication in the loop will be equivalent once we |
421 | // perform the conversion. Also ensure that we can provide the number |
422 | // of elements to the loop start instruction. |
423 | bool ValidateTailPredicate(); |
424 | |
425 | // Check that any values available outside of the loop will be the same |
426 | // after tail predication conversion. |
427 | bool ValidateLiveOuts(); |
428 | |
429 | // Check the branch targets are within range and we satisfy our |
430 | // restrictions. |
431 | void Validate(ARMBasicBlockUtils *BBUtils); |
432 | |
433 | bool FoundAllComponents() const { |
434 | return Start && Dec && End; |
435 | } |
436 | |
437 | SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTstate.Blocks; } |
438 | |
439 | // Return the operand for the loop start instruction. This will be the loop |
440 | // iteration count, or the number of elements if we're tail predicating. |
441 | MachineOperand &getLoopStartOperand() { |
442 | if (IsTailPredicationLegal()) |
443 | return TPNumElements; |
444 | return Start->getOperand(i: 1); |
445 | } |
446 | |
447 | unsigned getStartOpcode() const { |
448 | bool IsDo = isDoLoopStart(MI: *Start); |
449 | if (!IsTailPredicationLegal()) |
450 | return IsDo ? ARM::t2DLS : ARM::t2WLS; |
451 | |
452 | return VCTPOpcodeToLSTP(Opcode: VCTPs.back()->getOpcode(), IsDoLoop: IsDo); |
453 | } |
454 | |
455 | void dump() const { |
456 | if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; |
457 | if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; |
458 | if (End) dbgs() << "ARM Loops: Found Loop End: " << *End; |
459 | if (!VCTPs.empty()) { |
460 | dbgs() << "ARM Loops: Found VCTP(s):\n" ; |
461 | for (auto *MI : VCTPs) |
462 | dbgs() << " - " << *MI; |
463 | } |
464 | if (!FoundAllComponents()) |
465 | dbgs() << "ARM Loops: Not a low-overhead loop.\n" ; |
466 | else if (!(Start && Dec && End)) |
467 | dbgs() << "ARM Loops: Failed to find all loop components.\n" ; |
468 | } |
469 | }; |
470 | |
471 | class ARMLowOverheadLoops : public MachineFunctionPass { |
472 | MachineFunction *MF = nullptr; |
473 | MachineLoopInfo *MLI = nullptr; |
474 | ReachingDefAnalysis *RDA = nullptr; |
475 | const ARMBaseInstrInfo *TII = nullptr; |
476 | MachineRegisterInfo *MRI = nullptr; |
477 | const TargetRegisterInfo *TRI = nullptr; |
478 | std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; |
479 | |
480 | public: |
481 | static char ID; |
482 | |
483 | ARMLowOverheadLoops() : MachineFunctionPass(ID) { } |
484 | |
485 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
486 | AU.setPreservesCFG(); |
487 | AU.addRequired<MachineLoopInfoWrapperPass>(); |
488 | AU.addRequired<ReachingDefAnalysis>(); |
489 | MachineFunctionPass::getAnalysisUsage(AU); |
490 | } |
491 | |
492 | bool runOnMachineFunction(MachineFunction &MF) override; |
493 | |
494 | MachineFunctionProperties getRequiredProperties() const override { |
495 | return MachineFunctionProperties().set( |
496 | MachineFunctionProperties::Property::NoVRegs).set( |
497 | MachineFunctionProperties::Property::TracksLiveness); |
498 | } |
499 | |
500 | StringRef getPassName() const override { |
501 | return ARM_LOW_OVERHEAD_LOOPS_NAME; |
502 | } |
503 | |
504 | private: |
505 | bool ProcessLoop(MachineLoop *ML); |
506 | |
507 | bool RevertNonLoops(); |
508 | |
509 | void RevertWhile(MachineInstr *MI) const; |
510 | void RevertDo(MachineInstr *MI) const; |
511 | |
512 | bool RevertLoopDec(MachineInstr *MI) const; |
513 | |
514 | void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; |
515 | |
516 | void RevertLoopEndDec(MachineInstr *MI) const; |
517 | |
518 | void ConvertVPTBlocks(LowOverheadLoop &LoLoop); |
519 | |
520 | MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); |
521 | |
522 | void Expand(LowOverheadLoop &LoLoop); |
523 | |
524 | void IterationCountDCE(LowOverheadLoop &LoLoop); |
525 | }; |
526 | } |
527 | |
528 | char ARMLowOverheadLoops::ID = 0; |
529 | |
530 | INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, |
531 | false, false) |
532 | |
533 | static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA, |
534 | InstSet &ToRemove, InstSet &Ignore) { |
535 | |
536 | // Check that we can remove all of Killed without having to modify any IT |
537 | // blocks. |
538 | auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) { |
539 | // Collect the dead code and the MBBs in which they reside. |
540 | SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks; |
541 | for (auto *Dead : Killed) |
542 | BasicBlocks.insert(Ptr: Dead->getParent()); |
543 | |
544 | // Collect IT blocks in all affected basic blocks. |
545 | std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks; |
546 | for (auto *MBB : BasicBlocks) { |
547 | for (auto &IT : *MBB) { |
548 | if (IT.getOpcode() != ARM::t2IT) |
549 | continue; |
550 | RDA.getReachingLocalUses(MI: &IT, PhysReg: MCRegister::from(Val: ARM::ITSTATE), |
551 | Uses&: ITBlocks[&IT]); |
552 | } |
553 | } |
554 | |
555 | // If we're removing all of the instructions within an IT block, then |
556 | // also remove the IT instruction. |
557 | SmallPtrSet<MachineInstr *, 2> ModifiedITs; |
558 | SmallPtrSet<MachineInstr *, 2> RemoveITs; |
559 | for (auto *Dead : Killed) { |
560 | if (MachineOperand *MO = |
561 | Dead->findRegisterUseOperand(Reg: ARM::ITSTATE, /*TRI=*/nullptr)) { |
562 | MachineInstr *IT = RDA.getMIOperand(MI: Dead, MO&: *MO); |
563 | RemoveITs.insert(Ptr: IT); |
564 | auto &CurrentBlock = ITBlocks[IT]; |
565 | CurrentBlock.erase(Ptr: Dead); |
566 | if (CurrentBlock.empty()) |
567 | ModifiedITs.erase(Ptr: IT); |
568 | else |
569 | ModifiedITs.insert(Ptr: IT); |
570 | } |
571 | } |
572 | if (!ModifiedITs.empty()) |
573 | return false; |
574 | Killed.insert(I: RemoveITs.begin(), E: RemoveITs.end()); |
575 | return true; |
576 | }; |
577 | |
578 | SmallPtrSet<MachineInstr *, 2> Uses; |
579 | if (!RDA.isSafeToRemove(MI, ToRemove&: Uses, Ignore)) |
580 | return false; |
581 | |
582 | if (WontCorruptITs(Uses, RDA)) { |
583 | ToRemove.insert(I: Uses.begin(), E: Uses.end()); |
584 | LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI |
585 | << " - can also remove:\n" ; |
586 | for (auto *Use : Uses) |
587 | dbgs() << " - " << *Use); |
588 | |
589 | SmallPtrSet<MachineInstr*, 4> Killed; |
590 | RDA.collectKilledOperands(MI, Dead&: Killed); |
591 | if (WontCorruptITs(Killed, RDA)) { |
592 | ToRemove.insert(I: Killed.begin(), E: Killed.end()); |
593 | LLVM_DEBUG(for (auto *Dead : Killed) |
594 | dbgs() << " - " << *Dead); |
595 | } |
596 | return true; |
597 | } |
598 | return false; |
599 | } |
600 | |
601 | bool LowOverheadLoop::ValidateTailPredicate() { |
602 | if (!IsTailPredicationLegal()) { |
603 | LLVM_DEBUG(if (VCTPs.empty()) |
604 | dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n" ; |
605 | dbgs() << "ARM Loops: Tail-predication is not valid.\n" ); |
606 | return false; |
607 | } |
608 | |
609 | assert(!VCTPs.empty() && "VCTP instruction expected but is not set" ); |
610 | assert(ML.getBlocks().size() == 1 && |
611 | "Shouldn't be processing a loop with more than one block" ); |
612 | |
613 | if (DisableTailPredication) { |
614 | LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n" ); |
615 | return false; |
616 | } |
617 | |
618 | if (!VPTstate.isValid(RDA)) { |
619 | LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n" ); |
620 | return false; |
621 | } |
622 | |
623 | if (!ValidateLiveOuts()) { |
624 | LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n" ); |
625 | return false; |
626 | } |
627 | |
628 | // For tail predication, we need to provide the number of elements, instead |
629 | // of the iteration count, to the loop start instruction. The number of |
630 | // elements is provided to the vctp instruction, so we need to check that |
631 | // we can use this register at InsertPt. |
632 | MachineInstr *VCTP = VCTPs.back(); |
633 | if (Start->getOpcode() == ARM::t2DoLoopStartTP || |
634 | Start->getOpcode() == ARM::t2WhileLoopStartTP) { |
635 | TPNumElements = Start->getOperand(i: 2); |
636 | StartInsertPt = Start; |
637 | StartInsertBB = Start->getParent(); |
638 | } else { |
639 | TPNumElements = VCTP->getOperand(i: 1); |
640 | MCRegister NumElements = TPNumElements.getReg().asMCReg(); |
641 | |
642 | // If the register is defined within loop, then we can't perform TP. |
643 | // TODO: Check whether this is just a mov of a register that would be |
644 | // available. |
645 | if (RDA.hasLocalDefBefore(MI: VCTP, PhysReg: NumElements)) { |
646 | LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n" ); |
647 | return false; |
648 | } |
649 | |
650 | // The element count register maybe defined after InsertPt, in which case we |
651 | // need to try to move either InsertPt or the def so that the [w|d]lstp can |
652 | // use the value. |
653 | |
654 | if (StartInsertPt != StartInsertBB->end() && |
655 | !RDA.isReachingDefLiveOut(MI: &*StartInsertPt, PhysReg: NumElements)) { |
656 | if (auto *ElemDef = |
657 | RDA.getLocalLiveOutMIDef(MBB: StartInsertBB, PhysReg: NumElements)) { |
658 | if (RDA.isSafeToMoveForwards(From: ElemDef, To: &*StartInsertPt)) { |
659 | ElemDef->removeFromParent(); |
660 | StartInsertBB->insert(I: StartInsertPt, MI: ElemDef); |
661 | LLVM_DEBUG(dbgs() |
662 | << "ARM Loops: Moved element count def: " << *ElemDef); |
663 | } else if (RDA.isSafeToMoveBackwards(From: &*StartInsertPt, To: ElemDef)) { |
664 | StartInsertPt->removeFromParent(); |
665 | StartInsertBB->insertAfter(I: MachineBasicBlock::iterator(ElemDef), |
666 | MI: &*StartInsertPt); |
667 | LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); |
668 | } else { |
669 | // If we fail to move an instruction and the element count is provided |
670 | // by a mov, use the mov operand if it will have the same value at the |
671 | // insertion point |
672 | MachineOperand Operand = ElemDef->getOperand(i: 1); |
673 | if (isMovRegOpcode(Opc: ElemDef->getOpcode()) && |
674 | RDA.getUniqueReachingMIDef(MI: ElemDef, PhysReg: Operand.getReg().asMCReg()) == |
675 | RDA.getUniqueReachingMIDef(MI: &*StartInsertPt, |
676 | PhysReg: Operand.getReg().asMCReg())) { |
677 | TPNumElements = Operand; |
678 | NumElements = TPNumElements.getReg(); |
679 | } else { |
680 | LLVM_DEBUG(dbgs() |
681 | << "ARM Loops: Unable to move element count to loop " |
682 | << "start instruction.\n" ); |
683 | return false; |
684 | } |
685 | } |
686 | } |
687 | } |
688 | |
689 | // Especially in the case of while loops, InsertBB may not be the |
690 | // preheader, so we need to check that the register isn't redefined |
691 | // before entering the loop. |
692 | auto CannotProvideElements = [this](MachineBasicBlock *MBB, |
693 | MCRegister NumElements) { |
694 | if (MBB->empty()) |
695 | return false; |
696 | // NumElements is redefined in this block. |
697 | if (RDA.hasLocalDefBefore(MI: &MBB->back(), PhysReg: NumElements)) |
698 | return true; |
699 | |
700 | // Don't continue searching up through multiple predecessors. |
701 | if (MBB->pred_size() > 1) |
702 | return true; |
703 | |
704 | return false; |
705 | }; |
706 | |
707 | // Search backwards for a def, until we get to InsertBB. |
708 | MachineBasicBlock *MBB = Preheader; |
709 | while (MBB && MBB != StartInsertBB) { |
710 | if (CannotProvideElements(MBB, NumElements)) { |
711 | LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n" ); |
712 | return false; |
713 | } |
714 | MBB = *MBB->pred_begin(); |
715 | } |
716 | } |
717 | |
718 | // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect |
719 | // world the [w|d]lstp instruction would be last instruction in the preheader |
720 | // and so it would only affect instructions within the loop body. But due to |
721 | // scheduling, and/or the logic in this pass (above), the insertion point can |
722 | // be moved earlier. So if the Loop Start isn't the last instruction in the |
723 | // preheader, and if the initial element count is smaller than the vector |
724 | // width, the Loop Start instruction will immediately generate one or more |
725 | // false lane mask which can, incorrectly, affect the proceeding MVE |
726 | // instructions in the preheader. |
727 | if (std::any_of(first: StartInsertPt, last: StartInsertBB->end(), pred: shouldInspect)) { |
728 | LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n" ); |
729 | return false; |
730 | } |
731 | |
732 | // For any DoubleWidthResultInstrs we found whilst scanning instructions, they |
733 | // need to compute an output size that is smaller than the VCTP mask operates |
734 | // on. The VecSize of the DoubleWidthResult is the larger vector size - the |
735 | // size it extends into, so any VCTP VecSize <= is valid. |
736 | unsigned VCTPVecSize = getVecSize(MI: *VCTP); |
737 | for (MachineInstr *MI : DoubleWidthResultInstrs) { |
738 | unsigned InstrVecSize = getVecSize(MI: *MI); |
739 | if (InstrVecSize > VCTPVecSize) { |
740 | LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP " |
741 | << "VecSize:\n" << *MI); |
742 | return false; |
743 | } |
744 | } |
745 | |
746 | // Check that the value change of the element count is what we expect and |
747 | // that the predication will be equivalent. For this we need: |
748 | // NumElements = NumElements - VectorWidth. The sub will be a sub immediate |
749 | // and we can also allow register copies within the chain too. |
750 | auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) { |
751 | return -getAddSubImmediate(MI&: *MI) == ExpectedVecWidth; |
752 | }; |
753 | |
754 | MachineBasicBlock *MBB = VCTP->getParent(); |
755 | // Remove modifications to the element count since they have no purpose in a |
756 | // tail predicated loop. Explicitly refer to the vctp operand no matter which |
757 | // register NumElements has been assigned to, since that is what the |
758 | // modifications will be using |
759 | if (auto *Def = RDA.getUniqueReachingMIDef( |
760 | MI: &MBB->back(), PhysReg: VCTP->getOperand(i: 1).getReg().asMCReg())) { |
761 | SmallPtrSet<MachineInstr*, 2> ElementChain; |
762 | SmallPtrSet<MachineInstr*, 2> Ignore; |
763 | unsigned ExpectedVectorWidth = getTailPredVectorWidth(Opcode: VCTP->getOpcode()); |
764 | |
765 | Ignore.insert(I: VCTPs.begin(), E: VCTPs.end()); |
766 | |
767 | if (TryRemove(MI: Def, RDA, ToRemove&: ElementChain, Ignore)) { |
768 | bool FoundSub = false; |
769 | |
770 | for (auto *MI : ElementChain) { |
771 | if (isMovRegOpcode(Opc: MI->getOpcode())) |
772 | continue; |
773 | |
774 | if (isSubImmOpcode(Opc: MI->getOpcode())) { |
775 | if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) { |
776 | LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" |
777 | " count: " << *MI); |
778 | return false; |
779 | } |
780 | FoundSub = true; |
781 | } else { |
782 | LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" |
783 | " count: " << *MI); |
784 | return false; |
785 | } |
786 | } |
787 | ToRemove.insert(I: ElementChain.begin(), E: ElementChain.end()); |
788 | } |
789 | } |
790 | |
791 | // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we |
792 | // can also remove any extra instructions in the preheader, which often |
793 | // includes a now unused MOV. |
794 | if ((Start->getOpcode() == ARM::t2DoLoopStartTP || |
795 | Start->getOpcode() == ARM::t2WhileLoopStartTP) && |
796 | Preheader && !Preheader->empty() && |
797 | !RDA.hasLocalDefBefore(MI: VCTP, PhysReg: VCTP->getOperand(i: 1).getReg())) { |
798 | if (auto *Def = RDA.getUniqueReachingMIDef( |
799 | MI: &Preheader->back(), PhysReg: VCTP->getOperand(i: 1).getReg().asMCReg())) { |
800 | SmallPtrSet<MachineInstr*, 2> Ignore; |
801 | Ignore.insert(I: VCTPs.begin(), E: VCTPs.end()); |
802 | TryRemove(MI: Def, RDA, ToRemove, Ignore); |
803 | } |
804 | } |
805 | |
806 | return true; |
807 | } |
808 | |
809 | static bool isRegInClass(const MachineOperand &MO, |
810 | const TargetRegisterClass *Class) { |
811 | return MO.isReg() && MO.getReg() && Class->contains(Reg: MO.getReg()); |
812 | } |
813 | |
814 | // MVE 'narrowing' operate on half a lane, reading from half and writing |
815 | // to half, which are referred to has the top and bottom half. The other |
816 | // half retains its previous value. |
817 | static bool retainsPreviousHalfElement(const MachineInstr &MI) { |
818 | const MCInstrDesc &MCID = MI.getDesc(); |
819 | uint64_t Flags = MCID.TSFlags; |
820 | return (Flags & ARMII::RetainsPreviousHalfElement) != 0; |
821 | } |
822 | |
823 | // Some MVE instructions read from the top/bottom halves of their operand(s) |
824 | // and generate a vector result with result elements that are double the |
825 | // width of the input. |
826 | static bool producesDoubleWidthResult(const MachineInstr &MI) { |
827 | const MCInstrDesc &MCID = MI.getDesc(); |
828 | uint64_t Flags = MCID.TSFlags; |
829 | return (Flags & ARMII::DoubleWidthResult) != 0; |
830 | } |
831 | |
832 | // Can this instruction generate a non-zero result when given only zeroed |
833 | // operands? This allows us to know that, given operands with false bytes |
834 | // zeroed by masked loads, that the result will also contain zeros in those |
835 | // bytes. |
836 | static bool canGenerateNonZeros(const MachineInstr &MI) { |
837 | |
838 | // Check for instructions which can write into a larger element size, |
839 | // possibly writing into a previous zero'd lane. |
840 | if (producesDoubleWidthResult(MI)) |
841 | return true; |
842 | |
843 | switch (MI.getOpcode()) { |
844 | default: |
845 | break; |
846 | // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow |
847 | // fp16 -> fp32 vector conversions. |
848 | // Instructions that perform a NOT will generate 1s from 0s. |
849 | case ARM::MVE_VMVN: |
850 | case ARM::MVE_VORN: |
851 | // Count leading zeros will do just that! |
852 | case ARM::MVE_VCLZs8: |
853 | case ARM::MVE_VCLZs16: |
854 | case ARM::MVE_VCLZs32: |
855 | return true; |
856 | } |
857 | return false; |
858 | } |
859 | |
860 | // Look at its register uses to see if it only can only receive zeros |
861 | // into its false lanes which would then produce zeros. Also check that |
862 | // the output register is also defined by an FalseLanesZero instruction |
863 | // so that if tail-predication happens, the lanes that aren't updated will |
864 | // still be zeros. |
865 | static bool producesFalseLanesZero(MachineInstr &MI, |
866 | const TargetRegisterClass *QPRs, |
867 | const ReachingDefAnalysis &RDA, |
868 | InstSet &FalseLanesZero) { |
869 | if (canGenerateNonZeros(MI)) |
870 | return false; |
871 | |
872 | bool isPredicated = isVectorPredicated(MI: &MI); |
873 | // Predicated loads will write zeros to the falsely predicated bytes of the |
874 | // destination register. |
875 | if (MI.mayLoad()) |
876 | return isPredicated; |
877 | |
878 | auto IsZeroInit = [](MachineInstr *Def) { |
879 | return !isVectorPredicated(MI: Def) && |
880 | Def->getOpcode() == ARM::MVE_VMOVimmi32 && |
881 | Def->getOperand(i: 1).getImm() == 0; |
882 | }; |
883 | |
884 | bool AllowScalars = isHorizontalReduction(MI); |
885 | for (auto &MO : MI.operands()) { |
886 | if (!MO.isReg() || !MO.getReg()) |
887 | continue; |
888 | if (!isRegInClass(MO, Class: QPRs) && AllowScalars) |
889 | continue; |
890 | // Skip the lr predicate reg |
891 | int PIdx = llvm::findFirstVPTPredOperandIdx(MI); |
892 | if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2) |
893 | continue; |
894 | |
895 | // Check that this instruction will produce zeros in its false lanes: |
896 | // - If it only consumes false lanes zero or constant 0 (vmov #0) |
897 | // - If it's predicated, it only matters that it's def register already has |
898 | // false lane zeros, so we can ignore the uses. |
899 | SmallPtrSet<MachineInstr *, 2> Defs; |
900 | RDA.getGlobalReachingDefs(MI: &MI, PhysReg: MO.getReg(), Defs); |
901 | if (Defs.empty()) |
902 | return false; |
903 | for (auto *Def : Defs) { |
904 | if (Def == &MI || FalseLanesZero.count(Ptr: Def) || IsZeroInit(Def)) |
905 | continue; |
906 | if (MO.isUse() && isPredicated) |
907 | continue; |
908 | return false; |
909 | } |
910 | } |
911 | LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI); |
912 | return true; |
913 | } |
914 | |
915 | bool LowOverheadLoop::ValidateLiveOuts() { |
916 | // We want to find out if the tail-predicated version of this loop will |
917 | // produce the same values as the loop in its original form. For this to |
918 | // be true, the newly inserted implicit predication must not change the |
919 | // the (observable) results. |
920 | // We're doing this because many instructions in the loop will not be |
921 | // predicated and so the conversion from VPT predication to tail-predication |
922 | // can result in different values being produced; due to the tail-predication |
923 | // preventing many instructions from updating their falsely predicated |
924 | // lanes. This analysis assumes that all the instructions perform lane-wise |
925 | // operations and don't perform any exchanges. |
926 | // A masked load, whether through VPT or tail predication, will write zeros |
927 | // to any of the falsely predicated bytes. So, from the loads, we know that |
928 | // the false lanes are zeroed and here we're trying to track that those false |
929 | // lanes remain zero, or where they change, the differences are masked away |
930 | // by their user(s). |
931 | // All MVE stores have to be predicated, so we know that any predicate load |
932 | // operands, or stored results are equivalent already. Other explicitly |
933 | // predicated instructions will perform the same operation in the original |
934 | // loop and the tail-predicated form too. Because of this, we can insert |
935 | // loads, stores and other predicated instructions into our Predicated |
936 | // set and build from there. |
937 | const TargetRegisterClass *QPRs = TRI.getRegClass(i: ARM::MQPRRegClassID); |
938 | SetVector<MachineInstr *> FalseLanesUnknown; |
939 | SmallPtrSet<MachineInstr *, 4> FalseLanesZero; |
940 | SmallPtrSet<MachineInstr *, 4> Predicated; |
941 | MachineBasicBlock * = ML.getHeader(); |
942 | |
943 | LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n" ); |
944 | |
945 | for (auto &MI : *Header) { |
946 | if (!shouldInspect(MI)) |
947 | continue; |
948 | |
949 | if (isVCTP(MI: &MI) || isVPTOpcode(Opc: MI.getOpcode())) |
950 | continue; |
951 | |
952 | bool isPredicated = isVectorPredicated(MI: &MI); |
953 | bool retainsOrReduces = |
954 | retainsPreviousHalfElement(MI) || isHorizontalReduction(MI); |
955 | |
956 | if (isPredicated) |
957 | Predicated.insert(Ptr: &MI); |
958 | if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) |
959 | FalseLanesZero.insert(Ptr: &MI); |
960 | else if (MI.getNumDefs() == 0) |
961 | continue; |
962 | else if (!isPredicated && retainsOrReduces) { |
963 | LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI); |
964 | return false; |
965 | } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy) |
966 | FalseLanesUnknown.insert(X: &MI); |
967 | } |
968 | |
969 | LLVM_DEBUG({ |
970 | dbgs() << " Predicated:\n" ; |
971 | for (auto *I : Predicated) |
972 | dbgs() << " " << *I; |
973 | dbgs() << " FalseLanesZero:\n" ; |
974 | for (auto *I : FalseLanesZero) |
975 | dbgs() << " " << *I; |
976 | dbgs() << " FalseLanesUnknown:\n" ; |
977 | for (auto *I : FalseLanesUnknown) |
978 | dbgs() << " " << *I; |
979 | }); |
980 | |
981 | auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO, |
982 | SmallPtrSetImpl<MachineInstr *> &Predicated) { |
983 | SmallPtrSet<MachineInstr *, 2> Uses; |
984 | RDA.getGlobalUses(MI, PhysReg: MO.getReg().asMCReg(), Uses); |
985 | for (auto *Use : Uses) { |
986 | if (Use != MI && !Predicated.count(Ptr: Use)) |
987 | return false; |
988 | } |
989 | return true; |
990 | }; |
991 | |
992 | // Visit the unknowns in reverse so that we can start at the values being |
993 | // stored and then we can work towards the leaves, hopefully adding more |
994 | // instructions to Predicated. Successfully terminating the loop means that |
995 | // all the unknown values have to found to be masked by predicated user(s). |
996 | // For any unpredicated values, we store them in NonPredicated so that we |
997 | // can later check whether these form a reduction. |
998 | SmallPtrSet<MachineInstr*, 2> NonPredicated; |
999 | for (auto *MI : reverse(C&: FalseLanesUnknown)) { |
1000 | for (auto &MO : MI->operands()) { |
1001 | if (!isRegInClass(MO, Class: QPRs) || !MO.isDef()) |
1002 | continue; |
1003 | if (!HasPredicatedUsers(MI, MO, Predicated)) { |
1004 | LLVM_DEBUG(dbgs() << " Found an unknown def of : " |
1005 | << TRI.getRegAsmName(MO.getReg()) << " at " << *MI); |
1006 | NonPredicated.insert(Ptr: MI); |
1007 | break; |
1008 | } |
1009 | } |
1010 | // Any unknown false lanes have been masked away by the user(s). |
1011 | if (!NonPredicated.contains(Ptr: MI)) |
1012 | Predicated.insert(Ptr: MI); |
1013 | } |
1014 | |
1015 | SmallPtrSet<MachineInstr *, 2> LiveOutMIs; |
1016 | SmallVector<MachineBasicBlock *, 2> ExitBlocks; |
1017 | ML.getExitBlocks(ExitBlocks); |
1018 | assert(ML.getNumBlocks() == 1 && "Expected single block loop!" ); |
1019 | assert(ExitBlocks.size() == 1 && "Expected a single exit block" ); |
1020 | MachineBasicBlock *ExitBB = ExitBlocks.front(); |
1021 | for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) { |
1022 | // TODO: Instead of blocking predication, we could move the vctp to the exit |
1023 | // block and calculate it's operand there in or the preheader. |
1024 | if (RegMask.PhysReg == ARM::VPR) { |
1025 | LLVM_DEBUG(dbgs() << " VPR is live in to the exit block." ); |
1026 | return false; |
1027 | } |
1028 | // Check Q-regs that are live in the exit blocks. We don't collect scalars |
1029 | // because they won't be affected by lane predication. |
1030 | if (QPRs->contains(Reg: RegMask.PhysReg)) |
1031 | if (auto *MI = RDA.getLocalLiveOutMIDef(MBB: Header, PhysReg: RegMask.PhysReg)) |
1032 | LiveOutMIs.insert(Ptr: MI); |
1033 | } |
1034 | |
1035 | // We've already validated that any VPT predication within the loop will be |
1036 | // equivalent when we perform the predication transformation; so we know that |
1037 | // any VPT predicated instruction is predicated upon VCTP. Any live-out |
1038 | // instruction needs to be predicated, so check this here. The instructions |
1039 | // in NonPredicated have been found to be a reduction that we can ensure its |
1040 | // legality. Any MQPRCopy found will need to validate its input as if it was |
1041 | // live out. |
1042 | SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end()); |
1043 | while (!Worklist.empty()) { |
1044 | MachineInstr *MI = Worklist.pop_back_val(); |
1045 | if (MI->getOpcode() == ARM::MQPRCopy) { |
1046 | VMOVCopies.insert(Ptr: MI); |
1047 | MachineInstr *CopySrc = |
1048 | RDA.getUniqueReachingMIDef(MI, PhysReg: MI->getOperand(i: 1).getReg()); |
1049 | if (CopySrc) |
1050 | Worklist.push_back(Elt: CopySrc); |
1051 | } else if (NonPredicated.count(Ptr: MI) && FalseLanesUnknown.contains(key: MI)) { |
1052 | LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI); |
1053 | VMOVCopies.clear(); |
1054 | return false; |
1055 | } |
1056 | } |
1057 | |
1058 | return true; |
1059 | } |
1060 | |
1061 | void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) { |
1062 | if (Revert) |
1063 | return; |
1064 | |
1065 | // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP] |
1066 | // can only jump back. |
1067 | auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End, |
1068 | ARMBasicBlockUtils *BBUtils, MachineLoop &ML) { |
1069 | MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd |
1070 | ? End->getOperand(i: 1).getMBB() |
1071 | : End->getOperand(i: 2).getMBB(); |
1072 | // TODO Maybe there's cases where the target doesn't have to be the header, |
1073 | // but for now be safe and revert. |
1074 | if (TgtBB != ML.getHeader()) { |
1075 | LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n" ); |
1076 | return false; |
1077 | } |
1078 | |
1079 | // The WLS and LE instructions have 12-bits for the label offset. WLS |
1080 | // requires a positive offset, while LE uses negative. |
1081 | if (BBUtils->getOffsetOf(MI: End) < BBUtils->getOffsetOf(MBB: ML.getHeader()) || |
1082 | !BBUtils->isBBInRange(MI: End, DestBB: ML.getHeader(), MaxDisp: 4094)) { |
1083 | LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n" ); |
1084 | return false; |
1085 | } |
1086 | |
1087 | if (isWhileLoopStart(MI: *Start)) { |
1088 | MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(MI: *Start); |
1089 | if (BBUtils->getOffsetOf(MI: Start) > BBUtils->getOffsetOf(MBB: TargetBB) || |
1090 | !BBUtils->isBBInRange(MI: Start, DestBB: TargetBB, MaxDisp: 4094)) { |
1091 | LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n" ); |
1092 | return false; |
1093 | } |
1094 | } |
1095 | return true; |
1096 | }; |
1097 | |
1098 | StartInsertPt = MachineBasicBlock::iterator(Start); |
1099 | StartInsertBB = Start->getParent(); |
1100 | LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at " |
1101 | << *StartInsertPt); |
1102 | |
1103 | Revert = !ValidateRanges(Start, End, BBUtils, ML); |
1104 | CannotTailPredicate = !ValidateTailPredicate(); |
1105 | } |
1106 | |
1107 | bool LowOverheadLoop::AddVCTP(MachineInstr *MI) { |
1108 | LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI); |
1109 | if (VCTPs.empty()) { |
1110 | VCTPs.push_back(Elt: MI); |
1111 | return true; |
1112 | } |
1113 | |
1114 | // If we find another VCTP, check whether it uses the same value as the main VCTP. |
1115 | // If it does, store it in the VCTPs set, else refuse it. |
1116 | MachineInstr *Prev = VCTPs.back(); |
1117 | if (!Prev->getOperand(i: 1).isIdenticalTo(Other: MI->getOperand(i: 1)) || |
1118 | !RDA.hasSameReachingDef(A: Prev, B: MI, PhysReg: MI->getOperand(i: 1).getReg().asMCReg())) { |
1119 | LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching " |
1120 | "definition from the main VCTP" ); |
1121 | return false; |
1122 | } |
1123 | VCTPs.push_back(Elt: MI); |
1124 | return true; |
1125 | } |
1126 | |
1127 | static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) { |
1128 | |
1129 | auto GetFrameIndex = [](MachineMemOperand *Operand) { |
1130 | const PseudoSourceValue *PseudoValue = Operand->getPseudoValue(); |
1131 | if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) { |
1132 | if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(Val: PseudoValue)) { |
1133 | return FS->getFrameIndex(); |
1134 | } |
1135 | } |
1136 | return -1; |
1137 | }; |
1138 | |
1139 | auto IsStackOp = [GetFrameIndex](MachineInstr *I) { |
1140 | switch (I->getOpcode()) { |
1141 | case ARM::MVE_VSTRWU32: |
1142 | case ARM::MVE_VLDRWU32: { |
1143 | return I->getOperand(i: 1).getReg() == ARM::SP && |
1144 | I->memoperands().size() == 1 && |
1145 | GetFrameIndex(I->memoperands().front()) >= 0; |
1146 | } |
1147 | default: |
1148 | return false; |
1149 | } |
1150 | }; |
1151 | |
1152 | // An unpredicated vector register spill is allowed if all of the uses of the |
1153 | // stack slot are within the loop |
1154 | if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI)) |
1155 | return false; |
1156 | |
1157 | // Search all blocks after the loop for accesses to the same stack slot. |
1158 | // ReachingDefAnalysis doesn't work for sp as it relies on registers being |
1159 | // live-out (which sp never is) to know what blocks to look in |
1160 | if (MI->memoperands().size() == 0) |
1161 | return false; |
1162 | int FI = GetFrameIndex(MI->memoperands().front()); |
1163 | |
1164 | auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo(); |
1165 | if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(ObjectIdx: FI)) |
1166 | return false; |
1167 | |
1168 | SmallVector<MachineBasicBlock *> Frontier; |
1169 | ML->getExitBlocks(ExitBlocks&: Frontier); |
1170 | SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()}; |
1171 | unsigned Idx = 0; |
1172 | while (Idx < Frontier.size()) { |
1173 | MachineBasicBlock *BB = Frontier[Idx]; |
1174 | bool LookAtSuccessors = true; |
1175 | for (auto &I : *BB) { |
1176 | if (!IsStackOp(&I) || I.memoperands().size() == 0) |
1177 | continue; |
1178 | if (GetFrameIndex(I.memoperands().front()) != FI) |
1179 | continue; |
1180 | // If this block has a store to the stack slot before any loads then we |
1181 | // can ignore the block |
1182 | if (I.getOpcode() == ARM::MVE_VSTRWU32) { |
1183 | LookAtSuccessors = false; |
1184 | break; |
1185 | } |
1186 | // If the store and the load are using the same stack slot then the |
1187 | // store isn't valid for tail predication |
1188 | if (I.getOpcode() == ARM::MVE_VLDRWU32) |
1189 | return false; |
1190 | } |
1191 | |
1192 | if (LookAtSuccessors) { |
1193 | for (auto *Succ : BB->successors()) { |
1194 | if (!Visited.contains(Ptr: Succ) && !is_contained(Range&: Frontier, Element: Succ)) |
1195 | Frontier.push_back(Elt: Succ); |
1196 | } |
1197 | } |
1198 | Visited.insert(Ptr: BB); |
1199 | Idx++; |
1200 | } |
1201 | |
1202 | return true; |
1203 | } |
1204 | |
1205 | bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) { |
1206 | if (CannotTailPredicate) |
1207 | return false; |
1208 | |
1209 | if (!shouldInspect(MI&: *MI)) |
1210 | return true; |
1211 | |
1212 | if (MI->getOpcode() == ARM::MVE_VPSEL || |
1213 | MI->getOpcode() == ARM::MVE_VPNOT) { |
1214 | // TODO: Allow VPSEL and VPNOT, we currently cannot because: |
1215 | // 1) It will use the VPR as a predicate operand, but doesn't have to be |
1216 | // instead a VPT block, which means we can assert while building up |
1217 | // the VPT block because we don't find another VPT or VPST to being a new |
1218 | // one. |
1219 | // 2) VPSEL still requires a VPR operand even after tail predicating, |
1220 | // which means we can't remove it unless there is another |
1221 | // instruction, such as vcmp, that can provide the VPR def. |
1222 | return false; |
1223 | } |
1224 | |
1225 | // Record all VCTPs and check that they're equivalent to one another. |
1226 | if (isVCTP(MI) && !AddVCTP(MI)) |
1227 | return false; |
1228 | |
1229 | // Inspect uses first so that any instructions that alter the VPR don't |
1230 | // alter the predicate upon themselves. |
1231 | const MCInstrDesc &MCID = MI->getDesc(); |
1232 | bool IsUse = false; |
1233 | unsigned LastOpIdx = MI->getNumOperands() - 1; |
1234 | for (const auto &Op : enumerate(First: reverse(C: MCID.operands()))) { |
1235 | const MachineOperand &MO = MI->getOperand(i: LastOpIdx - Op.index()); |
1236 | if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR) |
1237 | continue; |
1238 | |
1239 | if (ARM::isVpred(op: Op.value().OperandType)) { |
1240 | VPTstate.addInst(MI); |
1241 | IsUse = true; |
1242 | } else if (MI->getOpcode() != ARM::MVE_VPST) { |
1243 | LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI); |
1244 | return false; |
1245 | } |
1246 | } |
1247 | |
1248 | // If we find an instruction that has been marked as not valid for tail |
1249 | // predication, only allow the instruction if it's contained within a valid |
1250 | // VPT block. |
1251 | bool RequiresExplicitPredication = |
1252 | (MCID.TSFlags & ARMII::ValidForTailPredication) == 0; |
1253 | if (isDomainMVE(MI) && RequiresExplicitPredication) { |
1254 | if (MI->getOpcode() == ARM::MQPRCopy) |
1255 | return true; |
1256 | if (!IsUse && producesDoubleWidthResult(MI: *MI)) { |
1257 | DoubleWidthResultInstrs.insert(Ptr: MI); |
1258 | return true; |
1259 | } |
1260 | |
1261 | LLVM_DEBUG(if (!IsUse) dbgs() |
1262 | << "ARM Loops: Can't tail predicate: " << *MI); |
1263 | return IsUse; |
1264 | } |
1265 | |
1266 | // If the instruction is already explicitly predicated, then the conversion |
1267 | // will be fine, but ensure that all store operations are predicated. |
1268 | if (MI->mayStore() && !ValidateMVEStore(MI, ML: &ML)) |
1269 | return IsUse; |
1270 | |
1271 | // If this instruction defines the VPR, update the predicate for the |
1272 | // proceeding instructions. |
1273 | if (isVectorPredicate(MI)) { |
1274 | // Clear the existing predicate when we're not in VPT Active state, |
1275 | // otherwise we add to it. |
1276 | if (!isVectorPredicated(MI)) |
1277 | VPTstate.resetPredicate(MI); |
1278 | else |
1279 | VPTstate.addPredicate(MI); |
1280 | } |
1281 | |
1282 | // Finally once the predicate has been modified, we can start a new VPT |
1283 | // block if necessary. |
1284 | if (isVPTOpcode(Opc: MI->getOpcode())) |
1285 | VPTstate.CreateVPTBlock(MI); |
1286 | |
1287 | return true; |
1288 | } |
1289 | |
1290 | bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { |
1291 | const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>(); |
1292 | if (!ST.hasLOB()) |
1293 | return false; |
1294 | |
1295 | MF = &mf; |
1296 | LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n" ); |
1297 | |
1298 | MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); |
1299 | RDA = &getAnalysis<ReachingDefAnalysis>(); |
1300 | MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); |
1301 | MRI = &MF->getRegInfo(); |
1302 | TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); |
1303 | TRI = ST.getRegisterInfo(); |
1304 | BBUtils = std::make_unique<ARMBasicBlockUtils>(args&: *MF); |
1305 | BBUtils->computeAllBlockSizes(); |
1306 | BBUtils->adjustBBOffsetsAfter(MBB: &MF->front()); |
1307 | |
1308 | bool Changed = false; |
1309 | for (auto *ML : *MLI) { |
1310 | if (ML->isOutermost()) |
1311 | Changed |= ProcessLoop(ML); |
1312 | } |
1313 | Changed |= RevertNonLoops(); |
1314 | return Changed; |
1315 | } |
1316 | |
1317 | bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { |
1318 | bool Changed = false; |
1319 | |
1320 | // Process inner loops first. |
1321 | for (MachineLoop *L : *ML) |
1322 | Changed |= ProcessLoop(ML: L); |
1323 | |
1324 | LLVM_DEBUG({ |
1325 | dbgs() << "ARM Loops: Processing loop containing:\n" ; |
1326 | if (auto *Preheader = ML->getLoopPreheader()) |
1327 | dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n" ; |
1328 | else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true)) |
1329 | dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n" ; |
1330 | for (auto *MBB : ML->getBlocks()) |
1331 | dbgs() << " - Block: " << printMBBReference(*MBB) << "\n" ; |
1332 | }); |
1333 | |
1334 | // Search the given block for a loop start instruction. If one isn't found, |
1335 | // and there's only one predecessor block, search that one too. |
1336 | std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = |
1337 | [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { |
1338 | for (auto &MI : *MBB) { |
1339 | if (isLoopStart(MI)) |
1340 | return &MI; |
1341 | } |
1342 | if (MBB->pred_size() == 1) |
1343 | return SearchForStart(*MBB->pred_begin()); |
1344 | return nullptr; |
1345 | }; |
1346 | |
1347 | LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII); |
1348 | // Search the preheader for the start intrinsic. |
1349 | // FIXME: I don't see why we shouldn't be supporting multiple predecessors |
1350 | // with potentially multiple set.loop.iterations, so we need to enable this. |
1351 | if (LoLoop.Preheader) |
1352 | LoLoop.Start = SearchForStart(LoLoop.Preheader); |
1353 | else |
1354 | return Changed; |
1355 | |
1356 | // Find the low-overhead loop components and decide whether or not to fall |
1357 | // back to a normal loop. Also look for a vctp instructions and decide |
1358 | // whether we can convert that predicate using tail predication. |
1359 | for (auto *MBB : reverse(C: ML->getBlocks())) { |
1360 | for (auto &MI : *MBB) { |
1361 | if (MI.isDebugValue()) |
1362 | continue; |
1363 | else if (MI.getOpcode() == ARM::t2LoopDec) |
1364 | LoLoop.Dec = &MI; |
1365 | else if (MI.getOpcode() == ARM::t2LoopEnd) |
1366 | LoLoop.End = &MI; |
1367 | else if (MI.getOpcode() == ARM::t2LoopEndDec) |
1368 | LoLoop.End = LoLoop.Dec = &MI; |
1369 | else if (isLoopStart(MI)) |
1370 | LoLoop.Start = &MI; |
1371 | else if (MI.getDesc().isCall()) { |
1372 | // TODO: Though the call will require LE to execute again, does this |
1373 | // mean we should revert? Always executing LE hopefully should be |
1374 | // faster than performing a sub,cmp,br or even subs,br. |
1375 | LoLoop.Revert = true; |
1376 | LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n" ); |
1377 | } else { |
1378 | // Record VPR defs and build up their corresponding vpt blocks. |
1379 | // Check we know how to tail predicate any mve instructions. |
1380 | LoLoop.AnalyseMVEInst(MI: &MI); |
1381 | } |
1382 | } |
1383 | } |
1384 | |
1385 | LLVM_DEBUG(LoLoop.dump()); |
1386 | if (!LoLoop.FoundAllComponents()) { |
1387 | LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n" ); |
1388 | return Changed; |
1389 | } |
1390 | |
1391 | assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart && |
1392 | "Expected t2WhileLoopStart to be removed before regalloc!" ); |
1393 | |
1394 | // Check that the only instruction using LoopDec is LoopEnd. This can only |
1395 | // happen when the Dec and End are separate, not a single t2LoopEndDec. |
1396 | // TODO: Check for copy chains that really have no effect. |
1397 | if (LoLoop.Dec != LoLoop.End) { |
1398 | SmallPtrSet<MachineInstr *, 2> Uses; |
1399 | RDA->getReachingLocalUses(MI: LoLoop.Dec, PhysReg: MCRegister::from(Val: ARM::LR), Uses); |
1400 | if (Uses.size() > 1 || !Uses.count(Ptr: LoLoop.End)) { |
1401 | LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n" ); |
1402 | LoLoop.Revert = true; |
1403 | } |
1404 | } |
1405 | LoLoop.Validate(BBUtils: BBUtils.get()); |
1406 | Expand(LoLoop); |
1407 | return true; |
1408 | } |
1409 | |
1410 | // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a |
1411 | // beq that branches to the exit branch. |
1412 | // TODO: We could also try to generate a cbz if the value in LR is also in |
1413 | // another low register. |
1414 | void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { |
1415 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); |
1416 | MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(MI: *MI); |
1417 | unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, MaxDisp: 254) ? |
1418 | ARM::tBcc : ARM::t2Bcc; |
1419 | |
1420 | RevertWhileLoopStartLR(MI, TII, BrOpc); |
1421 | } |
1422 | |
1423 | void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const { |
1424 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI); |
1425 | RevertDoLoopStart(MI, TII); |
1426 | } |
1427 | |
1428 | bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { |
1429 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); |
1430 | MachineBasicBlock *MBB = MI->getParent(); |
1431 | SmallPtrSet<MachineInstr*, 1> Ignore; |
1432 | for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) { |
1433 | if (I->getOpcode() == ARM::t2LoopEnd) { |
1434 | Ignore.insert(Ptr: &*I); |
1435 | break; |
1436 | } |
1437 | } |
1438 | |
1439 | // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS. |
1440 | bool SetFlags = |
1441 | RDA->isSafeToDefRegAt(MI, PhysReg: MCRegister::from(Val: ARM::CPSR), Ignore); |
1442 | |
1443 | llvm::RevertLoopDec(MI, TII, SetFlags); |
1444 | return SetFlags; |
1445 | } |
1446 | |
1447 | // Generate a subs, or sub and cmp, and a branch instead of an LE. |
1448 | void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { |
1449 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); |
1450 | |
1451 | MachineBasicBlock *DestBB = MI->getOperand(i: 1).getMBB(); |
1452 | unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, MaxDisp: 254) ? |
1453 | ARM::tBcc : ARM::t2Bcc; |
1454 | |
1455 | llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp); |
1456 | } |
1457 | |
1458 | // Generate a subs, or sub and cmp, and a branch instead of an LE. |
1459 | void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const { |
1460 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI); |
1461 | assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!" ); |
1462 | MachineBasicBlock *MBB = MI->getParent(); |
1463 | |
1464 | MachineInstrBuilder MIB = |
1465 | BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::t2SUBri)); |
1466 | MIB.addDef(RegNo: ARM::LR); |
1467 | MIB.add(MO: MI->getOperand(i: 1)); |
1468 | MIB.addImm(Val: 1); |
1469 | MIB.addImm(Val: ARMCC::AL); |
1470 | MIB.addReg(RegNo: ARM::NoRegister); |
1471 | MIB.addReg(RegNo: ARM::CPSR); |
1472 | MIB->getOperand(i: 5).setIsDef(true); |
1473 | |
1474 | MachineBasicBlock *DestBB = MI->getOperand(i: 2).getMBB(); |
1475 | unsigned BrOpc = |
1476 | BBUtils->isBBInRange(MI, DestBB, MaxDisp: 254) ? ARM::tBcc : ARM::t2Bcc; |
1477 | |
1478 | // Create bne |
1479 | MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: BrOpc)); |
1480 | MIB.add(MO: MI->getOperand(i: 2)); // branch target |
1481 | MIB.addImm(Val: ARMCC::NE); // condition code |
1482 | MIB.addReg(RegNo: ARM::CPSR); |
1483 | |
1484 | MI->eraseFromParent(); |
1485 | } |
1486 | |
1487 | // Perform dead code elimation on the loop iteration count setup expression. |
1488 | // If we are tail-predicating, the number of elements to be processed is the |
1489 | // operand of the VCTP instruction in the vector body, see getCount(), which is |
1490 | // register $r3 in this example: |
1491 | // |
1492 | // $lr = big-itercount-expression |
1493 | // .. |
1494 | // $lr = t2DoLoopStart renamable $lr |
1495 | // vector.body: |
1496 | // .. |
1497 | // $vpr = MVE_VCTP32 renamable $r3 |
1498 | // renamable $lr = t2LoopDec killed renamable $lr, 1 |
1499 | // t2LoopEnd renamable $lr, %vector.body |
1500 | // tB %end |
1501 | // |
1502 | // What we would like achieve here is to replace the do-loop start pseudo |
1503 | // instruction t2DoLoopStart with: |
1504 | // |
1505 | // $lr = MVE_DLSTP_32 killed renamable $r3 |
1506 | // |
1507 | // Thus, $r3 which defines the number of elements, is written to $lr, |
1508 | // and then we want to delete the whole chain that used to define $lr, |
1509 | // see the comment below how this chain could look like. |
1510 | // |
1511 | void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) { |
1512 | if (!LoLoop.IsTailPredicationLegal()) |
1513 | return; |
1514 | |
1515 | LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n" ); |
1516 | |
1517 | MachineInstr *Def = RDA->getMIOperand(MI: LoLoop.Start, Idx: 1); |
1518 | if (!Def) { |
1519 | LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n" ); |
1520 | return; |
1521 | } |
1522 | |
1523 | // Collect and remove the users of iteration count. |
1524 | SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec, |
1525 | LoLoop.End }; |
1526 | if (!TryRemove(MI: Def, RDA&: *RDA, ToRemove&: LoLoop.ToRemove, Ignore&: Killed)) |
1527 | LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n" ); |
1528 | } |
1529 | |
1530 | MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { |
1531 | LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n" ); |
1532 | // When using tail-predication, try to delete the dead code that was used to |
1533 | // calculate the number of loop iterations. |
1534 | IterationCountDCE(LoLoop); |
1535 | |
1536 | MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt; |
1537 | MachineInstr *Start = LoLoop.Start; |
1538 | MachineBasicBlock *MBB = LoLoop.StartInsertBB; |
1539 | unsigned Opc = LoLoop.getStartOpcode(); |
1540 | MachineOperand &Count = LoLoop.getLoopStartOperand(); |
1541 | |
1542 | // A DLS lr, lr we needn't emit |
1543 | MachineInstr* NewStart; |
1544 | if (!DisableOmitDLS && Opc == ARM::t2DLS && Count.isReg() && |
1545 | Count.getReg() == ARM::LR) { |
1546 | LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr" ); |
1547 | NewStart = nullptr; |
1548 | } else { |
1549 | MachineInstrBuilder MIB = |
1550 | BuildMI(BB&: *MBB, I: InsertPt, MIMD: Start->getDebugLoc(), MCID: TII->get(Opcode: Opc)); |
1551 | |
1552 | MIB.addDef(RegNo: ARM::LR); |
1553 | MIB.add(MO: Count); |
1554 | if (isWhileLoopStart(MI: *Start)) |
1555 | MIB.addMBB(MBB: getWhileLoopStartTargetBB(MI: *Start)); |
1556 | |
1557 | LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); |
1558 | NewStart = &*MIB; |
1559 | } |
1560 | |
1561 | LoLoop.ToRemove.insert(Ptr: Start); |
1562 | return NewStart; |
1563 | } |
1564 | |
// Adjust the VPT blocks of LoLoop ahead of its VCTPs being removed (every
// entry of LoLoop.VCTPs is queued in LoLoop.ToRemove at the end).
//
// For each block exactly one of the following cases is handled:
//  - The block entry is predicated exclusively on the VCTP: the leading VPST
//    is queued for removal and instructions are unpredicated, either all of
//    them (uniform predicate) or those up to and including the divergent VPR
//    def. In the latter case a new VPST or VPT is created if the instruction
//    following the divergent one is still predicated.
//  - The block contains a VCTP: the leading instruction is queued for removal
//    when the block has size 2, otherwise its mask is queued for
//    recomputation.
//  - The block starts with a VPST: try to fold a reaching VCMP into a VPT.
//
// No instruction is erased here. Instructions are inserted, predicate
// operands are rewritten, and the rest is recorded in LoLoop.ToRemove and
// LoLoop.BlockMasksToRecompute.
1565 | void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { |
// Turn a 'Then'-predicated instruction into an unpredicated one: the
// predicate immediate becomes ARMVCC::None and the operand after it has its
// register cleared. Debug instructions are left untouched.
1566 | auto RemovePredicate = [](MachineInstr *MI) { |
1567 | if (MI->isDebugInstr()) |
1568 | return; |
1569 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); |
1570 | int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI); |
1571 | assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction" ); |
1572 | assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then && |
1573 | "Expected Then predicate!" ); |
1574 | MI->getOperand(i: PIdx).setImm(ARMVCC::None); |
1575 | MI->getOperand(i: PIdx + 1).setReg(0); |
1576 | }; |
1577 | 
1578 | for (auto &Block : LoLoop.getVPTBlocks()) { |
1579 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
1580 | 
// Build, in front of At, the VPT instruction that corresponds to TheVCMP
// (same operands 1-3, preceded by an ARMVCC::Then immediate). The new
// instruction is queued for mask recomputation, the VCMP is queued for
// removal, and the caller's pointer is reset to nullptr.
1581 | auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) { |
1582 | assert(TheVCMP && "Replacing a removed or non-existent VCMP" ); |
1583 | // Replace the VCMP with a VPT |
1584 | MachineInstrBuilder MIB = |
1585 | BuildMI(BB&: *At->getParent(), I: At, MIMD: At->getDebugLoc(), |
1586 | MCID: TII->get(Opcode: VCMPOpcodeToVPT(Opcode: TheVCMP->getOpcode()))); |
1587 | MIB.addImm(Val: ARMVCC::Then); |
1588 | // Register one |
1589 | MIB.add(MO: TheVCMP->getOperand(i: 1)); |
1590 | // Register two |
1591 | MIB.add(MO: TheVCMP->getOperand(i: 2)); |
1592 | // The comparison code, e.g. ge, eq, lt |
1593 | MIB.add(MO: TheVCMP->getOperand(i: 3)); |
1594 | LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB); |
1595 | LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr()); |
1596 | LoLoop.ToRemove.insert(Ptr: TheVCMP); |
1597 | TheVCMP = nullptr; |
1598 | }; |
1599 | 
1600 | if (LoLoop.VPTstate.isEntryPredicatedOnVCTP(Block, /*exclusive*/ Exclusive: true)) { |
1601 | MachineInstr *VPST = Insts.front(); |
1602 | if (Block.hasUniformPredicate()) { |
1603 | // A vpt block starting with VPST, is only predicated upon vctp and has no |
1604 | // internal vpr defs: |
1605 | // - Remove vpst. |
1606 | // - Unpredicate the remaining instructions. |
1607 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); |
// Index 0 is the VPST itself, so start at 1.
1608 | for (unsigned i = 1; i < Insts.size(); ++i) |
1609 | RemovePredicate(Insts[i]); |
1610 | } else { |
1611 | // The VPT block has a non-uniform predicate but it uses a vpst and its |
1612 | // entry is guarded only by a vctp, which means we: |
1613 | // - Need to remove the original vpst. |
1614 | // - Then need to unpredicate any following instructions, until |
1615 | // we come across the divergent vpr def. |
1616 | // - Insert a new vpst to predicate the instruction(s) that follow |
1617 | // the divergent vpr def. |
1618 | MachineInstr *Divergent = Block.getDivergent(); |
1619 | MachineBasicBlock *MBB = Divergent->getParent(); |
// Find the first non-debug instruction after the divergent one (or the end
// of the basic block).
1620 | auto DivergentNext = ++MachineBasicBlock::iterator(Divergent); |
1621 | while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr()) |
1622 | ++DivergentNext; |
1623 | 
// Sample this before any predicate is rewritten below.
1624 | bool DivergentNextIsPredicated = |
1625 | DivergentNext != MBB->end() && |
1626 | getVPTInstrPredicate(MI: *DivergentNext) != ARMVCC::None; |
1627 | 
// Unpredicate everything after the VPST up to, but not including,
// DivergentNext; this range includes Divergent itself.
1628 | for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext; |
1629 | I != E; ++I) |
1630 | RemovePredicate(&*I); |
1631 | 
1632 | // Check if the instruction defining vpr is a vcmp so it can be combined |
1633 | // with the VPST. This should be the divergent instruction |
1634 | MachineInstr *VCMP = |
1635 | VCMPOpcodeToVPT(Opcode: Divergent->getOpcode()) != 0 ? Divergent : nullptr; |
1636 | 
1637 | if (DivergentNextIsPredicated) { |
1638 | // Insert a VPST at the divergent only if the next instruction |
1639 | // would actually use it. A VCMP following a VPST can be |
1640 | // merged into a VPT so do that instead if the VCMP exists. |
1641 | if (!VCMP) { |
1642 | // Create a VPST (with a null mask for now, we'll recompute it |
1643 | // later) |
// NOTE(review): the insertion point passed here is Divergent itself, which
// was unpredicated by the loop above - confirm the VPST is meant to go
// there rather than at DivergentNext.
1644 | MachineInstrBuilder MIB = |
1645 | BuildMI(BB&: *Divergent->getParent(), I: Divergent, |
1646 | MIMD: Divergent->getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VPST)); |
1647 | MIB.addImm(Val: 0); |
1648 | LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB); |
1649 | LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr()); |
1650 | } else { |
1651 | // No RDA checks are necessary here since the VPST would have been |
1652 | // directly after the VCMP |
1653 | ReplaceVCMPWithVPT(VCMP, VCMP); |
1654 | } |
1655 | } |
1656 | } |
// In both sub-cases the original VPST is no longer wanted.
1657 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); |
1658 | LoLoop.ToRemove.insert(Ptr: VPST); |
1659 | } else if (Block.containsVCTP()) { |
1660 | // The vctp will be removed, so either the entire block will be dead or |
1661 | // the block mask of the vp(s)t will need to be recomputed. |
1662 | MachineInstr *VPST = Insts.front(); |
1663 | if (Block.size() == 2) { |
1664 | assert(VPST->getOpcode() == ARM::MVE_VPST && |
1665 | "Found a VPST in an otherwise empty vpt block" ); |
1666 | LoLoop.ToRemove.insert(Ptr: VPST); |
1667 | } else |
1668 | LoLoop.BlockMasksToRecompute.insert(Ptr: VPST); |
1669 | } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) { |
1670 | // If this block starts with a VPST then attempt to merge it with the |
1671 | // preceding un-merged VCMP into a VPT. This VCMP comes from a VPT |
1672 | // block that no longer exists |
1673 | MachineInstr *VPST = Insts.front(); |
// Next is only used by the assertion below.
1674 | auto Next = ++MachineBasicBlock::iterator(VPST); |
1675 | assert(getVPTInstrPredicate(*Next) != ARMVCC::None && |
1676 | "The instruction after a VPST must be predicated" ); |
1677 | (void)Next; |
// Only a unique reaching VPR def that is a VCMP with a VPT equivalent, and
// that has not already been queued for removal, is a merge candidate.
1678 | MachineInstr *VprDef = RDA->getUniqueReachingMIDef(MI: VPST, PhysReg: ARM::VPR); |
1679 | if (VprDef && VCMPOpcodeToVPT(Opcode: VprDef->getOpcode()) && |
1680 | !LoLoop.ToRemove.contains(Ptr: VprDef)) { |
1681 | MachineInstr *VCMP = VprDef; |
1682 | // The VCMP and VPST can only be merged if the VCMP's operands will have |
1683 | // the same values at the VPST. |
1684 | // If any of the instructions between the VCMP and VPST are predicated |
1685 | // then a different code path is expected to have merged the VCMP and |
1686 | // VPST already. |
1687 | if (std::none_of(first: ++MachineBasicBlock::iterator(VCMP), |
1688 | last: MachineBasicBlock::iterator(VPST), pred: hasVPRUse) && |
1689 | RDA->hasSameReachingDef(A: VCMP, B: VPST, PhysReg: VCMP->getOperand(i: 1).getReg()) && |
1690 | RDA->hasSameReachingDef(A: VCMP, B: VPST, PhysReg: VCMP->getOperand(i: 2).getReg())) { |
1691 | ReplaceVCMPWithVPT(VCMP, VPST); |
1692 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); |
1693 | LoLoop.ToRemove.insert(Ptr: VPST); |
1694 | } |
1695 | } |
1696 | } |
1697 | } |
1698 | 
// Finally queue every VCTP of the loop for removal.
1699 | LoLoop.ToRemove.insert(I: LoLoop.VCTPs.begin(), E: LoLoop.VCTPs.end()); |
1700 | } |
1701 | |
1702 | void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { |
1703 | |
1704 | // Combine the LoopDec and LoopEnd instructions into LE(TP). |
1705 | auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) { |
1706 | MachineInstr *End = LoLoop.End; |
1707 | MachineBasicBlock *MBB = End->getParent(); |
1708 | unsigned Opc = LoLoop.IsTailPredicationLegal() ? |
1709 | ARM::MVE_LETP : ARM::t2LEUpdate; |
1710 | MachineInstrBuilder MIB = BuildMI(BB&: *MBB, I: End, MIMD: End->getDebugLoc(), |
1711 | MCID: TII->get(Opcode: Opc)); |
1712 | MIB.addDef(RegNo: ARM::LR); |
1713 | unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0; |
1714 | MIB.add(MO: End->getOperand(i: Off + 0)); |
1715 | MIB.add(MO: End->getOperand(i: Off + 1)); |
1716 | LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); |
1717 | LoLoop.ToRemove.insert(Ptr: LoLoop.Dec); |
1718 | LoLoop.ToRemove.insert(Ptr: End); |
1719 | return &*MIB; |
1720 | }; |
1721 | |
1722 | // TODO: We should be able to automatically remove these branches before we |
1723 | // get here - probably by teaching analyzeBranch about the pseudo |
1724 | // instructions. |
1725 | // If there is an unconditional branch, after I, that just branches to the |
1726 | // next block, remove it. |
1727 | auto RemoveDeadBranch = [](MachineInstr *I) { |
1728 | MachineBasicBlock *BB = I->getParent(); |
1729 | MachineInstr *Terminator = &BB->instr_back(); |
1730 | if (Terminator->isUnconditionalBranch() && I != Terminator) { |
1731 | MachineBasicBlock *Succ = Terminator->getOperand(i: 0).getMBB(); |
1732 | if (BB->isLayoutSuccessor(MBB: Succ)) { |
1733 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator); |
1734 | Terminator->eraseFromParent(); |
1735 | } |
1736 | } |
1737 | }; |
1738 | |
1739 | // And VMOVCopies need to become 2xVMOVD for tail predication to be valid. |
1740 | // Anything other MQPRCopy can be converted to MVE_VORR later on. |
1741 | auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) { |
1742 | for (auto *MI : VMOVCopies) { |
1743 | LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI); |
1744 | assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!" ); |
1745 | MachineBasicBlock *MBB = MI->getParent(); |
1746 | Register Dst = MI->getOperand(i: 0).getReg(); |
1747 | Register Src = MI->getOperand(i: 1).getReg(); |
1748 | auto MIB1 = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::VMOVD), |
1749 | DestReg: ARM::D0 + (Dst - ARM::Q0) * 2) |
1750 | .addReg(RegNo: ARM::D0 + (Src - ARM::Q0) * 2) |
1751 | .add(MOs: predOps(Pred: ARMCC::AL)); |
1752 | (void)MIB1; |
1753 | LLVM_DEBUG(dbgs() << " into " << *MIB1); |
1754 | auto MIB2 = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::VMOVD), |
1755 | DestReg: ARM::D0 + (Dst - ARM::Q0) * 2 + 1) |
1756 | .addReg(RegNo: ARM::D0 + (Src - ARM::Q0) * 2 + 1) |
1757 | .add(MOs: predOps(Pred: ARMCC::AL)); |
1758 | LLVM_DEBUG(dbgs() << " and " << *MIB2); |
1759 | (void)MIB2; |
1760 | MI->eraseFromParent(); |
1761 | } |
1762 | }; |
1763 | |
1764 | if (LoLoop.Revert) { |
1765 | if (isWhileLoopStart(MI: *LoLoop.Start)) |
1766 | RevertWhile(MI: LoLoop.Start); |
1767 | else |
1768 | RevertDo(MI: LoLoop.Start); |
1769 | if (LoLoop.Dec == LoLoop.End) |
1770 | RevertLoopEndDec(MI: LoLoop.End); |
1771 | else |
1772 | RevertLoopEnd(MI: LoLoop.End, SkipCmp: RevertLoopDec(MI: LoLoop.Dec)); |
1773 | } else { |
1774 | ExpandVMOVCopies(LoLoop.VMOVCopies); |
1775 | LoLoop.Start = ExpandLoopStart(LoLoop); |
1776 | if (LoLoop.Start) |
1777 | RemoveDeadBranch(LoLoop.Start); |
1778 | LoLoop.End = ExpandLoopEnd(LoLoop); |
1779 | RemoveDeadBranch(LoLoop.End); |
1780 | if (LoLoop.IsTailPredicationLegal()) |
1781 | ConvertVPTBlocks(LoLoop); |
1782 | for (auto *I : LoLoop.ToRemove) { |
1783 | LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I); |
1784 | I->eraseFromParent(); |
1785 | } |
1786 | for (auto *I : LoLoop.BlockMasksToRecompute) { |
1787 | LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I); |
1788 | recomputeVPTBlockMask(Instr&: *I); |
1789 | LLVM_DEBUG(dbgs() << " ... done: " << *I); |
1790 | } |
1791 | } |
1792 | |
1793 | PostOrderLoopTraversal DFS(LoLoop.ML, *MLI); |
1794 | DFS.ProcessLoop(); |
1795 | const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder(); |
1796 | fullyRecomputeLiveIns(MBBs: PostOrder); |
1797 | |
1798 | for (auto *MBB : reverse(C: PostOrder)) |
1799 | recomputeLivenessFlags(MBB&: *MBB); |
1800 | |
1801 | // We've moved, removed and inserted new instructions, so update RDA. |
1802 | RDA->reset(); |
1803 | } |
1804 | |
1805 | bool ARMLowOverheadLoops::RevertNonLoops() { |
1806 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n" ); |
1807 | bool Changed = false; |
1808 | |
1809 | for (auto &MBB : *MF) { |
1810 | SmallVector<MachineInstr*, 4> Starts; |
1811 | SmallVector<MachineInstr*, 4> Decs; |
1812 | SmallVector<MachineInstr*, 4> Ends; |
1813 | SmallVector<MachineInstr *, 4> EndDecs; |
1814 | SmallVector<MachineInstr *, 4> MQPRCopies; |
1815 | |
1816 | for (auto &I : MBB) { |
1817 | if (isLoopStart(MI: I)) |
1818 | Starts.push_back(Elt: &I); |
1819 | else if (I.getOpcode() == ARM::t2LoopDec) |
1820 | Decs.push_back(Elt: &I); |
1821 | else if (I.getOpcode() == ARM::t2LoopEnd) |
1822 | Ends.push_back(Elt: &I); |
1823 | else if (I.getOpcode() == ARM::t2LoopEndDec) |
1824 | EndDecs.push_back(Elt: &I); |
1825 | else if (I.getOpcode() == ARM::MQPRCopy) |
1826 | MQPRCopies.push_back(Elt: &I); |
1827 | } |
1828 | |
1829 | if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() && |
1830 | MQPRCopies.empty()) |
1831 | continue; |
1832 | |
1833 | Changed = true; |
1834 | |
1835 | for (auto *Start : Starts) { |
1836 | if (isWhileLoopStart(MI: *Start)) |
1837 | RevertWhile(MI: Start); |
1838 | else |
1839 | RevertDo(MI: Start); |
1840 | } |
1841 | for (auto *Dec : Decs) |
1842 | RevertLoopDec(MI: Dec); |
1843 | |
1844 | for (auto *End : Ends) |
1845 | RevertLoopEnd(MI: End); |
1846 | for (auto *End : EndDecs) |
1847 | RevertLoopEndDec(MI: End); |
1848 | for (auto *MI : MQPRCopies) { |
1849 | LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI); |
1850 | assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!" ); |
1851 | MachineBasicBlock *MBB = MI->getParent(); |
1852 | auto MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VORR), |
1853 | DestReg: MI->getOperand(i: 0).getReg()) |
1854 | .add(MO: MI->getOperand(i: 1)) |
1855 | .add(MO: MI->getOperand(i: 1)); |
1856 | addUnpredicatedMveVpredROp(MIB, DestReg: MI->getOperand(i: 0).getReg()); |
1857 | MI->eraseFromParent(); |
1858 | } |
1859 | } |
1860 | return Changed; |
1861 | } |
1862 | |
1863 | FunctionPass *llvm::createARMLowOverheadLoopsPass() { |
1864 | return new ARMLowOverheadLoops(); |
1865 | } |
1866 | |