//===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// Finalize v8.1-m low-overhead loops by converting the associated pseudo
/// instructions into machine operations.
/// The expectation is that the loop contains three pseudo instructions:
/// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
///   form should be in the preheader, whereas the while form should be in the
///   preheader's only predecessor.
/// - t2LoopDec - placed within the loop body.
/// - t2LoopEnd - the loop latch terminator.
///
/// In addition to this, we also look for the presence of the VCTP instruction,
/// which determines whether we can generate the tail-predicated low-overhead
/// loop form.
///
/// Assumptions and Dependencies:
/// Low-overhead loops are constructed and executed using a setup instruction:
/// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
/// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
/// but fixed polarity: WLS can only branch forwards and LE can only branch
/// backwards. These restrictions mean that this pass is dependent upon block
/// layout and block sizes, which is why it's the last pass to run. The same is
/// true for ConstantIslands, but this pass does not increase the size of the
/// basic blocks, nor does it change the CFG. Instructions are mainly removed
/// during the transform and pseudo instructions are replaced by real ones. In
/// some cases, when we have to revert to a 'normal' loop, we have to introduce
/// multiple instructions for a single pseudo (see RevertWhile and
/// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd
/// are defined to be as large as this maximum sequence of replacement
/// instructions.
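///
/// As a rough sketch (assumed shapes, not taken from real codegen output),
/// a do-loop arrives at this pass as:
/// \verbatim
///   $lr = t2DoLoopStart $r0          ; in the preheader
/// body:
///   ...
///   $lr = t2LoopDec $lr, 1
///   t2LoopEnd $lr, body
/// \endverbatim
/// and is finalized into:
/// \verbatim
///   dls lr, r0                       ; dlstp.<size> when tail-predicating
/// body:
///   ...
///   le lr, body                      ; letp when tail-predicating
/// \endverbatim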
///
/// A note on VPR.P0 (the lane mask):
/// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
/// "VPT Active" context (which includes low-overhead loops and vpt blocks).
/// They will simply "and" the result of their calculation with the current
/// value of VPR.P0. You can think of it like this:
/// \verbatim
///   if VPT active:    ; Between a DLSTP/LETP, or for predicated instrs
///     VPR.P0 &= Value
///   else
///     VPR.P0 = Value
/// \endverbatim
/// When we're inside the low-overhead loop (between DLSTP and LETP), we always
/// fall in the "VPT active" case, so we can consider that all VPR writes by
/// one of those instructions are actually an "and".
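/// For example, a VCMP executed between the DLSTP and LETP can only narrow
/// the mask: its result is and'ed into VPR.P0 rather than replacing it.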
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBasicBlockInfo.h"
#include "ARMSubtarget.h"
#include "MVETailPredUtils.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineLoopUtils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ReachingDefAnalysis.h"
#include "llvm/MC/MCInstrDesc.h"

using namespace llvm;

#define DEBUG_TYPE "arm-low-overhead-loops"
#define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"

static cl::opt<bool>
DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
    cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"),
    cl::init(false));

static cl::opt<bool>
    DisableOmitDLS("arm-disable-omit-dls", cl::Hidden,
                   cl::desc("Disable omitting 'dls lr, lr' instructions"),
                   cl::init(false));

static bool isVectorPredicated(MachineInstr *MI) {
  int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
  return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
}

static bool isVectorPredicate(MachineInstr *MI) {
  return MI->findRegisterDefOperandIdx(ARM::VPR, /*TRI=*/nullptr) != -1;
}

static bool hasVPRUse(MachineInstr &MI) {
  return MI.findRegisterUseOperandIdx(ARM::VPR, /*TRI=*/nullptr) != -1;
}

static bool isDomainMVE(MachineInstr *MI) {
  uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
  return Domain == ARMII::DomainMVE;
}

static int getVecSize(const MachineInstr &MI) {
  const MCInstrDesc &MCID = MI.getDesc();
  uint64_t Flags = MCID.TSFlags;
  return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift;
}

static bool shouldInspect(MachineInstr &MI) {
  if (MI.isDebugInstr())
    return false;
  return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
}

static bool isHorizontalReduction(const MachineInstr &MI) {
  const MCInstrDesc &MCID = MI.getDesc();
  uint64_t Flags = MCID.TSFlags;
  return (Flags & ARMII::HorizontalReduction) != 0;
}

namespace {

using InstSet = SmallPtrSetImpl<MachineInstr *>;

class PostOrderLoopTraversal {
  MachineLoop &ML;
  MachineLoopInfo &MLI;
  SmallPtrSet<MachineBasicBlock*, 4> Visited;
  SmallVector<MachineBasicBlock*, 4> Order;

public:
  PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
      : ML(ML), MLI(MLI) { }

  const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {
    return Order;
  }

  // Visit all the blocks within the loop, as well as exit blocks and any
  // blocks properly dominating the header.
  void ProcessLoop() {
    std::function<void(MachineBasicBlock *)> Search =
        [this, &Search](MachineBasicBlock *MBB) -> void {
      if (!Visited.insert(MBB).second)
        return;

      for (auto *Succ : MBB->successors()) {
        if (!ML.contains(Succ))
          continue;
        Search(Succ);
      }
      Order.push_back(MBB);
    };

    // Insert exit blocks.
    SmallVector<MachineBasicBlock*, 2> ExitBlocks;
    ML.getExitBlocks(ExitBlocks);
    append_range(Order, ExitBlocks);

    // Then add the loop body.
    Search(ML.getHeader());

    // Then try the preheader and its predecessors.
    std::function<void(MachineBasicBlock*)> GetPredecessor =
        [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {
      Order.push_back(MBB);
      if (MBB->pred_size() == 1)
        GetPredecessor(*MBB->pred_begin());
    };

    if (auto *Preheader = ML.getLoopPreheader())
      GetPredecessor(Preheader);
    else if (auto *Preheader = MLI.findLoopPreheader(&ML, true, true))
      GetPredecessor(Preheader);
  }
};

class VPTBlock {
  SmallVector<MachineInstr *, 4> Insts;

public:
  VPTBlock(MachineInstr *MI) { Insts.push_back(MI); }

  // Have we found an instruction within the block which defines the vpr? If
  // so, not all the instructions in the block will have the same predicate.
  bool hasUniformPredicate() { return getDivergent() == nullptr; }

  // If it exists, return the first internal instruction which modifies the
  // VPR.
  MachineInstr *getDivergent() {
    SmallVectorImpl<MachineInstr *> &Insts = getInsts();
    for (unsigned i = 1; i < Insts.size(); ++i) {
      MachineInstr *Next = Insts[i];
      if (isVectorPredicate(Next))
        return Next; // Found an instruction altering the vpr.
    }
    return nullptr;
  }

  void insert(MachineInstr *MI) {
    Insts.push_back(MI);
    // VPT/VPST + 4 predicated instructions.
    assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
  }

  bool containsVCTP() const { return llvm::any_of(Insts, isVCTP); }

  unsigned size() const { return Insts.size(); }
  SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
};
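
// An illustrative sketch (assumed MIR shapes, not real output) of the kind
// of block VPTBlock models:
//   MVE_VPST 8                       ; opens the block, sets the mask
//   $q0 = MVE_VLDRWU32 ..., $vpr     ; up to four predicated instructions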

// Represent the current state of the VPR and hold all instances which
// represent a VPT block, which is a list of instructions that begins with a
// VPT/VPST and has a maximum of four subsequent predicated instructions. All
// instructions within the block are predicated upon the vpr and we allow
// instructions to define the vpr within the block too.
class VPTState {
  friend struct LowOverheadLoop;

  SmallVector<VPTBlock, 4> Blocks;
  SetVector<MachineInstr *> CurrentPredicates;
  std::map<MachineInstr *, SetVector<MachineInstr *>> PredicatedInsts;

  void CreateVPTBlock(MachineInstr *MI) {
    assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))
           && "Can't begin VPT without predicate");
    Blocks.emplace_back(MI);
    // The execution of MI is predicated upon the current set of instructions
    // that are AND'ed together to form the VPR predicate value. In the case
    // that MI is a VPT, CurrentPredicates will also just be MI.
    PredicatedInsts[MI] = CurrentPredicates;
  }

  void addInst(MachineInstr *MI) {
    Blocks.back().insert(MI);
    PredicatedInsts[MI] = CurrentPredicates;
  }

  void addPredicate(MachineInstr *MI) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
    CurrentPredicates.insert(MI);
  }

  void resetPredicate(MachineInstr *MI) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
    CurrentPredicates.clear();
    CurrentPredicates.insert(MI);
  }

public:
  // Return whether the given instruction is predicated upon a VCTP.
  bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
    SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI];
    if (Exclusive && Predicates.size() != 1)
      return false;
    // We do not know how to convert an else predicate of a VCTP.
    if (getVPTInstrPredicate(*MI) == ARMVCC::Else)
      return false;
    return llvm::any_of(Predicates, isVCTP);
  }

  // Is the VPST, controlling the block entry, predicated upon a VCTP?
  bool isEntryPredicatedOnVCTP(VPTBlock &Block, bool Exclusive = false) {
    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
    return isPredicatedOnVCTP(Insts.front(), Exclusive);
  }

  // If this block begins with a VPT, we can check whether it uses at least
  // one predicated input, or possibly a loop-invariant one, which would
  // result in it being implicitly predicated.
  bool hasImplicitlyValidVPT(VPTBlock &Block, ReachingDefInfo &RDI) {
    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
    MachineInstr *VPT = Insts.front();
    assert(isVPTOpcode(VPT->getOpcode()) &&
           "Expected VPT block to begin with VPT/VPST");

    if (VPT->getOpcode() == ARM::MVE_VPST)
      return false;

    // If the VPT block does not define something that is an "output", then
    // the tail-predicated version will just perform a subset of the original
    // vpt block, where the last lanes should not be used.
    if (isVPTOpcode(VPT->getOpcode()) &&
        all_of(Block.getInsts(), [](const MachineInstr *MI) {
          return !MI->mayStore() && !MI->mayLoad() &&
                 !isHorizontalReduction(*MI) && !isVCTP(MI);
        }))
      return true;

    auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
      MachineInstr *Op = RDI.getMIOperand(MI, MI->getOperand(Idx));
      return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
    };

    auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {
      MachineOperand &MO = MI->getOperand(Idx);
      if (!MO.isReg() || !MO.getReg())
        return true;

      SmallPtrSet<MachineInstr *, 2> Defs;
      RDI.getGlobalReachingDefs(MI, MO.getReg(), Defs);
      if (Defs.empty())
        return true;

      for (auto *Def : Defs)
        if (Def->getParent() == VPT->getParent())
          return false;
      return true;
    };

    // Check that at least one of the operands is directly predicated on a
    // vctp and allow an invariant value too.
    return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&
           (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&
           (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));
  }
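
  // For instance (an assumed sketch, with opcode names abbreviated), a VPT
  // whose compare operand comes from a vctp-predicated load needs no
  // explicit mask once the loop is tail-predicated:
  //   $q0 = MVE_VLDRWU32 ..., $vpr   ; operand predicated on the vctp
  //   MVE_VPT... $q0, ...            ; so the VPT is implicitly valid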

  bool isValid(ReachingDefInfo &RDI) {
    // All predication within the loop should be based on vctp. If the block
    // isn't predicated on entry, check whether the vctp is within the block
    // and that all other instructions are then predicated on it.
    for (auto &Block : Blocks) {
      if (isEntryPredicatedOnVCTP(Block, false) &&
          !any_of(drop_begin(Block.getInsts()), [](const MachineInstr *MI) {
            return getVPTInstrPredicate(*MI) == ARMVCC::Else;
          }))
        continue;
      if (hasImplicitlyValidVPT(Block, RDI))
        continue;

      SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
      // We don't know how to convert a block with just a VPT;VCTP into
      // anything valid once we remove the VCTP. For now just bail out.
      assert(isVPTOpcode(Insts.front()->getOpcode()) &&
             "Expected VPT block to start with a VPST or VPT!");
      if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&
          isVCTP(Insts.back()))
        return false;

      for (auto *MI : Insts) {
        // Check that any internal VCTPs are 'Then' predicated.
        if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then)
          return false;
        // Skip other instructions that build up the predicate.
        if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
          continue;
        // Check that any other instructions are predicated upon a vctp.
        // TODO: We could infer when VPTs are implicitly predicated on the
        // vctp (when the operands are predicated).
        if (!isPredicatedOnVCTP(MI)) {
          LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
          return false;
        }
      }
    }
    return true;
  }
};

struct LowOverheadLoop {

  MachineLoop &ML;
  MachineBasicBlock *Preheader = nullptr;
  MachineLoopInfo &MLI;
  ReachingDefInfo &RDI;
  const TargetRegisterInfo &TRI;
  const ARMBaseInstrInfo &TII;
  MachineFunction *MF = nullptr;
  MachineBasicBlock::iterator StartInsertPt;
  MachineBasicBlock *StartInsertBB = nullptr;
  MachineInstr *Start = nullptr;
  MachineInstr *Dec = nullptr;
  MachineInstr *End = nullptr;
  MachineOperand TPNumElements;
  SmallVector<MachineInstr *, 4> VCTPs;
  SmallPtrSet<MachineInstr *, 4> ToRemove;
  SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute;
  SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs;
  SmallPtrSet<MachineInstr *, 4> VMOVCopies;
  bool Revert = false;
  bool CannotTailPredicate = false;
  VPTState VPTstate;

  LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI, ReachingDefInfo &RDI,
                  const TargetRegisterInfo &TRI, const ARMBaseInstrInfo &TII)
      : ML(ML), MLI(MLI), RDI(RDI), TRI(TRI), TII(TII),
        TPNumElements(MachineOperand::CreateImm(0)) {
    MF = ML.getHeader()->getParent();
    if (auto *MBB = ML.getLoopPreheader())
      Preheader = MBB;
    else if (auto *MBB = MLI.findLoopPreheader(&ML, true, true))
      Preheader = MBB;
  }

  // If this is an MVE instruction, check that we know how to use tail
  // predication with it. Record VPT blocks and return whether the
  // instruction is valid for tail predication.
  bool ValidateMVEInst(MachineInstr *MI);

  void AnalyseMVEInst(MachineInstr *MI) {
    CannotTailPredicate = !ValidateMVEInst(MI);
  }

  bool IsTailPredicationLegal() const {
    // For now, let's keep things really simple and only support a single
    // block for tail predication.
    return !Revert && FoundAllComponents() && !VCTPs.empty() &&
           !CannotTailPredicate && ML.getNumBlocks() == 1;
  }

  // Given that MI is a VCTP, check that it is equivalent to any other VCTPs
  // found.
  bool AddVCTP(MachineInstr *MI);

  // Check that the predication in the loop will be equivalent once we
  // perform the conversion. Also ensure that we can provide the number
  // of elements to the loop start instruction.
  bool ValidateTailPredicate();

  // Check that any values available outside of the loop will be the same
  // after tail predication conversion.
  bool ValidateLiveOuts();

  // Check the branch targets are within range and we satisfy our
  // restrictions.
  void Validate(ARMBasicBlockUtils *BBUtils);

  bool FoundAllComponents() const {
    return Start && Dec && End;
  }

  SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTstate.Blocks; }

  // Return the operand for the loop start instruction. This will be the loop
  // iteration count, or the number of elements if we're tail predicating.
  MachineOperand &getLoopStartOperand() {
    if (IsTailPredicationLegal())
      return TPNumElements;
    return Start->getOperand(1);
  }

  unsigned getStartOpcode() const {
    bool IsDo = isDoLoopStart(*Start);
    if (!IsTailPredicationLegal())
      return IsDo ? ARM::t2DLS : ARM::t2WLS;

    return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
  }

  void dump() const {
    if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
    if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
    if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
    if (!VCTPs.empty()) {
      dbgs() << "ARM Loops: Found VCTP(s):\n";
      for (auto *MI : VCTPs)
        dbgs() << " - " << *MI;
    }
    if (!FoundAllComponents())
      dbgs() << "ARM Loops: Not a low-overhead loop.\n";
    else if (!(Start && Dec && End))
      dbgs() << "ARM Loops: Failed to find all loop components.\n";
  }
};

class ARMLowOverheadLoops : public MachineFunctionPass {
  MachineFunction *MF = nullptr;
  MachineLoopInfo *MLI = nullptr;
  ReachingDefInfo *RDI = nullptr;
  const ARMBaseInstrInfo *TII = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;

public:
  static char ID;

  ARMLowOverheadLoops() : MachineFunctionPass(ID) { }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<ReachingDefInfoWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().setNoVRegs().setTracksLiveness();
  }

  StringRef getPassName() const override {
    return ARM_LOW_OVERHEAD_LOOPS_NAME;
  }

private:
  bool ProcessLoop(MachineLoop *ML);

  bool RevertNonLoops();

  void RevertWhile(MachineInstr *MI) const;
  void RevertDo(MachineInstr *MI) const;

  bool RevertLoopDec(MachineInstr *MI) const;

  void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;

  void RevertLoopEndDec(MachineInstr *MI) const;

  void ConvertVPTBlocks(LowOverheadLoop &LoLoop);

  MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);

  void Expand(LowOverheadLoop &LoLoop);

  void IterationCountDCE(LowOverheadLoop &LoLoop);
};
} // namespace

char ARMLowOverheadLoops::ID = 0;

INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
                false, false)

static bool TryRemove(MachineInstr *MI, ReachingDefInfo &RDI, InstSet &ToRemove,
                      InstSet &Ignore) {

  // Check that we can remove all of Killed without having to modify any IT
  // blocks.
  auto WontCorruptITs = [](InstSet &Killed, ReachingDefInfo &RDI) {
    // Collect the dead code and the MBBs in which they reside.
    SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
    for (auto *Dead : Killed)
      BasicBlocks.insert(Dead->getParent());

    // Collect IT blocks in all affected basic blocks.
    std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
    for (auto *MBB : BasicBlocks) {
      for (auto &IT : *MBB) {
        if (IT.getOpcode() != ARM::t2IT)
          continue;
        RDI.getReachingLocalUses(&IT, MCRegister::from(ARM::ITSTATE),
                                 ITBlocks[&IT]);
      }
    }

    // If we're removing all of the instructions within an IT block, then
    // also remove the IT instruction.
    SmallPtrSet<MachineInstr *, 2> ModifiedITs;
    SmallPtrSet<MachineInstr *, 2> RemoveITs;
    for (auto *Dead : Killed) {
      if (MachineOperand *MO =
              Dead->findRegisterUseOperand(ARM::ITSTATE, /*TRI=*/nullptr)) {
        MachineInstr *IT = RDI.getMIOperand(Dead, *MO);
        RemoveITs.insert(IT);
        auto &CurrentBlock = ITBlocks[IT];
        CurrentBlock.erase(Dead);
        if (CurrentBlock.empty())
          ModifiedITs.erase(IT);
        else
          ModifiedITs.insert(IT);
      }
    }
    if (!ModifiedITs.empty())
      return false;
    Killed.insert_range(RemoveITs);
    return true;
  };

  SmallPtrSet<MachineInstr *, 2> Uses;
  if (!RDI.isSafeToRemove(MI, Uses, Ignore))
    return false;

  if (WontCorruptITs(Uses, RDI)) {
    ToRemove.insert_range(Uses);
    LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
               << " - can also remove:\n";
               for (auto *Use : Uses)
                 dbgs() << "   - " << *Use);

    SmallPtrSet<MachineInstr*, 4> Killed;
    RDI.collectKilledOperands(MI, Killed);
    if (WontCorruptITs(Killed, RDI)) {
      ToRemove.insert_range(Killed);
      LLVM_DEBUG(for (auto *Dead : Killed)
                   dbgs() << "   - " << *Dead);
    }
    return true;
  }
  return false;
}

bool LowOverheadLoop::ValidateTailPredicate() {
  if (!IsTailPredicationLegal()) {
    LLVM_DEBUG(if (VCTPs.empty())
                 dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
               dbgs() << "ARM Loops: Tail-predication is not valid.\n");
    return false;
  }

  assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
  assert(ML.getBlocks().size() == 1 &&
         "Shouldn't be processing a loop with more than one block");

  if (DisableTailPredication) {
    LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");
    return false;
  }

  if (!VPTstate.isValid(RDI)) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");
    return false;
  }

  if (!ValidateLiveOuts()) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
    return false;
  }

  // For tail predication, we need to provide the number of elements, instead
  // of the iteration count, to the loop start instruction. The number of
  // elements is provided to the vctp instruction, so we need to check that
  // we can use this register at InsertPt.
  MachineInstr *VCTP = VCTPs.back();
  if (Start->getOpcode() == ARM::t2DoLoopStartTP ||
      Start->getOpcode() == ARM::t2WhileLoopStartTP) {
    TPNumElements = Start->getOperand(2);
    StartInsertPt = Start;
    StartInsertBB = Start->getParent();
  } else {
    TPNumElements = VCTP->getOperand(1);
    MCRegister NumElements = TPNumElements.getReg().asMCReg();

    // If the register is defined within the loop, then we can't perform TP.
    // TODO: Check whether this is just a mov of a register that would be
    // available.
    if (RDI.hasLocalDefBefore(VCTP, NumElements)) {
      LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
      return false;
    }

    // The element count register may be defined after InsertPt, in which case
    // we need to try to move either InsertPt or the def so that the [w|d]lstp
    // can use the value.

    if (StartInsertPt != StartInsertBB->end() &&
        !RDI.isReachingDefLiveOut(&*StartInsertPt, NumElements)) {
      if (auto *ElemDef =
              RDI.getLocalLiveOutMIDef(StartInsertBB, NumElements)) {
        if (RDI.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) {
          ElemDef->removeFromParent();
          StartInsertBB->insert(StartInsertPt, ElemDef);
          LLVM_DEBUG(dbgs()
                     << "ARM Loops: Moved element count def: " << *ElemDef);
        } else if (RDI.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) {
          StartInsertPt->removeFromParent();
          StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
                                     &*StartInsertPt);
          LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
        } else {
          // If we fail to move an instruction and the element count is
          // provided by a mov, use the mov operand if it will have the same
          // value at the insertion point.
          MachineOperand Operand = ElemDef->getOperand(1);
          if (isMovRegOpcode(ElemDef->getOpcode()) &&
              RDI.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) ==
                  RDI.getUniqueReachingMIDef(&*StartInsertPt,
                                             Operand.getReg().asMCReg())) {
            TPNumElements = Operand;
            NumElements = TPNumElements.getReg();
          } else {
            LLVM_DEBUG(dbgs()
                       << "ARM Loops: Unable to move element count to loop "
                       << "start instruction.\n");
            return false;
          }
        }
      }
    }

    // Especially in the case of while loops, InsertBB may not be the
    // preheader, so we need to check that the register isn't redefined
    // before entering the loop.
    auto CannotProvideElements = [this](MachineBasicBlock *MBB,
                                        MCRegister NumElements) {
      if (MBB->empty())
        return false;
      // NumElements is redefined in this block.
      if (RDI.hasLocalDefBefore(&MBB->back(), NumElements))
        return true;

      // Don't continue searching up through multiple predecessors.
      if (MBB->pred_size() > 1)
        return true;

      return false;
    };

    // Search backwards for a def, until we get to InsertBB.
    MachineBasicBlock *MBB = Preheader;
    while (MBB && MBB != StartInsertBB) {
      if (CannotProvideElements(MBB, NumElements)) {
        LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
        return false;
      }
      MBB = *MBB->pred_begin();
    }
  }

  // Could inserting the [W|D]LSTP cause some unintended effects? In a perfect
  // world the [w|d]lstp instruction would be the last instruction in the
  // preheader and so it would only affect instructions within the loop body.
  // But due to scheduling, and/or the logic in this pass (above), the
  // insertion point can be moved earlier. So if the Loop Start isn't the last
  // instruction in the preheader, and if the initial element count is smaller
  // than the vector width, the Loop Start instruction will immediately
  // generate one or more false lane masks which can, incorrectly, affect the
  // subsequent MVE instructions in the preheader.
  if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n");
    return false;
  }

  // Any DoubleWidthResultInstrs we found whilst scanning the loop need to
  // compute an output that is no wider than what the VCTP mask operates on.
  // The VecSize of a DoubleWidthResult is the larger vector size - the size
  // it extends into - so any VCTP whose VecSize is at least that size is
  // valid.
  unsigned VCTPVecSize = getVecSize(*VCTP);
  for (MachineInstr *MI : DoubleWidthResultInstrs) {
    unsigned InstrVecSize = getVecSize(*MI);
    if (InstrVecSize > VCTPVecSize) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP "
                        << "VecSize:\n" << *MI);
      return false;
    }
  }

  // Check that the value change of the element count is what we expect and
  // that the predication will be equivalent. For this we need:
  // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
  // and we can also allow register copies within the chain too.
  auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
    return -getAddSubImmediate(*MI) == ExpectedVecWidth;
  };
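
  // As an illustrative sketch (assumed MIR shapes), for a VCTP32 loop, with
  // an expected vector width of 4, the chain accepted here looks like:
  //   $r0 = tMOVr $r1, ...        ; register copies are allowed
  //   $r0 = t2SUBri $r0, 4, ...   ; NumElements -= VectorWidth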

  MachineBasicBlock *MBB = VCTP->getParent();
  // Remove modifications to the element count since they have no purpose in a
  // tail predicated loop. Explicitly refer to the vctp operand no matter which
  // register NumElements has been assigned to, since that is what the
  // modifications will be using.
  if (auto *Def = RDI.getUniqueReachingMIDef(
          &MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) {
    SmallPtrSet<MachineInstr*, 2> ElementChain;
    SmallPtrSet<MachineInstr*, 2> Ignore;
    unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());

    Ignore.insert_range(VCTPs);

    if (TryRemove(Def, RDI, ElementChain, Ignore)) {
      bool FoundSub = false;

      for (auto *MI : ElementChain) {
        if (isMovRegOpcode(MI->getOpcode()))
          continue;

        if (isSubImmOpcode(MI->getOpcode())) {
          if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) {
            LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
                                 " count: " << *MI);
            return false;
          }
          FoundSub = true;
        } else {
          LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
                               " count: " << *MI);
          return false;
        }
      }
      ToRemove.insert_range(ElementChain);
    }
  }

  // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we
  // can also remove any extra instructions in the preheader, which often
  // includes a now unused MOV.
  if ((Start->getOpcode() == ARM::t2DoLoopStartTP ||
       Start->getOpcode() == ARM::t2WhileLoopStartTP) &&
      Preheader && !Preheader->empty() &&
      !RDI.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) {
    if (auto *Def = RDI.getUniqueReachingMIDef(
            &Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) {
      SmallPtrSet<MachineInstr *, 2> Ignore(llvm::from_range, VCTPs);
      TryRemove(Def, RDI, ToRemove, Ignore);
    }
  }

  return true;
}

static bool isRegInClass(const MachineOperand &MO,
                         const TargetRegisterClass *Class) {
  return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
}

// MVE 'narrowing' instructions operate on half a lane, reading from half and
// writing to half, which are referred to as the top and bottom halves. The
// other half retains its previous value.
static bool retainsPreviousHalfElement(const MachineInstr &MI) {
  const MCInstrDesc &MCID = MI.getDesc();
  uint64_t Flags = MCID.TSFlags;
  return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
}

// Some MVE instructions read from the top/bottom halves of their operand(s)
// and generate a vector result with result elements that are double the
// width of the input.
static bool producesDoubleWidthResult(const MachineInstr &MI) {
  const MCInstrDesc &MCID = MI.getDesc();
  uint64_t Flags = MCID.TSFlags;
  return (Flags & ARMII::DoubleWidthResult) != 0;
}

// Can this instruction generate a non-zero result when given only zeroed
// operands? This allows us to know that, given operands with false bytes
// zeroed by masked loads, the result will also contain zeros in those
// bytes.
static bool canGenerateNonZeros(const MachineInstr &MI) {

  // Check for instructions which can write into a larger element size,
  // possibly writing into a previous zero'd lane.
  if (producesDoubleWidthResult(MI))
    return true;

  switch (MI.getOpcode()) {
  default:
    break;
  // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
  // fp16 -> fp32 vector conversions.
  // Instructions that perform a NOT will generate 1s from 0s.
  case ARM::MVE_VMVN:
  case ARM::MVE_VORN:
  // Count leading zeros will do just that!
  case ARM::MVE_VCLZs8:
  case ARM::MVE_VCLZs16:
  case ARM::MVE_VCLZs32:
    return true;
  }
  return false;
}
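
// For example, a VCLZ lane of 0 produces the element width (32 for
// MVE_VCLZs32), so zeroed false lanes in do not imply zeroed false lanes out.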

// Look at its register uses to see if it can only receive zeros into its
// false lanes, which would then produce zeros. Also check that the output
// register is defined by a FalseLanesZero instruction so that if
// tail-predication happens, the lanes that aren't updated will still be
// zeros.
static bool producesFalseLanesZero(MachineInstr &MI,
                                   const TargetRegisterClass *QPRs,
                                   const ReachingDefInfo &RDI,
                                   InstSet &FalseLanesZero) {
  if (canGenerateNonZeros(MI))
    return false;

  bool isPredicated = isVectorPredicated(&MI);
  // Predicated loads will write zeros to the falsely predicated bytes of the
  // destination register.
  if (MI.mayLoad())
    return isPredicated;

  auto IsZeroInit = [](MachineInstr *Def) {
    return !isVectorPredicated(Def) &&
           Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
           Def->getOperand(1).getImm() == 0;
  };

  bool AllowScalars = isHorizontalReduction(MI);
  for (auto &MO : MI.operands()) {
    if (!MO.isReg() || !MO.getReg())
      continue;
    if (!isRegInClass(MO, QPRs) && AllowScalars)
      continue;
    // Skip the lr predicate reg
    int PIdx = llvm::findFirstVPTPredOperandIdx(MI);
    if (PIdx != -1 && MO.getOperandNo() == PIdx + ARM::SUBOP_vpred_n_tp_reg)
      continue;

    // Check that this instruction will produce zeros in its false lanes:
    // - If it only consumes false-lanes-zero or constant 0 (vmov #0) operands.
    // - If it's predicated, it only matters that its def register already has
    //   false lane zeros, so we can ignore the uses.
    SmallPtrSet<MachineInstr *, 2> Defs;
    RDI.getGlobalReachingDefs(&MI, MO.getReg(), Defs);
    if (Defs.empty())
      return false;
    for (auto *Def : Defs) {
      if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def))
        continue;
      if (MO.isUse() && isPredicated)
        continue;
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
  return true;
}

bool LowOverheadLoop::ValidateLiveOuts() {
  // We want to find out if the tail-predicated version of this loop will
  // produce the same values as the loop in its original form. For this to
  // be true, the newly inserted implicit predication must not change the
  // (observable) results.
  // We're doing this because many instructions in the loop will not be
  // predicated and so the conversion from VPT predication to tail-predication
  // can result in different values being produced; due to the tail-predication
  // preventing many instructions from updating their falsely predicated
  // lanes. This analysis assumes that all the instructions perform lane-wise
  // operations and don't perform any exchanges.
  // A masked load, whether through VPT or tail predication, will write zeros
  // to any of the falsely predicated bytes. So, from the loads, we know that
  // the false lanes are zeroed and here we're trying to track that those false
  // lanes remain zero, or where they change, the differences are masked away
  // by their user(s).
  // All MVE stores have to be predicated, so we know that any predicated load
  // operands, or stored results, are equivalent already. Other explicitly
  // predicated instructions will perform the same operation in the original
  // loop and the tail-predicated form too. Because of this, we can insert
  // loads, stores and other predicated instructions into our Predicated
  // set and build from there.
  const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
  SetVector<MachineInstr *> FalseLanesUnknown;
  SmallPtrSet<MachineInstr *, 4> FalseLanesZero;
  SmallPtrSet<MachineInstr *, 4> Predicated;
  MachineBasicBlock *Header = ML.getHeader();

  LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n");

  for (auto &MI : *Header) {
    if (!shouldInspect(MI))
      continue;

    if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
      continue;

    bool isPredicated = isVectorPredicated(&MI);
    bool retainsOrReduces =
        retainsPreviousHalfElement(MI) || isHorizontalReduction(MI);

    if (isPredicated)
      Predicated.insert(&MI);
    if (producesFalseLanesZero(MI, QPRs, RDI, FalseLanesZero))
      FalseLanesZero.insert(&MI);
    else if (MI.getNumDefs() == 0)
      continue;
    else if (!isPredicated && retainsOrReduces) {
      LLVM_DEBUG(dbgs() << "  Unpredicated instruction that retainsOrReduces: "
                        << MI);
      return false;
    } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy)
      FalseLanesUnknown.insert(&MI);
  }

  LLVM_DEBUG({
    dbgs() << "  Predicated:\n";
    for (auto *I : Predicated)
      dbgs() << "  " << *I;
    dbgs() << "  FalseLanesZero:\n";
    for (auto *I : FalseLanesZero)
      dbgs() << "  " << *I;
    dbgs() << "  FalseLanesUnknown:\n";
    for (auto *I : FalseLanesUnknown)
      dbgs() << "  " << *I;
  });

  auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
                                   SmallPtrSetImpl<MachineInstr *> &Predicated) {
    SmallPtrSet<MachineInstr *, 2> Uses;
    RDI.getGlobalUses(MI, MO.getReg().asMCReg(), Uses);
    for (auto *Use : Uses) {
      if (Use != MI && !Predicated.count(Use))
        return false;
    }
    return true;
  };

  // Visit the unknowns in reverse so that we can start at the values being
  // stored and then we can work towards the leaves, hopefully adding more
  // instructions to Predicated. Successfully terminating the loop means that
  // all the unknown values have been found to be masked by predicated user(s).
  // For any unpredicated values, we store them in NonPredicated so that we
  // can later check whether these form a reduction.
  SmallPtrSet<MachineInstr*, 2> NonPredicated;
  for (auto *MI : reverse(FalseLanesUnknown)) {
    for (auto &MO : MI->operands()) {
      if (!isRegInClass(MO, QPRs) || !MO.isDef())
        continue;
      if (!HasPredicatedUsers(MI, MO, Predicated)) {
        LLVM_DEBUG(dbgs() << "  Found an unknown def of : "
                          << TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
        NonPredicated.insert(MI);
        break;
      }
    }
    // Any unknown false lanes have been masked away by the user(s).
    if (!NonPredicated.contains(MI))
      Predicated.insert(MI);
  }

  SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
  SmallVector<MachineBasicBlock *, 2> ExitBlocks;
  ML.getExitBlocks(ExitBlocks);
  assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
  assert(ExitBlocks.size() == 1 && "Expected a single exit block");
  MachineBasicBlock *ExitBB = ExitBlocks.front();
  for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
    // TODO: Instead of blocking predication, we could move the vctp to the
    // exit block and calculate its operand there, or in the preheader.
    if (RegMask.PhysReg == ARM::VPR) {
      LLVM_DEBUG(dbgs() << "  VPR is live in to the exit block.");
      return false;
    }
    // Check Q-regs that are live in the exit blocks. We don't collect scalars
    // because they won't be affected by lane predication.
    if (QPRs->contains(RegMask.PhysReg))
      if (auto *MI = RDI.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
        LiveOutMIs.insert(MI);
  }

  // We've already validated that any VPT predication within the loop will be
  // equivalent when we perform the predication transformation; so we know that
  // any VPT predicated instruction is predicated upon VCTP. Any live-out
  // instruction needs to be predicated, so check this here. The instructions
  // in NonPredicated have been found to form a reduction whose legality we
  // can ensure. Any MQPRCopy found will need to validate its input as if it
  // were live out.
  SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end());
  while (!Worklist.empty()) {
    MachineInstr *MI = Worklist.pop_back_val();
    if (MI->getOpcode() == ARM::MQPRCopy) {
      LLVM_DEBUG(dbgs() << "  Must generate copy as VMOV: " << *MI);
      VMOVCopies.insert(MI);
      MachineInstr *CopySrc =
          RDI.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg());
      if (CopySrc)
        Worklist.push_back(CopySrc);
    } else if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
      LLVM_DEBUG(dbgs() << "  Unable to handle live out: " << *MI);
      VMOVCopies.clear();
      return false;
    } else if (isVectorPredicated(MI)) {
      // If this is a predicated instruction with merging semantics,
      // check where it gets its false lanes from, if any.
      int InactiveIdx = findVPTInactiveOperandIdx(*MI);
      if (InactiveIdx != -1) {
        MachineInstr *FalseSrc = RDI.getUniqueReachingMIDef(
            MI, MI->getOperand(InactiveIdx).getReg());
        if (FalseSrc) {
          LLVM_DEBUG(dbgs()
                     << "  Must check source of false lanes for: " << *MI);
          Worklist.push_back(FalseSrc);
        }
      }
    }
  }

  return true;
}

void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
  if (Revert)
    return;

  // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
  // can only jump back.
  auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,
                           ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
    MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
                                   ? End->getOperand(1).getMBB()
                                   : End->getOperand(2).getMBB();
    // TODO: Maybe there are cases where the target doesn't have to be the
    // header, but for now be safe and revert.
    if (TgtBB != ML.getHeader()) {
      LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
      return false;
    }

    // The WLS and LE instructions have 12-bits for the label offset. WLS
    // requires a positive offset, while LE uses negative.
    if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
        !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
      LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
      return false;
    }

    if (isWhileLoopStart(*Start)) {
      MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start);
      if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) ||
          !BBUtils->isBBInRange(Start, TargetBB, 4094)) {
        LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
        return false;
      }
    }
    return true;
  };

  StartInsertPt = MachineBasicBlock::iterator(Start);
  StartInsertBB = Start->getParent();
  LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
                    << *StartInsertPt);

  Revert = !ValidateRanges(Start, End, BBUtils, ML);
  CannotTailPredicate = !ValidateTailPredicate();
}

bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
  LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
  if (VCTPs.empty()) {
    VCTPs.push_back(MI);
    return true;
  }

  // If we find another VCTP, check whether it uses the same value as the
  // main VCTP. If it does, store it in the VCTPs set, else refuse it.
  MachineInstr *Prev = VCTPs.back();
  if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
      !RDI.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
                         "definition from the main VCTP");
    return false;
  }
  VCTPs.push_back(MI);
  return true;
}

static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) {

  auto GetFrameIndex = [](MachineMemOperand *Operand) {
    const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
    if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
      if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(PseudoValue)) {
        return FS->getFrameIndex();
      }
    }
    return -1;
  };

  auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
    switch (I->getOpcode()) {
    case ARM::MVE_VSTRWU32:
    case ARM::MVE_VLDRWU32: {
      return I->getOperand(1).getReg() == ARM::SP &&
             I->memoperands().size() == 1 &&
             GetFrameIndex(I->memoperands().front()) >= 0;
    }
    default:
      return false;
    }
  };

  // An unpredicated vector register spill is allowed if all of the uses of the
  // stack slot are within the loop.
  if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))
    return false;

  // Search all blocks after the loop for accesses to the same stack slot.
  // ReachingDefAnalysis doesn't work for sp as it relies on registers being
  // live-out (which sp never is) to know what blocks to look in.
  if (MI->memoperands().size() == 0)
    return false;
  int FI = GetFrameIndex(MI->memoperands().front());

  auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();
  if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI))
    return false;

  SmallVector<MachineBasicBlock *> Frontier;
  ML->getExitBlocks(Frontier);
  SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()};
  unsigned Idx = 0;
  while (Idx < Frontier.size()) {
    MachineBasicBlock *BB = Frontier[Idx];
    bool LookAtSuccessors = true;
    for (auto &I : *BB) {
      if (!IsStackOp(&I) || I.memoperands().size() == 0)
        continue;
      if (GetFrameIndex(I.memoperands().front()) != FI)
        continue;
      // If this block has a store to the stack slot before any loads then we
      // can ignore the block.
      if (I.getOpcode() == ARM::MVE_VSTRWU32) {
        LookAtSuccessors = false;
        break;
      }
      // If the store and the load are using the same stack slot then the
      // store isn't valid for tail predication.
      if (I.getOpcode() == ARM::MVE_VLDRWU32)
        return false;
    }

    if (LookAtSuccessors) {
      for (auto *Succ : BB->successors()) {
        if (!Visited.contains(Succ) && !is_contained(Frontier, Succ))
          Frontier.push_back(Succ);
      }
    }
    Visited.insert(BB);
    Idx++;
  }

  return true;
}

bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
  if (CannotTailPredicate)
    return false;

  if (!shouldInspect(*MI))
    return true;

  if (MI->getOpcode() == ARM::MVE_VPSEL ||
      MI->getOpcode() == ARM::MVE_VPNOT) {
    // TODO: Allow VPSEL and VPNOT. We currently can't because:
    // 1) It will use the VPR as a predicate operand without having to be in
    //    a VPT block, which means we can assert while building up the VPT
    //    block because we don't find another VPT or VPST to begin a new
    //    one.
    // 2) VPSEL still requires a VPR operand even after tail predicating,
    //    which means we can't remove it unless there is another
    //    instruction, such as vcmp, that can provide the VPR def.
    return false;
  }

  // Record all VCTPs and check that they're equivalent to one another.
  if (isVCTP(MI) && !AddVCTP(MI))
    return false;

  // Inspect uses first so that any instructions that alter the VPR don't
  // have their own predicate taken from their own def.
  const MCInstrDesc &MCID = MI->getDesc();
  bool IsUse = false;
  unsigned LastOpIdx = MI->getNumOperands() - 1;
  for (const auto &Op : enumerate(reverse(MCID.operands()))) {
    const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index());
    if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
      continue;

    if (ARM::isVpred(Op.value().OperandType)) {
      VPTstate.addInst(MI);
      IsUse = true;
    } else if (MI->getOpcode() != ARM::MVE_VPST) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
      return false;
    }
  }
| 1254 | |
| 1255 | // If we find an instruction that has been marked as not valid for tail |
| 1256 | // predication, only allow the instruction if it's contained within a valid |
| 1257 | // VPT block. |
| 1258 | bool RequiresExplicitPredication = |
| 1259 | (MCID.TSFlags & ARMII::ValidForTailPredication) == 0; |
| 1260 | if (isDomainMVE(MI) && RequiresExplicitPredication) { |
| 1261 | if (MI->getOpcode() == ARM::MQPRCopy) |
| 1262 | return true; |
| 1263 | if (!IsUse && producesDoubleWidthResult(MI: *MI)) { |
| 1264 | DoubleWidthResultInstrs.insert(Ptr: MI); |
| 1265 | return true; |
| 1266 | } |
| 1267 | |
| 1268 | LLVM_DEBUG(if (!IsUse) dbgs() |
| 1269 | << "ARM Loops: Can't tail predicate: " << *MI); |
| 1270 | return IsUse; |
| 1271 | } |
| 1272 | |
| 1273 | // If the instruction is already explicitly predicated, then the conversion |
| 1274 | // will be fine, but ensure that all store operations are predicated. |
if (MI->mayStore() && !ValidateMVEStore(MI, &ML))
| 1276 | return IsUse; |
| 1277 | |
// If this instruction defines the VPR, update the predicate for the
// subsequent instructions.
| 1280 | if (isVectorPredicate(MI)) { |
| 1281 | // Clear the existing predicate when we're not in VPT Active state, |
| 1282 | // otherwise we add to it. |
| 1283 | if (!isVectorPredicated(MI)) |
| 1284 | VPTstate.resetPredicate(MI); |
| 1285 | else |
| 1286 | VPTstate.addPredicate(MI); |
| 1287 | } |
| 1288 | |
| 1289 | // Finally once the predicate has been modified, we can start a new VPT |
| 1290 | // block if necessary. |
if (isVPTOpcode(MI->getOpcode()))
| 1292 | VPTstate.CreateVPTBlock(MI); |
| 1293 | |
| 1294 | return true; |
| 1295 | } |
| 1296 | |
| 1297 | bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { |
| 1298 | const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>(); |
| 1299 | if (!ST.hasLOB()) |
| 1300 | return false; |
| 1301 | |
| 1302 | MF = &mf; |
| 1303 | LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n" ); |
| 1304 | |
| 1305 | MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); |
| 1306 | RDI = &getAnalysis<ReachingDefInfoWrapperPass>().getRDI(); |
| 1307 | MF->getProperties().setTracksLiveness(); |
| 1308 | MRI = &MF->getRegInfo(); |
| 1309 | TII = ST.getInstrInfo(); |
| 1310 | TRI = ST.getRegisterInfo(); |
BBUtils = std::make_unique<ARMBasicBlockUtils>(*MF);
BBUtils->computeAllBlockSizes();
BBUtils->adjustBBOffsetsAfter(&MF->front());
| 1314 | |
| 1315 | bool Changed = false; |
| 1316 | for (auto *ML : *MLI) { |
| 1317 | if (ML->isOutermost()) |
| 1318 | Changed |= ProcessLoop(ML); |
| 1319 | } |
| 1320 | Changed |= RevertNonLoops(); |
| 1321 | return Changed; |
| 1322 | } |
| 1323 | |
| 1324 | bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { |
| 1325 | bool Changed = false; |
| 1326 | |
| 1327 | // Process inner loops first. |
| 1328 | for (MachineLoop *L : *ML) |
Changed |= ProcessLoop(L);
| 1330 | |
| 1331 | LLVM_DEBUG({ |
| 1332 | dbgs() << "ARM Loops: Processing loop containing:\n" ; |
| 1333 | if (auto *Preheader = ML->getLoopPreheader()) |
| 1334 | dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n" ; |
| 1335 | else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true)) |
| 1336 | dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n" ; |
| 1337 | for (auto *MBB : ML->getBlocks()) |
| 1338 | dbgs() << " - Block: " << printMBBReference(*MBB) << "\n" ; |
| 1339 | }); |
| 1340 | |
| 1341 | // Search the given block for a loop start instruction. If one isn't found, |
| 1342 | // and there's only one predecessor block, search that one too. |
| 1343 | std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = |
| 1344 | [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { |
| 1345 | for (auto &MI : *MBB) { |
| 1346 | if (isLoopStart(MI)) |
| 1347 | return &MI; |
| 1348 | } |
| 1349 | if (MBB->pred_size() == 1) |
| 1350 | return SearchForStart(*MBB->pred_begin()); |
| 1351 | return nullptr; |
| 1352 | }; |
| 1353 | |
| 1354 | LowOverheadLoop LoLoop(*ML, *MLI, *RDI, *TRI, *TII); |
| 1355 | // Search the preheader for the start intrinsic. |
| 1356 | // FIXME: I don't see why we shouldn't be supporting multiple predecessors |
| 1357 | // with potentially multiple set.loop.iterations, so we need to enable this. |
| 1358 | if (LoLoop.Preheader) |
| 1359 | LoLoop.Start = SearchForStart(LoLoop.Preheader); |
| 1360 | else |
| 1361 | return Changed; |
| 1362 | |
// Find the low-overhead loop components and decide whether or not to fall
// back to a normal loop. Also look for a vctp instruction and decide
// whether we can convert its predicate using tail predication.
for (auto *MBB : reverse(ML->getBlocks())) {
| 1367 | for (auto &MI : *MBB) { |
| 1368 | if (MI.isDebugValue()) |
| 1369 | continue; |
| 1370 | else if (MI.getOpcode() == ARM::t2LoopDec) |
| 1371 | LoLoop.Dec = &MI; |
| 1372 | else if (MI.getOpcode() == ARM::t2LoopEnd) |
| 1373 | LoLoop.End = &MI; |
| 1374 | else if (MI.getOpcode() == ARM::t2LoopEndDec) |
| 1375 | LoLoop.End = LoLoop.Dec = &MI; |
| 1376 | else if (isLoopStart(MI)) |
| 1377 | LoLoop.Start = &MI; |
| 1378 | else if (MI.getDesc().isCall()) { |
| 1379 | // TODO: Though the call will require LE to execute again, does this |
| 1380 | // mean we should revert? Always executing LE hopefully should be |
| 1381 | // faster than performing a sub,cmp,br or even subs,br. |
| 1382 | LoLoop.Revert = true; |
| 1383 | LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n" ); |
| 1384 | } else { |
| 1385 | // Record VPR defs and build up their corresponding vpt blocks. |
| 1386 | // Check we know how to tail predicate any mve instructions. |
LoLoop.AnalyseMVEInst(&MI);
| 1388 | } |
| 1389 | } |
| 1390 | } |
| 1391 | |
| 1392 | LLVM_DEBUG(LoLoop.dump()); |
| 1393 | if (!LoLoop.FoundAllComponents()) { |
| 1394 | LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n" ); |
| 1395 | return Changed; |
| 1396 | } |
| 1397 | |
assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
"Expected t2WhileLoopStart to be removed before regalloc!");
| 1400 | |
| 1401 | // Check that the only instruction using LoopDec is LoopEnd. This can only |
| 1402 | // happen when the Dec and End are separate, not a single t2LoopEndDec. |
| 1403 | // TODO: Check for copy chains that really have no effect. |
| 1404 | if (LoLoop.Dec != LoLoop.End) { |
| 1405 | SmallPtrSet<MachineInstr *, 2> Uses; |
RDI->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses);
if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
| 1409 | LoLoop.Revert = true; |
| 1410 | } |
| 1411 | } |
LoLoop.Validate(BBUtils.get());
| 1413 | Expand(LoLoop); |
| 1414 | return true; |
| 1415 | } |
| 1416 | |
// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
// beq that branches to the exit block.
| 1419 | // TODO: We could also try to generate a cbz if the value in LR is also in |
| 1420 | // another low register. |
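// A minimal sketch of the reverted form (register names illustrative):
//   cmp lr, #0
//   beq %exit      ; tBcc or t2Bcc depending on the branch range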
| 1421 | void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { |
| 1422 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); |
MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(*MI);
unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
| 1425 | ARM::tBcc : ARM::t2Bcc; |
| 1426 | |
| 1427 | RevertWhileLoopStartLR(MI, TII, BrOpc); |
| 1428 | } |
| 1429 | |
| 1430 | void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const { |
| 1431 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI); |
| 1432 | RevertDoLoopStart(MI, TII); |
| 1433 | } |
| 1434 | |
| 1435 | bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { |
| 1436 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); |
| 1437 | MachineBasicBlock *MBB = MI->getParent(); |
| 1438 | SmallPtrSet<MachineInstr*, 1> Ignore; |
| 1439 | for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) { |
| 1440 | if (I->getOpcode() == ARM::t2LoopEnd) { |
Ignore.insert(&*I);
| 1442 | break; |
| 1443 | } |
| 1444 | } |
| 1445 | |
| 1446 | // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS. |
| 1447 | bool SetFlags = |
RDI->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore);
| 1449 | |
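// Illustrative outcome (a sketch): with SetFlags we emit "subs lr, lr, #1"
// and the caller can skip the compare; otherwise we emit "sub lr, lr, #1"
// and RevertLoopEnd adds a "cmp lr, #0" (see its SkipCmp parameter).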
| 1450 | llvm::RevertLoopDec(MI, TII, SetFlags); |
| 1451 | return SetFlags; |
| 1452 | } |
| 1453 | |
| 1454 | // Generate a subs, or sub and cmp, and a branch instead of an LE. |
| 1455 | void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { |
| 1456 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); |
| 1457 | |
MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
| 1460 | ARM::tBcc : ARM::t2Bcc; |
| 1461 | |
| 1462 | llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp); |
| 1463 | } |
| 1464 | |
// Generate a subs and a branch instead of an LE.
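// A minimal sketch of the replacement built below (register names
// illustrative):
//   subs lr, lr, #1
//   bne %vector.body   ; tBcc or t2Bcc depending on the branch range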
| 1466 | void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const { |
| 1467 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI); |
assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
| 1469 | MachineBasicBlock *MBB = MI->getParent(); |
| 1470 | |
MachineInstrBuilder MIB =
BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
MIB.addDef(ARM::LR);
MIB.add(MI->getOperand(1));
MIB.addImm(1);
MIB.addImm(ARMCC::AL);
MIB.addReg(ARM::NoRegister);
MIB.addReg(ARM::CPSR);
MIB->getOperand(5).setIsDef(true);
| 1480 | |
MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
unsigned BrOpc =
BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc;
| 1484 | |
| 1485 | // Create bne |
MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
MIB.add(MI->getOperand(2)); // branch target
MIB.addImm(ARMCC::NE);      // condition code
MIB.addReg(ARM::CPSR);
| 1490 | |
| 1491 | MI->eraseFromParent(); |
| 1492 | } |
| 1493 | |
// Perform dead code elimination on the loop iteration count setup expression.
| 1495 | // If we are tail-predicating, the number of elements to be processed is the |
| 1496 | // operand of the VCTP instruction in the vector body, see getCount(), which is |
| 1497 | // register $r3 in this example: |
| 1498 | // |
| 1499 | // $lr = big-itercount-expression |
| 1500 | // .. |
| 1501 | // $lr = t2DoLoopStart renamable $lr |
| 1502 | // vector.body: |
| 1503 | // .. |
| 1504 | // $vpr = MVE_VCTP32 renamable $r3 |
| 1505 | // renamable $lr = t2LoopDec killed renamable $lr, 1 |
| 1506 | // t2LoopEnd renamable $lr, %vector.body |
| 1507 | // tB %end |
| 1508 | // |
// What we would like to achieve here is to replace the do-loop start pseudo
// instruction t2DoLoopStart with:
| 1511 | // |
| 1512 | // $lr = MVE_DLSTP_32 killed renamable $r3 |
| 1513 | // |
// Thus, $r3, which defines the number of elements, is written to $lr,
// and then we want to delete the whole chain that used to define $lr;
// see the comment below for what this chain could look like.
| 1517 | // |
| 1518 | void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) { |
| 1519 | if (!LoLoop.IsTailPredicationLegal()) |
| 1520 | return; |
| 1521 | |
| 1522 | LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n" ); |
| 1523 | |
MachineInstr *Def = RDI->getMIOperand(LoLoop.Start, 1);
if (!Def) {
LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
| 1527 | return; |
| 1528 | } |
| 1529 | |
// Collect and remove the users of the iteration count.
| 1531 | SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec, |
| 1532 | LoLoop.End }; |
if (!TryRemove(Def, *RDI, LoLoop.ToRemove, Killed))
LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
| 1535 | } |
| 1536 | |
| 1537 | MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { |
| 1538 | LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n" ); |
| 1539 | // When using tail-predication, try to delete the dead code that was used to |
| 1540 | // calculate the number of loop iterations. |
| 1541 | IterationCountDCE(LoLoop); |
| 1542 | |
| 1543 | MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt; |
| 1544 | MachineInstr *Start = LoLoop.Start; |
| 1545 | MachineBasicBlock *MBB = LoLoop.StartInsertBB; |
| 1546 | unsigned Opc = LoLoop.getStartOpcode(); |
| 1547 | MachineOperand &Count = LoLoop.getLoopStartOperand(); |
| 1548 | |
// A "DLS lr, lr" is a no-op (it just copies lr to itself), so we needn't emit it.
| 1550 | MachineInstr* NewStart; |
| 1551 | if (!DisableOmitDLS && Opc == ARM::t2DLS && Count.isReg() && |
| 1552 | Count.getReg() == ARM::LR) { |
| 1553 | LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr" ); |
| 1554 | NewStart = nullptr; |
| 1555 | } else { |
MachineInstrBuilder MIB =
BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));

MIB.addDef(ARM::LR);
MIB.add(Count);
if (isWhileLoopStart(*Start))
MIB.addMBB(getWhileLoopStartTargetBB(*Start));
| 1563 | |
| 1564 | LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); |
| 1565 | NewStart = &*MIB; |
| 1566 | } |
| 1567 | |
LoLoop.ToRemove.insert(Start);
| 1569 | return NewStart; |
| 1570 | } |
| 1571 | |
| 1572 | void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { |
| 1573 | auto RemovePredicate = [](MachineInstr *MI) { |
| 1574 | if (MI->isDebugInstr()) |
| 1575 | return; |
| 1576 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); |
int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction");
assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
"Expected Then predicate!");
MI->getOperand(PIdx).setImm(ARMVCC::None);
MI->getOperand(PIdx + 1).setReg(0);
| 1583 | }; |
| 1584 | |
| 1585 | for (auto &Block : LoLoop.getVPTBlocks()) { |
| 1586 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
| 1587 | |
| 1588 | auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) { |
| 1589 | assert(TheVCMP && "Replacing a removed or non-existent VCMP" ); |
| 1590 | // Replace the VCMP with a VPT |
| 1591 | MachineInstrBuilder MIB = |
| 1592 | BuildMI(BB&: *At->getParent(), I: At, MIMD: At->getDebugLoc(), |
| 1593 | MCID: TII->get(Opcode: VCMPOpcodeToVPT(Opcode: TheVCMP->getOpcode()))); |
| 1594 | MIB.addImm(Val: ARMVCC::Then); |
| 1595 | // Register one |
| 1596 | MIB.add(MO: TheVCMP->getOperand(i: 1)); |
| 1597 | // Register two |
| 1598 | MIB.add(MO: TheVCMP->getOperand(i: 2)); |
| 1599 | // The comparison code, e.g. ge, eq, lt |
| 1600 | MIB.add(MO: TheVCMP->getOperand(i: 3)); |
| 1601 | LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB); |
| 1602 | LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr()); |
| 1603 | LoLoop.ToRemove.insert(Ptr: TheVCMP); |
| 1604 | TheVCMP = nullptr; |
| 1605 | }; |
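// For illustration (a sketch; registers and the condition are arbitrary):
// a "vcmp.s32 ge, q0, q1" followed by a "vpst" becomes a single
// "vpt.s32 ge, q0, q1" predicating the same instructions.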
| 1606 | |
if (LoLoop.VPTstate.isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) {
| 1608 | MachineInstr *VPST = Insts.front(); |
| 1609 | if (Block.hasUniformPredicate()) { |
// A vpt block starting with a VPST that is only predicated upon the
// vctp and has no internal vpr defs:
// - Remove the vpst.
// - Unpredicate the remaining instructions.
| 1614 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); |
| 1615 | for (unsigned i = 1; i < Insts.size(); ++i) |
| 1616 | RemovePredicate(Insts[i]); |
| 1617 | } else { |
// The VPT block has a non-uniform predicate but it uses a vpst and its
// entry is guarded only by a vctp, which means we:
// - Need to remove the original vpst.
// - Then need to unpredicate any following instructions, until
// we come across the divergent vpr def.
// - Insert a new vpst to predicate the instruction(s) that follow
// the divergent vpr def.
| 1625 | MachineInstr *Divergent = Block.getDivergent(); |
| 1626 | MachineBasicBlock *MBB = Divergent->getParent(); |
| 1627 | auto DivergentNext = ++MachineBasicBlock::iterator(Divergent); |
| 1628 | while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr()) |
| 1629 | ++DivergentNext; |
| 1630 | |
| 1631 | bool DivergentNextIsPredicated = |
| 1632 | DivergentNext != MBB->end() && |
getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
| 1634 | |
| 1635 | for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext; |
| 1636 | I != E; ++I) |
| 1637 | RemovePredicate(&*I); |
| 1638 | |
// Check if the instruction defining vpr is a vcmp so it can be combined
// with the VPST. This should be the divergent instruction.
MachineInstr *VCMP =
VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;
| 1643 | |
| 1644 | if (DivergentNextIsPredicated) { |
// Insert a VPST at the divergent instruction only if the next
// instruction would actually use it. A VCMP following a VPST can be
// merged into a VPT so do that instead if the VCMP exists.
| 1648 | if (!VCMP) { |
| 1649 | // Create a VPST (with a null mask for now, we'll recompute it |
| 1650 | // later) |
MachineInstrBuilder MIB =
BuildMI(*Divergent->getParent(), Divergent,
Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
MIB.addImm(0);
LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
| 1657 | } else { |
| 1658 | // No RDI checks are necessary here since the VPST would have been |
| 1659 | // directly after the VCMP |
| 1660 | ReplaceVCMPWithVPT(VCMP, VCMP); |
| 1661 | } |
| 1662 | } |
| 1663 | } |
| 1664 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); |
LoLoop.ToRemove.insert(VPST);
| 1666 | } else if (Block.containsVCTP()) { |
| 1667 | // The vctp will be removed, so either the entire block will be dead or |
| 1668 | // the block mask of the vp(s)t will need to be recomputed. |
| 1669 | MachineInstr *VPST = Insts.front(); |
| 1670 | if (Block.size() == 2) { |
assert(VPST->getOpcode() == ARM::MVE_VPST &&
"Found a VPST in an otherwise empty vpt block");
LoLoop.ToRemove.insert(VPST);
| 1674 | } else |
LoLoop.BlockMasksToRecompute.insert(VPST);
| 1676 | } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) { |
// If this block starts with a VPST then attempt to merge it with the
// preceding un-merged VCMP into a VPT. This VCMP comes from a VPT
// block that no longer exists.
| 1680 | MachineInstr *VPST = Insts.front(); |
| 1681 | auto Next = ++MachineBasicBlock::iterator(VPST); |
assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&
"The instruction after a VPST must be predicated");
| 1684 | (void)Next; |
MachineInstr *VprDef = RDI->getUniqueReachingMIDef(VPST, ARM::VPR);
if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) &&
!LoLoop.ToRemove.contains(VprDef)) {
| 1688 | MachineInstr *VCMP = VprDef; |
| 1689 | // The VCMP and VPST can only be merged if the VCMP's operands will have |
| 1690 | // the same values at the VPST. |
| 1691 | // If any of the instructions between the VCMP and VPST are predicated |
| 1692 | // then a different code path is expected to have merged the VCMP and |
| 1693 | // VPST already. |
if (std::none_of(++MachineBasicBlock::iterator(VCMP),
MachineBasicBlock::iterator(VPST), hasVPRUse) &&
RDI->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) &&
RDI->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) {
ReplaceVCMPWithVPT(VCMP, VPST);
LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
LoLoop.ToRemove.insert(VPST);
| 1701 | } |
| 1702 | } |
| 1703 | } |
| 1704 | } |
| 1705 | |
LoLoop.ToRemove.insert_range(LoLoop.VCTPs);
| 1707 | } |
| 1708 | |
| 1709 | void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { |
| 1710 | |
| 1711 | // Combine the LoopDec and LoopEnd instructions into LE(TP). |
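// For illustration, a sketch of the result (operands follow the BuildMI
// calls below):
//   $lr = MVE_LETP $lr, %vector.body     ; letp, when tail predication is legal
//   $lr = t2LEUpdate $lr, %vector.body   ; le, otherwise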
| 1712 | auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) { |
| 1713 | MachineInstr *End = LoLoop.End; |
| 1714 | MachineBasicBlock *MBB = End->getParent(); |
| 1715 | unsigned Opc = LoLoop.IsTailPredicationLegal() ? |
| 1716 | ARM::MVE_LETP : ARM::t2LEUpdate; |
MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
TII->get(Opc));
MIB.addDef(ARM::LR);
unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;
MIB.add(End->getOperand(Off + 0));
MIB.add(End->getOperand(Off + 1));
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
LoLoop.ToRemove.insert(LoLoop.Dec);
LoLoop.ToRemove.insert(End);
| 1726 | return &*MIB; |
| 1727 | }; |
| 1728 | |
| 1729 | // TODO: We should be able to automatically remove these branches before we |
| 1730 | // get here - probably by teaching analyzeBranch about the pseudo |
| 1731 | // instructions. |
| 1732 | // If there is an unconditional branch, after I, that just branches to the |
| 1733 | // next block, remove it. |
| 1734 | auto RemoveDeadBranch = [](MachineInstr *I) { |
| 1735 | MachineBasicBlock *BB = I->getParent(); |
| 1736 | MachineInstr *Terminator = &BB->instr_back(); |
| 1737 | if (Terminator->isUnconditionalBranch() && I != Terminator) { |
MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
if (BB->isLayoutSuccessor(Succ)) {
| 1740 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator); |
| 1741 | Terminator->eraseFromParent(); |
| 1742 | } |
| 1743 | } |
| 1744 | }; |
| 1745 | |
// Any VMOVCopies need to become 2x VMOVD for tail predication to be valid.
// Any other MQPRCopy can be converted to MVE_VORR later on; a sketch of the
// VMOVD expansion follows.
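// For example (a sketch; a Q register maps onto a pair of D registers,
// so q1 is d2,d3 and q2 is d4,d5):
//   MQPRCopy $q1, $q2
// becomes:
//   $d2 = VMOVD $d4
//   $d3 = VMOVD $d5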
| 1748 | auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) { |
| 1749 | for (auto *MI : VMOVCopies) { |
| 1750 | LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI); |
assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
MachineBasicBlock *MBB = MI->getParent();
Register Dst = MI->getOperand(0).getReg();
Register Src = MI->getOperand(1).getReg();
auto MIB1 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
ARM::D0 + (Dst - ARM::Q0) * 2)
.addReg(ARM::D0 + (Src - ARM::Q0) * 2)
.add(predOps(ARMCC::AL));
(void)MIB1;
LLVM_DEBUG(dbgs() << " into " << *MIB1);
auto MIB2 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
ARM::D0 + (Dst - ARM::Q0) * 2 + 1)
.addReg(ARM::D0 + (Src - ARM::Q0) * 2 + 1)
.add(predOps(ARMCC::AL));
| 1765 | LLVM_DEBUG(dbgs() << " and " << *MIB2); |
| 1766 | (void)MIB2; |
| 1767 | MI->eraseFromParent(); |
| 1768 | } |
| 1769 | }; |
| 1770 | |
| 1771 | if (LoLoop.Revert) { |
if (isWhileLoopStart(*LoLoop.Start))
RevertWhile(LoLoop.Start);
else
RevertDo(LoLoop.Start);
if (LoLoop.Dec == LoLoop.End)
RevertLoopEndDec(LoLoop.End);
else
RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));
| 1780 | } else { |
| 1781 | ExpandVMOVCopies(LoLoop.VMOVCopies); |
| 1782 | LoLoop.Start = ExpandLoopStart(LoLoop); |
| 1783 | if (LoLoop.Start) |
| 1784 | RemoveDeadBranch(LoLoop.Start); |
| 1785 | LoLoop.End = ExpandLoopEnd(LoLoop); |
| 1786 | RemoveDeadBranch(LoLoop.End); |
| 1787 | if (LoLoop.IsTailPredicationLegal()) |
| 1788 | ConvertVPTBlocks(LoLoop); |
| 1789 | for (auto *I : LoLoop.ToRemove) { |
| 1790 | LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I); |
| 1791 | I->eraseFromParent(); |
| 1792 | } |
| 1793 | for (auto *I : LoLoop.BlockMasksToRecompute) { |
| 1794 | LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I); |
recomputeVPTBlockMask(*I);
| 1796 | LLVM_DEBUG(dbgs() << " ... done: " << *I); |
| 1797 | } |
| 1798 | } |
| 1799 | |
| 1800 | PostOrderLoopTraversal DFS(LoLoop.ML, *MLI); |
| 1801 | DFS.ProcessLoop(); |
| 1802 | const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder(); |
fullyRecomputeLiveIns(PostOrder);

for (auto *MBB : reverse(PostOrder))
recomputeLivenessFlags(*MBB);
| 1807 | |
| 1808 | // We've moved, removed and inserted new instructions, so update RDI. |
| 1809 | RDI->reset(); |
| 1810 | } |
| 1811 | |
| 1812 | bool ARMLowOverheadLoops::RevertNonLoops() { |
| 1813 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n" ); |
| 1814 | bool Changed = false; |
| 1815 | |
| 1816 | for (auto &MBB : *MF) { |
| 1817 | SmallVector<MachineInstr*, 4> Starts; |
| 1818 | SmallVector<MachineInstr*, 4> Decs; |
| 1819 | SmallVector<MachineInstr*, 4> Ends; |
| 1820 | SmallVector<MachineInstr *, 4> EndDecs; |
| 1821 | SmallVector<MachineInstr *, 4> MQPRCopies; |
| 1822 | |
| 1823 | for (auto &I : MBB) { |
if (isLoopStart(I))
Starts.push_back(&I);
else if (I.getOpcode() == ARM::t2LoopDec)
Decs.push_back(&I);
else if (I.getOpcode() == ARM::t2LoopEnd)
Ends.push_back(&I);
else if (I.getOpcode() == ARM::t2LoopEndDec)
EndDecs.push_back(&I);
else if (I.getOpcode() == ARM::MQPRCopy)
MQPRCopies.push_back(&I);
| 1834 | } |
| 1835 | |
| 1836 | if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() && |
| 1837 | MQPRCopies.empty()) |
| 1838 | continue; |
| 1839 | |
| 1840 | Changed = true; |
| 1841 | |
| 1842 | for (auto *Start : Starts) { |
if (isWhileLoopStart(*Start))
RevertWhile(Start);
else
RevertDo(Start);
| 1847 | } |
| 1848 | for (auto *Dec : Decs) |
RevertLoopDec(Dec);
| 1850 | |
| 1851 | for (auto *End : Ends) |
RevertLoopEnd(End);
| 1853 | for (auto *End : EndDecs) |
RevertLoopEndDec(End);
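// Any remaining MQPRCopy becomes a plain vector move. For example (a
// sketch): MQPRCopy $q0, $q1 is rewritten below as $q0 = MVE_VORR $q1, $q1,
// i.e. "vorr q0, q1, q1".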
| 1855 | for (auto *MI : MQPRCopies) { |
| 1856 | LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI); |
assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
MachineBasicBlock *MBB = MI->getParent();
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::MVE_VORR),
MI->getOperand(0).getReg())
.add(MI->getOperand(1))
.add(MI->getOperand(1));
addUnpredicatedMveVpredROp(MIB, MI->getOperand(0).getReg());
| 1864 | MI->eraseFromParent(); |
| 1865 | } |
| 1866 | } |
| 1867 | return Changed; |
| 1868 | } |
| 1869 | |
| 1870 | FunctionPass *llvm::createARMLowOverheadLoopsPass() { |
| 1871 | return new ARMLowOverheadLoops(); |
| 1872 | } |
| 1873 | |