| 1 | //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// \file |
| 9 | /// Finalize v8.1-m low-overhead loops by converting the associated pseudo |
| 10 | /// instructions into machine operations. |
| 11 | /// The expectation is that the loop contains three pseudo instructions: |
| 12 | /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop |
| 13 | /// form should be in the preheader, whereas the while form should be in the |
| 14 | /// preheader's only predecessor. |
| 15 | /// - t2LoopDec - placed within the loop body. |
| 16 | /// - t2LoopEnd - the loop latch terminator. |
| 17 | /// |
| 18 | /// In addition to this, we also look for the presence of the VCTP instruction, |
| 19 | /// which determines whether we can generate the tail-predicated low-overhead |
| 20 | /// loop form. |
| 21 | /// |
| 22 | /// Assumptions and Dependencies: |
| 23 | /// Low-overhead loops are constructed and executed using a setup instruction: |
| 24 | /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP. |
| 25 | /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range |
| 26 | /// but fixed polarity: WLS can only branch forwards and LE can only branch |
| 27 | /// backwards. These restrictions mean that this pass is dependent upon block |
| 28 | /// layout and block sizes, which is why it's the last pass to run. The same is |
| 29 | /// true for ConstantIslands, but this pass does not increase the size of the |
| 30 | /// basic blocks, nor does it change the CFG. Instructions are mainly removed |
| 31 | /// during the transform and pseudo instructions are replaced by real ones. In |
| 32 | /// some cases, when we have to revert to a 'normal' loop, we have to introduce |
| 33 | /// multiple instructions for a single pseudo (see RevertWhile and |
| 34 | /// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd |
| 35 | /// are defined to be as large as this maximum sequence of replacement |
| 36 | /// instructions. |
| 37 | /// |
| 38 | /// A note on VPR.P0 (the lane mask): |
| 39 | /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a |
| 40 | /// "VPT Active" context (which includes low-overhead loops and vpt blocks). |
| 41 | /// They will simply "and" the result of their calculation with the current |
| 42 | /// value of VPR.P0. You can think of it like this: |
| 43 | /// \verbatim |
| 44 | /// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs |
| 45 | /// VPR.P0 &= Value |
| 46 | /// else |
| 47 | /// VPR.P0 = Value |
| 48 | /// \endverbatim |
| 49 | /// When we're inside the low-overhead loop (between DLSTP and LETP), we always |
| 50 | /// fall in the "VPT active" case, so we can consider that all VPR writes by |
| 51 | /// one of those instructions is actually an "and". |
| 52 | //===----------------------------------------------------------------------===// |
| 53 | |
| 54 | #include "ARM.h" |
| 55 | #include "ARMBaseInstrInfo.h" |
| 56 | #include "ARMBasicBlockInfo.h" |
| 57 | #include "ARMSubtarget.h" |
| 58 | #include "MVETailPredUtils.h" |
| 59 | #include "Thumb2InstrInfo.h" |
| 60 | #include "llvm/ADT/SetVector.h" |
| 61 | #include "llvm/CodeGen/LivePhysRegs.h" |
| 62 | #include "llvm/CodeGen/MachineFrameInfo.h" |
| 63 | #include "llvm/CodeGen/MachineFunctionPass.h" |
| 64 | #include "llvm/CodeGen/MachineLoopInfo.h" |
| 65 | #include "llvm/CodeGen/MachineLoopUtils.h" |
| 66 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
| 67 | #include "llvm/CodeGen/Passes.h" |
| 68 | #include "llvm/CodeGen/ReachingDefAnalysis.h" |
| 69 | #include "llvm/MC/MCInstrDesc.h" |
| 70 | |
| 71 | using namespace llvm; |
| 72 | |
| 73 | #define DEBUG_TYPE "arm-low-overhead-loops" |
| 74 | #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" |
| 75 | |
// Debug/testing knob: force the pass to never form a tail-predicated
// (DLSTP/WLSTP ... LETP) loop, falling back to plain DLS/WLS loops.
static cl::opt<bool>
DisableTailPredication("arm-loloops-disable-tailpred" , cl::Hidden,
    cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass" ),
    cl::init(Val: false));

// Debug/testing knob: keep the redundant 'dls lr, lr' even where it could
// be omitted.
static cl::opt<bool>
DisableOmitDLS("arm-disable-omit-dls" , cl::Hidden,
    cl::desc("Disable omitting 'dls lr, lr' instructions" ),
    cl::init(Val: false));
| 85 | |
| 86 | static bool isVectorPredicated(MachineInstr *MI) { |
| 87 | int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI); |
| 88 | return PIdx != -1 && MI->getOperand(i: PIdx + 1).getReg() == ARM::VPR; |
| 89 | } |
| 90 | |
| 91 | static bool isVectorPredicate(MachineInstr *MI) { |
| 92 | return MI->findRegisterDefOperandIdx(Reg: ARM::VPR, /*TRI=*/nullptr) != -1; |
| 93 | } |
| 94 | |
| 95 | static bool hasVPRUse(MachineInstr &MI) { |
| 96 | return MI.findRegisterUseOperandIdx(Reg: ARM::VPR, /*TRI=*/nullptr) != -1; |
| 97 | } |
| 98 | |
| 99 | static bool isDomainMVE(MachineInstr *MI) { |
| 100 | uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask; |
| 101 | return Domain == ARMII::DomainMVE; |
| 102 | } |
| 103 | |
| 104 | static int getVecSize(const MachineInstr &MI) { |
| 105 | const MCInstrDesc &MCID = MI.getDesc(); |
| 106 | uint64_t Flags = MCID.TSFlags; |
| 107 | return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift; |
| 108 | } |
| 109 | |
| 110 | static bool shouldInspect(MachineInstr &MI) { |
| 111 | if (MI.isDebugInstr()) |
| 112 | return false; |
| 113 | return isDomainMVE(MI: &MI) || isVectorPredicate(MI: &MI) || hasVPRUse(MI); |
| 114 | } |
| 115 | |
| 116 | static bool isHorizontalReduction(const MachineInstr &MI) { |
| 117 | const MCInstrDesc &MCID = MI.getDesc(); |
| 118 | uint64_t Flags = MCID.TSFlags; |
| 119 | return (Flags & ARMII::HorizontalReduction) != 0; |
| 120 | } |
| 121 | |
| 122 | namespace { |
| 123 | |
| 124 | using InstSet = SmallPtrSetImpl<MachineInstr *>; |
| 125 | |
| 126 | class PostOrderLoopTraversal { |
| 127 | MachineLoop &ML; |
| 128 | MachineLoopInfo &MLI; |
| 129 | SmallPtrSet<MachineBasicBlock*, 4> Visited; |
| 130 | SmallVector<MachineBasicBlock*, 4> Order; |
| 131 | |
| 132 | public: |
| 133 | PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI) |
| 134 | : ML(ML), MLI(MLI) { } |
| 135 | |
| 136 | const SmallVectorImpl<MachineBasicBlock*> &getOrder() const { |
| 137 | return Order; |
| 138 | } |
| 139 | |
| 140 | // Visit all the blocks within the loop, as well as exit blocks and any |
| 141 | // blocks properly dominating the header. |
| 142 | void ProcessLoop() { |
| 143 | std::function<void(MachineBasicBlock *)> Search = |
| 144 | [this, &Search](MachineBasicBlock *MBB) -> void { |
| 145 | if (!Visited.insert(Ptr: MBB).second) |
| 146 | return; |
| 147 | |
| 148 | for (auto *Succ : MBB->successors()) { |
| 149 | if (!ML.contains(BB: Succ)) |
| 150 | continue; |
| 151 | Search(Succ); |
| 152 | } |
| 153 | Order.push_back(Elt: MBB); |
| 154 | }; |
| 155 | |
| 156 | // Insert exit blocks. |
| 157 | SmallVector<MachineBasicBlock*, 2> ExitBlocks; |
| 158 | ML.getExitBlocks(ExitBlocks); |
| 159 | append_range(C&: Order, R&: ExitBlocks); |
| 160 | |
| 161 | // Then add the loop body. |
| 162 | Search(ML.getHeader()); |
| 163 | |
| 164 | // Then try the preheader and its predecessors. |
| 165 | std::function<void(MachineBasicBlock*)> GetPredecessor = |
| 166 | [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void { |
| 167 | Order.push_back(Elt: MBB); |
| 168 | if (MBB->pred_size() == 1) |
| 169 | GetPredecessor(*MBB->pred_begin()); |
| 170 | }; |
| 171 | |
| 172 | if (auto * = ML.getLoopPreheader()) |
| 173 | GetPredecessor(Preheader); |
| 174 | else if (auto * = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true)) |
| 175 | GetPredecessor(Preheader); |
| 176 | } |
| 177 | }; |
| 178 | |
| 179 | class VPTBlock { |
| 180 | SmallVector<MachineInstr *, 4> Insts; |
| 181 | |
| 182 | public: |
| 183 | VPTBlock(MachineInstr *MI) { Insts.push_back(Elt: MI); } |
| 184 | |
| 185 | // Have we found an instruction within the block which defines the vpr? If |
| 186 | // so, not all the instructions in the block will have the same predicate. |
| 187 | bool hasUniformPredicate() { return getDivergent() == nullptr; } |
| 188 | |
| 189 | // If it exists, return the first internal instruction which modifies the |
| 190 | // VPR. |
| 191 | MachineInstr *getDivergent() { |
| 192 | SmallVectorImpl<MachineInstr *> &Insts = getInsts(); |
| 193 | for (unsigned i = 1; i < Insts.size(); ++i) { |
| 194 | MachineInstr *Next = Insts[i]; |
| 195 | if (isVectorPredicate(MI: Next)) |
| 196 | return Next; // Found an instruction altering the vpr. |
| 197 | } |
| 198 | return nullptr; |
| 199 | } |
| 200 | |
| 201 | void insert(MachineInstr *MI) { |
| 202 | Insts.push_back(Elt: MI); |
| 203 | // VPT/VPST + 4 predicated instructions. |
| 204 | assert(Insts.size() <= 5 && "Too many instructions in VPT block!" ); |
| 205 | } |
| 206 | |
| 207 | bool containsVCTP() const { return llvm::any_of(Range: Insts, P: isVCTP); } |
| 208 | |
| 209 | unsigned size() const { return Insts.size(); } |
| 210 | SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; } |
| 211 | }; |
| 212 | |
| 213 | // Represent the current state of the VPR and hold all instances which |
| 214 | // represent a VPT block, which is a list of instructions that begins with a |
| 215 | // VPT/VPST and has a maximum of four proceeding instructions. All |
| 216 | // instructions within the block are predicated upon the vpr and we allow |
| 217 | // instructions to define the vpr within in the block too. |
class VPTState {
  friend struct LowOverheadLoop;

  // All VPT blocks discovered so far, in program order.
  SmallVector<VPTBlock, 4> Blocks;
  // The set of instructions whose results are currently AND'ed together
  // to form the value of VPR.P0 (see the file header comment on VPR.P0).
  SetVector<MachineInstr *> CurrentPredicates;
  // For each recorded instruction, the snapshot of CurrentPredicates it
  // was executing under.
  std::map<MachineInstr *, SetVector<MachineInstr *>> PredicatedInsts;

  // Begin a new VPT block headed by MI, snapshotting the predicate set
  // that governs its execution.
  void CreateVPTBlock(MachineInstr *MI) {
    assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))
           && "Can't begin VPT without predicate" );
    Blocks.emplace_back(Args&: MI);
    // The execution of MI is predicated upon the current set of instructions
    // that are AND'ed together to form the VPR predicate value. In the case
    // that MI is a VPT, CurrentPredicates will also just be MI.
    PredicatedInsts[MI] = CurrentPredicates;
  }

  // Append MI to the block currently being built and record the predicate
  // set it runs under.
  void addInst(MachineInstr *MI) {
    Blocks.back().insert(MI);
    PredicatedInsts[MI] = CurrentPredicates;
  }

  // MI writes VPR in a "VPT active" context, so its value ANDs into the
  // current predicate: add it to the set.
  void addPredicate(MachineInstr *MI) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
    CurrentPredicates.insert(X: MI);
  }

  // MI overwrites VPR outright: the predicate is now exactly MI's result.
  void resetPredicate(MachineInstr *MI) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
    CurrentPredicates.clear();
    CurrentPredicates.insert(X: MI);
  }

public:
  // Return whether the given instruction is predicated upon a VCTP.
  // With Exclusive set, the VCTP must be the only predicate in MI's set.
  bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
    SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI];
    if (Exclusive && Predicates.size() != 1)
      return false;
    // We do not know how to convert an else predicate of a VCTP.
    if (getVPTInstrPredicate(MI: *MI) == ARMVCC::Else)
      return false;
    return llvm::any_of(Range&: Predicates, P: isVCTP);
  }

  // Is the VPST, controlling the block entry, predicated upon a VCTP.
  bool isEntryPredicatedOnVCTP(VPTBlock &Block, bool Exclusive = false) {
    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
    return isPredicatedOnVCTP(MI: Insts.front(), Exclusive);
  }

  // If this block begins with a VPT, we can check whether it's using
  // at least one predicated input(s), as well as possible loop invariant
  // which would result in it being implicitly predicated.
  bool hasImplicitlyValidVPT(VPTBlock &Block, ReachingDefAnalysis &RDA) {
    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
    MachineInstr *VPT = Insts.front();
    assert(isVPTOpcode(VPT->getOpcode()) &&
           "Expected VPT block to begin with VPT/VPST" );

    // A VPST takes its predicate from a register rather than computing it,
    // so it can never be implicitly valid.
    if (VPT->getOpcode() == ARM::MVE_VPST)
      return false;

    // If the VPT block does not define something that is an "output", then
    // the tail-predicated version will just perform a subset of the original
    // vpt block, where the last lanes should not be used.
    if (isVPTOpcode(Opc: VPT->getOpcode()) &&
        all_of(Range&: Block.getInsts(), P: [](const MachineInstr *MI) {
          return !MI->mayStore() && !MI->mayLoad() &&
                 !isHorizontalReduction(MI: *MI) && !isVCTP(MI);
        }))
      return true;

    // Is operand Idx of MI produced by an instruction that is itself
    // predicated on a VCTP?
    auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
      MachineInstr *Op = RDA.getMIOperand(MI, MO&: MI->getOperand(i: Idx));
      return Op && PredicatedInsts.count(x: Op) && isPredicatedOnVCTP(MI: Op);
    };

    // Is operand Idx of MI invariant with respect to the VPT's block, i.e.
    // no reaching definition originates from within that block?
    auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {
      MachineOperand &MO = MI->getOperand(i: Idx);
      if (!MO.isReg() || !MO.getReg())
        return true;

      SmallPtrSet<MachineInstr *, 2> Defs;
      RDA.getGlobalReachingDefs(MI, Reg: MO.getReg(), Defs);
      if (Defs.empty())
        return true;

      for (auto *Def : Defs)
        if (Def->getParent() == VPT->getParent())
          return false;
      return true;
    };

    // Check that at least one of the operands is directly predicated on a
    // vctp and allow an invariant value too.
    return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&
           (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&
           (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));
  }

  // Return true if every recorded VPT block is compatible with conversion
  // to a tail-predicated loop.
  bool isValid(ReachingDefAnalysis &RDA) {
    // All predication within the loop should be based on vctp. If the block
    // isn't predicated on entry, check whether the vctp is within the block
    // and that all other instructions are then predicated on it.
    for (auto &Block : Blocks) {
      if (isEntryPredicatedOnVCTP(Block, Exclusive: false) &&
          !any_of(Range: drop_begin(RangeOrContainer&: Block.getInsts()), P: [](const MachineInstr *MI) {
            return getVPTInstrPredicate(MI: *MI) == ARMVCC::Else;
          }))
        continue;
      if (hasImplicitlyValidVPT(Block, RDA))
        continue;

      SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
      // We don't know how to convert a block with just a VPT;VCTP into
      // anything valid once we remove the VCTP. For now just bail out.
      assert(isVPTOpcode(Insts.front()->getOpcode()) &&
             "Expected VPT block to start with a VPST or VPT!" );
      if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&
          isVCTP(MI: Insts.back()))
        return false;

      for (auto *MI : Insts) {
        // Check that any internal VCTPs are 'Then' predicated.
        if (isVCTP(MI) && getVPTInstrPredicate(MI: *MI) != ARMVCC::Then)
          return false;
        // Skip other instructions that build up the predicate.
        if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
          continue;
        // Check that any other instructions are predicated upon a vctp.
        // TODO: We could infer when VPTs are implicitly predicated on the
        // vctp (when the operands are predicated).
        if (!isPredicatedOnVCTP(MI)) {
          LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
          return false;
        }
      }
    }
    return true;
  }
};
| 360 | |
| 361 | struct LowOverheadLoop { |
| 362 | |
| 363 | MachineLoop &ML; |
| 364 | MachineBasicBlock * = nullptr; |
| 365 | MachineLoopInfo &MLI; |
| 366 | ReachingDefAnalysis &RDA; |
| 367 | const TargetRegisterInfo &TRI; |
| 368 | const ARMBaseInstrInfo &TII; |
| 369 | MachineFunction *MF = nullptr; |
| 370 | MachineBasicBlock::iterator StartInsertPt; |
| 371 | MachineBasicBlock *StartInsertBB = nullptr; |
| 372 | MachineInstr *Start = nullptr; |
| 373 | MachineInstr *Dec = nullptr; |
| 374 | MachineInstr *End = nullptr; |
| 375 | MachineOperand TPNumElements; |
| 376 | SmallVector<MachineInstr *, 4> VCTPs; |
| 377 | SmallPtrSet<MachineInstr *, 4> ToRemove; |
| 378 | SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute; |
| 379 | SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs; |
| 380 | SmallPtrSet<MachineInstr *, 4> VMOVCopies; |
| 381 | bool Revert = false; |
| 382 | bool CannotTailPredicate = false; |
| 383 | VPTState VPTstate; |
| 384 | |
| 385 | LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI, |
| 386 | ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI, |
| 387 | const ARMBaseInstrInfo &TII) |
| 388 | : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII), |
| 389 | TPNumElements(MachineOperand::CreateImm(Val: 0)) { |
| 390 | MF = ML.getHeader()->getParent(); |
| 391 | if (auto *MBB = ML.getLoopPreheader()) |
| 392 | Preheader = MBB; |
| 393 | else if (auto *MBB = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true)) |
| 394 | Preheader = MBB; |
| 395 | } |
| 396 | |
| 397 | // If this is an MVE instruction, check that we know how to use tail |
| 398 | // predication with it. Record VPT blocks and return whether the |
| 399 | // instruction is valid for tail predication. |
| 400 | bool ValidateMVEInst(MachineInstr *MI); |
| 401 | |
| 402 | void AnalyseMVEInst(MachineInstr *MI) { |
| 403 | CannotTailPredicate = !ValidateMVEInst(MI); |
| 404 | } |
| 405 | |
| 406 | bool IsTailPredicationLegal() const { |
| 407 | // For now, let's keep things really simple and only support a single |
| 408 | // block for tail predication. |
| 409 | return !Revert && FoundAllComponents() && !VCTPs.empty() && |
| 410 | !CannotTailPredicate && ML.getNumBlocks() == 1; |
| 411 | } |
| 412 | |
| 413 | // Given that MI is a VCTP, check that is equivalent to any other VCTPs |
| 414 | // found. |
| 415 | bool AddVCTP(MachineInstr *MI); |
| 416 | |
| 417 | // Check that the predication in the loop will be equivalent once we |
| 418 | // perform the conversion. Also ensure that we can provide the number |
| 419 | // of elements to the loop start instruction. |
| 420 | bool ValidateTailPredicate(); |
| 421 | |
| 422 | // Check that any values available outside of the loop will be the same |
| 423 | // after tail predication conversion. |
| 424 | bool ValidateLiveOuts(); |
| 425 | |
| 426 | // Check the branch targets are within range and we satisfy our |
| 427 | // restrictions. |
| 428 | void Validate(ARMBasicBlockUtils *BBUtils); |
| 429 | |
| 430 | bool FoundAllComponents() const { |
| 431 | return Start && Dec && End; |
| 432 | } |
| 433 | |
| 434 | SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTstate.Blocks; } |
| 435 | |
| 436 | // Return the operand for the loop start instruction. This will be the loop |
| 437 | // iteration count, or the number of elements if we're tail predicating. |
| 438 | MachineOperand &getLoopStartOperand() { |
| 439 | if (IsTailPredicationLegal()) |
| 440 | return TPNumElements; |
| 441 | return Start->getOperand(i: 1); |
| 442 | } |
| 443 | |
| 444 | unsigned getStartOpcode() const { |
| 445 | bool IsDo = isDoLoopStart(MI: *Start); |
| 446 | if (!IsTailPredicationLegal()) |
| 447 | return IsDo ? ARM::t2DLS : ARM::t2WLS; |
| 448 | |
| 449 | return VCTPOpcodeToLSTP(Opcode: VCTPs.back()->getOpcode(), IsDoLoop: IsDo); |
| 450 | } |
| 451 | |
| 452 | void dump() const { |
| 453 | if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; |
| 454 | if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; |
| 455 | if (End) dbgs() << "ARM Loops: Found Loop End: " << *End; |
| 456 | if (!VCTPs.empty()) { |
| 457 | dbgs() << "ARM Loops: Found VCTP(s):\n" ; |
| 458 | for (auto *MI : VCTPs) |
| 459 | dbgs() << " - " << *MI; |
| 460 | } |
| 461 | if (!FoundAllComponents()) |
| 462 | dbgs() << "ARM Loops: Not a low-overhead loop.\n" ; |
| 463 | else if (!(Start && Dec && End)) |
| 464 | dbgs() << "ARM Loops: Failed to find all loop components.\n" ; |
| 465 | } |
| 466 | }; |
| 467 | |
// The machine-function pass itself: finds low-overhead-loop pseudos in each
// loop and either expands them into DLS(TP)/WLS(TP)/LE(TP) or reverts them
// to ordinary compare-and-branch sequences.
class ARMLowOverheadLoops : public MachineFunctionPass {
  MachineFunction *MF = nullptr;
  MachineLoopInfo *MLI = nullptr;
  ReachingDefAnalysis *RDA = nullptr;
  const ARMBaseInstrInfo *TII = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;

public:
  static char ID;

  ARMLowOverheadLoops() : MachineFunctionPass(ID) { }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The pass removes/replaces instructions but never changes the CFG
    // (see the file header comment).
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<ReachingDefAnalysis>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  MachineFunctionProperties getRequiredProperties() const override {
    // Runs after register allocation and requires liveness tracking.
    return MachineFunctionProperties().setNoVRegs().setTracksLiveness();
  }

  StringRef getPassName() const override {
    return ARM_LOW_OVERHEAD_LOOPS_NAME;
  }

private:
  // Process one loop (and, via recursion elsewhere, its children).
  bool ProcessLoop(MachineLoop *ML);

  // Revert any loop pseudos left outside of recognized loops.
  bool RevertNonLoops();

  // Lower a t2WhileLoopStart / t2DoLoopStart back to ordinary code.
  void RevertWhile(MachineInstr *MI) const;
  void RevertDo(MachineInstr *MI) const;

  bool RevertLoopDec(MachineInstr *MI) const;

  void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;

  void RevertLoopEndDec(MachineInstr *MI) const;

  // Remove VPST/adjust VPT masks once the loop is tail predicated.
  void ConvertVPTBlocks(LowOverheadLoop &LoLoop);

  // Replace the start pseudo with the real DLS/WLS/DLSTP/WLSTP.
  MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);

  // Perform the full conversion (or reversion) of one analysed loop.
  void Expand(LowOverheadLoop &LoLoop);

  // Dead-code-eliminate the now-unneeded iteration-count computation.
  void IterationCountDCE(LowOverheadLoop &LoLoop);
};
| 521 | } |
| 522 | |
char ARMLowOverheadLoops::ID = 0;

// Register the pass with LLVM's pass registry under DEBUG_TYPE.
INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
                false, false)
| 527 | |
// Try to mark MI (and everything it transitively makes dead) for removal,
// adding the dead instructions to ToRemove. Instructions in Ignore are not
// counted as remaining users. Returns false - removing nothing - if removal
// would leave a partially-emptied IT block behind.
static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
                      InstSet &ToRemove, InstSet &Ignore) {

  // Check that we can remove all of Killed without having to modify any IT
  // blocks.
  auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
    // Collect the dead code and the MBBs in which they reside.
    SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
    for (auto *Dead : Killed)
      BasicBlocks.insert(Ptr: Dead->getParent());

    // Collect IT blocks in all affected basic blocks.
    std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
    for (auto *MBB : BasicBlocks) {
      for (auto &IT : *MBB) {
        if (IT.getOpcode() != ARM::t2IT)
          continue;
        // The members of an IT block are the local users of its ITSTATE.
        RDA.getReachingLocalUses(MI: &IT, Reg: MCRegister::from(Val: ARM::ITSTATE),
                                 Uses&: ITBlocks[&IT]);
      }
    }

    // If we're removing all of the instructions within an IT block, then
    // also remove the IT instruction.
    SmallPtrSet<MachineInstr *, 2> ModifiedITs;
    SmallPtrSet<MachineInstr *, 2> RemoveITs;
    for (auto *Dead : Killed) {
      if (MachineOperand *MO =
              Dead->findRegisterUseOperand(Reg: ARM::ITSTATE, /*TRI=*/nullptr)) {
        MachineInstr *IT = RDA.getMIOperand(MI: Dead, MO&: *MO);
        RemoveITs.insert(Ptr: IT);
        auto &CurrentBlock = ITBlocks[IT];
        CurrentBlock.erase(Ptr: Dead);
        // An IT whose block becomes empty can be deleted; one that keeps
        // some members would need its mask rewritten, which we don't do.
        if (CurrentBlock.empty())
          ModifiedITs.erase(Ptr: IT);
        else
          ModifiedITs.insert(Ptr: IT);
      }
    }
    if (!ModifiedITs.empty())
      return false;
    Killed.insert_range(R&: RemoveITs);
    return true;
  };

  SmallPtrSet<MachineInstr *, 2> Uses;
  if (!RDA.isSafeToRemove(MI, ToRemove&: Uses, Ignore))
    return false;

  if (WontCorruptITs(Uses, RDA)) {
    ToRemove.insert_range(R&: Uses);
    LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
               << " - can also remove:\n" ;
               for (auto *Use : Uses)
                 dbgs() << "   - " << *Use);

    // Removing MI may also kill the instructions that only fed it; fold
    // those in too when it's IT-safe.
    SmallPtrSet<MachineInstr*, 4> Killed;
    RDA.collectKilledOperands(MI, Dead&: Killed);
    if (WontCorruptITs(Killed, RDA)) {
      ToRemove.insert_range(R&: Killed);
      LLVM_DEBUG(for (auto *Dead : Killed)
                   dbgs() << "   - " << *Dead);
    }
    return true;
  }
  return false;
}
| 595 | |
| 596 | bool LowOverheadLoop::ValidateTailPredicate() { |
| 597 | if (!IsTailPredicationLegal()) { |
| 598 | LLVM_DEBUG(if (VCTPs.empty()) |
| 599 | dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n" ; |
| 600 | dbgs() << "ARM Loops: Tail-predication is not valid.\n" ); |
| 601 | return false; |
| 602 | } |
| 603 | |
| 604 | assert(!VCTPs.empty() && "VCTP instruction expected but is not set" ); |
| 605 | assert(ML.getBlocks().size() == 1 && |
| 606 | "Shouldn't be processing a loop with more than one block" ); |
| 607 | |
| 608 | if (DisableTailPredication) { |
| 609 | LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n" ); |
| 610 | return false; |
| 611 | } |
| 612 | |
| 613 | if (!VPTstate.isValid(RDA)) { |
| 614 | LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n" ); |
| 615 | return false; |
| 616 | } |
| 617 | |
| 618 | if (!ValidateLiveOuts()) { |
| 619 | LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n" ); |
| 620 | return false; |
| 621 | } |
| 622 | |
| 623 | // For tail predication, we need to provide the number of elements, instead |
| 624 | // of the iteration count, to the loop start instruction. The number of |
| 625 | // elements is provided to the vctp instruction, so we need to check that |
| 626 | // we can use this register at InsertPt. |
| 627 | MachineInstr *VCTP = VCTPs.back(); |
| 628 | if (Start->getOpcode() == ARM::t2DoLoopStartTP || |
| 629 | Start->getOpcode() == ARM::t2WhileLoopStartTP) { |
| 630 | TPNumElements = Start->getOperand(i: 2); |
| 631 | StartInsertPt = Start; |
| 632 | StartInsertBB = Start->getParent(); |
| 633 | } else { |
| 634 | TPNumElements = VCTP->getOperand(i: 1); |
| 635 | MCRegister NumElements = TPNumElements.getReg().asMCReg(); |
| 636 | |
| 637 | // If the register is defined within loop, then we can't perform TP. |
| 638 | // TODO: Check whether this is just a mov of a register that would be |
| 639 | // available. |
| 640 | if (RDA.hasLocalDefBefore(MI: VCTP, Reg: NumElements)) { |
| 641 | LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n" ); |
| 642 | return false; |
| 643 | } |
| 644 | |
| 645 | // The element count register maybe defined after InsertPt, in which case we |
| 646 | // need to try to move either InsertPt or the def so that the [w|d]lstp can |
| 647 | // use the value. |
| 648 | |
| 649 | if (StartInsertPt != StartInsertBB->end() && |
| 650 | !RDA.isReachingDefLiveOut(MI: &*StartInsertPt, Reg: NumElements)) { |
| 651 | if (auto *ElemDef = |
| 652 | RDA.getLocalLiveOutMIDef(MBB: StartInsertBB, Reg: NumElements)) { |
| 653 | if (RDA.isSafeToMoveForwards(From: ElemDef, To: &*StartInsertPt)) { |
| 654 | ElemDef->removeFromParent(); |
| 655 | StartInsertBB->insert(I: StartInsertPt, MI: ElemDef); |
| 656 | LLVM_DEBUG(dbgs() |
| 657 | << "ARM Loops: Moved element count def: " << *ElemDef); |
| 658 | } else if (RDA.isSafeToMoveBackwards(From: &*StartInsertPt, To: ElemDef)) { |
| 659 | StartInsertPt->removeFromParent(); |
| 660 | StartInsertBB->insertAfter(I: MachineBasicBlock::iterator(ElemDef), |
| 661 | MI: &*StartInsertPt); |
| 662 | LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); |
| 663 | } else { |
| 664 | // If we fail to move an instruction and the element count is provided |
| 665 | // by a mov, use the mov operand if it will have the same value at the |
| 666 | // insertion point |
| 667 | MachineOperand Operand = ElemDef->getOperand(i: 1); |
| 668 | if (isMovRegOpcode(Opc: ElemDef->getOpcode()) && |
| 669 | RDA.getUniqueReachingMIDef(MI: ElemDef, Reg: Operand.getReg().asMCReg()) == |
| 670 | RDA.getUniqueReachingMIDef(MI: &*StartInsertPt, |
| 671 | Reg: Operand.getReg().asMCReg())) { |
| 672 | TPNumElements = Operand; |
| 673 | NumElements = TPNumElements.getReg(); |
| 674 | } else { |
| 675 | LLVM_DEBUG(dbgs() |
| 676 | << "ARM Loops: Unable to move element count to loop " |
| 677 | << "start instruction.\n" ); |
| 678 | return false; |
| 679 | } |
| 680 | } |
| 681 | } |
| 682 | } |
| 683 | |
| 684 | // Especially in the case of while loops, InsertBB may not be the |
| 685 | // preheader, so we need to check that the register isn't redefined |
| 686 | // before entering the loop. |
| 687 | auto CannotProvideElements = [this](MachineBasicBlock *MBB, |
| 688 | MCRegister NumElements) { |
| 689 | if (MBB->empty()) |
| 690 | return false; |
| 691 | // NumElements is redefined in this block. |
| 692 | if (RDA.hasLocalDefBefore(MI: &MBB->back(), Reg: NumElements)) |
| 693 | return true; |
| 694 | |
| 695 | // Don't continue searching up through multiple predecessors. |
| 696 | if (MBB->pred_size() > 1) |
| 697 | return true; |
| 698 | |
| 699 | return false; |
| 700 | }; |
| 701 | |
| 702 | // Search backwards for a def, until we get to InsertBB. |
| 703 | MachineBasicBlock *MBB = Preheader; |
| 704 | while (MBB && MBB != StartInsertBB) { |
| 705 | if (CannotProvideElements(MBB, NumElements)) { |
| 706 | LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n" ); |
| 707 | return false; |
| 708 | } |
| 709 | MBB = *MBB->pred_begin(); |
| 710 | } |
| 711 | } |
| 712 | |
| 713 | // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect |
| 714 | // world the [w|d]lstp instruction would be last instruction in the preheader |
| 715 | // and so it would only affect instructions within the loop body. But due to |
| 716 | // scheduling, and/or the logic in this pass (above), the insertion point can |
| 717 | // be moved earlier. So if the Loop Start isn't the last instruction in the |
| 718 | // preheader, and if the initial element count is smaller than the vector |
| 719 | // width, the Loop Start instruction will immediately generate one or more |
| 720 | // false lane mask which can, incorrectly, affect the proceeding MVE |
| 721 | // instructions in the preheader. |
| 722 | if (std::any_of(first: StartInsertPt, last: StartInsertBB->end(), pred: shouldInspect)) { |
| 723 | LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n" ); |
| 724 | return false; |
| 725 | } |
| 726 | |
| 727 | // For any DoubleWidthResultInstrs we found whilst scanning instructions, they |
| 728 | // need to compute an output size that is smaller than the VCTP mask operates |
| 729 | // on. The VecSize of the DoubleWidthResult is the larger vector size - the |
| 730 | // size it extends into, so any VCTP VecSize <= is valid. |
| 731 | unsigned VCTPVecSize = getVecSize(MI: *VCTP); |
| 732 | for (MachineInstr *MI : DoubleWidthResultInstrs) { |
| 733 | unsigned InstrVecSize = getVecSize(MI: *MI); |
| 734 | if (InstrVecSize > VCTPVecSize) { |
| 735 | LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP " |
| 736 | << "VecSize:\n" << *MI); |
| 737 | return false; |
| 738 | } |
| 739 | } |
| 740 | |
| 741 | // Check that the value change of the element count is what we expect and |
| 742 | // that the predication will be equivalent. For this we need: |
| 743 | // NumElements = NumElements - VectorWidth. The sub will be a sub immediate |
| 744 | // and we can also allow register copies within the chain too. |
| 745 | auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) { |
| 746 | return -getAddSubImmediate(MI&: *MI) == ExpectedVecWidth; |
| 747 | }; |
| 748 | |
| 749 | MachineBasicBlock *MBB = VCTP->getParent(); |
| 750 | // Remove modifications to the element count since they have no purpose in a |
| 751 | // tail predicated loop. Explicitly refer to the vctp operand no matter which |
| 752 | // register NumElements has been assigned to, since that is what the |
| 753 | // modifications will be using |
| 754 | if (auto *Def = RDA.getUniqueReachingMIDef( |
| 755 | MI: &MBB->back(), Reg: VCTP->getOperand(i: 1).getReg().asMCReg())) { |
| 756 | SmallPtrSet<MachineInstr*, 2> ElementChain; |
| 757 | SmallPtrSet<MachineInstr*, 2> Ignore; |
| 758 | unsigned ExpectedVectorWidth = getTailPredVectorWidth(Opcode: VCTP->getOpcode()); |
| 759 | |
| 760 | Ignore.insert_range(R&: VCTPs); |
| 761 | |
| 762 | if (TryRemove(MI: Def, RDA, ToRemove&: ElementChain, Ignore)) { |
| 763 | bool FoundSub = false; |
| 764 | |
| 765 | for (auto *MI : ElementChain) { |
| 766 | if (isMovRegOpcode(Opc: MI->getOpcode())) |
| 767 | continue; |
| 768 | |
| 769 | if (isSubImmOpcode(Opc: MI->getOpcode())) { |
| 770 | if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) { |
| 771 | LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" |
| 772 | " count: " << *MI); |
| 773 | return false; |
| 774 | } |
| 775 | FoundSub = true; |
| 776 | } else { |
| 777 | LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" |
| 778 | " count: " << *MI); |
| 779 | return false; |
| 780 | } |
| 781 | } |
| 782 | ToRemove.insert_range(R&: ElementChain); |
| 783 | } |
| 784 | } |
| 785 | |
| 786 | // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we |
| 787 | // can also remove any extra instructions in the preheader, which often |
| 788 | // includes a now unused MOV. |
| 789 | if ((Start->getOpcode() == ARM::t2DoLoopStartTP || |
| 790 | Start->getOpcode() == ARM::t2WhileLoopStartTP) && |
| 791 | Preheader && !Preheader->empty() && |
| 792 | !RDA.hasLocalDefBefore(MI: VCTP, Reg: VCTP->getOperand(i: 1).getReg())) { |
| 793 | if (auto *Def = RDA.getUniqueReachingMIDef( |
| 794 | MI: &Preheader->back(), Reg: VCTP->getOperand(i: 1).getReg().asMCReg())) { |
| 795 | SmallPtrSet<MachineInstr *, 2> Ignore(llvm::from_range, VCTPs); |
| 796 | TryRemove(MI: Def, RDA, ToRemove, Ignore); |
| 797 | } |
| 798 | } |
| 799 | |
| 800 | return true; |
| 801 | } |
| 802 | |
| 803 | static bool isRegInClass(const MachineOperand &MO, |
| 804 | const TargetRegisterClass *Class) { |
| 805 | return MO.isReg() && MO.getReg() && Class->contains(Reg: MO.getReg()); |
| 806 | } |
| 807 | |
| 808 | // MVE 'narrowing' operate on half a lane, reading from half and writing |
| 809 | // to half, which are referred to has the top and bottom half. The other |
| 810 | // half retains its previous value. |
| 811 | static bool retainsPreviousHalfElement(const MachineInstr &MI) { |
| 812 | const MCInstrDesc &MCID = MI.getDesc(); |
| 813 | uint64_t Flags = MCID.TSFlags; |
| 814 | return (Flags & ARMII::RetainsPreviousHalfElement) != 0; |
| 815 | } |
| 816 | |
| 817 | // Some MVE instructions read from the top/bottom halves of their operand(s) |
| 818 | // and generate a vector result with result elements that are double the |
| 819 | // width of the input. |
| 820 | static bool producesDoubleWidthResult(const MachineInstr &MI) { |
| 821 | const MCInstrDesc &MCID = MI.getDesc(); |
| 822 | uint64_t Flags = MCID.TSFlags; |
| 823 | return (Flags & ARMII::DoubleWidthResult) != 0; |
| 824 | } |
| 825 | |
| 826 | // Can this instruction generate a non-zero result when given only zeroed |
| 827 | // operands? This allows us to know that, given operands with false bytes |
| 828 | // zeroed by masked loads, that the result will also contain zeros in those |
| 829 | // bytes. |
| 830 | static bool canGenerateNonZeros(const MachineInstr &MI) { |
| 831 | |
| 832 | // Check for instructions which can write into a larger element size, |
| 833 | // possibly writing into a previous zero'd lane. |
| 834 | if (producesDoubleWidthResult(MI)) |
| 835 | return true; |
| 836 | |
| 837 | switch (MI.getOpcode()) { |
| 838 | default: |
| 839 | break; |
| 840 | // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow |
| 841 | // fp16 -> fp32 vector conversions. |
| 842 | // Instructions that perform a NOT will generate 1s from 0s. |
| 843 | case ARM::MVE_VMVN: |
| 844 | case ARM::MVE_VORN: |
| 845 | // Count leading zeros will do just that! |
| 846 | case ARM::MVE_VCLZs8: |
| 847 | case ARM::MVE_VCLZs16: |
| 848 | case ARM::MVE_VCLZs32: |
| 849 | return true; |
| 850 | } |
| 851 | return false; |
| 852 | } |
| 853 | |
// Look at its register uses to see if it only can only receive zeros
// into its false lanes which would then produce zeros. Also check that
// the output register is also defined by an FalseLanesZero instruction
// so that if tail-predication happens, the lanes that aren't updated will
// still be zeros.
//
// \param MI             the instruction to inspect.
// \param QPRs           the MVE Q-register class, used to filter operands.
// \param RDA            reaching-def analysis used to find operand producers.
// \param FalseLanesZero set of instructions already known to produce zeros in
//                       their false lanes.
// \returns true if MI is guaranteed to produce zeros in its false lanes.
static bool producesFalseLanesZero(MachineInstr &MI,
                                   const TargetRegisterClass *QPRs,
                                   const ReachingDefAnalysis &RDA,
                                   InstSet &FalseLanesZero) {
  // If the instruction can create non-zero bits from zeroed inputs (NOT,
  // CLZ, widening ops), it can never qualify.
  if (canGenerateNonZeros(MI))
    return false;

  bool isPredicated = isVectorPredicated(MI: &MI);
  // Predicated loads will write zeros to the falsely predicated bytes of the
  // destination register.
  if (MI.mayLoad())
    return isPredicated;

  // Recognise an unpredicated 'vmov.i32 q, #0', which zeroes the whole
  // register and so trivially has zeroed false lanes.
  auto IsZeroInit = [](MachineInstr *Def) {
    return !isVectorPredicated(MI: Def) &&
           Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
           Def->getOperand(i: 1).getImm() == 0;
  };

  // Horizontal reductions consume scalars too, so scalar operands may be
  // skipped for them; otherwise only Q-register operands are inspected.
  bool AllowScalars = isHorizontalReduction(MI);
  for (auto &MO : MI.operands()) {
    if (!MO.isReg() || !MO.getReg())
      continue;
    if (!isRegInClass(MO, Class: QPRs) && AllowScalars)
      continue;
    // Skip the lr predicate reg
    int PIdx = llvm::findFirstVPTPredOperandIdx(MI);
    if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2)
      continue;

    // Check that this instruction will produce zeros in its false lanes:
    // - If it only consumes false lanes zero or constant 0 (vmov #0)
    // - If it's predicated, it only matters that it's def register already has
    //   false lane zeros, so we can ignore the uses.
    SmallPtrSet<MachineInstr *, 2> Defs;
    RDA.getGlobalReachingDefs(MI: &MI, Reg: MO.getReg(), Defs);
    // Conservatively reject operands with no visible producer.
    if (Defs.empty())
      return false;
    for (auto *Def : Defs) {
      if (Def == &MI || FalseLanesZero.count(Ptr: Def) || IsZeroInit(Def))
        continue;
      if (MO.isUse() && isPredicated)
        continue;
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
  return true;
}
| 908 | |
| 909 | bool LowOverheadLoop::ValidateLiveOuts() { |
| 910 | // We want to find out if the tail-predicated version of this loop will |
| 911 | // produce the same values as the loop in its original form. For this to |
| 912 | // be true, the newly inserted implicit predication must not change the |
| 913 | // the (observable) results. |
| 914 | // We're doing this because many instructions in the loop will not be |
| 915 | // predicated and so the conversion from VPT predication to tail-predication |
| 916 | // can result in different values being produced; due to the tail-predication |
| 917 | // preventing many instructions from updating their falsely predicated |
| 918 | // lanes. This analysis assumes that all the instructions perform lane-wise |
| 919 | // operations and don't perform any exchanges. |
| 920 | // A masked load, whether through VPT or tail predication, will write zeros |
| 921 | // to any of the falsely predicated bytes. So, from the loads, we know that |
| 922 | // the false lanes are zeroed and here we're trying to track that those false |
| 923 | // lanes remain zero, or where they change, the differences are masked away |
| 924 | // by their user(s). |
| 925 | // All MVE stores have to be predicated, so we know that any predicate load |
| 926 | // operands, or stored results are equivalent already. Other explicitly |
| 927 | // predicated instructions will perform the same operation in the original |
| 928 | // loop and the tail-predicated form too. Because of this, we can insert |
| 929 | // loads, stores and other predicated instructions into our Predicated |
| 930 | // set and build from there. |
| 931 | const TargetRegisterClass *QPRs = TRI.getRegClass(i: ARM::MQPRRegClassID); |
| 932 | SetVector<MachineInstr *> FalseLanesUnknown; |
| 933 | SmallPtrSet<MachineInstr *, 4> FalseLanesZero; |
| 934 | SmallPtrSet<MachineInstr *, 4> Predicated; |
| 935 | MachineBasicBlock * = ML.getHeader(); |
| 936 | |
| 937 | LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n" ); |
| 938 | |
| 939 | for (auto &MI : *Header) { |
| 940 | if (!shouldInspect(MI)) |
| 941 | continue; |
| 942 | |
| 943 | if (isVCTP(MI: &MI) || isVPTOpcode(Opc: MI.getOpcode())) |
| 944 | continue; |
| 945 | |
| 946 | bool isPredicated = isVectorPredicated(MI: &MI); |
| 947 | bool retainsOrReduces = |
| 948 | retainsPreviousHalfElement(MI) || isHorizontalReduction(MI); |
| 949 | |
| 950 | if (isPredicated) |
| 951 | Predicated.insert(Ptr: &MI); |
| 952 | if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) |
| 953 | FalseLanesZero.insert(Ptr: &MI); |
| 954 | else if (MI.getNumDefs() == 0) |
| 955 | continue; |
| 956 | else if (!isPredicated && retainsOrReduces) { |
| 957 | LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI); |
| 958 | return false; |
| 959 | } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy) |
| 960 | FalseLanesUnknown.insert(X: &MI); |
| 961 | } |
| 962 | |
| 963 | LLVM_DEBUG({ |
| 964 | dbgs() << " Predicated:\n" ; |
| 965 | for (auto *I : Predicated) |
| 966 | dbgs() << " " << *I; |
| 967 | dbgs() << " FalseLanesZero:\n" ; |
| 968 | for (auto *I : FalseLanesZero) |
| 969 | dbgs() << " " << *I; |
| 970 | dbgs() << " FalseLanesUnknown:\n" ; |
| 971 | for (auto *I : FalseLanesUnknown) |
| 972 | dbgs() << " " << *I; |
| 973 | }); |
| 974 | |
| 975 | auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO, |
| 976 | SmallPtrSetImpl<MachineInstr *> &Predicated) { |
| 977 | SmallPtrSet<MachineInstr *, 2> Uses; |
| 978 | RDA.getGlobalUses(MI, Reg: MO.getReg().asMCReg(), Uses); |
| 979 | for (auto *Use : Uses) { |
| 980 | if (Use != MI && !Predicated.count(Ptr: Use)) |
| 981 | return false; |
| 982 | } |
| 983 | return true; |
| 984 | }; |
| 985 | |
| 986 | // Visit the unknowns in reverse so that we can start at the values being |
| 987 | // stored and then we can work towards the leaves, hopefully adding more |
| 988 | // instructions to Predicated. Successfully terminating the loop means that |
| 989 | // all the unknown values have to found to be masked by predicated user(s). |
| 990 | // For any unpredicated values, we store them in NonPredicated so that we |
| 991 | // can later check whether these form a reduction. |
| 992 | SmallPtrSet<MachineInstr*, 2> NonPredicated; |
| 993 | for (auto *MI : reverse(C&: FalseLanesUnknown)) { |
| 994 | for (auto &MO : MI->operands()) { |
| 995 | if (!isRegInClass(MO, Class: QPRs) || !MO.isDef()) |
| 996 | continue; |
| 997 | if (!HasPredicatedUsers(MI, MO, Predicated)) { |
| 998 | LLVM_DEBUG(dbgs() << " Found an unknown def of : " |
| 999 | << TRI.getRegAsmName(MO.getReg()) << " at " << *MI); |
| 1000 | NonPredicated.insert(Ptr: MI); |
| 1001 | break; |
| 1002 | } |
| 1003 | } |
| 1004 | // Any unknown false lanes have been masked away by the user(s). |
| 1005 | if (!NonPredicated.contains(Ptr: MI)) |
| 1006 | Predicated.insert(Ptr: MI); |
| 1007 | } |
| 1008 | |
| 1009 | SmallPtrSet<MachineInstr *, 2> LiveOutMIs; |
| 1010 | SmallVector<MachineBasicBlock *, 2> ExitBlocks; |
| 1011 | ML.getExitBlocks(ExitBlocks); |
| 1012 | assert(ML.getNumBlocks() == 1 && "Expected single block loop!" ); |
| 1013 | assert(ExitBlocks.size() == 1 && "Expected a single exit block" ); |
| 1014 | MachineBasicBlock *ExitBB = ExitBlocks.front(); |
| 1015 | for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) { |
| 1016 | // TODO: Instead of blocking predication, we could move the vctp to the exit |
| 1017 | // block and calculate it's operand there in or the preheader. |
| 1018 | if (RegMask.PhysReg == ARM::VPR) { |
| 1019 | LLVM_DEBUG(dbgs() << " VPR is live in to the exit block." ); |
| 1020 | return false; |
| 1021 | } |
| 1022 | // Check Q-regs that are live in the exit blocks. We don't collect scalars |
| 1023 | // because they won't be affected by lane predication. |
| 1024 | if (QPRs->contains(Reg: RegMask.PhysReg)) |
| 1025 | if (auto *MI = RDA.getLocalLiveOutMIDef(MBB: Header, Reg: RegMask.PhysReg)) |
| 1026 | LiveOutMIs.insert(Ptr: MI); |
| 1027 | } |
| 1028 | |
| 1029 | // We've already validated that any VPT predication within the loop will be |
| 1030 | // equivalent when we perform the predication transformation; so we know that |
| 1031 | // any VPT predicated instruction is predicated upon VCTP. Any live-out |
| 1032 | // instruction needs to be predicated, so check this here. The instructions |
| 1033 | // in NonPredicated have been found to be a reduction that we can ensure its |
| 1034 | // legality. Any MQPRCopy found will need to validate its input as if it was |
| 1035 | // live out. |
| 1036 | SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end()); |
| 1037 | while (!Worklist.empty()) { |
| 1038 | MachineInstr *MI = Worklist.pop_back_val(); |
| 1039 | if (MI->getOpcode() == ARM::MQPRCopy) { |
| 1040 | VMOVCopies.insert(Ptr: MI); |
| 1041 | MachineInstr *CopySrc = |
| 1042 | RDA.getUniqueReachingMIDef(MI, Reg: MI->getOperand(i: 1).getReg()); |
| 1043 | if (CopySrc) |
| 1044 | Worklist.push_back(Elt: CopySrc); |
| 1045 | } else if (NonPredicated.count(Ptr: MI) && FalseLanesUnknown.contains(key: MI)) { |
| 1046 | LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI); |
| 1047 | VMOVCopies.clear(); |
| 1048 | return false; |
| 1049 | } |
| 1050 | } |
| 1051 | |
| 1052 | return true; |
| 1053 | } |
| 1054 | |
// Final validation step once all loop components have been collected: check
// the WLS/LE branch ranges and decide whether tail-predication is possible.
// Sets Revert and/or CannotTailPredicate on failure.
void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
  // An earlier analysis step may already have decided to revert; nothing
  // further to check in that case.
  if (Revert)
    return;

  // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
  // can only jump back.
  auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,
                           ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
    // t2LoopEnd holds its target in operand 1; the combined t2LoopEndDec
    // form holds it in operand 2.
    MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
                                   ? End->getOperand(i: 1).getMBB()
                                   : End->getOperand(i: 2).getMBB();
    // TODO Maybe there's cases where the target doesn't have to be the header,
    // but for now be safe and revert.
    if (TgtBB != ML.getHeader()) {
      LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n" );
      return false;
    }

    // The WLS and LE instructions have 12-bits for the label offset. WLS
    // requires a positive offset, while LE uses negative.
    if (BBUtils->getOffsetOf(MI: End) < BBUtils->getOffsetOf(MBB: ML.getHeader()) ||
        !BBUtils->isBBInRange(MI: End, DestBB: ML.getHeader(), MaxDisp: 4094)) {
      LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n" );
      return false;
    }

    if (isWhileLoopStart(MI: *Start)) {
      MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(MI: *Start);
      // WLS must branch forwards, and likewise within the 12-bit range.
      if (BBUtils->getOffsetOf(MI: Start) > BBUtils->getOffsetOf(MBB: TargetBB) ||
          !BBUtils->isBBInRange(MI: Start, DestBB: TargetBB, MaxDisp: 4094)) {
        LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n" );
        return false;
      }
    }
    return true;
  };

  // Default insertion point for the LoopStart: directly at the existing
  // pseudo. ValidateTailPredicate may reposition it.
  StartInsertPt = MachineBasicBlock::iterator(Start);
  StartInsertBB = Start->getParent();
  LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
                    << *StartInsertPt);

  Revert = !ValidateRanges(Start, End, BBUtils, ML);
  CannotTailPredicate = !ValidateTailPredicate();
}
| 1100 | |
| 1101 | bool LowOverheadLoop::AddVCTP(MachineInstr *MI) { |
| 1102 | LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI); |
| 1103 | if (VCTPs.empty()) { |
| 1104 | VCTPs.push_back(Elt: MI); |
| 1105 | return true; |
| 1106 | } |
| 1107 | |
| 1108 | // If we find another VCTP, check whether it uses the same value as the main VCTP. |
| 1109 | // If it does, store it in the VCTPs set, else refuse it. |
| 1110 | MachineInstr *Prev = VCTPs.back(); |
| 1111 | if (!Prev->getOperand(i: 1).isIdenticalTo(Other: MI->getOperand(i: 1)) || |
| 1112 | !RDA.hasSameReachingDef(A: Prev, B: MI, Reg: MI->getOperand(i: 1).getReg().asMCReg())) { |
| 1113 | LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching " |
| 1114 | "definition from the main VCTP" ); |
| 1115 | return false; |
| 1116 | } |
| 1117 | VCTPs.push_back(Elt: MI); |
| 1118 | return true; |
| 1119 | } |
| 1120 | |
// Decide whether an unpredicated MVE vector store is acceptable in a
// tail-predicated loop. Only spills to a fixed stack slot qualify, and only
// when no block reachable after the loop reloads that same slot before
// overwriting it.
// \returns true if MI is such a safe stack spill.
static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) {

  // Extract the fixed-stack frame index from a memory operand, or -1 if the
  // operand doesn't reference a fixed stack slot.
  auto GetFrameIndex = [](MachineMemOperand *Operand) {
    const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
    if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
      if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(Val: PseudoValue)) {
        return FS->getFrameIndex();
      }
    }
    return -1;
  };

  // Is this instruction an SP-relative MVE word load/store of a fixed stack
  // slot?
  auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
    switch (I->getOpcode()) {
    case ARM::MVE_VSTRWU32:
    case ARM::MVE_VLDRWU32: {
      return I->getOperand(i: 1).getReg() == ARM::SP &&
             I->memoperands().size() == 1 &&
             GetFrameIndex(I->memoperands().front()) >= 0;
    }
    default:
      return false;
    }
  };

  // An unpredicated vector register spill is allowed if all of the uses of the
  // stack slot are within the loop
  if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))
    return false;

  // Search all blocks after the loop for accesses to the same stack slot.
  // ReachingDefAnalysis doesn't work for sp as it relies on registers being
  // live-out (which sp never is) to know what blocks to look in
  if (MI->memoperands().size() == 0)
    return false;
  int FI = GetFrameIndex(MI->memoperands().front());

  auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();
  if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(ObjectIdx: FI))
    return false;

  // Breadth-first walk of the CFG starting at the loop's exit blocks,
  // using Frontier as the worklist (Idx is the next unprocessed entry).
  SmallVector<MachineBasicBlock *> Frontier;
  ML->getExitBlocks(ExitBlocks&: Frontier);
  SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()};
  unsigned Idx = 0;
  while (Idx < Frontier.size()) {
    MachineBasicBlock *BB = Frontier[Idx];
    bool LookAtSuccessors = true;
    for (auto &I : *BB) {
      if (!IsStackOp(&I) || I.memoperands().size() == 0)
        continue;
      if (GetFrameIndex(I.memoperands().front()) != FI)
        continue;
      // If this block has a store to the stack slot before any loads then we
      // can ignore the block
      if (I.getOpcode() == ARM::MVE_VSTRWU32) {
        LookAtSuccessors = false;
        break;
      }
      // If the store and the load are using the same stack slot then the
      // store isn't valid for tail predication
      if (I.getOpcode() == ARM::MVE_VLDRWU32)
        return false;
    }

    // Only successors not yet visited or queued are added to the frontier.
    if (LookAtSuccessors) {
      for (auto *Succ : BB->successors()) {
        if (!Visited.contains(Ptr: Succ) && !is_contained(Range&: Frontier, Element: Succ))
          Frontier.push_back(Elt: Succ);
      }
    }
    Visited.insert(Ptr: BB);
    Idx++;
  }

  // No reload of the slot was found after the loop; the spill is safe.
  return true;
}
| 1198 | |
// Analyse a single instruction of the loop for tail-predication legality,
// recording VCTPs and building up VPT block state along the way.
// \returns true if the instruction is compatible with tail-predication.
bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
  // Once tail-predication has been ruled out, there's nothing more to track.
  if (CannotTailPredicate)
    return false;

  if (!shouldInspect(MI&: *MI))
    return true;

  if (MI->getOpcode() == ARM::MVE_VPSEL ||
      MI->getOpcode() == ARM::MVE_VPNOT) {
    // TODO: Allow VPSEL and VPNOT, we currently cannot because:
    // 1) It will use the VPR as a predicate operand, but doesn't have to be
    //    instead a VPT block, which means we can assert while building up
    //    the VPT block because we don't find another VPT or VPST to being a new
    //    one.
    // 2) VPSEL still requires a VPR operand even after tail predicating,
    //    which means we can't remove it unless there is another
    //    instruction, such as vcmp, that can provide the VPR def.
    return false;
  }

  // Record all VCTPs and check that they're equivalent to one another.
  if (isVCTP(MI) && !AddVCTP(MI))
    return false;

  // Inspect uses first so that any instructions that alter the VPR don't
  // alter the predicate upon themselves.
  const MCInstrDesc &MCID = MI->getDesc();
  bool IsUse = false;
  unsigned LastOpIdx = MI->getNumOperands() - 1;
  // Walk the operand descriptions in reverse, pairing each with the actual
  // operand at the mirrored index, looking for VPR uses.
  for (const auto &Op : enumerate(First: reverse(C: MCID.operands()))) {
    const MachineOperand &MO = MI->getOperand(i: LastOpIdx - Op.index());
    if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
      continue;

    // A VPR use is only acceptable through a proper vpred operand slot;
    // any other VPR-consuming instruction blocks the transform.
    if (ARM::isVpred(op: Op.value().OperandType)) {
      VPTstate.addInst(MI);
      IsUse = true;
    } else if (MI->getOpcode() != ARM::MVE_VPST) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
      return false;
    }
  }

  // If we find an instruction that has been marked as not valid for tail
  // predication, only allow the instruction if it's contained within a valid
  // VPT block.
  bool RequiresExplicitPredication =
      (MCID.TSFlags & ARMII::ValidForTailPredication) == 0;
  if (isDomainMVE(MI) && RequiresExplicitPredication) {
    if (MI->getOpcode() == ARM::MQPRCopy)
      return true;
    // Unpredicated double-width results are recorded for a later check
    // against the VCTP's vector size.
    if (!IsUse && producesDoubleWidthResult(MI: *MI)) {
      DoubleWidthResultInstrs.insert(Ptr: MI);
      return true;
    }

    LLVM_DEBUG(if (!IsUse) dbgs()
               << "ARM Loops: Can't tail predicate: " << *MI);
    return IsUse;
  }

  // If the instruction is already explicitly predicated, then the conversion
  // will be fine, but ensure that all store operations are predicated.
  if (MI->mayStore() && !ValidateMVEStore(MI, ML: &ML))
    return IsUse;

  // If this instruction defines the VPR, update the predicate for the
  // proceeding instructions.
  if (isVectorPredicate(MI)) {
    // Clear the existing predicate when we're not in VPT Active state,
    // otherwise we add to it.
    if (!isVectorPredicated(MI))
      VPTstate.resetPredicate(MI);
    else
      VPTstate.addPredicate(MI);
  }

  // Finally once the predicate has been modified, we can start a new VPT
  // block if necessary.
  if (isVPTOpcode(Opc: MI->getOpcode()))
    VPTstate.CreateVPTBlock(MI);

  return true;
}
| 1283 | |
// Pass entry point: process every outermost loop of the function and revert
// any stray loop pseudos outside loops.
// \returns true if the function was modified.
bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
  // Low-overhead branch extension is required for DLS/WLS/LE.
  const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>();
  if (!ST.hasLOB())
    return false;

  MF = &mf;
  LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n" );

  // Cache the analyses and target info the per-loop processing relies on.
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  RDA = &getAnalysis<ReachingDefAnalysis>();
  MF->getProperties().setTracksLiveness();
  MRI = &MF->getRegInfo();
  TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
  TRI = ST.getRegisterInfo();
  // Block offsets/sizes are needed for the WLS/LE branch-range checks.
  BBUtils = std::make_unique<ARMBasicBlockUtils>(args&: *MF);
  BBUtils->computeAllBlockSizes();
  BBUtils->adjustBBOffsetsAfter(MBB: &MF->front());

  bool Changed = false;
  // ProcessLoop recurses into inner loops itself, so only start from the
  // outermost ones.
  for (auto *ML : *MLI) {
    if (ML->isOutermost())
      Changed |= ProcessLoop(ML);
  }
  Changed |= RevertNonLoops();
  return Changed;
}
| 1310 | |
// Find the low-overhead loop pseudos (start, dec, end) for ML, validate them,
// and expand the loop into real DLS/WLS/LE (or tail-predicated) form, or
// revert to compare-and-branch if that isn't possible.
// \returns true if any code was changed (including in inner loops).
bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
  bool Changed = false;

  // Process inner loops first.
  for (MachineLoop *L : *ML)
    Changed |= ProcessLoop(ML: L);

  LLVM_DEBUG({
    dbgs() << "ARM Loops: Processing loop containing:\n";
    if (auto *Preheader = ML->getLoopPreheader())
      dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
    else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true))
      dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
    for (auto *MBB : ML->getBlocks())
      dbgs() << " - Block: " << printMBBReference(*MBB) << "\n";
  });

  // Search the given block for a loop start instruction. If one isn't found,
  // and there's only one predecessor block, search that one too.
  std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
      [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
    for (auto &MI : *MBB) {
      if (isLoopStart(MI))
        return &MI;
    }
    if (MBB->pred_size() == 1)
      return SearchForStart(*MBB->pred_begin());
    return nullptr;
  };

  LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
  // Search the preheader for the start intrinsic.
  // FIXME: I don't see why we shouldn't be supporting multiple predecessors
  // with potentially multiple set.loop.iterations, so we need to enable this.
  if (LoLoop.Preheader)
    LoLoop.Start = SearchForStart(LoLoop.Preheader);
  else
    return Changed;

  // Find the low-overhead loop components and decide whether or not to fall
  // back to a normal loop. Also look for a vctp instructions and decide
  // whether we can convert that predicate using tail predication.
  for (auto *MBB : reverse(C: ML->getBlocks())) {
    for (auto &MI : *MBB) {
      if (MI.isDebugValue())
        continue;
      else if (MI.getOpcode() == ARM::t2LoopDec)
        LoLoop.Dec = &MI;
      else if (MI.getOpcode() == ARM::t2LoopEnd)
        LoLoop.End = &MI;
      else if (MI.getOpcode() == ARM::t2LoopEndDec)
        LoLoop.End = LoLoop.Dec = &MI;
      else if (isLoopStart(MI))
        LoLoop.Start = &MI;
      else if (MI.getDesc().isCall()) {
        // TODO: Though the call will require LE to execute again, does this
        // mean we should revert? Always executing LE hopefully should be
        // faster than performing a sub,cmp,br or even subs,br.
        LoLoop.Revert = true;
        LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n" );
      } else {
        // Record VPR defs and build up their corresponding vpt blocks.
        // Check we know how to tail predicate any mve instructions.
        LoLoop.AnalyseMVEInst(MI: &MI);
      }
    }
  }

  LLVM_DEBUG(LoLoop.dump());
  if (!LoLoop.FoundAllComponents()) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n" );
    return Changed;
  }

  assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
         "Expected t2WhileLoopStart to be removed before regalloc!" );

  // Check that the only instruction using LoopDec is LoopEnd. This can only
  // happen when the Dec and End are separate, not a single t2LoopEndDec.
  // TODO: Check for copy chains that really have no effect.
  if (LoLoop.Dec != LoLoop.End) {
    SmallPtrSet<MachineInstr *, 2> Uses;
    RDA->getReachingLocalUses(MI: LoLoop.Dec, Reg: MCRegister::from(Val: ARM::LR), Uses);
    if (Uses.size() > 1 || !Uses.count(Ptr: LoLoop.End)) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n" );
      LoLoop.Revert = true;
    }
  }
  // Validate decides between the low-overhead form and reversion; Expand
  // then performs whichever rewrite was chosen.
  LoLoop.Validate(BBUtils: BBUtils.get());
  Expand(LoLoop);
  return true;
}
| 1403 | |
| 1404 | // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a |
| 1405 | // beq that branches to the exit branch. |
| 1406 | // TODO: We could also try to generate a cbz if the value in LR is also in |
| 1407 | // another low register. |
| 1408 | void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { |
| 1409 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); |
| 1410 | MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(MI: *MI); |
| 1411 | unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, MaxDisp: 254) ? |
| 1412 | ARM::tBcc : ARM::t2Bcc; |
| 1413 | |
| 1414 | RevertWhileLoopStartLR(MI, TII, BrOpc); |
| 1415 | } |
| 1416 | |
| 1417 | void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const { |
| 1418 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI); |
| 1419 | RevertDoLoopStart(MI, TII); |
| 1420 | } |
| 1421 | |
| 1422 | bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { |
| 1423 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); |
| 1424 | MachineBasicBlock *MBB = MI->getParent(); |
| 1425 | SmallPtrSet<MachineInstr*, 1> Ignore; |
| 1426 | for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) { |
| 1427 | if (I->getOpcode() == ARM::t2LoopEnd) { |
| 1428 | Ignore.insert(Ptr: &*I); |
| 1429 | break; |
| 1430 | } |
| 1431 | } |
| 1432 | |
| 1433 | // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS. |
| 1434 | bool SetFlags = |
| 1435 | RDA->isSafeToDefRegAt(MI, Reg: MCRegister::from(Val: ARM::CPSR), Ignore); |
| 1436 | |
| 1437 | llvm::RevertLoopDec(MI, TII, SetFlags); |
| 1438 | return SetFlags; |
| 1439 | } |
| 1440 | |
| 1441 | // Generate a subs, or sub and cmp, and a branch instead of an LE. |
| 1442 | void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { |
| 1443 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); |
| 1444 | |
| 1445 | MachineBasicBlock *DestBB = MI->getOperand(i: 1).getMBB(); |
| 1446 | unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, MaxDisp: 254) ? |
| 1447 | ARM::tBcc : ARM::t2Bcc; |
| 1448 | |
| 1449 | llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp); |
| 1450 | } |
| 1451 | |
// Revert a combined t2LoopEndDec: generate a subs and a conditional branch
// instead of an LE, since this pseudo both decrements LR and loops back.
void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const {
  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
  assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
  MachineBasicBlock *MBB = MI->getParent();

  // Build "$lr = t2SUBri <count>, 1": dst, src (operand 1 of the pseudo),
  // immediate, AL predicate with no predicate register, then the optional
  // cc_out operand.
  MachineInstrBuilder MIB =
      BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::t2SUBri));
  MIB.addDef(RegNo: ARM::LR);
  MIB.add(MO: MI->getOperand(i: 1));
  MIB.addImm(Val: 1);
  MIB.addImm(Val: ARMCC::AL);
  MIB.addReg(RegNo: ARM::NoRegister);
  MIB.addReg(RegNo: ARM::CPSR);
  // Turn the cc_out operand (index 5, the CPSR added above) into a def so
  // the sub becomes a flag-setting subs for the branch below to test.
  MIB->getOperand(i: 5).setIsDef(true);

  // Branch target is operand 2 of the pseudo; pick the narrow tBcc when in
  // range, else the wide t2Bcc.
  MachineBasicBlock *DestBB = MI->getOperand(i: 2).getMBB();
  unsigned BrOpc =
      BBUtils->isBBInRange(MI, DestBB, MaxDisp: 254) ? ARM::tBcc : ARM::t2Bcc;

  // Create bne
  MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: BrOpc));
  MIB.add(MO: MI->getOperand(i: 2)); // branch target
  MIB.addImm(Val: ARMCC::NE);       // condition code
  MIB.addReg(RegNo: ARM::CPSR);

  MI->eraseFromParent();
}
| 1480 | |
// Perform dead code elimination on the loop iteration count setup expression.
| 1482 | // If we are tail-predicating, the number of elements to be processed is the |
| 1483 | // operand of the VCTP instruction in the vector body, see getCount(), which is |
| 1484 | // register $r3 in this example: |
| 1485 | // |
| 1486 | // $lr = big-itercount-expression |
| 1487 | // .. |
| 1488 | // $lr = t2DoLoopStart renamable $lr |
| 1489 | // vector.body: |
| 1490 | // .. |
| 1491 | // $vpr = MVE_VCTP32 renamable $r3 |
| 1492 | // renamable $lr = t2LoopDec killed renamable $lr, 1 |
| 1493 | // t2LoopEnd renamable $lr, %vector.body |
| 1494 | // tB %end |
| 1495 | // |
// What we would like to achieve here is to replace the do-loop start pseudo
| 1497 | // instruction t2DoLoopStart with: |
| 1498 | // |
| 1499 | // $lr = MVE_DLSTP_32 killed renamable $r3 |
| 1500 | // |
| 1501 | // Thus, $r3 which defines the number of elements, is written to $lr, |
| 1502 | // and then we want to delete the whole chain that used to define $lr, |
| 1503 | // see the comment below how this chain could look like. |
| 1504 | // |
| 1505 | void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) { |
| 1506 | if (!LoLoop.IsTailPredicationLegal()) |
| 1507 | return; |
| 1508 | |
| 1509 | LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n" ); |
| 1510 | |
| 1511 | MachineInstr *Def = RDA->getMIOperand(MI: LoLoop.Start, Idx: 1); |
| 1512 | if (!Def) { |
| 1513 | LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n" ); |
| 1514 | return; |
| 1515 | } |
| 1516 | |
| 1517 | // Collect and remove the users of iteration count. |
| 1518 | SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec, |
| 1519 | LoLoop.End }; |
| 1520 | if (!TryRemove(MI: Def, RDA&: *RDA, ToRemove&: LoLoop.ToRemove, Ignore&: Killed)) |
| 1521 | LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n" ); |
| 1522 | } |
| 1523 | |
| 1524 | MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { |
| 1525 | LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n" ); |
| 1526 | // When using tail-predication, try to delete the dead code that was used to |
| 1527 | // calculate the number of loop iterations. |
| 1528 | IterationCountDCE(LoLoop); |
| 1529 | |
| 1530 | MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt; |
| 1531 | MachineInstr *Start = LoLoop.Start; |
| 1532 | MachineBasicBlock *MBB = LoLoop.StartInsertBB; |
| 1533 | unsigned Opc = LoLoop.getStartOpcode(); |
| 1534 | MachineOperand &Count = LoLoop.getLoopStartOperand(); |
| 1535 | |
| 1536 | // A DLS lr, lr we needn't emit |
| 1537 | MachineInstr* NewStart; |
| 1538 | if (!DisableOmitDLS && Opc == ARM::t2DLS && Count.isReg() && |
| 1539 | Count.getReg() == ARM::LR) { |
| 1540 | LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr" ); |
| 1541 | NewStart = nullptr; |
| 1542 | } else { |
| 1543 | MachineInstrBuilder MIB = |
| 1544 | BuildMI(BB&: *MBB, I: InsertPt, MIMD: Start->getDebugLoc(), MCID: TII->get(Opcode: Opc)); |
| 1545 | |
| 1546 | MIB.addDef(RegNo: ARM::LR); |
| 1547 | MIB.add(MO: Count); |
| 1548 | if (isWhileLoopStart(MI: *Start)) |
| 1549 | MIB.addMBB(MBB: getWhileLoopStartTargetBB(MI: *Start)); |
| 1550 | |
| 1551 | LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); |
| 1552 | NewStart = &*MIB; |
| 1553 | } |
| 1554 | |
| 1555 | LoLoop.ToRemove.insert(Ptr: Start); |
| 1556 | return NewStart; |
| 1557 | } |
| 1558 | |
// Rewrite the loop's VPT blocks for tail predication: blocks predicated
// solely on the VCTP become implicitly predicated by LETP, so their VPSTs
// are deleted and their instructions unpredicated; partially-predicated
// blocks get a fresh VPST (or a VCMP folded into a VPT) and have their
// block masks queued for recomputation. Finally all VCTPs are queued for
// removal.
void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
  // Strip the Then predicate from a predicated MVE instruction: clear the
  // predicate-kind operand and the VPR register use that accompanies it.
  auto RemovePredicate = [](MachineInstr *MI) {
    if (MI->isDebugInstr())
      return;
    LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
    int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI);
    assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction");
    assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
           "Expected Then predicate!");
    MI->getOperand(i: PIdx).setImm(ARMVCC::None);
    MI->getOperand(i: PIdx + 1).setReg(0);
  };

  for (auto &Block : LoLoop.getVPTBlocks()) {
    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();

    // Fold TheVCMP into a VPT inserted at 'At'; the new VPT's mask is queued
    // for recomputation and the VCMP for deletion. TheVCMP is nulled out so
    // it cannot be merged twice.
    auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) {
      assert(TheVCMP && "Replacing a removed or non-existent VCMP");
      // Replace the VCMP with a VPT
      MachineInstrBuilder MIB =
          BuildMI(BB&: *At->getParent(), I: At, MIMD: At->getDebugLoc(),
                  MCID: TII->get(Opcode: VCMPOpcodeToVPT(Opcode: TheVCMP->getOpcode())));
      MIB.addImm(Val: ARMVCC::Then);
      // Register one
      MIB.add(MO: TheVCMP->getOperand(i: 1));
      // Register two
      MIB.add(MO: TheVCMP->getOperand(i: 2));
      // The comparison code, e.g. ge, eq, lt
      MIB.add(MO: TheVCMP->getOperand(i: 3));
      LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
      LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr());
      LoLoop.ToRemove.insert(Ptr: TheVCMP);
      TheVCMP = nullptr;
    };

    if (LoLoop.VPTstate.isEntryPredicatedOnVCTP(Block, /*exclusive*/ Exclusive: true)) {
      MachineInstr *VPST = Insts.front();
      if (Block.hasUniformPredicate()) {
        // A vpt block starting with VPST, is only predicated upon vctp and
        // has no internal vpr defs:
        // - Remove vpst.
        // - Unpredicate the remaining instructions.
        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
        for (unsigned i = 1; i < Insts.size(); ++i)
          RemovePredicate(Insts[i]);
      } else {
        // The VPT block has a non-uniform predicate but it uses a vpst and its
        // entry is guarded only by a vctp, which means we:
        // - Need to remove the original vpst.
        // - Then need to unpredicate any following instructions, until
        //   we come across the divergent vpr def.
        // - Insert a new vpst to predicate the instruction(s) that follow
        //   the divergent vpr def.
        MachineInstr *Divergent = Block.getDivergent();
        MachineBasicBlock *MBB = Divergent->getParent();
        // Find the first non-debug instruction after the divergent def.
        auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
        while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr())
          ++DivergentNext;

        bool DivergentNextIsPredicated =
            DivergentNext != MBB->end() &&
            getVPTInstrPredicate(MI: *DivergentNext) != ARMVCC::None;

        // Unpredicate everything between the VPST and the divergent def.
        for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
             I != E; ++I)
          RemovePredicate(&*I);

        // Check if the instruction defining vpr is a vcmp so it can be
        // combined with the VPST. This should be the divergent instruction.
        MachineInstr *VCMP =
            VCMPOpcodeToVPT(Opcode: Divergent->getOpcode()) != 0 ? Divergent : nullptr;

        if (DivergentNextIsPredicated) {
          // Insert a VPST at the divergent only if the next instruction
          // would actually use it. A VCMP following a VPST can be
          // merged into a VPT so do that instead if the VCMP exists.
          if (!VCMP) {
            // Create a VPST (with a null mask for now, we'll recompute it
            // later)
            MachineInstrBuilder MIB =
                BuildMI(BB&: *Divergent->getParent(), I: Divergent,
                        MIMD: Divergent->getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VPST));
            MIB.addImm(Val: 0);
            LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
            LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr());
          } else {
            // No RDA checks are necessary here since the VPST would have been
            // directly after the VCMP
            ReplaceVCMPWithVPT(VCMP, VCMP);
          }
        }
      }
      LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
      LoLoop.ToRemove.insert(Ptr: VPST);
    } else if (Block.containsVCTP()) {
      // The vctp will be removed, so either the entire block will be dead or
      // the block mask of the vp(s)t will need to be recomputed.
      MachineInstr *VPST = Insts.front();
      if (Block.size() == 2) {
        assert(VPST->getOpcode() == ARM::MVE_VPST &&
               "Found a VPST in an otherwise empty vpt block");
        LoLoop.ToRemove.insert(Ptr: VPST);
      } else
        LoLoop.BlockMasksToRecompute.insert(Ptr: VPST);
    } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
      // If this block starts with a VPST then attempt to merge it with the
      // preceding un-merged VCMP into a VPT. This VCMP comes from a VPT
      // block that no longer exists
      MachineInstr *VPST = Insts.front();
      auto Next = ++MachineBasicBlock::iterator(VPST);
      assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&
             "The instruction after a VPST must be predicated");
      (void)Next;
      MachineInstr *VprDef = RDA->getUniqueReachingMIDef(MI: VPST, Reg: ARM::VPR);
      if (VprDef && VCMPOpcodeToVPT(Opcode: VprDef->getOpcode()) &&
          !LoLoop.ToRemove.contains(Ptr: VprDef)) {
        MachineInstr *VCMP = VprDef;
        // The VCMP and VPST can only be merged if the VCMP's operands will
        // have the same values at the VPST.
        // If any of the instructions between the VCMP and VPST are predicated
        // then a different code path is expected to have merged the VCMP and
        // VPST already.
        if (std::none_of(first: ++MachineBasicBlock::iterator(VCMP),
                         last: MachineBasicBlock::iterator(VPST), pred: hasVPRUse) &&
            RDA->hasSameReachingDef(A: VCMP, B: VPST, Reg: VCMP->getOperand(i: 1).getReg()) &&
            RDA->hasSameReachingDef(A: VCMP, B: VPST, Reg: VCMP->getOperand(i: 2).getReg())) {
          ReplaceVCMPWithVPT(VCMP, VPST);
          LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
          LoLoop.ToRemove.insert(Ptr: VPST);
        }
      }
    }
  }

  // The VCTPs are no longer needed: ConvertVPTBlocks is only reached when
  // tail predication is legal, and LETP supplies the element predicate.
  LoLoop.ToRemove.insert_range(R&: LoLoop.VCTPs);
}
| 1695 | |
// Materialize the low-overhead loop: either revert every pseudo back to
// ordinary compare-and-branch code (when validation flagged Revert), or
// replace them with the real DLS(TP)/WLS(TP) and LE(TP) instructions, then
// erase dead instructions and refresh liveness and reaching-def state.
void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {

  // Combine the LoopDec and LoopEnd instructions into LE(TP).
  auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
    MachineInstr *End = LoLoop.End;
    MachineBasicBlock *MBB = End->getParent();
    // LETP is the tail-predicated form; t2LEUpdate the plain counted one.
    unsigned Opc = LoLoop.IsTailPredicationLegal() ?
      ARM::MVE_LETP : ARM::t2LEUpdate;
    MachineInstrBuilder MIB = BuildMI(BB&: *MBB, I: End, MIMD: End->getDebugLoc(),
                                      MCID: TII->get(Opcode: Opc));
    MIB.addDef(RegNo: ARM::LR);
    // When Dec and End were combined into a t2LoopEndDec, the pseudo has an
    // extra def operand, shifting the count/target operands by one.
    unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;
    MIB.add(MO: End->getOperand(i: Off + 0));
    MIB.add(MO: End->getOperand(i: Off + 1));
    LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
    LoLoop.ToRemove.insert(Ptr: LoLoop.Dec);
    LoLoop.ToRemove.insert(Ptr: End);
    return &*MIB;
  };

  // TODO: We should be able to automatically remove these branches before we
  // get here - probably by teaching analyzeBranch about the pseudo
  // instructions.
  // If there is an unconditional branch, after I, that just branches to the
  // next block, remove it.
  auto RemoveDeadBranch = [](MachineInstr *I) {
    MachineBasicBlock *BB = I->getParent();
    MachineInstr *Terminator = &BB->instr_back();
    if (Terminator->isUnconditionalBranch() && I != Terminator) {
      MachineBasicBlock *Succ = Terminator->getOperand(i: 0).getMBB();
      if (BB->isLayoutSuccessor(MBB: Succ)) {
        LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
        Terminator->eraseFromParent();
      }
    }
  };

  // And VMOVCopies need to become 2xVMOVD for tail predication to be valid.
  // Any other MQPRCopy can be converted to MVE_VORR later on.
  auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) {
    for (auto *MI : VMOVCopies) {
      LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI);
      assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
      MachineBasicBlock *MBB = MI->getParent();
      Register Dst = MI->getOperand(i: 0).getReg();
      Register Src = MI->getOperand(i: 1).getReg();
      // Copy the low D register of the Q pair (Qn maps to D2n/D2n+1)...
      auto MIB1 = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::VMOVD),
                          DestReg: ARM::D0 + (Dst - ARM::Q0) * 2)
                      .addReg(RegNo: ARM::D0 + (Src - ARM::Q0) * 2)
                      .add(MOs: predOps(Pred: ARMCC::AL));
      (void)MIB1;
      LLVM_DEBUG(dbgs() << " into " << *MIB1);
      // ...and then the high D register.
      auto MIB2 = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::VMOVD),
                          DestReg: ARM::D0 + (Dst - ARM::Q0) * 2 + 1)
                      .addReg(RegNo: ARM::D0 + (Src - ARM::Q0) * 2 + 1)
                      .add(MOs: predOps(Pred: ARMCC::AL));
      LLVM_DEBUG(dbgs() << " and " << *MIB2);
      (void)MIB2;
      MI->eraseFromParent();
    }
  };

  if (LoLoop.Revert) {
    // Validation failed: lower every pseudo to plain compare/branch code.
    if (isWhileLoopStart(MI: *LoLoop.Start))
      RevertWhile(MI: LoLoop.Start);
    else
      RevertDo(MI: LoLoop.Start);
    if (LoLoop.Dec == LoLoop.End)
      RevertLoopEndDec(MI: LoLoop.End);
    else
      RevertLoopEnd(MI: LoLoop.End, SkipCmp: RevertLoopDec(MI: LoLoop.Dec));
  } else {
    ExpandVMOVCopies(LoLoop.VMOVCopies);
    LoLoop.Start = ExpandLoopStart(LoLoop);
    // ExpandLoopStart returns null when a redundant "DLS lr, lr" is elided.
    if (LoLoop.Start)
      RemoveDeadBranch(LoLoop.Start);
    LoLoop.End = ExpandLoopEnd(LoLoop);
    RemoveDeadBranch(LoLoop.End);
    if (LoLoop.IsTailPredicationLegal())
      ConvertVPTBlocks(LoLoop);
    for (auto *I : LoLoop.ToRemove) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
      I->eraseFromParent();
    }
    for (auto *I : LoLoop.BlockMasksToRecompute) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
      recomputeVPTBlockMask(Instr&: *I);
      LLVM_DEBUG(dbgs() << " ... done: " << *I);
    }
  }

  // Instructions were moved, removed and inserted, so recompute the
  // live-ins and liveness flags of every block in the loop.
  PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
  DFS.ProcessLoop();
  const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
  fullyRecomputeLiveIns(MBBs: PostOrder);

  for (auto *MBB : reverse(C: PostOrder))
    recomputeLivenessFlags(MBB&: *MBB);

  // We've moved, removed and inserted new instructions, so update RDA.
  RDA->reset();
}
| 1798 | |
| 1799 | bool ARMLowOverheadLoops::RevertNonLoops() { |
| 1800 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n" ); |
| 1801 | bool Changed = false; |
| 1802 | |
| 1803 | for (auto &MBB : *MF) { |
| 1804 | SmallVector<MachineInstr*, 4> Starts; |
| 1805 | SmallVector<MachineInstr*, 4> Decs; |
| 1806 | SmallVector<MachineInstr*, 4> Ends; |
| 1807 | SmallVector<MachineInstr *, 4> EndDecs; |
| 1808 | SmallVector<MachineInstr *, 4> MQPRCopies; |
| 1809 | |
| 1810 | for (auto &I : MBB) { |
| 1811 | if (isLoopStart(MI: I)) |
| 1812 | Starts.push_back(Elt: &I); |
| 1813 | else if (I.getOpcode() == ARM::t2LoopDec) |
| 1814 | Decs.push_back(Elt: &I); |
| 1815 | else if (I.getOpcode() == ARM::t2LoopEnd) |
| 1816 | Ends.push_back(Elt: &I); |
| 1817 | else if (I.getOpcode() == ARM::t2LoopEndDec) |
| 1818 | EndDecs.push_back(Elt: &I); |
| 1819 | else if (I.getOpcode() == ARM::MQPRCopy) |
| 1820 | MQPRCopies.push_back(Elt: &I); |
| 1821 | } |
| 1822 | |
| 1823 | if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() && |
| 1824 | MQPRCopies.empty()) |
| 1825 | continue; |
| 1826 | |
| 1827 | Changed = true; |
| 1828 | |
| 1829 | for (auto *Start : Starts) { |
| 1830 | if (isWhileLoopStart(MI: *Start)) |
| 1831 | RevertWhile(MI: Start); |
| 1832 | else |
| 1833 | RevertDo(MI: Start); |
| 1834 | } |
| 1835 | for (auto *Dec : Decs) |
| 1836 | RevertLoopDec(MI: Dec); |
| 1837 | |
| 1838 | for (auto *End : Ends) |
| 1839 | RevertLoopEnd(MI: End); |
| 1840 | for (auto *End : EndDecs) |
| 1841 | RevertLoopEndDec(MI: End); |
| 1842 | for (auto *MI : MQPRCopies) { |
| 1843 | LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI); |
| 1844 | assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!" ); |
| 1845 | MachineBasicBlock *MBB = MI->getParent(); |
| 1846 | auto MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VORR), |
| 1847 | DestReg: MI->getOperand(i: 0).getReg()) |
| 1848 | .add(MO: MI->getOperand(i: 1)) |
| 1849 | .add(MO: MI->getOperand(i: 1)); |
| 1850 | addUnpredicatedMveVpredROp(MIB, DestReg: MI->getOperand(i: 0).getReg()); |
| 1851 | MI->eraseFromParent(); |
| 1852 | } |
| 1853 | } |
| 1854 | return Changed; |
| 1855 | } |
| 1856 | |
// Factory hooked into the ARM target's pass pipeline.
FunctionPass *llvm::createARMLowOverheadLoopsPass() {
  return new ARMLowOverheadLoops();
}
| 1860 | |