| 1 | //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file implements the loop fusion pass. |
| 11 | /// The implementation is largely based on the following document: |
| 12 | /// |
| 13 | /// Code Transformations to Augment the Scope of Loop Fusion in a |
| 14 | /// Production Compiler |
| 15 | /// Christopher Mark Barton |
| 16 | /// MSc Thesis |
| 17 | /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf |
| 18 | /// |
| 19 | /// The general approach taken is to collect sets of control flow equivalent |
| 20 | /// loops and test whether they can be fused. The necessary conditions for |
| 21 | /// fusion are: |
| 22 | /// 1. The loops must be adjacent (there cannot be any statements between |
| 23 | /// the two loops). |
| 24 | /// 2. The loops must be conforming (they must execute the same number of |
| 25 | /// iterations). |
| 26 | /// 3. The loops must be control flow equivalent (if one loop executes, the |
| 27 | /// other is guaranteed to execute). |
| 28 | /// 4. There cannot be any negative distance dependencies between the loops. |
| 29 | /// If all of these conditions are satisfied, it is safe to fuse the loops. |
| 30 | /// |
| 31 | /// This implementation creates FusionCandidates that represent the loop and the |
| 32 | /// necessary information needed by fusion. It then operates on the fusion |
| 33 | /// candidates, first confirming that the candidate is eligible for fusion. The |
| 34 | /// candidates are then collected into control flow equivalent sets, sorted in |
| 35 | /// dominance order. Each set of control flow equivalent candidates is then |
| 36 | /// traversed, attempting to fuse pairs of candidates in the set. If all |
| 37 | /// requirements for fusion are met, the two candidates are fused, creating a |
| 38 | /// new (fused) candidate which is then added back into the set to consider for |
| 39 | /// additional fusion. |
| 40 | /// |
| 41 | /// This implementation currently does not make any modifications to remove |
| 42 | /// conditions for fusion. Code transformations to make loops conform to each of |
| 43 | /// the conditions for fusion are discussed in more detail in the document |
| 44 | /// above. These can be added to the current implementation in the future. |
| 45 | //===----------------------------------------------------------------------===// |
| 46 | |
| 47 | #include "llvm/Transforms/Scalar/LoopFuse.h" |
| 48 | #include "llvm/ADT/Statistic.h" |
| 49 | #include "llvm/Analysis/AssumptionCache.h" |
| 50 | #include "llvm/Analysis/DependenceAnalysis.h" |
| 51 | #include "llvm/Analysis/DomTreeUpdater.h" |
| 52 | #include "llvm/Analysis/LoopInfo.h" |
| 53 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| 54 | #include "llvm/Analysis/PostDominators.h" |
| 55 | #include "llvm/Analysis/ScalarEvolution.h" |
| 56 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 57 | #include "llvm/IR/Function.h" |
| 58 | #include "llvm/IR/Verifier.h" |
| 59 | #include "llvm/Support/CommandLine.h" |
| 60 | #include "llvm/Support/Debug.h" |
| 61 | #include "llvm/Support/raw_ostream.h" |
| 62 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| 63 | #include "llvm/Transforms/Utils/CodeMoverUtils.h" |
| 64 | #include "llvm/Transforms/Utils/LoopPeel.h" |
| 65 | #include "llvm/Transforms/Utils/LoopSimplify.h" |
| 66 | #include <list> |
| 67 | |
| 68 | using namespace llvm; |
| 69 | |
| 70 | #define DEBUG_TYPE "loop-fusion" |
| 71 | |
| 72 | STATISTIC(FuseCounter, "Loops fused" ); |
| 73 | STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion" ); |
| 74 | STATISTIC(InvalidLoopStructure, "Loop has invalid structure" ); |
| 75 | STATISTIC(AddressTakenBB, "Basic block has address taken" ); |
| 76 | STATISTIC(MayThrowException, "Loop may throw an exception" ); |
| 77 | STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access" ); |
| 78 | STATISTIC(ContainsAtomicAccess, "Loop contains an atomic access" ); |
| 79 | STATISTIC(NotSimplifiedForm, "Loop is not in simplified form" ); |
| 80 | STATISTIC(InvalidDependencies, "Dependencies prevent fusion" ); |
| 81 | STATISTIC(UnknownTripCount, "Loop has unknown trip count" ); |
| 82 | STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop" ); |
| 83 | STATISTIC(NonEqualTripCount, "Loop trip counts are not the same" ); |
| 84 | STATISTIC( |
| 85 | , |
| 86 | "Loop has a non-empty preheader with instructions that cannot be moved" ); |
| 87 | STATISTIC(FusionNotBeneficial, "Fusion is not beneficial" ); |
| 88 | STATISTIC(NonIdenticalGuards, "Candidates have different guards" ); |
| 89 | STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with " |
| 90 | "instructions that cannot be moved" ); |
| 91 | STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with " |
| 92 | "instructions that cannot be moved" ); |
| 93 | STATISTIC(NotRotated, "Candidate is not rotated" ); |
| 94 | STATISTIC(OnlySecondCandidateIsGuarded, |
| 95 | "The second candidate is guarded while the first one is not" ); |
| 96 | STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions." ); |
| 97 | STATISTIC(NumSunkInsts, "Number of sunk preheader instructions." ); |
| 98 | STATISTIC(NumDA, "DA checks passed" ); |
| 99 | |
| 100 | static cl::opt<uint32_t> FusionPeelMaxCount( |
| 101 | "loop-fusion-peel-max-count" , cl::init(Val: 0), cl::Hidden, |
| 102 | cl::desc("Max number of iterations to be peeled from a loop, such that " |
| 103 | "fusion can take place" )); |
| 104 | |
| 105 | #ifndef NDEBUG |
| 106 | static cl::opt<bool> |
| 107 | VerboseFusionDebugging("loop-fusion-verbose-debug" , |
| 108 | cl::desc("Enable verbose debugging for Loop Fusion" ), |
| 109 | cl::Hidden, cl::init(false)); |
| 110 | #endif |
| 111 | |
| 112 | namespace { |
| 113 | /// This class is used to represent a candidate for loop fusion. When it is |
| 114 | /// constructed, it checks the conditions for loop fusion to ensure that it |
| 115 | /// represents a valid candidate. It caches several parts of a loop that are |
| 116 | /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead |
| 117 | /// of continually querying the underlying Loop to retrieve these values. It is |
| 118 | /// assumed these will not change throughout loop fusion. |
| 119 | /// |
| 120 | /// The invalidate method should be used to indicate that the FusionCandidate is |
| 121 | /// no longer a valid candidate for fusion. Similarly, the isValid() method can |
| 122 | /// be used to ensure that the FusionCandidate is still valid for fusion. |
| 123 | struct FusionCandidate { |
| 124 | /// Cache of parts of the loop used throughout loop fusion. These should not |
| 125 | /// need to change throughout the analysis and transformation. |
| 126 | /// These parts are cached to avoid repeatedly looking up in the Loop class. |
| 127 | |
| 128 | /// Preheader of the loop this candidate represents |
| 129 | BasicBlock *Preheader; |
| 130 | /// Header of the loop this candidate represents |
| 131 | BasicBlock *Header; |
| 132 | /// Blocks in the loop that exit the loop |
| 133 | BasicBlock *ExitingBlock; |
| 134 | /// The successor block of this loop (where the exiting blocks go to) |
| 135 | BasicBlock *ExitBlock; |
| 136 | /// Latch of the loop |
| 137 | BasicBlock *Latch; |
| 138 | /// The loop that this fusion candidate represents |
| 139 | Loop *L; |
| 140 | /// Vector of instructions in this loop that read from memory |
| 141 | SmallVector<Instruction *, 16> MemReads; |
| 142 | /// Vector of instructions in this loop that write to memory |
| 143 | SmallVector<Instruction *, 16> MemWrites; |
| 144 | /// Are all of the members of this fusion candidate still valid |
| 145 | bool Valid; |
| 146 | /// Guard branch of the loop, if it exists |
| 147 | CondBrInst *GuardBranch; |
| 148 | /// Peeling Paramaters of the Loop. |
| 149 | TTI::PeelingPreferences PP; |
| 150 | /// Can you Peel this Loop? |
| 151 | bool AbleToPeel; |
| 152 | /// Has this loop been Peeled |
| 153 | bool Peeled; |
| 154 | |
| 155 | DominatorTree &DT; |
| 156 | const PostDominatorTree *PDT; |
| 157 | |
| 158 | OptimizationRemarkEmitter &ORE; |
| 159 | |
| 160 | FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT, |
| 161 | OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP) |
| 162 | : Preheader(L->getLoopPreheader()), Header(L->getHeader()), |
| 163 | ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), |
| 164 | Latch(L->getLoopLatch()), L(L), Valid(true), |
| 165 | GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), |
| 166 | Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { |
| 167 | |
| 168 | // Walk over all blocks in the loop and check for conditions that may |
| 169 | // prevent fusion. For each block, walk over all instructions and collect |
| 170 | // the memory reads and writes If any instructions that prevent fusion are |
| 171 | // found, invalidate this object and return. |
| 172 | for (BasicBlock *BB : L->blocks()) { |
| 173 | if (BB->hasAddressTaken()) { |
| 174 | invalidate(); |
| 175 | ++AddressTakenBB; |
| 176 | reportInvalidCandidate(RemarkName: "AddressTakenBB" , |
| 177 | RemarkMsg: "Basic block has address taken" ); |
| 178 | return; |
| 179 | } |
| 180 | |
| 181 | for (Instruction &I : *BB) { |
| 182 | if (I.mayThrow()) { |
| 183 | invalidate(); |
| 184 | ++MayThrowException; |
| 185 | reportInvalidCandidate(RemarkName: "MayThrowException" , |
| 186 | RemarkMsg: "Loop may throw an exception" ); |
| 187 | return; |
| 188 | } |
| 189 | if (I.isVolatile()) { |
| 190 | invalidate(); |
| 191 | ++ContainsVolatileAccess; |
| 192 | reportInvalidCandidate(RemarkName: "ContainsVolatileAccess" , |
| 193 | RemarkMsg: "Loop contains a volatile access" ); |
| 194 | return; |
| 195 | } |
| 196 | // Atomic accesses impose ordering/synchronization constraints that the |
| 197 | // dependence analysis used for fusion does not model, so reordering |
| 198 | // them across the fused body could be unsafe. |
| 199 | if (I.isAtomic()) { |
| 200 | invalidate(); |
| 201 | ++ContainsAtomicAccess; |
| 202 | reportInvalidCandidate(RemarkName: "ContainsAtomicAccess" , |
| 203 | RemarkMsg: "Loop contains an atomic access" ); |
| 204 | return; |
| 205 | } |
| 206 | if (I.mayWriteToMemory()) |
| 207 | MemWrites.push_back(Elt: &I); |
| 208 | if (I.mayReadFromMemory()) |
| 209 | MemReads.push_back(Elt: &I); |
| 210 | } |
| 211 | } |
| 212 | } |
| 213 | |
| 214 | /// Check if all members of the class are valid. |
| 215 | bool isValid() const { |
| 216 | return Preheader && ExitingBlock && ExitBlock && Latch && L && |
| 217 | !L->isInvalid() && Valid; |
| 218 | } |
| 219 | |
| 220 | /// Verify that all members are in sync with the Loop object. |
| 221 | void verify() const { |
| 222 | assert(isValid() && "Candidate is not valid!!" ); |
| 223 | assert(!L->isInvalid() && "Loop is invalid!" ); |
| 224 | assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync" ); |
| 225 | assert(Header == L->getHeader() && "Header is out of sync" ); |
| 226 | assert(ExitingBlock == L->getExitingBlock() && |
| 227 | "Exiting Blocks is out of sync" ); |
| 228 | assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync" ); |
| 229 | assert(Latch == L->getLoopLatch() && "Latch is out of sync" ); |
| 230 | } |
| 231 | |
| 232 | /// Get the entry block for this fusion candidate. |
| 233 | /// |
| 234 | /// If this fusion candidate represents a guarded loop, the entry block is the |
| 235 | /// loop guard block. If it represents an unguarded loop, the entry block is |
| 236 | /// the preheader of the loop. |
| 237 | BasicBlock *getEntryBlock() const { |
| 238 | if (GuardBranch) |
| 239 | return GuardBranch->getParent(); |
| 240 | return Preheader; |
| 241 | } |
| 242 | |
| 243 | /// After Peeling the loop is modified quite a bit, hence all of the Blocks |
| 244 | /// need to be updated accordingly. |
| 245 | void updateAfterPeeling() { |
| 246 | Preheader = L->getLoopPreheader(); |
| 247 | Header = L->getHeader(); |
| 248 | ExitingBlock = L->getExitingBlock(); |
| 249 | ExitBlock = L->getExitBlock(); |
| 250 | Latch = L->getLoopLatch(); |
| 251 | verify(); |
| 252 | } |
| 253 | |
| 254 | /// Given a guarded loop, get the successor of the guard that is not in the |
| 255 | /// loop. |
| 256 | /// |
| 257 | /// This method returns the successor of the loop guard that is not located |
| 258 | /// within the loop (i.e., the successor of the guard that is not the |
| 259 | /// preheader). |
| 260 | /// This method is only valid for guarded loops. |
| 261 | BasicBlock *getNonLoopBlock() const { |
| 262 | assert(GuardBranch && "Only valid on guarded loops." ); |
| 263 | if (Peeled) |
| 264 | return GuardBranch->getSuccessor(i: 1); |
| 265 | return (GuardBranch->getSuccessor(i: 0) == Preheader) |
| 266 | ? GuardBranch->getSuccessor(i: 1) |
| 267 | : GuardBranch->getSuccessor(i: 0); |
| 268 | } |
| 269 | |
| 270 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 271 | LLVM_DUMP_METHOD void dump() const { |
| 272 | dbgs() << "\tGuardBranch: " ; |
| 273 | if (GuardBranch) |
| 274 | dbgs() << *GuardBranch; |
| 275 | else |
| 276 | dbgs() << "nullptr" ; |
| 277 | dbgs() << "\n" |
| 278 | << (GuardBranch ? GuardBranch->getName() : "nullptr" ) << "\n" |
| 279 | << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr" ) |
| 280 | << "\n" |
| 281 | << "\tHeader: " << (Header ? Header->getName() : "nullptr" ) << "\n" |
| 282 | << "\tExitingBB: " |
| 283 | << (ExitingBlock ? ExitingBlock->getName() : "nullptr" ) << "\n" |
| 284 | << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr" ) |
| 285 | << "\n" |
| 286 | << "\tLatch: " << (Latch ? Latch->getName() : "nullptr" ) << "\n" |
| 287 | << "\tEntryBlock: " |
| 288 | << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr" ) |
| 289 | << "\n" ; |
| 290 | } |
| 291 | #endif |
| 292 | |
| 293 | /// Determine if a fusion candidate (representing a loop) is eligible for |
| 294 | /// fusion. Note that this only checks whether a single loop can be fused - it |
| 295 | /// does not check whether it is *legal* to fuse two loops together. |
| 296 | bool isEligibleForFusion(ScalarEvolution &SE) const { |
| 297 | if (!isValid()) { |
| 298 | LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n" ); |
| 299 | assert(Header && "Header should be guaranteed to exist!" ); |
| 300 | ++InvalidLoopStructure; |
| 301 | return false; |
| 302 | } |
| 303 | |
| 304 | // Require ScalarEvolution to be able to determine a trip count. |
| 305 | if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { |
| 306 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() |
| 307 | << " trip count not computable!\n" ); |
| 308 | ++UnknownTripCount; |
| 309 | return reportInvalidCandidate(RemarkName: "UnknownTripCount" , |
| 310 | RemarkMsg: "Loop has unknown trip count" ); |
| 311 | } |
| 312 | |
| 313 | if (!L->isLoopSimplifyForm()) { |
| 314 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() |
| 315 | << " is not in simplified form!\n" ); |
| 316 | ++NotSimplifiedForm; |
| 317 | return reportInvalidCandidate(RemarkName: "NotSimplifiedForm" , |
| 318 | RemarkMsg: "Loop is not in simplified form" ); |
| 319 | } |
| 320 | |
| 321 | if (!L->isRotatedForm()) { |
| 322 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n" ); |
| 323 | ++NotRotated; |
| 324 | return reportInvalidCandidate(RemarkName: "NotRotated" , RemarkMsg: "Candidate is not rotated" ); |
| 325 | } |
| 326 | |
| 327 | return true; |
| 328 | } |
| 329 | |
| 330 | private: |
| 331 | // This is only used internally for now, to clear the MemWrites and MemReads |
| 332 | // list and setting Valid to false. I can't envision other uses of this right |
| 333 | // now, since once FusionCandidates are put into the FusionCandidateList they |
| 334 | // are immutable. Thus, any time we need to change/update a FusionCandidate, |
| 335 | // we must create a new one and insert it into the FusionCandidateList to |
| 336 | // ensure the FusionCandidateList remains ordered correctly. |
| 337 | void invalidate() { |
| 338 | MemWrites.clear(); |
| 339 | MemReads.clear(); |
| 340 | Valid = false; |
| 341 | } |
| 342 | |
| 343 | // Emit an analysis remark explaining why this loop cannot be fused. The |
| 344 | // remark is built from explicit strings so it does not depend on whether |
| 345 | // statistics are enabled. \p RemarkName is the -Rpass remark identifier and |
| 346 | // \p RemarkMsg the human-readable reason. |
| 347 | bool reportInvalidCandidate(StringRef , StringRef ) const { |
| 348 | using namespace ore; |
| 349 | ORE.emit(OptDiag: OptimizationRemarkAnalysis(DEBUG_TYPE, "InvalidCandidate" , |
| 350 | L->getStartLoc(), L->getHeader()) |
| 351 | << "Loop is not a candidate for fusion" ); |
| 352 | |
| 353 | ORE.emit(OptDiag: OptimizationRemarkAnalysis(DEBUG_TYPE, RemarkName, |
| 354 | L->getStartLoc(), L->getHeader()) |
| 355 | << "[" << L->getHeader()->getParent()->getName() << "]: " |
| 356 | << "Loop is not a candidate for fusion: " << RemarkMsg); |
| 357 | return false; |
| 358 | } |
| 359 | }; |
| 360 | } // namespace |
| 361 | |
| 362 | using LoopVector = SmallVector<Loop *, 4>; |
| 363 | |
| 364 | // List of adjacent fusion candidates in order. Thus, if FC0 comes *before* FC1 |
| 365 | // in a FusionCandidateList, then FC0 dominates FC1, FC1 post-dominates FC0, |
| 366 | // and they are adjacent. |
| 367 | using FusionCandidateList = std::list<FusionCandidate>; |
| 368 | using FusionCandidateCollection = SmallVector<FusionCandidateList, 4>; |
| 369 | |
| 370 | #ifndef NDEBUG |
| 371 | static void printLoopVector(const LoopVector &LV) { |
| 372 | dbgs() << "****************************\n" ; |
| 373 | for (const Loop *L : LV) |
| 374 | printLoop(*L, dbgs()); |
| 375 | dbgs() << "****************************\n" ; |
| 376 | } |
| 377 | |
| 378 | static raw_ostream &operator<<(raw_ostream &OS, const FusionCandidate &FC) { |
| 379 | if (FC.isValid()) |
| 380 | OS << FC.Preheader->getName(); |
| 381 | else |
| 382 | OS << "<Invalid>" ; |
| 383 | |
| 384 | return OS; |
| 385 | } |
| 386 | |
| 387 | static raw_ostream &operator<<(raw_ostream &OS, |
| 388 | const FusionCandidateList &CandList) { |
| 389 | for (const FusionCandidate &FC : CandList) |
| 390 | OS << FC << '\n'; |
| 391 | |
| 392 | return OS; |
| 393 | } |
| 394 | |
| 395 | static void |
| 396 | printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { |
| 397 | dbgs() << "Fusion Candidates: \n" ; |
| 398 | for (const auto &CandidateList : FusionCandidates) { |
| 399 | dbgs() << "*** Fusion Candidate List ***\n" ; |
| 400 | dbgs() << CandidateList; |
| 401 | dbgs() << "****************************\n" ; |
| 402 | } |
| 403 | } |
| 404 | #endif // NDEBUG |
| 405 | |
| 406 | namespace { |
| 407 | |
| 408 | /// Collect all loops in function at the same nest level, starting at the |
| 409 | /// outermost level. |
| 410 | /// |
| 411 | /// This data structure collects all loops at the same nest level for a |
| 412 | /// given function (specified by the LoopInfo object). It starts at the |
| 413 | /// outermost level. |
| 414 | struct LoopDepthTree { |
| 415 | using LoopsOnLevelTy = SmallVector<LoopVector, 4>; |
| 416 | using iterator = LoopsOnLevelTy::iterator; |
| 417 | using const_iterator = LoopsOnLevelTy::const_iterator; |
| 418 | |
| 419 | LoopDepthTree(LoopInfo &LI) : Depth(1) { |
| 420 | if (!LI.empty()) |
| 421 | LoopsOnLevel.emplace_back(Args: LoopVector(LI.rbegin(), LI.rend())); |
| 422 | } |
| 423 | |
| 424 | /// Test whether a given loop has been removed from the function, and thus is |
| 425 | /// no longer valid. |
| 426 | bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(Ptr: L); } |
| 427 | |
| 428 | /// Record that a given loop has been removed from the function and is no |
| 429 | /// longer valid. |
| 430 | void removeLoop(const Loop *L) { RemovedLoops.insert(Ptr: L); } |
| 431 | |
| 432 | /// Descend the tree to the next (inner) nesting level |
| 433 | void descend() { |
| 434 | LoopsOnLevelTy LoopsOnNextLevel; |
| 435 | |
| 436 | for (const LoopVector &LV : *this) |
| 437 | for (Loop *L : LV) |
| 438 | if (!isRemovedLoop(L) && L->begin() != L->end()) |
| 439 | LoopsOnNextLevel.emplace_back(Args: LoopVector(L->begin(), L->end())); |
| 440 | |
| 441 | LoopsOnLevel = LoopsOnNextLevel; |
| 442 | RemovedLoops.clear(); |
| 443 | Depth++; |
| 444 | } |
| 445 | |
| 446 | bool empty() const { return size() == 0; } |
| 447 | size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); } |
| 448 | unsigned getDepth() const { return Depth; } |
| 449 | |
| 450 | iterator begin() { return LoopsOnLevel.begin(); } |
| 451 | iterator end() { return LoopsOnLevel.end(); } |
| 452 | const_iterator begin() const { return LoopsOnLevel.begin(); } |
| 453 | const_iterator end() const { return LoopsOnLevel.end(); } |
| 454 | |
| 455 | private: |
| 456 | /// Set of loops that have been removed from the function and are no longer |
| 457 | /// valid. |
| 458 | SmallPtrSet<const Loop *, 8> RemovedLoops; |
| 459 | |
| 460 | /// Depth of the current level, starting at 1 (outermost loops). |
| 461 | unsigned Depth; |
| 462 | |
| 463 | /// Vector of loops at the current depth level that have the same parent loop |
| 464 | LoopsOnLevelTy LoopsOnLevel; |
| 465 | }; |
| 466 | |
| 467 | struct LoopFuser { |
| 468 | private: |
| 469 | // Sets of control flow equivalent fusion candidates for a given nest level. |
| 470 | FusionCandidateCollection FusionCandidates; |
| 471 | |
| 472 | LoopDepthTree LDT; |
| 473 | DomTreeUpdater DTU; |
| 474 | |
| 475 | LoopInfo &LI; |
| 476 | DominatorTree &DT; |
| 477 | DependenceInfo &DI; |
| 478 | ScalarEvolution &SE; |
| 479 | PostDominatorTree &PDT; |
| 480 | OptimizationRemarkEmitter &ORE; |
| 481 | AssumptionCache &AC; |
| 482 | const TargetTransformInfo &TTI; |
| 483 | |
| 484 | public: |
| 485 | LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, |
| 486 | ScalarEvolution &SE, PostDominatorTree &PDT, |
| 487 | OptimizationRemarkEmitter &ORE, AssumptionCache &AC, |
| 488 | const TargetTransformInfo &TTI) |
| 489 | : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), |
| 490 | DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {} |
| 491 | |
| 492 | /// This is the main entry point for loop fusion. It will traverse the |
| 493 | /// specified function and collect candidate loops to fuse, starting at the |
| 494 | /// outermost nesting level and working inwards. |
| 495 | bool fuseLoops(Function &F) { |
| 496 | #ifndef NDEBUG |
| 497 | if (VerboseFusionDebugging) { |
| 498 | LI.print(dbgs()); |
| 499 | } |
| 500 | #endif |
| 501 | |
| 502 | LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName() |
| 503 | << "\n" ); |
| 504 | bool Changed = false; |
| 505 | |
| 506 | while (!LDT.empty()) { |
| 507 | LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth " |
| 508 | << LDT.getDepth() << "\n" ;); |
| 509 | |
| 510 | for (const LoopVector &LV : LDT) { |
| 511 | assert(LV.size() > 0 && "Empty loop set was build!" ); |
| 512 | |
| 513 | // Skip singleton loop sets as they do not offer fusion opportunities on |
| 514 | // this level. |
| 515 | if (LV.size() == 1) |
| 516 | continue; |
| 517 | #ifndef NDEBUG |
| 518 | if (VerboseFusionDebugging) { |
| 519 | LLVM_DEBUG({ |
| 520 | dbgs() << " Visit loop set (#" << LV.size() << "):\n" ; |
| 521 | printLoopVector(LV); |
| 522 | }); |
| 523 | } |
| 524 | #endif |
| 525 | |
| 526 | collectFusionCandidates(LV); |
| 527 | Changed |= fuseCandidates(); |
| 528 | // All loops in the candidate sets have a common parent (or no parent). |
| 529 | // Next loop vector will correspond to a different parent. It is safe |
| 530 | // to remove all the candidates currently in the set. |
| 531 | FusionCandidates.clear(); |
| 532 | } |
| 533 | |
| 534 | // Finished analyzing candidates at this level. Descend to the next level. |
| 535 | LLVM_DEBUG(dbgs() << "Descend one level!\n" ); |
| 536 | LDT.descend(); |
| 537 | } |
| 538 | |
| 539 | if (Changed) |
| 540 | LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n" ; F.dump();); |
| 541 | |
| 542 | #ifndef NDEBUG |
| 543 | assert(DT.verify()); |
| 544 | assert(PDT.verify()); |
| 545 | LI.verify(DT); |
| 546 | SE.verify(); |
| 547 | #endif |
| 548 | |
| 549 | LLVM_DEBUG(dbgs() << "Loop Fusion complete\n" ); |
| 550 | return Changed; |
| 551 | } |
| 552 | |
| 553 | private: |
| 554 | /// Iterate over all loops in the given loop set and identify the loops that |
| 555 | /// are eligible for fusion. Place all eligible fusion candidates into Control |
| 556 | /// Flow Equivalent sets, sorted by dominance. |
| 557 | void collectFusionCandidates(const LoopVector &LV) { |
| 558 | for (Loop *L : LV) { |
| 559 | TTI::PeelingPreferences PP = |
| 560 | gatherPeelingPreferences(L, SE, TTI, UserAllowPeeling: std::nullopt, UserAllowProfileBasedPeeling: std::nullopt); |
| 561 | FusionCandidate CurrCand(L, DT, &PDT, ORE, PP); |
| 562 | if (!CurrCand.isEligibleForFusion(SE)) |
| 563 | continue; |
| 564 | |
| 565 | // Go through each list in FusionCandidates and determine if the first or |
| 566 | // last loop in the list is strictly adjacent to L. If it is, append L. |
| 567 | // If not, go to the next list. |
| 568 | // If no suitable list is found, start another list and add it to |
| 569 | // FusionCandidates. |
| 570 | bool FoundAdjacent = false; |
| 571 | for (auto &CurrCandList : FusionCandidates) { |
| 572 | if (isStrictlyAdjacent(FC0: CurrCandList.back(), FC1: CurrCand)) { |
| 573 | CurrCandList.push_back(x: CurrCand); |
| 574 | FoundAdjacent = true; |
| 575 | NumFusionCandidates++; |
| 576 | #ifndef NDEBUG |
| 577 | if (VerboseFusionDebugging) |
| 578 | LLVM_DEBUG(dbgs() << "Adding " << CurrCand |
| 579 | << " to existing candidate list\n" ); |
| 580 | #endif |
| 581 | break; |
| 582 | } |
| 583 | } |
| 584 | if (!FoundAdjacent) { |
| 585 | // No list was found. Create a new list and add to FusionCandidates |
| 586 | #ifndef NDEBUG |
| 587 | if (VerboseFusionDebugging) |
| 588 | LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new list\n" ); |
| 589 | #endif |
| 590 | FusionCandidateList NewCandList; |
| 591 | NewCandList.push_back(x: CurrCand); |
| 592 | FusionCandidates.push_back(Elt: NewCandList); |
| 593 | } |
| 594 | } |
| 595 | } |
| 596 | |
| 597 | /// Determine if it is beneficial to fuse two loops. |
| 598 | /// |
| 599 | /// For now, this method simply returns true because we want to fuse as much |
| 600 | /// as possible (primarily to test the pass). This method will evolve, over |
| 601 | /// time, to add heuristics for profitability of fusion. |
| 602 | bool isBeneficialFusion(const FusionCandidate &FC0, |
| 603 | const FusionCandidate &FC1) { |
| 604 | return true; |
| 605 | } |
| 606 | |
| 607 | /// Computes the integer difference in trip counts: |
| 608 | /// TripCount(FC0) - TripCount(FC1). |
| 609 | /// |
| 610 | /// \returns The integer difference, or std::nullopt if it |
| 611 | /// cannot be determined. |
| 612 | std::optional<int64_t> |
| 613 | calculateTripCountDiff(const FusionCandidate &FC0, |
| 614 | const FusionCandidate &FC1) const { |
| 615 | const SCEV *TripCount0 = SE.getBackedgeTakenCount(L: FC0.L); |
| 616 | if (isa<SCEVCouldNotCompute>(Val: TripCount0)) { |
| 617 | UncomputableTripCount++; |
| 618 | LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!" ); |
| 619 | return std::nullopt; |
| 620 | } |
| 621 | |
| 622 | const SCEV *TripCount1 = SE.getBackedgeTakenCount(L: FC1.L); |
| 623 | if (isa<SCEVCouldNotCompute>(Val: TripCount1)) { |
| 624 | UncomputableTripCount++; |
| 625 | LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!" ); |
| 626 | return std::nullopt; |
| 627 | } |
| 628 | |
| 629 | LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " |
| 630 | << *TripCount1 << " are " |
| 631 | << (TripCount0 == TripCount1 ? "identical" : "different" ) |
| 632 | << "\n" ); |
| 633 | |
| 634 | if (TripCount0 == TripCount1) |
| 635 | return 0; |
| 636 | |
| 637 | LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, " |
| 638 | "determining the difference between trip counts\n" ); |
| 639 | |
| 640 | // Currently only considering loops with a single exit point |
| 641 | // and a non-constant trip count. Note that the return value |
| 642 | // of getSmallConstantTripCount is a 32 bit number, based on |
| 643 | // the existing implementation. |
| 644 | const int64_t TC0 = |
| 645 | static_cast<int64_t>(SE.getSmallConstantTripCount(L: FC0.L)); |
| 646 | const int64_t TC1 = |
| 647 | static_cast<int64_t>(SE.getSmallConstantTripCount(L: FC1.L)); |
| 648 | |
| 649 | // If any of the tripcounts are zero that means that loop(s) do not have |
| 650 | // a single exit or a constant tripcount. |
| 651 | if (TC0 == 0 || TC1 == 0) { |
| 652 | LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not " |
| 653 | "have a constant number of iterations. Peeling " |
| 654 | "is not benefical\n" ); |
| 655 | return std::nullopt; |
| 656 | } |
| 657 | |
| 658 | return TC0 - TC1; |
| 659 | } |
| 660 | |
| 661 | void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1, |
| 662 | unsigned PeelCount) { |
| 663 | assert(FC0.AbleToPeel && "Should be able to peel loop" ); |
| 664 | |
| 665 | LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount |
| 666 | << " iterations of the first loop. \n" ); |
| 667 | |
| 668 | ValueToValueMapTy VMap; |
| 669 | // LoopFusion is a function pass that neither requires nor preserves |
| 670 | // LCSSA, so peelLoop need not preserve it across its internal |
| 671 | // simplifyLoop call. |
| 672 | peelLoop(L: FC0.L, PeelCount, /*PeelLast=*/false, LI: &LI, SE: &SE, DT, AC: &AC, |
| 673 | /*PreserveLCSSA=*/false, VMap); |
| 674 | FC0.Peeled = true; |
| 675 | LLVM_DEBUG(dbgs() << "Done Peeling\n" ); |
| 676 | |
| 677 | #ifndef NDEBUG |
| 678 | auto TCDiff = calculateTripCountDiff(FC0, FC1); |
| 679 | |
| 680 | assert(TCDiff && *TCDiff == 0 && |
| 681 | "Loops should have identical trip counts after peeling" ); |
| 682 | #endif |
| 683 | |
| 684 | FC0.PP.PeelCount += PeelCount; |
| 685 | |
| 686 | // Peeling does not update the PDT |
| 687 | PDT.recalculate(Func&: *FC0.Preheader->getParent()); |
| 688 | |
| 689 | FC0.updateAfterPeeling(); |
| 690 | |
| 691 | // In this case the iterations of the loop are constant, so the first |
| 692 | // loop will execute completely (will not jump from one of |
| 693 | // the peeled blocks to the second loop). Here we are updating the |
| 694 | // branch conditions of each of the peeled blocks, such that it will |
| 695 | // branch to its successor which is not the preheader of the second loop |
| 696 | // in the case of unguarded loops, or the succesors of the exit block of |
| 697 | // the first loop otherwise. Doing this update will ensure that the entry |
| 698 | // block of the first loop dominates the entry block of the second loop. |
| 699 | BasicBlock *BB = |
| 700 | FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader; |
| 701 | if (BB) { |
| 702 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
| 703 | SmallVector<Instruction *, 8> WorkList; |
| 704 | for (BasicBlock *Pred : predecessors(BB)) { |
| 705 | if (Pred != FC0.ExitBlock) { |
| 706 | WorkList.emplace_back(Args: Pred->getTerminator()); |
| 707 | TreeUpdates.emplace_back( |
| 708 | Args: DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB)); |
| 709 | } |
| 710 | } |
| 711 | // Cannot modify the predecessors inside the above loop as it will cause |
| 712 | // the iterators to be nullptrs, causing memory errors. |
| 713 | for (Instruction *CurrentBranch : WorkList) { |
| 714 | BasicBlock *Succ = CurrentBranch->getSuccessor(Idx: 0); |
| 715 | if (Succ == BB) |
| 716 | Succ = CurrentBranch->getSuccessor(Idx: 1); |
| 717 | ReplaceInstWithInst(From: CurrentBranch, To: UncondBrInst::Create(Target: Succ)); |
| 718 | } |
| 719 | |
| 720 | DTU.applyUpdates(Updates: TreeUpdates); |
| 721 | DTU.flush(); |
| 722 | } |
| 723 | LLVM_DEBUG( |
| 724 | dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount |
| 725 | << " iterations from the first loop.\n" |
| 726 | "Both Loops have the same number of iterations now.\n" ); |
| 727 | } |
| 728 | |
| 729 | /// Walk each set of strictly adjacent fusion candidates and attempt to fuse |
| 730 | /// them. This does a single linear traversal of all candidates in the list. |
| 731 | /// The conditions for legal fusion are checked at this point. If a pair of |
| 732 | /// fusion candidates passes all legality checks, they are fused together and |
| 733 | /// a new fusion candidate is created and added to the FusionCandidateList. |
| 734 | /// The original fusion candidates are then removed, as they are no longer |
| 735 | /// valid. |
| 736 | bool fuseCandidates() { |
| 737 | bool Fused = false; |
| 738 | LLVM_DEBUG(printFusionCandidates(FusionCandidates)); |
| 739 | for (auto &CandidateList : FusionCandidates) { |
| 740 | if (CandidateList.size() < 2) |
| 741 | continue; |
| 742 | |
| 743 | LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate List:\n" |
| 744 | << CandidateList << "\n" ); |
| 745 | |
| 746 | for (auto It = CandidateList.begin(), NextIt = std::next(x: It); |
| 747 | NextIt != CandidateList.end(); It = NextIt, NextIt = std::next(x: It)) { |
| 748 | |
| 749 | const FusionCandidate &FC0 = *It; |
| 750 | const FusionCandidate &FC1 = *NextIt; |
| 751 | |
| 752 | assert(!LDT.isRemovedLoop(FC0.L) && |
| 753 | "Should not have removed loops in CandidateList!" ); |
| 754 | assert(!LDT.isRemovedLoop(FC1.L) && |
| 755 | "Should not have removed loops in CandidateList!" ); |
| 756 | |
| 757 | LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n" ; FC0.dump(); |
| 758 | dbgs() << " with\n" ; FC1.dump(); dbgs() << "\n" ); |
| 759 | |
| 760 | FC0.verify(); |
| 761 | FC1.verify(); |
| 762 | |
| 763 | std::optional<int64_t> TCDifference = calculateTripCountDiff(FC0, FC1); |
| 764 | // Here we are checking that FC0 (the first loop) can be peeled, and |
| 765 | // the first loop has a larger trip count. In this case it is possible |
| 766 | // that the first loop is peeled to expose the fusion opportunity. |
| 767 | // Peeling the second loop is not currently supported. |
| 768 | bool WillPeel = |
| 769 | FC0.AbleToPeel && TCDifference && *TCDifference > 0 && |
| 770 | *TCDifference <= static_cast<int64_t>(FusionPeelMaxCount); |
| 771 | |
| 772 | if (!WillPeel && (!TCDifference || *TCDifference != 0)) { |
| 773 | LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " |
| 774 | "counts and peeling is not supported for this " |
| 775 | "case. Not fusing.\n" ); |
| 776 | ++NonEqualTripCount; |
| 777 | reportLoopFusion<OptimizationRemarkMissed>( |
| 778 | FC0, FC1, RemarkName: "NonEqualTripCount" , |
| 779 | RemarkMsg: "Loop trip counts are not the same" ); |
| 780 | continue; |
| 781 | } |
| 782 | |
| 783 | if ((!FC0.GuardBranch && FC1.GuardBranch) || |
| 784 | (FC0.GuardBranch && !FC1.GuardBranch)) { |
| 785 | LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the " |
| 786 | "another one is not. Not fusing.\n" ); |
| 787 | ++OnlySecondCandidateIsGuarded; |
| 788 | reportLoopFusion<OptimizationRemarkMissed>( |
| 789 | FC0, FC1, RemarkName: "OnlySecondCandidateIsGuarded" , |
| 790 | RemarkMsg: "The second candidate is guarded while the first one is not" ); |
| 791 | continue; |
| 792 | } |
| 793 | |
| 794 | // If TCDifference is not set or if it is zero, peeling is not needed. |
| 795 | // In this case we must ensure if the loops are guarded the guards |
| 796 | // are identical. |
| 797 | if (!TCDifference || *TCDifference == 0) { |
| 798 | if (FC0.GuardBranch && FC1.GuardBranch && |
| 799 | !haveIdenticalGuards(FC0, FC1)) { |
| 800 | LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " |
| 801 | "guards. Not Fusing.\n" ); |
| 802 | ++NonIdenticalGuards; |
| 803 | reportLoopFusion<OptimizationRemarkMissed>( |
| 804 | FC0, FC1, RemarkName: "NonIdenticalGuards" , |
| 805 | RemarkMsg: "Candidates have different guards" ); |
| 806 | continue; |
| 807 | } |
| 808 | } |
| 809 | |
| 810 | if (FC0.GuardBranch) { |
| 811 | assert(FC1.GuardBranch && "Expecting valid FC1 guard branch" ); |
| 812 | |
| 813 | if (!isSafeToMoveBefore(BB&: *FC0.ExitBlock, |
| 814 | InsertPoint&: *FC1.ExitBlock->getFirstNonPHIOrDbg(), DT, |
| 815 | PDT: &PDT, DI: &DI)) { |
| 816 | LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " |
| 817 | "instructions in exit block. Not fusing.\n" ); |
| 818 | ++NonEmptyExitBlock; |
| 819 | reportLoopFusion<OptimizationRemarkMissed>( |
| 820 | FC0, FC1, RemarkName: "NonEmptyExitBlock" , |
| 821 | RemarkMsg: "Candidate has a non-empty exit block with " |
| 822 | "instructions that cannot be moved" ); |
| 823 | continue; |
| 824 | } |
| 825 | |
| 826 | if (!isSafeToMoveBefore( |
| 827 | BB&: *FC1.GuardBranch->getParent(), |
| 828 | InsertPoint&: *FC0.GuardBranch->getParent()->getTerminator(), DT, PDT: &PDT, |
| 829 | DI: &DI)) { |
| 830 | LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " |
| 831 | "instructions in guard block. Not fusing.\n" ); |
| 832 | ++NonEmptyGuardBlock; |
| 833 | reportLoopFusion<OptimizationRemarkMissed>( |
| 834 | FC0, FC1, RemarkName: "NonEmptyGuardBlock" , |
| 835 | RemarkMsg: "Candidate has a non-empty guard block with " |
| 836 | "instructions that cannot be moved" ); |
| 837 | continue; |
| 838 | } |
| 839 | } |
| 840 | |
| 841 | // Check the dependencies across the loops and do not fuse if it would |
| 842 | // violate them. |
| 843 | if (!dependencesAllowFusion(FC0, FC1)) { |
| 844 | LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n" ); |
| 845 | ++InvalidDependencies; |
| 846 | reportLoopFusion<OptimizationRemarkMissed>( |
| 847 | FC0, FC1, RemarkName: "InvalidDependencies" , RemarkMsg: "Dependencies prevent fusion" ); |
| 848 | continue; |
| 849 | } |
| 850 | |
| 851 | // If the second loop has instructions in the pre-header, attempt to |
| 852 | // hoist them up to the first loop's pre-header or sink them into the |
| 853 | // body of the second loop. |
| 854 | SmallVector<Instruction *, 4> SafeToHoist; |
| 855 | SmallVector<Instruction *, 4> SafeToSink; |
| 856 | // At this point, this is the last remaining legality check. |
| 857 | // Which means if we can make this pre-header empty, we can fuse |
| 858 | // these loops |
| 859 | if (!isEmptyPreheader(FC: FC1)) { |
| 860 | LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " |
| 861 | "preheader.\n" ); |
| 862 | |
| 863 | // If it is not safe to hoist/sink all instructions in the |
| 864 | // pre-header, we cannot fuse these loops. |
| 865 | if (!collectMovablePreheaderInsts(FC0, FC1, SafeToHoist, |
| 866 | SafeToSink)) { |
| 867 | LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in " |
| 868 | "Fusion Candidate Pre-header.\n" |
| 869 | << "Not Fusing.\n" ); |
| 870 | ++NonEmptyPreheader; |
| 871 | reportLoopFusion<OptimizationRemarkMissed>( |
| 872 | FC0, FC1, RemarkName: "NonEmptyPreheader" , |
| 873 | RemarkMsg: "Loop has a non-empty preheader with instructions that " |
| 874 | "cannot be moved" ); |
| 875 | continue; |
| 876 | } |
| 877 | } |
| 878 | |
| 879 | bool BeneficialToFuse = isBeneficialFusion(FC0, FC1); |
| 880 | LLVM_DEBUG(dbgs() << "\tFusion appears to be " |
| 881 | << (BeneficialToFuse ? "" : "un" ) << "profitable!\n" ); |
| 882 | if (!BeneficialToFuse) { |
| 883 | ++FusionNotBeneficial; |
| 884 | reportLoopFusion<OptimizationRemarkMissed>( |
| 885 | FC0, FC1, RemarkName: "FusionNotBeneficial" , RemarkMsg: "Fusion is not beneficial" ); |
| 886 | continue; |
| 887 | } |
| 888 | // All analysis has completed and has determined that fusion is legal |
| 889 | // and profitable. At this point, start transforming the code and |
| 890 | // perform fusion. |
| 891 | |
| 892 | // Execute the hoist/sink operations on preheader instructions |
| 893 | movePreheaderInsts(FC0, FC1, HoistInsts&: SafeToHoist, SinkInsts&: SafeToSink); |
| 894 | |
| 895 | LLVM_DEBUG(dbgs() << "\tFusion is performed: " << FC0 << " and " << FC1 |
| 896 | << "\n" ); |
| 897 | |
| 898 | FusionCandidate FC0Copy = FC0; |
| 899 | // Peel the loop after determining that fusion is legal. The Loops |
| 900 | // will still be safe to fuse after the peeling is performed. |
| 901 | bool Peel = TCDifference && *TCDifference > 0; |
| 902 | if (Peel) |
| 903 | peelFusionCandidate(FC0&: FC0Copy, FC1, PeelCount: *TCDifference); |
| 904 | |
| 905 | // Report fusion to the Optimization Remarks. |
| 906 | // Note this needs to be done *before* performFusion because |
| 907 | // performFusion will change the original loops, making it not |
| 908 | // possible to identify them after fusion is complete. |
| 909 | ++FuseCounter; |
| 910 | reportLoopFusion<OptimizationRemark>(FC0: (Peel ? FC0Copy : FC0), FC1, |
| 911 | RemarkName: "FuseCounter" , RemarkMsg: "Loops fused" ); |
| 912 | |
| 913 | FusionCandidate FusedCand(performFusion(FC0: (Peel ? FC0Copy : FC0), FC1), |
| 914 | DT, &PDT, ORE, FC0Copy.PP); |
| 915 | FusedCand.verify(); |
| 916 | assert(FusedCand.isEligibleForFusion(SE) && |
| 917 | "Fused candidate should be eligible for fusion!" ); |
| 918 | |
| 919 | // Notify the loop-depth-tree that these loops are not valid objects |
| 920 | LDT.removeLoop(L: FC1.L); |
| 921 | |
| 922 | // Replace FC0 and FC1 with their fused loop |
| 923 | It = CandidateList.erase(position: It); |
| 924 | It = CandidateList.erase(position: It); |
| 925 | It = CandidateList.insert(position: It, x: FusedCand); |
| 926 | |
| 927 | // Start from FusedCand in the next iteration |
| 928 | NextIt = It; |
| 929 | |
| 930 | LLVM_DEBUG(dbgs() << "Candidate List (after fusion): " << CandidateList |
| 931 | << "\n" ); |
| 932 | |
| 933 | Fused = true; |
| 934 | } |
| 935 | } |
| 936 | return Fused; |
| 937 | } |
| 938 | |
| 939 | // Returns true if the instruction \p I can be hoisted to the end of the |
| 940 | // preheader of \p FC0. \p SafeToHoist contains the instructions that are |
| 941 | // known to be safe to hoist. The instructions encountered that cannot be |
| 942 | // hoisted are in \p NotHoisting. |
| 943 | // TODO: Move functionality into CodeMoverUtils |
| 944 | bool canHoistInst(Instruction &I, |
| 945 | const SmallVector<Instruction *, 4> &SafeToHoist, |
| 946 | const SmallVector<Instruction *, 4> &NotHoisting, |
| 947 | const FusionCandidate &FC0) const { |
| 948 | const BasicBlock * = FC0.Preheader->getSingleSuccessor(); |
| 949 | assert(FC0PreheaderTarget && |
| 950 | "Expected single successor for loop preheader." ); |
| 951 | |
| 952 | for (Use &Op : I.operands()) { |
| 953 | if (auto *OpInst = dyn_cast<Instruction>(Val&: Op)) { |
| 954 | bool OpHoisted = is_contained(Range: SafeToHoist, Element: OpInst); |
| 955 | // Check if we have already decided to hoist this operand. In this |
| 956 | // case, it does not dominate FC0 *yet*, but will after we hoist it. |
| 957 | if (!(OpHoisted || DT.dominates(Def: OpInst, BB: FC0PreheaderTarget))) { |
| 958 | return false; |
| 959 | } |
| 960 | } |
| 961 | } |
| 962 | |
| 963 | // PHIs in FC1's header only have FC0 blocks as predecessors. PHIs |
| 964 | // cannot be hoisted and should be sunk to the exit of the fused loop. |
| 965 | if (isa<PHINode>(Val: I)) |
| 966 | return false; |
| 967 | |
| 968 | // If this isn't a memory inst, hoisting is safe |
| 969 | if (!I.mayReadOrWriteMemory()) |
| 970 | return true; |
| 971 | |
| 972 | LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n" ); |
| 973 | for (Instruction *NotHoistedInst : NotHoisting) { |
| 974 | if (auto D = DI.depends(Src: &I, Dst: NotHoistedInst)) { |
| 975 | // Dependency is not read-before-write, write-before-read or |
| 976 | // write-before-write |
| 977 | if (D->isFlow() || D->isAnti() || D->isOutput()) { |
| 978 | LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's " |
| 979 | "preheader that is not being hoisted.\n" ); |
| 980 | return false; |
| 981 | } |
| 982 | } |
| 983 | } |
| 984 | |
| 985 | for (Instruction *ReadInst : FC0.MemReads) { |
| 986 | if (auto D = DI.depends(Src: ReadInst, Dst: &I)) { |
| 987 | // Dependency is not read-before-write |
| 988 | if (D->isAnti()) { |
| 989 | LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n" ); |
| 990 | return false; |
| 991 | } |
| 992 | } |
| 993 | } |
| 994 | |
| 995 | for (Instruction *WriteInst : FC0.MemWrites) { |
| 996 | if (auto D = DI.depends(Src: WriteInst, Dst: &I)) { |
| 997 | // Dependency is not write-before-read or write-before-write |
| 998 | if (D->isFlow() || D->isOutput()) { |
| 999 | LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n" ); |
| 1000 | return false; |
| 1001 | } |
| 1002 | } |
| 1003 | } |
| 1004 | return true; |
| 1005 | } |
| 1006 | |
| 1007 | // Returns true if the instruction \p I can be sunk to the top of the exit |
| 1008 | // block of \p FC1. |
| 1009 | // TODO: Move functionality into CodeMoverUtils |
| 1010 | bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const { |
| 1011 | for (User *U : I.users()) { |
| 1012 | if (auto *UI{dyn_cast<Instruction>(Val: U)}) { |
| 1013 | // Cannot sink if user in loop |
| 1014 | // If FC1 has phi users of this value, we cannot sink it into FC1. |
| 1015 | if (FC1.L->contains(Inst: UI)) { |
| 1016 | // Cannot hoist or sink this instruction. No hoisting/sinking |
| 1017 | // should take place, loops should not fuse |
| 1018 | return false; |
| 1019 | } |
| 1020 | } |
| 1021 | } |
| 1022 | |
| 1023 | // If this isn't a memory inst, sinking is safe |
| 1024 | if (!I.mayReadOrWriteMemory()) |
| 1025 | return true; |
| 1026 | |
| 1027 | for (Instruction *ReadInst : FC1.MemReads) { |
| 1028 | if (auto D = DI.depends(Src: &I, Dst: ReadInst)) { |
| 1029 | // Dependency is not write-before-read |
| 1030 | if (D->isFlow()) { |
| 1031 | LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n" ); |
| 1032 | return false; |
| 1033 | } |
| 1034 | } |
| 1035 | } |
| 1036 | |
| 1037 | for (Instruction *WriteInst : FC1.MemWrites) { |
| 1038 | if (auto D = DI.depends(Src: &I, Dst: WriteInst)) { |
| 1039 | // Dependency is not write-before-write or read-before-write |
| 1040 | if (D->isOutput() || D->isAnti()) { |
| 1041 | LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n" ); |
| 1042 | return false; |
| 1043 | } |
| 1044 | } |
| 1045 | } |
| 1046 | |
| 1047 | return true; |
| 1048 | } |
| 1049 | |
| 1050 | /// Collect instructions in the \p FC1 Preheader that can be hoisted |
| 1051 | /// to the \p FC0 Preheader or sunk into the \p FC1 Body |
| 1052 | bool collectMovablePreheaderInsts( |
| 1053 | const FusionCandidate &FC0, const FusionCandidate &FC1, |
| 1054 | SmallVector<Instruction *, 4> &SafeToHoist, |
| 1055 | SmallVector<Instruction *, 4> &SafeToSink) const { |
| 1056 | BasicBlock * = FC1.Preheader; |
| 1057 | // Save the instructions that are not being hoisted, so we know not to hoist |
| 1058 | // mem insts that they dominate. |
| 1059 | SmallVector<Instruction *, 4> NotHoisting; |
| 1060 | |
| 1061 | for (Instruction &I : *FC1Preheader) { |
| 1062 | // Can't move a branch |
| 1063 | if (&I == FC1Preheader->getTerminator()) |
| 1064 | continue; |
| 1065 | // If the instruction has side-effects, give up. |
| 1066 | // TODO: The case of mayReadFromMemory we can handle but requires |
| 1067 | // additional work with a dependence analysis so for now we give |
| 1068 | // up on memory reads. |
| 1069 | if (I.mayThrow() || !I.willReturn()) { |
| 1070 | LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n" ); |
| 1071 | return false; |
| 1072 | } |
| 1073 | |
| 1074 | LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n" ); |
| 1075 | |
| 1076 | if (I.isAtomic() || I.isVolatile()) { |
| 1077 | LLVM_DEBUG( |
| 1078 | dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n" ); |
| 1079 | return false; |
| 1080 | } |
| 1081 | |
| 1082 | if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) { |
| 1083 | SafeToHoist.push_back(Elt: &I); |
| 1084 | LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n" ); |
| 1085 | } else { |
| 1086 | LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n" ); |
| 1087 | NotHoisting.push_back(Elt: &I); |
| 1088 | |
| 1089 | if (canSinkInst(I, FC1)) { |
| 1090 | SafeToSink.push_back(Elt: &I); |
| 1091 | LLVM_DEBUG(dbgs() << "\tSafe to sink.\n" ); |
| 1092 | } else { |
| 1093 | LLVM_DEBUG(dbgs() << "\tCould not sink.\n" ); |
| 1094 | return false; |
| 1095 | } |
| 1096 | } |
| 1097 | } |
| 1098 | LLVM_DEBUG( |
| 1099 | dbgs() << "All preheader instructions could be sunk or hoisted!\n" ); |
| 1100 | return true; |
| 1101 | } |
| 1102 | |
| 1103 | /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in |
| 1104 | /// @p L1) allow loop fusion of @p L0 and @p L1. |
| 1105 | bool dependencesAllowFusion(const FusionCandidate &FC0, |
| 1106 | const FusionCandidate &FC1, Instruction &I0, |
| 1107 | Instruction &I1) { |
| 1108 | #ifndef NDEBUG |
| 1109 | if (VerboseFusionDebugging) { |
| 1110 | LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << "\n" ); |
| 1111 | } |
| 1112 | #endif |
| 1113 | auto DepResult = DI.depends(Src: &I0, Dst: &I1); |
| 1114 | if (!DepResult) |
| 1115 | return true; |
| 1116 | #ifndef NDEBUG |
| 1117 | if (VerboseFusionDebugging) { |
| 1118 | LLVM_DEBUG(dbgs() << "DA res: " ; DepResult->dump(dbgs()); |
| 1119 | dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " |
| 1120 | << (DepResult->isOrdered() ? "true" : "false" ) |
| 1121 | << "]\n" ); |
| 1122 | LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() |
| 1123 | << "\n" ); |
| 1124 | } |
| 1125 | #endif |
| 1126 | unsigned Levels = DepResult->getLevels(); |
| 1127 | unsigned SameSDLevels = DepResult->getSameSDLevels(); |
| 1128 | unsigned CurLoopLevel = FC0.L->getLoopDepth(); |
| 1129 | |
| 1130 | // Check if DA is missing info regarding the current loop level |
| 1131 | if (CurLoopLevel > Levels + SameSDLevels) |
| 1132 | return false; |
| 1133 | |
| 1134 | // Iterating over the outer levels. |
| 1135 | for (unsigned Level = 1; Level <= std::min(a: CurLoopLevel - 1, b: Levels); |
| 1136 | ++Level) { |
| 1137 | unsigned Direction = DepResult->getDirection(Level, SameSD: false); |
| 1138 | |
| 1139 | // Check if the direction vector does not include equality. If an outer |
| 1140 | // loop has a non-equal direction, outer indicies are different and it |
| 1141 | // is safe to fuse. |
| 1142 | if (!(Direction & Dependence::DVEntry::EQ)) { |
| 1143 | LLVM_DEBUG(dbgs() << "Safe to fuse due to non-equal acceses in the " |
| 1144 | "outer loops\n" ); |
| 1145 | NumDA++; |
| 1146 | return true; |
| 1147 | } |
| 1148 | } |
| 1149 | |
| 1150 | assert(CurLoopLevel > Levels && "Fusion candidates are not separated" ); |
| 1151 | |
| 1152 | if (DepResult->isScalar(Level: CurLoopLevel, SameSD: true)) { |
| 1153 | if (DepResult->isInput() || DepResult->isOutput()) { |
| 1154 | LLVM_DEBUG(dbgs() << "Safe to fuse due to a loop-invariant " |
| 1155 | << (DepResult->isInput() ? "input" : "output" ) |
| 1156 | << " dependency\n" ); |
| 1157 | NumDA++; |
| 1158 | return true; |
| 1159 | } |
| 1160 | LLVM_DEBUG( |
| 1161 | dbgs() << "Not safe to fuse due to a scalar flow dependency\n" ); |
| 1162 | return false; |
| 1163 | } |
| 1164 | |
| 1165 | unsigned CurDir = DepResult->getDirection(Level: CurLoopLevel, SameSD: true); |
| 1166 | |
| 1167 | // Check if the direction vector does not include greater direction. In |
| 1168 | // that case, the dependency is not a backward loop-carried and is legal |
| 1169 | // to fuse. For example here we have a forward dependency |
| 1170 | // for (int i = 0; i < n; i++) |
| 1171 | // A[i] = ...; |
| 1172 | // for (int i = 0; i < n; i++) |
| 1173 | // ... = A[i-1]; |
| 1174 | if (!(CurDir & Dependence::DVEntry::GT)) { |
| 1175 | LLVM_DEBUG(dbgs() << "Safe to fuse with no backward loop-carried " |
| 1176 | "dependency\n" ); |
| 1177 | NumDA++; |
| 1178 | return true; |
| 1179 | } |
| 1180 | |
| 1181 | if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) |
| 1182 | LLVM_DEBUG(dbgs() << "TODO: Implement pred/succ dependence handling!\n" ); |
| 1183 | |
| 1184 | return false; |
| 1185 | } |
| 1186 | |
| 1187 | /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. |
| 1188 | bool dependencesAllowFusion(const FusionCandidate &FC0, |
| 1189 | const FusionCandidate &FC1) { |
| 1190 | LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 |
| 1191 | << "\n" ); |
| 1192 | assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); |
| 1193 | assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); |
| 1194 | |
| 1195 | for (Instruction *WriteL0 : FC0.MemWrites) { |
| 1196 | for (Instruction *WriteL1 : FC1.MemWrites) |
| 1197 | if (!dependencesAllowFusion(FC0, FC1, I0&: *WriteL0, I1&: *WriteL1)) { |
| 1198 | return false; |
| 1199 | } |
| 1200 | for (Instruction *ReadL1 : FC1.MemReads) |
| 1201 | if (!dependencesAllowFusion(FC0, FC1, I0&: *WriteL0, I1&: *ReadL1)) { |
| 1202 | return false; |
| 1203 | } |
| 1204 | } |
| 1205 | |
| 1206 | // Write-write and write-read pairs are already covered above; only the |
| 1207 | // read-before-write pairs from FC0 reads to FC1 writes remain. |
| 1208 | for (Instruction *ReadL0 : FC0.MemReads) |
| 1209 | for (Instruction *WriteL1 : FC1.MemWrites) |
| 1210 | if (!dependencesAllowFusion(FC0, FC1, I0&: *ReadL0, I1&: *WriteL1)) { |
| 1211 | return false; |
| 1212 | } |
| 1213 | |
| 1214 | // Walk through all uses in FC1. For each use, find the reaching def. If the |
| 1215 | // def is located in FC0 then it is not safe to fuse. |
| 1216 | for (BasicBlock *BB : FC1.L->blocks()) |
| 1217 | for (Instruction &I : *BB) |
| 1218 | for (auto &Op : I.operands()) |
| 1219 | if (Instruction *Def = dyn_cast<Instruction>(Val&: Op)) |
| 1220 | if (FC0.L->contains(BB: Def->getParent())) { |
| 1221 | return false; |
| 1222 | } |
| 1223 | |
| 1224 | return true; |
| 1225 | } |
| 1226 | |
| 1227 | /// Determine if two fusion candidates are strictly adjacent in the CFG. |
| 1228 | /// |
| 1229 | /// This method will determine if there are additional basic blocks in the CFG |
| 1230 | /// between the exit of \p FC0 and the entry of \p FC1. |
| 1231 | /// If the two candidates are guarded loops, then it checks whether the |
| 1232 | /// exit block of the \p FC0 is the predecessor of the \p FC1 preheader. This |
| 1233 | /// implicitly ensures that the non-loop successor of the \p FC0 guard branch |
| 1234 | /// is the entry block of \p FC1. If not, then the loops are not adjacent. If |
| 1235 | /// the two candidates are not guarded loops, then it checks whether the exit |
| 1236 | /// block of \p FC0 is the preheader of \p FC1. |
| 1237 | /// Strictly means there is no predecessor for FC1 unless it is from FC0, |
| 1238 | /// i.e., FC0 dominates FC1. |
| 1239 | bool isStrictlyAdjacent(const FusionCandidate &FC0, |
| 1240 | const FusionCandidate &FC1) const { |
| 1241 | // If the successor of the guard branch is FC1, then the loops are adjacent |
| 1242 | if (FC0.GuardBranch) |
| 1243 | return DT.dominates(A: FC0.getEntryBlock(), B: FC1.getEntryBlock()) && |
| 1244 | FC0.ExitBlock->getSingleSuccessor() == FC1.getEntryBlock(); |
| 1245 | return FC0.ExitBlock == FC1.getEntryBlock(); |
| 1246 | } |
| 1247 | |
| 1248 | bool isEmptyPreheader(const FusionCandidate &FC) const { |
| 1249 | return FC.Preheader->size() == 1; |
| 1250 | } |
| 1251 | |
| 1252 | /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader |
| 1253 | /// and sink others into the body of \p FC1. |
| 1254 | void movePreheaderInsts(const FusionCandidate &FC0, |
| 1255 | const FusionCandidate &FC1, |
| 1256 | SmallVector<Instruction *, 4> &HoistInsts, |
| 1257 | SmallVector<Instruction *, 4> &SinkInsts) const { |
| 1258 | // All preheader instructions except the branch must be hoisted or sunk |
| 1259 | assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 && |
| 1260 | "Attempting to sink and hoist preheader instructions, but not all " |
| 1261 | "the preheader instructions are accounted for." ); |
| 1262 | |
| 1263 | NumHoistedInsts += HoistInsts.size(); |
| 1264 | NumSunkInsts += SinkInsts.size(); |
| 1265 | |
| 1266 | LLVM_DEBUG(if (VerboseFusionDebugging) { |
| 1267 | if (!HoistInsts.empty()) |
| 1268 | dbgs() << "Hoisting: \n" ; |
| 1269 | for (Instruction *I : HoistInsts) |
| 1270 | dbgs() << *I << "\n" ; |
| 1271 | if (!SinkInsts.empty()) |
| 1272 | dbgs() << "Sinking: \n" ; |
| 1273 | for (Instruction *I : SinkInsts) |
| 1274 | dbgs() << *I << "\n" ; |
| 1275 | }); |
| 1276 | |
| 1277 | for (Instruction *I : HoistInsts) { |
| 1278 | assert(I->getParent() == FC1.Preheader); |
| 1279 | I->moveBefore(BB&: *FC0.Preheader, |
| 1280 | I: FC0.Preheader->getTerminator()->getIterator()); |
| 1281 | } |
| 1282 | // insert instructions in reverse order to maintain dominance relationship |
| 1283 | for (Instruction *I : reverse(C&: SinkInsts)) { |
| 1284 | assert(I->getParent() == FC1.Preheader); |
| 1285 | if (isa<PHINode>(Val: I)) { |
| 1286 | // The Phis to be sunk should have only one incoming value, as is |
| 1287 | // assured by the condition that the second loop is dominated by the |
| 1288 | // first one which is enforced by isStrictlyAdjacent(). |
| 1289 | // Replace the phi uses with the corresponding incoming value to clean |
| 1290 | // up the code. |
| 1291 | assert(cast<PHINode>(I)->getNumIncomingValues() == 1 && |
| 1292 | "Expected the sunk PHI node to have 1 incoming value." ); |
| 1293 | I->replaceAllUsesWith(V: I->getOperand(i: 0)); |
| 1294 | I->eraseFromParent(); |
| 1295 | } else |
| 1296 | I->moveBefore(BB&: *FC1.ExitBlock, I: FC1.ExitBlock->getFirstInsertionPt()); |
| 1297 | } |
| 1298 | } |
| 1299 | |
| 1300 | /// Determine if two fusion candidates have identical guards |
| 1301 | /// |
| 1302 | /// This method will determine if two fusion candidates have the same guards. |
| 1303 | /// The guards are considered the same if: |
| 1304 | /// 1. The instructions to compute the condition used in the compare are |
| 1305 | /// identical. |
| 1306 | /// 2. The successors of the guard have the same flow into/around the loop. |
| 1307 | /// If the compare instructions are identical, then the first successor of the |
| 1308 | /// guard must go to the same place (either the preheader of the loop or the |
| 1309 | /// NonLoopBlock). In other words, the first successor of both loops must |
| 1310 | /// both go into the loop (i.e., the preheader) or go around the loop (i.e., |
| 1311 | /// the NonLoopBlock). The same must be true for the second successor. |
| 1312 | bool haveIdenticalGuards(const FusionCandidate &FC0, |
| 1313 | const FusionCandidate &FC1) const { |
| 1314 | assert(FC0.GuardBranch && FC1.GuardBranch && |
| 1315 | "Expecting FC0 and FC1 to be guarded loops." ); |
| 1316 | |
| 1317 | auto *FC0CmpInst = dyn_cast<Instruction>(Val: FC0.GuardBranch->getCondition()); |
| 1318 | auto *FC1CmpInst = dyn_cast<Instruction>(Val: FC1.GuardBranch->getCondition()); |
| 1319 | if ((!FC0CmpInst || !FC1CmpInst) && |
| 1320 | FC0.GuardBranch->getCondition() != FC1.GuardBranch->getCondition()) |
| 1321 | return false; |
| 1322 | |
| 1323 | if (FC0CmpInst && FC1CmpInst && !FC0CmpInst->isIdenticalTo(I: FC1CmpInst)) |
| 1324 | return false; |
| 1325 | |
| 1326 | // The compare instructions are identical. |
| 1327 | // Now make sure the successor of the guards have the same flow into/around |
| 1328 | // the loop |
| 1329 | if (FC0.GuardBranch->getSuccessor(i: 0) == FC0.Preheader) |
| 1330 | return (FC1.GuardBranch->getSuccessor(i: 0) == FC1.Preheader); |
| 1331 | return (FC1.GuardBranch->getSuccessor(i: 1) == FC1.Preheader); |
| 1332 | } |
| 1333 | |
| 1334 | /// Modify the latch branch of FC to be unconditional since successors of the |
| 1335 | /// branch are the same. |
| 1336 | void simplifyLatchBranch(const FusionCandidate &FC) const { |
| 1337 | CondBrInst *FCLatchBranch = dyn_cast<CondBrInst>(Val: FC.Latch->getTerminator()); |
| 1338 | if (FCLatchBranch) { |
| 1339 | assert(FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && |
| 1340 | "Expecting the two successors of FCLatchBranch to be the same" ); |
| 1341 | UncondBrInst *NewBranch = |
| 1342 | UncondBrInst::Create(Target: FCLatchBranch->getSuccessor(i: 0)); |
| 1343 | ReplaceInstWithInst(From: FCLatchBranch, To: NewBranch); |
| 1344 | } |
| 1345 | } |
| 1346 | |
| 1347 | /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique |
| 1348 | /// successor, then merge FC0.Latch with its unique successor. |
| 1349 | void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { |
| 1350 | moveInstructionsToTheBeginning(FromBB&: *FC0.Latch, ToBB&: *FC1.Latch, DT, PDT, DI, SE); |
| 1351 | if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { |
| 1352 | MergeBlockIntoPredecessor(BB: Succ, DTU: &DTU, LI: &LI); |
| 1353 | DTU.flush(); |
| 1354 | } |
| 1355 | } |
| 1356 | |
| 1357 | /// Move FC1's header PHIs into FC0's header, insert the loop-carried PHIs |
| 1358 | /// needed to keep SSA valid when FC0 exits without taking its back-edge, and |
| 1359 | /// rewire both latches to form the fused loop. Latch dominator-tree updates |
| 1360 | /// are appended to \p TreeUpdates for the caller to apply. |
| 1361 | void rewireFusedHeaderPHIsAndLatches( |
| 1362 | const FusionCandidate &FC0, const FusionCandidate &FC1, |
| 1363 | const SmallVectorImpl<PHINode *> &OriginalFC0PHIs, |
| 1364 | SmallVectorImpl<DominatorTree::UpdateType> &TreeUpdates) { |
| 1365 | // Moves the phi nodes from the second to the first loops header block. |
| 1366 | while (PHINode *PHI = dyn_cast<PHINode>(Val: &FC1.Header->front())) { |
| 1367 | if (SE.isSCEVable(Ty: PHI->getType())) |
| 1368 | SE.forgetValue(V: PHI); |
| 1369 | if (PHI->hasNUsesOrMore(N: 1)) |
| 1370 | PHI->moveBefore(InsertPos: FC0.Header->getFirstInsertionPt()); |
| 1371 | else |
| 1372 | PHI->eraseFromParent(); |
| 1373 | } |
| 1374 | |
| 1375 | // Introduce new phi nodes in the second loop header to ensure |
| 1376 | // exiting the first and jumping to the header of the second does not break |
| 1377 | // the SSA property of the phis originally in the first loop. See also the |
| 1378 | // comment above. |
| 1379 | BasicBlock::iterator = FC1.Header->begin(); |
| 1380 | for (PHINode *LCPHI : OriginalFC0PHIs) { |
| 1381 | int L1LatchBBIdx = LCPHI->getBasicBlockIndex(BB: FC1.Latch); |
| 1382 | assert(L1LatchBBIdx >= 0 && |
| 1383 | "Expected loop carried value to be rewired at this point!" ); |
| 1384 | |
| 1385 | Value *LCV = LCPHI->getIncomingValue(i: L1LatchBBIdx); |
| 1386 | |
| 1387 | PHINode * = |
| 1388 | PHINode::Create(Ty: LCV->getType(), NumReservedValues: 2, NameStr: LCPHI->getName() + ".afterFC0" ); |
| 1389 | L1HeaderPHI->insertBefore(InsertPos: L1HeaderIP); |
| 1390 | L1HeaderPHI->addIncoming(V: LCV, BB: FC0.Latch); |
| 1391 | L1HeaderPHI->addIncoming(V: PoisonValue::get(T: LCV->getType()), |
| 1392 | BB: FC0.ExitingBlock); |
| 1393 | |
| 1394 | LCPHI->setIncomingValue(i: L1LatchBBIdx, V: L1HeaderPHI); |
| 1395 | } |
| 1396 | |
| 1397 | // Replace latch terminator destinations. |
| 1398 | FC0.Latch->getTerminator()->replaceUsesOfWith(From: FC0.Header, To: FC1.Header); |
| 1399 | FC1.Latch->getTerminator()->replaceUsesOfWith(From: FC1.Header, To: FC0.Header); |
| 1400 | |
| 1401 | // Modify the latch branch of FC0 to be unconditional as both successors of |
| 1402 | // the branch are the same. |
| 1403 | simplifyLatchBranch(FC: FC0); |
| 1404 | |
| 1405 | // If FC0.Latch and FC0.ExitingBlock are the same then we have already |
| 1406 | // performed the updates above. |
| 1407 | if (FC0.Latch != FC0.ExitingBlock) |
| 1408 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1409 | DominatorTree::Insert, FC0.Latch, FC1.Header)); |
| 1410 | |
| 1411 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
| 1412 | FC0.Latch, FC0.Header)); |
| 1413 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Insert, |
| 1414 | FC1.Latch, FC0.Header)); |
| 1415 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
| 1416 | FC1.Latch, FC1.Header)); |
| 1417 | } |
| 1418 | |
| 1419 | /// Forget cached SCEV state for both loops, move all of FC1's blocks and |
| 1420 | /// child loops into FC0, erase the now-empty FC1, and merge the latches. |
| 1421 | /// Returns the fused loop (FC0.L). |
| 1422 | Loop *finalizeFusedLoop(const FusionCandidate &FC0, |
| 1423 | const FusionCandidate &FC1) { |
| 1424 | // Is there a way to keep SE up-to-date so we don't need to forget the loops |
| 1425 | // and rebuild the information in subsequent passes of fusion? |
| 1426 | // Note: Need to forget the loops before merging the loop latches, as |
| 1427 | // mergeLatch may remove the only block in FC1. |
| 1428 | SE.forgetLoop(L: FC1.L); |
| 1429 | SE.forgetLoop(L: FC0.L); |
| 1430 | |
| 1431 | // Merge the loops. |
| 1432 | SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); |
| 1433 | for (BasicBlock *BB : Blocks) { |
| 1434 | FC0.L->addBlockEntry(BB); |
| 1435 | FC1.L->removeBlockFromLoop(BB); |
| 1436 | if (LI.getLoopFor(BB) != FC1.L) |
| 1437 | continue; |
| 1438 | LI.changeLoopFor(BB, L: FC0.L); |
| 1439 | } |
| 1440 | while (!FC1.L->isInnermost()) { |
| 1441 | const auto &ChildLoopIt = FC1.L->begin(); |
| 1442 | Loop *ChildLoop = *ChildLoopIt; |
| 1443 | FC1.L->removeChildLoop(I: ChildLoopIt); |
| 1444 | FC0.L->addChildLoop(NewChild: ChildLoop); |
| 1445 | } |
| 1446 | |
| 1447 | // Delete the now empty loop L1. |
| 1448 | LI.erase(L: FC1.L); |
| 1449 | |
| 1450 | // Forget block dispositions as well, so that there are no dangling |
| 1451 | // pointers to erased/free'ed blocks. It should be done after mergeLatch() |
| 1452 | // since merging the latches may affect the dispositions. |
| 1453 | SE.forgetBlockAndLoopDispositions(); |
| 1454 | |
| 1455 | // Move instructions from FC0.Latch to FC1.Latch. |
| 1456 | // Note: mergeLatch requires an updated DT. |
| 1457 | mergeLatch(FC0, FC1); |
| 1458 | |
| 1459 | #ifndef NDEBUG |
| 1460 | assert(!verifyFunction(*FC0.Header->getParent(), &errs())); |
| 1461 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
| 1462 | assert(PDT.verify()); |
| 1463 | LI.verify(DT); |
| 1464 | SE.verify(); |
| 1465 | #endif |
| 1466 | |
| 1467 | LLVM_DEBUG(dbgs() << "Fusion done:\n" ); |
| 1468 | |
| 1469 | return FC0.L; |
| 1470 | } |
| 1471 | |
| 1472 | /// Fuse two fusion candidates, creating a new fused loop. |
| 1473 | /// |
| 1474 | /// This method contains the mechanics of fusing two loops, represented by \p |
| 1475 | /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1 |
| 1476 | /// postdominates \p FC0 (making them control flow equivalent). It also |
| 1477 | /// assumes that the other conditions for fusion have been met: adjacent, |
| 1478 | /// identical trip counts, and no negative distance dependencies exist that |
| 1479 | /// would prevent fusion. Thus, there is no checking for these conditions in |
| 1480 | /// this method. |
| 1481 | /// |
| 1482 | /// Fusion is performed by rewiring the CFG to update successor blocks of the |
| 1483 | /// components of tho loop. Specifically, the following changes are done: |
| 1484 | /// |
| 1485 | /// 1. The preheader of \p FC1 is removed as it is no longer necessary |
| 1486 | /// (because it is currently only a single statement block). |
| 1487 | /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1. |
| 1488 | /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0. |
| 1489 | /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0. |
| 1490 | /// |
| 1491 | /// All of these modifications are done with dominator tree updates, thus |
| 1492 | /// keeping the dominator (and post dominator) information up-to-date. |
| 1493 | /// |
| 1494 | /// This can be improved in the future by actually merging blocks during |
| 1495 | /// fusion. For example, the preheader of \p FC1 can be merged with the |
| 1496 | /// preheader of \p FC0. This would allow loops with more than a single |
| 1497 | /// statement in the preheader to be fused. Similarly, the latch blocks of the |
| 1498 | /// two loops could also be fused into a single block. This will require |
| 1499 | /// analysis to prove it is safe to move the contents of the block past |
| 1500 | /// existing code, which currently has not been implemented. |
| 1501 | Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) { |
| 1502 | assert(FC0.isValid() && FC1.isValid() && |
| 1503 | "Expecting valid fusion candidates" ); |
| 1504 | |
| 1505 | LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n" ; FC0.dump(); |
| 1506 | dbgs() << "Fusion Candidate 1: \n" ; FC1.dump();); |
| 1507 | |
| 1508 | // Move instructions from the preheader of FC1 to the end of the preheader |
| 1509 | // of FC0. |
| 1510 | moveInstructionsToTheEnd(FromBB&: *FC1.Preheader, ToBB&: *FC0.Preheader, DT, PDT, DI, SE); |
| 1511 | |
| 1512 | // Fusing guarded loops is handled slightly differently than non-guarded |
| 1513 | // loops and has been broken out into a separate method instead of trying to |
| 1514 | // intersperse the logic within a single method. |
| 1515 | if (FC0.GuardBranch) |
| 1516 | return fuseGuardedLoops(FC0, FC1); |
| 1517 | |
| 1518 | assert(FC1.Preheader == |
| 1519 | (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock)); |
| 1520 | assert(FC1.Preheader->size() == 1 && |
| 1521 | FC1.Preheader->getSingleSuccessor() == FC1.Header); |
| 1522 | |
| 1523 | // Remember the phi nodes originally in the header of FC0 in order to rewire |
| 1524 | // them later. However, this is only necessary if the new loop carried |
| 1525 | // values might not dominate the exiting branch. While we do not generally |
| 1526 | // test if this is the case but simply insert intermediate phi nodes, we |
| 1527 | // need to make sure these intermediate phi nodes have different |
| 1528 | // predecessors. To this end, we filter the special case where the exiting |
| 1529 | // block is the latch block of the first loop. Nothing needs to be done |
| 1530 | // anyway as all loop carried values dominate the latch and thereby also the |
| 1531 | // exiting branch. |
| 1532 | SmallVector<PHINode *, 8> OriginalFC0PHIs; |
| 1533 | if (FC0.ExitingBlock != FC0.Latch) |
| 1534 | for (PHINode &PHI : FC0.Header->phis()) |
| 1535 | OriginalFC0PHIs.push_back(Elt: &PHI); |
| 1536 | |
| 1537 | // Replace incoming blocks for header PHIs first. |
| 1538 | FC1.Preheader->replaceSuccessorsPhiUsesWith(New: FC0.Preheader); |
| 1539 | FC0.Latch->replaceSuccessorsPhiUsesWith(New: FC1.Latch); |
| 1540 | |
| 1541 | // Then modify the control flow and update DT and PDT. |
| 1542 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
| 1543 | |
| 1544 | // The old exiting block of the first loop (FC0) has to jump to the header |
| 1545 | // of the second as we need to execute the code in the second header block |
| 1546 | // regardless of the trip count. That is, if the trip count is 0, so the |
| 1547 | // back edge is never taken, we still have to execute both loop headers, |
| 1548 | // especially (but not only!) if the second is a do-while style loop. |
| 1549 | // However, doing so might invalidate the phi nodes of the first loop as |
| 1550 | // the new values do only need to dominate their latch and not the exiting |
| 1551 | // predicate. To remedy this potential problem we always introduce phi |
| 1552 | // nodes in the header of the second loop later that select the loop carried |
| 1553 | // value, if the second header was reached through an old latch of the |
| 1554 | // first, or undef otherwise. This is sound as exiting the first implies the |
| 1555 | // second will exit too, __without__ taking the back-edge. [Their |
| 1556 | // trip-counts are equal after all. |
| 1557 | // KB: Would this sequence be simpler to just make FC0.ExitingBlock go |
| 1558 | // to FC1.Header? I think this is basically what the three sequences are |
| 1559 | // trying to accomplish; however, doing this directly in the CFG may mean |
| 1560 | // the DT/PDT becomes invalid |
| 1561 | if (!FC0.Peeled) { |
| 1562 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC1.Preheader, |
| 1563 | To: FC1.Header); |
| 1564 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1565 | DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); |
| 1566 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1567 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
| 1568 | } else { |
| 1569 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1570 | DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader)); |
| 1571 | |
| 1572 | // Remove the ExitBlock of the first Loop (also not needed) |
| 1573 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC0.ExitBlock, |
| 1574 | To: FC1.Header); |
| 1575 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1576 | DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); |
| 1577 | FC0.ExitBlock->getTerminator()->eraseFromParent(); |
| 1578 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1579 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
| 1580 | new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); |
| 1581 | } |
| 1582 | |
| 1583 | // The pre-header of L1 is not necessary anymore. |
| 1584 | assert(pred_empty(FC1.Preheader)); |
| 1585 | FC1.Preheader->getTerminator()->eraseFromParent(); |
| 1586 | new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); |
| 1587 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1588 | DominatorTree::Delete, FC1.Preheader, FC1.Header)); |
| 1589 | |
| 1590 | rewireFusedHeaderPHIsAndLatches(FC0, FC1, OriginalFC0PHIs, TreeUpdates); |
| 1591 | |
| 1592 | // Update DT/PDT |
| 1593 | DTU.applyUpdates(Updates: TreeUpdates); |
| 1594 | |
| 1595 | LI.removeBlock(BB: FC1.Preheader); |
| 1596 | DTU.deleteBB(DelBB: FC1.Preheader); |
| 1597 | if (FC0.Peeled) { |
| 1598 | LI.removeBlock(BB: FC0.ExitBlock); |
| 1599 | DTU.deleteBB(DelBB: FC0.ExitBlock); |
| 1600 | } |
| 1601 | |
| 1602 | DTU.flush(); |
| 1603 | |
| 1604 | return finalizeFusedLoop(FC0, FC1); |
| 1605 | } |
| 1606 | |
| 1607 | /// Report details on loop fusion opportunities. |
| 1608 | /// |
| 1609 | /// This template function can be used to report both successful and missed |
| 1610 | /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should |
| 1611 | /// be one of: |
| 1612 | /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful |
| 1613 | /// given two valid fusion candidates. |
| 1614 | /// - OptimizationRemark to report successful fusion of two fusion |
| 1615 | /// candidates. |
| 1616 | /// The remarks will be printed using the form: |
| 1617 | /// <path/filename>:<line number>:<column number>: [<function name>]: |
| 1618 | /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> |
| 1619 | template <typename RemarkKind> |
| 1620 | void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, |
| 1621 | StringRef , StringRef ) { |
| 1622 | assert(FC0.Preheader && FC1.Preheader && |
| 1623 | "Expecting valid fusion candidates" ); |
| 1624 | using namespace ore; |
| 1625 | ORE.emit( |
| 1626 | RemarkKind(DEBUG_TYPE, RemarkName, FC0.L->getStartLoc(), FC0.Preheader) |
| 1627 | << "[" << FC0.Preheader->getParent()->getName() |
| 1628 | << "]: " << NV("Cand1" , StringRef(FC0.Preheader->getName())) << " and " |
| 1629 | << NV("Cand2" , StringRef(FC1.Preheader->getName())) << ": " |
| 1630 | << RemarkMsg); |
| 1631 | } |
| 1632 | |
| 1633 | /// Fuse two guarded fusion candidates, creating a new fused loop. |
| 1634 | /// |
| 1635 | /// Fusing guarded loops is handled much the same way as fusing non-guarded |
| 1636 | /// loops. The rewiring of the CFG is slightly different though, because of |
| 1637 | /// the presence of the guards around the loops and the exit blocks after the |
| 1638 | /// loop body. As such, the new loop is rewired as follows: |
| 1639 | /// 1. Keep the guard branch from FC0 and use the non-loop block target |
| 1640 | /// from the FC1 guard branch. |
| 1641 | /// 2. Remove the exit block from FC0 (this exit block should be empty |
| 1642 | /// right now). |
| 1643 | /// 3. Remove the guard branch for FC1 |
| 1644 | /// 4. Remove the preheader for FC1. |
| 1645 | /// The exit block successor for the latch of FC0 is updated to be the header |
| 1646 | /// of FC1 and the non-exit block successor of the latch of FC1 is updated to |
| 1647 | /// be the header of FC0, thus creating the fused loop. |
| 1648 | Loop *fuseGuardedLoops(const FusionCandidate &FC0, |
| 1649 | const FusionCandidate &FC1) { |
| 1650 | assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops" ); |
| 1651 | |
| 1652 | BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); |
| 1653 | BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); |
| 1654 | BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); |
| 1655 | BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); |
| 1656 | BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor(); |
| 1657 | |
| 1658 | // Move instructions from the exit block of FC0 to the beginning of the exit |
| 1659 | // block of FC1, in the case that the FC0 loop has not been peeled. In the |
| 1660 | // case that FC0 loop is peeled, then move the instructions of the successor |
| 1661 | // of the FC0 Exit block to the beginning of the exit block of FC1. |
| 1662 | moveInstructionsToTheBeginning( |
| 1663 | FromBB&: (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), ToBB&: *FC1.ExitBlock, |
| 1664 | DT, PDT, DI, SE); |
| 1665 | |
| 1666 | // Move instructions from the guard block of FC1 to the end of the guard |
| 1667 | // block of FC0. |
| 1668 | moveInstructionsToTheEnd(FromBB&: *FC1GuardBlock, ToBB&: *FC0GuardBlock, DT, PDT, DI, SE); |
| 1669 | |
| 1670 | assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent" ); |
| 1671 | |
| 1672 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
| 1673 | |
| 1674 | //////////////////////////////////////////////////////////////////////////// |
| 1675 | // Update the Loop Guard |
| 1676 | //////////////////////////////////////////////////////////////////////////// |
| 1677 | // The guard for FC0 is updated to guard both FC0 and FC1. This is done by |
| 1678 | // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. |
| 1679 | // Thus, one path from the guard goes to the preheader for FC0 (and thus |
| 1680 | // executes the new fused loop) and the other path goes to the NonLoopBlock |
| 1681 | // for FC1 (where FC1 guard would have gone if FC1 was not executed). |
| 1682 | FC1NonLoopBlock->replacePhiUsesWith(Old: FC1GuardBlock, New: FC0GuardBlock); |
| 1683 | FC0.GuardBranch->replaceUsesOfWith(From: FC0NonLoopBlock, To: FC1NonLoopBlock); |
| 1684 | |
| 1685 | BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock; |
| 1686 | BBToUpdate->getTerminator()->replaceUsesOfWith(From: FC1GuardBlock, To: FC1.Header); |
| 1687 | |
| 1688 | // The guard of FC1 is not necessary anymore. |
| 1689 | FC1.GuardBranch->eraseFromParent(); |
| 1690 | new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); |
| 1691 | |
| 1692 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1693 | DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); |
| 1694 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1695 | DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); |
| 1696 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1697 | DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); |
| 1698 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1699 | DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); |
| 1700 | |
| 1701 | if (FC0.Peeled) { |
| 1702 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1703 | DominatorTree::Delete, FC0.ExitBlock, FC0ExitBlockSuccessor)); |
| 1704 | // Remove the Block after the ExitBlock of FC0 |
| 1705 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1706 | DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock)); |
| 1707 | FC0ExitBlockSuccessor->getTerminator()->eraseFromParent(); |
| 1708 | new UnreachableInst(FC0ExitBlockSuccessor->getContext(), |
| 1709 | FC0ExitBlockSuccessor); |
| 1710 | } |
| 1711 | |
| 1712 | assert(pred_empty(FC1GuardBlock) && |
| 1713 | "Expecting guard block to have no predecessors" ); |
| 1714 | assert(succ_empty(FC1GuardBlock) && |
| 1715 | "Expecting guard block to have no successors" ); |
| 1716 | |
| 1717 | // Remember the phi nodes originally in the header of FC0 in order to rewire |
| 1718 | // them later. However, this is only necessary if the new loop carried |
| 1719 | // values might not dominate the exiting branch. While we do not generally |
| 1720 | // test if this is the case but simply insert intermediate phi nodes, we |
| 1721 | // need to make sure these intermediate phi nodes have different |
| 1722 | // predecessors. To this end, we filter the special case where the exiting |
| 1723 | // block is the latch block of the first loop. Nothing needs to be done |
| 1724 | // anyway as all loop carried values dominate the latch and thereby also the |
| 1725 | // exiting branch. |
| 1726 | // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch |
| 1727 | // (because the loops are rotated. Thus, nothing will ever be added to |
| 1728 | // OriginalFC0PHIs. |
| 1729 | SmallVector<PHINode *, 8> OriginalFC0PHIs; |
| 1730 | if (FC0.ExitingBlock != FC0.Latch) |
| 1731 | for (PHINode &PHI : FC0.Header->phis()) |
| 1732 | OriginalFC0PHIs.push_back(Elt: &PHI); |
| 1733 | |
| 1734 | assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!" ); |
| 1735 | |
| 1736 | // Replace incoming blocks for header PHIs first. |
| 1737 | FC1.Preheader->replaceSuccessorsPhiUsesWith(New: FC0.Preheader); |
| 1738 | FC0.Latch->replaceSuccessorsPhiUsesWith(New: FC1.Latch); |
| 1739 | |
| 1740 | // The old exiting block of the first loop (FC0) has to jump to the header |
| 1741 | // of the second as we need to execute the code in the second header block |
| 1742 | // regardless of the trip count. That is, if the trip count is 0, so the |
| 1743 | // back edge is never taken, we still have to execute both loop headers, |
| 1744 | // especially (but not only!) if the second is a do-while style loop. |
| 1745 | // However, doing so might invalidate the phi nodes of the first loop as |
| 1746 | // the new values do only need to dominate their latch and not the exiting |
| 1747 | // predicate. To remedy this potential problem we always introduce phi |
| 1748 | // nodes in the header of the second loop later that select the loop carried |
| 1749 | // value, if the second header was reached through an old latch of the |
| 1750 | // first, or undef otherwise. This is sound as exiting the first implies the |
| 1751 | // second will exit too, __without__ taking the back-edge (their |
| 1752 | // trip-counts are equal after all). |
| 1753 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC0.ExitBlock, |
| 1754 | To: FC1.Header); |
| 1755 | |
| 1756 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1757 | DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); |
| 1758 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1759 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
| 1760 | |
| 1761 | // Remove FC0 Exit Block |
| 1762 | // The exit block for FC0 is no longer needed since control will flow |
| 1763 | // directly to the header of FC1. Since it is an empty block, it can be |
| 1764 | // removed at this point. |
| 1765 | // TODO: In the future, we can handle non-empty exit blocks my merging any |
| 1766 | // instructions from FC0 exit block into FC1 exit block prior to removing |
| 1767 | // the block. |
| 1768 | assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty" ); |
| 1769 | FC0.ExitBlock->getTerminator()->eraseFromParent(); |
| 1770 | new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); |
| 1771 | |
| 1772 | // Remove FC1 Preheader |
| 1773 | // The pre-header of L1 is not necessary anymore. |
| 1774 | assert(pred_empty(FC1.Preheader)); |
| 1775 | FC1.Preheader->getTerminator()->eraseFromParent(); |
| 1776 | new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); |
| 1777 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1778 | DominatorTree::Delete, FC1.Preheader, FC1.Header)); |
| 1779 | |
| 1780 | rewireFusedHeaderPHIsAndLatches(FC0, FC1, OriginalFC0PHIs, TreeUpdates); |
| 1781 | |
| 1782 | // All done |
| 1783 | // Apply the updates to the Dominator Tree and cleanup. |
| 1784 | |
| 1785 | assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!" ); |
| 1786 | assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!" ); |
| 1787 | |
| 1788 | // Update DT/PDT |
| 1789 | DTU.applyUpdates(Updates: TreeUpdates); |
| 1790 | |
| 1791 | LI.removeBlock(BB: FC1GuardBlock); |
| 1792 | LI.removeBlock(BB: FC1.Preheader); |
| 1793 | LI.removeBlock(BB: FC0.ExitBlock); |
| 1794 | if (FC0.Peeled) { |
| 1795 | LI.removeBlock(BB: FC0ExitBlockSuccessor); |
| 1796 | DTU.deleteBB(DelBB: FC0ExitBlockSuccessor); |
| 1797 | } |
| 1798 | DTU.deleteBB(DelBB: FC1GuardBlock); |
| 1799 | DTU.deleteBB(DelBB: FC1.Preheader); |
| 1800 | DTU.deleteBB(DelBB: FC0.ExitBlock); |
| 1801 | DTU.flush(); |
| 1802 | |
| 1803 | return finalizeFusedLoop(FC0, FC1); |
| 1804 | } |
| 1805 | }; |
| 1806 | } // namespace |
| 1807 | |
| 1808 | PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { |
| 1809 | auto &LI = AM.getResult<LoopAnalysis>(IR&: F); |
| 1810 | auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
| 1811 | auto &DI = AM.getResult<DependenceAnalysis>(IR&: F); |
| 1812 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
| 1813 | auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(IR&: F); |
| 1814 | auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
| 1815 | auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F); |
| 1816 | const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
| 1817 | |
| 1818 | // Ensure loops are in simplifed form which is a pre-requisite for loop fusion |
| 1819 | // pass. Added only for new PM since the legacy PM has already added |
| 1820 | // LoopSimplify pass as a dependency. |
| 1821 | bool Changed = false; |
| 1822 | for (auto &L : LI) { |
| 1823 | Changed |= |
| 1824 | simplifyLoop(L, DT: &DT, LI: &LI, SE: &SE, AC: &AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */); |
| 1825 | } |
| 1826 | if (Changed) |
| 1827 | PDT.recalculate(Func&: F); |
| 1828 | |
| 1829 | LoopFuser LF(LI, DT, DI, SE, PDT, ORE, AC, TTI); |
| 1830 | Changed |= LF.fuseLoops(F); |
| 1831 | if (!Changed) |
| 1832 | return PreservedAnalyses::all(); |
| 1833 | |
| 1834 | PreservedAnalyses PA; |
| 1835 | PA.preserve<DominatorTreeAnalysis>(); |
| 1836 | PA.preserve<PostDominatorTreeAnalysis>(); |
| 1837 | PA.preserve<ScalarEvolutionAnalysis>(); |
| 1838 | PA.preserve<LoopAnalysis>(); |
| 1839 | return PA; |
| 1840 | } |
| 1841 | |