//===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the loop fusion pass.
/// The implementation is largely based on the following document:
///
///       Code Transformations to Augment the Scope of Loop Fusion in a
///         Production Compiler
///       Christopher Mark Barton
///       MSc Thesis
///       https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
///
/// The general approach taken is to collect sets of control flow equivalent
/// loops and test whether they can be fused. The necessary conditions for
/// fusion are:
///    1. The loops must be adjacent (there cannot be any statements between
///       the two loops).
///    2. The loops must be conforming (they must execute the same number of
///       iterations).
///    3. The loops must be control flow equivalent (if one loop executes, the
///       other is guaranteed to execute).
///    4. There cannot be any negative distance dependencies between the loops.
/// If all of these conditions are satisfied, it is safe to fuse the loops.
///
/// This implementation creates FusionCandidates that represent the loop and the
/// necessary information needed by fusion. It then operates on the fusion
/// candidates, first confirming that the candidate is eligible for fusion. The
/// candidates are then collected into control flow equivalent sets, sorted in
/// dominance order. Each set of control flow equivalent candidates is then
/// traversed, attempting to fuse pairs of candidates in the set. If all
/// requirements for fusion are met, the two candidates are fused, creating a
/// new (fused) candidate which is then added back into the set to consider for
/// additional fusion.
///
/// This implementation currently does not make any modifications to remove
/// conditions for fusion. Code transformations to make loops conform to each of
/// the conditions for fusion are discussed in more detail in the document
/// above. These can be added to the current implementation in the future.
//===----------------------------------------------------------------------===//
| 46 | |
#include "llvm/Transforms/Scalar/LoopFuse.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeMoverUtils.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include <list>
#include <utility>
| 68 | |
| 69 | using namespace llvm; |
| 70 | |
| 71 | #define DEBUG_TYPE "loop-fusion" |
| 72 | |
| 73 | STATISTIC(FuseCounter, "Loops fused" ); |
| 74 | STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion" ); |
| 75 | STATISTIC(, "Loop has invalid preheader" ); |
| 76 | STATISTIC(, "Loop has invalid header" ); |
| 77 | STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks" ); |
| 78 | STATISTIC(InvalidExitBlock, "Loop has invalid exit block" ); |
| 79 | STATISTIC(InvalidLatch, "Loop has invalid latch" ); |
| 80 | STATISTIC(InvalidLoop, "Loop is invalid" ); |
| 81 | STATISTIC(AddressTakenBB, "Basic block has address taken" ); |
| 82 | STATISTIC(MayThrowException, "Loop may throw an exception" ); |
| 83 | STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access" ); |
| 84 | STATISTIC(NotSimplifiedForm, "Loop is not in simplified form" ); |
| 85 | STATISTIC(InvalidDependencies, "Dependencies prevent fusion" ); |
| 86 | STATISTIC(UnknownTripCount, "Loop has unknown trip count" ); |
| 87 | STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop" ); |
| 88 | STATISTIC(NonEqualTripCount, "Loop trip counts are not the same" ); |
| 89 | STATISTIC( |
| 90 | , |
| 91 | "Loop has a non-empty preheader with instructions that cannot be moved" ); |
| 92 | STATISTIC(FusionNotBeneficial, "Fusion is not beneficial" ); |
| 93 | STATISTIC(NonIdenticalGuards, "Candidates have different guards" ); |
| 94 | STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with " |
| 95 | "instructions that cannot be moved" ); |
| 96 | STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with " |
| 97 | "instructions that cannot be moved" ); |
| 98 | STATISTIC(NotRotated, "Candidate is not rotated" ); |
| 99 | STATISTIC(OnlySecondCandidateIsGuarded, |
| 100 | "The second candidate is guarded while the first one is not" ); |
| 101 | STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions." ); |
| 102 | STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions." ); |
| 103 | STATISTIC(NumDA, "DA checks passed" ); |
| 104 | |
/// Selects which analysis loop fusion consults when checking dependences.
/// The value is chosen on the command line through the
/// -loop-fusion-dependence-analysis option.
enum FusionDependenceAnalysisChoice {
  FUSION_DEPENDENCE_ANALYSIS_SCEV, ///< Use the scalar evolution interface.
  FUSION_DEPENDENCE_ANALYSIS_DA,   ///< Use the dependence analysis interface.
  FUSION_DEPENDENCE_ANALYSIS_ALL,  ///< Use all available analyses.
};
| 110 | |
| 111 | static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis( |
| 112 | "loop-fusion-dependence-analysis" , |
| 113 | cl::desc("Which dependence analysis should loop fusion use?" ), |
| 114 | cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev" , |
| 115 | "Use the scalar evolution interface" ), |
| 116 | clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da" , |
| 117 | "Use the dependence analysis interface" ), |
| 118 | clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all" , |
| 119 | "Use all available analyses" )), |
| 120 | cl::Hidden, cl::init(Val: FUSION_DEPENDENCE_ANALYSIS_ALL)); |
| 121 | |
| 122 | static cl::opt<unsigned> FusionPeelMaxCount( |
| 123 | "loop-fusion-peel-max-count" , cl::init(Val: 0), cl::Hidden, |
| 124 | cl::desc("Max number of iterations to be peeled from a loop, such that " |
| 125 | "fusion can take place" )); |
| 126 | |
| 127 | #ifndef NDEBUG |
| 128 | static cl::opt<bool> |
| 129 | VerboseFusionDebugging("loop-fusion-verbose-debug" , |
| 130 | cl::desc("Enable verbose debugging for Loop Fusion" ), |
| 131 | cl::Hidden, cl::init(false)); |
| 132 | #endif |
| 133 | |
| 134 | namespace { |
| 135 | /// This class is used to represent a candidate for loop fusion. When it is |
| 136 | /// constructed, it checks the conditions for loop fusion to ensure that it |
| 137 | /// represents a valid candidate. It caches several parts of a loop that are |
| 138 | /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead |
| 139 | /// of continually querying the underlying Loop to retrieve these values. It is |
| 140 | /// assumed these will not change throughout loop fusion. |
| 141 | /// |
| 142 | /// The invalidate method should be used to indicate that the FusionCandidate is |
| 143 | /// no longer a valid candidate for fusion. Similarly, the isValid() method can |
| 144 | /// be used to ensure that the FusionCandidate is still valid for fusion. |
| 145 | struct FusionCandidate { |
| 146 | /// Cache of parts of the loop used throughout loop fusion. These should not |
| 147 | /// need to change throughout the analysis and transformation. |
| 148 | /// These parts are cached to avoid repeatedly looking up in the Loop class. |
| 149 | |
| 150 | /// Preheader of the loop this candidate represents |
| 151 | BasicBlock *Preheader; |
| 152 | /// Header of the loop this candidate represents |
| 153 | BasicBlock *Header; |
| 154 | /// Blocks in the loop that exit the loop |
| 155 | BasicBlock *ExitingBlock; |
| 156 | /// The successor block of this loop (where the exiting blocks go to) |
| 157 | BasicBlock *ExitBlock; |
| 158 | /// Latch of the loop |
| 159 | BasicBlock *Latch; |
| 160 | /// The loop that this fusion candidate represents |
| 161 | Loop *L; |
| 162 | /// Vector of instructions in this loop that read from memory |
| 163 | SmallVector<Instruction *, 16> MemReads; |
| 164 | /// Vector of instructions in this loop that write to memory |
| 165 | SmallVector<Instruction *, 16> MemWrites; |
| 166 | /// Are all of the members of this fusion candidate still valid |
| 167 | bool Valid; |
| 168 | /// Guard branch of the loop, if it exists |
| 169 | BranchInst *GuardBranch; |
| 170 | /// Peeling Paramaters of the Loop. |
| 171 | TTI::PeelingPreferences PP; |
| 172 | /// Can you Peel this Loop? |
| 173 | bool AbleToPeel; |
| 174 | /// Has this loop been Peeled |
| 175 | bool Peeled; |
| 176 | |
| 177 | DominatorTree &DT; |
| 178 | const PostDominatorTree *PDT; |
| 179 | |
| 180 | OptimizationRemarkEmitter &ORE; |
| 181 | |
| 182 | FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT, |
| 183 | OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP) |
| 184 | : Preheader(L->getLoopPreheader()), Header(L->getHeader()), |
| 185 | ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), |
| 186 | Latch(L->getLoopLatch()), L(L), Valid(true), |
| 187 | GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), |
| 188 | Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { |
| 189 | |
| 190 | // Walk over all blocks in the loop and check for conditions that may |
| 191 | // prevent fusion. For each block, walk over all instructions and collect |
| 192 | // the memory reads and writes If any instructions that prevent fusion are |
| 193 | // found, invalidate this object and return. |
| 194 | for (BasicBlock *BB : L->blocks()) { |
| 195 | if (BB->hasAddressTaken()) { |
| 196 | invalidate(); |
| 197 | reportInvalidCandidate(Stat&: AddressTakenBB); |
| 198 | return; |
| 199 | } |
| 200 | |
| 201 | for (Instruction &I : *BB) { |
| 202 | if (I.mayThrow()) { |
| 203 | invalidate(); |
| 204 | reportInvalidCandidate(Stat&: MayThrowException); |
| 205 | return; |
| 206 | } |
| 207 | if (StoreInst *SI = dyn_cast<StoreInst>(Val: &I)) { |
| 208 | if (SI->isVolatile()) { |
| 209 | invalidate(); |
| 210 | reportInvalidCandidate(Stat&: ContainsVolatileAccess); |
| 211 | return; |
| 212 | } |
| 213 | } |
| 214 | if (LoadInst *LI = dyn_cast<LoadInst>(Val: &I)) { |
| 215 | if (LI->isVolatile()) { |
| 216 | invalidate(); |
| 217 | reportInvalidCandidate(Stat&: ContainsVolatileAccess); |
| 218 | return; |
| 219 | } |
| 220 | } |
| 221 | if (I.mayWriteToMemory()) |
| 222 | MemWrites.push_back(Elt: &I); |
| 223 | if (I.mayReadFromMemory()) |
| 224 | MemReads.push_back(Elt: &I); |
| 225 | } |
| 226 | } |
| 227 | } |
| 228 | |
| 229 | /// Check if all members of the class are valid. |
| 230 | bool isValid() const { |
| 231 | return Preheader && Header && ExitingBlock && ExitBlock && Latch && L && |
| 232 | !L->isInvalid() && Valid; |
| 233 | } |
| 234 | |
| 235 | /// Verify that all members are in sync with the Loop object. |
| 236 | void verify() const { |
| 237 | assert(isValid() && "Candidate is not valid!!" ); |
| 238 | assert(!L->isInvalid() && "Loop is invalid!" ); |
| 239 | assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync" ); |
| 240 | assert(Header == L->getHeader() && "Header is out of sync" ); |
| 241 | assert(ExitingBlock == L->getExitingBlock() && |
| 242 | "Exiting Blocks is out of sync" ); |
| 243 | assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync" ); |
| 244 | assert(Latch == L->getLoopLatch() && "Latch is out of sync" ); |
| 245 | } |
| 246 | |
| 247 | /// Get the entry block for this fusion candidate. |
| 248 | /// |
| 249 | /// If this fusion candidate represents a guarded loop, the entry block is the |
| 250 | /// loop guard block. If it represents an unguarded loop, the entry block is |
| 251 | /// the preheader of the loop. |
| 252 | BasicBlock *getEntryBlock() const { |
| 253 | if (GuardBranch) |
| 254 | return GuardBranch->getParent(); |
| 255 | else |
| 256 | return Preheader; |
| 257 | } |
| 258 | |
| 259 | /// After Peeling the loop is modified quite a bit, hence all of the Blocks |
| 260 | /// need to be updated accordingly. |
| 261 | void updateAfterPeeling() { |
| 262 | Preheader = L->getLoopPreheader(); |
| 263 | Header = L->getHeader(); |
| 264 | ExitingBlock = L->getExitingBlock(); |
| 265 | ExitBlock = L->getExitBlock(); |
| 266 | Latch = L->getLoopLatch(); |
| 267 | verify(); |
| 268 | } |
| 269 | |
| 270 | /// Given a guarded loop, get the successor of the guard that is not in the |
| 271 | /// loop. |
| 272 | /// |
| 273 | /// This method returns the successor of the loop guard that is not located |
| 274 | /// within the loop (i.e., the successor of the guard that is not the |
| 275 | /// preheader). |
| 276 | /// This method is only valid for guarded loops. |
| 277 | BasicBlock *getNonLoopBlock() const { |
| 278 | assert(GuardBranch && "Only valid on guarded loops." ); |
| 279 | assert(GuardBranch->isConditional() && |
| 280 | "Expecting guard to be a conditional branch." ); |
| 281 | if (Peeled) |
| 282 | return GuardBranch->getSuccessor(i: 1); |
| 283 | return (GuardBranch->getSuccessor(i: 0) == Preheader) |
| 284 | ? GuardBranch->getSuccessor(i: 1) |
| 285 | : GuardBranch->getSuccessor(i: 0); |
| 286 | } |
| 287 | |
| 288 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 289 | LLVM_DUMP_METHOD void dump() const { |
| 290 | dbgs() << "\tGuardBranch: " ; |
| 291 | if (GuardBranch) |
| 292 | dbgs() << *GuardBranch; |
| 293 | else |
| 294 | dbgs() << "nullptr" ; |
| 295 | dbgs() << "\n" |
| 296 | << (GuardBranch ? GuardBranch->getName() : "nullptr" ) << "\n" |
| 297 | << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr" ) |
| 298 | << "\n" |
| 299 | << "\tHeader: " << (Header ? Header->getName() : "nullptr" ) << "\n" |
| 300 | << "\tExitingBB: " |
| 301 | << (ExitingBlock ? ExitingBlock->getName() : "nullptr" ) << "\n" |
| 302 | << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr" ) |
| 303 | << "\n" |
| 304 | << "\tLatch: " << (Latch ? Latch->getName() : "nullptr" ) << "\n" |
| 305 | << "\tEntryBlock: " |
| 306 | << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr" ) |
| 307 | << "\n" ; |
| 308 | } |
| 309 | #endif |
| 310 | |
| 311 | /// Determine if a fusion candidate (representing a loop) is eligible for |
| 312 | /// fusion. Note that this only checks whether a single loop can be fused - it |
| 313 | /// does not check whether it is *legal* to fuse two loops together. |
| 314 | bool isEligibleForFusion(ScalarEvolution &SE) const { |
| 315 | if (!isValid()) { |
| 316 | LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n" ); |
| 317 | if (!Preheader) |
| 318 | ++InvalidPreheader; |
| 319 | if (!Header) |
| 320 | ++InvalidHeader; |
| 321 | if (!ExitingBlock) |
| 322 | ++InvalidExitingBlock; |
| 323 | if (!ExitBlock) |
| 324 | ++InvalidExitBlock; |
| 325 | if (!Latch) |
| 326 | ++InvalidLatch; |
| 327 | if (L->isInvalid()) |
| 328 | ++InvalidLoop; |
| 329 | |
| 330 | return false; |
| 331 | } |
| 332 | |
| 333 | // Require ScalarEvolution to be able to determine a trip count. |
| 334 | if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { |
| 335 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() |
| 336 | << " trip count not computable!\n" ); |
| 337 | return reportInvalidCandidate(Stat&: UnknownTripCount); |
| 338 | } |
| 339 | |
| 340 | if (!L->isLoopSimplifyForm()) { |
| 341 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() |
| 342 | << " is not in simplified form!\n" ); |
| 343 | return reportInvalidCandidate(Stat&: NotSimplifiedForm); |
| 344 | } |
| 345 | |
| 346 | if (!L->isRotatedForm()) { |
| 347 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n" ); |
| 348 | return reportInvalidCandidate(Stat&: NotRotated); |
| 349 | } |
| 350 | |
| 351 | return true; |
| 352 | } |
| 353 | |
| 354 | private: |
| 355 | // This is only used internally for now, to clear the MemWrites and MemReads |
| 356 | // list and setting Valid to false. I can't envision other uses of this right |
| 357 | // now, since once FusionCandidates are put into the FusionCandidateList they |
| 358 | // are immutable. Thus, any time we need to change/update a FusionCandidate, |
| 359 | // we must create a new one and insert it into the FusionCandidateList to |
| 360 | // ensure the FusionCandidateList remains ordered correctly. |
| 361 | void invalidate() { |
| 362 | MemWrites.clear(); |
| 363 | MemReads.clear(); |
| 364 | Valid = false; |
| 365 | } |
| 366 | |
| 367 | bool reportInvalidCandidate(Statistic &Stat) const { |
| 368 | using namespace ore; |
| 369 | assert(L && Preheader && "Fusion candidate not initialized properly!" ); |
| 370 | #if LLVM_ENABLE_STATS |
| 371 | ++Stat; |
| 372 | ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), |
| 373 | L->getStartLoc(), Preheader) |
| 374 | << "[" << Preheader->getParent()->getName() << "]: " |
| 375 | << "Loop is not a candidate for fusion: " << Stat.getDesc()); |
| 376 | #endif |
| 377 | return false; |
| 378 | } |
| 379 | }; |
| 380 | } // namespace |
| 381 | |
| 382 | using LoopVector = SmallVector<Loop *, 4>; |
| 383 | |
| 384 | // List of adjacent fusion candidates in order. Thus, if FC0 comes *before* FC1 |
| 385 | // in a FusionCandidateList, then FC0 dominates FC1, FC1 post-dominates FC0, |
| 386 | // and they are adjacent. |
| 387 | using FusionCandidateList = std::list<FusionCandidate>; |
| 388 | using FusionCandidateCollection = SmallVector<FusionCandidateList, 4>; |
| 389 | |
| 390 | #ifndef NDEBUG |
| 391 | static void printLoopVector(const LoopVector &LV) { |
| 392 | dbgs() << "****************************\n" ; |
| 393 | for (const Loop *L : LV) |
| 394 | printLoop(*L, dbgs()); |
| 395 | dbgs() << "****************************\n" ; |
| 396 | } |
| 397 | |
| 398 | static raw_ostream &operator<<(raw_ostream &OS, const FusionCandidate &FC) { |
| 399 | if (FC.isValid()) |
| 400 | OS << FC.Preheader->getName(); |
| 401 | else |
| 402 | OS << "<Invalid>" ; |
| 403 | |
| 404 | return OS; |
| 405 | } |
| 406 | |
| 407 | static raw_ostream &operator<<(raw_ostream &OS, |
| 408 | const FusionCandidateList &CandList) { |
| 409 | for (const FusionCandidate &FC : CandList) |
| 410 | OS << FC << '\n'; |
| 411 | |
| 412 | return OS; |
| 413 | } |
| 414 | |
| 415 | static void |
| 416 | printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { |
| 417 | dbgs() << "Fusion Candidates: \n" ; |
| 418 | for (const auto &CandidateList : FusionCandidates) { |
| 419 | dbgs() << "*** Fusion Candidate List ***\n" ; |
| 420 | dbgs() << CandidateList; |
| 421 | dbgs() << "****************************\n" ; |
| 422 | } |
| 423 | } |
| 424 | #endif // NDEBUG |
| 425 | |
| 426 | namespace { |
| 427 | |
| 428 | /// Collect all loops in function at the same nest level, starting at the |
| 429 | /// outermost level. |
| 430 | /// |
| 431 | /// This data structure collects all loops at the same nest level for a |
| 432 | /// given function (specified by the LoopInfo object). It starts at the |
| 433 | /// outermost level. |
| 434 | struct LoopDepthTree { |
| 435 | using LoopsOnLevelTy = SmallVector<LoopVector, 4>; |
| 436 | using iterator = LoopsOnLevelTy::iterator; |
| 437 | using const_iterator = LoopsOnLevelTy::const_iterator; |
| 438 | |
| 439 | LoopDepthTree(LoopInfo &LI) : Depth(1) { |
| 440 | if (!LI.empty()) |
| 441 | LoopsOnLevel.emplace_back(Args: LoopVector(LI.rbegin(), LI.rend())); |
| 442 | } |
| 443 | |
| 444 | /// Test whether a given loop has been removed from the function, and thus is |
| 445 | /// no longer valid. |
| 446 | bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(Ptr: L); } |
| 447 | |
| 448 | /// Record that a given loop has been removed from the function and is no |
| 449 | /// longer valid. |
| 450 | void removeLoop(const Loop *L) { RemovedLoops.insert(Ptr: L); } |
| 451 | |
| 452 | /// Descend the tree to the next (inner) nesting level |
| 453 | void descend() { |
| 454 | LoopsOnLevelTy LoopsOnNextLevel; |
| 455 | |
| 456 | for (const LoopVector &LV : *this) |
| 457 | for (Loop *L : LV) |
| 458 | if (!isRemovedLoop(L) && L->begin() != L->end()) |
| 459 | LoopsOnNextLevel.emplace_back(Args: LoopVector(L->begin(), L->end())); |
| 460 | |
| 461 | LoopsOnLevel = LoopsOnNextLevel; |
| 462 | RemovedLoops.clear(); |
| 463 | Depth++; |
| 464 | } |
| 465 | |
| 466 | bool empty() const { return size() == 0; } |
| 467 | size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); } |
| 468 | unsigned getDepth() const { return Depth; } |
| 469 | |
| 470 | iterator begin() { return LoopsOnLevel.begin(); } |
| 471 | iterator end() { return LoopsOnLevel.end(); } |
| 472 | const_iterator begin() const { return LoopsOnLevel.begin(); } |
| 473 | const_iterator end() const { return LoopsOnLevel.end(); } |
| 474 | |
| 475 | private: |
| 476 | /// Set of loops that have been removed from the function and are no longer |
| 477 | /// valid. |
| 478 | SmallPtrSet<const Loop *, 8> RemovedLoops; |
| 479 | |
| 480 | /// Depth of the current level, starting at 1 (outermost loops). |
| 481 | unsigned Depth; |
| 482 | |
| 483 | /// Vector of loops at the current depth level that have the same parent loop |
| 484 | LoopsOnLevelTy LoopsOnLevel; |
| 485 | }; |
| 486 | |
| 487 | struct LoopFuser { |
| 488 | private: |
| 489 | // Sets of control flow equivalent fusion candidates for a given nest level. |
| 490 | FusionCandidateCollection FusionCandidates; |
| 491 | |
| 492 | LoopDepthTree LDT; |
| 493 | DomTreeUpdater DTU; |
| 494 | |
| 495 | LoopInfo &LI; |
| 496 | DominatorTree &DT; |
| 497 | DependenceInfo &DI; |
| 498 | ScalarEvolution &SE; |
| 499 | PostDominatorTree &PDT; |
| 500 | OptimizationRemarkEmitter &ORE; |
| 501 | AssumptionCache &AC; |
| 502 | const TargetTransformInfo &TTI; |
| 503 | |
| 504 | public: |
| 505 | LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, |
| 506 | ScalarEvolution &SE, PostDominatorTree &PDT, |
| 507 | OptimizationRemarkEmitter &ORE, const DataLayout &DL, |
| 508 | AssumptionCache &AC, const TargetTransformInfo &TTI) |
| 509 | : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), |
| 510 | DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {} |
| 511 | |
| 512 | /// This is the main entry point for loop fusion. It will traverse the |
| 513 | /// specified function and collect candidate loops to fuse, starting at the |
| 514 | /// outermost nesting level and working inwards. |
| 515 | bool fuseLoops(Function &F) { |
| 516 | #ifndef NDEBUG |
| 517 | if (VerboseFusionDebugging) { |
| 518 | LI.print(dbgs()); |
| 519 | } |
| 520 | #endif |
| 521 | |
| 522 | LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName() |
| 523 | << "\n" ); |
| 524 | bool Changed = false; |
| 525 | |
| 526 | while (!LDT.empty()) { |
| 527 | LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth " |
| 528 | << LDT.getDepth() << "\n" ;); |
| 529 | |
| 530 | for (const LoopVector &LV : LDT) { |
| 531 | assert(LV.size() > 0 && "Empty loop set was build!" ); |
| 532 | |
| 533 | // Skip singleton loop sets as they do not offer fusion opportunities on |
| 534 | // this level. |
| 535 | if (LV.size() == 1) |
| 536 | continue; |
| 537 | #ifndef NDEBUG |
| 538 | if (VerboseFusionDebugging) { |
| 539 | LLVM_DEBUG({ |
| 540 | dbgs() << " Visit loop set (#" << LV.size() << "):\n" ; |
| 541 | printLoopVector(LV); |
| 542 | }); |
| 543 | } |
| 544 | #endif |
| 545 | |
| 546 | collectFusionCandidates(LV); |
| 547 | Changed |= fuseCandidates(); |
| 548 | } |
| 549 | |
| 550 | // Finished analyzing candidates at this level. |
| 551 | // Descend to the next level and clear all of the candidates currently |
| 552 | // collected. Note that it will not be possible to fuse any of the |
| 553 | // existing candidates with new candidates because the new candidates will |
| 554 | // be at a different nest level and thus not be control flow equivalent |
| 555 | // with all of the candidates collected so far. |
| 556 | LLVM_DEBUG(dbgs() << "Descend one level!\n" ); |
| 557 | LDT.descend(); |
| 558 | FusionCandidates.clear(); |
| 559 | } |
| 560 | |
| 561 | if (Changed) |
| 562 | LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n" ; F.dump();); |
| 563 | |
| 564 | #ifndef NDEBUG |
| 565 | assert(DT.verify()); |
| 566 | assert(PDT.verify()); |
| 567 | LI.verify(DT); |
| 568 | SE.verify(); |
| 569 | #endif |
| 570 | |
| 571 | LLVM_DEBUG(dbgs() << "Loop Fusion complete\n" ); |
| 572 | return Changed; |
| 573 | } |
| 574 | |
| 575 | private: |
| 576 | /// Iterate over all loops in the given loop set and identify the loops that |
| 577 | /// are eligible for fusion. Place all eligible fusion candidates into Control |
| 578 | /// Flow Equivalent sets, sorted by dominance. |
| 579 | void collectFusionCandidates(const LoopVector &LV) { |
| 580 | for (Loop *L : LV) { |
| 581 | TTI::PeelingPreferences PP = |
| 582 | gatherPeelingPreferences(L, SE, TTI, UserAllowPeeling: std::nullopt, UserAllowProfileBasedPeeling: std::nullopt); |
| 583 | FusionCandidate CurrCand(L, DT, &PDT, ORE, PP); |
| 584 | if (!CurrCand.isEligibleForFusion(SE)) |
| 585 | continue; |
| 586 | |
| 587 | // Go through each list in FusionCandidates and determine if the first or |
| 588 | // last loop in the list is strictly adjacent to L. If it is, append L. |
| 589 | // If not, go to the next list. |
| 590 | // If no suitable list is found, start another list and add it to |
| 591 | // FusionCandidates. |
| 592 | bool FoundAdjacent = false; |
| 593 | for (auto &CurrCandList : FusionCandidates) { |
| 594 | if (isStrictlyAdjacent(FC0: CurrCand, FC1: CurrCandList.front())) { |
| 595 | CurrCandList.push_front(x: CurrCand); |
| 596 | FoundAdjacent = true; |
| 597 | #ifndef NDEBUG |
| 598 | if (VerboseFusionDebugging) |
| 599 | LLVM_DEBUG(dbgs() << "Adding " << CurrCand |
| 600 | << " to existing candidate list\n" ); |
| 601 | #endif |
| 602 | break; |
| 603 | } else if (isStrictlyAdjacent(FC0: CurrCandList.back(), FC1: CurrCand)) { |
| 604 | CurrCandList.push_back(x: CurrCand); |
| 605 | FoundAdjacent = true; |
| 606 | #ifndef NDEBUG |
| 607 | if (VerboseFusionDebugging) |
| 608 | LLVM_DEBUG(dbgs() << "Adding " << CurrCand |
| 609 | << " to existing candidate list\n" ); |
| 610 | #endif |
| 611 | break; |
| 612 | } |
| 613 | } |
| 614 | if (!FoundAdjacent) { |
| 615 | // No list was found. Create a new list and add to FusionCandidates |
| 616 | #ifndef NDEBUG |
| 617 | if (VerboseFusionDebugging) |
| 618 | LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new list\n" ); |
| 619 | #endif |
| 620 | FusionCandidateList NewCandList; |
| 621 | NewCandList.push_back(x: CurrCand); |
| 622 | FusionCandidates.push_back(Elt: NewCandList); |
| 623 | } |
| 624 | NumFusionCandidates++; |
| 625 | } |
| 626 | } |
| 627 | |
| 628 | /// Determine if it is beneficial to fuse two loops. |
| 629 | /// |
| 630 | /// For now, this method simply returns true because we want to fuse as much |
| 631 | /// as possible (primarily to test the pass). This method will evolve, over |
| 632 | /// time, to add heuristics for profitability of fusion. |
| 633 | bool isBeneficialFusion(const FusionCandidate &FC0, |
| 634 | const FusionCandidate &FC1) { |
| 635 | return true; |
| 636 | } |
| 637 | |
| 638 | /// Determine if two fusion candidates have the same trip count (i.e., they |
| 639 | /// execute the same number of iterations). |
| 640 | /// |
| 641 | /// This function will return a pair of values. The first is a boolean, |
| 642 | /// stating whether or not the two candidates are known at compile time to |
| 643 | /// have the same TripCount. The second is the difference in the two |
| 644 | /// TripCounts. This information can be used later to determine whether or not |
| 645 | /// peeling can be performed on either one of the candidates. |
| 646 | std::pair<bool, std::optional<unsigned>> |
| 647 | haveIdenticalTripCounts(const FusionCandidate &FC0, |
| 648 | const FusionCandidate &FC1) const { |
| 649 | const SCEV *TripCount0 = SE.getBackedgeTakenCount(L: FC0.L); |
| 650 | if (isa<SCEVCouldNotCompute>(Val: TripCount0)) { |
| 651 | UncomputableTripCount++; |
| 652 | LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!" ); |
| 653 | return {false, std::nullopt}; |
| 654 | } |
| 655 | |
| 656 | const SCEV *TripCount1 = SE.getBackedgeTakenCount(L: FC1.L); |
| 657 | if (isa<SCEVCouldNotCompute>(Val: TripCount1)) { |
| 658 | UncomputableTripCount++; |
| 659 | LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!" ); |
| 660 | return {false, std::nullopt}; |
| 661 | } |
| 662 | |
| 663 | LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " |
| 664 | << *TripCount1 << " are " |
| 665 | << (TripCount0 == TripCount1 ? "identical" : "different" ) |
| 666 | << "\n" ); |
| 667 | |
| 668 | if (TripCount0 == TripCount1) |
| 669 | return {true, 0}; |
| 670 | |
| 671 | LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, " |
| 672 | "determining the difference between trip counts\n" ); |
| 673 | |
| 674 | // Currently only considering loops with a single exit point |
| 675 | // and a non-constant trip count. |
| 676 | const unsigned TC0 = SE.getSmallConstantTripCount(L: FC0.L); |
| 677 | const unsigned TC1 = SE.getSmallConstantTripCount(L: FC1.L); |
| 678 | |
| 679 | // If any of the tripcounts are zero that means that loop(s) do not have |
| 680 | // a single exit or a constant tripcount. |
| 681 | if (TC0 == 0 || TC1 == 0) { |
| 682 | LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not " |
| 683 | "have a constant number of iterations. Peeling " |
| 684 | "is not benefical\n" ); |
| 685 | return {false, std::nullopt}; |
| 686 | } |
| 687 | |
| 688 | std::optional<unsigned> Difference; |
| 689 | int Diff = TC0 - TC1; |
| 690 | |
| 691 | if (Diff > 0) |
| 692 | Difference = Diff; |
| 693 | else { |
| 694 | LLVM_DEBUG( |
| 695 | dbgs() << "Difference is less than 0. FC1 (second loop) has more " |
| 696 | "iterations than the first one. Currently not supported\n" ); |
| 697 | } |
| 698 | |
| 699 | LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference |
| 700 | << "\n" ); |
| 701 | |
| 702 | return {false, Difference}; |
| 703 | } |
| 704 | |
| 705 | void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1, |
| 706 | unsigned PeelCount) { |
| 707 | assert(FC0.AbleToPeel && "Should be able to peel loop" ); |
| 708 | |
| 709 | LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount |
| 710 | << " iterations of the first loop. \n" ); |
| 711 | |
| 712 | ValueToValueMapTy VMap; |
| 713 | peelLoop(L: FC0.L, PeelCount, PeelLast: false, LI: &LI, SE: &SE, DT, AC: &AC, PreserveLCSSA: true, VMap); |
| 714 | FC0.Peeled = true; |
| 715 | LLVM_DEBUG(dbgs() << "Done Peeling\n" ); |
| 716 | |
| 717 | #ifndef NDEBUG |
| 718 | auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1); |
| 719 | |
| 720 | assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 && |
| 721 | "Loops should have identical trip counts after peeling" ); |
| 722 | #endif |
| 723 | |
| 724 | FC0.PP.PeelCount += PeelCount; |
| 725 | |
| 726 | // Peeling does not update the PDT |
| 727 | PDT.recalculate(Func&: *FC0.Preheader->getParent()); |
| 728 | |
| 729 | FC0.updateAfterPeeling(); |
| 730 | |
| 731 | // In this case the iterations of the loop are constant, so the first |
| 732 | // loop will execute completely (will not jump from one of |
| 733 | // the peeled blocks to the second loop). Here we are updating the |
| 734 | // branch conditions of each of the peeled blocks, such that it will |
| 735 | // branch to its successor which is not the preheader of the second loop |
| 736 | // in the case of unguarded loops, or the succesors of the exit block of |
| 737 | // the first loop otherwise. Doing this update will ensure that the entry |
| 738 | // block of the first loop dominates the entry block of the second loop. |
| 739 | BasicBlock *BB = |
| 740 | FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader; |
| 741 | if (BB) { |
| 742 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
| 743 | SmallVector<Instruction *, 8> WorkList; |
| 744 | for (BasicBlock *Pred : predecessors(BB)) { |
| 745 | if (Pred != FC0.ExitBlock) { |
| 746 | WorkList.emplace_back(Args: Pred->getTerminator()); |
| 747 | TreeUpdates.emplace_back( |
| 748 | Args: DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB)); |
| 749 | } |
| 750 | } |
| 751 | // Cannot modify the predecessors inside the above loop as it will cause |
| 752 | // the iterators to be nullptrs, causing memory errors. |
| 753 | for (Instruction *CurrentBranch : WorkList) { |
| 754 | BasicBlock *Succ = CurrentBranch->getSuccessor(Idx: 0); |
| 755 | if (Succ == BB) |
| 756 | Succ = CurrentBranch->getSuccessor(Idx: 1); |
| 757 | ReplaceInstWithInst(From: CurrentBranch, To: BranchInst::Create(IfTrue: Succ)); |
| 758 | } |
| 759 | |
| 760 | DTU.applyUpdates(Updates: TreeUpdates); |
| 761 | DTU.flush(); |
| 762 | } |
| 763 | LLVM_DEBUG( |
| 764 | dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount |
| 765 | << " iterations from the first loop.\n" |
| 766 | "Both Loops have the same number of iterations now.\n" ); |
| 767 | } |
| 768 | |
| 769 | /// Walk each set of strictly adjacent fusion candidates and attempt to fuse |
| 770 | /// them. This does a single linear traversal of all candidates in the list. |
| 771 | /// The conditions for legal fusion are checked at this point. If a pair of |
| 772 | /// fusion candidates passes all legality checks, they are fused together and |
| 773 | /// a new fusion candidate is created and added to the FusionCandidateList. |
| 774 | /// The original fusion candidates are then removed, as they are no longer |
| 775 | /// valid. |
| 776 | bool fuseCandidates() { |
| 777 | bool Fused = false; |
| 778 | LLVM_DEBUG(printFusionCandidates(FusionCandidates)); |
| 779 | for (auto &CandidateList : FusionCandidates) { |
| 780 | if (CandidateList.size() < 2) |
| 781 | continue; |
| 782 | |
| 783 | LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate List:\n" |
| 784 | << CandidateList << "\n" ); |
| 785 | |
| 786 | for (auto It = CandidateList.begin(), NextIt = std::next(x: It); |
| 787 | NextIt != CandidateList.end(); It = NextIt, NextIt = std::next(x: It)) { |
| 788 | |
| 789 | auto FC0 = *It; |
| 790 | auto FC1 = *NextIt; |
| 791 | |
| 792 | assert(!LDT.isRemovedLoop(FC0.L) && |
| 793 | "Should not have removed loops in CandidateList!" ); |
| 794 | assert(!LDT.isRemovedLoop(FC1.L) && |
| 795 | "Should not have removed loops in CandidateList!" ); |
| 796 | |
| 797 | LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n" ; FC0.dump(); |
| 798 | dbgs() << " with\n" ; FC1.dump(); dbgs() << "\n" ); |
| 799 | |
| 800 | FC0.verify(); |
| 801 | FC1.verify(); |
| 802 | |
| 803 | // Check if the candidates have identical tripcounts (first value of |
| 804 | // pair), and if not check the difference in the tripcounts between |
| 805 | // the loops (second value of pair). The difference is not equal to |
| 806 | // std::nullopt iff the loops iterate a constant number of times, and |
| 807 | // have a single exit. |
| 808 | std::pair<bool, std::optional<unsigned>> IdenticalTripCountRes = |
| 809 | haveIdenticalTripCounts(FC0, FC1); |
| 810 | bool SameTripCount = IdenticalTripCountRes.first; |
| 811 | std::optional<unsigned> TCDifference = IdenticalTripCountRes.second; |
| 812 | |
| 813 | // Here we are checking that FC0 (the first loop) can be peeled, and |
| 814 | // both loops have different tripcounts. |
| 815 | if (FC0.AbleToPeel && !SameTripCount && TCDifference) { |
| 816 | if (*TCDifference > FusionPeelMaxCount) { |
| 817 | LLVM_DEBUG(dbgs() |
| 818 | << "Difference in loop trip counts: " << *TCDifference |
| 819 | << " is greater than maximum peel count specificed: " |
| 820 | << FusionPeelMaxCount << "\n" ); |
| 821 | } else { |
| 822 | // Dependent on peeling being performed on the first loop, and |
| 823 | // assuming all other conditions for fusion return true. |
| 824 | SameTripCount = true; |
| 825 | } |
| 826 | } |
| 827 | |
| 828 | if (!SameTripCount) { |
| 829 | LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " |
| 830 | "counts. Not fusing.\n" ); |
| 831 | reportLoopFusion<OptimizationRemarkMissed>(FC0, FC1, |
| 832 | Stat&: NonEqualTripCount); |
| 833 | continue; |
| 834 | } |
| 835 | |
| 836 | if ((!FC0.GuardBranch && FC1.GuardBranch) || |
| 837 | (FC0.GuardBranch && !FC1.GuardBranch)) { |
| 838 | LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the " |
| 839 | "another one is not. Not fusing.\n" ); |
| 840 | reportLoopFusion<OptimizationRemarkMissed>( |
| 841 | FC0, FC1, Stat&: OnlySecondCandidateIsGuarded); |
| 842 | continue; |
| 843 | } |
| 844 | |
| 845 | // Ensure that FC0 and FC1 have identical guards. |
| 846 | // If one (or both) are not guarded, this check is not necessary. |
| 847 | if (FC0.GuardBranch && FC1.GuardBranch && |
| 848 | !haveIdenticalGuards(FC0, FC1) && !TCDifference) { |
| 849 | LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " |
| 850 | "guards. Not Fusing.\n" ); |
| 851 | reportLoopFusion<OptimizationRemarkMissed>(FC0, FC1, |
| 852 | Stat&: NonIdenticalGuards); |
| 853 | continue; |
| 854 | } |
| 855 | |
| 856 | if (FC0.GuardBranch) { |
| 857 | assert(FC1.GuardBranch && "Expecting valid FC1 guard branch" ); |
| 858 | |
| 859 | if (!isSafeToMoveBefore(BB&: *FC0.ExitBlock, |
| 860 | InsertPoint&: *FC1.ExitBlock->getFirstNonPHIOrDbg(), DT, |
| 861 | PDT: &PDT, DI: &DI)) { |
| 862 | LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " |
| 863 | "instructions in exit block. Not fusing.\n" ); |
| 864 | reportLoopFusion<OptimizationRemarkMissed>(FC0, FC1, |
| 865 | Stat&: NonEmptyExitBlock); |
| 866 | continue; |
| 867 | } |
| 868 | |
| 869 | if (!isSafeToMoveBefore( |
| 870 | BB&: *FC1.GuardBranch->getParent(), |
| 871 | InsertPoint&: *FC0.GuardBranch->getParent()->getTerminator(), DT, PDT: &PDT, |
| 872 | DI: &DI)) { |
| 873 | LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " |
| 874 | "instructions in guard block. Not fusing.\n" ); |
| 875 | reportLoopFusion<OptimizationRemarkMissed>(FC0, FC1, |
| 876 | Stat&: NonEmptyGuardBlock); |
| 877 | continue; |
| 878 | } |
| 879 | } |
| 880 | |
| 881 | // Check the dependencies across the loops and do not fuse if it would |
| 882 | // violate them. |
| 883 | if (!dependencesAllowFusion(FC0, FC1)) { |
| 884 | LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n" ); |
| 885 | reportLoopFusion<OptimizationRemarkMissed>(FC0, FC1, |
| 886 | Stat&: InvalidDependencies); |
| 887 | continue; |
| 888 | } |
| 889 | |
| 890 | // If the second loop has instructions in the pre-header, attempt to |
| 891 | // hoist them up to the first loop's pre-header or sink them into the |
| 892 | // body of the second loop. |
| 893 | SmallVector<Instruction *, 4> SafeToHoist; |
| 894 | SmallVector<Instruction *, 4> SafeToSink; |
| 895 | // At this point, this is the last remaining legality check. |
| 896 | // Which means if we can make this pre-header empty, we can fuse |
| 897 | // these loops |
| 898 | if (!isEmptyPreheader(FC: FC1)) { |
| 899 | LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " |
| 900 | "preheader.\n" ); |
| 901 | |
| 902 | // If it is not safe to hoist/sink all instructions in the |
| 903 | // pre-header, we cannot fuse these loops. |
| 904 | if (!collectMovablePreheaderInsts(FC0, FC1, SafeToHoist, |
| 905 | SafeToSink)) { |
| 906 | LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in " |
| 907 | "Fusion Candidate Pre-header.\n" |
| 908 | << "Not Fusing.\n" ); |
| 909 | reportLoopFusion<OptimizationRemarkMissed>(FC0, FC1, |
| 910 | Stat&: NonEmptyPreheader); |
| 911 | continue; |
| 912 | } |
| 913 | } |
| 914 | |
| 915 | bool BeneficialToFuse = isBeneficialFusion(FC0, FC1); |
| 916 | LLVM_DEBUG(dbgs() << "\tFusion appears to be " |
| 917 | << (BeneficialToFuse ? "" : "un" ) << "profitable!\n" ); |
| 918 | if (!BeneficialToFuse) { |
| 919 | reportLoopFusion<OptimizationRemarkMissed>(FC0, FC1, |
| 920 | Stat&: FusionNotBeneficial); |
| 921 | continue; |
| 922 | } |
| 923 | // All analysis has completed and has determined that fusion is legal |
| 924 | // and profitable. At this point, start transforming the code and |
| 925 | // perform fusion. |
| 926 | |
| 927 | // Execute the hoist/sink operations on preheader instructions |
| 928 | movePreheaderInsts(FC0, FC1, HoistInsts&: SafeToHoist, SinkInsts&: SafeToSink); |
| 929 | |
| 930 | LLVM_DEBUG(dbgs() << "\tFusion is performed: " << FC0 << " and " << FC1 |
| 931 | << "\n" ); |
| 932 | |
| 933 | FusionCandidate FC0Copy = FC0; |
| 934 | // Peel the loop after determining that fusion is legal. The Loops |
| 935 | // will still be safe to fuse after the peeling is performed. |
| 936 | bool Peel = TCDifference && *TCDifference > 0; |
| 937 | if (Peel) |
| 938 | peelFusionCandidate(FC0&: FC0Copy, FC1, PeelCount: *TCDifference); |
| 939 | |
| 940 | // Report fusion to the Optimization Remarks. |
| 941 | // Note this needs to be done *before* performFusion because |
| 942 | // performFusion will change the original loops, making it not |
| 943 | // possible to identify them after fusion is complete. |
| 944 | reportLoopFusion<OptimizationRemark>(FC0: (Peel ? FC0Copy : FC0), FC1, |
| 945 | Stat&: FuseCounter); |
| 946 | |
| 947 | FusionCandidate FusedCand(performFusion(FC0: (Peel ? FC0Copy : FC0), FC1), |
| 948 | DT, &PDT, ORE, FC0Copy.PP); |
| 949 | FusedCand.verify(); |
| 950 | assert(FusedCand.isEligibleForFusion(SE) && |
| 951 | "Fused candidate should be eligible for fusion!" ); |
| 952 | |
| 953 | // Notify the loop-depth-tree that these loops are not valid objects |
| 954 | LDT.removeLoop(L: FC1.L); |
| 955 | |
| 956 | // Replace FC0 and FC1 with their fused loop |
| 957 | It = CandidateList.erase(position: It); |
| 958 | It = CandidateList.erase(position: It); |
| 959 | It = CandidateList.insert(position: It, x: FusedCand); |
| 960 | |
| 961 | // Start from FusedCand in the next iteration |
| 962 | NextIt = It; |
| 963 | |
| 964 | LLVM_DEBUG(dbgs() << "Candidate List (after fusion): " << CandidateList |
| 965 | << "\n" ); |
| 966 | |
| 967 | Fused = true; |
| 968 | } |
| 969 | } |
| 970 | return Fused; |
| 971 | } |
| 972 | |
| 973 | // Returns true if the instruction \p I can be hoisted to the end of the |
| 974 | // preheader of \p FC0. \p SafeToHoist contains the instructions that are |
| 975 | // known to be safe to hoist. The instructions encountered that cannot be |
| 976 | // hoisted are in \p NotHoisting. |
| 977 | // TODO: Move functionality into CodeMoverUtils |
| 978 | bool canHoistInst(Instruction &I, |
| 979 | const SmallVector<Instruction *, 4> &SafeToHoist, |
| 980 | const SmallVector<Instruction *, 4> &NotHoisting, |
| 981 | const FusionCandidate &FC0) const { |
| 982 | const BasicBlock * = FC0.Preheader->getSingleSuccessor(); |
| 983 | assert(FC0PreheaderTarget && |
| 984 | "Expected single successor for loop preheader." ); |
| 985 | |
| 986 | for (Use &Op : I.operands()) { |
| 987 | if (auto *OpInst = dyn_cast<Instruction>(Val&: Op)) { |
| 988 | bool OpHoisted = is_contained(Range: SafeToHoist, Element: OpInst); |
| 989 | // Check if we have already decided to hoist this operand. In this |
| 990 | // case, it does not dominate FC0 *yet*, but will after we hoist it. |
| 991 | if (!(OpHoisted || DT.dominates(Def: OpInst, BB: FC0PreheaderTarget))) { |
| 992 | return false; |
| 993 | } |
| 994 | } |
| 995 | } |
| 996 | |
| 997 | // PHIs in FC1's header only have FC0 blocks as predecessors. PHIs |
| 998 | // cannot be hoisted and should be sunk to the exit of the fused loop. |
| 999 | if (isa<PHINode>(Val: I)) |
| 1000 | return false; |
| 1001 | |
| 1002 | // If this isn't a memory inst, hoisting is safe |
| 1003 | if (!I.mayReadOrWriteMemory()) |
| 1004 | return true; |
| 1005 | |
| 1006 | LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n" ); |
| 1007 | for (Instruction *NotHoistedInst : NotHoisting) { |
| 1008 | if (auto D = DI.depends(Src: &I, Dst: NotHoistedInst)) { |
| 1009 | // Dependency is not read-before-write, write-before-read or |
| 1010 | // write-before-write |
| 1011 | if (D->isFlow() || D->isAnti() || D->isOutput()) { |
| 1012 | LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's " |
| 1013 | "preheader that is not being hoisted.\n" ); |
| 1014 | return false; |
| 1015 | } |
| 1016 | } |
| 1017 | } |
| 1018 | |
| 1019 | for (Instruction *ReadInst : FC0.MemReads) { |
| 1020 | if (auto D = DI.depends(Src: ReadInst, Dst: &I)) { |
| 1021 | // Dependency is not read-before-write |
| 1022 | if (D->isAnti()) { |
| 1023 | LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n" ); |
| 1024 | return false; |
| 1025 | } |
| 1026 | } |
| 1027 | } |
| 1028 | |
| 1029 | for (Instruction *WriteInst : FC0.MemWrites) { |
| 1030 | if (auto D = DI.depends(Src: WriteInst, Dst: &I)) { |
| 1031 | // Dependency is not write-before-read or write-before-write |
| 1032 | if (D->isFlow() || D->isOutput()) { |
| 1033 | LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n" ); |
| 1034 | return false; |
| 1035 | } |
| 1036 | } |
| 1037 | } |
| 1038 | return true; |
| 1039 | } |
| 1040 | |
| 1041 | // Returns true if the instruction \p I can be sunk to the top of the exit |
| 1042 | // block of \p FC1. |
| 1043 | // TODO: Move functionality into CodeMoverUtils |
| 1044 | bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const { |
| 1045 | for (User *U : I.users()) { |
| 1046 | if (auto *UI{dyn_cast<Instruction>(Val: U)}) { |
| 1047 | // Cannot sink if user in loop |
| 1048 | // If FC1 has phi users of this value, we cannot sink it into FC1. |
| 1049 | if (FC1.L->contains(Inst: UI)) { |
| 1050 | // Cannot hoist or sink this instruction. No hoisting/sinking |
| 1051 | // should take place, loops should not fuse |
| 1052 | return false; |
| 1053 | } |
| 1054 | } |
| 1055 | } |
| 1056 | |
| 1057 | // If this isn't a memory inst, sinking is safe |
| 1058 | if (!I.mayReadOrWriteMemory()) |
| 1059 | return true; |
| 1060 | |
| 1061 | for (Instruction *ReadInst : FC1.MemReads) { |
| 1062 | if (auto D = DI.depends(Src: &I, Dst: ReadInst)) { |
| 1063 | // Dependency is not write-before-read |
| 1064 | if (D->isFlow()) { |
| 1065 | LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n" ); |
| 1066 | return false; |
| 1067 | } |
| 1068 | } |
| 1069 | } |
| 1070 | |
| 1071 | for (Instruction *WriteInst : FC1.MemWrites) { |
| 1072 | if (auto D = DI.depends(Src: &I, Dst: WriteInst)) { |
| 1073 | // Dependency is not write-before-write or read-before-write |
| 1074 | if (D->isOutput() || D->isAnti()) { |
| 1075 | LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n" ); |
| 1076 | return false; |
| 1077 | } |
| 1078 | } |
| 1079 | } |
| 1080 | |
| 1081 | return true; |
| 1082 | } |
| 1083 | |
| 1084 | /// Collect instructions in the \p FC1 Preheader that can be hoisted |
| 1085 | /// to the \p FC0 Preheader or sunk into the \p FC1 Body |
| 1086 | bool collectMovablePreheaderInsts( |
| 1087 | const FusionCandidate &FC0, const FusionCandidate &FC1, |
| 1088 | SmallVector<Instruction *, 4> &SafeToHoist, |
| 1089 | SmallVector<Instruction *, 4> &SafeToSink) const { |
| 1090 | BasicBlock * = FC1.Preheader; |
| 1091 | // Save the instructions that are not being hoisted, so we know not to hoist |
| 1092 | // mem insts that they dominate. |
| 1093 | SmallVector<Instruction *, 4> NotHoisting; |
| 1094 | |
| 1095 | for (Instruction &I : *FC1Preheader) { |
| 1096 | // Can't move a branch |
| 1097 | if (&I == FC1Preheader->getTerminator()) |
| 1098 | continue; |
| 1099 | // If the instruction has side-effects, give up. |
| 1100 | // TODO: The case of mayReadFromMemory we can handle but requires |
| 1101 | // additional work with a dependence analysis so for now we give |
| 1102 | // up on memory reads. |
| 1103 | if (I.mayThrow() || !I.willReturn()) { |
| 1104 | LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n" ); |
| 1105 | return false; |
| 1106 | } |
| 1107 | |
| 1108 | LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n" ); |
| 1109 | |
| 1110 | if (I.isAtomic() || I.isVolatile()) { |
| 1111 | LLVM_DEBUG( |
| 1112 | dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n" ); |
| 1113 | return false; |
| 1114 | } |
| 1115 | |
| 1116 | if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) { |
| 1117 | SafeToHoist.push_back(Elt: &I); |
| 1118 | LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n" ); |
| 1119 | } else { |
| 1120 | LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n" ); |
| 1121 | NotHoisting.push_back(Elt: &I); |
| 1122 | |
| 1123 | if (canSinkInst(I, FC1)) { |
| 1124 | SafeToSink.push_back(Elt: &I); |
| 1125 | LLVM_DEBUG(dbgs() << "\tSafe to sink.\n" ); |
| 1126 | } else { |
| 1127 | LLVM_DEBUG(dbgs() << "\tCould not sink.\n" ); |
| 1128 | return false; |
| 1129 | } |
| 1130 | } |
| 1131 | } |
| 1132 | LLVM_DEBUG( |
| 1133 | dbgs() << "All preheader instructions could be sunk or hoisted!\n" ); |
| 1134 | return true; |
| 1135 | } |
| 1136 | |
| 1137 | /// Rewrite all additive recurrences in a SCEV to use a new loop. |
| 1138 | class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> { |
| 1139 | public: |
| 1140 | AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL, |
| 1141 | bool UseMax = true) |
| 1142 | : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL), |
| 1143 | NewL(NewL) {} |
| 1144 | |
| 1145 | const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { |
| 1146 | const Loop *ExprL = Expr->getLoop(); |
| 1147 | SmallVector<const SCEV *, 2> Operands; |
| 1148 | if (ExprL == &OldL) { |
| 1149 | append_range(C&: Operands, R: Expr->operands()); |
| 1150 | return SE.getAddRecExpr(Operands, L: &NewL, Flags: Expr->getNoWrapFlags()); |
| 1151 | } |
| 1152 | |
| 1153 | if (OldL.contains(L: ExprL)) { |
| 1154 | bool Pos = SE.isKnownPositive(S: Expr->getStepRecurrence(SE)); |
| 1155 | if (!UseMax || !Pos || !Expr->isAffine()) { |
| 1156 | Valid = false; |
| 1157 | return Expr; |
| 1158 | } |
| 1159 | return visit(S: Expr->getStart()); |
| 1160 | } |
| 1161 | |
| 1162 | for (const SCEV *Op : Expr->operands()) |
| 1163 | Operands.push_back(Elt: visit(S: Op)); |
| 1164 | return SE.getAddRecExpr(Operands, L: ExprL, Flags: Expr->getNoWrapFlags()); |
| 1165 | } |
| 1166 | |
| 1167 | bool wasValidSCEV() const { return Valid; } |
| 1168 | |
| 1169 | private: |
| 1170 | bool Valid, UseMax; |
| 1171 | const Loop &OldL, &NewL; |
| 1172 | }; |
| 1173 | |
| 1174 | /// Return false if the access functions of \p I0 and \p I1 could cause |
| 1175 | /// a negative dependence. |
| 1176 | bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0, |
| 1177 | Instruction &I1, bool EqualIsInvalid) { |
| 1178 | Value *Ptr0 = getLoadStorePointerOperand(V: &I0); |
| 1179 | Value *Ptr1 = getLoadStorePointerOperand(V: &I1); |
| 1180 | if (!Ptr0 || !Ptr1) |
| 1181 | return false; |
| 1182 | |
| 1183 | const SCEV *SCEVPtr0 = SE.getSCEVAtScope(V: Ptr0, L: &L0); |
| 1184 | const SCEV *SCEVPtr1 = SE.getSCEVAtScope(V: Ptr1, L: &L1); |
| 1185 | #ifndef NDEBUG |
| 1186 | if (VerboseFusionDebugging) |
| 1187 | LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs " |
| 1188 | << *SCEVPtr1 << "\n" ); |
| 1189 | #endif |
| 1190 | AddRecLoopReplacer Rewriter(SE, L0, L1); |
| 1191 | SCEVPtr0 = Rewriter.visit(S: SCEVPtr0); |
| 1192 | #ifndef NDEBUG |
| 1193 | if (VerboseFusionDebugging) |
| 1194 | LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0 |
| 1195 | << " [Valid: " << Rewriter.wasValidSCEV() << "]\n" ); |
| 1196 | #endif |
| 1197 | if (!Rewriter.wasValidSCEV()) |
| 1198 | return false; |
| 1199 | |
| 1200 | // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by |
| 1201 | // L0) and the other is not. We could check if it is monotone and test |
| 1202 | // the beginning and end value instead. |
| 1203 | |
| 1204 | BasicBlock * = L0.getHeader(); |
| 1205 | auto HasNonLinearDominanceRelation = [&](const SCEV *S) { |
| 1206 | const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Val: S); |
| 1207 | if (!AddRec) |
| 1208 | return false; |
| 1209 | return !DT.dominates(A: L0Header, B: AddRec->getLoop()->getHeader()) && |
| 1210 | !DT.dominates(A: AddRec->getLoop()->getHeader(), B: L0Header); |
| 1211 | }; |
| 1212 | if (SCEVExprContains(Root: SCEVPtr1, Pred: HasNonLinearDominanceRelation)) |
| 1213 | return false; |
| 1214 | |
| 1215 | ICmpInst::Predicate Pred = |
| 1216 | EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE; |
| 1217 | bool IsAlwaysGE = SE.isKnownPredicate(Pred, LHS: SCEVPtr0, RHS: SCEVPtr1); |
| 1218 | #ifndef NDEBUG |
| 1219 | if (VerboseFusionDebugging) |
| 1220 | LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0 |
| 1221 | << (IsAlwaysGE ? " >= " : " may < " ) << *SCEVPtr1 |
| 1222 | << "\n" ); |
| 1223 | #endif |
| 1224 | return IsAlwaysGE; |
| 1225 | } |
| 1226 | |
| 1227 | /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in |
| 1228 | /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses |
| 1229 | /// specified by @p DepChoice are used to determine this. |
| 1230 | bool dependencesAllowFusion(const FusionCandidate &FC0, |
| 1231 | const FusionCandidate &FC1, Instruction &I0, |
| 1232 | Instruction &I1, bool AnyDep, |
| 1233 | FusionDependenceAnalysisChoice DepChoice) { |
| 1234 | #ifndef NDEBUG |
| 1235 | if (VerboseFusionDebugging) { |
| 1236 | LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : " |
| 1237 | << DepChoice << "\n" ); |
| 1238 | } |
| 1239 | #endif |
| 1240 | switch (DepChoice) { |
| 1241 | case FUSION_DEPENDENCE_ANALYSIS_SCEV: |
| 1242 | return accessDiffIsPositive(L0: *FC0.L, L1: *FC1.L, I0, I1, EqualIsInvalid: AnyDep); |
| 1243 | case FUSION_DEPENDENCE_ANALYSIS_DA: { |
| 1244 | auto DepResult = DI.depends(Src: &I0, Dst: &I1); |
| 1245 | if (!DepResult) |
| 1246 | return true; |
| 1247 | #ifndef NDEBUG |
| 1248 | if (VerboseFusionDebugging) { |
| 1249 | LLVM_DEBUG(dbgs() << "DA res: " ; DepResult->dump(dbgs()); |
| 1250 | dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " |
| 1251 | << (DepResult->isOrdered() ? "true" : "false" ) |
| 1252 | << "]\n" ); |
| 1253 | LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() |
| 1254 | << "\n" ); |
| 1255 | } |
| 1256 | #endif |
| 1257 | unsigned Levels = DepResult->getLevels(); |
| 1258 | unsigned SameSDLevels = DepResult->getSameSDLevels(); |
| 1259 | unsigned CurLoopLevel = FC0.L->getLoopDepth(); |
| 1260 | |
| 1261 | // Check if DA is missing info regarding the current loop level |
| 1262 | if (CurLoopLevel > Levels + SameSDLevels) |
| 1263 | return false; |
| 1264 | |
| 1265 | // Iterating over the outer levels. |
| 1266 | for (unsigned Level = 1; Level <= std::min(a: CurLoopLevel - 1, b: Levels); |
| 1267 | ++Level) { |
| 1268 | unsigned Direction = DepResult->getDirection(Level, SameSD: false); |
| 1269 | |
| 1270 | // Check if the direction vector does not include equality. If an outer |
| 1271 | // loop has a non-equal direction, outer indicies are different and it |
| 1272 | // is safe to fuse. |
| 1273 | if (!(Direction & Dependence::DVEntry::EQ)) { |
| 1274 | LLVM_DEBUG(dbgs() << "Safe to fuse due to non-equal acceses in the " |
| 1275 | "outer loops\n" ); |
| 1276 | NumDA++; |
| 1277 | return true; |
| 1278 | } |
| 1279 | } |
| 1280 | |
| 1281 | assert(CurLoopLevel > Levels && "Fusion candidates are not separated" ); |
| 1282 | |
| 1283 | unsigned CurDir = DepResult->getDirection(Level: CurLoopLevel, SameSD: true); |
| 1284 | |
| 1285 | // Check if the direction vector does not include greater direction. In |
| 1286 | // that case, the dependency is not a backward loop-carried and is legal |
| 1287 | // to fuse. For example here we have a forward dependency |
| 1288 | // for (int i = 0; i < n; i++) |
| 1289 | // A[i] = ...; |
| 1290 | // for (int i = 0; i < n; i++) |
| 1291 | // ... = A[i-1]; |
| 1292 | if (!(CurDir & Dependence::DVEntry::GT)) { |
| 1293 | LLVM_DEBUG(dbgs() << "Safe to fuse with no backward loop-carried " |
| 1294 | "dependency\n" ); |
| 1295 | NumDA++; |
| 1296 | return true; |
| 1297 | } |
| 1298 | |
| 1299 | if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) |
| 1300 | LLVM_DEBUG( |
| 1301 | dbgs() << "TODO: Implement pred/succ dependence handling!\n" ); |
| 1302 | |
| 1303 | // TODO: Can we actually use the dependence info analysis here? |
| 1304 | return false; |
| 1305 | } |
| 1306 | |
| 1307 | case FUSION_DEPENDENCE_ANALYSIS_ALL: |
| 1308 | return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, |
| 1309 | DepChoice: FUSION_DEPENDENCE_ANALYSIS_SCEV) || |
| 1310 | dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, |
| 1311 | DepChoice: FUSION_DEPENDENCE_ANALYSIS_DA); |
| 1312 | } |
| 1313 | |
| 1314 | llvm_unreachable("Unknown fusion dependence analysis choice!" ); |
| 1315 | } |
| 1316 | |
| 1317 | /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. |
| 1318 | bool dependencesAllowFusion(const FusionCandidate &FC0, |
| 1319 | const FusionCandidate &FC1) { |
| 1320 | LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 |
| 1321 | << "\n" ); |
| 1322 | assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); |
| 1323 | assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); |
| 1324 | |
| 1325 | for (Instruction *WriteL0 : FC0.MemWrites) { |
| 1326 | for (Instruction *WriteL1 : FC1.MemWrites) |
| 1327 | if (!dependencesAllowFusion(FC0, FC1, I0&: *WriteL0, I1&: *WriteL1, |
| 1328 | /* AnyDep */ false, |
| 1329 | DepChoice: FusionDependenceAnalysis)) { |
| 1330 | InvalidDependencies++; |
| 1331 | return false; |
| 1332 | } |
| 1333 | for (Instruction *ReadL1 : FC1.MemReads) |
| 1334 | if (!dependencesAllowFusion(FC0, FC1, I0&: *WriteL0, I1&: *ReadL1, |
| 1335 | /* AnyDep */ false, |
| 1336 | DepChoice: FusionDependenceAnalysis)) { |
| 1337 | InvalidDependencies++; |
| 1338 | return false; |
| 1339 | } |
| 1340 | } |
| 1341 | |
| 1342 | for (Instruction *WriteL1 : FC1.MemWrites) { |
| 1343 | for (Instruction *WriteL0 : FC0.MemWrites) |
| 1344 | if (!dependencesAllowFusion(FC0, FC1, I0&: *WriteL0, I1&: *WriteL1, |
| 1345 | /* AnyDep */ false, |
| 1346 | DepChoice: FusionDependenceAnalysis)) { |
| 1347 | InvalidDependencies++; |
| 1348 | return false; |
| 1349 | } |
| 1350 | for (Instruction *ReadL0 : FC0.MemReads) |
| 1351 | if (!dependencesAllowFusion(FC0, FC1, I0&: *ReadL0, I1&: *WriteL1, |
| 1352 | /* AnyDep */ false, |
| 1353 | DepChoice: FusionDependenceAnalysis)) { |
| 1354 | InvalidDependencies++; |
| 1355 | return false; |
| 1356 | } |
| 1357 | } |
| 1358 | |
| 1359 | // Walk through all uses in FC1. For each use, find the reaching def. If the |
| 1360 | // def is located in FC0 then it is not safe to fuse. |
| 1361 | for (BasicBlock *BB : FC1.L->blocks()) |
| 1362 | for (Instruction &I : *BB) |
| 1363 | for (auto &Op : I.operands()) |
| 1364 | if (Instruction *Def = dyn_cast<Instruction>(Val&: Op)) |
| 1365 | if (FC0.L->contains(BB: Def->getParent())) { |
| 1366 | InvalidDependencies++; |
| 1367 | return false; |
| 1368 | } |
| 1369 | |
| 1370 | return true; |
| 1371 | } |
| 1372 | |
| 1373 | /// Determine if two fusion candidates are strictly adjacent in the CFG. |
| 1374 | /// |
| 1375 | /// This method will determine if there are additional basic blocks in the CFG |
| 1376 | /// between the exit of \p FC0 and the entry of \p FC1. |
| 1377 | /// If the two candidates are guarded loops, then it checks whether the |
| 1378 | /// exit block of the \p FC0 is the predecessor of the \p FC1 preheader. This |
| 1379 | /// implicitly ensures that the non-loop successor of the \p FC0 guard branch |
| 1380 | /// is the entry block of \p FC1. If not, then the loops are not adjacent. If |
| 1381 | /// the two candidates are not guarded loops, then it checks whether the exit |
| 1382 | /// block of \p FC0 is the preheader of \p FC1. |
| 1383 | /// Strictly means there is no predecessor for FC1 unless it is from FC0, |
| 1384 | /// i.e., FC0 dominates FC1. |
| 1385 | bool isStrictlyAdjacent(const FusionCandidate &FC0, |
| 1386 | const FusionCandidate &FC1) const { |
| 1387 | // If the successor of the guard branch is FC1, then the loops are adjacent |
| 1388 | if (FC0.GuardBranch) |
| 1389 | return DT.dominates(A: FC0.getEntryBlock(), B: FC1.getEntryBlock()) && |
| 1390 | FC0.ExitBlock->getSingleSuccessor() == FC1.getEntryBlock(); |
| 1391 | else |
| 1392 | return FC0.ExitBlock == FC1.getEntryBlock(); |
| 1393 | } |
| 1394 | |
| 1395 | bool isEmptyPreheader(const FusionCandidate &FC) const { |
| 1396 | return FC.Preheader->size() == 1; |
| 1397 | } |
| 1398 | |
| 1399 | /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader |
| 1400 | /// and sink others into the body of \p FC1. |
| 1401 | void movePreheaderInsts(const FusionCandidate &FC0, |
| 1402 | const FusionCandidate &FC1, |
| 1403 | SmallVector<Instruction *, 4> &HoistInsts, |
| 1404 | SmallVector<Instruction *, 4> &SinkInsts) const { |
| 1405 | // All preheader instructions except the branch must be hoisted or sunk |
| 1406 | assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 && |
| 1407 | "Attempting to sink and hoist preheader instructions, but not all " |
| 1408 | "the preheader instructions are accounted for." ); |
| 1409 | |
| 1410 | NumHoistedInsts += HoistInsts.size(); |
| 1411 | NumSunkInsts += SinkInsts.size(); |
| 1412 | |
| 1413 | LLVM_DEBUG(if (VerboseFusionDebugging) { |
| 1414 | if (!HoistInsts.empty()) |
| 1415 | dbgs() << "Hoisting: \n" ; |
| 1416 | for (Instruction *I : HoistInsts) |
| 1417 | dbgs() << *I << "\n" ; |
| 1418 | if (!SinkInsts.empty()) |
| 1419 | dbgs() << "Sinking: \n" ; |
| 1420 | for (Instruction *I : SinkInsts) |
| 1421 | dbgs() << *I << "\n" ; |
| 1422 | }); |
| 1423 | |
| 1424 | for (Instruction *I : HoistInsts) { |
| 1425 | assert(I->getParent() == FC1.Preheader); |
| 1426 | I->moveBefore(BB&: *FC0.Preheader, |
| 1427 | I: FC0.Preheader->getTerminator()->getIterator()); |
| 1428 | } |
| 1429 | // insert instructions in reverse order to maintain dominance relationship |
| 1430 | for (Instruction *I : reverse(C&: SinkInsts)) { |
| 1431 | assert(I->getParent() == FC1.Preheader); |
| 1432 | if (isa<PHINode>(Val: I)) { |
| 1433 | // The Phis to be sunk should have only one incoming value, as is |
| 1434 | // assured by the condition that the second loop is dominated by the |
| 1435 | // first one which is enforced by isStrictlyAdjacent(). |
| 1436 | // Replace the phi uses with the corresponding incoming value to clean |
| 1437 | // up the code. |
| 1438 | assert(cast<PHINode>(I)->getNumIncomingValues() == 1 && |
| 1439 | "Expected the sunk PHI node to have 1 incoming value." ); |
| 1440 | I->replaceAllUsesWith(V: I->getOperand(i: 0)); |
| 1441 | I->eraseFromParent(); |
| 1442 | } else |
| 1443 | I->moveBefore(BB&: *FC1.ExitBlock, I: FC1.ExitBlock->getFirstInsertionPt()); |
| 1444 | } |
| 1445 | } |
| 1446 | |
| 1447 | /// Determine if two fusion candidates have identical guards |
| 1448 | /// |
| 1449 | /// This method will determine if two fusion candidates have the same guards. |
| 1450 | /// The guards are considered the same if: |
| 1451 | /// 1. The instructions to compute the condition used in the compare are |
| 1452 | /// identical. |
| 1453 | /// 2. The successors of the guard have the same flow into/around the loop. |
| 1454 | /// If the compare instructions are identical, then the first successor of the |
| 1455 | /// guard must go to the same place (either the preheader of the loop or the |
| 1456 | /// NonLoopBlock). In other words, the first successor of both loops must |
| 1457 | /// both go into the loop (i.e., the preheader) or go around the loop (i.e., |
| 1458 | /// the NonLoopBlock). The same must be true for the second successor. |
| 1459 | bool haveIdenticalGuards(const FusionCandidate &FC0, |
| 1460 | const FusionCandidate &FC1) const { |
| 1461 | assert(FC0.GuardBranch && FC1.GuardBranch && |
| 1462 | "Expecting FC0 and FC1 to be guarded loops." ); |
| 1463 | |
| 1464 | if (auto FC0CmpInst = |
| 1465 | dyn_cast<Instruction>(Val: FC0.GuardBranch->getCondition())) |
| 1466 | if (auto FC1CmpInst = |
| 1467 | dyn_cast<Instruction>(Val: FC1.GuardBranch->getCondition())) |
| 1468 | if (!FC0CmpInst->isIdenticalTo(I: FC1CmpInst)) |
| 1469 | return false; |
| 1470 | |
| 1471 | // The compare instructions are identical. |
| 1472 | // Now make sure the successor of the guards have the same flow into/around |
| 1473 | // the loop |
| 1474 | if (FC0.GuardBranch->getSuccessor(i: 0) == FC0.Preheader) |
| 1475 | return (FC1.GuardBranch->getSuccessor(i: 0) == FC1.Preheader); |
| 1476 | else |
| 1477 | return (FC1.GuardBranch->getSuccessor(i: 1) == FC1.Preheader); |
| 1478 | } |
| 1479 | |
| 1480 | /// Modify the latch branch of FC to be unconditional since successors of the |
| 1481 | /// branch are the same. |
| 1482 | void simplifyLatchBranch(const FusionCandidate &FC) const { |
| 1483 | BranchInst *FCLatchBranch = dyn_cast<BranchInst>(Val: FC.Latch->getTerminator()); |
| 1484 | if (FCLatchBranch) { |
| 1485 | assert(FCLatchBranch->isConditional() && |
| 1486 | FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && |
| 1487 | "Expecting the two successors of FCLatchBranch to be the same" ); |
| 1488 | BranchInst *NewBranch = |
| 1489 | BranchInst::Create(IfTrue: FCLatchBranch->getSuccessor(i: 0)); |
| 1490 | ReplaceInstWithInst(From: FCLatchBranch, To: NewBranch); |
| 1491 | } |
| 1492 | } |
| 1493 | |
| 1494 | /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique |
| 1495 | /// successor, then merge FC0.Latch with its unique successor. |
| 1496 | void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { |
| 1497 | moveInstructionsToTheBeginning(FromBB&: *FC0.Latch, ToBB&: *FC1.Latch, DT, PDT, DI); |
| 1498 | if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { |
| 1499 | MergeBlockIntoPredecessor(BB: Succ, DTU: &DTU, LI: &LI); |
| 1500 | DTU.flush(); |
| 1501 | } |
| 1502 | } |
| 1503 | |
| 1504 | /// Fuse two fusion candidates, creating a new fused loop. |
| 1505 | /// |
| 1506 | /// This method contains the mechanics of fusing two loops, represented by \p |
| 1507 | /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1 |
| 1508 | /// postdominates \p FC0 (making them control flow equivalent). It also |
| 1509 | /// assumes that the other conditions for fusion have been met: adjacent, |
| 1510 | /// identical trip counts, and no negative distance dependencies exist that |
| 1511 | /// would prevent fusion. Thus, there is no checking for these conditions in |
| 1512 | /// this method. |
| 1513 | /// |
| 1514 | /// Fusion is performed by rewiring the CFG to update successor blocks of the |
| 1515 | /// components of tho loop. Specifically, the following changes are done: |
| 1516 | /// |
| 1517 | /// 1. The preheader of \p FC1 is removed as it is no longer necessary |
| 1518 | /// (because it is currently only a single statement block). |
| 1519 | /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1. |
| 1520 | /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0. |
| 1521 | /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0. |
| 1522 | /// |
| 1523 | /// All of these modifications are done with dominator tree updates, thus |
| 1524 | /// keeping the dominator (and post dominator) information up-to-date. |
| 1525 | /// |
| 1526 | /// This can be improved in the future by actually merging blocks during |
| 1527 | /// fusion. For example, the preheader of \p FC1 can be merged with the |
| 1528 | /// preheader of \p FC0. This would allow loops with more than a single |
| 1529 | /// statement in the preheader to be fused. Similarly, the latch blocks of the |
| 1530 | /// two loops could also be fused into a single block. This will require |
| 1531 | /// analysis to prove it is safe to move the contents of the block past |
| 1532 | /// existing code, which currently has not been implemented. |
| 1533 | Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) { |
| 1534 | assert(FC0.isValid() && FC1.isValid() && |
| 1535 | "Expecting valid fusion candidates" ); |
| 1536 | |
| 1537 | LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n" ; FC0.dump(); |
| 1538 | dbgs() << "Fusion Candidate 1: \n" ; FC1.dump();); |
| 1539 | |
| 1540 | // Move instructions from the preheader of FC1 to the end of the preheader |
| 1541 | // of FC0. |
| 1542 | moveInstructionsToTheEnd(FromBB&: *FC1.Preheader, ToBB&: *FC0.Preheader, DT, PDT, DI); |
| 1543 | |
| 1544 | // Fusing guarded loops is handled slightly differently than non-guarded |
| 1545 | // loops and has been broken out into a separate method instead of trying to |
| 1546 | // intersperse the logic within a single method. |
| 1547 | if (FC0.GuardBranch) |
| 1548 | return fuseGuardedLoops(FC0, FC1); |
| 1549 | |
| 1550 | assert(FC1.Preheader == |
| 1551 | (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock)); |
| 1552 | assert(FC1.Preheader->size() == 1 && |
| 1553 | FC1.Preheader->getSingleSuccessor() == FC1.Header); |
| 1554 | |
| 1555 | // Remember the phi nodes originally in the header of FC0 in order to rewire |
| 1556 | // them later. However, this is only necessary if the new loop carried |
| 1557 | // values might not dominate the exiting branch. While we do not generally |
| 1558 | // test if this is the case but simply insert intermediate phi nodes, we |
| 1559 | // need to make sure these intermediate phi nodes have different |
| 1560 | // predecessors. To this end, we filter the special case where the exiting |
| 1561 | // block is the latch block of the first loop. Nothing needs to be done |
| 1562 | // anyway as all loop carried values dominate the latch and thereby also the |
| 1563 | // exiting branch. |
| 1564 | SmallVector<PHINode *, 8> OriginalFC0PHIs; |
| 1565 | if (FC0.ExitingBlock != FC0.Latch) |
| 1566 | for (PHINode &PHI : FC0.Header->phis()) |
| 1567 | OriginalFC0PHIs.push_back(Elt: &PHI); |
| 1568 | |
| 1569 | // Replace incoming blocks for header PHIs first. |
| 1570 | FC1.Preheader->replaceSuccessorsPhiUsesWith(New: FC0.Preheader); |
| 1571 | FC0.Latch->replaceSuccessorsPhiUsesWith(New: FC1.Latch); |
| 1572 | |
| 1573 | // Then modify the control flow and update DT and PDT. |
| 1574 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
| 1575 | |
| 1576 | // The old exiting block of the first loop (FC0) has to jump to the header |
| 1577 | // of the second as we need to execute the code in the second header block |
| 1578 | // regardless of the trip count. That is, if the trip count is 0, so the |
| 1579 | // back edge is never taken, we still have to execute both loop headers, |
| 1580 | // especially (but not only!) if the second is a do-while style loop. |
| 1581 | // However, doing so might invalidate the phi nodes of the first loop as |
| 1582 | // the new values do only need to dominate their latch and not the exiting |
| 1583 | // predicate. To remedy this potential problem we always introduce phi |
| 1584 | // nodes in the header of the second loop later that select the loop carried |
| 1585 | // value, if the second header was reached through an old latch of the |
| 1586 | // first, or undef otherwise. This is sound as exiting the first implies the |
| 1587 | // second will exit too, __without__ taking the back-edge. [Their |
| 1588 | // trip-counts are equal after all. |
| 1589 | // KB: Would this sequence be simpler to just make FC0.ExitingBlock go |
| 1590 | // to FC1.Header? I think this is basically what the three sequences are |
| 1591 | // trying to accomplish; however, doing this directly in the CFG may mean |
| 1592 | // the DT/PDT becomes invalid |
| 1593 | if (!FC0.Peeled) { |
| 1594 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC1.Preheader, |
| 1595 | To: FC1.Header); |
| 1596 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1597 | DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); |
| 1598 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1599 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
| 1600 | } else { |
| 1601 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1602 | DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader)); |
| 1603 | |
| 1604 | // Remove the ExitBlock of the first Loop (also not needed) |
| 1605 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC0.ExitBlock, |
| 1606 | To: FC1.Header); |
| 1607 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1608 | DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); |
| 1609 | FC0.ExitBlock->getTerminator()->eraseFromParent(); |
| 1610 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1611 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
| 1612 | new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); |
| 1613 | } |
| 1614 | |
| 1615 | // The pre-header of L1 is not necessary anymore. |
| 1616 | assert(pred_empty(FC1.Preheader)); |
| 1617 | FC1.Preheader->getTerminator()->eraseFromParent(); |
| 1618 | new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); |
| 1619 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1620 | DominatorTree::Delete, FC1.Preheader, FC1.Header)); |
| 1621 | |
| 1622 | // Moves the phi nodes from the second to the first loops header block. |
| 1623 | while (PHINode *PHI = dyn_cast<PHINode>(Val: &FC1.Header->front())) { |
| 1624 | if (SE.isSCEVable(Ty: PHI->getType())) |
| 1625 | SE.forgetValue(V: PHI); |
| 1626 | if (PHI->hasNUsesOrMore(N: 1)) |
| 1627 | PHI->moveBefore(InsertPos: FC0.Header->getFirstInsertionPt()); |
| 1628 | else |
| 1629 | PHI->eraseFromParent(); |
| 1630 | } |
| 1631 | |
| 1632 | // Introduce new phi nodes in the second loop header to ensure |
| 1633 | // exiting the first and jumping to the header of the second does not break |
| 1634 | // the SSA property of the phis originally in the first loop. See also the |
| 1635 | // comment above. |
| 1636 | BasicBlock::iterator = FC1.Header->begin(); |
| 1637 | for (PHINode *LCPHI : OriginalFC0PHIs) { |
| 1638 | int L1LatchBBIdx = LCPHI->getBasicBlockIndex(BB: FC1.Latch); |
| 1639 | assert(L1LatchBBIdx >= 0 && |
| 1640 | "Expected loop carried value to be rewired at this point!" ); |
| 1641 | |
| 1642 | Value *LCV = LCPHI->getIncomingValue(i: L1LatchBBIdx); |
| 1643 | |
| 1644 | PHINode * = |
| 1645 | PHINode::Create(Ty: LCV->getType(), NumReservedValues: 2, NameStr: LCPHI->getName() + ".afterFC0" ); |
| 1646 | L1HeaderPHI->insertBefore(InsertPos: L1HeaderIP); |
| 1647 | L1HeaderPHI->addIncoming(V: LCV, BB: FC0.Latch); |
| 1648 | L1HeaderPHI->addIncoming(V: PoisonValue::get(T: LCV->getType()), |
| 1649 | BB: FC0.ExitingBlock); |
| 1650 | |
| 1651 | LCPHI->setIncomingValue(i: L1LatchBBIdx, V: L1HeaderPHI); |
| 1652 | } |
| 1653 | |
| 1654 | // Replace latch terminator destinations. |
| 1655 | FC0.Latch->getTerminator()->replaceUsesOfWith(From: FC0.Header, To: FC1.Header); |
| 1656 | FC1.Latch->getTerminator()->replaceUsesOfWith(From: FC1.Header, To: FC0.Header); |
| 1657 | |
| 1658 | // Modify the latch branch of FC0 to be unconditional as both successors of |
| 1659 | // the branch are the same. |
| 1660 | simplifyLatchBranch(FC: FC0); |
| 1661 | |
| 1662 | // If FC0.Latch and FC0.ExitingBlock are the same then we have already |
| 1663 | // performed the updates above. |
| 1664 | if (FC0.Latch != FC0.ExitingBlock) |
| 1665 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1666 | DominatorTree::Insert, FC0.Latch, FC1.Header)); |
| 1667 | |
| 1668 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
| 1669 | FC0.Latch, FC0.Header)); |
| 1670 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Insert, |
| 1671 | FC1.Latch, FC0.Header)); |
| 1672 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
| 1673 | FC1.Latch, FC1.Header)); |
| 1674 | |
| 1675 | // Update DT/PDT |
| 1676 | DTU.applyUpdates(Updates: TreeUpdates); |
| 1677 | |
| 1678 | LI.removeBlock(BB: FC1.Preheader); |
| 1679 | DTU.deleteBB(DelBB: FC1.Preheader); |
| 1680 | if (FC0.Peeled) { |
| 1681 | LI.removeBlock(BB: FC0.ExitBlock); |
| 1682 | DTU.deleteBB(DelBB: FC0.ExitBlock); |
| 1683 | } |
| 1684 | |
| 1685 | DTU.flush(); |
| 1686 | |
| 1687 | // Is there a way to keep SE up-to-date so we don't need to forget the loops |
| 1688 | // and rebuild the information in subsequent passes of fusion? |
| 1689 | // Note: Need to forget the loops before merging the loop latches, as |
| 1690 | // mergeLatch may remove the only block in FC1. |
| 1691 | SE.forgetLoop(L: FC1.L); |
| 1692 | SE.forgetLoop(L: FC0.L); |
| 1693 | |
| 1694 | // Move instructions from FC0.Latch to FC1.Latch. |
| 1695 | // Note: mergeLatch requires an updated DT. |
| 1696 | mergeLatch(FC0, FC1); |
| 1697 | |
| 1698 | // Forget block dispositions as well, so that there are no dangling |
| 1699 | // pointers to erased/free'ed blocks. It should be done after mergeLatch() |
| 1700 | // since merging the latches may affect the dispositions. |
| 1701 | SE.forgetBlockAndLoopDispositions(); |
| 1702 | |
| 1703 | // Forget the cached SCEV values including the induction variable that may |
| 1704 | // have changed after the fusion. |
| 1705 | SE.forgetLoop(L: FC0.L); |
| 1706 | |
| 1707 | // Merge the loops. |
| 1708 | SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); |
| 1709 | for (BasicBlock *BB : Blocks) { |
| 1710 | FC0.L->addBlockEntry(BB); |
| 1711 | FC1.L->removeBlockFromLoop(BB); |
| 1712 | if (LI.getLoopFor(BB) != FC1.L) |
| 1713 | continue; |
| 1714 | LI.changeLoopFor(BB, L: FC0.L); |
| 1715 | } |
| 1716 | while (!FC1.L->isInnermost()) { |
| 1717 | const auto &ChildLoopIt = FC1.L->begin(); |
| 1718 | Loop *ChildLoop = *ChildLoopIt; |
| 1719 | FC1.L->removeChildLoop(I: ChildLoopIt); |
| 1720 | FC0.L->addChildLoop(NewChild: ChildLoop); |
| 1721 | } |
| 1722 | |
| 1723 | // Delete the now empty loop L1. |
| 1724 | LI.erase(L: FC1.L); |
| 1725 | |
| 1726 | #ifndef NDEBUG |
| 1727 | assert(!verifyFunction(*FC0.Header->getParent(), &errs())); |
| 1728 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
| 1729 | assert(PDT.verify()); |
| 1730 | LI.verify(DT); |
| 1731 | SE.verify(); |
| 1732 | #endif |
| 1733 | |
| 1734 | LLVM_DEBUG(dbgs() << "Fusion done:\n" ); |
| 1735 | |
| 1736 | return FC0.L; |
| 1737 | } |
| 1738 | |
| 1739 | /// Report details on loop fusion opportunities. |
| 1740 | /// |
| 1741 | /// This template function can be used to report both successful and missed |
| 1742 | /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should |
| 1743 | /// be one of: |
| 1744 | /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful |
| 1745 | /// given two valid fusion candidates. |
| 1746 | /// - OptimizationRemark to report successful fusion of two fusion |
| 1747 | /// candidates. |
| 1748 | /// The remarks will be printed using the form: |
| 1749 | /// <path/filename>:<line number>:<column number>: [<function name>]: |
| 1750 | /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> |
| 1751 | template <typename RemarkKind> |
| 1752 | void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, |
| 1753 | Statistic &Stat) { |
| 1754 | assert(FC0.Preheader && FC1.Preheader && |
| 1755 | "Expecting valid fusion candidates" ); |
| 1756 | using namespace ore; |
| 1757 | #if LLVM_ENABLE_STATS |
| 1758 | ++Stat; |
| 1759 | ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), |
| 1760 | FC0.Preheader) |
| 1761 | << "[" << FC0.Preheader->getParent()->getName() |
| 1762 | << "]: " << NV("Cand1" , StringRef(FC0.Preheader->getName())) |
| 1763 | << " and " << NV("Cand2" , StringRef(FC1.Preheader->getName())) |
| 1764 | << ": " << Stat.getDesc()); |
| 1765 | #endif |
| 1766 | } |
| 1767 | |
| 1768 | /// Fuse two guarded fusion candidates, creating a new fused loop. |
| 1769 | /// |
| 1770 | /// Fusing guarded loops is handled much the same way as fusing non-guarded |
| 1771 | /// loops. The rewiring of the CFG is slightly different though, because of |
| 1772 | /// the presence of the guards around the loops and the exit blocks after the |
| 1773 | /// loop body. As such, the new loop is rewired as follows: |
| 1774 | /// 1. Keep the guard branch from FC0 and use the non-loop block target |
| 1775 | /// from the FC1 guard branch. |
| 1776 | /// 2. Remove the exit block from FC0 (this exit block should be empty |
| 1777 | /// right now). |
| 1778 | /// 3. Remove the guard branch for FC1 |
| 1779 | /// 4. Remove the preheader for FC1. |
| 1780 | /// The exit block successor for the latch of FC0 is updated to be the header |
| 1781 | /// of FC1 and the non-exit block successor of the latch of FC1 is updated to |
| 1782 | /// be the header of FC0, thus creating the fused loop. |
| 1783 | Loop *fuseGuardedLoops(const FusionCandidate &FC0, |
| 1784 | const FusionCandidate &FC1) { |
| 1785 | assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops" ); |
| 1786 | |
| 1787 | BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); |
| 1788 | BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); |
| 1789 | BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); |
| 1790 | BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); |
| 1791 | BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor(); |
| 1792 | |
| 1793 | // Move instructions from the exit block of FC0 to the beginning of the exit |
| 1794 | // block of FC1, in the case that the FC0 loop has not been peeled. In the |
| 1795 | // case that FC0 loop is peeled, then move the instructions of the successor |
| 1796 | // of the FC0 Exit block to the beginning of the exit block of FC1. |
| 1797 | moveInstructionsToTheBeginning( |
| 1798 | FromBB&: (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), ToBB&: *FC1.ExitBlock, |
| 1799 | DT, PDT, DI); |
| 1800 | |
| 1801 | // Move instructions from the guard block of FC1 to the end of the guard |
| 1802 | // block of FC0. |
| 1803 | moveInstructionsToTheEnd(FromBB&: *FC1GuardBlock, ToBB&: *FC0GuardBlock, DT, PDT, DI); |
| 1804 | |
| 1805 | assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent" ); |
| 1806 | |
| 1807 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
| 1808 | |
| 1809 | //////////////////////////////////////////////////////////////////////////// |
| 1810 | // Update the Loop Guard |
| 1811 | //////////////////////////////////////////////////////////////////////////// |
| 1812 | // The guard for FC0 is updated to guard both FC0 and FC1. This is done by |
| 1813 | // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. |
| 1814 | // Thus, one path from the guard goes to the preheader for FC0 (and thus |
| 1815 | // executes the new fused loop) and the other path goes to the NonLoopBlock |
| 1816 | // for FC1 (where FC1 guard would have gone if FC1 was not executed). |
| 1817 | FC1NonLoopBlock->replacePhiUsesWith(Old: FC1GuardBlock, New: FC0GuardBlock); |
| 1818 | FC0.GuardBranch->replaceUsesOfWith(From: FC0NonLoopBlock, To: FC1NonLoopBlock); |
| 1819 | |
| 1820 | BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock; |
| 1821 | BBToUpdate->getTerminator()->replaceUsesOfWith(From: FC1GuardBlock, To: FC1.Header); |
| 1822 | |
| 1823 | // The guard of FC1 is not necessary anymore. |
| 1824 | FC1.GuardBranch->eraseFromParent(); |
| 1825 | new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); |
| 1826 | |
| 1827 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1828 | DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); |
| 1829 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1830 | DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); |
| 1831 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1832 | DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); |
| 1833 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1834 | DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); |
| 1835 | |
| 1836 | if (FC0.Peeled) { |
| 1837 | // Remove the Block after the ExitBlock of FC0 |
| 1838 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1839 | DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock)); |
| 1840 | FC0ExitBlockSuccessor->getTerminator()->eraseFromParent(); |
| 1841 | new UnreachableInst(FC0ExitBlockSuccessor->getContext(), |
| 1842 | FC0ExitBlockSuccessor); |
| 1843 | } |
| 1844 | |
| 1845 | assert(pred_empty(FC1GuardBlock) && |
| 1846 | "Expecting guard block to have no predecessors" ); |
| 1847 | assert(succ_empty(FC1GuardBlock) && |
| 1848 | "Expecting guard block to have no successors" ); |
| 1849 | |
| 1850 | // Remember the phi nodes originally in the header of FC0 in order to rewire |
| 1851 | // them later. However, this is only necessary if the new loop carried |
| 1852 | // values might not dominate the exiting branch. While we do not generally |
| 1853 | // test if this is the case but simply insert intermediate phi nodes, we |
| 1854 | // need to make sure these intermediate phi nodes have different |
| 1855 | // predecessors. To this end, we filter the special case where the exiting |
| 1856 | // block is the latch block of the first loop. Nothing needs to be done |
| 1857 | // anyway as all loop carried values dominate the latch and thereby also the |
| 1858 | // exiting branch. |
| 1859 | // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch |
| 1860 | // (because the loops are rotated. Thus, nothing will ever be added to |
| 1861 | // OriginalFC0PHIs. |
| 1862 | SmallVector<PHINode *, 8> OriginalFC0PHIs; |
| 1863 | if (FC0.ExitingBlock != FC0.Latch) |
| 1864 | for (PHINode &PHI : FC0.Header->phis()) |
| 1865 | OriginalFC0PHIs.push_back(Elt: &PHI); |
| 1866 | |
| 1867 | assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!" ); |
| 1868 | |
| 1869 | // Replace incoming blocks for header PHIs first. |
| 1870 | FC1.Preheader->replaceSuccessorsPhiUsesWith(New: FC0.Preheader); |
| 1871 | FC0.Latch->replaceSuccessorsPhiUsesWith(New: FC1.Latch); |
| 1872 | |
| 1873 | // The old exiting block of the first loop (FC0) has to jump to the header |
| 1874 | // of the second as we need to execute the code in the second header block |
| 1875 | // regardless of the trip count. That is, if the trip count is 0, so the |
| 1876 | // back edge is never taken, we still have to execute both loop headers, |
| 1877 | // especially (but not only!) if the second is a do-while style loop. |
| 1878 | // However, doing so might invalidate the phi nodes of the first loop as |
| 1879 | // the new values do only need to dominate their latch and not the exiting |
| 1880 | // predicate. To remedy this potential problem we always introduce phi |
| 1881 | // nodes in the header of the second loop later that select the loop carried |
| 1882 | // value, if the second header was reached through an old latch of the |
| 1883 | // first, or undef otherwise. This is sound as exiting the first implies the |
| 1884 | // second will exit too, __without__ taking the back-edge (their |
| 1885 | // trip-counts are equal after all). |
| 1886 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC0.ExitBlock, |
| 1887 | To: FC1.Header); |
| 1888 | |
| 1889 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1890 | DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); |
| 1891 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1892 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
| 1893 | |
| 1894 | // Remove FC0 Exit Block |
| 1895 | // The exit block for FC0 is no longer needed since control will flow |
| 1896 | // directly to the header of FC1. Since it is an empty block, it can be |
| 1897 | // removed at this point. |
| 1898 | // TODO: In the future, we can handle non-empty exit blocks my merging any |
| 1899 | // instructions from FC0 exit block into FC1 exit block prior to removing |
| 1900 | // the block. |
| 1901 | assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty" ); |
| 1902 | FC0.ExitBlock->getTerminator()->eraseFromParent(); |
| 1903 | new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); |
| 1904 | |
| 1905 | // Remove FC1 Preheader |
| 1906 | // The pre-header of L1 is not necessary anymore. |
| 1907 | assert(pred_empty(FC1.Preheader)); |
| 1908 | FC1.Preheader->getTerminator()->eraseFromParent(); |
| 1909 | new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); |
| 1910 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1911 | DominatorTree::Delete, FC1.Preheader, FC1.Header)); |
| 1912 | |
| 1913 | // Moves the phi nodes from the second to the first loops header block. |
| 1914 | while (PHINode *PHI = dyn_cast<PHINode>(Val: &FC1.Header->front())) { |
| 1915 | if (SE.isSCEVable(Ty: PHI->getType())) |
| 1916 | SE.forgetValue(V: PHI); |
| 1917 | if (PHI->hasNUsesOrMore(N: 1)) |
| 1918 | PHI->moveBefore(InsertPos: FC0.Header->getFirstInsertionPt()); |
| 1919 | else |
| 1920 | PHI->eraseFromParent(); |
| 1921 | } |
| 1922 | |
| 1923 | // Introduce new phi nodes in the second loop header to ensure |
| 1924 | // exiting the first and jumping to the header of the second does not break |
| 1925 | // the SSA property of the phis originally in the first loop. See also the |
| 1926 | // comment above. |
| 1927 | BasicBlock::iterator = FC1.Header->begin(); |
| 1928 | for (PHINode *LCPHI : OriginalFC0PHIs) { |
| 1929 | int L1LatchBBIdx = LCPHI->getBasicBlockIndex(BB: FC1.Latch); |
| 1930 | assert(L1LatchBBIdx >= 0 && |
| 1931 | "Expected loop carried value to be rewired at this point!" ); |
| 1932 | |
| 1933 | Value *LCV = LCPHI->getIncomingValue(i: L1LatchBBIdx); |
| 1934 | |
| 1935 | PHINode * = |
| 1936 | PHINode::Create(Ty: LCV->getType(), NumReservedValues: 2, NameStr: LCPHI->getName() + ".afterFC0" ); |
| 1937 | L1HeaderPHI->insertBefore(InsertPos: L1HeaderIP); |
| 1938 | L1HeaderPHI->addIncoming(V: LCV, BB: FC0.Latch); |
| 1939 | L1HeaderPHI->addIncoming(V: PoisonValue::get(T: LCV->getType()), |
| 1940 | BB: FC0.ExitingBlock); |
| 1941 | |
| 1942 | LCPHI->setIncomingValue(i: L1LatchBBIdx, V: L1HeaderPHI); |
| 1943 | } |
| 1944 | |
| 1945 | // Update the latches |
| 1946 | |
| 1947 | // Replace latch terminator destinations. |
| 1948 | FC0.Latch->getTerminator()->replaceUsesOfWith(From: FC0.Header, To: FC1.Header); |
| 1949 | FC1.Latch->getTerminator()->replaceUsesOfWith(From: FC1.Header, To: FC0.Header); |
| 1950 | |
| 1951 | // Modify the latch branch of FC0 to be unconditional as both successors of |
| 1952 | // the branch are the same. |
| 1953 | simplifyLatchBranch(FC: FC0); |
| 1954 | |
| 1955 | // If FC0.Latch and FC0.ExitingBlock are the same then we have already |
| 1956 | // performed the updates above. |
| 1957 | if (FC0.Latch != FC0.ExitingBlock) |
| 1958 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
| 1959 | DominatorTree::Insert, FC0.Latch, FC1.Header)); |
| 1960 | |
| 1961 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
| 1962 | FC0.Latch, FC0.Header)); |
| 1963 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Insert, |
| 1964 | FC1.Latch, FC0.Header)); |
| 1965 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
| 1966 | FC1.Latch, FC1.Header)); |
| 1967 | |
| 1968 | // All done |
| 1969 | // Apply the updates to the Dominator Tree and cleanup. |
| 1970 | |
| 1971 | assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!" ); |
| 1972 | assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!" ); |
| 1973 | |
| 1974 | // Update DT/PDT |
| 1975 | DTU.applyUpdates(Updates: TreeUpdates); |
| 1976 | |
| 1977 | LI.removeBlock(BB: FC1GuardBlock); |
| 1978 | LI.removeBlock(BB: FC1.Preheader); |
| 1979 | LI.removeBlock(BB: FC0.ExitBlock); |
| 1980 | if (FC0.Peeled) { |
| 1981 | LI.removeBlock(BB: FC0ExitBlockSuccessor); |
| 1982 | DTU.deleteBB(DelBB: FC0ExitBlockSuccessor); |
| 1983 | } |
| 1984 | DTU.deleteBB(DelBB: FC1GuardBlock); |
| 1985 | DTU.deleteBB(DelBB: FC1.Preheader); |
| 1986 | DTU.deleteBB(DelBB: FC0.ExitBlock); |
| 1987 | DTU.flush(); |
| 1988 | |
| 1989 | // Is there a way to keep SE up-to-date so we don't need to forget the loops |
| 1990 | // and rebuild the information in subsequent passes of fusion? |
| 1991 | // Note: Need to forget the loops before merging the loop latches, as |
| 1992 | // mergeLatch may remove the only block in FC1. |
| 1993 | SE.forgetLoop(L: FC1.L); |
| 1994 | SE.forgetLoop(L: FC0.L); |
| 1995 | |
| 1996 | // Move instructions from FC0.Latch to FC1.Latch. |
| 1997 | // Note: mergeLatch requires an updated DT. |
| 1998 | mergeLatch(FC0, FC1); |
| 1999 | |
| 2000 | // Forget block dispositions as well, so that there are no dangling |
| 2001 | // pointers to erased/free'ed blocks. It should be done after mergeLatch() |
| 2002 | // since merging the latches may affect the dispositions. |
| 2003 | SE.forgetBlockAndLoopDispositions(); |
| 2004 | |
| 2005 | // Merge the loops. |
| 2006 | SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); |
| 2007 | for (BasicBlock *BB : Blocks) { |
| 2008 | FC0.L->addBlockEntry(BB); |
| 2009 | FC1.L->removeBlockFromLoop(BB); |
| 2010 | if (LI.getLoopFor(BB) != FC1.L) |
| 2011 | continue; |
| 2012 | LI.changeLoopFor(BB, L: FC0.L); |
| 2013 | } |
| 2014 | while (!FC1.L->isInnermost()) { |
| 2015 | const auto &ChildLoopIt = FC1.L->begin(); |
| 2016 | Loop *ChildLoop = *ChildLoopIt; |
| 2017 | FC1.L->removeChildLoop(I: ChildLoopIt); |
| 2018 | FC0.L->addChildLoop(NewChild: ChildLoop); |
| 2019 | } |
| 2020 | |
| 2021 | // Delete the now empty loop L1. |
| 2022 | LI.erase(L: FC1.L); |
| 2023 | |
| 2024 | #ifndef NDEBUG |
| 2025 | assert(!verifyFunction(*FC0.Header->getParent(), &errs())); |
| 2026 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
| 2027 | assert(PDT.verify()); |
| 2028 | LI.verify(DT); |
| 2029 | SE.verify(); |
| 2030 | #endif |
| 2031 | |
| 2032 | LLVM_DEBUG(dbgs() << "Fusion done:\n" ); |
| 2033 | |
| 2034 | return FC0.L; |
| 2035 | } |
| 2036 | }; |
| 2037 | } // namespace |
| 2038 | |
| 2039 | PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { |
| 2040 | auto &LI = AM.getResult<LoopAnalysis>(IR&: F); |
| 2041 | auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
| 2042 | auto &DI = AM.getResult<DependenceAnalysis>(IR&: F); |
| 2043 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
| 2044 | auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(IR&: F); |
| 2045 | auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
| 2046 | auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F); |
| 2047 | const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
| 2048 | const DataLayout &DL = F.getDataLayout(); |
| 2049 | |
| 2050 | // Ensure loops are in simplifed form which is a pre-requisite for loop fusion |
| 2051 | // pass. Added only for new PM since the legacy PM has already added |
| 2052 | // LoopSimplify pass as a dependency. |
| 2053 | bool Changed = false; |
| 2054 | for (auto &L : LI) { |
| 2055 | Changed |= |
| 2056 | simplifyLoop(L, DT: &DT, LI: &LI, SE: &SE, AC: &AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */); |
| 2057 | } |
| 2058 | if (Changed) |
| 2059 | PDT.recalculate(Func&: F); |
| 2060 | |
| 2061 | LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); |
| 2062 | Changed |= LF.fuseLoops(F); |
| 2063 | if (!Changed) |
| 2064 | return PreservedAnalyses::all(); |
| 2065 | |
| 2066 | PreservedAnalyses PA; |
| 2067 | PA.preserve<DominatorTreeAnalysis>(); |
| 2068 | PA.preserve<PostDominatorTreeAnalysis>(); |
| 2069 | PA.preserve<ScalarEvolutionAnalysis>(); |
| 2070 | PA.preserve<LoopAnalysis>(); |
| 2071 | return PA; |
| 2072 | } |
| 2073 | |