1 | //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// This file implements the loop fusion pass. |
11 | /// The implementation is largely based on the following document: |
12 | /// |
13 | /// Code Transformations to Augment the Scope of Loop Fusion in a |
14 | /// Production Compiler |
15 | /// Christopher Mark Barton |
16 | /// MSc Thesis |
17 | /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf |
18 | /// |
19 | /// The general approach taken is to collect sets of control flow equivalent |
20 | /// loops and test whether they can be fused. The necessary conditions for |
21 | /// fusion are: |
22 | /// 1. The loops must be adjacent (there cannot be any statements between |
23 | /// the two loops). |
24 | /// 2. The loops must be conforming (they must execute the same number of |
25 | /// iterations). |
26 | /// 3. The loops must be control flow equivalent (if one loop executes, the |
27 | /// other is guaranteed to execute). |
28 | /// 4. There cannot be any negative distance dependencies between the loops. |
29 | /// If all of these conditions are satisfied, it is safe to fuse the loops. |
30 | /// |
31 | /// This implementation creates FusionCandidates that represent the loop and the |
32 | /// necessary information needed by fusion. It then operates on the fusion |
33 | /// candidates, first confirming that the candidate is eligible for fusion. The |
34 | /// candidates are then collected into control flow equivalent sets, sorted in |
35 | /// dominance order. Each set of control flow equivalent candidates is then |
36 | /// traversed, attempting to fuse pairs of candidates in the set. If all |
37 | /// requirements for fusion are met, the two candidates are fused, creating a |
38 | /// new (fused) candidate which is then added back into the set to consider for |
39 | /// additional fusion. |
40 | /// |
41 | /// This implementation currently does not make any modifications to remove |
42 | /// conditions for fusion. Code transformations to make loops conform to each of |
43 | /// the conditions for fusion are discussed in more detail in the document |
44 | /// above. These can be added to the current implementation in the future. |
45 | //===----------------------------------------------------------------------===// |
46 | |
47 | #include "llvm/Transforms/Scalar/LoopFuse.h" |
48 | #include "llvm/ADT/Statistic.h" |
49 | #include "llvm/Analysis/AssumptionCache.h" |
50 | #include "llvm/Analysis/DependenceAnalysis.h" |
51 | #include "llvm/Analysis/DomTreeUpdater.h" |
52 | #include "llvm/Analysis/LoopInfo.h" |
53 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
54 | #include "llvm/Analysis/PostDominators.h" |
55 | #include "llvm/Analysis/ScalarEvolution.h" |
56 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
57 | #include "llvm/Analysis/TargetTransformInfo.h" |
58 | #include "llvm/IR/Function.h" |
59 | #include "llvm/IR/Verifier.h" |
60 | #include "llvm/Support/CommandLine.h" |
61 | #include "llvm/Support/Debug.h" |
62 | #include "llvm/Support/raw_ostream.h" |
63 | #include "llvm/Transforms/Utils.h" |
64 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
65 | #include "llvm/Transforms/Utils/CodeMoverUtils.h" |
66 | #include "llvm/Transforms/Utils/LoopPeel.h" |
67 | #include "llvm/Transforms/Utils/LoopSimplify.h" |
68 | |
69 | using namespace llvm; |
70 | |
71 | #define DEBUG_TYPE "loop-fusion" |
72 | |
73 | STATISTIC(FuseCounter, "Loops fused" ); |
74 | STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion" ); |
75 | STATISTIC(, "Loop has invalid preheader" ); |
76 | STATISTIC(, "Loop has invalid header" ); |
77 | STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks" ); |
78 | STATISTIC(InvalidExitBlock, "Loop has invalid exit block" ); |
79 | STATISTIC(InvalidLatch, "Loop has invalid latch" ); |
80 | STATISTIC(InvalidLoop, "Loop is invalid" ); |
81 | STATISTIC(AddressTakenBB, "Basic block has address taken" ); |
82 | STATISTIC(MayThrowException, "Loop may throw an exception" ); |
83 | STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access" ); |
84 | STATISTIC(NotSimplifiedForm, "Loop is not in simplified form" ); |
85 | STATISTIC(InvalidDependencies, "Dependencies prevent fusion" ); |
86 | STATISTIC(UnknownTripCount, "Loop has unknown trip count" ); |
87 | STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop" ); |
88 | STATISTIC(NonEqualTripCount, "Loop trip counts are not the same" ); |
89 | STATISTIC(NonAdjacent, "Loops are not adjacent" ); |
90 | STATISTIC( |
91 | , |
92 | "Loop has a non-empty preheader with instructions that cannot be moved" ); |
93 | STATISTIC(FusionNotBeneficial, "Fusion is not beneficial" ); |
94 | STATISTIC(NonIdenticalGuards, "Candidates have different guards" ); |
95 | STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with " |
96 | "instructions that cannot be moved" ); |
97 | STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with " |
98 | "instructions that cannot be moved" ); |
99 | STATISTIC(NotRotated, "Candidate is not rotated" ); |
100 | STATISTIC(OnlySecondCandidateIsGuarded, |
101 | "The second candidate is guarded while the first one is not" ); |
102 | STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions." ); |
103 | STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions." ); |
104 | |
105 | enum FusionDependenceAnalysisChoice { |
106 | FUSION_DEPENDENCE_ANALYSIS_SCEV, |
107 | FUSION_DEPENDENCE_ANALYSIS_DA, |
108 | FUSION_DEPENDENCE_ANALYSIS_ALL, |
109 | }; |
110 | |
111 | static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis( |
112 | "loop-fusion-dependence-analysis" , |
113 | cl::desc("Which dependence analysis should loop fusion use?" ), |
114 | cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev" , |
115 | "Use the scalar evolution interface" ), |
116 | clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da" , |
117 | "Use the dependence analysis interface" ), |
118 | clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all" , |
119 | "Use all available analyses" )), |
120 | cl::Hidden, cl::init(Val: FUSION_DEPENDENCE_ANALYSIS_ALL)); |
121 | |
122 | static cl::opt<unsigned> FusionPeelMaxCount( |
123 | "loop-fusion-peel-max-count" , cl::init(Val: 0), cl::Hidden, |
124 | cl::desc("Max number of iterations to be peeled from a loop, such that " |
125 | "fusion can take place" )); |
126 | |
127 | #ifndef NDEBUG |
128 | static cl::opt<bool> |
129 | VerboseFusionDebugging("loop-fusion-verbose-debug" , |
130 | cl::desc("Enable verbose debugging for Loop Fusion" ), |
131 | cl::Hidden, cl::init(false)); |
132 | #endif |
133 | |
134 | namespace { |
135 | /// This class is used to represent a candidate for loop fusion. When it is |
136 | /// constructed, it checks the conditions for loop fusion to ensure that it |
137 | /// represents a valid candidate. It caches several parts of a loop that are |
138 | /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead |
139 | /// of continually querying the underlying Loop to retrieve these values. It is |
140 | /// assumed these will not change throughout loop fusion. |
141 | /// |
142 | /// The invalidate method should be used to indicate that the FusionCandidate is |
143 | /// no longer a valid candidate for fusion. Similarly, the isValid() method can |
144 | /// be used to ensure that the FusionCandidate is still valid for fusion. |
145 | struct FusionCandidate { |
146 | /// Cache of parts of the loop used throughout loop fusion. These should not |
147 | /// need to change throughout the analysis and transformation. |
148 | /// These parts are cached to avoid repeatedly looking up in the Loop class. |
149 | |
150 | /// Preheader of the loop this candidate represents |
151 | BasicBlock *Preheader; |
152 | /// Header of the loop this candidate represents |
153 | BasicBlock *Header; |
154 | /// Blocks in the loop that exit the loop |
155 | BasicBlock *ExitingBlock; |
156 | /// The successor block of this loop (where the exiting blocks go to) |
157 | BasicBlock *ExitBlock; |
158 | /// Latch of the loop |
159 | BasicBlock *Latch; |
160 | /// The loop that this fusion candidate represents |
161 | Loop *L; |
162 | /// Vector of instructions in this loop that read from memory |
163 | SmallVector<Instruction *, 16> MemReads; |
164 | /// Vector of instructions in this loop that write to memory |
165 | SmallVector<Instruction *, 16> MemWrites; |
166 | /// Are all of the members of this fusion candidate still valid |
167 | bool Valid; |
168 | /// Guard branch of the loop, if it exists |
169 | BranchInst *GuardBranch; |
170 | /// Peeling Paramaters of the Loop. |
171 | TTI::PeelingPreferences PP; |
172 | /// Can you Peel this Loop? |
173 | bool AbleToPeel; |
174 | /// Has this loop been Peeled |
175 | bool Peeled; |
176 | |
177 | /// Dominator and PostDominator trees are needed for the |
178 | /// FusionCandidateCompare function, required by FusionCandidateSet to |
179 | /// determine where the FusionCandidate should be inserted into the set. These |
180 | /// are used to establish ordering of the FusionCandidates based on dominance. |
181 | DominatorTree &DT; |
182 | const PostDominatorTree *PDT; |
183 | |
184 | OptimizationRemarkEmitter &ORE; |
185 | |
186 | FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT, |
187 | OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP) |
188 | : Preheader(L->getLoopPreheader()), Header(L->getHeader()), |
189 | ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), |
190 | Latch(L->getLoopLatch()), L(L), Valid(true), |
191 | GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), |
192 | Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { |
193 | |
194 | // Walk over all blocks in the loop and check for conditions that may |
195 | // prevent fusion. For each block, walk over all instructions and collect |
196 | // the memory reads and writes If any instructions that prevent fusion are |
197 | // found, invalidate this object and return. |
198 | for (BasicBlock *BB : L->blocks()) { |
199 | if (BB->hasAddressTaken()) { |
200 | invalidate(); |
201 | reportInvalidCandidate(Stat&: AddressTakenBB); |
202 | return; |
203 | } |
204 | |
205 | for (Instruction &I : *BB) { |
206 | if (I.mayThrow()) { |
207 | invalidate(); |
208 | reportInvalidCandidate(Stat&: MayThrowException); |
209 | return; |
210 | } |
211 | if (StoreInst *SI = dyn_cast<StoreInst>(Val: &I)) { |
212 | if (SI->isVolatile()) { |
213 | invalidate(); |
214 | reportInvalidCandidate(Stat&: ContainsVolatileAccess); |
215 | return; |
216 | } |
217 | } |
218 | if (LoadInst *LI = dyn_cast<LoadInst>(Val: &I)) { |
219 | if (LI->isVolatile()) { |
220 | invalidate(); |
221 | reportInvalidCandidate(Stat&: ContainsVolatileAccess); |
222 | return; |
223 | } |
224 | } |
225 | if (I.mayWriteToMemory()) |
226 | MemWrites.push_back(Elt: &I); |
227 | if (I.mayReadFromMemory()) |
228 | MemReads.push_back(Elt: &I); |
229 | } |
230 | } |
231 | } |
232 | |
233 | /// Check if all members of the class are valid. |
234 | bool isValid() const { |
235 | return Preheader && Header && ExitingBlock && ExitBlock && Latch && L && |
236 | !L->isInvalid() && Valid; |
237 | } |
238 | |
239 | /// Verify that all members are in sync with the Loop object. |
240 | void verify() const { |
241 | assert(isValid() && "Candidate is not valid!!" ); |
242 | assert(!L->isInvalid() && "Loop is invalid!" ); |
243 | assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync" ); |
244 | assert(Header == L->getHeader() && "Header is out of sync" ); |
245 | assert(ExitingBlock == L->getExitingBlock() && |
246 | "Exiting Blocks is out of sync" ); |
247 | assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync" ); |
248 | assert(Latch == L->getLoopLatch() && "Latch is out of sync" ); |
249 | } |
250 | |
251 | /// Get the entry block for this fusion candidate. |
252 | /// |
253 | /// If this fusion candidate represents a guarded loop, the entry block is the |
254 | /// loop guard block. If it represents an unguarded loop, the entry block is |
255 | /// the preheader of the loop. |
256 | BasicBlock *getEntryBlock() const { |
257 | if (GuardBranch) |
258 | return GuardBranch->getParent(); |
259 | else |
260 | return Preheader; |
261 | } |
262 | |
263 | /// After Peeling the loop is modified quite a bit, hence all of the Blocks |
264 | /// need to be updated accordingly. |
265 | void updateAfterPeeling() { |
266 | Preheader = L->getLoopPreheader(); |
267 | Header = L->getHeader(); |
268 | ExitingBlock = L->getExitingBlock(); |
269 | ExitBlock = L->getExitBlock(); |
270 | Latch = L->getLoopLatch(); |
271 | verify(); |
272 | } |
273 | |
274 | /// Given a guarded loop, get the successor of the guard that is not in the |
275 | /// loop. |
276 | /// |
277 | /// This method returns the successor of the loop guard that is not located |
278 | /// within the loop (i.e., the successor of the guard that is not the |
279 | /// preheader). |
280 | /// This method is only valid for guarded loops. |
281 | BasicBlock *getNonLoopBlock() const { |
282 | assert(GuardBranch && "Only valid on guarded loops." ); |
283 | assert(GuardBranch->isConditional() && |
284 | "Expecting guard to be a conditional branch." ); |
285 | if (Peeled) |
286 | return GuardBranch->getSuccessor(i: 1); |
287 | return (GuardBranch->getSuccessor(i: 0) == Preheader) |
288 | ? GuardBranch->getSuccessor(i: 1) |
289 | : GuardBranch->getSuccessor(i: 0); |
290 | } |
291 | |
292 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
293 | LLVM_DUMP_METHOD void dump() const { |
294 | dbgs() << "\tGuardBranch: " ; |
295 | if (GuardBranch) |
296 | dbgs() << *GuardBranch; |
297 | else |
298 | dbgs() << "nullptr" ; |
299 | dbgs() << "\n" |
300 | << (GuardBranch ? GuardBranch->getName() : "nullptr" ) << "\n" |
301 | << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr" ) |
302 | << "\n" |
303 | << "\tHeader: " << (Header ? Header->getName() : "nullptr" ) << "\n" |
304 | << "\tExitingBB: " |
305 | << (ExitingBlock ? ExitingBlock->getName() : "nullptr" ) << "\n" |
306 | << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr" ) |
307 | << "\n" |
308 | << "\tLatch: " << (Latch ? Latch->getName() : "nullptr" ) << "\n" |
309 | << "\tEntryBlock: " |
310 | << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr" ) |
311 | << "\n" ; |
312 | } |
313 | #endif |
314 | |
315 | /// Determine if a fusion candidate (representing a loop) is eligible for |
316 | /// fusion. Note that this only checks whether a single loop can be fused - it |
317 | /// does not check whether it is *legal* to fuse two loops together. |
318 | bool isEligibleForFusion(ScalarEvolution &SE) const { |
319 | if (!isValid()) { |
320 | LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n" ); |
321 | if (!Preheader) |
322 | ++InvalidPreheader; |
323 | if (!Header) |
324 | ++InvalidHeader; |
325 | if (!ExitingBlock) |
326 | ++InvalidExitingBlock; |
327 | if (!ExitBlock) |
328 | ++InvalidExitBlock; |
329 | if (!Latch) |
330 | ++InvalidLatch; |
331 | if (L->isInvalid()) |
332 | ++InvalidLoop; |
333 | |
334 | return false; |
335 | } |
336 | |
337 | // Require ScalarEvolution to be able to determine a trip count. |
338 | if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { |
339 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() |
340 | << " trip count not computable!\n" ); |
341 | return reportInvalidCandidate(Stat&: UnknownTripCount); |
342 | } |
343 | |
344 | if (!L->isLoopSimplifyForm()) { |
345 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() |
346 | << " is not in simplified form!\n" ); |
347 | return reportInvalidCandidate(Stat&: NotSimplifiedForm); |
348 | } |
349 | |
350 | if (!L->isRotatedForm()) { |
351 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n" ); |
352 | return reportInvalidCandidate(Stat&: NotRotated); |
353 | } |
354 | |
355 | return true; |
356 | } |
357 | |
358 | private: |
359 | // This is only used internally for now, to clear the MemWrites and MemReads |
360 | // list and setting Valid to false. I can't envision other uses of this right |
361 | // now, since once FusionCandidates are put into the FusionCandidateSet they |
362 | // are immutable. Thus, any time we need to change/update a FusionCandidate, |
363 | // we must create a new one and insert it into the FusionCandidateSet to |
364 | // ensure the FusionCandidateSet remains ordered correctly. |
365 | void invalidate() { |
366 | MemWrites.clear(); |
367 | MemReads.clear(); |
368 | Valid = false; |
369 | } |
370 | |
371 | bool reportInvalidCandidate(llvm::Statistic &Stat) const { |
372 | using namespace ore; |
373 | assert(L && Preheader && "Fusion candidate not initialized properly!" ); |
374 | #if LLVM_ENABLE_STATS |
375 | ++Stat; |
376 | ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), |
377 | L->getStartLoc(), Preheader) |
378 | << "[" << Preheader->getParent()->getName() << "]: " |
379 | << "Loop is not a candidate for fusion: " << Stat.getDesc()); |
380 | #endif |
381 | return false; |
382 | } |
383 | }; |
384 | |
385 | struct FusionCandidateCompare { |
386 | /// Comparison functor to sort two Control Flow Equivalent fusion candidates |
387 | /// into dominance order. |
388 | /// If LHS dominates RHS and RHS post-dominates LHS, return true; |
389 | /// If RHS dominates LHS and LHS post-dominates RHS, return false; |
390 | /// If both LHS and RHS are not dominating each other then, non-strictly |
391 | /// post dominate check will decide the order of candidates. If RHS |
392 | /// non-strictly post dominates LHS then, return true. If LHS non-strictly |
393 | /// post dominates RHS then, return false. If both are non-strictly post |
394 | /// dominate each other then, level in the post dominator tree will decide |
395 | /// the order of candidates. |
396 | bool operator()(const FusionCandidate &LHS, |
397 | const FusionCandidate &RHS) const { |
398 | const DominatorTree *DT = &(LHS.DT); |
399 | |
400 | BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); |
401 | BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); |
402 | |
403 | // Do not save PDT to local variable as it is only used in asserts and thus |
404 | // will trigger an unused variable warning if building without asserts. |
405 | assert(DT && LHS.PDT && "Expecting valid dominator tree" ); |
406 | |
407 | // Do this compare first so if LHS == RHS, function returns false. |
408 | if (DT->dominates(A: RHSEntryBlock, B: LHSEntryBlock)) { |
409 | // RHS dominates LHS |
410 | // Verify LHS post-dominates RHS |
411 | assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); |
412 | return false; |
413 | } |
414 | |
415 | if (DT->dominates(A: LHSEntryBlock, B: RHSEntryBlock)) { |
416 | // Verify RHS Postdominates LHS |
417 | assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); |
418 | return true; |
419 | } |
420 | |
421 | // If two FusionCandidates are in the same level of dominator tree, |
422 | // they will not dominate each other, but may still be control flow |
423 | // equivalent. To sort those FusionCandidates, nonStrictlyPostDominate() |
424 | // function is needed. |
425 | bool WrongOrder = |
426 | nonStrictlyPostDominate(ThisBlock: LHSEntryBlock, OtherBlock: RHSEntryBlock, DT, PDT: LHS.PDT); |
427 | bool RightOrder = |
428 | nonStrictlyPostDominate(ThisBlock: RHSEntryBlock, OtherBlock: LHSEntryBlock, DT, PDT: LHS.PDT); |
429 | if (WrongOrder && RightOrder) { |
430 | // If common predecessor of LHS and RHS post dominates both |
431 | // FusionCandidates then, Order of FusionCandidate can be |
432 | // identified by its level in post dominator tree. |
433 | DomTreeNode *LNode = LHS.PDT->getNode(BB: LHSEntryBlock); |
434 | DomTreeNode *RNode = LHS.PDT->getNode(BB: RHSEntryBlock); |
435 | return LNode->getLevel() > RNode->getLevel(); |
436 | } else if (WrongOrder) |
437 | return false; |
438 | else if (RightOrder) |
439 | return true; |
440 | |
441 | // If LHS does not non-strict Postdominate RHS and RHS does not non-strict |
442 | // Postdominate LHS then, there is no dominance relationship between the |
443 | // two FusionCandidates. Thus, they should not be in the same set together. |
444 | llvm_unreachable( |
445 | "No dominance relationship between these fusion candidates!" ); |
446 | } |
447 | }; |
448 | |
449 | using LoopVector = SmallVector<Loop *, 4>; |
450 | |
451 | // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance |
452 | // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0 |
453 | // dominates FC1 and FC1 post-dominates FC0. |
454 | // std::set was chosen because we want a sorted data structure with stable |
455 | // iterators. A subsequent patch to loop fusion will enable fusing non-adjacent |
456 | // loops by moving intervening code around. When this intervening code contains |
457 | // loops, those loops will be moved also. The corresponding FusionCandidates |
458 | // will also need to be moved accordingly. As this is done, having stable |
459 | // iterators will simplify the logic. Similarly, having an efficient insert that |
460 | // keeps the FusionCandidateSet sorted will also simplify the implementation. |
461 | using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; |
462 | using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; |
463 | |
464 | #if !defined(NDEBUG) |
465 | static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, |
466 | const FusionCandidate &FC) { |
467 | if (FC.isValid()) |
468 | OS << FC.Preheader->getName(); |
469 | else |
470 | OS << "<Invalid>" ; |
471 | |
472 | return OS; |
473 | } |
474 | |
475 | static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, |
476 | const FusionCandidateSet &CandSet) { |
477 | for (const FusionCandidate &FC : CandSet) |
478 | OS << FC << '\n'; |
479 | |
480 | return OS; |
481 | } |
482 | |
483 | static void |
484 | printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { |
485 | dbgs() << "Fusion Candidates: \n" ; |
486 | for (const auto &CandidateSet : FusionCandidates) { |
487 | dbgs() << "*** Fusion Candidate Set ***\n" ; |
488 | dbgs() << CandidateSet; |
489 | dbgs() << "****************************\n" ; |
490 | } |
491 | } |
492 | #endif |
493 | |
494 | /// Collect all loops in function at the same nest level, starting at the |
495 | /// outermost level. |
496 | /// |
497 | /// This data structure collects all loops at the same nest level for a |
498 | /// given function (specified by the LoopInfo object). It starts at the |
499 | /// outermost level. |
500 | struct LoopDepthTree { |
501 | using LoopsOnLevelTy = SmallVector<LoopVector, 4>; |
502 | using iterator = LoopsOnLevelTy::iterator; |
503 | using const_iterator = LoopsOnLevelTy::const_iterator; |
504 | |
505 | LoopDepthTree(LoopInfo &LI) : Depth(1) { |
506 | if (!LI.empty()) |
507 | LoopsOnLevel.emplace_back(Args: LoopVector(LI.rbegin(), LI.rend())); |
508 | } |
509 | |
510 | /// Test whether a given loop has been removed from the function, and thus is |
511 | /// no longer valid. |
512 | bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(Ptr: L); } |
513 | |
514 | /// Record that a given loop has been removed from the function and is no |
515 | /// longer valid. |
516 | void removeLoop(const Loop *L) { RemovedLoops.insert(Ptr: L); } |
517 | |
518 | /// Descend the tree to the next (inner) nesting level |
519 | void descend() { |
520 | LoopsOnLevelTy LoopsOnNextLevel; |
521 | |
522 | for (const LoopVector &LV : *this) |
523 | for (Loop *L : LV) |
524 | if (!isRemovedLoop(L) && L->begin() != L->end()) |
525 | LoopsOnNextLevel.emplace_back(Args: LoopVector(L->begin(), L->end())); |
526 | |
527 | LoopsOnLevel = LoopsOnNextLevel; |
528 | RemovedLoops.clear(); |
529 | Depth++; |
530 | } |
531 | |
532 | bool empty() const { return size() == 0; } |
533 | size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); } |
534 | unsigned getDepth() const { return Depth; } |
535 | |
536 | iterator begin() { return LoopsOnLevel.begin(); } |
537 | iterator end() { return LoopsOnLevel.end(); } |
538 | const_iterator begin() const { return LoopsOnLevel.begin(); } |
539 | const_iterator end() const { return LoopsOnLevel.end(); } |
540 | |
541 | private: |
542 | /// Set of loops that have been removed from the function and are no longer |
543 | /// valid. |
544 | SmallPtrSet<const Loop *, 8> RemovedLoops; |
545 | |
546 | /// Depth of the current level, starting at 1 (outermost loops). |
547 | unsigned Depth; |
548 | |
549 | /// Vector of loops at the current depth level that have the same parent loop |
550 | LoopsOnLevelTy LoopsOnLevel; |
551 | }; |
552 | |
553 | #ifndef NDEBUG |
554 | static void printLoopVector(const LoopVector &LV) { |
555 | dbgs() << "****************************\n" ; |
556 | for (auto *L : LV) |
557 | printLoop(*L, dbgs()); |
558 | dbgs() << "****************************\n" ; |
559 | } |
560 | #endif |
561 | |
562 | struct LoopFuser { |
563 | private: |
564 | // Sets of control flow equivalent fusion candidates for a given nest level. |
565 | FusionCandidateCollection FusionCandidates; |
566 | |
567 | LoopDepthTree LDT; |
568 | DomTreeUpdater DTU; |
569 | |
570 | LoopInfo &LI; |
571 | DominatorTree &DT; |
572 | DependenceInfo &DI; |
573 | ScalarEvolution &SE; |
574 | PostDominatorTree &PDT; |
575 | OptimizationRemarkEmitter &ORE; |
576 | AssumptionCache &AC; |
577 | const TargetTransformInfo &TTI; |
578 | |
579 | public: |
580 | LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, |
581 | ScalarEvolution &SE, PostDominatorTree &PDT, |
582 | OptimizationRemarkEmitter &ORE, const DataLayout &DL, |
583 | AssumptionCache &AC, const TargetTransformInfo &TTI) |
584 | : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), |
585 | DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {} |
586 | |
587 | /// This is the main entry point for loop fusion. It will traverse the |
588 | /// specified function and collect candidate loops to fuse, starting at the |
589 | /// outermost nesting level and working inwards. |
590 | bool fuseLoops(Function &F) { |
591 | #ifndef NDEBUG |
592 | if (VerboseFusionDebugging) { |
593 | LI.print(dbgs()); |
594 | } |
595 | #endif |
596 | |
597 | LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName() |
598 | << "\n" ); |
599 | bool Changed = false; |
600 | |
601 | while (!LDT.empty()) { |
602 | LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth " |
603 | << LDT.getDepth() << "\n" ;); |
604 | |
605 | for (const LoopVector &LV : LDT) { |
606 | assert(LV.size() > 0 && "Empty loop set was build!" ); |
607 | |
608 | // Skip singleton loop sets as they do not offer fusion opportunities on |
609 | // this level. |
610 | if (LV.size() == 1) |
611 | continue; |
612 | #ifndef NDEBUG |
613 | if (VerboseFusionDebugging) { |
614 | LLVM_DEBUG({ |
615 | dbgs() << " Visit loop set (#" << LV.size() << "):\n" ; |
616 | printLoopVector(LV); |
617 | }); |
618 | } |
619 | #endif |
620 | |
621 | collectFusionCandidates(LV); |
622 | Changed |= fuseCandidates(); |
623 | } |
624 | |
625 | // Finished analyzing candidates at this level. |
626 | // Descend to the next level and clear all of the candidates currently |
627 | // collected. Note that it will not be possible to fuse any of the |
628 | // existing candidates with new candidates because the new candidates will |
629 | // be at a different nest level and thus not be control flow equivalent |
630 | // with all of the candidates collected so far. |
631 | LLVM_DEBUG(dbgs() << "Descend one level!\n" ); |
632 | LDT.descend(); |
633 | FusionCandidates.clear(); |
634 | } |
635 | |
636 | if (Changed) |
637 | LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n" ; F.dump();); |
638 | |
639 | #ifndef NDEBUG |
640 | assert(DT.verify()); |
641 | assert(PDT.verify()); |
642 | LI.verify(DT); |
643 | SE.verify(); |
644 | #endif |
645 | |
646 | LLVM_DEBUG(dbgs() << "Loop Fusion complete\n" ); |
647 | return Changed; |
648 | } |
649 | |
650 | private: |
651 | /// Determine if two fusion candidates are control flow equivalent. |
652 | /// |
653 | /// Two fusion candidates are control flow equivalent if when one executes, |
654 | /// the other is guaranteed to execute. This is determined using dominators |
655 | /// and post-dominators: if A dominates B and B post-dominates A then A and B |
656 | /// are control-flow equivalent. |
657 | bool isControlFlowEquivalent(const FusionCandidate &FC0, |
658 | const FusionCandidate &FC1) const { |
659 | assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders" ); |
660 | |
661 | return ::isControlFlowEquivalent(BB0: *FC0.getEntryBlock(), BB1: *FC1.getEntryBlock(), |
662 | DT, PDT); |
663 | } |
664 | |
665 | /// Iterate over all loops in the given loop set and identify the loops that |
666 | /// are eligible for fusion. Place all eligible fusion candidates into Control |
667 | /// Flow Equivalent sets, sorted by dominance. |
668 | void collectFusionCandidates(const LoopVector &LV) { |
669 | for (Loop *L : LV) { |
670 | TTI::PeelingPreferences PP = |
671 | gatherPeelingPreferences(L, SE, TTI, UserAllowPeeling: std::nullopt, UserAllowProfileBasedPeeling: std::nullopt); |
672 | FusionCandidate CurrCand(L, DT, &PDT, ORE, PP); |
673 | if (!CurrCand.isEligibleForFusion(SE)) |
674 | continue; |
675 | |
676 | // Go through each list in FusionCandidates and determine if L is control |
677 | // flow equivalent with the first loop in that list. If it is, append LV. |
678 | // If not, go to the next list. |
679 | // If no suitable list is found, start another list and add it to |
680 | // FusionCandidates. |
681 | bool FoundSet = false; |
682 | |
683 | for (auto &CurrCandSet : FusionCandidates) { |
684 | if (isControlFlowEquivalent(FC0: *CurrCandSet.begin(), FC1: CurrCand)) { |
685 | CurrCandSet.insert(x: CurrCand); |
686 | FoundSet = true; |
687 | #ifndef NDEBUG |
688 | if (VerboseFusionDebugging) |
689 | LLVM_DEBUG(dbgs() << "Adding " << CurrCand |
690 | << " to existing candidate set\n" ); |
691 | #endif |
692 | break; |
693 | } |
694 | } |
695 | if (!FoundSet) { |
696 | // No set was found. Create a new set and add to FusionCandidates |
697 | #ifndef NDEBUG |
698 | if (VerboseFusionDebugging) |
699 | LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n" ); |
700 | #endif |
701 | FusionCandidateSet NewCandSet; |
702 | NewCandSet.insert(x: CurrCand); |
703 | FusionCandidates.push_back(Elt: NewCandSet); |
704 | } |
705 | NumFusionCandidates++; |
706 | } |
707 | } |
708 | |
709 | /// Determine if it is beneficial to fuse two loops. |
710 | /// |
711 | /// For now, this method simply returns true because we want to fuse as much |
712 | /// as possible (primarily to test the pass). This method will evolve, over |
713 | /// time, to add heuristics for profitability of fusion. |
714 | bool isBeneficialFusion(const FusionCandidate &FC0, |
715 | const FusionCandidate &FC1) { |
716 | return true; |
717 | } |
718 | |
719 | /// Determine if two fusion candidates have the same trip count (i.e., they |
720 | /// execute the same number of iterations). |
721 | /// |
722 | /// This function will return a pair of values. The first is a boolean, |
723 | /// stating whether or not the two candidates are known at compile time to |
724 | /// have the same TripCount. The second is the difference in the two |
725 | /// TripCounts. This information can be used later to determine whether or not |
726 | /// peeling can be performed on either one of the candidates. |
727 | std::pair<bool, std::optional<unsigned>> |
728 | haveIdenticalTripCounts(const FusionCandidate &FC0, |
729 | const FusionCandidate &FC1) const { |
730 | const SCEV *TripCount0 = SE.getBackedgeTakenCount(L: FC0.L); |
731 | if (isa<SCEVCouldNotCompute>(Val: TripCount0)) { |
732 | UncomputableTripCount++; |
733 | LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!" ); |
734 | return {false, std::nullopt}; |
735 | } |
736 | |
737 | const SCEV *TripCount1 = SE.getBackedgeTakenCount(L: FC1.L); |
738 | if (isa<SCEVCouldNotCompute>(Val: TripCount1)) { |
739 | UncomputableTripCount++; |
740 | LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!" ); |
741 | return {false, std::nullopt}; |
742 | } |
743 | |
744 | LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " |
745 | << *TripCount1 << " are " |
746 | << (TripCount0 == TripCount1 ? "identical" : "different" ) |
747 | << "\n" ); |
748 | |
749 | if (TripCount0 == TripCount1) |
750 | return {true, 0}; |
751 | |
752 | LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, " |
753 | "determining the difference between trip counts\n" ); |
754 | |
755 | // Currently only considering loops with a single exit point |
756 | // and a non-constant trip count. |
757 | const unsigned TC0 = SE.getSmallConstantTripCount(L: FC0.L); |
758 | const unsigned TC1 = SE.getSmallConstantTripCount(L: FC1.L); |
759 | |
760 | // If any of the tripcounts are zero that means that loop(s) do not have |
761 | // a single exit or a constant tripcount. |
762 | if (TC0 == 0 || TC1 == 0) { |
763 | LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not " |
764 | "have a constant number of iterations. Peeling " |
765 | "is not benefical\n" ); |
766 | return {false, std::nullopt}; |
767 | } |
768 | |
769 | std::optional<unsigned> Difference; |
770 | int Diff = TC0 - TC1; |
771 | |
772 | if (Diff > 0) |
773 | Difference = Diff; |
774 | else { |
775 | LLVM_DEBUG( |
776 | dbgs() << "Difference is less than 0. FC1 (second loop) has more " |
777 | "iterations than the first one. Currently not supported\n" ); |
778 | } |
779 | |
780 | LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference |
781 | << "\n" ); |
782 | |
783 | return {false, Difference}; |
784 | } |
785 | |
786 | void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1, |
787 | unsigned PeelCount) { |
788 | assert(FC0.AbleToPeel && "Should be able to peel loop" ); |
789 | |
790 | LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount |
791 | << " iterations of the first loop. \n" ); |
792 | |
793 | ValueToValueMapTy VMap; |
794 | FC0.Peeled = peelLoop(L: FC0.L, PeelCount, LI: &LI, SE: &SE, DT, AC: &AC, PreserveLCSSA: true, VMap); |
795 | if (FC0.Peeled) { |
796 | LLVM_DEBUG(dbgs() << "Done Peeling\n" ); |
797 | |
798 | #ifndef NDEBUG |
799 | auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1); |
800 | |
801 | assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 && |
802 | "Loops should have identical trip counts after peeling" ); |
803 | #endif |
804 | |
805 | FC0.PP.PeelCount += PeelCount; |
806 | |
807 | // Peeling does not update the PDT |
808 | PDT.recalculate(Func&: *FC0.Preheader->getParent()); |
809 | |
810 | FC0.updateAfterPeeling(); |
811 | |
812 | // In this case the iterations of the loop are constant, so the first |
813 | // loop will execute completely (will not jump from one of |
814 | // the peeled blocks to the second loop). Here we are updating the |
815 | // branch conditions of each of the peeled blocks, such that it will |
816 | // branch to its successor which is not the preheader of the second loop |
817 | // in the case of unguarded loops, or the succesors of the exit block of |
818 | // the first loop otherwise. Doing this update will ensure that the entry |
819 | // block of the first loop dominates the entry block of the second loop. |
820 | BasicBlock *BB = |
821 | FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader; |
822 | if (BB) { |
823 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
824 | SmallVector<Instruction *, 8> WorkList; |
825 | for (BasicBlock *Pred : predecessors(BB)) { |
826 | if (Pred != FC0.ExitBlock) { |
827 | WorkList.emplace_back(Args: Pred->getTerminator()); |
828 | TreeUpdates.emplace_back( |
829 | Args: DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB)); |
830 | } |
831 | } |
832 | // Cannot modify the predecessors inside the above loop as it will cause |
833 | // the iterators to be nullptrs, causing memory errors. |
834 | for (Instruction *CurrentBranch : WorkList) { |
835 | BasicBlock *Succ = CurrentBranch->getSuccessor(Idx: 0); |
836 | if (Succ == BB) |
837 | Succ = CurrentBranch->getSuccessor(Idx: 1); |
838 | ReplaceInstWithInst(From: CurrentBranch, To: BranchInst::Create(IfTrue: Succ)); |
839 | } |
840 | |
841 | DTU.applyUpdates(Updates: TreeUpdates); |
842 | DTU.flush(); |
843 | } |
844 | LLVM_DEBUG( |
845 | dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount |
846 | << " iterations from the first loop.\n" |
847 | "Both Loops have the same number of iterations now.\n" ); |
848 | } |
849 | } |
850 | |
851 | /// Walk each set of control flow equivalent fusion candidates and attempt to |
852 | /// fuse them. This does a single linear traversal of all candidates in the |
853 | /// set. The conditions for legal fusion are checked at this point. If a pair |
854 | /// of fusion candidates passes all legality checks, they are fused together |
855 | /// and a new fusion candidate is created and added to the FusionCandidateSet. |
856 | /// The original fusion candidates are then removed, as they are no longer |
857 | /// valid. |
858 | bool fuseCandidates() { |
859 | bool Fused = false; |
860 | LLVM_DEBUG(printFusionCandidates(FusionCandidates)); |
861 | for (auto &CandidateSet : FusionCandidates) { |
862 | if (CandidateSet.size() < 2) |
863 | continue; |
864 | |
865 | LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n" |
866 | << CandidateSet << "\n" ); |
867 | |
868 | for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) { |
869 | assert(!LDT.isRemovedLoop(FC0->L) && |
870 | "Should not have removed loops in CandidateSet!" ); |
871 | auto FC1 = FC0; |
872 | for (++FC1; FC1 != CandidateSet.end(); ++FC1) { |
873 | assert(!LDT.isRemovedLoop(FC1->L) && |
874 | "Should not have removed loops in CandidateSet!" ); |
875 | |
876 | LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n" ; FC0->dump(); |
877 | dbgs() << " with\n" ; FC1->dump(); dbgs() << "\n" ); |
878 | |
879 | FC0->verify(); |
880 | FC1->verify(); |
881 | |
882 | // Check if the candidates have identical tripcounts (first value of |
883 | // pair), and if not check the difference in the tripcounts between |
884 | // the loops (second value of pair). The difference is not equal to |
885 | // std::nullopt iff the loops iterate a constant number of times, and |
886 | // have a single exit. |
887 | std::pair<bool, std::optional<unsigned>> IdenticalTripCountRes = |
888 | haveIdenticalTripCounts(FC0: *FC0, FC1: *FC1); |
889 | bool SameTripCount = IdenticalTripCountRes.first; |
890 | std::optional<unsigned> TCDifference = IdenticalTripCountRes.second; |
891 | |
892 | // Here we are checking that FC0 (the first loop) can be peeled, and |
893 | // both loops have different tripcounts. |
894 | if (FC0->AbleToPeel && !SameTripCount && TCDifference) { |
895 | if (*TCDifference > FusionPeelMaxCount) { |
896 | LLVM_DEBUG(dbgs() |
897 | << "Difference in loop trip counts: " << *TCDifference |
898 | << " is greater than maximum peel count specificed: " |
899 | << FusionPeelMaxCount << "\n" ); |
900 | } else { |
901 | // Dependent on peeling being performed on the first loop, and |
902 | // assuming all other conditions for fusion return true. |
903 | SameTripCount = true; |
904 | } |
905 | } |
906 | |
907 | if (!SameTripCount) { |
908 | LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " |
909 | "counts. Not fusing.\n" ); |
910 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
911 | Stat&: NonEqualTripCount); |
912 | continue; |
913 | } |
914 | |
915 | if (!isAdjacent(FC0: *FC0, FC1: *FC1)) { |
916 | LLVM_DEBUG(dbgs() |
917 | << "Fusion candidates are not adjacent. Not fusing.\n" ); |
918 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, Stat&: NonAdjacent); |
919 | continue; |
920 | } |
921 | |
922 | if ((!FC0->GuardBranch && FC1->GuardBranch) || |
923 | (FC0->GuardBranch && !FC1->GuardBranch)) { |
924 | LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the " |
925 | "another one is not. Not fusing.\n" ); |
926 | reportLoopFusion<OptimizationRemarkMissed>( |
927 | FC0: *FC0, FC1: *FC1, Stat&: OnlySecondCandidateIsGuarded); |
928 | continue; |
929 | } |
930 | |
931 | // Ensure that FC0 and FC1 have identical guards. |
932 | // If one (or both) are not guarded, this check is not necessary. |
933 | if (FC0->GuardBranch && FC1->GuardBranch && |
934 | !haveIdenticalGuards(FC0: *FC0, FC1: *FC1) && !TCDifference) { |
935 | LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " |
936 | "guards. Not Fusing.\n" ); |
937 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
938 | Stat&: NonIdenticalGuards); |
939 | continue; |
940 | } |
941 | |
942 | if (FC0->GuardBranch) { |
943 | assert(FC1->GuardBranch && "Expecting valid FC1 guard branch" ); |
944 | |
945 | if (!isSafeToMoveBefore(BB&: *FC0->ExitBlock, |
946 | InsertPoint&: *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT, |
947 | PDT: &PDT, DI: &DI)) { |
948 | LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " |
949 | "instructions in exit block. Not fusing.\n" ); |
950 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
951 | Stat&: NonEmptyExitBlock); |
952 | continue; |
953 | } |
954 | |
955 | if (!isSafeToMoveBefore( |
956 | BB&: *FC1->GuardBranch->getParent(), |
957 | InsertPoint&: *FC0->GuardBranch->getParent()->getTerminator(), DT, PDT: &PDT, |
958 | DI: &DI)) { |
959 | LLVM_DEBUG(dbgs() |
960 | << "Fusion candidate contains unsafe " |
961 | "instructions in guard block. Not fusing.\n" ); |
962 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
963 | Stat&: NonEmptyGuardBlock); |
964 | continue; |
965 | } |
966 | } |
967 | |
968 | // Check the dependencies across the loops and do not fuse if it would |
969 | // violate them. |
970 | if (!dependencesAllowFusion(FC0: *FC0, FC1: *FC1)) { |
971 | LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n" ); |
972 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
973 | Stat&: InvalidDependencies); |
974 | continue; |
975 | } |
976 | |
977 | // If the second loop has instructions in the pre-header, attempt to |
978 | // hoist them up to the first loop's pre-header or sink them into the |
979 | // body of the second loop. |
980 | SmallVector<Instruction *, 4> SafeToHoist; |
981 | SmallVector<Instruction *, 4> SafeToSink; |
982 | // At this point, this is the last remaining legality check. |
983 | // Which means if we can make this pre-header empty, we can fuse |
984 | // these loops |
985 | if (!isEmptyPreheader(FC: *FC1)) { |
986 | LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " |
987 | "preheader.\n" ); |
988 | |
989 | // If it is not safe to hoist/sink all instructions in the |
990 | // pre-header, we cannot fuse these loops. |
991 | if (!collectMovablePreheaderInsts(FC0: *FC0, FC1: *FC1, SafeToHoist, |
992 | SafeToSink)) { |
993 | LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in " |
994 | "Fusion Candidate Pre-header.\n" |
995 | << "Not Fusing.\n" ); |
996 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
997 | Stat&: NonEmptyPreheader); |
998 | continue; |
999 | } |
1000 | } |
1001 | |
1002 | bool BeneficialToFuse = isBeneficialFusion(FC0: *FC0, FC1: *FC1); |
1003 | LLVM_DEBUG(dbgs() |
1004 | << "\tFusion appears to be " |
1005 | << (BeneficialToFuse ? "" : "un" ) << "profitable!\n" ); |
1006 | if (!BeneficialToFuse) { |
1007 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
1008 | Stat&: FusionNotBeneficial); |
1009 | continue; |
1010 | } |
1011 | // All analysis has completed and has determined that fusion is legal |
1012 | // and profitable. At this point, start transforming the code and |
1013 | // perform fusion. |
1014 | |
1015 | // Execute the hoist/sink operations on preheader instructions |
1016 | movePreheaderInsts(FC0: *FC0, FC1: *FC1, HoistInsts&: SafeToHoist, SinkInsts&: SafeToSink); |
1017 | |
1018 | LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " |
1019 | << *FC1 << "\n" ); |
1020 | |
1021 | FusionCandidate FC0Copy = *FC0; |
1022 | // Peel the loop after determining that fusion is legal. The Loops |
1023 | // will still be safe to fuse after the peeling is performed. |
1024 | bool Peel = TCDifference && *TCDifference > 0; |
1025 | if (Peel) |
1026 | peelFusionCandidate(FC0&: FC0Copy, FC1: *FC1, PeelCount: *TCDifference); |
1027 | |
1028 | // Report fusion to the Optimization Remarks. |
1029 | // Note this needs to be done *before* performFusion because |
1030 | // performFusion will change the original loops, making it not |
1031 | // possible to identify them after fusion is complete. |
1032 | reportLoopFusion<OptimizationRemark>(FC0: (Peel ? FC0Copy : *FC0), FC1: *FC1, |
1033 | Stat&: FuseCounter); |
1034 | |
1035 | FusionCandidate FusedCand( |
1036 | performFusion(FC0: (Peel ? FC0Copy : *FC0), FC1: *FC1), DT, &PDT, ORE, |
1037 | FC0Copy.PP); |
1038 | FusedCand.verify(); |
1039 | assert(FusedCand.isEligibleForFusion(SE) && |
1040 | "Fused candidate should be eligible for fusion!" ); |
1041 | |
1042 | // Notify the loop-depth-tree that these loops are not valid objects |
1043 | LDT.removeLoop(L: FC1->L); |
1044 | |
1045 | CandidateSet.erase(position: FC0); |
1046 | CandidateSet.erase(position: FC1); |
1047 | |
1048 | auto InsertPos = CandidateSet.insert(x: FusedCand); |
1049 | |
1050 | assert(InsertPos.second && |
1051 | "Unable to insert TargetCandidate in CandidateSet!" ); |
1052 | |
1053 | // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations |
1054 | // of the FC1 loop will attempt to fuse the new (fused) loop with the |
1055 | // remaining candidates in the current candidate set. |
1056 | FC0 = FC1 = InsertPos.first; |
1057 | |
1058 | LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet |
1059 | << "\n" ); |
1060 | |
1061 | Fused = true; |
1062 | } |
1063 | } |
1064 | } |
1065 | return Fused; |
1066 | } |
1067 | |
1068 | // Returns true if the instruction \p I can be hoisted to the end of the |
1069 | // preheader of \p FC0. \p SafeToHoist contains the instructions that are |
1070 | // known to be safe to hoist. The instructions encountered that cannot be |
1071 | // hoisted are in \p NotHoisting. |
1072 | // TODO: Move functionality into CodeMoverUtils |
1073 | bool canHoistInst(Instruction &I, |
1074 | const SmallVector<Instruction *, 4> &SafeToHoist, |
1075 | const SmallVector<Instruction *, 4> &NotHoisting, |
1076 | const FusionCandidate &FC0) const { |
1077 | const BasicBlock * = FC0.Preheader->getSingleSuccessor(); |
1078 | assert(FC0PreheaderTarget && |
1079 | "Expected single successor for loop preheader." ); |
1080 | |
1081 | for (Use &Op : I.operands()) { |
1082 | if (auto *OpInst = dyn_cast<Instruction>(Val&: Op)) { |
1083 | bool OpHoisted = is_contained(Range: SafeToHoist, Element: OpInst); |
1084 | // Check if we have already decided to hoist this operand. In this |
1085 | // case, it does not dominate FC0 *yet*, but will after we hoist it. |
1086 | if (!(OpHoisted || DT.dominates(Def: OpInst, BB: FC0PreheaderTarget))) { |
1087 | return false; |
1088 | } |
1089 | } |
1090 | } |
1091 | |
1092 | // PHIs in FC1's header only have FC0 blocks as predecessors. PHIs |
1093 | // cannot be hoisted and should be sunk to the exit of the fused loop. |
1094 | if (isa<PHINode>(Val: I)) |
1095 | return false; |
1096 | |
1097 | // If this isn't a memory inst, hoisting is safe |
1098 | if (!I.mayReadOrWriteMemory()) |
1099 | return true; |
1100 | |
1101 | LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n" ); |
1102 | for (Instruction *NotHoistedInst : NotHoisting) { |
1103 | if (auto D = DI.depends(Src: &I, Dst: NotHoistedInst, PossiblyLoopIndependent: true)) { |
1104 | // Dependency is not read-before-write, write-before-read or |
1105 | // write-before-write |
1106 | if (D->isFlow() || D->isAnti() || D->isOutput()) { |
1107 | LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's " |
1108 | "preheader that is not being hoisted.\n" ); |
1109 | return false; |
1110 | } |
1111 | } |
1112 | } |
1113 | |
1114 | for (Instruction *ReadInst : FC0.MemReads) { |
1115 | if (auto D = DI.depends(Src: ReadInst, Dst: &I, PossiblyLoopIndependent: true)) { |
1116 | // Dependency is not read-before-write |
1117 | if (D->isAnti()) { |
1118 | LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n" ); |
1119 | return false; |
1120 | } |
1121 | } |
1122 | } |
1123 | |
1124 | for (Instruction *WriteInst : FC0.MemWrites) { |
1125 | if (auto D = DI.depends(Src: WriteInst, Dst: &I, PossiblyLoopIndependent: true)) { |
1126 | // Dependency is not write-before-read or write-before-write |
1127 | if (D->isFlow() || D->isOutput()) { |
1128 | LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n" ); |
1129 | return false; |
1130 | } |
1131 | } |
1132 | } |
1133 | return true; |
1134 | } |
1135 | |
1136 | // Returns true if the instruction \p I can be sunk to the top of the exit |
1137 | // block of \p FC1. |
1138 | // TODO: Move functionality into CodeMoverUtils |
1139 | bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const { |
1140 | for (User *U : I.users()) { |
1141 | if (auto *UI{dyn_cast<Instruction>(Val: U)}) { |
1142 | // Cannot sink if user in loop |
1143 | // If FC1 has phi users of this value, we cannot sink it into FC1. |
1144 | if (FC1.L->contains(Inst: UI)) { |
1145 | // Cannot hoist or sink this instruction. No hoisting/sinking |
1146 | // should take place, loops should not fuse |
1147 | return false; |
1148 | } |
1149 | } |
1150 | } |
1151 | |
1152 | // If this isn't a memory inst, sinking is safe |
1153 | if (!I.mayReadOrWriteMemory()) |
1154 | return true; |
1155 | |
1156 | for (Instruction *ReadInst : FC1.MemReads) { |
1157 | if (auto D = DI.depends(Src: &I, Dst: ReadInst, PossiblyLoopIndependent: true)) { |
1158 | // Dependency is not write-before-read |
1159 | if (D->isFlow()) { |
1160 | LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n" ); |
1161 | return false; |
1162 | } |
1163 | } |
1164 | } |
1165 | |
1166 | for (Instruction *WriteInst : FC1.MemWrites) { |
1167 | if (auto D = DI.depends(Src: &I, Dst: WriteInst, PossiblyLoopIndependent: true)) { |
1168 | // Dependency is not write-before-write or read-before-write |
1169 | if (D->isOutput() || D->isAnti()) { |
1170 | LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n" ); |
1171 | return false; |
1172 | } |
1173 | } |
1174 | } |
1175 | |
1176 | return true; |
1177 | } |
1178 | |
1179 | /// Collect instructions in the \p FC1 Preheader that can be hoisted |
1180 | /// to the \p FC0 Preheader or sunk into the \p FC1 Body |
1181 | bool collectMovablePreheaderInsts( |
1182 | const FusionCandidate &FC0, const FusionCandidate &FC1, |
1183 | SmallVector<Instruction *, 4> &SafeToHoist, |
1184 | SmallVector<Instruction *, 4> &SafeToSink) const { |
1185 | BasicBlock * = FC1.Preheader; |
1186 | // Save the instructions that are not being hoisted, so we know not to hoist |
1187 | // mem insts that they dominate. |
1188 | SmallVector<Instruction *, 4> NotHoisting; |
1189 | |
1190 | for (Instruction &I : *FC1Preheader) { |
1191 | // Can't move a branch |
1192 | if (&I == FC1Preheader->getTerminator()) |
1193 | continue; |
1194 | // If the instruction has side-effects, give up. |
1195 | // TODO: The case of mayReadFromMemory we can handle but requires |
1196 | // additional work with a dependence analysis so for now we give |
1197 | // up on memory reads. |
1198 | if (I.mayThrow() || !I.willReturn()) { |
1199 | LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n" ); |
1200 | return false; |
1201 | } |
1202 | |
1203 | LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n" ); |
1204 | |
1205 | if (I.isAtomic() || I.isVolatile()) { |
1206 | LLVM_DEBUG( |
1207 | dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n" ); |
1208 | return false; |
1209 | } |
1210 | |
1211 | if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) { |
1212 | SafeToHoist.push_back(Elt: &I); |
1213 | LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n" ); |
1214 | } else { |
1215 | LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n" ); |
1216 | NotHoisting.push_back(Elt: &I); |
1217 | |
1218 | if (canSinkInst(I, FC1)) { |
1219 | SafeToSink.push_back(Elt: &I); |
1220 | LLVM_DEBUG(dbgs() << "\tSafe to sink.\n" ); |
1221 | } else { |
1222 | LLVM_DEBUG(dbgs() << "\tCould not sink.\n" ); |
1223 | return false; |
1224 | } |
1225 | } |
1226 | } |
1227 | LLVM_DEBUG( |
1228 | dbgs() << "All preheader instructions could be sunk or hoisted!\n" ); |
1229 | return true; |
1230 | } |
1231 | |
1232 | /// Rewrite all additive recurrences in a SCEV to use a new loop. |
1233 | class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> { |
1234 | public: |
1235 | AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL, |
1236 | bool UseMax = true) |
1237 | : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL), |
1238 | NewL(NewL) {} |
1239 | |
1240 | const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { |
1241 | const Loop *ExprL = Expr->getLoop(); |
1242 | SmallVector<const SCEV *, 2> Operands; |
1243 | if (ExprL == &OldL) { |
1244 | append_range(C&: Operands, R: Expr->operands()); |
1245 | return SE.getAddRecExpr(Operands, L: &NewL, Flags: Expr->getNoWrapFlags()); |
1246 | } |
1247 | |
1248 | if (OldL.contains(L: ExprL)) { |
1249 | bool Pos = SE.isKnownPositive(S: Expr->getStepRecurrence(SE)); |
1250 | if (!UseMax || !Pos || !Expr->isAffine()) { |
1251 | Valid = false; |
1252 | return Expr; |
1253 | } |
1254 | return visit(S: Expr->getStart()); |
1255 | } |
1256 | |
1257 | for (const SCEV *Op : Expr->operands()) |
1258 | Operands.push_back(Elt: visit(S: Op)); |
1259 | return SE.getAddRecExpr(Operands, L: ExprL, Flags: Expr->getNoWrapFlags()); |
1260 | } |
1261 | |
1262 | bool wasValidSCEV() const { return Valid; } |
1263 | |
1264 | private: |
1265 | bool Valid, UseMax; |
1266 | const Loop &OldL, &NewL; |
1267 | }; |
1268 | |
1269 | /// Return false if the access functions of \p I0 and \p I1 could cause |
1270 | /// a negative dependence. |
1271 | bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0, |
1272 | Instruction &I1, bool EqualIsInvalid) { |
1273 | Value *Ptr0 = getLoadStorePointerOperand(V: &I0); |
1274 | Value *Ptr1 = getLoadStorePointerOperand(V: &I1); |
1275 | if (!Ptr0 || !Ptr1) |
1276 | return false; |
1277 | |
1278 | const SCEV *SCEVPtr0 = SE.getSCEVAtScope(V: Ptr0, L: &L0); |
1279 | const SCEV *SCEVPtr1 = SE.getSCEVAtScope(V: Ptr1, L: &L1); |
1280 | #ifndef NDEBUG |
1281 | if (VerboseFusionDebugging) |
1282 | LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs " |
1283 | << *SCEVPtr1 << "\n" ); |
1284 | #endif |
1285 | AddRecLoopReplacer Rewriter(SE, L0, L1); |
1286 | SCEVPtr0 = Rewriter.visit(S: SCEVPtr0); |
1287 | #ifndef NDEBUG |
1288 | if (VerboseFusionDebugging) |
1289 | LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0 |
1290 | << " [Valid: " << Rewriter.wasValidSCEV() << "]\n" ); |
1291 | #endif |
1292 | if (!Rewriter.wasValidSCEV()) |
1293 | return false; |
1294 | |
1295 | // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by |
1296 | // L0) and the other is not. We could check if it is monotone and test |
1297 | // the beginning and end value instead. |
1298 | |
1299 | BasicBlock * = L0.getHeader(); |
1300 | auto HasNonLinearDominanceRelation = [&](const SCEV *S) { |
1301 | const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Val: S); |
1302 | if (!AddRec) |
1303 | return false; |
1304 | return !DT.dominates(A: L0Header, B: AddRec->getLoop()->getHeader()) && |
1305 | !DT.dominates(A: AddRec->getLoop()->getHeader(), B: L0Header); |
1306 | }; |
1307 | if (SCEVExprContains(Root: SCEVPtr1, Pred: HasNonLinearDominanceRelation)) |
1308 | return false; |
1309 | |
1310 | ICmpInst::Predicate Pred = |
1311 | EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE; |
1312 | bool IsAlwaysGE = SE.isKnownPredicate(Pred, LHS: SCEVPtr0, RHS: SCEVPtr1); |
1313 | #ifndef NDEBUG |
1314 | if (VerboseFusionDebugging) |
1315 | LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0 |
1316 | << (IsAlwaysGE ? " >= " : " may < " ) << *SCEVPtr1 |
1317 | << "\n" ); |
1318 | #endif |
1319 | return IsAlwaysGE; |
1320 | } |
1321 | |
1322 | /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in |
1323 | /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses |
1324 | /// specified by @p DepChoice are used to determine this. |
1325 | bool dependencesAllowFusion(const FusionCandidate &FC0, |
1326 | const FusionCandidate &FC1, Instruction &I0, |
1327 | Instruction &I1, bool AnyDep, |
1328 | FusionDependenceAnalysisChoice DepChoice) { |
1329 | #ifndef NDEBUG |
1330 | if (VerboseFusionDebugging) { |
1331 | LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : " |
1332 | << DepChoice << "\n" ); |
1333 | } |
1334 | #endif |
1335 | switch (DepChoice) { |
1336 | case FUSION_DEPENDENCE_ANALYSIS_SCEV: |
1337 | return accessDiffIsPositive(L0: *FC0.L, L1: *FC1.L, I0, I1, EqualIsInvalid: AnyDep); |
1338 | case FUSION_DEPENDENCE_ANALYSIS_DA: { |
1339 | auto DepResult = DI.depends(Src: &I0, Dst: &I1, PossiblyLoopIndependent: true); |
1340 | if (!DepResult) |
1341 | return true; |
1342 | #ifndef NDEBUG |
1343 | if (VerboseFusionDebugging) { |
1344 | LLVM_DEBUG(dbgs() << "DA res: " ; DepResult->dump(dbgs()); |
1345 | dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " |
1346 | << (DepResult->isOrdered() ? "true" : "false" ) |
1347 | << "]\n" ); |
1348 | LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() |
1349 | << "\n" ); |
1350 | } |
1351 | #endif |
1352 | |
1353 | if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) |
1354 | LLVM_DEBUG( |
1355 | dbgs() << "TODO: Implement pred/succ dependence handling!\n" ); |
1356 | |
1357 | // TODO: Can we actually use the dependence info analysis here? |
1358 | return false; |
1359 | } |
1360 | |
1361 | case FUSION_DEPENDENCE_ANALYSIS_ALL: |
1362 | return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, |
1363 | DepChoice: FUSION_DEPENDENCE_ANALYSIS_SCEV) || |
1364 | dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, |
1365 | DepChoice: FUSION_DEPENDENCE_ANALYSIS_DA); |
1366 | } |
1367 | |
1368 | llvm_unreachable("Unknown fusion dependence analysis choice!" ); |
1369 | } |
1370 | |
1371 | /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. |
1372 | bool dependencesAllowFusion(const FusionCandidate &FC0, |
1373 | const FusionCandidate &FC1) { |
1374 | LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 |
1375 | << "\n" ); |
1376 | assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); |
1377 | assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); |
1378 | |
1379 | for (Instruction *WriteL0 : FC0.MemWrites) { |
1380 | for (Instruction *WriteL1 : FC1.MemWrites) |
1381 | if (!dependencesAllowFusion(FC0, FC1, I0&: *WriteL0, I1&: *WriteL1, |
1382 | /* AnyDep */ false, |
1383 | DepChoice: FusionDependenceAnalysis)) { |
1384 | InvalidDependencies++; |
1385 | return false; |
1386 | } |
1387 | for (Instruction *ReadL1 : FC1.MemReads) |
1388 | if (!dependencesAllowFusion(FC0, FC1, I0&: *WriteL0, I1&: *ReadL1, |
1389 | /* AnyDep */ false, |
1390 | DepChoice: FusionDependenceAnalysis)) { |
1391 | InvalidDependencies++; |
1392 | return false; |
1393 | } |
1394 | } |
1395 | |
1396 | for (Instruction *WriteL1 : FC1.MemWrites) { |
1397 | for (Instruction *WriteL0 : FC0.MemWrites) |
1398 | if (!dependencesAllowFusion(FC0, FC1, I0&: *WriteL0, I1&: *WriteL1, |
1399 | /* AnyDep */ false, |
1400 | DepChoice: FusionDependenceAnalysis)) { |
1401 | InvalidDependencies++; |
1402 | return false; |
1403 | } |
1404 | for (Instruction *ReadL0 : FC0.MemReads) |
1405 | if (!dependencesAllowFusion(FC0, FC1, I0&: *ReadL0, I1&: *WriteL1, |
1406 | /* AnyDep */ false, |
1407 | DepChoice: FusionDependenceAnalysis)) { |
1408 | InvalidDependencies++; |
1409 | return false; |
1410 | } |
1411 | } |
1412 | |
1413 | // Walk through all uses in FC1. For each use, find the reaching def. If the |
1414 | // def is located in FC0 then it is not safe to fuse. |
1415 | for (BasicBlock *BB : FC1.L->blocks()) |
1416 | for (Instruction &I : *BB) |
1417 | for (auto &Op : I.operands()) |
1418 | if (Instruction *Def = dyn_cast<Instruction>(Val&: Op)) |
1419 | if (FC0.L->contains(BB: Def->getParent())) { |
1420 | InvalidDependencies++; |
1421 | return false; |
1422 | } |
1423 | |
1424 | return true; |
1425 | } |
1426 | |
1427 | /// Determine if two fusion candidates are adjacent in the CFG. |
1428 | /// |
1429 | /// This method will determine if there are additional basic blocks in the CFG |
1430 | /// between the exit of \p FC0 and the entry of \p FC1. |
1431 | /// If the two candidates are guarded loops, then it checks whether the |
1432 | /// non-loop successor of the \p FC0 guard branch is the entry block of \p |
1433 | /// FC1. If not, then the loops are not adjacent. If the two candidates are |
1434 | /// not guarded loops, then it checks whether the exit block of \p FC0 is the |
1435 | /// preheader of \p FC1. |
1436 | bool isAdjacent(const FusionCandidate &FC0, |
1437 | const FusionCandidate &FC1) const { |
1438 | // If the successor of the guard branch is FC1, then the loops are adjacent |
1439 | if (FC0.GuardBranch) |
1440 | return FC0.getNonLoopBlock() == FC1.getEntryBlock(); |
1441 | else |
1442 | return FC0.ExitBlock == FC1.getEntryBlock(); |
1443 | } |
1444 | |
1445 | bool isEmptyPreheader(const FusionCandidate &FC) const { |
1446 | return FC.Preheader->size() == 1; |
1447 | } |
1448 | |
1449 | /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader |
1450 | /// and sink others into the body of \p FC1. |
1451 | void movePreheaderInsts(const FusionCandidate &FC0, |
1452 | const FusionCandidate &FC1, |
1453 | SmallVector<Instruction *, 4> &HoistInsts, |
1454 | SmallVector<Instruction *, 4> &SinkInsts) const { |
1455 | // All preheader instructions except the branch must be hoisted or sunk |
1456 | assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 && |
1457 | "Attempting to sink and hoist preheader instructions, but not all " |
1458 | "the preheader instructions are accounted for." ); |
1459 | |
1460 | NumHoistedInsts += HoistInsts.size(); |
1461 | NumSunkInsts += SinkInsts.size(); |
1462 | |
1463 | LLVM_DEBUG(if (VerboseFusionDebugging) { |
1464 | if (!HoistInsts.empty()) |
1465 | dbgs() << "Hoisting: \n" ; |
1466 | for (Instruction *I : HoistInsts) |
1467 | dbgs() << *I << "\n" ; |
1468 | if (!SinkInsts.empty()) |
1469 | dbgs() << "Sinking: \n" ; |
1470 | for (Instruction *I : SinkInsts) |
1471 | dbgs() << *I << "\n" ; |
1472 | }); |
1473 | |
1474 | for (Instruction *I : HoistInsts) { |
1475 | assert(I->getParent() == FC1.Preheader); |
1476 | I->moveBefore(BB&: *FC0.Preheader, |
1477 | I: FC0.Preheader->getTerminator()->getIterator()); |
1478 | } |
1479 | // insert instructions in reverse order to maintain dominance relationship |
1480 | for (Instruction *I : reverse(C&: SinkInsts)) { |
1481 | assert(I->getParent() == FC1.Preheader); |
1482 | I->moveBefore(BB&: *FC1.ExitBlock, I: FC1.ExitBlock->getFirstInsertionPt()); |
1483 | } |
1484 | } |
1485 | |
1486 | /// Determine if two fusion candidates have identical guards |
1487 | /// |
1488 | /// This method will determine if two fusion candidates have the same guards. |
1489 | /// The guards are considered the same if: |
1490 | /// 1. The instructions to compute the condition used in the compare are |
1491 | /// identical. |
1492 | /// 2. The successors of the guard have the same flow into/around the loop. |
1493 | /// If the compare instructions are identical, then the first successor of the |
1494 | /// guard must go to the same place (either the preheader of the loop or the |
1495 | /// NonLoopBlock). In other words, the first successor of both loops must |
1496 | /// both go into the loop (i.e., the preheader) or go around the loop (i.e., |
1497 | /// the NonLoopBlock). The same must be true for the second successor. |
1498 | bool haveIdenticalGuards(const FusionCandidate &FC0, |
1499 | const FusionCandidate &FC1) const { |
1500 | assert(FC0.GuardBranch && FC1.GuardBranch && |
1501 | "Expecting FC0 and FC1 to be guarded loops." ); |
1502 | |
1503 | if (auto FC0CmpInst = |
1504 | dyn_cast<Instruction>(Val: FC0.GuardBranch->getCondition())) |
1505 | if (auto FC1CmpInst = |
1506 | dyn_cast<Instruction>(Val: FC1.GuardBranch->getCondition())) |
1507 | if (!FC0CmpInst->isIdenticalTo(I: FC1CmpInst)) |
1508 | return false; |
1509 | |
1510 | // The compare instructions are identical. |
1511 | // Now make sure the successor of the guards have the same flow into/around |
1512 | // the loop |
1513 | if (FC0.GuardBranch->getSuccessor(i: 0) == FC0.Preheader) |
1514 | return (FC1.GuardBranch->getSuccessor(i: 0) == FC1.Preheader); |
1515 | else |
1516 | return (FC1.GuardBranch->getSuccessor(i: 1) == FC1.Preheader); |
1517 | } |
1518 | |
1519 | /// Modify the latch branch of FC to be unconditional since successors of the |
1520 | /// branch are the same. |
1521 | void simplifyLatchBranch(const FusionCandidate &FC) const { |
1522 | BranchInst *FCLatchBranch = dyn_cast<BranchInst>(Val: FC.Latch->getTerminator()); |
1523 | if (FCLatchBranch) { |
1524 | assert(FCLatchBranch->isConditional() && |
1525 | FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && |
1526 | "Expecting the two successors of FCLatchBranch to be the same" ); |
1527 | BranchInst *NewBranch = |
1528 | BranchInst::Create(IfTrue: FCLatchBranch->getSuccessor(i: 0)); |
1529 | ReplaceInstWithInst(From: FCLatchBranch, To: NewBranch); |
1530 | } |
1531 | } |
1532 | |
1533 | /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique |
1534 | /// successor, then merge FC0.Latch with its unique successor. |
1535 | void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { |
1536 | moveInstructionsToTheBeginning(FromBB&: *FC0.Latch, ToBB&: *FC1.Latch, DT, PDT, DI); |
1537 | if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { |
1538 | MergeBlockIntoPredecessor(BB: Succ, DTU: &DTU, LI: &LI); |
1539 | DTU.flush(); |
1540 | } |
1541 | } |
1542 | |
1543 | /// Fuse two fusion candidates, creating a new fused loop. |
1544 | /// |
1545 | /// This method contains the mechanics of fusing two loops, represented by \p |
1546 | /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1 |
1547 | /// postdominates \p FC0 (making them control flow equivalent). It also |
1548 | /// assumes that the other conditions for fusion have been met: adjacent, |
1549 | /// identical trip counts, and no negative distance dependencies exist that |
1550 | /// would prevent fusion. Thus, there is no checking for these conditions in |
1551 | /// this method. |
1552 | /// |
1553 | /// Fusion is performed by rewiring the CFG to update successor blocks of the |
1554 | /// components of tho loop. Specifically, the following changes are done: |
1555 | /// |
1556 | /// 1. The preheader of \p FC1 is removed as it is no longer necessary |
1557 | /// (because it is currently only a single statement block). |
1558 | /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1. |
1559 | /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0. |
1560 | /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0. |
1561 | /// |
1562 | /// All of these modifications are done with dominator tree updates, thus |
1563 | /// keeping the dominator (and post dominator) information up-to-date. |
1564 | /// |
1565 | /// This can be improved in the future by actually merging blocks during |
1566 | /// fusion. For example, the preheader of \p FC1 can be merged with the |
1567 | /// preheader of \p FC0. This would allow loops with more than a single |
1568 | /// statement in the preheader to be fused. Similarly, the latch blocks of the |
1569 | /// two loops could also be fused into a single block. This will require |
1570 | /// analysis to prove it is safe to move the contents of the block past |
1571 | /// existing code, which currently has not been implemented. |
1572 | Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) { |
1573 | assert(FC0.isValid() && FC1.isValid() && |
1574 | "Expecting valid fusion candidates" ); |
1575 | |
1576 | LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n" ; FC0.dump(); |
1577 | dbgs() << "Fusion Candidate 1: \n" ; FC1.dump();); |
1578 | |
1579 | // Move instructions from the preheader of FC1 to the end of the preheader |
1580 | // of FC0. |
1581 | moveInstructionsToTheEnd(FromBB&: *FC1.Preheader, ToBB&: *FC0.Preheader, DT, PDT, DI); |
1582 | |
1583 | // Fusing guarded loops is handled slightly differently than non-guarded |
1584 | // loops and has been broken out into a separate method instead of trying to |
1585 | // intersperse the logic within a single method. |
1586 | if (FC0.GuardBranch) |
1587 | return fuseGuardedLoops(FC0, FC1); |
1588 | |
1589 | assert(FC1.Preheader == |
1590 | (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock)); |
1591 | assert(FC1.Preheader->size() == 1 && |
1592 | FC1.Preheader->getSingleSuccessor() == FC1.Header); |
1593 | |
1594 | // Remember the phi nodes originally in the header of FC0 in order to rewire |
1595 | // them later. However, this is only necessary if the new loop carried |
1596 | // values might not dominate the exiting branch. While we do not generally |
1597 | // test if this is the case but simply insert intermediate phi nodes, we |
1598 | // need to make sure these intermediate phi nodes have different |
1599 | // predecessors. To this end, we filter the special case where the exiting |
1600 | // block is the latch block of the first loop. Nothing needs to be done |
1601 | // anyway as all loop carried values dominate the latch and thereby also the |
1602 | // exiting branch. |
1603 | SmallVector<PHINode *, 8> OriginalFC0PHIs; |
1604 | if (FC0.ExitingBlock != FC0.Latch) |
1605 | for (PHINode &PHI : FC0.Header->phis()) |
1606 | OriginalFC0PHIs.push_back(Elt: &PHI); |
1607 | |
1608 | // Replace incoming blocks for header PHIs first. |
1609 | FC1.Preheader->replaceSuccessorsPhiUsesWith(New: FC0.Preheader); |
1610 | FC0.Latch->replaceSuccessorsPhiUsesWith(New: FC1.Latch); |
1611 | |
1612 | // Then modify the control flow and update DT and PDT. |
1613 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
1614 | |
1615 | // The old exiting block of the first loop (FC0) has to jump to the header |
1616 | // of the second as we need to execute the code in the second header block |
1617 | // regardless of the trip count. That is, if the trip count is 0, so the |
1618 | // back edge is never taken, we still have to execute both loop headers, |
1619 | // especially (but not only!) if the second is a do-while style loop. |
1620 | // However, doing so might invalidate the phi nodes of the first loop as |
1621 | // the new values do only need to dominate their latch and not the exiting |
1622 | // predicate. To remedy this potential problem we always introduce phi |
1623 | // nodes in the header of the second loop later that select the loop carried |
1624 | // value, if the second header was reached through an old latch of the |
1625 | // first, or undef otherwise. This is sound as exiting the first implies the |
1626 | // second will exit too, __without__ taking the back-edge. [Their |
1627 | // trip-counts are equal after all. |
1628 | // KB: Would this sequence be simpler to just make FC0.ExitingBlock go |
1629 | // to FC1.Header? I think this is basically what the three sequences are |
1630 | // trying to accomplish; however, doing this directly in the CFG may mean |
1631 | // the DT/PDT becomes invalid |
1632 | if (!FC0.Peeled) { |
1633 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC1.Preheader, |
1634 | To: FC1.Header); |
1635 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1636 | DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); |
1637 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1638 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
1639 | } else { |
1640 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1641 | DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader)); |
1642 | |
1643 | // Remove the ExitBlock of the first Loop (also not needed) |
1644 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC0.ExitBlock, |
1645 | To: FC1.Header); |
1646 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1647 | DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); |
1648 | FC0.ExitBlock->getTerminator()->eraseFromParent(); |
1649 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1650 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
1651 | new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); |
1652 | } |
1653 | |
1654 | // The pre-header of L1 is not necessary anymore. |
1655 | assert(pred_empty(FC1.Preheader)); |
1656 | FC1.Preheader->getTerminator()->eraseFromParent(); |
1657 | new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); |
1658 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1659 | DominatorTree::Delete, FC1.Preheader, FC1.Header)); |
1660 | |
1661 | // Moves the phi nodes from the second to the first loops header block. |
1662 | while (PHINode *PHI = dyn_cast<PHINode>(Val: &FC1.Header->front())) { |
1663 | if (SE.isSCEVable(Ty: PHI->getType())) |
1664 | SE.forgetValue(V: PHI); |
1665 | if (PHI->hasNUsesOrMore(N: 1)) |
1666 | PHI->moveBefore(MovePos: &*FC0.Header->getFirstInsertionPt()); |
1667 | else |
1668 | PHI->eraseFromParent(); |
1669 | } |
1670 | |
1671 | // Introduce new phi nodes in the second loop header to ensure |
1672 | // exiting the first and jumping to the header of the second does not break |
1673 | // the SSA property of the phis originally in the first loop. See also the |
1674 | // comment above. |
1675 | BasicBlock::iterator = FC1.Header->begin(); |
1676 | for (PHINode *LCPHI : OriginalFC0PHIs) { |
1677 | int L1LatchBBIdx = LCPHI->getBasicBlockIndex(BB: FC1.Latch); |
1678 | assert(L1LatchBBIdx >= 0 && |
1679 | "Expected loop carried value to be rewired at this point!" ); |
1680 | |
1681 | Value *LCV = LCPHI->getIncomingValue(i: L1LatchBBIdx); |
1682 | |
1683 | PHINode * = |
1684 | PHINode::Create(Ty: LCV->getType(), NumReservedValues: 2, NameStr: LCPHI->getName() + ".afterFC0" ); |
1685 | L1HeaderPHI->insertBefore(InsertPos: L1HeaderIP); |
1686 | L1HeaderPHI->addIncoming(V: LCV, BB: FC0.Latch); |
1687 | L1HeaderPHI->addIncoming(V: PoisonValue::get(T: LCV->getType()), |
1688 | BB: FC0.ExitingBlock); |
1689 | |
1690 | LCPHI->setIncomingValue(i: L1LatchBBIdx, V: L1HeaderPHI); |
1691 | } |
1692 | |
1693 | // Replace latch terminator destinations. |
1694 | FC0.Latch->getTerminator()->replaceUsesOfWith(From: FC0.Header, To: FC1.Header); |
1695 | FC1.Latch->getTerminator()->replaceUsesOfWith(From: FC1.Header, To: FC0.Header); |
1696 | |
1697 | // Modify the latch branch of FC0 to be unconditional as both successors of |
1698 | // the branch are the same. |
1699 | simplifyLatchBranch(FC: FC0); |
1700 | |
1701 | // If FC0.Latch and FC0.ExitingBlock are the same then we have already |
1702 | // performed the updates above. |
1703 | if (FC0.Latch != FC0.ExitingBlock) |
1704 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1705 | DominatorTree::Insert, FC0.Latch, FC1.Header)); |
1706 | |
1707 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
1708 | FC0.Latch, FC0.Header)); |
1709 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Insert, |
1710 | FC1.Latch, FC0.Header)); |
1711 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
1712 | FC1.Latch, FC1.Header)); |
1713 | |
1714 | // Update DT/PDT |
1715 | DTU.applyUpdates(Updates: TreeUpdates); |
1716 | |
1717 | LI.removeBlock(BB: FC1.Preheader); |
1718 | DTU.deleteBB(DelBB: FC1.Preheader); |
1719 | if (FC0.Peeled) { |
1720 | LI.removeBlock(BB: FC0.ExitBlock); |
1721 | DTU.deleteBB(DelBB: FC0.ExitBlock); |
1722 | } |
1723 | |
1724 | DTU.flush(); |
1725 | |
1726 | // Is there a way to keep SE up-to-date so we don't need to forget the loops |
1727 | // and rebuild the information in subsequent passes of fusion? |
1728 | // Note: Need to forget the loops before merging the loop latches, as |
1729 | // mergeLatch may remove the only block in FC1. |
1730 | SE.forgetLoop(L: FC1.L); |
1731 | SE.forgetLoop(L: FC0.L); |
1732 | SE.forgetLoopDispositions(); |
1733 | |
1734 | // Move instructions from FC0.Latch to FC1.Latch. |
1735 | // Note: mergeLatch requires an updated DT. |
1736 | mergeLatch(FC0, FC1); |
1737 | |
1738 | // Merge the loops. |
1739 | SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); |
1740 | for (BasicBlock *BB : Blocks) { |
1741 | FC0.L->addBlockEntry(BB); |
1742 | FC1.L->removeBlockFromLoop(BB); |
1743 | if (LI.getLoopFor(BB) != FC1.L) |
1744 | continue; |
1745 | LI.changeLoopFor(BB, L: FC0.L); |
1746 | } |
1747 | while (!FC1.L->isInnermost()) { |
1748 | const auto &ChildLoopIt = FC1.L->begin(); |
1749 | Loop *ChildLoop = *ChildLoopIt; |
1750 | FC1.L->removeChildLoop(I: ChildLoopIt); |
1751 | FC0.L->addChildLoop(NewChild: ChildLoop); |
1752 | } |
1753 | |
1754 | // Delete the now empty loop L1. |
1755 | LI.erase(L: FC1.L); |
1756 | |
1757 | #ifndef NDEBUG |
1758 | assert(!verifyFunction(*FC0.Header->getParent(), &errs())); |
1759 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
1760 | assert(PDT.verify()); |
1761 | LI.verify(DT); |
1762 | SE.verify(); |
1763 | #endif |
1764 | |
1765 | LLVM_DEBUG(dbgs() << "Fusion done:\n" ); |
1766 | |
1767 | return FC0.L; |
1768 | } |
1769 | |
1770 | /// Report details on loop fusion opportunities. |
1771 | /// |
1772 | /// This template function can be used to report both successful and missed |
1773 | /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should |
1774 | /// be one of: |
1775 | /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful |
1776 | /// given two valid fusion candidates. |
1777 | /// - OptimizationRemark to report successful fusion of two fusion |
1778 | /// candidates. |
1779 | /// The remarks will be printed using the form: |
1780 | /// <path/filename>:<line number>:<column number>: [<function name>]: |
1781 | /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> |
1782 | template <typename RemarkKind> |
1783 | void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, |
1784 | llvm::Statistic &Stat) { |
1785 | assert(FC0.Preheader && FC1.Preheader && |
1786 | "Expecting valid fusion candidates" ); |
1787 | using namespace ore; |
1788 | #if LLVM_ENABLE_STATS |
1789 | ++Stat; |
1790 | ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), |
1791 | FC0.Preheader) |
1792 | << "[" << FC0.Preheader->getParent()->getName() |
1793 | << "]: " << NV("Cand1" , StringRef(FC0.Preheader->getName())) |
1794 | << " and " << NV("Cand2" , StringRef(FC1.Preheader->getName())) |
1795 | << ": " << Stat.getDesc()); |
1796 | #endif |
1797 | } |
1798 | |
1799 | /// Fuse two guarded fusion candidates, creating a new fused loop. |
1800 | /// |
1801 | /// Fusing guarded loops is handled much the same way as fusing non-guarded |
1802 | /// loops. The rewiring of the CFG is slightly different though, because of |
1803 | /// the presence of the guards around the loops and the exit blocks after the |
1804 | /// loop body. As such, the new loop is rewired as follows: |
1805 | /// 1. Keep the guard branch from FC0 and use the non-loop block target |
1806 | /// from the FC1 guard branch. |
1807 | /// 2. Remove the exit block from FC0 (this exit block should be empty |
1808 | /// right now). |
1809 | /// 3. Remove the guard branch for FC1 |
1810 | /// 4. Remove the preheader for FC1. |
1811 | /// The exit block successor for the latch of FC0 is updated to be the header |
1812 | /// of FC1 and the non-exit block successor of the latch of FC1 is updated to |
1813 | /// be the header of FC0, thus creating the fused loop. |
1814 | Loop *fuseGuardedLoops(const FusionCandidate &FC0, |
1815 | const FusionCandidate &FC1) { |
1816 | assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops" ); |
1817 | |
1818 | BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); |
1819 | BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); |
1820 | BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); |
1821 | BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); |
1822 | BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor(); |
1823 | |
1824 | // Move instructions from the exit block of FC0 to the beginning of the exit |
1825 | // block of FC1, in the case that the FC0 loop has not been peeled. In the |
1826 | // case that FC0 loop is peeled, then move the instructions of the successor |
1827 | // of the FC0 Exit block to the beginning of the exit block of FC1. |
1828 | moveInstructionsToTheBeginning( |
1829 | FromBB&: (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), ToBB&: *FC1.ExitBlock, |
1830 | DT, PDT, DI); |
1831 | |
1832 | // Move instructions from the guard block of FC1 to the end of the guard |
1833 | // block of FC0. |
1834 | moveInstructionsToTheEnd(FromBB&: *FC1GuardBlock, ToBB&: *FC0GuardBlock, DT, PDT, DI); |
1835 | |
1836 | assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent" ); |
1837 | |
1838 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
1839 | |
1840 | //////////////////////////////////////////////////////////////////////////// |
1841 | // Update the Loop Guard |
1842 | //////////////////////////////////////////////////////////////////////////// |
1843 | // The guard for FC0 is updated to guard both FC0 and FC1. This is done by |
1844 | // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. |
1845 | // Thus, one path from the guard goes to the preheader for FC0 (and thus |
1846 | // executes the new fused loop) and the other path goes to the NonLoopBlock |
1847 | // for FC1 (where FC1 guard would have gone if FC1 was not executed). |
1848 | FC1NonLoopBlock->replacePhiUsesWith(Old: FC1GuardBlock, New: FC0GuardBlock); |
1849 | FC0.GuardBranch->replaceUsesOfWith(From: FC0NonLoopBlock, To: FC1NonLoopBlock); |
1850 | |
1851 | BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock; |
1852 | BBToUpdate->getTerminator()->replaceUsesOfWith(From: FC1GuardBlock, To: FC1.Header); |
1853 | |
1854 | // The guard of FC1 is not necessary anymore. |
1855 | FC1.GuardBranch->eraseFromParent(); |
1856 | new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); |
1857 | |
1858 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1859 | DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); |
1860 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1861 | DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); |
1862 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1863 | DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); |
1864 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1865 | DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); |
1866 | |
1867 | if (FC0.Peeled) { |
1868 | // Remove the Block after the ExitBlock of FC0 |
1869 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1870 | DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock)); |
1871 | FC0ExitBlockSuccessor->getTerminator()->eraseFromParent(); |
1872 | new UnreachableInst(FC0ExitBlockSuccessor->getContext(), |
1873 | FC0ExitBlockSuccessor); |
1874 | } |
1875 | |
1876 | assert(pred_empty(FC1GuardBlock) && |
1877 | "Expecting guard block to have no predecessors" ); |
1878 | assert(succ_empty(FC1GuardBlock) && |
1879 | "Expecting guard block to have no successors" ); |
1880 | |
1881 | // Remember the phi nodes originally in the header of FC0 in order to rewire |
1882 | // them later. However, this is only necessary if the new loop carried |
1883 | // values might not dominate the exiting branch. While we do not generally |
1884 | // test if this is the case but simply insert intermediate phi nodes, we |
1885 | // need to make sure these intermediate phi nodes have different |
1886 | // predecessors. To this end, we filter the special case where the exiting |
1887 | // block is the latch block of the first loop. Nothing needs to be done |
1888 | // anyway as all loop carried values dominate the latch and thereby also the |
1889 | // exiting branch. |
1890 | // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch |
1891 | // (because the loops are rotated. Thus, nothing will ever be added to |
1892 | // OriginalFC0PHIs. |
1893 | SmallVector<PHINode *, 8> OriginalFC0PHIs; |
1894 | if (FC0.ExitingBlock != FC0.Latch) |
1895 | for (PHINode &PHI : FC0.Header->phis()) |
1896 | OriginalFC0PHIs.push_back(Elt: &PHI); |
1897 | |
1898 | assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!" ); |
1899 | |
1900 | // Replace incoming blocks for header PHIs first. |
1901 | FC1.Preheader->replaceSuccessorsPhiUsesWith(New: FC0.Preheader); |
1902 | FC0.Latch->replaceSuccessorsPhiUsesWith(New: FC1.Latch); |
1903 | |
1904 | // The old exiting block of the first loop (FC0) has to jump to the header |
1905 | // of the second as we need to execute the code in the second header block |
1906 | // regardless of the trip count. That is, if the trip count is 0, so the |
1907 | // back edge is never taken, we still have to execute both loop headers, |
1908 | // especially (but not only!) if the second is a do-while style loop. |
1909 | // However, doing so might invalidate the phi nodes of the first loop as |
1910 | // the new values do only need to dominate their latch and not the exiting |
1911 | // predicate. To remedy this potential problem we always introduce phi |
1912 | // nodes in the header of the second loop later that select the loop carried |
1913 | // value, if the second header was reached through an old latch of the |
1914 | // first, or undef otherwise. This is sound as exiting the first implies the |
1915 | // second will exit too, __without__ taking the back-edge (their |
1916 | // trip-counts are equal after all). |
1917 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC0.ExitBlock, |
1918 | To: FC1.Header); |
1919 | |
1920 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1921 | DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); |
1922 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1923 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
1924 | |
1925 | // Remove FC0 Exit Block |
1926 | // The exit block for FC0 is no longer needed since control will flow |
1927 | // directly to the header of FC1. Since it is an empty block, it can be |
1928 | // removed at this point. |
1929 | // TODO: In the future, we can handle non-empty exit blocks my merging any |
1930 | // instructions from FC0 exit block into FC1 exit block prior to removing |
1931 | // the block. |
1932 | assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty" ); |
1933 | FC0.ExitBlock->getTerminator()->eraseFromParent(); |
1934 | new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); |
1935 | |
1936 | // Remove FC1 Preheader |
1937 | // The pre-header of L1 is not necessary anymore. |
1938 | assert(pred_empty(FC1.Preheader)); |
1939 | FC1.Preheader->getTerminator()->eraseFromParent(); |
1940 | new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); |
1941 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1942 | DominatorTree::Delete, FC1.Preheader, FC1.Header)); |
1943 | |
1944 | // Moves the phi nodes from the second to the first loops header block. |
1945 | while (PHINode *PHI = dyn_cast<PHINode>(Val: &FC1.Header->front())) { |
1946 | if (SE.isSCEVable(Ty: PHI->getType())) |
1947 | SE.forgetValue(V: PHI); |
1948 | if (PHI->hasNUsesOrMore(N: 1)) |
1949 | PHI->moveBefore(MovePos: &*FC0.Header->getFirstInsertionPt()); |
1950 | else |
1951 | PHI->eraseFromParent(); |
1952 | } |
1953 | |
1954 | // Introduce new phi nodes in the second loop header to ensure |
1955 | // exiting the first and jumping to the header of the second does not break |
1956 | // the SSA property of the phis originally in the first loop. See also the |
1957 | // comment above. |
1958 | BasicBlock::iterator = FC1.Header->begin(); |
1959 | for (PHINode *LCPHI : OriginalFC0PHIs) { |
1960 | int L1LatchBBIdx = LCPHI->getBasicBlockIndex(BB: FC1.Latch); |
1961 | assert(L1LatchBBIdx >= 0 && |
1962 | "Expected loop carried value to be rewired at this point!" ); |
1963 | |
1964 | Value *LCV = LCPHI->getIncomingValue(i: L1LatchBBIdx); |
1965 | |
1966 | PHINode * = |
1967 | PHINode::Create(Ty: LCV->getType(), NumReservedValues: 2, NameStr: LCPHI->getName() + ".afterFC0" ); |
1968 | L1HeaderPHI->insertBefore(InsertPos: L1HeaderIP); |
1969 | L1HeaderPHI->addIncoming(V: LCV, BB: FC0.Latch); |
1970 | L1HeaderPHI->addIncoming(V: UndefValue::get(T: LCV->getType()), |
1971 | BB: FC0.ExitingBlock); |
1972 | |
1973 | LCPHI->setIncomingValue(i: L1LatchBBIdx, V: L1HeaderPHI); |
1974 | } |
1975 | |
1976 | // Update the latches |
1977 | |
1978 | // Replace latch terminator destinations. |
1979 | FC0.Latch->getTerminator()->replaceUsesOfWith(From: FC0.Header, To: FC1.Header); |
1980 | FC1.Latch->getTerminator()->replaceUsesOfWith(From: FC1.Header, To: FC0.Header); |
1981 | |
1982 | // Modify the latch branch of FC0 to be unconditional as both successors of |
1983 | // the branch are the same. |
1984 | simplifyLatchBranch(FC: FC0); |
1985 | |
1986 | // If FC0.Latch and FC0.ExitingBlock are the same then we have already |
1987 | // performed the updates above. |
1988 | if (FC0.Latch != FC0.ExitingBlock) |
1989 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1990 | DominatorTree::Insert, FC0.Latch, FC1.Header)); |
1991 | |
1992 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
1993 | FC0.Latch, FC0.Header)); |
1994 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Insert, |
1995 | FC1.Latch, FC0.Header)); |
1996 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
1997 | FC1.Latch, FC1.Header)); |
1998 | |
1999 | // All done |
2000 | // Apply the updates to the Dominator Tree and cleanup. |
2001 | |
2002 | assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!" ); |
2003 | assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!" ); |
2004 | |
2005 | // Update DT/PDT |
2006 | DTU.applyUpdates(Updates: TreeUpdates); |
2007 | |
2008 | LI.removeBlock(BB: FC1GuardBlock); |
2009 | LI.removeBlock(BB: FC1.Preheader); |
2010 | LI.removeBlock(BB: FC0.ExitBlock); |
2011 | if (FC0.Peeled) { |
2012 | LI.removeBlock(BB: FC0ExitBlockSuccessor); |
2013 | DTU.deleteBB(DelBB: FC0ExitBlockSuccessor); |
2014 | } |
2015 | DTU.deleteBB(DelBB: FC1GuardBlock); |
2016 | DTU.deleteBB(DelBB: FC1.Preheader); |
2017 | DTU.deleteBB(DelBB: FC0.ExitBlock); |
2018 | DTU.flush(); |
2019 | |
2020 | // Is there a way to keep SE up-to-date so we don't need to forget the loops |
2021 | // and rebuild the information in subsequent passes of fusion? |
2022 | // Note: Need to forget the loops before merging the loop latches, as |
2023 | // mergeLatch may remove the only block in FC1. |
2024 | SE.forgetLoop(L: FC1.L); |
2025 | SE.forgetLoop(L: FC0.L); |
2026 | SE.forgetLoopDispositions(); |
2027 | |
2028 | // Move instructions from FC0.Latch to FC1.Latch. |
2029 | // Note: mergeLatch requires an updated DT. |
2030 | mergeLatch(FC0, FC1); |
2031 | |
2032 | // Merge the loops. |
2033 | SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); |
2034 | for (BasicBlock *BB : Blocks) { |
2035 | FC0.L->addBlockEntry(BB); |
2036 | FC1.L->removeBlockFromLoop(BB); |
2037 | if (LI.getLoopFor(BB) != FC1.L) |
2038 | continue; |
2039 | LI.changeLoopFor(BB, L: FC0.L); |
2040 | } |
2041 | while (!FC1.L->isInnermost()) { |
2042 | const auto &ChildLoopIt = FC1.L->begin(); |
2043 | Loop *ChildLoop = *ChildLoopIt; |
2044 | FC1.L->removeChildLoop(I: ChildLoopIt); |
2045 | FC0.L->addChildLoop(NewChild: ChildLoop); |
2046 | } |
2047 | |
2048 | // Delete the now empty loop L1. |
2049 | LI.erase(L: FC1.L); |
2050 | |
2051 | #ifndef NDEBUG |
2052 | assert(!verifyFunction(*FC0.Header->getParent(), &errs())); |
2053 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
2054 | assert(PDT.verify()); |
2055 | LI.verify(DT); |
2056 | SE.verify(); |
2057 | #endif |
2058 | |
2059 | LLVM_DEBUG(dbgs() << "Fusion done:\n" ); |
2060 | |
2061 | return FC0.L; |
2062 | } |
2063 | }; |
2064 | } // namespace |
2065 | |
2066 | PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { |
2067 | auto &LI = AM.getResult<LoopAnalysis>(IR&: F); |
2068 | auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
2069 | auto &DI = AM.getResult<DependenceAnalysis>(IR&: F); |
2070 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
2071 | auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(IR&: F); |
2072 | auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
2073 | auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F); |
2074 | const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
2075 | const DataLayout &DL = F.getDataLayout(); |
2076 | |
2077 | // Ensure loops are in simplifed form which is a pre-requisite for loop fusion |
2078 | // pass. Added only for new PM since the legacy PM has already added |
2079 | // LoopSimplify pass as a dependency. |
2080 | bool Changed = false; |
2081 | for (auto &L : LI) { |
2082 | Changed |= |
2083 | simplifyLoop(L, DT: &DT, LI: &LI, SE: &SE, AC: &AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */); |
2084 | } |
2085 | if (Changed) |
2086 | PDT.recalculate(Func&: F); |
2087 | |
2088 | LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); |
2089 | Changed |= LF.fuseLoops(F); |
2090 | if (!Changed) |
2091 | return PreservedAnalyses::all(); |
2092 | |
2093 | PreservedAnalyses PA; |
2094 | PA.preserve<DominatorTreeAnalysis>(); |
2095 | PA.preserve<PostDominatorTreeAnalysis>(); |
2096 | PA.preserve<ScalarEvolutionAnalysis>(); |
2097 | PA.preserve<LoopAnalysis>(); |
2098 | return PA; |
2099 | } |
2100 | |