1 | //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// This file implements the loop fusion pass. |
11 | /// The implementation is largely based on the following document: |
12 | /// |
13 | /// Code Transformations to Augment the Scope of Loop Fusion in a |
14 | /// Production Compiler |
15 | /// Christopher Mark Barton |
16 | /// MSc Thesis |
17 | /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf |
18 | /// |
19 | /// The general approach taken is to collect sets of control flow equivalent |
20 | /// loops and test whether they can be fused. The necessary conditions for |
21 | /// fusion are: |
22 | /// 1. The loops must be adjacent (there cannot be any statements between |
23 | /// the two loops). |
24 | /// 2. The loops must be conforming (they must execute the same number of |
25 | /// iterations). |
26 | /// 3. The loops must be control flow equivalent (if one loop executes, the |
27 | /// other is guaranteed to execute). |
28 | /// 4. There cannot be any negative distance dependencies between the loops. |
29 | /// If all of these conditions are satisfied, it is safe to fuse the loops. |
30 | /// |
31 | /// This implementation creates FusionCandidates that represent the loop and the |
32 | /// necessary information needed by fusion. It then operates on the fusion |
33 | /// candidates, first confirming that the candidate is eligible for fusion. The |
34 | /// candidates are then collected into control flow equivalent sets, sorted in |
35 | /// dominance order. Each set of control flow equivalent candidates is then |
36 | /// traversed, attempting to fuse pairs of candidates in the set. If all |
37 | /// requirements for fusion are met, the two candidates are fused, creating a |
38 | /// new (fused) candidate which is then added back into the set to consider for |
39 | /// additional fusion. |
40 | /// |
41 | /// This implementation currently does not make any modifications to remove |
42 | /// conditions for fusion. Code transformations to make loops conform to each of |
43 | /// the conditions for fusion are discussed in more detail in the document |
44 | /// above. These can be added to the current implementation in the future. |
45 | //===----------------------------------------------------------------------===// |
46 | |
47 | #include "llvm/Transforms/Scalar/LoopFuse.h" |
48 | #include "llvm/ADT/Statistic.h" |
49 | #include "llvm/Analysis/AssumptionCache.h" |
50 | #include "llvm/Analysis/DependenceAnalysis.h" |
51 | #include "llvm/Analysis/DomTreeUpdater.h" |
52 | #include "llvm/Analysis/LoopInfo.h" |
53 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
54 | #include "llvm/Analysis/PostDominators.h" |
55 | #include "llvm/Analysis/ScalarEvolution.h" |
56 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
57 | #include "llvm/Analysis/TargetTransformInfo.h" |
58 | #include "llvm/IR/Function.h" |
59 | #include "llvm/IR/Verifier.h" |
60 | #include "llvm/Support/CommandLine.h" |
61 | #include "llvm/Support/Debug.h" |
62 | #include "llvm/Support/raw_ostream.h" |
63 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
64 | #include "llvm/Transforms/Utils/CodeMoverUtils.h" |
65 | #include "llvm/Transforms/Utils/LoopPeel.h" |
66 | #include "llvm/Transforms/Utils/LoopSimplify.h" |
67 | |
68 | using namespace llvm; |
69 | |
70 | #define DEBUG_TYPE "loop-fusion" |
71 | |
72 | STATISTIC(FuseCounter, "Loops fused" ); |
73 | STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion" ); |
74 | STATISTIC(, "Loop has invalid preheader" ); |
75 | STATISTIC(, "Loop has invalid header" ); |
76 | STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks" ); |
77 | STATISTIC(InvalidExitBlock, "Loop has invalid exit block" ); |
78 | STATISTIC(InvalidLatch, "Loop has invalid latch" ); |
79 | STATISTIC(InvalidLoop, "Loop is invalid" ); |
80 | STATISTIC(AddressTakenBB, "Basic block has address taken" ); |
81 | STATISTIC(MayThrowException, "Loop may throw an exception" ); |
82 | STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access" ); |
83 | STATISTIC(NotSimplifiedForm, "Loop is not in simplified form" ); |
84 | STATISTIC(InvalidDependencies, "Dependencies prevent fusion" ); |
85 | STATISTIC(UnknownTripCount, "Loop has unknown trip count" ); |
86 | STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop" ); |
87 | STATISTIC(NonEqualTripCount, "Loop trip counts are not the same" ); |
88 | STATISTIC(NonAdjacent, "Loops are not adjacent" ); |
89 | STATISTIC( |
90 | , |
91 | "Loop has a non-empty preheader with instructions that cannot be moved" ); |
92 | STATISTIC(FusionNotBeneficial, "Fusion is not beneficial" ); |
93 | STATISTIC(NonIdenticalGuards, "Candidates have different guards" ); |
94 | STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with " |
95 | "instructions that cannot be moved" ); |
96 | STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with " |
97 | "instructions that cannot be moved" ); |
98 | STATISTIC(NotRotated, "Candidate is not rotated" ); |
99 | STATISTIC(OnlySecondCandidateIsGuarded, |
100 | "The second candidate is guarded while the first one is not" ); |
101 | STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions." ); |
102 | STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions." ); |
103 | |
/// Values accepted by the -loop-fusion-dependence-analysis option.
enum FusionDependenceAnalysisChoice {
  /// "scev": use the scalar evolution interface.
  FUSION_DEPENDENCE_ANALYSIS_SCEV,
  /// "da": use the dependence analysis interface.
  FUSION_DEPENDENCE_ANALYSIS_DA,
  /// "all": use all available analyses.
  FUSION_DEPENDENCE_ANALYSIS_ALL,
};
109 | |
110 | static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis( |
111 | "loop-fusion-dependence-analysis" , |
112 | cl::desc("Which dependence analysis should loop fusion use?" ), |
113 | cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev" , |
114 | "Use the scalar evolution interface" ), |
115 | clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da" , |
116 | "Use the dependence analysis interface" ), |
117 | clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all" , |
118 | "Use all available analyses" )), |
119 | cl::Hidden, cl::init(Val: FUSION_DEPENDENCE_ANALYSIS_ALL)); |
120 | |
121 | static cl::opt<unsigned> FusionPeelMaxCount( |
122 | "loop-fusion-peel-max-count" , cl::init(Val: 0), cl::Hidden, |
123 | cl::desc("Max number of iterations to be peeled from a loop, such that " |
124 | "fusion can take place" )); |
125 | |
126 | #ifndef NDEBUG |
127 | static cl::opt<bool> |
128 | VerboseFusionDebugging("loop-fusion-verbose-debug" , |
129 | cl::desc("Enable verbose debugging for Loop Fusion" ), |
130 | cl::Hidden, cl::init(false)); |
131 | #endif |
132 | |
133 | namespace { |
134 | /// This class is used to represent a candidate for loop fusion. When it is |
135 | /// constructed, it checks the conditions for loop fusion to ensure that it |
136 | /// represents a valid candidate. It caches several parts of a loop that are |
137 | /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead |
138 | /// of continually querying the underlying Loop to retrieve these values. It is |
139 | /// assumed these will not change throughout loop fusion. |
140 | /// |
141 | /// The invalidate method should be used to indicate that the FusionCandidate is |
142 | /// no longer a valid candidate for fusion. Similarly, the isValid() method can |
143 | /// be used to ensure that the FusionCandidate is still valid for fusion. |
144 | struct FusionCandidate { |
145 | /// Cache of parts of the loop used throughout loop fusion. These should not |
146 | /// need to change throughout the analysis and transformation. |
147 | /// These parts are cached to avoid repeatedly looking up in the Loop class. |
148 | |
149 | /// Preheader of the loop this candidate represents |
150 | BasicBlock *Preheader; |
151 | /// Header of the loop this candidate represents |
152 | BasicBlock *Header; |
153 | /// Blocks in the loop that exit the loop |
154 | BasicBlock *ExitingBlock; |
155 | /// The successor block of this loop (where the exiting blocks go to) |
156 | BasicBlock *ExitBlock; |
157 | /// Latch of the loop |
158 | BasicBlock *Latch; |
159 | /// The loop that this fusion candidate represents |
160 | Loop *L; |
161 | /// Vector of instructions in this loop that read from memory |
162 | SmallVector<Instruction *, 16> MemReads; |
163 | /// Vector of instructions in this loop that write to memory |
164 | SmallVector<Instruction *, 16> MemWrites; |
165 | /// Are all of the members of this fusion candidate still valid |
166 | bool Valid; |
167 | /// Guard branch of the loop, if it exists |
168 | BranchInst *GuardBranch; |
169 | /// Peeling Paramaters of the Loop. |
170 | TTI::PeelingPreferences PP; |
171 | /// Can you Peel this Loop? |
172 | bool AbleToPeel; |
173 | /// Has this loop been Peeled |
174 | bool Peeled; |
175 | |
176 | /// Dominator and PostDominator trees are needed for the |
177 | /// FusionCandidateCompare function, required by FusionCandidateSet to |
178 | /// determine where the FusionCandidate should be inserted into the set. These |
179 | /// are used to establish ordering of the FusionCandidates based on dominance. |
180 | DominatorTree &DT; |
181 | const PostDominatorTree *PDT; |
182 | |
183 | OptimizationRemarkEmitter &ORE; |
184 | |
185 | FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT, |
186 | OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP) |
187 | : Preheader(L->getLoopPreheader()), Header(L->getHeader()), |
188 | ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), |
189 | Latch(L->getLoopLatch()), L(L), Valid(true), |
190 | GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), |
191 | Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { |
192 | |
193 | // Walk over all blocks in the loop and check for conditions that may |
194 | // prevent fusion. For each block, walk over all instructions and collect |
195 | // the memory reads and writes If any instructions that prevent fusion are |
196 | // found, invalidate this object and return. |
197 | for (BasicBlock *BB : L->blocks()) { |
198 | if (BB->hasAddressTaken()) { |
199 | invalidate(); |
200 | reportInvalidCandidate(Stat&: AddressTakenBB); |
201 | return; |
202 | } |
203 | |
204 | for (Instruction &I : *BB) { |
205 | if (I.mayThrow()) { |
206 | invalidate(); |
207 | reportInvalidCandidate(Stat&: MayThrowException); |
208 | return; |
209 | } |
210 | if (StoreInst *SI = dyn_cast<StoreInst>(Val: &I)) { |
211 | if (SI->isVolatile()) { |
212 | invalidate(); |
213 | reportInvalidCandidate(Stat&: ContainsVolatileAccess); |
214 | return; |
215 | } |
216 | } |
217 | if (LoadInst *LI = dyn_cast<LoadInst>(Val: &I)) { |
218 | if (LI->isVolatile()) { |
219 | invalidate(); |
220 | reportInvalidCandidate(Stat&: ContainsVolatileAccess); |
221 | return; |
222 | } |
223 | } |
224 | if (I.mayWriteToMemory()) |
225 | MemWrites.push_back(Elt: &I); |
226 | if (I.mayReadFromMemory()) |
227 | MemReads.push_back(Elt: &I); |
228 | } |
229 | } |
230 | } |
231 | |
232 | /// Check if all members of the class are valid. |
233 | bool isValid() const { |
234 | return Preheader && Header && ExitingBlock && ExitBlock && Latch && L && |
235 | !L->isInvalid() && Valid; |
236 | } |
237 | |
238 | /// Verify that all members are in sync with the Loop object. |
239 | void verify() const { |
240 | assert(isValid() && "Candidate is not valid!!" ); |
241 | assert(!L->isInvalid() && "Loop is invalid!" ); |
242 | assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync" ); |
243 | assert(Header == L->getHeader() && "Header is out of sync" ); |
244 | assert(ExitingBlock == L->getExitingBlock() && |
245 | "Exiting Blocks is out of sync" ); |
246 | assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync" ); |
247 | assert(Latch == L->getLoopLatch() && "Latch is out of sync" ); |
248 | } |
249 | |
250 | /// Get the entry block for this fusion candidate. |
251 | /// |
252 | /// If this fusion candidate represents a guarded loop, the entry block is the |
253 | /// loop guard block. If it represents an unguarded loop, the entry block is |
254 | /// the preheader of the loop. |
255 | BasicBlock *getEntryBlock() const { |
256 | if (GuardBranch) |
257 | return GuardBranch->getParent(); |
258 | else |
259 | return Preheader; |
260 | } |
261 | |
262 | /// After Peeling the loop is modified quite a bit, hence all of the Blocks |
263 | /// need to be updated accordingly. |
264 | void updateAfterPeeling() { |
265 | Preheader = L->getLoopPreheader(); |
266 | Header = L->getHeader(); |
267 | ExitingBlock = L->getExitingBlock(); |
268 | ExitBlock = L->getExitBlock(); |
269 | Latch = L->getLoopLatch(); |
270 | verify(); |
271 | } |
272 | |
273 | /// Given a guarded loop, get the successor of the guard that is not in the |
274 | /// loop. |
275 | /// |
276 | /// This method returns the successor of the loop guard that is not located |
277 | /// within the loop (i.e., the successor of the guard that is not the |
278 | /// preheader). |
279 | /// This method is only valid for guarded loops. |
280 | BasicBlock *getNonLoopBlock() const { |
281 | assert(GuardBranch && "Only valid on guarded loops." ); |
282 | assert(GuardBranch->isConditional() && |
283 | "Expecting guard to be a conditional branch." ); |
284 | if (Peeled) |
285 | return GuardBranch->getSuccessor(i: 1); |
286 | return (GuardBranch->getSuccessor(i: 0) == Preheader) |
287 | ? GuardBranch->getSuccessor(i: 1) |
288 | : GuardBranch->getSuccessor(i: 0); |
289 | } |
290 | |
291 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
292 | LLVM_DUMP_METHOD void dump() const { |
293 | dbgs() << "\tGuardBranch: " ; |
294 | if (GuardBranch) |
295 | dbgs() << *GuardBranch; |
296 | else |
297 | dbgs() << "nullptr" ; |
298 | dbgs() << "\n" |
299 | << (GuardBranch ? GuardBranch->getName() : "nullptr" ) << "\n" |
300 | << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr" ) |
301 | << "\n" |
302 | << "\tHeader: " << (Header ? Header->getName() : "nullptr" ) << "\n" |
303 | << "\tExitingBB: " |
304 | << (ExitingBlock ? ExitingBlock->getName() : "nullptr" ) << "\n" |
305 | << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr" ) |
306 | << "\n" |
307 | << "\tLatch: " << (Latch ? Latch->getName() : "nullptr" ) << "\n" |
308 | << "\tEntryBlock: " |
309 | << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr" ) |
310 | << "\n" ; |
311 | } |
312 | #endif |
313 | |
314 | /// Determine if a fusion candidate (representing a loop) is eligible for |
315 | /// fusion. Note that this only checks whether a single loop can be fused - it |
316 | /// does not check whether it is *legal* to fuse two loops together. |
317 | bool isEligibleForFusion(ScalarEvolution &SE) const { |
318 | if (!isValid()) { |
319 | LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n" ); |
320 | if (!Preheader) |
321 | ++InvalidPreheader; |
322 | if (!Header) |
323 | ++InvalidHeader; |
324 | if (!ExitingBlock) |
325 | ++InvalidExitingBlock; |
326 | if (!ExitBlock) |
327 | ++InvalidExitBlock; |
328 | if (!Latch) |
329 | ++InvalidLatch; |
330 | if (L->isInvalid()) |
331 | ++InvalidLoop; |
332 | |
333 | return false; |
334 | } |
335 | |
336 | // Require ScalarEvolution to be able to determine a trip count. |
337 | if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { |
338 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() |
339 | << " trip count not computable!\n" ); |
340 | return reportInvalidCandidate(Stat&: UnknownTripCount); |
341 | } |
342 | |
343 | if (!L->isLoopSimplifyForm()) { |
344 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() |
345 | << " is not in simplified form!\n" ); |
346 | return reportInvalidCandidate(Stat&: NotSimplifiedForm); |
347 | } |
348 | |
349 | if (!L->isRotatedForm()) { |
350 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n" ); |
351 | return reportInvalidCandidate(Stat&: NotRotated); |
352 | } |
353 | |
354 | return true; |
355 | } |
356 | |
357 | private: |
358 | // This is only used internally for now, to clear the MemWrites and MemReads |
359 | // list and setting Valid to false. I can't envision other uses of this right |
360 | // now, since once FusionCandidates are put into the FusionCandidateSet they |
361 | // are immutable. Thus, any time we need to change/update a FusionCandidate, |
362 | // we must create a new one and insert it into the FusionCandidateSet to |
363 | // ensure the FusionCandidateSet remains ordered correctly. |
364 | void invalidate() { |
365 | MemWrites.clear(); |
366 | MemReads.clear(); |
367 | Valid = false; |
368 | } |
369 | |
370 | bool reportInvalidCandidate(llvm::Statistic &Stat) const { |
371 | using namespace ore; |
372 | assert(L && Preheader && "Fusion candidate not initialized properly!" ); |
373 | #if LLVM_ENABLE_STATS |
374 | ++Stat; |
375 | ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), |
376 | L->getStartLoc(), Preheader) |
377 | << "[" << Preheader->getParent()->getName() << "]: " |
378 | << "Loop is not a candidate for fusion: " << Stat.getDesc()); |
379 | #endif |
380 | return false; |
381 | } |
382 | }; |
383 | |
384 | struct FusionCandidateCompare { |
385 | /// Comparison functor to sort two Control Flow Equivalent fusion candidates |
386 | /// into dominance order. |
387 | /// If LHS dominates RHS and RHS post-dominates LHS, return true; |
388 | /// If RHS dominates LHS and LHS post-dominates RHS, return false; |
389 | /// If both LHS and RHS are not dominating each other then, non-strictly |
390 | /// post dominate check will decide the order of candidates. If RHS |
391 | /// non-strictly post dominates LHS then, return true. If LHS non-strictly |
392 | /// post dominates RHS then, return false. If both are non-strictly post |
393 | /// dominate each other then, level in the post dominator tree will decide |
394 | /// the order of candidates. |
395 | bool operator()(const FusionCandidate &LHS, |
396 | const FusionCandidate &RHS) const { |
397 | const DominatorTree *DT = &(LHS.DT); |
398 | |
399 | BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); |
400 | BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); |
401 | |
402 | // Do not save PDT to local variable as it is only used in asserts and thus |
403 | // will trigger an unused variable warning if building without asserts. |
404 | assert(DT && LHS.PDT && "Expecting valid dominator tree" ); |
405 | |
406 | // Do this compare first so if LHS == RHS, function returns false. |
407 | if (DT->dominates(A: RHSEntryBlock, B: LHSEntryBlock)) { |
408 | // RHS dominates LHS |
409 | // Verify LHS post-dominates RHS |
410 | assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); |
411 | return false; |
412 | } |
413 | |
414 | if (DT->dominates(A: LHSEntryBlock, B: RHSEntryBlock)) { |
415 | // Verify RHS Postdominates LHS |
416 | assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); |
417 | return true; |
418 | } |
419 | |
420 | // If two FusionCandidates are in the same level of dominator tree, |
421 | // they will not dominate each other, but may still be control flow |
422 | // equivalent. To sort those FusionCandidates, nonStrictlyPostDominate() |
423 | // function is needed. |
424 | bool WrongOrder = |
425 | nonStrictlyPostDominate(ThisBlock: LHSEntryBlock, OtherBlock: RHSEntryBlock, DT, PDT: LHS.PDT); |
426 | bool RightOrder = |
427 | nonStrictlyPostDominate(ThisBlock: RHSEntryBlock, OtherBlock: LHSEntryBlock, DT, PDT: LHS.PDT); |
428 | if (WrongOrder && RightOrder) { |
429 | // If common predecessor of LHS and RHS post dominates both |
430 | // FusionCandidates then, Order of FusionCandidate can be |
431 | // identified by its level in post dominator tree. |
432 | DomTreeNode *LNode = LHS.PDT->getNode(BB: LHSEntryBlock); |
433 | DomTreeNode *RNode = LHS.PDT->getNode(BB: RHSEntryBlock); |
434 | return LNode->getLevel() > RNode->getLevel(); |
435 | } else if (WrongOrder) |
436 | return false; |
437 | else if (RightOrder) |
438 | return true; |
439 | |
440 | // If LHS does not non-strict Postdominate RHS and RHS does not non-strict |
441 | // Postdominate LHS then, there is no dominance relationship between the |
442 | // two FusionCandidates. Thus, they should not be in the same set together. |
443 | llvm_unreachable( |
444 | "No dominance relationship between these fusion candidates!" ); |
445 | } |
446 | }; |
447 | |
448 | using LoopVector = SmallVector<Loop *, 4>; |
449 | |
450 | // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance |
451 | // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0 |
452 | // dominates FC1 and FC1 post-dominates FC0. |
453 | // std::set was chosen because we want a sorted data structure with stable |
454 | // iterators. A subsequent patch to loop fusion will enable fusing non-adjacent |
455 | // loops by moving intervening code around. When this intervening code contains |
456 | // loops, those loops will be moved also. The corresponding FusionCandidates |
457 | // will also need to be moved accordingly. As this is done, having stable |
458 | // iterators will simplify the logic. Similarly, having an efficient insert that |
459 | // keeps the FusionCandidateSet sorted will also simplify the implementation. |
460 | using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; |
461 | using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; |
462 | |
463 | #if !defined(NDEBUG) |
464 | static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, |
465 | const FusionCandidate &FC) { |
466 | if (FC.isValid()) |
467 | OS << FC.Preheader->getName(); |
468 | else |
469 | OS << "<Invalid>" ; |
470 | |
471 | return OS; |
472 | } |
473 | |
474 | static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, |
475 | const FusionCandidateSet &CandSet) { |
476 | for (const FusionCandidate &FC : CandSet) |
477 | OS << FC << '\n'; |
478 | |
479 | return OS; |
480 | } |
481 | |
482 | static void |
483 | printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { |
484 | dbgs() << "Fusion Candidates: \n" ; |
485 | for (const auto &CandidateSet : FusionCandidates) { |
486 | dbgs() << "*** Fusion Candidate Set ***\n" ; |
487 | dbgs() << CandidateSet; |
488 | dbgs() << "****************************\n" ; |
489 | } |
490 | } |
491 | #endif |
492 | |
493 | /// Collect all loops in function at the same nest level, starting at the |
494 | /// outermost level. |
495 | /// |
496 | /// This data structure collects all loops at the same nest level for a |
497 | /// given function (specified by the LoopInfo object). It starts at the |
498 | /// outermost level. |
499 | struct LoopDepthTree { |
500 | using LoopsOnLevelTy = SmallVector<LoopVector, 4>; |
501 | using iterator = LoopsOnLevelTy::iterator; |
502 | using const_iterator = LoopsOnLevelTy::const_iterator; |
503 | |
504 | LoopDepthTree(LoopInfo &LI) : Depth(1) { |
505 | if (!LI.empty()) |
506 | LoopsOnLevel.emplace_back(Args: LoopVector(LI.rbegin(), LI.rend())); |
507 | } |
508 | |
509 | /// Test whether a given loop has been removed from the function, and thus is |
510 | /// no longer valid. |
511 | bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(Ptr: L); } |
512 | |
513 | /// Record that a given loop has been removed from the function and is no |
514 | /// longer valid. |
515 | void removeLoop(const Loop *L) { RemovedLoops.insert(Ptr: L); } |
516 | |
517 | /// Descend the tree to the next (inner) nesting level |
518 | void descend() { |
519 | LoopsOnLevelTy LoopsOnNextLevel; |
520 | |
521 | for (const LoopVector &LV : *this) |
522 | for (Loop *L : LV) |
523 | if (!isRemovedLoop(L) && L->begin() != L->end()) |
524 | LoopsOnNextLevel.emplace_back(Args: LoopVector(L->begin(), L->end())); |
525 | |
526 | LoopsOnLevel = LoopsOnNextLevel; |
527 | RemovedLoops.clear(); |
528 | Depth++; |
529 | } |
530 | |
531 | bool empty() const { return size() == 0; } |
532 | size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); } |
533 | unsigned getDepth() const { return Depth; } |
534 | |
535 | iterator begin() { return LoopsOnLevel.begin(); } |
536 | iterator end() { return LoopsOnLevel.end(); } |
537 | const_iterator begin() const { return LoopsOnLevel.begin(); } |
538 | const_iterator end() const { return LoopsOnLevel.end(); } |
539 | |
540 | private: |
541 | /// Set of loops that have been removed from the function and are no longer |
542 | /// valid. |
543 | SmallPtrSet<const Loop *, 8> RemovedLoops; |
544 | |
545 | /// Depth of the current level, starting at 1 (outermost loops). |
546 | unsigned Depth; |
547 | |
548 | /// Vector of loops at the current depth level that have the same parent loop |
549 | LoopsOnLevelTy LoopsOnLevel; |
550 | }; |
551 | |
552 | #ifndef NDEBUG |
553 | static void printLoopVector(const LoopVector &LV) { |
554 | dbgs() << "****************************\n" ; |
555 | for (auto *L : LV) |
556 | printLoop(*L, dbgs()); |
557 | dbgs() << "****************************\n" ; |
558 | } |
559 | #endif |
560 | |
561 | struct LoopFuser { |
562 | private: |
563 | // Sets of control flow equivalent fusion candidates for a given nest level. |
564 | FusionCandidateCollection FusionCandidates; |
565 | |
566 | LoopDepthTree LDT; |
567 | DomTreeUpdater DTU; |
568 | |
569 | LoopInfo &LI; |
570 | DominatorTree &DT; |
571 | DependenceInfo &DI; |
572 | ScalarEvolution &SE; |
573 | PostDominatorTree &PDT; |
574 | OptimizationRemarkEmitter &ORE; |
575 | AssumptionCache &AC; |
576 | const TargetTransformInfo &TTI; |
577 | |
578 | public: |
579 | LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, |
580 | ScalarEvolution &SE, PostDominatorTree &PDT, |
581 | OptimizationRemarkEmitter &ORE, const DataLayout &DL, |
582 | AssumptionCache &AC, const TargetTransformInfo &TTI) |
583 | : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), |
584 | DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {} |
585 | |
586 | /// This is the main entry point for loop fusion. It will traverse the |
587 | /// specified function and collect candidate loops to fuse, starting at the |
588 | /// outermost nesting level and working inwards. |
589 | bool fuseLoops(Function &F) { |
590 | #ifndef NDEBUG |
591 | if (VerboseFusionDebugging) { |
592 | LI.print(dbgs()); |
593 | } |
594 | #endif |
595 | |
596 | LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName() |
597 | << "\n" ); |
598 | bool Changed = false; |
599 | |
600 | while (!LDT.empty()) { |
601 | LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth " |
602 | << LDT.getDepth() << "\n" ;); |
603 | |
604 | for (const LoopVector &LV : LDT) { |
605 | assert(LV.size() > 0 && "Empty loop set was build!" ); |
606 | |
607 | // Skip singleton loop sets as they do not offer fusion opportunities on |
608 | // this level. |
609 | if (LV.size() == 1) |
610 | continue; |
611 | #ifndef NDEBUG |
612 | if (VerboseFusionDebugging) { |
613 | LLVM_DEBUG({ |
614 | dbgs() << " Visit loop set (#" << LV.size() << "):\n" ; |
615 | printLoopVector(LV); |
616 | }); |
617 | } |
618 | #endif |
619 | |
620 | collectFusionCandidates(LV); |
621 | Changed |= fuseCandidates(); |
622 | } |
623 | |
624 | // Finished analyzing candidates at this level. |
625 | // Descend to the next level and clear all of the candidates currently |
626 | // collected. Note that it will not be possible to fuse any of the |
627 | // existing candidates with new candidates because the new candidates will |
628 | // be at a different nest level and thus not be control flow equivalent |
629 | // with all of the candidates collected so far. |
630 | LLVM_DEBUG(dbgs() << "Descend one level!\n" ); |
631 | LDT.descend(); |
632 | FusionCandidates.clear(); |
633 | } |
634 | |
635 | if (Changed) |
636 | LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n" ; F.dump();); |
637 | |
638 | #ifndef NDEBUG |
639 | assert(DT.verify()); |
640 | assert(PDT.verify()); |
641 | LI.verify(DT); |
642 | SE.verify(); |
643 | #endif |
644 | |
645 | LLVM_DEBUG(dbgs() << "Loop Fusion complete\n" ); |
646 | return Changed; |
647 | } |
648 | |
649 | private: |
650 | /// Determine if two fusion candidates are control flow equivalent. |
651 | /// |
652 | /// Two fusion candidates are control flow equivalent if when one executes, |
653 | /// the other is guaranteed to execute. This is determined using dominators |
654 | /// and post-dominators: if A dominates B and B post-dominates A then A and B |
655 | /// are control-flow equivalent. |
656 | bool isControlFlowEquivalent(const FusionCandidate &FC0, |
657 | const FusionCandidate &FC1) const { |
658 | assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders" ); |
659 | |
660 | return ::isControlFlowEquivalent(BB0: *FC0.getEntryBlock(), BB1: *FC1.getEntryBlock(), |
661 | DT, PDT); |
662 | } |
663 | |
664 | /// Iterate over all loops in the given loop set and identify the loops that |
665 | /// are eligible for fusion. Place all eligible fusion candidates into Control |
666 | /// Flow Equivalent sets, sorted by dominance. |
667 | void collectFusionCandidates(const LoopVector &LV) { |
668 | for (Loop *L : LV) { |
669 | TTI::PeelingPreferences PP = |
670 | gatherPeelingPreferences(L, SE, TTI, UserAllowPeeling: std::nullopt, UserAllowProfileBasedPeeling: std::nullopt); |
671 | FusionCandidate CurrCand(L, DT, &PDT, ORE, PP); |
672 | if (!CurrCand.isEligibleForFusion(SE)) |
673 | continue; |
674 | |
675 | // Go through each list in FusionCandidates and determine if L is control |
676 | // flow equivalent with the first loop in that list. If it is, append LV. |
677 | // If not, go to the next list. |
678 | // If no suitable list is found, start another list and add it to |
679 | // FusionCandidates. |
680 | bool FoundSet = false; |
681 | |
682 | for (auto &CurrCandSet : FusionCandidates) { |
683 | if (isControlFlowEquivalent(FC0: *CurrCandSet.begin(), FC1: CurrCand)) { |
684 | CurrCandSet.insert(x: CurrCand); |
685 | FoundSet = true; |
686 | #ifndef NDEBUG |
687 | if (VerboseFusionDebugging) |
688 | LLVM_DEBUG(dbgs() << "Adding " << CurrCand |
689 | << " to existing candidate set\n" ); |
690 | #endif |
691 | break; |
692 | } |
693 | } |
694 | if (!FoundSet) { |
695 | // No set was found. Create a new set and add to FusionCandidates |
696 | #ifndef NDEBUG |
697 | if (VerboseFusionDebugging) |
698 | LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n" ); |
699 | #endif |
700 | FusionCandidateSet NewCandSet; |
701 | NewCandSet.insert(x: CurrCand); |
702 | FusionCandidates.push_back(Elt: NewCandSet); |
703 | } |
704 | NumFusionCandidates++; |
705 | } |
706 | } |
707 | |
708 | /// Determine if it is beneficial to fuse two loops. |
709 | /// |
710 | /// For now, this method simply returns true because we want to fuse as much |
711 | /// as possible (primarily to test the pass). This method will evolve, over |
712 | /// time, to add heuristics for profitability of fusion. |
713 | bool isBeneficialFusion(const FusionCandidate &FC0, |
714 | const FusionCandidate &FC1) { |
715 | return true; |
716 | } |
717 | |
718 | /// Determine if two fusion candidates have the same trip count (i.e., they |
719 | /// execute the same number of iterations). |
720 | /// |
721 | /// This function will return a pair of values. The first is a boolean, |
722 | /// stating whether or not the two candidates are known at compile time to |
723 | /// have the same TripCount. The second is the difference in the two |
724 | /// TripCounts. This information can be used later to determine whether or not |
725 | /// peeling can be performed on either one of the candidates. |
726 | std::pair<bool, std::optional<unsigned>> |
727 | haveIdenticalTripCounts(const FusionCandidate &FC0, |
728 | const FusionCandidate &FC1) const { |
729 | const SCEV *TripCount0 = SE.getBackedgeTakenCount(L: FC0.L); |
730 | if (isa<SCEVCouldNotCompute>(Val: TripCount0)) { |
731 | UncomputableTripCount++; |
732 | LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!" ); |
733 | return {false, std::nullopt}; |
734 | } |
735 | |
736 | const SCEV *TripCount1 = SE.getBackedgeTakenCount(L: FC1.L); |
737 | if (isa<SCEVCouldNotCompute>(Val: TripCount1)) { |
738 | UncomputableTripCount++; |
739 | LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!" ); |
740 | return {false, std::nullopt}; |
741 | } |
742 | |
743 | LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " |
744 | << *TripCount1 << " are " |
745 | << (TripCount0 == TripCount1 ? "identical" : "different" ) |
746 | << "\n" ); |
747 | |
748 | if (TripCount0 == TripCount1) |
749 | return {true, 0}; |
750 | |
751 | LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, " |
752 | "determining the difference between trip counts\n" ); |
753 | |
754 | // Currently only considering loops with a single exit point |
755 | // and a non-constant trip count. |
756 | const unsigned TC0 = SE.getSmallConstantTripCount(L: FC0.L); |
757 | const unsigned TC1 = SE.getSmallConstantTripCount(L: FC1.L); |
758 | |
759 | // If any of the tripcounts are zero that means that loop(s) do not have |
760 | // a single exit or a constant tripcount. |
761 | if (TC0 == 0 || TC1 == 0) { |
762 | LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not " |
763 | "have a constant number of iterations. Peeling " |
764 | "is not benefical\n" ); |
765 | return {false, std::nullopt}; |
766 | } |
767 | |
768 | std::optional<unsigned> Difference; |
769 | int Diff = TC0 - TC1; |
770 | |
771 | if (Diff > 0) |
772 | Difference = Diff; |
773 | else { |
774 | LLVM_DEBUG( |
775 | dbgs() << "Difference is less than 0. FC1 (second loop) has more " |
776 | "iterations than the first one. Currently not supported\n" ); |
777 | } |
778 | |
779 | LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference |
780 | << "\n" ); |
781 | |
782 | return {false, Difference}; |
783 | } |
784 | |
785 | void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1, |
786 | unsigned PeelCount) { |
787 | assert(FC0.AbleToPeel && "Should be able to peel loop" ); |
788 | |
789 | LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount |
790 | << " iterations of the first loop. \n" ); |
791 | |
792 | ValueToValueMapTy VMap; |
793 | FC0.Peeled = |
794 | peelLoop(L: FC0.L, PeelCount, PeelLast: false, LI: &LI, SE: &SE, DT, AC: &AC, PreserveLCSSA: true, VMap); |
795 | if (FC0.Peeled) { |
796 | LLVM_DEBUG(dbgs() << "Done Peeling\n" ); |
797 | |
798 | #ifndef NDEBUG |
799 | auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1); |
800 | |
801 | assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 && |
802 | "Loops should have identical trip counts after peeling" ); |
803 | #endif |
804 | |
805 | FC0.PP.PeelCount += PeelCount; |
806 | |
807 | // Peeling does not update the PDT |
808 | PDT.recalculate(Func&: *FC0.Preheader->getParent()); |
809 | |
810 | FC0.updateAfterPeeling(); |
811 | |
812 | // In this case the iterations of the loop are constant, so the first |
813 | // loop will execute completely (will not jump from one of |
814 | // the peeled blocks to the second loop). Here we are updating the |
815 | // branch conditions of each of the peeled blocks, such that it will |
816 | // branch to its successor which is not the preheader of the second loop |
817 | // in the case of unguarded loops, or the succesors of the exit block of |
818 | // the first loop otherwise. Doing this update will ensure that the entry |
819 | // block of the first loop dominates the entry block of the second loop. |
820 | BasicBlock *BB = |
821 | FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader; |
822 | if (BB) { |
823 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
824 | SmallVector<Instruction *, 8> WorkList; |
825 | for (BasicBlock *Pred : predecessors(BB)) { |
826 | if (Pred != FC0.ExitBlock) { |
827 | WorkList.emplace_back(Args: Pred->getTerminator()); |
828 | TreeUpdates.emplace_back( |
829 | Args: DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB)); |
830 | } |
831 | } |
832 | // Cannot modify the predecessors inside the above loop as it will cause |
833 | // the iterators to be nullptrs, causing memory errors. |
834 | for (Instruction *CurrentBranch : WorkList) { |
835 | BasicBlock *Succ = CurrentBranch->getSuccessor(Idx: 0); |
836 | if (Succ == BB) |
837 | Succ = CurrentBranch->getSuccessor(Idx: 1); |
838 | ReplaceInstWithInst(From: CurrentBranch, To: BranchInst::Create(IfTrue: Succ)); |
839 | } |
840 | |
841 | DTU.applyUpdates(Updates: TreeUpdates); |
842 | DTU.flush(); |
843 | } |
844 | LLVM_DEBUG( |
845 | dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount |
846 | << " iterations from the first loop.\n" |
847 | "Both Loops have the same number of iterations now.\n" ); |
848 | } |
849 | } |
850 | |
851 | /// Walk each set of control flow equivalent fusion candidates and attempt to |
852 | /// fuse them. This does a single linear traversal of all candidates in the |
853 | /// set. The conditions for legal fusion are checked at this point. If a pair |
854 | /// of fusion candidates passes all legality checks, they are fused together |
855 | /// and a new fusion candidate is created and added to the FusionCandidateSet. |
856 | /// The original fusion candidates are then removed, as they are no longer |
857 | /// valid. |
858 | bool fuseCandidates() { |
859 | bool Fused = false; |
860 | LLVM_DEBUG(printFusionCandidates(FusionCandidates)); |
861 | for (auto &CandidateSet : FusionCandidates) { |
862 | if (CandidateSet.size() < 2) |
863 | continue; |
864 | |
865 | LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n" |
866 | << CandidateSet << "\n" ); |
867 | |
868 | for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) { |
869 | assert(!LDT.isRemovedLoop(FC0->L) && |
870 | "Should not have removed loops in CandidateSet!" ); |
871 | auto FC1 = FC0; |
872 | for (++FC1; FC1 != CandidateSet.end(); ++FC1) { |
873 | assert(!LDT.isRemovedLoop(FC1->L) && |
874 | "Should not have removed loops in CandidateSet!" ); |
875 | |
876 | LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n" ; FC0->dump(); |
877 | dbgs() << " with\n" ; FC1->dump(); dbgs() << "\n" ); |
878 | |
879 | FC0->verify(); |
880 | FC1->verify(); |
881 | |
882 | // Check if the candidates have identical tripcounts (first value of |
883 | // pair), and if not check the difference in the tripcounts between |
884 | // the loops (second value of pair). The difference is not equal to |
885 | // std::nullopt iff the loops iterate a constant number of times, and |
886 | // have a single exit. |
887 | std::pair<bool, std::optional<unsigned>> IdenticalTripCountRes = |
888 | haveIdenticalTripCounts(FC0: *FC0, FC1: *FC1); |
889 | bool SameTripCount = IdenticalTripCountRes.first; |
890 | std::optional<unsigned> TCDifference = IdenticalTripCountRes.second; |
891 | |
892 | // Here we are checking that FC0 (the first loop) can be peeled, and |
893 | // both loops have different tripcounts. |
894 | if (FC0->AbleToPeel && !SameTripCount && TCDifference) { |
895 | if (*TCDifference > FusionPeelMaxCount) { |
896 | LLVM_DEBUG(dbgs() |
897 | << "Difference in loop trip counts: " << *TCDifference |
898 | << " is greater than maximum peel count specificed: " |
899 | << FusionPeelMaxCount << "\n" ); |
900 | } else { |
901 | // Dependent on peeling being performed on the first loop, and |
902 | // assuming all other conditions for fusion return true. |
903 | SameTripCount = true; |
904 | } |
905 | } |
906 | |
907 | if (!SameTripCount) { |
908 | LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " |
909 | "counts. Not fusing.\n" ); |
910 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
911 | Stat&: NonEqualTripCount); |
912 | continue; |
913 | } |
914 | |
915 | if (!isAdjacent(FC0: *FC0, FC1: *FC1)) { |
916 | LLVM_DEBUG(dbgs() |
917 | << "Fusion candidates are not adjacent. Not fusing.\n" ); |
918 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, Stat&: NonAdjacent); |
919 | continue; |
920 | } |
921 | |
922 | if ((!FC0->GuardBranch && FC1->GuardBranch) || |
923 | (FC0->GuardBranch && !FC1->GuardBranch)) { |
924 | LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the " |
925 | "another one is not. Not fusing.\n" ); |
926 | reportLoopFusion<OptimizationRemarkMissed>( |
927 | FC0: *FC0, FC1: *FC1, Stat&: OnlySecondCandidateIsGuarded); |
928 | continue; |
929 | } |
930 | |
931 | // Ensure that FC0 and FC1 have identical guards. |
932 | // If one (or both) are not guarded, this check is not necessary. |
933 | if (FC0->GuardBranch && FC1->GuardBranch && |
934 | !haveIdenticalGuards(FC0: *FC0, FC1: *FC1) && !TCDifference) { |
935 | LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " |
936 | "guards. Not Fusing.\n" ); |
937 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
938 | Stat&: NonIdenticalGuards); |
939 | continue; |
940 | } |
941 | |
942 | if (FC0->GuardBranch) { |
943 | assert(FC1->GuardBranch && "Expecting valid FC1 guard branch" ); |
944 | |
945 | if (!isSafeToMoveBefore(BB&: *FC0->ExitBlock, |
946 | InsertPoint&: *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT, |
947 | PDT: &PDT, DI: &DI)) { |
948 | LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " |
949 | "instructions in exit block. Not fusing.\n" ); |
950 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
951 | Stat&: NonEmptyExitBlock); |
952 | continue; |
953 | } |
954 | |
955 | if (!isSafeToMoveBefore( |
956 | BB&: *FC1->GuardBranch->getParent(), |
957 | InsertPoint&: *FC0->GuardBranch->getParent()->getTerminator(), DT, PDT: &PDT, |
958 | DI: &DI)) { |
959 | LLVM_DEBUG(dbgs() |
960 | << "Fusion candidate contains unsafe " |
961 | "instructions in guard block. Not fusing.\n" ); |
962 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
963 | Stat&: NonEmptyGuardBlock); |
964 | continue; |
965 | } |
966 | } |
967 | |
968 | // Check the dependencies across the loops and do not fuse if it would |
969 | // violate them. |
970 | if (!dependencesAllowFusion(FC0: *FC0, FC1: *FC1)) { |
971 | LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n" ); |
972 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
973 | Stat&: InvalidDependencies); |
974 | continue; |
975 | } |
976 | |
977 | // If the second loop has instructions in the pre-header, attempt to |
978 | // hoist them up to the first loop's pre-header or sink them into the |
979 | // body of the second loop. |
980 | SmallVector<Instruction *, 4> SafeToHoist; |
981 | SmallVector<Instruction *, 4> SafeToSink; |
982 | // At this point, this is the last remaining legality check. |
983 | // Which means if we can make this pre-header empty, we can fuse |
984 | // these loops |
985 | if (!isEmptyPreheader(FC: *FC1)) { |
986 | LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " |
987 | "preheader.\n" ); |
988 | |
989 | // If it is not safe to hoist/sink all instructions in the |
990 | // pre-header, we cannot fuse these loops. |
991 | if (!collectMovablePreheaderInsts(FC0: *FC0, FC1: *FC1, SafeToHoist, |
992 | SafeToSink)) { |
993 | LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in " |
994 | "Fusion Candidate Pre-header.\n" |
995 | << "Not Fusing.\n" ); |
996 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
997 | Stat&: NonEmptyPreheader); |
998 | continue; |
999 | } |
1000 | } |
1001 | |
1002 | bool BeneficialToFuse = isBeneficialFusion(FC0: *FC0, FC1: *FC1); |
1003 | LLVM_DEBUG(dbgs() |
1004 | << "\tFusion appears to be " |
1005 | << (BeneficialToFuse ? "" : "un" ) << "profitable!\n" ); |
1006 | if (!BeneficialToFuse) { |
1007 | reportLoopFusion<OptimizationRemarkMissed>(FC0: *FC0, FC1: *FC1, |
1008 | Stat&: FusionNotBeneficial); |
1009 | continue; |
1010 | } |
1011 | // All analysis has completed and has determined that fusion is legal |
1012 | // and profitable. At this point, start transforming the code and |
1013 | // perform fusion. |
1014 | |
1015 | // Execute the hoist/sink operations on preheader instructions |
1016 | movePreheaderInsts(FC0: *FC0, FC1: *FC1, HoistInsts&: SafeToHoist, SinkInsts&: SafeToSink); |
1017 | |
1018 | LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " |
1019 | << *FC1 << "\n" ); |
1020 | |
1021 | FusionCandidate FC0Copy = *FC0; |
1022 | // Peel the loop after determining that fusion is legal. The Loops |
1023 | // will still be safe to fuse after the peeling is performed. |
1024 | bool Peel = TCDifference && *TCDifference > 0; |
1025 | if (Peel) |
1026 | peelFusionCandidate(FC0&: FC0Copy, FC1: *FC1, PeelCount: *TCDifference); |
1027 | |
1028 | // Report fusion to the Optimization Remarks. |
1029 | // Note this needs to be done *before* performFusion because |
1030 | // performFusion will change the original loops, making it not |
1031 | // possible to identify them after fusion is complete. |
1032 | reportLoopFusion<OptimizationRemark>(FC0: (Peel ? FC0Copy : *FC0), FC1: *FC1, |
1033 | Stat&: FuseCounter); |
1034 | |
1035 | FusionCandidate FusedCand( |
1036 | performFusion(FC0: (Peel ? FC0Copy : *FC0), FC1: *FC1), DT, &PDT, ORE, |
1037 | FC0Copy.PP); |
1038 | FusedCand.verify(); |
1039 | assert(FusedCand.isEligibleForFusion(SE) && |
1040 | "Fused candidate should be eligible for fusion!" ); |
1041 | |
1042 | // Notify the loop-depth-tree that these loops are not valid objects |
1043 | LDT.removeLoop(L: FC1->L); |
1044 | |
1045 | CandidateSet.erase(position: FC0); |
1046 | CandidateSet.erase(position: FC1); |
1047 | |
1048 | auto InsertPos = CandidateSet.insert(x: FusedCand); |
1049 | |
1050 | assert(InsertPos.second && |
1051 | "Unable to insert TargetCandidate in CandidateSet!" ); |
1052 | |
1053 | // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations |
1054 | // of the FC1 loop will attempt to fuse the new (fused) loop with the |
1055 | // remaining candidates in the current candidate set. |
1056 | FC0 = FC1 = InsertPos.first; |
1057 | |
1058 | LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet |
1059 | << "\n" ); |
1060 | |
1061 | Fused = true; |
1062 | } |
1063 | } |
1064 | } |
1065 | return Fused; |
1066 | } |
1067 | |
1068 | // Returns true if the instruction \p I can be hoisted to the end of the |
1069 | // preheader of \p FC0. \p SafeToHoist contains the instructions that are |
1070 | // known to be safe to hoist. The instructions encountered that cannot be |
1071 | // hoisted are in \p NotHoisting. |
1072 | // TODO: Move functionality into CodeMoverUtils |
1073 | bool canHoistInst(Instruction &I, |
1074 | const SmallVector<Instruction *, 4> &SafeToHoist, |
1075 | const SmallVector<Instruction *, 4> &NotHoisting, |
1076 | const FusionCandidate &FC0) const { |
1077 | const BasicBlock * = FC0.Preheader->getSingleSuccessor(); |
1078 | assert(FC0PreheaderTarget && |
1079 | "Expected single successor for loop preheader." ); |
1080 | |
1081 | for (Use &Op : I.operands()) { |
1082 | if (auto *OpInst = dyn_cast<Instruction>(Val&: Op)) { |
1083 | bool OpHoisted = is_contained(Range: SafeToHoist, Element: OpInst); |
1084 | // Check if we have already decided to hoist this operand. In this |
1085 | // case, it does not dominate FC0 *yet*, but will after we hoist it. |
1086 | if (!(OpHoisted || DT.dominates(Def: OpInst, BB: FC0PreheaderTarget))) { |
1087 | return false; |
1088 | } |
1089 | } |
1090 | } |
1091 | |
1092 | // PHIs in FC1's header only have FC0 blocks as predecessors. PHIs |
1093 | // cannot be hoisted and should be sunk to the exit of the fused loop. |
1094 | if (isa<PHINode>(Val: I)) |
1095 | return false; |
1096 | |
1097 | // If this isn't a memory inst, hoisting is safe |
1098 | if (!I.mayReadOrWriteMemory()) |
1099 | return true; |
1100 | |
1101 | LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n" ); |
1102 | for (Instruction *NotHoistedInst : NotHoisting) { |
1103 | if (auto D = DI.depends(Src: &I, Dst: NotHoistedInst)) { |
1104 | // Dependency is not read-before-write, write-before-read or |
1105 | // write-before-write |
1106 | if (D->isFlow() || D->isAnti() || D->isOutput()) { |
1107 | LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's " |
1108 | "preheader that is not being hoisted.\n" ); |
1109 | return false; |
1110 | } |
1111 | } |
1112 | } |
1113 | |
1114 | for (Instruction *ReadInst : FC0.MemReads) { |
1115 | if (auto D = DI.depends(Src: ReadInst, Dst: &I)) { |
1116 | // Dependency is not read-before-write |
1117 | if (D->isAnti()) { |
1118 | LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n" ); |
1119 | return false; |
1120 | } |
1121 | } |
1122 | } |
1123 | |
1124 | for (Instruction *WriteInst : FC0.MemWrites) { |
1125 | if (auto D = DI.depends(Src: WriteInst, Dst: &I)) { |
1126 | // Dependency is not write-before-read or write-before-write |
1127 | if (D->isFlow() || D->isOutput()) { |
1128 | LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n" ); |
1129 | return false; |
1130 | } |
1131 | } |
1132 | } |
1133 | return true; |
1134 | } |
1135 | |
1136 | // Returns true if the instruction \p I can be sunk to the top of the exit |
1137 | // block of \p FC1. |
1138 | // TODO: Move functionality into CodeMoverUtils |
1139 | bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const { |
1140 | for (User *U : I.users()) { |
1141 | if (auto *UI{dyn_cast<Instruction>(Val: U)}) { |
1142 | // Cannot sink if user in loop |
1143 | // If FC1 has phi users of this value, we cannot sink it into FC1. |
1144 | if (FC1.L->contains(Inst: UI)) { |
1145 | // Cannot hoist or sink this instruction. No hoisting/sinking |
1146 | // should take place, loops should not fuse |
1147 | return false; |
1148 | } |
1149 | } |
1150 | } |
1151 | |
1152 | // If this isn't a memory inst, sinking is safe |
1153 | if (!I.mayReadOrWriteMemory()) |
1154 | return true; |
1155 | |
1156 | for (Instruction *ReadInst : FC1.MemReads) { |
1157 | if (auto D = DI.depends(Src: &I, Dst: ReadInst)) { |
1158 | // Dependency is not write-before-read |
1159 | if (D->isFlow()) { |
1160 | LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n" ); |
1161 | return false; |
1162 | } |
1163 | } |
1164 | } |
1165 | |
1166 | for (Instruction *WriteInst : FC1.MemWrites) { |
1167 | if (auto D = DI.depends(Src: &I, Dst: WriteInst)) { |
1168 | // Dependency is not write-before-write or read-before-write |
1169 | if (D->isOutput() || D->isAnti()) { |
1170 | LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n" ); |
1171 | return false; |
1172 | } |
1173 | } |
1174 | } |
1175 | |
1176 | return true; |
1177 | } |
1178 | |
1179 | /// Collect instructions in the \p FC1 Preheader that can be hoisted |
1180 | /// to the \p FC0 Preheader or sunk into the \p FC1 Body |
1181 | bool collectMovablePreheaderInsts( |
1182 | const FusionCandidate &FC0, const FusionCandidate &FC1, |
1183 | SmallVector<Instruction *, 4> &SafeToHoist, |
1184 | SmallVector<Instruction *, 4> &SafeToSink) const { |
1185 | BasicBlock * = FC1.Preheader; |
1186 | // Save the instructions that are not being hoisted, so we know not to hoist |
1187 | // mem insts that they dominate. |
1188 | SmallVector<Instruction *, 4> NotHoisting; |
1189 | |
1190 | for (Instruction &I : *FC1Preheader) { |
1191 | // Can't move a branch |
1192 | if (&I == FC1Preheader->getTerminator()) |
1193 | continue; |
1194 | // If the instruction has side-effects, give up. |
1195 | // TODO: The case of mayReadFromMemory we can handle but requires |
1196 | // additional work with a dependence analysis so for now we give |
1197 | // up on memory reads. |
1198 | if (I.mayThrow() || !I.willReturn()) { |
1199 | LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n" ); |
1200 | return false; |
1201 | } |
1202 | |
1203 | LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n" ); |
1204 | |
1205 | if (I.isAtomic() || I.isVolatile()) { |
1206 | LLVM_DEBUG( |
1207 | dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n" ); |
1208 | return false; |
1209 | } |
1210 | |
1211 | if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) { |
1212 | SafeToHoist.push_back(Elt: &I); |
1213 | LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n" ); |
1214 | } else { |
1215 | LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n" ); |
1216 | NotHoisting.push_back(Elt: &I); |
1217 | |
1218 | if (canSinkInst(I, FC1)) { |
1219 | SafeToSink.push_back(Elt: &I); |
1220 | LLVM_DEBUG(dbgs() << "\tSafe to sink.\n" ); |
1221 | } else { |
1222 | LLVM_DEBUG(dbgs() << "\tCould not sink.\n" ); |
1223 | return false; |
1224 | } |
1225 | } |
1226 | } |
1227 | LLVM_DEBUG( |
1228 | dbgs() << "All preheader instructions could be sunk or hoisted!\n" ); |
1229 | return true; |
1230 | } |
1231 | |
1232 | /// Rewrite all additive recurrences in a SCEV to use a new loop. |
1233 | class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> { |
1234 | public: |
1235 | AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL, |
1236 | bool UseMax = true) |
1237 | : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL), |
1238 | NewL(NewL) {} |
1239 | |
1240 | const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { |
1241 | const Loop *ExprL = Expr->getLoop(); |
1242 | SmallVector<const SCEV *, 2> Operands; |
1243 | if (ExprL == &OldL) { |
1244 | append_range(C&: Operands, R: Expr->operands()); |
1245 | return SE.getAddRecExpr(Operands, L: &NewL, Flags: Expr->getNoWrapFlags()); |
1246 | } |
1247 | |
1248 | if (OldL.contains(L: ExprL)) { |
1249 | bool Pos = SE.isKnownPositive(S: Expr->getStepRecurrence(SE)); |
1250 | if (!UseMax || !Pos || !Expr->isAffine()) { |
1251 | Valid = false; |
1252 | return Expr; |
1253 | } |
1254 | return visit(S: Expr->getStart()); |
1255 | } |
1256 | |
1257 | for (const SCEV *Op : Expr->operands()) |
1258 | Operands.push_back(Elt: visit(S: Op)); |
1259 | return SE.getAddRecExpr(Operands, L: ExprL, Flags: Expr->getNoWrapFlags()); |
1260 | } |
1261 | |
1262 | bool wasValidSCEV() const { return Valid; } |
1263 | |
1264 | private: |
1265 | bool Valid, UseMax; |
1266 | const Loop &OldL, &NewL; |
1267 | }; |
1268 | |
1269 | /// Return false if the access functions of \p I0 and \p I1 could cause |
1270 | /// a negative dependence. |
1271 | bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0, |
1272 | Instruction &I1, bool EqualIsInvalid) { |
1273 | Value *Ptr0 = getLoadStorePointerOperand(V: &I0); |
1274 | Value *Ptr1 = getLoadStorePointerOperand(V: &I1); |
1275 | if (!Ptr0 || !Ptr1) |
1276 | return false; |
1277 | |
1278 | const SCEV *SCEVPtr0 = SE.getSCEVAtScope(V: Ptr0, L: &L0); |
1279 | const SCEV *SCEVPtr1 = SE.getSCEVAtScope(V: Ptr1, L: &L1); |
1280 | #ifndef NDEBUG |
1281 | if (VerboseFusionDebugging) |
1282 | LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs " |
1283 | << *SCEVPtr1 << "\n" ); |
1284 | #endif |
1285 | AddRecLoopReplacer Rewriter(SE, L0, L1); |
1286 | SCEVPtr0 = Rewriter.visit(S: SCEVPtr0); |
1287 | #ifndef NDEBUG |
1288 | if (VerboseFusionDebugging) |
1289 | LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0 |
1290 | << " [Valid: " << Rewriter.wasValidSCEV() << "]\n" ); |
1291 | #endif |
1292 | if (!Rewriter.wasValidSCEV()) |
1293 | return false; |
1294 | |
1295 | // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by |
1296 | // L0) and the other is not. We could check if it is monotone and test |
1297 | // the beginning and end value instead. |
1298 | |
1299 | BasicBlock * = L0.getHeader(); |
1300 | auto HasNonLinearDominanceRelation = [&](const SCEV *S) { |
1301 | const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Val: S); |
1302 | if (!AddRec) |
1303 | return false; |
1304 | return !DT.dominates(A: L0Header, B: AddRec->getLoop()->getHeader()) && |
1305 | !DT.dominates(A: AddRec->getLoop()->getHeader(), B: L0Header); |
1306 | }; |
1307 | if (SCEVExprContains(Root: SCEVPtr1, Pred: HasNonLinearDominanceRelation)) |
1308 | return false; |
1309 | |
1310 | ICmpInst::Predicate Pred = |
1311 | EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE; |
1312 | bool IsAlwaysGE = SE.isKnownPredicate(Pred, LHS: SCEVPtr0, RHS: SCEVPtr1); |
1313 | #ifndef NDEBUG |
1314 | if (VerboseFusionDebugging) |
1315 | LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0 |
1316 | << (IsAlwaysGE ? " >= " : " may < " ) << *SCEVPtr1 |
1317 | << "\n" ); |
1318 | #endif |
1319 | return IsAlwaysGE; |
1320 | } |
1321 | |
1322 | /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in |
1323 | /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses |
1324 | /// specified by @p DepChoice are used to determine this. |
1325 | bool dependencesAllowFusion(const FusionCandidate &FC0, |
1326 | const FusionCandidate &FC1, Instruction &I0, |
1327 | Instruction &I1, bool AnyDep, |
1328 | FusionDependenceAnalysisChoice DepChoice) { |
1329 | #ifndef NDEBUG |
1330 | if (VerboseFusionDebugging) { |
1331 | LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : " |
1332 | << DepChoice << "\n" ); |
1333 | } |
1334 | #endif |
1335 | switch (DepChoice) { |
1336 | case FUSION_DEPENDENCE_ANALYSIS_SCEV: |
1337 | return accessDiffIsPositive(L0: *FC0.L, L1: *FC1.L, I0, I1, EqualIsInvalid: AnyDep); |
1338 | case FUSION_DEPENDENCE_ANALYSIS_DA: { |
1339 | auto DepResult = DI.depends(Src: &I0, Dst: &I1); |
1340 | if (!DepResult) |
1341 | return true; |
1342 | #ifndef NDEBUG |
1343 | if (VerboseFusionDebugging) { |
1344 | LLVM_DEBUG(dbgs() << "DA res: " ; DepResult->dump(dbgs()); |
1345 | dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " |
1346 | << (DepResult->isOrdered() ? "true" : "false" ) |
1347 | << "]\n" ); |
1348 | LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() |
1349 | << "\n" ); |
1350 | } |
1351 | #endif |
1352 | |
1353 | if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) |
1354 | LLVM_DEBUG( |
1355 | dbgs() << "TODO: Implement pred/succ dependence handling!\n" ); |
1356 | |
1357 | // TODO: Can we actually use the dependence info analysis here? |
1358 | return false; |
1359 | } |
1360 | |
1361 | case FUSION_DEPENDENCE_ANALYSIS_ALL: |
1362 | return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, |
1363 | DepChoice: FUSION_DEPENDENCE_ANALYSIS_SCEV) || |
1364 | dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, |
1365 | DepChoice: FUSION_DEPENDENCE_ANALYSIS_DA); |
1366 | } |
1367 | |
1368 | llvm_unreachable("Unknown fusion dependence analysis choice!" ); |
1369 | } |
1370 | |
1371 | /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. |
1372 | bool dependencesAllowFusion(const FusionCandidate &FC0, |
1373 | const FusionCandidate &FC1) { |
1374 | LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 |
1375 | << "\n" ); |
1376 | assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); |
1377 | assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); |
1378 | |
1379 | for (Instruction *WriteL0 : FC0.MemWrites) { |
1380 | for (Instruction *WriteL1 : FC1.MemWrites) |
1381 | if (!dependencesAllowFusion(FC0, FC1, I0&: *WriteL0, I1&: *WriteL1, |
1382 | /* AnyDep */ false, |
1383 | DepChoice: FusionDependenceAnalysis)) { |
1384 | InvalidDependencies++; |
1385 | return false; |
1386 | } |
1387 | for (Instruction *ReadL1 : FC1.MemReads) |
1388 | if (!dependencesAllowFusion(FC0, FC1, I0&: *WriteL0, I1&: *ReadL1, |
1389 | /* AnyDep */ false, |
1390 | DepChoice: FusionDependenceAnalysis)) { |
1391 | InvalidDependencies++; |
1392 | return false; |
1393 | } |
1394 | } |
1395 | |
1396 | for (Instruction *WriteL1 : FC1.MemWrites) { |
1397 | for (Instruction *WriteL0 : FC0.MemWrites) |
1398 | if (!dependencesAllowFusion(FC0, FC1, I0&: *WriteL0, I1&: *WriteL1, |
1399 | /* AnyDep */ false, |
1400 | DepChoice: FusionDependenceAnalysis)) { |
1401 | InvalidDependencies++; |
1402 | return false; |
1403 | } |
1404 | for (Instruction *ReadL0 : FC0.MemReads) |
1405 | if (!dependencesAllowFusion(FC0, FC1, I0&: *ReadL0, I1&: *WriteL1, |
1406 | /* AnyDep */ false, |
1407 | DepChoice: FusionDependenceAnalysis)) { |
1408 | InvalidDependencies++; |
1409 | return false; |
1410 | } |
1411 | } |
1412 | |
1413 | // Walk through all uses in FC1. For each use, find the reaching def. If the |
1414 | // def is located in FC0 then it is not safe to fuse. |
1415 | for (BasicBlock *BB : FC1.L->blocks()) |
1416 | for (Instruction &I : *BB) |
1417 | for (auto &Op : I.operands()) |
1418 | if (Instruction *Def = dyn_cast<Instruction>(Val&: Op)) |
1419 | if (FC0.L->contains(BB: Def->getParent())) { |
1420 | InvalidDependencies++; |
1421 | return false; |
1422 | } |
1423 | |
1424 | return true; |
1425 | } |
1426 | |
1427 | /// Determine if two fusion candidates are adjacent in the CFG. |
1428 | /// |
1429 | /// This method will determine if there are additional basic blocks in the CFG |
1430 | /// between the exit of \p FC0 and the entry of \p FC1. |
1431 | /// If the two candidates are guarded loops, then it checks whether the |
1432 | /// non-loop successor of the \p FC0 guard branch is the entry block of \p |
1433 | /// FC1. If not, then the loops are not adjacent. If the two candidates are |
1434 | /// not guarded loops, then it checks whether the exit block of \p FC0 is the |
1435 | /// preheader of \p FC1. |
1436 | bool isAdjacent(const FusionCandidate &FC0, |
1437 | const FusionCandidate &FC1) const { |
1438 | // If the successor of the guard branch is FC1, then the loops are adjacent |
1439 | if (FC0.GuardBranch) |
1440 | return FC0.getNonLoopBlock() == FC1.getEntryBlock(); |
1441 | else |
1442 | return FC0.ExitBlock == FC1.getEntryBlock(); |
1443 | } |
1444 | |
1445 | bool isEmptyPreheader(const FusionCandidate &FC) const { |
1446 | return FC.Preheader->size() == 1; |
1447 | } |
1448 | |
1449 | /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader |
1450 | /// and sink others into the body of \p FC1. |
1451 | void movePreheaderInsts(const FusionCandidate &FC0, |
1452 | const FusionCandidate &FC1, |
1453 | SmallVector<Instruction *, 4> &HoistInsts, |
1454 | SmallVector<Instruction *, 4> &SinkInsts) const { |
1455 | // All preheader instructions except the branch must be hoisted or sunk |
1456 | assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 && |
1457 | "Attempting to sink and hoist preheader instructions, but not all " |
1458 | "the preheader instructions are accounted for." ); |
1459 | |
1460 | NumHoistedInsts += HoistInsts.size(); |
1461 | NumSunkInsts += SinkInsts.size(); |
1462 | |
1463 | LLVM_DEBUG(if (VerboseFusionDebugging) { |
1464 | if (!HoistInsts.empty()) |
1465 | dbgs() << "Hoisting: \n" ; |
1466 | for (Instruction *I : HoistInsts) |
1467 | dbgs() << *I << "\n" ; |
1468 | if (!SinkInsts.empty()) |
1469 | dbgs() << "Sinking: \n" ; |
1470 | for (Instruction *I : SinkInsts) |
1471 | dbgs() << *I << "\n" ; |
1472 | }); |
1473 | |
1474 | for (Instruction *I : HoistInsts) { |
1475 | assert(I->getParent() == FC1.Preheader); |
1476 | I->moveBefore(BB&: *FC0.Preheader, |
1477 | I: FC0.Preheader->getTerminator()->getIterator()); |
1478 | } |
1479 | // insert instructions in reverse order to maintain dominance relationship |
1480 | for (Instruction *I : reverse(C&: SinkInsts)) { |
1481 | assert(I->getParent() == FC1.Preheader); |
1482 | I->moveBefore(BB&: *FC1.ExitBlock, I: FC1.ExitBlock->getFirstInsertionPt()); |
1483 | } |
1484 | } |
1485 | |
1486 | /// Determine if two fusion candidates have identical guards |
1487 | /// |
1488 | /// This method will determine if two fusion candidates have the same guards. |
1489 | /// The guards are considered the same if: |
1490 | /// 1. The instructions to compute the condition used in the compare are |
1491 | /// identical. |
1492 | /// 2. The successors of the guard have the same flow into/around the loop. |
1493 | /// If the compare instructions are identical, then the first successor of the |
1494 | /// guard must go to the same place (either the preheader of the loop or the |
1495 | /// NonLoopBlock). In other words, the first successor of both loops must |
1496 | /// both go into the loop (i.e., the preheader) or go around the loop (i.e., |
1497 | /// the NonLoopBlock). The same must be true for the second successor. |
1498 | bool haveIdenticalGuards(const FusionCandidate &FC0, |
1499 | const FusionCandidate &FC1) const { |
1500 | assert(FC0.GuardBranch && FC1.GuardBranch && |
1501 | "Expecting FC0 and FC1 to be guarded loops." ); |
1502 | |
1503 | if (auto FC0CmpInst = |
1504 | dyn_cast<Instruction>(Val: FC0.GuardBranch->getCondition())) |
1505 | if (auto FC1CmpInst = |
1506 | dyn_cast<Instruction>(Val: FC1.GuardBranch->getCondition())) |
1507 | if (!FC0CmpInst->isIdenticalTo(I: FC1CmpInst)) |
1508 | return false; |
1509 | |
1510 | // The compare instructions are identical. |
1511 | // Now make sure the successor of the guards have the same flow into/around |
1512 | // the loop |
1513 | if (FC0.GuardBranch->getSuccessor(i: 0) == FC0.Preheader) |
1514 | return (FC1.GuardBranch->getSuccessor(i: 0) == FC1.Preheader); |
1515 | else |
1516 | return (FC1.GuardBranch->getSuccessor(i: 1) == FC1.Preheader); |
1517 | } |
1518 | |
1519 | /// Modify the latch branch of FC to be unconditional since successors of the |
1520 | /// branch are the same. |
1521 | void simplifyLatchBranch(const FusionCandidate &FC) const { |
1522 | BranchInst *FCLatchBranch = dyn_cast<BranchInst>(Val: FC.Latch->getTerminator()); |
1523 | if (FCLatchBranch) { |
1524 | assert(FCLatchBranch->isConditional() && |
1525 | FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && |
1526 | "Expecting the two successors of FCLatchBranch to be the same" ); |
1527 | BranchInst *NewBranch = |
1528 | BranchInst::Create(IfTrue: FCLatchBranch->getSuccessor(i: 0)); |
1529 | ReplaceInstWithInst(From: FCLatchBranch, To: NewBranch); |
1530 | } |
1531 | } |
1532 | |
1533 | /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique |
1534 | /// successor, then merge FC0.Latch with its unique successor. |
1535 | void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { |
1536 | moveInstructionsToTheBeginning(FromBB&: *FC0.Latch, ToBB&: *FC1.Latch, DT, PDT, DI); |
1537 | if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { |
1538 | MergeBlockIntoPredecessor(BB: Succ, DTU: &DTU, LI: &LI); |
1539 | DTU.flush(); |
1540 | } |
1541 | } |
1542 | |
1543 | /// Fuse two fusion candidates, creating a new fused loop. |
1544 | /// |
1545 | /// This method contains the mechanics of fusing two loops, represented by \p |
1546 | /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1 |
1547 | /// postdominates \p FC0 (making them control flow equivalent). It also |
1548 | /// assumes that the other conditions for fusion have been met: adjacent, |
1549 | /// identical trip counts, and no negative distance dependencies exist that |
1550 | /// would prevent fusion. Thus, there is no checking for these conditions in |
1551 | /// this method. |
1552 | /// |
1553 | /// Fusion is performed by rewiring the CFG to update successor blocks of the |
1554 | /// components of tho loop. Specifically, the following changes are done: |
1555 | /// |
1556 | /// 1. The preheader of \p FC1 is removed as it is no longer necessary |
1557 | /// (because it is currently only a single statement block). |
1558 | /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1. |
1559 | /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0. |
1560 | /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0. |
1561 | /// |
1562 | /// All of these modifications are done with dominator tree updates, thus |
1563 | /// keeping the dominator (and post dominator) information up-to-date. |
1564 | /// |
1565 | /// This can be improved in the future by actually merging blocks during |
1566 | /// fusion. For example, the preheader of \p FC1 can be merged with the |
1567 | /// preheader of \p FC0. This would allow loops with more than a single |
1568 | /// statement in the preheader to be fused. Similarly, the latch blocks of the |
1569 | /// two loops could also be fused into a single block. This will require |
1570 | /// analysis to prove it is safe to move the contents of the block past |
1571 | /// existing code, which currently has not been implemented. |
1572 | Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) { |
1573 | assert(FC0.isValid() && FC1.isValid() && |
1574 | "Expecting valid fusion candidates" ); |
1575 | |
1576 | LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n" ; FC0.dump(); |
1577 | dbgs() << "Fusion Candidate 1: \n" ; FC1.dump();); |
1578 | |
1579 | // Move instructions from the preheader of FC1 to the end of the preheader |
1580 | // of FC0. |
1581 | moveInstructionsToTheEnd(FromBB&: *FC1.Preheader, ToBB&: *FC0.Preheader, DT, PDT, DI); |
1582 | |
1583 | // Fusing guarded loops is handled slightly differently than non-guarded |
1584 | // loops and has been broken out into a separate method instead of trying to |
1585 | // intersperse the logic within a single method. |
1586 | if (FC0.GuardBranch) |
1587 | return fuseGuardedLoops(FC0, FC1); |
1588 | |
1589 | assert(FC1.Preheader == |
1590 | (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock)); |
1591 | assert(FC1.Preheader->size() == 1 && |
1592 | FC1.Preheader->getSingleSuccessor() == FC1.Header); |
1593 | |
1594 | // Remember the phi nodes originally in the header of FC0 in order to rewire |
1595 | // them later. However, this is only necessary if the new loop carried |
1596 | // values might not dominate the exiting branch. While we do not generally |
1597 | // test if this is the case but simply insert intermediate phi nodes, we |
1598 | // need to make sure these intermediate phi nodes have different |
1599 | // predecessors. To this end, we filter the special case where the exiting |
1600 | // block is the latch block of the first loop. Nothing needs to be done |
1601 | // anyway as all loop carried values dominate the latch and thereby also the |
1602 | // exiting branch. |
1603 | SmallVector<PHINode *, 8> OriginalFC0PHIs; |
1604 | if (FC0.ExitingBlock != FC0.Latch) |
1605 | for (PHINode &PHI : FC0.Header->phis()) |
1606 | OriginalFC0PHIs.push_back(Elt: &PHI); |
1607 | |
1608 | // Replace incoming blocks for header PHIs first. |
1609 | FC1.Preheader->replaceSuccessorsPhiUsesWith(New: FC0.Preheader); |
1610 | FC0.Latch->replaceSuccessorsPhiUsesWith(New: FC1.Latch); |
1611 | |
1612 | // Then modify the control flow and update DT and PDT. |
1613 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
1614 | |
1615 | // The old exiting block of the first loop (FC0) has to jump to the header |
1616 | // of the second as we need to execute the code in the second header block |
1617 | // regardless of the trip count. That is, if the trip count is 0, so the |
1618 | // back edge is never taken, we still have to execute both loop headers, |
1619 | // especially (but not only!) if the second is a do-while style loop. |
1620 | // However, doing so might invalidate the phi nodes of the first loop as |
1621 | // the new values do only need to dominate their latch and not the exiting |
1622 | // predicate. To remedy this potential problem we always introduce phi |
1623 | // nodes in the header of the second loop later that select the loop carried |
1624 | // value, if the second header was reached through an old latch of the |
1625 | // first, or undef otherwise. This is sound as exiting the first implies the |
1626 | // second will exit too, __without__ taking the back-edge. [Their |
1627 | // trip-counts are equal after all. |
1628 | // KB: Would this sequence be simpler to just make FC0.ExitingBlock go |
1629 | // to FC1.Header? I think this is basically what the three sequences are |
1630 | // trying to accomplish; however, doing this directly in the CFG may mean |
1631 | // the DT/PDT becomes invalid |
1632 | if (!FC0.Peeled) { |
1633 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC1.Preheader, |
1634 | To: FC1.Header); |
1635 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1636 | DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); |
1637 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1638 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
1639 | } else { |
1640 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1641 | DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader)); |
1642 | |
1643 | // Remove the ExitBlock of the first Loop (also not needed) |
1644 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC0.ExitBlock, |
1645 | To: FC1.Header); |
1646 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1647 | DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); |
1648 | FC0.ExitBlock->getTerminator()->eraseFromParent(); |
1649 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1650 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
1651 | new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); |
1652 | } |
1653 | |
1654 | // The pre-header of L1 is not necessary anymore. |
1655 | assert(pred_empty(FC1.Preheader)); |
1656 | FC1.Preheader->getTerminator()->eraseFromParent(); |
1657 | new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); |
1658 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1659 | DominatorTree::Delete, FC1.Preheader, FC1.Header)); |
1660 | |
1661 | // Moves the phi nodes from the second to the first loops header block. |
1662 | while (PHINode *PHI = dyn_cast<PHINode>(Val: &FC1.Header->front())) { |
1663 | if (SE.isSCEVable(Ty: PHI->getType())) |
1664 | SE.forgetValue(V: PHI); |
1665 | if (PHI->hasNUsesOrMore(N: 1)) |
1666 | PHI->moveBefore(InsertPos: FC0.Header->getFirstInsertionPt()); |
1667 | else |
1668 | PHI->eraseFromParent(); |
1669 | } |
1670 | |
1671 | // Introduce new phi nodes in the second loop header to ensure |
1672 | // exiting the first and jumping to the header of the second does not break |
1673 | // the SSA property of the phis originally in the first loop. See also the |
1674 | // comment above. |
1675 | BasicBlock::iterator = FC1.Header->begin(); |
1676 | for (PHINode *LCPHI : OriginalFC0PHIs) { |
1677 | int L1LatchBBIdx = LCPHI->getBasicBlockIndex(BB: FC1.Latch); |
1678 | assert(L1LatchBBIdx >= 0 && |
1679 | "Expected loop carried value to be rewired at this point!" ); |
1680 | |
1681 | Value *LCV = LCPHI->getIncomingValue(i: L1LatchBBIdx); |
1682 | |
1683 | PHINode * = |
1684 | PHINode::Create(Ty: LCV->getType(), NumReservedValues: 2, NameStr: LCPHI->getName() + ".afterFC0" ); |
1685 | L1HeaderPHI->insertBefore(InsertPos: L1HeaderIP); |
1686 | L1HeaderPHI->addIncoming(V: LCV, BB: FC0.Latch); |
1687 | L1HeaderPHI->addIncoming(V: PoisonValue::get(T: LCV->getType()), |
1688 | BB: FC0.ExitingBlock); |
1689 | |
1690 | LCPHI->setIncomingValue(i: L1LatchBBIdx, V: L1HeaderPHI); |
1691 | } |
1692 | |
1693 | // Replace latch terminator destinations. |
1694 | FC0.Latch->getTerminator()->replaceUsesOfWith(From: FC0.Header, To: FC1.Header); |
1695 | FC1.Latch->getTerminator()->replaceUsesOfWith(From: FC1.Header, To: FC0.Header); |
1696 | |
1697 | // Modify the latch branch of FC0 to be unconditional as both successors of |
1698 | // the branch are the same. |
1699 | simplifyLatchBranch(FC: FC0); |
1700 | |
1701 | // If FC0.Latch and FC0.ExitingBlock are the same then we have already |
1702 | // performed the updates above. |
1703 | if (FC0.Latch != FC0.ExitingBlock) |
1704 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1705 | DominatorTree::Insert, FC0.Latch, FC1.Header)); |
1706 | |
1707 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
1708 | FC0.Latch, FC0.Header)); |
1709 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Insert, |
1710 | FC1.Latch, FC0.Header)); |
1711 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
1712 | FC1.Latch, FC1.Header)); |
1713 | |
1714 | // Update DT/PDT |
1715 | DTU.applyUpdates(Updates: TreeUpdates); |
1716 | |
1717 | LI.removeBlock(BB: FC1.Preheader); |
1718 | DTU.deleteBB(DelBB: FC1.Preheader); |
1719 | if (FC0.Peeled) { |
1720 | LI.removeBlock(BB: FC0.ExitBlock); |
1721 | DTU.deleteBB(DelBB: FC0.ExitBlock); |
1722 | } |
1723 | |
1724 | DTU.flush(); |
1725 | |
1726 | // Is there a way to keep SE up-to-date so we don't need to forget the loops |
1727 | // and rebuild the information in subsequent passes of fusion? |
1728 | // Note: Need to forget the loops before merging the loop latches, as |
1729 | // mergeLatch may remove the only block in FC1. |
1730 | SE.forgetLoop(L: FC1.L); |
1731 | SE.forgetLoop(L: FC0.L); |
1732 | // Forget block dispositions as well, so that there are no dangling |
1733 | // pointers to erased/free'ed blocks. |
1734 | SE.forgetBlockAndLoopDispositions(); |
1735 | |
1736 | // Move instructions from FC0.Latch to FC1.Latch. |
1737 | // Note: mergeLatch requires an updated DT. |
1738 | mergeLatch(FC0, FC1); |
1739 | |
1740 | // Merge the loops. |
1741 | SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); |
1742 | for (BasicBlock *BB : Blocks) { |
1743 | FC0.L->addBlockEntry(BB); |
1744 | FC1.L->removeBlockFromLoop(BB); |
1745 | if (LI.getLoopFor(BB) != FC1.L) |
1746 | continue; |
1747 | LI.changeLoopFor(BB, L: FC0.L); |
1748 | } |
1749 | while (!FC1.L->isInnermost()) { |
1750 | const auto &ChildLoopIt = FC1.L->begin(); |
1751 | Loop *ChildLoop = *ChildLoopIt; |
1752 | FC1.L->removeChildLoop(I: ChildLoopIt); |
1753 | FC0.L->addChildLoop(NewChild: ChildLoop); |
1754 | } |
1755 | |
1756 | // Delete the now empty loop L1. |
1757 | LI.erase(L: FC1.L); |
1758 | |
1759 | #ifndef NDEBUG |
1760 | assert(!verifyFunction(*FC0.Header->getParent(), &errs())); |
1761 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
1762 | assert(PDT.verify()); |
1763 | LI.verify(DT); |
1764 | SE.verify(); |
1765 | #endif |
1766 | |
1767 | LLVM_DEBUG(dbgs() << "Fusion done:\n" ); |
1768 | |
1769 | return FC0.L; |
1770 | } |
1771 | |
1772 | /// Report details on loop fusion opportunities. |
1773 | /// |
1774 | /// This template function can be used to report both successful and missed |
1775 | /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should |
1776 | /// be one of: |
1777 | /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful |
1778 | /// given two valid fusion candidates. |
1779 | /// - OptimizationRemark to report successful fusion of two fusion |
1780 | /// candidates. |
1781 | /// The remarks will be printed using the form: |
1782 | /// <path/filename>:<line number>:<column number>: [<function name>]: |
1783 | /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> |
1784 | template <typename RemarkKind> |
1785 | void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, |
1786 | llvm::Statistic &Stat) { |
1787 | assert(FC0.Preheader && FC1.Preheader && |
1788 | "Expecting valid fusion candidates" ); |
1789 | using namespace ore; |
1790 | #if LLVM_ENABLE_STATS |
1791 | ++Stat; |
1792 | ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), |
1793 | FC0.Preheader) |
1794 | << "[" << FC0.Preheader->getParent()->getName() |
1795 | << "]: " << NV("Cand1" , StringRef(FC0.Preheader->getName())) |
1796 | << " and " << NV("Cand2" , StringRef(FC1.Preheader->getName())) |
1797 | << ": " << Stat.getDesc()); |
1798 | #endif |
1799 | } |
1800 | |
1801 | /// Fuse two guarded fusion candidates, creating a new fused loop. |
1802 | /// |
1803 | /// Fusing guarded loops is handled much the same way as fusing non-guarded |
1804 | /// loops. The rewiring of the CFG is slightly different though, because of |
1805 | /// the presence of the guards around the loops and the exit blocks after the |
1806 | /// loop body. As such, the new loop is rewired as follows: |
1807 | /// 1. Keep the guard branch from FC0 and use the non-loop block target |
1808 | /// from the FC1 guard branch. |
1809 | /// 2. Remove the exit block from FC0 (this exit block should be empty |
1810 | /// right now). |
1811 | /// 3. Remove the guard branch for FC1 |
1812 | /// 4. Remove the preheader for FC1. |
1813 | /// The exit block successor for the latch of FC0 is updated to be the header |
1814 | /// of FC1 and the non-exit block successor of the latch of FC1 is updated to |
1815 | /// be the header of FC0, thus creating the fused loop. |
1816 | Loop *fuseGuardedLoops(const FusionCandidate &FC0, |
1817 | const FusionCandidate &FC1) { |
1818 | assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops" ); |
1819 | |
1820 | BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); |
1821 | BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); |
1822 | BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); |
1823 | BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); |
1824 | BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor(); |
1825 | |
1826 | // Move instructions from the exit block of FC0 to the beginning of the exit |
1827 | // block of FC1, in the case that the FC0 loop has not been peeled. In the |
1828 | // case that FC0 loop is peeled, then move the instructions of the successor |
1829 | // of the FC0 Exit block to the beginning of the exit block of FC1. |
1830 | moveInstructionsToTheBeginning( |
1831 | FromBB&: (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), ToBB&: *FC1.ExitBlock, |
1832 | DT, PDT, DI); |
1833 | |
1834 | // Move instructions from the guard block of FC1 to the end of the guard |
1835 | // block of FC0. |
1836 | moveInstructionsToTheEnd(FromBB&: *FC1GuardBlock, ToBB&: *FC0GuardBlock, DT, PDT, DI); |
1837 | |
1838 | assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent" ); |
1839 | |
1840 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
1841 | |
1842 | //////////////////////////////////////////////////////////////////////////// |
1843 | // Update the Loop Guard |
1844 | //////////////////////////////////////////////////////////////////////////// |
1845 | // The guard for FC0 is updated to guard both FC0 and FC1. This is done by |
1846 | // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. |
1847 | // Thus, one path from the guard goes to the preheader for FC0 (and thus |
1848 | // executes the new fused loop) and the other path goes to the NonLoopBlock |
1849 | // for FC1 (where FC1 guard would have gone if FC1 was not executed). |
1850 | FC1NonLoopBlock->replacePhiUsesWith(Old: FC1GuardBlock, New: FC0GuardBlock); |
1851 | FC0.GuardBranch->replaceUsesOfWith(From: FC0NonLoopBlock, To: FC1NonLoopBlock); |
1852 | |
1853 | BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock; |
1854 | BBToUpdate->getTerminator()->replaceUsesOfWith(From: FC1GuardBlock, To: FC1.Header); |
1855 | |
1856 | // The guard of FC1 is not necessary anymore. |
1857 | FC1.GuardBranch->eraseFromParent(); |
1858 | new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); |
1859 | |
1860 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1861 | DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); |
1862 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1863 | DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); |
1864 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1865 | DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); |
1866 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1867 | DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); |
1868 | |
1869 | if (FC0.Peeled) { |
1870 | // Remove the Block after the ExitBlock of FC0 |
1871 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1872 | DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock)); |
1873 | FC0ExitBlockSuccessor->getTerminator()->eraseFromParent(); |
1874 | new UnreachableInst(FC0ExitBlockSuccessor->getContext(), |
1875 | FC0ExitBlockSuccessor); |
1876 | } |
1877 | |
1878 | assert(pred_empty(FC1GuardBlock) && |
1879 | "Expecting guard block to have no predecessors" ); |
1880 | assert(succ_empty(FC1GuardBlock) && |
1881 | "Expecting guard block to have no successors" ); |
1882 | |
1883 | // Remember the phi nodes originally in the header of FC0 in order to rewire |
1884 | // them later. However, this is only necessary if the new loop carried |
1885 | // values might not dominate the exiting branch. While we do not generally |
1886 | // test if this is the case but simply insert intermediate phi nodes, we |
1887 | // need to make sure these intermediate phi nodes have different |
1888 | // predecessors. To this end, we filter the special case where the exiting |
1889 | // block is the latch block of the first loop. Nothing needs to be done |
1890 | // anyway as all loop carried values dominate the latch and thereby also the |
1891 | // exiting branch. |
1892 | // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch |
1893 | // (because the loops are rotated. Thus, nothing will ever be added to |
1894 | // OriginalFC0PHIs. |
1895 | SmallVector<PHINode *, 8> OriginalFC0PHIs; |
1896 | if (FC0.ExitingBlock != FC0.Latch) |
1897 | for (PHINode &PHI : FC0.Header->phis()) |
1898 | OriginalFC0PHIs.push_back(Elt: &PHI); |
1899 | |
1900 | assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!" ); |
1901 | |
1902 | // Replace incoming blocks for header PHIs first. |
1903 | FC1.Preheader->replaceSuccessorsPhiUsesWith(New: FC0.Preheader); |
1904 | FC0.Latch->replaceSuccessorsPhiUsesWith(New: FC1.Latch); |
1905 | |
1906 | // The old exiting block of the first loop (FC0) has to jump to the header |
1907 | // of the second as we need to execute the code in the second header block |
1908 | // regardless of the trip count. That is, if the trip count is 0, so the |
1909 | // back edge is never taken, we still have to execute both loop headers, |
1910 | // especially (but not only!) if the second is a do-while style loop. |
1911 | // However, doing so might invalidate the phi nodes of the first loop as |
1912 | // the new values do only need to dominate their latch and not the exiting |
1913 | // predicate. To remedy this potential problem we always introduce phi |
1914 | // nodes in the header of the second loop later that select the loop carried |
1915 | // value, if the second header was reached through an old latch of the |
1916 | // first, or undef otherwise. This is sound as exiting the first implies the |
1917 | // second will exit too, __without__ taking the back-edge (their |
1918 | // trip-counts are equal after all). |
1919 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC0.ExitBlock, |
1920 | To: FC1.Header); |
1921 | |
1922 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1923 | DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); |
1924 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1925 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
1926 | |
1927 | // Remove FC0 Exit Block |
1928 | // The exit block for FC0 is no longer needed since control will flow |
1929 | // directly to the header of FC1. Since it is an empty block, it can be |
1930 | // removed at this point. |
1931 | // TODO: In the future, we can handle non-empty exit blocks my merging any |
1932 | // instructions from FC0 exit block into FC1 exit block prior to removing |
1933 | // the block. |
1934 | assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty" ); |
1935 | FC0.ExitBlock->getTerminator()->eraseFromParent(); |
1936 | new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); |
1937 | |
1938 | // Remove FC1 Preheader |
1939 | // The pre-header of L1 is not necessary anymore. |
1940 | assert(pred_empty(FC1.Preheader)); |
1941 | FC1.Preheader->getTerminator()->eraseFromParent(); |
1942 | new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); |
1943 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1944 | DominatorTree::Delete, FC1.Preheader, FC1.Header)); |
1945 | |
1946 | // Moves the phi nodes from the second to the first loops header block. |
1947 | while (PHINode *PHI = dyn_cast<PHINode>(Val: &FC1.Header->front())) { |
1948 | if (SE.isSCEVable(Ty: PHI->getType())) |
1949 | SE.forgetValue(V: PHI); |
1950 | if (PHI->hasNUsesOrMore(N: 1)) |
1951 | PHI->moveBefore(InsertPos: FC0.Header->getFirstInsertionPt()); |
1952 | else |
1953 | PHI->eraseFromParent(); |
1954 | } |
1955 | |
1956 | // Introduce new phi nodes in the second loop header to ensure |
1957 | // exiting the first and jumping to the header of the second does not break |
1958 | // the SSA property of the phis originally in the first loop. See also the |
1959 | // comment above. |
1960 | BasicBlock::iterator = FC1.Header->begin(); |
1961 | for (PHINode *LCPHI : OriginalFC0PHIs) { |
1962 | int L1LatchBBIdx = LCPHI->getBasicBlockIndex(BB: FC1.Latch); |
1963 | assert(L1LatchBBIdx >= 0 && |
1964 | "Expected loop carried value to be rewired at this point!" ); |
1965 | |
1966 | Value *LCV = LCPHI->getIncomingValue(i: L1LatchBBIdx); |
1967 | |
1968 | PHINode * = |
1969 | PHINode::Create(Ty: LCV->getType(), NumReservedValues: 2, NameStr: LCPHI->getName() + ".afterFC0" ); |
1970 | L1HeaderPHI->insertBefore(InsertPos: L1HeaderIP); |
1971 | L1HeaderPHI->addIncoming(V: LCV, BB: FC0.Latch); |
1972 | L1HeaderPHI->addIncoming(V: PoisonValue::get(T: LCV->getType()), |
1973 | BB: FC0.ExitingBlock); |
1974 | |
1975 | LCPHI->setIncomingValue(i: L1LatchBBIdx, V: L1HeaderPHI); |
1976 | } |
1977 | |
1978 | // Update the latches |
1979 | |
1980 | // Replace latch terminator destinations. |
1981 | FC0.Latch->getTerminator()->replaceUsesOfWith(From: FC0.Header, To: FC1.Header); |
1982 | FC1.Latch->getTerminator()->replaceUsesOfWith(From: FC1.Header, To: FC0.Header); |
1983 | |
1984 | // Modify the latch branch of FC0 to be unconditional as both successors of |
1985 | // the branch are the same. |
1986 | simplifyLatchBranch(FC: FC0); |
1987 | |
1988 | // If FC0.Latch and FC0.ExitingBlock are the same then we have already |
1989 | // performed the updates above. |
1990 | if (FC0.Latch != FC0.ExitingBlock) |
1991 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType( |
1992 | DominatorTree::Insert, FC0.Latch, FC1.Header)); |
1993 | |
1994 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
1995 | FC0.Latch, FC0.Header)); |
1996 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Insert, |
1997 | FC1.Latch, FC0.Header)); |
1998 | TreeUpdates.emplace_back(Args: DominatorTree::UpdateType(DominatorTree::Delete, |
1999 | FC1.Latch, FC1.Header)); |
2000 | |
2001 | // All done |
2002 | // Apply the updates to the Dominator Tree and cleanup. |
2003 | |
2004 | assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!" ); |
2005 | assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!" ); |
2006 | |
2007 | // Update DT/PDT |
2008 | DTU.applyUpdates(Updates: TreeUpdates); |
2009 | |
2010 | LI.removeBlock(BB: FC1GuardBlock); |
2011 | LI.removeBlock(BB: FC1.Preheader); |
2012 | LI.removeBlock(BB: FC0.ExitBlock); |
2013 | if (FC0.Peeled) { |
2014 | LI.removeBlock(BB: FC0ExitBlockSuccessor); |
2015 | DTU.deleteBB(DelBB: FC0ExitBlockSuccessor); |
2016 | } |
2017 | DTU.deleteBB(DelBB: FC1GuardBlock); |
2018 | DTU.deleteBB(DelBB: FC1.Preheader); |
2019 | DTU.deleteBB(DelBB: FC0.ExitBlock); |
2020 | DTU.flush(); |
2021 | |
2022 | // Is there a way to keep SE up-to-date so we don't need to forget the loops |
2023 | // and rebuild the information in subsequent passes of fusion? |
2024 | // Note: Need to forget the loops before merging the loop latches, as |
2025 | // mergeLatch may remove the only block in FC1. |
2026 | SE.forgetLoop(L: FC1.L); |
2027 | SE.forgetLoop(L: FC0.L); |
2028 | // Forget block dispositions as well, so that there are no dangling |
2029 | // pointers to erased/free'ed blocks. |
2030 | SE.forgetBlockAndLoopDispositions(); |
2031 | |
2032 | // Move instructions from FC0.Latch to FC1.Latch. |
2033 | // Note: mergeLatch requires an updated DT. |
2034 | mergeLatch(FC0, FC1); |
2035 | |
2036 | // Merge the loops. |
2037 | SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); |
2038 | for (BasicBlock *BB : Blocks) { |
2039 | FC0.L->addBlockEntry(BB); |
2040 | FC1.L->removeBlockFromLoop(BB); |
2041 | if (LI.getLoopFor(BB) != FC1.L) |
2042 | continue; |
2043 | LI.changeLoopFor(BB, L: FC0.L); |
2044 | } |
2045 | while (!FC1.L->isInnermost()) { |
2046 | const auto &ChildLoopIt = FC1.L->begin(); |
2047 | Loop *ChildLoop = *ChildLoopIt; |
2048 | FC1.L->removeChildLoop(I: ChildLoopIt); |
2049 | FC0.L->addChildLoop(NewChild: ChildLoop); |
2050 | } |
2051 | |
2052 | // Delete the now empty loop L1. |
2053 | LI.erase(L: FC1.L); |
2054 | |
2055 | #ifndef NDEBUG |
2056 | assert(!verifyFunction(*FC0.Header->getParent(), &errs())); |
2057 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
2058 | assert(PDT.verify()); |
2059 | LI.verify(DT); |
2060 | SE.verify(); |
2061 | #endif |
2062 | |
2063 | LLVM_DEBUG(dbgs() << "Fusion done:\n" ); |
2064 | |
2065 | return FC0.L; |
2066 | } |
2067 | }; |
2068 | } // namespace |
2069 | |
2070 | PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { |
2071 | auto &LI = AM.getResult<LoopAnalysis>(IR&: F); |
2072 | auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
2073 | auto &DI = AM.getResult<DependenceAnalysis>(IR&: F); |
2074 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
2075 | auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(IR&: F); |
2076 | auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
2077 | auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F); |
2078 | const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
2079 | const DataLayout &DL = F.getDataLayout(); |
2080 | |
2081 | // Ensure loops are in simplifed form which is a pre-requisite for loop fusion |
2082 | // pass. Added only for new PM since the legacy PM has already added |
2083 | // LoopSimplify pass as a dependency. |
2084 | bool Changed = false; |
2085 | for (auto &L : LI) { |
2086 | Changed |= |
2087 | simplifyLoop(L, DT: &DT, LI: &LI, SE: &SE, AC: &AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */); |
2088 | } |
2089 | if (Changed) |
2090 | PDT.recalculate(Func&: F); |
2091 | |
2092 | LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); |
2093 | Changed |= LF.fuseLoops(F); |
2094 | if (!Changed) |
2095 | return PreservedAnalyses::all(); |
2096 | |
2097 | PreservedAnalyses PA; |
2098 | PA.preserve<DominatorTreeAnalysis>(); |
2099 | PA.preserve<PostDominatorTreeAnalysis>(); |
2100 | PA.preserve<ScalarEvolutionAnalysis>(); |
2101 | PA.preserve<LoopAnalysis>(); |
2102 | return PA; |
2103 | } |
2104 | |