1 | //===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements some loop unrolling utilities. It does not define any |
10 | // actual pass or policy, but provides a single function to perform loop |
11 | // unrolling. |
12 | // |
13 | // The process of unrolling can produce extraneous basic blocks linked with |
14 | // unconditional branches. This will be corrected in the future. |
15 | // |
16 | //===----------------------------------------------------------------------===// |
17 | |
18 | #include "llvm/ADT/ArrayRef.h" |
19 | #include "llvm/ADT/DenseMap.h" |
20 | #include "llvm/ADT/STLExtras.h" |
21 | #include "llvm/ADT/ScopedHashTable.h" |
22 | #include "llvm/ADT/SetVector.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/ADT/Statistic.h" |
25 | #include "llvm/ADT/StringRef.h" |
26 | #include "llvm/ADT/Twine.h" |
27 | #include "llvm/ADT/ilist_iterator.h" |
28 | #include "llvm/Analysis/AliasAnalysis.h" |
29 | #include "llvm/Analysis/AssumptionCache.h" |
30 | #include "llvm/Analysis/DomTreeUpdater.h" |
31 | #include "llvm/Analysis/InstructionSimplify.h" |
32 | #include "llvm/Analysis/LoopInfo.h" |
33 | #include "llvm/Analysis/LoopIterator.h" |
34 | #include "llvm/Analysis/MemorySSA.h" |
35 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
36 | #include "llvm/Analysis/ScalarEvolution.h" |
37 | #include "llvm/IR/BasicBlock.h" |
38 | #include "llvm/IR/CFG.h" |
39 | #include "llvm/IR/Constants.h" |
40 | #include "llvm/IR/DebugInfoMetadata.h" |
41 | #include "llvm/IR/DebugLoc.h" |
42 | #include "llvm/IR/DiagnosticInfo.h" |
43 | #include "llvm/IR/Dominators.h" |
44 | #include "llvm/IR/Function.h" |
45 | #include "llvm/IR/Instruction.h" |
46 | #include "llvm/IR/Instructions.h" |
47 | #include "llvm/IR/IntrinsicInst.h" |
48 | #include "llvm/IR/Metadata.h" |
49 | #include "llvm/IR/Module.h" |
50 | #include "llvm/IR/PatternMatch.h" |
51 | #include "llvm/IR/Use.h" |
52 | #include "llvm/IR/User.h" |
53 | #include "llvm/IR/ValueHandle.h" |
54 | #include "llvm/IR/ValueMap.h" |
55 | #include "llvm/Support/Casting.h" |
56 | #include "llvm/Support/CommandLine.h" |
57 | #include "llvm/Support/Debug.h" |
58 | #include "llvm/Support/GenericDomTree.h" |
59 | #include "llvm/Support/MathExtras.h" |
60 | #include "llvm/Support/raw_ostream.h" |
61 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
62 | #include "llvm/Transforms/Utils/Cloning.h" |
63 | #include "llvm/Transforms/Utils/Local.h" |
64 | #include "llvm/Transforms/Utils/LoopSimplify.h" |
65 | #include "llvm/Transforms/Utils/LoopUtils.h" |
66 | #include "llvm/Transforms/Utils/SimplifyIndVar.h" |
67 | #include "llvm/Transforms/Utils/UnrollLoop.h" |
68 | #include "llvm/Transforms/Utils/ValueMapper.h" |
69 | #include <algorithm> |
70 | #include <assert.h> |
71 | #include <numeric> |
72 | #include <type_traits> |
73 | #include <vector> |
74 | |
75 | namespace llvm { |
76 | class DataLayout; |
77 | class Value; |
78 | } // namespace llvm |
79 | |
80 | using namespace llvm; |
81 | |
82 | #define DEBUG_TYPE "loop-unroll" |
83 | |
84 | // TODO: Should these be here or in LoopUnroll? |
85 | STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled" ); |
86 | STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)" ); |
87 | STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional " |
88 | "latch (completely or otherwise)" ); |
89 | |
90 | static cl::opt<bool> |
91 | UnrollRuntimeEpilog("unroll-runtime-epilog" , cl::init(Val: false), cl::Hidden, |
92 | cl::desc("Allow runtime unrolled loops to be unrolled " |
93 | "with epilog instead of prolog." )); |
94 | |
95 | static cl::opt<bool> |
96 | UnrollVerifyDomtree("unroll-verify-domtree" , cl::Hidden, |
97 | cl::desc("Verify domtree after unrolling" ), |
98 | #ifdef EXPENSIVE_CHECKS |
99 | cl::init(true) |
100 | #else |
101 | cl::init(Val: false) |
102 | #endif |
103 | ); |
104 | |
105 | static cl::opt<bool> |
106 | UnrollVerifyLoopInfo("unroll-verify-loopinfo" , cl::Hidden, |
107 | cl::desc("Verify loopinfo after unrolling" ), |
108 | #ifdef EXPENSIVE_CHECKS |
109 | cl::init(true) |
110 | #else |
111 | cl::init(Val: false) |
112 | #endif |
113 | ); |
114 | |
115 | |
116 | /// Check if unrolling created a situation where we need to insert phi nodes to |
117 | /// preserve LCSSA form. |
118 | /// \param Blocks is a vector of basic blocks representing unrolled loop. |
119 | /// \param L is the outer loop. |
120 | /// It's possible that some of the blocks are in L, and some are not. In this |
121 | /// case, if there is a use is outside L, and definition is inside L, we need to |
122 | /// insert a phi-node, otherwise LCSSA will be broken. |
123 | /// The function is just a helper function for llvm::UnrollLoop that returns |
124 | /// true if this situation occurs, indicating that LCSSA needs to be fixed. |
125 | static bool needToInsertPhisForLCSSA(Loop *L, |
126 | const std::vector<BasicBlock *> &Blocks, |
127 | LoopInfo *LI) { |
128 | for (BasicBlock *BB : Blocks) { |
129 | if (LI->getLoopFor(BB) == L) |
130 | continue; |
131 | for (Instruction &I : *BB) { |
132 | for (Use &U : I.operands()) { |
133 | if (const auto *Def = dyn_cast<Instruction>(Val&: U)) { |
134 | Loop *DefLoop = LI->getLoopFor(BB: Def->getParent()); |
135 | if (!DefLoop) |
136 | continue; |
137 | if (DefLoop->contains(L)) |
138 | return true; |
139 | } |
140 | } |
141 | } |
142 | } |
143 | return false; |
144 | } |
145 | |
146 | /// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary |
147 | /// and adds a mapping from the original loop to the new loop to NewLoops. |
148 | /// Returns nullptr if no new loop was created and a pointer to the |
149 | /// original loop OriginalBB was part of otherwise. |
150 | const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB, |
151 | BasicBlock *ClonedBB, LoopInfo *LI, |
152 | NewLoopsMap &NewLoops) { |
153 | // Figure out which loop New is in. |
154 | const Loop *OldLoop = LI->getLoopFor(BB: OriginalBB); |
155 | assert(OldLoop && "Should (at least) be in the loop being unrolled!" ); |
156 | |
157 | Loop *&NewLoop = NewLoops[OldLoop]; |
158 | if (!NewLoop) { |
159 | // Found a new sub-loop. |
160 | assert(OriginalBB == OldLoop->getHeader() && |
161 | "Header should be first in RPO" ); |
162 | |
163 | NewLoop = LI->AllocateLoop(); |
164 | Loop *NewLoopParent = NewLoops.lookup(Val: OldLoop->getParentLoop()); |
165 | |
166 | if (NewLoopParent) |
167 | NewLoopParent->addChildLoop(NewChild: NewLoop); |
168 | else |
169 | LI->addTopLevelLoop(New: NewLoop); |
170 | |
171 | NewLoop->addBasicBlockToLoop(NewBB: ClonedBB, LI&: *LI); |
172 | return OldLoop; |
173 | } else { |
174 | NewLoop->addBasicBlockToLoop(NewBB: ClonedBB, LI&: *LI); |
175 | return nullptr; |
176 | } |
177 | } |
178 | |
179 | /// The function chooses which type of unroll (epilog or prolog) is more |
180 | /// profitabale. |
181 | /// Epilog unroll is more profitable when there is PHI that starts from |
182 | /// constant. In this case epilog will leave PHI start from constant, |
183 | /// but prolog will convert it to non-constant. |
184 | /// |
185 | /// loop: |
186 | /// PN = PHI [I, Latch], [CI, PreHeader] |
187 | /// I = foo(PN) |
188 | /// ... |
189 | /// |
190 | /// Epilog unroll case. |
191 | /// loop: |
192 | /// PN = PHI [I2, Latch], [CI, PreHeader] |
193 | /// I1 = foo(PN) |
194 | /// I2 = foo(I1) |
195 | /// ... |
196 | /// Prolog unroll case. |
197 | /// NewPN = PHI [PrologI, Prolog], [CI, PreHeader] |
198 | /// loop: |
199 | /// PN = PHI [I2, Latch], [NewPN, PreHeader] |
200 | /// I1 = foo(PN) |
201 | /// I2 = foo(I1) |
202 | /// ... |
203 | /// |
204 | static bool isEpilogProfitable(Loop *L) { |
205 | BasicBlock * = L->getLoopPreheader(); |
206 | BasicBlock * = L->getHeader(); |
207 | assert(PreHeader && Header); |
208 | for (const PHINode &PN : Header->phis()) { |
209 | if (isa<ConstantInt>(Val: PN.getIncomingValueForBlock(BB: PreHeader))) |
210 | return true; |
211 | } |
212 | return false; |
213 | } |
214 | |
215 | struct LoadValue { |
216 | Instruction *DefI = nullptr; |
217 | unsigned Generation = 0; |
218 | LoadValue() = default; |
219 | LoadValue(Instruction *Inst, unsigned Generation) |
220 | : DefI(Inst), Generation(Generation) {} |
221 | }; |
222 | |
223 | class StackNode { |
224 | ScopedHashTable<const SCEV *, LoadValue>::ScopeTy LoadScope; |
225 | unsigned CurrentGeneration; |
226 | unsigned ChildGeneration; |
227 | DomTreeNode *Node; |
228 | DomTreeNode::const_iterator ChildIter; |
229 | DomTreeNode::const_iterator EndIter; |
230 | bool Processed = false; |
231 | |
232 | public: |
233 | StackNode(ScopedHashTable<const SCEV *, LoadValue> &AvailableLoads, |
234 | unsigned cg, DomTreeNode *N, DomTreeNode::const_iterator Child, |
235 | DomTreeNode::const_iterator End) |
236 | : LoadScope(AvailableLoads), CurrentGeneration(cg), ChildGeneration(cg), |
237 | Node(N), ChildIter(Child), EndIter(End) {} |
238 | // Accessors. |
239 | unsigned currentGeneration() const { return CurrentGeneration; } |
240 | unsigned childGeneration() const { return ChildGeneration; } |
241 | void childGeneration(unsigned generation) { ChildGeneration = generation; } |
242 | DomTreeNode *node() { return Node; } |
243 | DomTreeNode::const_iterator childIter() const { return ChildIter; } |
244 | |
245 | DomTreeNode *nextChild() { |
246 | DomTreeNode *Child = *ChildIter; |
247 | ++ChildIter; |
248 | return Child; |
249 | } |
250 | |
251 | DomTreeNode::const_iterator end() const { return EndIter; } |
252 | bool isProcessed() const { return Processed; } |
253 | void process() { Processed = true; } |
254 | }; |
255 | |
256 | Value *getMatchingValue(LoadValue LV, LoadInst *LI, unsigned CurrentGeneration, |
257 | BatchAAResults &BAA, |
258 | function_ref<MemorySSA *()> GetMSSA) { |
259 | if (!LV.DefI) |
260 | return nullptr; |
261 | if (LV.DefI->getType() != LI->getType()) |
262 | return nullptr; |
263 | if (LV.Generation != CurrentGeneration) { |
264 | MemorySSA *MSSA = GetMSSA(); |
265 | if (!MSSA) |
266 | return nullptr; |
267 | auto *EarlierMA = MSSA->getMemoryAccess(I: LV.DefI); |
268 | MemoryAccess *LaterDef = |
269 | MSSA->getWalker()->getClobberingMemoryAccess(I: LI, AA&: BAA); |
270 | if (!MSSA->dominates(A: LaterDef, B: EarlierMA)) |
271 | return nullptr; |
272 | } |
273 | return LV.DefI; |
274 | } |
275 | |
276 | void loadCSE(Loop *L, DominatorTree &DT, ScalarEvolution &SE, LoopInfo &LI, |
277 | BatchAAResults &BAA, function_ref<MemorySSA *()> GetMSSA) { |
278 | ScopedHashTable<const SCEV *, LoadValue> AvailableLoads; |
279 | SmallVector<std::unique_ptr<StackNode>> NodesToProcess; |
280 | DomTreeNode * = DT.getNode(BB: L->getHeader()); |
281 | NodesToProcess.emplace_back(Args: new StackNode(AvailableLoads, 0, HeaderD, |
282 | HeaderD->begin(), HeaderD->end())); |
283 | |
284 | unsigned CurrentGeneration = 0; |
285 | while (!NodesToProcess.empty()) { |
286 | StackNode *NodeToProcess = &*NodesToProcess.back(); |
287 | |
288 | CurrentGeneration = NodeToProcess->currentGeneration(); |
289 | |
290 | if (!NodeToProcess->isProcessed()) { |
291 | // Process the node. |
292 | |
293 | // If this block has a single predecessor, then the predecessor is the |
294 | // parent |
295 | // of the domtree node and all of the live out memory values are still |
296 | // current in this block. If this block has multiple predecessors, then |
297 | // they could have invalidated the live-out memory values of our parent |
298 | // value. For now, just be conservative and invalidate memory if this |
299 | // block has multiple predecessors. |
300 | if (!NodeToProcess->node()->getBlock()->getSinglePredecessor()) |
301 | ++CurrentGeneration; |
302 | for (auto &I : make_early_inc_range(Range&: *NodeToProcess->node()->getBlock())) { |
303 | |
304 | auto *Load = dyn_cast<LoadInst>(Val: &I); |
305 | if (!Load || !Load->isSimple()) { |
306 | if (I.mayWriteToMemory()) |
307 | CurrentGeneration++; |
308 | continue; |
309 | } |
310 | |
311 | const SCEV *PtrSCEV = SE.getSCEV(V: Load->getPointerOperand()); |
312 | LoadValue LV = AvailableLoads.lookup(Key: PtrSCEV); |
313 | if (Value *M = |
314 | getMatchingValue(LV, LI: Load, CurrentGeneration, BAA, GetMSSA)) { |
315 | if (LI.replacementPreservesLCSSAForm(From: Load, To: M)) { |
316 | Load->replaceAllUsesWith(V: M); |
317 | Load->eraseFromParent(); |
318 | } |
319 | } else { |
320 | AvailableLoads.insert(Key: PtrSCEV, Val: LoadValue(Load, CurrentGeneration)); |
321 | } |
322 | } |
323 | NodeToProcess->childGeneration(generation: CurrentGeneration); |
324 | NodeToProcess->process(); |
325 | } else if (NodeToProcess->childIter() != NodeToProcess->end()) { |
326 | // Push the next child onto the stack. |
327 | DomTreeNode *Child = NodeToProcess->nextChild(); |
328 | if (!L->contains(BB: Child->getBlock())) |
329 | continue; |
330 | NodesToProcess.emplace_back( |
331 | Args: new StackNode(AvailableLoads, NodeToProcess->childGeneration(), Child, |
332 | Child->begin(), Child->end())); |
333 | } else { |
334 | // It has been processed, and there are no more children to process, |
335 | // so delete it and pop it off the stack. |
336 | NodesToProcess.pop_back(); |
337 | } |
338 | } |
339 | } |
340 | |
341 | /// Perform some cleanup and simplifications on loops after unrolling. It is |
342 | /// useful to simplify the IV's in the new loop, as well as do a quick |
343 | /// simplify/dce pass of the instructions. |
344 | void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, |
345 | ScalarEvolution *SE, DominatorTree *DT, |
346 | AssumptionCache *AC, |
347 | const TargetTransformInfo *TTI, |
348 | AAResults *AA) { |
349 | using namespace llvm::PatternMatch; |
350 | |
351 | // Simplify any new induction variables in the partially unrolled loop. |
352 | if (SE && SimplifyIVs) { |
353 | SmallVector<WeakTrackingVH, 16> DeadInsts; |
354 | simplifyLoopIVs(L, SE, DT, LI, TTI, Dead&: DeadInsts); |
355 | |
356 | // Aggressively clean up dead instructions that simplifyLoopIVs already |
357 | // identified. Any remaining should be cleaned up below. |
358 | while (!DeadInsts.empty()) { |
359 | Value *V = DeadInsts.pop_back_val(); |
360 | if (Instruction *Inst = dyn_cast_or_null<Instruction>(Val: V)) |
361 | RecursivelyDeleteTriviallyDeadInstructions(V: Inst); |
362 | } |
363 | |
364 | if (AA) { |
365 | std::unique_ptr<MemorySSA> MSSA = nullptr; |
366 | BatchAAResults BAA(*AA); |
367 | loadCSE(L, DT&: *DT, SE&: *SE, LI&: *LI, BAA, GetMSSA: [L, AA, DT, &MSSA]() -> MemorySSA * { |
368 | if (!MSSA) |
369 | MSSA.reset(p: new MemorySSA(*L, AA, DT)); |
370 | return &*MSSA; |
371 | }); |
372 | } |
373 | } |
374 | |
375 | // At this point, the code is well formed. Perform constprop, instsimplify, |
376 | // and dce. |
377 | const DataLayout &DL = L->getHeader()->getDataLayout(); |
378 | SmallVector<WeakTrackingVH, 16> DeadInsts; |
379 | for (BasicBlock *BB : L->getBlocks()) { |
380 | // Remove repeated debug instructions after loop unrolling. |
381 | if (BB->getParent()->getSubprogram()) |
382 | RemoveRedundantDbgInstrs(BB); |
383 | |
384 | for (Instruction &Inst : llvm::make_early_inc_range(Range&: *BB)) { |
385 | if (Value *V = simplifyInstruction(I: &Inst, Q: {DL, nullptr, DT, AC})) |
386 | if (LI->replacementPreservesLCSSAForm(From: &Inst, To: V)) |
387 | Inst.replaceAllUsesWith(V); |
388 | if (isInstructionTriviallyDead(I: &Inst)) |
389 | DeadInsts.emplace_back(Args: &Inst); |
390 | |
391 | // Fold ((add X, C1), C2) to (add X, C1+C2). This is very common in |
392 | // unrolled loops, and handling this early allows following code to |
393 | // identify the IV as a "simple recurrence" without first folding away |
394 | // a long chain of adds. |
395 | { |
396 | Value *X; |
397 | const APInt *C1, *C2; |
398 | if (match(V: &Inst, P: m_Add(L: m_Add(L: m_Value(V&: X), R: m_APInt(Res&: C1)), R: m_APInt(Res&: C2)))) { |
399 | auto *InnerI = dyn_cast<Instruction>(Val: Inst.getOperand(i: 0)); |
400 | auto *InnerOBO = cast<OverflowingBinaryOperator>(Val: Inst.getOperand(i: 0)); |
401 | bool SignedOverflow; |
402 | APInt NewC = C1->sadd_ov(RHS: *C2, Overflow&: SignedOverflow); |
403 | Inst.setOperand(i: 0, Val: X); |
404 | Inst.setOperand(i: 1, Val: ConstantInt::get(Ty: Inst.getType(), V: NewC)); |
405 | Inst.setHasNoUnsignedWrap(Inst.hasNoUnsignedWrap() && |
406 | InnerOBO->hasNoUnsignedWrap()); |
407 | Inst.setHasNoSignedWrap(Inst.hasNoSignedWrap() && |
408 | InnerOBO->hasNoSignedWrap() && |
409 | !SignedOverflow); |
410 | if (InnerI && isInstructionTriviallyDead(I: InnerI)) |
411 | DeadInsts.emplace_back(Args&: InnerI); |
412 | } |
413 | } |
414 | } |
415 | // We can't do recursive deletion until we're done iterating, as we might |
416 | // have a phi which (potentially indirectly) uses instructions later in |
417 | // the block we're iterating through. |
418 | RecursivelyDeleteTriviallyDeadInstructions(DeadInsts); |
419 | } |
420 | } |
421 | |
422 | // Loops containing convergent instructions that are uncontrolled or controlled |
423 | // from outside the loop must have a count that divides their TripMultiple. |
424 | LLVM_ATTRIBUTE_USED |
425 | static bool canHaveUnrollRemainder(const Loop *L) { |
426 | if (getLoopConvergenceHeart(TheLoop: L)) |
427 | return false; |
428 | |
429 | // Check for uncontrolled convergent operations. |
430 | for (auto &BB : L->blocks()) { |
431 | for (auto &I : *BB) { |
432 | if (isa<ConvergenceControlInst>(Val: I)) |
433 | return true; |
434 | if (auto *CB = dyn_cast<CallBase>(Val: &I)) |
435 | if (CB->isConvergent()) |
436 | return CB->getConvergenceControlToken(); |
437 | } |
438 | } |
439 | return true; |
440 | } |
441 | |
442 | /// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling |
443 | /// can only fail when the loop's latch block is not terminated by a conditional |
444 | /// branch instruction. However, if the trip count (and multiple) are not known, |
445 | /// loop unrolling will mostly produce more code that is no faster. |
446 | /// |
447 | /// If Runtime is true then UnrollLoop will try to insert a prologue or |
448 | /// epilogue that ensures the latch has a trip multiple of Count. UnrollLoop |
449 | /// will not runtime-unroll the loop if computing the run-time trip count will |
450 | /// be expensive and AllowExpensiveTripCount is false. |
451 | /// |
452 | /// The LoopInfo Analysis that is passed will be kept consistent. |
453 | /// |
454 | /// This utility preserves LoopInfo. It will also preserve ScalarEvolution and |
455 | /// DominatorTree if they are non-null. |
456 | /// |
457 | /// If RemainderLoop is non-null, it will receive the remainder loop (if |
458 | /// required and not fully unrolled). |
459 | LoopUnrollResult |
460 | llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, |
461 | ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, |
462 | const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, |
463 | bool PreserveLCSSA, Loop **RemainderLoop, AAResults *AA) { |
464 | assert(DT && "DomTree is required" ); |
465 | |
466 | if (!L->getLoopPreheader()) { |
467 | LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n" ); |
468 | return LoopUnrollResult::Unmodified; |
469 | } |
470 | |
471 | if (!L->getLoopLatch()) { |
472 | LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n" ); |
473 | return LoopUnrollResult::Unmodified; |
474 | } |
475 | |
476 | // Loops with indirectbr cannot be cloned. |
477 | if (!L->isSafeToClone()) { |
478 | LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n" ); |
479 | return LoopUnrollResult::Unmodified; |
480 | } |
481 | |
482 | if (L->getHeader()->hasAddressTaken()) { |
483 | // The loop-rotate pass can be helpful to avoid this in many cases. |
484 | LLVM_DEBUG( |
485 | dbgs() << " Won't unroll loop: address of header block is taken.\n" ); |
486 | return LoopUnrollResult::Unmodified; |
487 | } |
488 | |
489 | assert(ULO.Count > 0); |
490 | |
491 | // All these values should be taken only after peeling because they might have |
492 | // changed. |
493 | BasicBlock * = L->getLoopPreheader(); |
494 | BasicBlock * = L->getHeader(); |
495 | BasicBlock *LatchBlock = L->getLoopLatch(); |
496 | SmallVector<BasicBlock *, 4> ExitBlocks; |
497 | L->getExitBlocks(ExitBlocks); |
498 | std::vector<BasicBlock *> OriginalLoopBlocks = L->getBlocks(); |
499 | |
500 | const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L); |
501 | const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L); |
502 | unsigned EstimatedLoopInvocationWeight = 0; |
503 | std::optional<unsigned> OriginalTripCount = |
504 | llvm::getLoopEstimatedTripCount(L, EstimatedLoopInvocationWeight: &EstimatedLoopInvocationWeight); |
505 | |
506 | // Effectively "DCE" unrolled iterations that are beyond the max tripcount |
507 | // and will never be executed. |
508 | if (MaxTripCount && ULO.Count > MaxTripCount) |
509 | ULO.Count = MaxTripCount; |
510 | |
511 | struct ExitInfo { |
512 | unsigned TripCount; |
513 | unsigned TripMultiple; |
514 | unsigned BreakoutTrip; |
515 | bool ExitOnTrue; |
516 | BasicBlock *FirstExitingBlock = nullptr; |
517 | SmallVector<BasicBlock *> ExitingBlocks; |
518 | }; |
519 | DenseMap<BasicBlock *, ExitInfo> ExitInfos; |
520 | SmallVector<BasicBlock *, 4> ExitingBlocks; |
521 | L->getExitingBlocks(ExitingBlocks); |
522 | for (auto *ExitingBlock : ExitingBlocks) { |
523 | // The folding code is not prepared to deal with non-branch instructions |
524 | // right now. |
525 | auto *BI = dyn_cast<BranchInst>(Val: ExitingBlock->getTerminator()); |
526 | if (!BI) |
527 | continue; |
528 | |
529 | ExitInfo &Info = ExitInfos.try_emplace(Key: ExitingBlock).first->second; |
530 | Info.TripCount = SE->getSmallConstantTripCount(L, ExitingBlock); |
531 | Info.TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock); |
532 | if (Info.TripCount != 0) { |
533 | Info.BreakoutTrip = Info.TripCount % ULO.Count; |
534 | Info.TripMultiple = 0; |
535 | } else { |
536 | Info.BreakoutTrip = Info.TripMultiple = |
537 | (unsigned)std::gcd(m: ULO.Count, n: Info.TripMultiple); |
538 | } |
539 | Info.ExitOnTrue = !L->contains(BB: BI->getSuccessor(i: 0)); |
540 | Info.ExitingBlocks.push_back(Elt: ExitingBlock); |
541 | LLVM_DEBUG(dbgs() << " Exiting block %" << ExitingBlock->getName() |
542 | << ": TripCount=" << Info.TripCount |
543 | << ", TripMultiple=" << Info.TripMultiple |
544 | << ", BreakoutTrip=" << Info.BreakoutTrip << "\n" ); |
545 | } |
546 | |
547 | // Are we eliminating the loop control altogether? Note that we can know |
548 | // we're eliminating the backedge without knowing exactly which iteration |
549 | // of the unrolled body exits. |
550 | const bool CompletelyUnroll = ULO.Count == MaxTripCount; |
551 | |
552 | const bool PreserveOnlyFirst = CompletelyUnroll && MaxOrZero; |
553 | |
554 | // There's no point in performing runtime unrolling if this unroll count |
555 | // results in a full unroll. |
556 | if (CompletelyUnroll) |
557 | ULO.Runtime = false; |
558 | |
559 | // Go through all exits of L and see if there are any phi-nodes there. We just |
560 | // conservatively assume that they're inserted to preserve LCSSA form, which |
561 | // means that complete unrolling might break this form. We need to either fix |
562 | // it in-place after the transformation, or entirely rebuild LCSSA. TODO: For |
563 | // now we just recompute LCSSA for the outer loop, but it should be possible |
564 | // to fix it in-place. |
565 | bool NeedToFixLCSSA = |
566 | PreserveLCSSA && CompletelyUnroll && |
567 | any_of(Range&: ExitBlocks, |
568 | P: [](const BasicBlock *BB) { return isa<PHINode>(Val: BB->begin()); }); |
569 | |
570 | // The current loop unroll pass can unroll loops that have |
571 | // (1) single latch; and |
572 | // (2a) latch is unconditional; or |
573 | // (2b) latch is conditional and is an exiting block |
574 | // FIXME: The implementation can be extended to work with more complicated |
575 | // cases, e.g. loops with multiple latches. |
576 | BranchInst *LatchBI = dyn_cast<BranchInst>(Val: LatchBlock->getTerminator()); |
577 | |
578 | // A conditional branch which exits the loop, which can be optimized to an |
579 | // unconditional branch in the unrolled loop in some cases. |
580 | bool LatchIsExiting = L->isLoopExiting(BB: LatchBlock); |
581 | if (!LatchBI || (LatchBI->isConditional() && !LatchIsExiting)) { |
582 | LLVM_DEBUG( |
583 | dbgs() << "Can't unroll; a conditional latch must exit the loop" ); |
584 | return LoopUnrollResult::Unmodified; |
585 | } |
586 | |
587 | assert((!ULO.Runtime || canHaveUnrollRemainder(L)) && |
588 | "Can't runtime unroll if loop contains a convergent operation." ); |
589 | |
590 | bool EpilogProfitability = |
591 | UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog |
592 | : isEpilogProfitable(L); |
593 | |
594 | if (ULO.Runtime && |
595 | !UnrollRuntimeLoopRemainder(L, Count: ULO.Count, AllowExpensiveTripCount: ULO.AllowExpensiveTripCount, |
596 | UseEpilogRemainder: EpilogProfitability, UnrollRemainder: ULO.UnrollRemainder, |
597 | ForgetAllSCEV: ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, |
598 | PreserveLCSSA, ResultLoop: RemainderLoop)) { |
599 | if (ULO.Force) |
600 | ULO.Runtime = false; |
601 | else { |
602 | LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be " |
603 | "generated when assuming runtime trip count\n" ); |
604 | return LoopUnrollResult::Unmodified; |
605 | } |
606 | } |
607 | |
608 | using namespace ore; |
609 | // Report the unrolling decision. |
610 | if (CompletelyUnroll) { |
611 | LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName() |
612 | << " with trip count " << ULO.Count << "!\n" ); |
613 | if (ORE) |
614 | ORE->emit(RemarkBuilder: [&]() { |
615 | return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled" , L->getStartLoc(), |
616 | L->getHeader()) |
617 | << "completely unrolled loop with " |
618 | << NV("UnrollCount" , ULO.Count) << " iterations" ; |
619 | }); |
620 | } else { |
621 | LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by " |
622 | << ULO.Count); |
623 | if (ULO.Runtime) |
624 | LLVM_DEBUG(dbgs() << " with run-time trip count" ); |
625 | LLVM_DEBUG(dbgs() << "!\n" ); |
626 | |
627 | if (ORE) |
628 | ORE->emit(RemarkBuilder: [&]() { |
629 | OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled" , L->getStartLoc(), |
630 | L->getHeader()); |
631 | Diag << "unrolled loop by a factor of " << NV("UnrollCount" , ULO.Count); |
632 | if (ULO.Runtime) |
633 | Diag << " with run-time trip count" ; |
634 | return Diag; |
635 | }); |
636 | } |
637 | |
638 | // We are going to make changes to this loop. SCEV may be keeping cached info |
639 | // about it, in particular about backedge taken count. The changes we make |
640 | // are guaranteed to invalidate this information for our loop. It is tempting |
641 | // to only invalidate the loop being unrolled, but it is incorrect as long as |
642 | // all exiting branches from all inner loops have impact on the outer loops, |
643 | // and if something changes inside them then any of outer loops may also |
644 | // change. When we forget outermost loop, we also forget all contained loops |
645 | // and this is what we need here. |
646 | if (SE) { |
647 | if (ULO.ForgetAllSCEV) |
648 | SE->forgetAllLoops(); |
649 | else { |
650 | SE->forgetTopmostLoop(L); |
651 | SE->forgetBlockAndLoopDispositions(); |
652 | } |
653 | } |
654 | |
655 | if (!LatchIsExiting) |
656 | ++NumUnrolledNotLatch; |
657 | |
658 | // For the first iteration of the loop, we should use the precloned values for |
659 | // PHI nodes. Insert associations now. |
660 | ValueToValueMapTy LastValueMap; |
661 | std::vector<PHINode*> OrigPHINode; |
662 | for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(Val: I); ++I) { |
663 | OrigPHINode.push_back(x: cast<PHINode>(Val&: I)); |
664 | } |
665 | |
666 | std::vector<BasicBlock *> ; |
667 | std::vector<BasicBlock *> Latches; |
668 | Headers.push_back(x: Header); |
669 | Latches.push_back(x: LatchBlock); |
670 | |
671 | // The current on-the-fly SSA update requires blocks to be processed in |
672 | // reverse postorder so that LastValueMap contains the correct value at each |
673 | // exit. |
674 | LoopBlocksDFS DFS(L); |
675 | DFS.perform(LI); |
676 | |
677 | // Stash the DFS iterators before adding blocks to the loop. |
678 | LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO(); |
679 | LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO(); |
680 | |
681 | std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks(); |
682 | |
683 | // Loop Unrolling might create new loops. While we do preserve LoopInfo, we |
684 | // might break loop-simplified form for these loops (as they, e.g., would |
685 | // share the same exit blocks). We'll keep track of loops for which we can |
686 | // break this so that later we can re-simplify them. |
687 | SmallSetVector<Loop *, 4> LoopsToSimplify; |
688 | for (Loop *SubLoop : *L) |
689 | LoopsToSimplify.insert(X: SubLoop); |
690 | |
691 | // When a FSDiscriminator is enabled, we don't need to add the multiply |
692 | // factors to the discriminators. |
693 | if (Header->getParent()->shouldEmitDebugInfoForProfiling() && |
694 | !EnableFSDiscriminator) |
695 | for (BasicBlock *BB : L->getBlocks()) |
696 | for (Instruction &I : *BB) |
697 | if (!I.isDebugOrPseudoInst()) |
698 | if (const DILocation *DIL = I.getDebugLoc()) { |
699 | auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(DF: ULO.Count); |
700 | if (NewDIL) |
701 | I.setDebugLoc(*NewDIL); |
702 | else |
703 | LLVM_DEBUG(dbgs() |
704 | << "Failed to create new discriminator: " |
705 | << DIL->getFilename() << " Line: " << DIL->getLine()); |
706 | } |
707 | |
708 | // Identify what noalias metadata is inside the loop: if it is inside the |
709 | // loop, the associated metadata must be cloned for each iteration. |
710 | SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes; |
711 | identifyNoAliasScopesToClone(BBs: L->getBlocks(), NoAliasDeclScopes&: LoopLocalNoAliasDeclScopes); |
712 | |
713 | // We place the unrolled iterations immediately after the original loop |
714 | // latch. This is a reasonable default placement if we don't have block |
715 | // frequencies, and if we do, well the layout will be adjusted later. |
716 | auto BlockInsertPt = std::next(x: LatchBlock->getIterator()); |
717 | for (unsigned It = 1; It != ULO.Count; ++It) { |
718 | SmallVector<BasicBlock *, 8> NewBlocks; |
719 | SmallDenseMap<const Loop *, Loop *, 4> NewLoops; |
720 | NewLoops[L] = L; |
721 | |
722 | for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { |
723 | ValueToValueMapTy VMap; |
724 | BasicBlock *New = CloneBasicBlock(BB: *BB, VMap, NameSuffix: "." + Twine(It)); |
725 | Header->getParent()->insert(Position: BlockInsertPt, BB: New); |
726 | |
727 | assert((*BB != Header || LI->getLoopFor(*BB) == L) && |
728 | "Header should not be in a sub-loop" ); |
729 | // Tell LI about New. |
730 | const Loop *OldLoop = addClonedBlockToLoopInfo(OriginalBB: *BB, ClonedBB: New, LI, NewLoops); |
731 | if (OldLoop) |
732 | LoopsToSimplify.insert(X: NewLoops[OldLoop]); |
733 | |
734 | if (*BB == Header) { |
735 | // Loop over all of the PHI nodes in the block, changing them to use |
736 | // the incoming values from the previous block. |
737 | for (PHINode *OrigPHI : OrigPHINode) { |
738 | PHINode *NewPHI = cast<PHINode>(Val&: VMap[OrigPHI]); |
739 | Value *InVal = NewPHI->getIncomingValueForBlock(BB: LatchBlock); |
740 | if (Instruction *InValI = dyn_cast<Instruction>(Val: InVal)) |
741 | if (It > 1 && L->contains(Inst: InValI)) |
742 | InVal = LastValueMap[InValI]; |
743 | VMap[OrigPHI] = InVal; |
744 | NewPHI->eraseFromParent(); |
745 | } |
746 | |
747 | // Eliminate copies of the loop heart intrinsic, if any. |
748 | if (ULO.Heart) { |
749 | auto it = VMap.find(Val: ULO.Heart); |
750 | assert(it != VMap.end()); |
751 | Instruction *heartCopy = cast<Instruction>(Val&: it->second); |
752 | heartCopy->eraseFromParent(); |
753 | VMap.erase(I: it); |
754 | } |
755 | } |
756 | |
757 | // Update our running map of newest clones |
758 | LastValueMap[*BB] = New; |
759 | for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); |
760 | VI != VE; ++VI) |
761 | LastValueMap[VI->first] = VI->second; |
762 | |
763 | // Add phi entries for newly created values to all exit blocks. |
764 | for (BasicBlock *Succ : successors(BB: *BB)) { |
765 | if (L->contains(BB: Succ)) |
766 | continue; |
767 | for (PHINode &PHI : Succ->phis()) { |
768 | Value *Incoming = PHI.getIncomingValueForBlock(BB: *BB); |
769 | ValueToValueMapTy::iterator It = LastValueMap.find(Val: Incoming); |
770 | if (It != LastValueMap.end()) |
771 | Incoming = It->second; |
772 | PHI.addIncoming(V: Incoming, BB: New); |
773 | SE->forgetValue(V: &PHI); |
774 | } |
775 | } |
776 | // Keep track of new headers and latches as we create them, so that |
777 | // we can insert the proper branches later. |
778 | if (*BB == Header) |
779 | Headers.push_back(x: New); |
780 | if (*BB == LatchBlock) |
781 | Latches.push_back(x: New); |
782 | |
783 | // Keep track of the exiting block and its successor block contained in |
784 | // the loop for the current iteration. |
785 | auto ExitInfoIt = ExitInfos.find(Val: *BB); |
786 | if (ExitInfoIt != ExitInfos.end()) |
787 | ExitInfoIt->second.ExitingBlocks.push_back(Elt: New); |
788 | |
789 | NewBlocks.push_back(Elt: New); |
790 | UnrolledLoopBlocks.push_back(x: New); |
791 | |
792 | // Update DomTree: since we just copy the loop body, and each copy has a |
793 | // dedicated entry block (copy of the header block), this header's copy |
794 | // dominates all copied blocks. That means, dominance relations in the |
795 | // copied body are the same as in the original body. |
796 | if (*BB == Header) |
797 | DT->addNewBlock(BB: New, DomBB: Latches[It - 1]); |
798 | else { |
799 | auto BBDomNode = DT->getNode(BB: *BB); |
800 | auto BBIDom = BBDomNode->getIDom(); |
801 | BasicBlock *OriginalBBIDom = BBIDom->getBlock(); |
802 | DT->addNewBlock( |
803 | BB: New, DomBB: cast<BasicBlock>(Val&: LastValueMap[cast<Value>(Val: OriginalBBIDom)])); |
804 | } |
805 | } |
806 | |
807 | // Remap all instructions in the most recent iteration |
808 | remapInstructionsInBlocks(Blocks: NewBlocks, VMap&: LastValueMap); |
809 | for (BasicBlock *NewBlock : NewBlocks) |
810 | for (Instruction &I : *NewBlock) |
811 | if (auto *II = dyn_cast<AssumeInst>(Val: &I)) |
812 | AC->registerAssumption(CI: II); |
813 | |
814 | { |
815 | // Identify what other metadata depends on the cloned version. After |
816 | // cloning, replace the metadata with the corrected version for both |
817 | // memory instructions and noalias intrinsics. |
818 | std::string ext = (Twine("It" ) + Twine(It)).str(); |
819 | cloneAndAdaptNoAliasScopes(NoAliasDeclScopes: LoopLocalNoAliasDeclScopes, NewBlocks, |
820 | Context&: Header->getContext(), Ext: ext); |
821 | } |
822 | } |
823 | |
824 | // Loop over the PHI nodes in the original block, setting incoming values. |
825 | for (PHINode *PN : OrigPHINode) { |
826 | if (CompletelyUnroll) { |
827 | PN->replaceAllUsesWith(V: PN->getIncomingValueForBlock(BB: Preheader)); |
828 | PN->eraseFromParent(); |
829 | } else if (ULO.Count > 1) { |
830 | Value *InVal = PN->removeIncomingValue(BB: LatchBlock, DeletePHIIfEmpty: false); |
831 | // If this value was defined in the loop, take the value defined by the |
832 | // last iteration of the loop. |
833 | if (Instruction *InValI = dyn_cast<Instruction>(Val: InVal)) { |
834 | if (L->contains(Inst: InValI)) |
835 | InVal = LastValueMap[InVal]; |
836 | } |
837 | assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch" ); |
838 | PN->addIncoming(V: InVal, BB: Latches.back()); |
839 | } |
840 | } |
841 | |
842 | // Connect latches of the unrolled iterations to the headers of the next |
843 | // iteration. Currently they point to the header of the same iteration. |
844 | for (unsigned i = 0, e = Latches.size(); i != e; ++i) { |
845 | unsigned j = (i + 1) % e; |
846 | Latches[i]->getTerminator()->replaceSuccessorWith(OldBB: Headers[i], NewBB: Headers[j]); |
847 | } |
848 | |
849 | // Update dominators of blocks we might reach through exits. |
850 | // Immediate dominator of such block might change, because we add more |
851 | // routes which can lead to the exit: we can now reach it from the copied |
852 | // iterations too. |
853 | if (ULO.Count > 1) { |
854 | for (auto *BB : OriginalLoopBlocks) { |
855 | auto *BBDomNode = DT->getNode(BB); |
856 | SmallVector<BasicBlock *, 16> ChildrenToUpdate; |
857 | for (auto *ChildDomNode : BBDomNode->children()) { |
858 | auto *ChildBB = ChildDomNode->getBlock(); |
859 | if (!L->contains(BB: ChildBB)) |
860 | ChildrenToUpdate.push_back(Elt: ChildBB); |
861 | } |
862 | // The new idom of the block will be the nearest common dominator |
863 | // of all copies of the previous idom. This is equivalent to the |
864 | // nearest common dominator of the previous idom and the first latch, |
865 | // which dominates all copies of the previous idom. |
866 | BasicBlock *NewIDom = DT->findNearestCommonDominator(A: BB, B: LatchBlock); |
867 | for (auto *ChildBB : ChildrenToUpdate) |
868 | DT->changeImmediateDominator(BB: ChildBB, NewBB: NewIDom); |
869 | } |
870 | } |
871 | |
872 | assert(!UnrollVerifyDomtree || |
873 | DT->verify(DominatorTree::VerificationLevel::Fast)); |
874 | |
875 | SmallVector<DominatorTree::UpdateType> DTUpdates; |
876 | auto SetDest = [&](BasicBlock *Src, bool WillExit, bool ExitOnTrue) { |
877 | auto *Term = cast<BranchInst>(Val: Src->getTerminator()); |
878 | const unsigned Idx = ExitOnTrue ^ WillExit; |
879 | BasicBlock *Dest = Term->getSuccessor(i: Idx); |
880 | BasicBlock *DeadSucc = Term->getSuccessor(i: 1-Idx); |
881 | |
882 | // Remove predecessors from all non-Dest successors. |
883 | DeadSucc->removePredecessor(Pred: Src, /* KeepOneInputPHIs */ true); |
884 | |
885 | // Replace the conditional branch with an unconditional one. |
886 | BranchInst::Create(IfTrue: Dest, InsertBefore: Term->getIterator()); |
887 | Term->eraseFromParent(); |
888 | |
889 | DTUpdates.emplace_back(Args: DominatorTree::Delete, Args&: Src, Args&: DeadSucc); |
890 | }; |
891 | |
892 | auto WillExit = [&](const ExitInfo &Info, unsigned i, unsigned j, |
893 | bool IsLatch) -> std::optional<bool> { |
894 | if (CompletelyUnroll) { |
895 | if (PreserveOnlyFirst) { |
896 | if (i == 0) |
897 | return std::nullopt; |
898 | return j == 0; |
899 | } |
900 | // Complete (but possibly inexact) unrolling |
901 | if (j == 0) |
902 | return true; |
903 | if (Info.TripCount && j != Info.TripCount) |
904 | return false; |
905 | return std::nullopt; |
906 | } |
907 | |
908 | if (ULO.Runtime) { |
909 | // If runtime unrolling inserts a prologue, information about non-latch |
910 | // exits may be stale. |
911 | if (IsLatch && j != 0) |
912 | return false; |
913 | return std::nullopt; |
914 | } |
915 | |
916 | if (j != Info.BreakoutTrip && |
917 | (Info.TripMultiple == 0 || j % Info.TripMultiple != 0)) { |
918 | // If we know the trip count or a multiple of it, we can safely use an |
919 | // unconditional branch for some iterations. |
920 | return false; |
921 | } |
922 | return std::nullopt; |
923 | }; |
924 | |
925 | // Fold branches for iterations where we know that they will exit or not |
926 | // exit. |
927 | for (auto &Pair : ExitInfos) { |
928 | ExitInfo &Info = Pair.second; |
929 | for (unsigned i = 0, e = Info.ExitingBlocks.size(); i != e; ++i) { |
930 | // The branch destination. |
931 | unsigned j = (i + 1) % e; |
932 | bool IsLatch = Pair.first == LatchBlock; |
933 | std::optional<bool> KnownWillExit = WillExit(Info, i, j, IsLatch); |
934 | if (!KnownWillExit) { |
935 | if (!Info.FirstExitingBlock) |
936 | Info.FirstExitingBlock = Info.ExitingBlocks[i]; |
937 | continue; |
938 | } |
939 | |
940 | // We don't fold known-exiting branches for non-latch exits here, |
941 | // because this ensures that both all loop blocks and all exit blocks |
942 | // remain reachable in the CFG. |
943 | // TODO: We could fold these branches, but it would require much more |
944 | // sophisticated updates to LoopInfo. |
945 | if (*KnownWillExit && !IsLatch) { |
946 | if (!Info.FirstExitingBlock) |
947 | Info.FirstExitingBlock = Info.ExitingBlocks[i]; |
948 | continue; |
949 | } |
950 | |
951 | SetDest(Info.ExitingBlocks[i], *KnownWillExit, Info.ExitOnTrue); |
952 | } |
953 | } |
954 | |
955 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); |
956 | DomTreeUpdater *DTUToUse = &DTU; |
957 | if (ExitingBlocks.size() == 1 && ExitInfos.size() == 1) { |
958 | // Manually update the DT if there's a single exiting node. In that case |
959 | // there's a single exit node and it is sufficient to update the nodes |
960 | // immediately dominated by the original exiting block. They will become |
961 | // dominated by the first exiting block that leaves the loop after |
962 | // unrolling. Note that the CFG inside the loop does not change, so there's |
963 | // no need to update the DT inside the unrolled loop. |
964 | DTUToUse = nullptr; |
965 | auto &[OriginalExit, Info] = *ExitInfos.begin(); |
966 | if (!Info.FirstExitingBlock) |
967 | Info.FirstExitingBlock = Info.ExitingBlocks.back(); |
968 | for (auto *C : to_vector(Range: DT->getNode(BB: OriginalExit)->children())) { |
969 | if (L->contains(BB: C->getBlock())) |
970 | continue; |
971 | C->setIDom(DT->getNode(BB: Info.FirstExitingBlock)); |
972 | } |
973 | } else { |
974 | DTU.applyUpdates(Updates: DTUpdates); |
975 | } |
976 | |
977 | // When completely unrolling, the last latch becomes unreachable. |
978 | if (!LatchIsExiting && CompletelyUnroll) { |
979 | // There is no need to update the DT here, because there must be a unique |
980 | // latch. Hence if the latch is not exiting it must directly branch back to |
981 | // the original loop header and does not dominate any nodes. |
982 | assert(LatchBlock->getSingleSuccessor() && "Loop with multiple latches?" ); |
983 | changeToUnreachable(I: Latches.back()->getTerminator(), PreserveLCSSA); |
984 | } |
985 | |
986 | // Merge adjacent basic blocks, if possible. |
987 | for (BasicBlock *Latch : Latches) { |
988 | BranchInst *Term = dyn_cast<BranchInst>(Val: Latch->getTerminator()); |
989 | assert((Term || |
990 | (CompletelyUnroll && !LatchIsExiting && Latch == Latches.back())) && |
991 | "Need a branch as terminator, except when fully unrolling with " |
992 | "unconditional latch" ); |
993 | if (Term && Term->isUnconditional()) { |
994 | BasicBlock *Dest = Term->getSuccessor(i: 0); |
995 | BasicBlock *Fold = Dest->getUniquePredecessor(); |
996 | if (MergeBlockIntoPredecessor(BB: Dest, /*DTU=*/DTUToUse, LI, |
997 | /*MSSAU=*/nullptr, /*MemDep=*/nullptr, |
998 | /*PredecessorWithTwoSuccessors=*/false, |
999 | DT: DTUToUse ? nullptr : DT)) { |
1000 | // Dest has been folded into Fold. Update our worklists accordingly. |
1001 | std::replace(first: Latches.begin(), last: Latches.end(), old_value: Dest, new_value: Fold); |
1002 | llvm::erase(C&: UnrolledLoopBlocks, V: Dest); |
1003 | } |
1004 | } |
1005 | } |
1006 | |
1007 | if (DTUToUse) { |
1008 | // Apply updates to the DomTree. |
1009 | DT = &DTU.getDomTree(); |
1010 | } |
1011 | assert(!UnrollVerifyDomtree || |
1012 | DT->verify(DominatorTree::VerificationLevel::Fast)); |
1013 | |
1014 | // At this point, the code is well formed. We now simplify the unrolled loop, |
1015 | // doing constant propagation and dead code elimination as we go. |
1016 | simplifyLoopAfterUnroll(L, SimplifyIVs: !CompletelyUnroll && ULO.Count > 1, LI, SE, DT, AC, |
1017 | TTI, AA); |
1018 | |
1019 | NumCompletelyUnrolled += CompletelyUnroll; |
1020 | ++NumUnrolled; |
1021 | |
1022 | Loop *OuterL = L->getParentLoop(); |
1023 | // Update LoopInfo if the loop is completely removed. |
1024 | if (CompletelyUnroll) { |
1025 | LI->erase(L); |
1026 | // We shouldn't try to use `L` anymore. |
1027 | L = nullptr; |
1028 | } else if (OriginalTripCount) { |
1029 | // Update the trip count. Note that the remainder has already logic |
1030 | // computing it in `UnrollRuntimeLoopRemainder`. |
1031 | setLoopEstimatedTripCount(L, EstimatedTripCount: *OriginalTripCount / ULO.Count, |
1032 | EstimatedLoopInvocationWeight); |
1033 | } |
1034 | |
1035 | // LoopInfo should not be valid, confirm that. |
1036 | if (UnrollVerifyLoopInfo) |
1037 | LI->verify(DomTree: *DT); |
1038 | |
1039 | // After complete unrolling most of the blocks should be contained in OuterL. |
1040 | // However, some of them might happen to be out of OuterL (e.g. if they |
1041 | // precede a loop exit). In this case we might need to insert PHI nodes in |
1042 | // order to preserve LCSSA form. |
1043 | // We don't need to check this if we already know that we need to fix LCSSA |
1044 | // form. |
1045 | // TODO: For now we just recompute LCSSA for the outer loop in this case, but |
1046 | // it should be possible to fix it in-place. |
1047 | if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA) |
1048 | NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(L: OuterL, Blocks: UnrolledLoopBlocks, LI); |
1049 | |
1050 | // Make sure that loop-simplify form is preserved. We want to simplify |
1051 | // at least one layer outside of the loop that was unrolled so that any |
1052 | // changes to the parent loop exposed by the unrolling are considered. |
1053 | if (OuterL) { |
1054 | // OuterL includes all loops for which we can break loop-simplify, so |
1055 | // it's sufficient to simplify only it (it'll recursively simplify inner |
1056 | // loops too). |
1057 | if (NeedToFixLCSSA) { |
1058 | // LCSSA must be performed on the outermost affected loop. The unrolled |
1059 | // loop's last loop latch is guaranteed to be in the outermost loop |
1060 | // after LoopInfo's been updated by LoopInfo::erase. |
1061 | Loop *LatchLoop = LI->getLoopFor(BB: Latches.back()); |
1062 | Loop *FixLCSSALoop = OuterL; |
1063 | if (!FixLCSSALoop->contains(L: LatchLoop)) |
1064 | while (FixLCSSALoop->getParentLoop() != LatchLoop) |
1065 | FixLCSSALoop = FixLCSSALoop->getParentLoop(); |
1066 | |
1067 | formLCSSARecursively(L&: *FixLCSSALoop, DT: *DT, LI, SE); |
1068 | } else if (PreserveLCSSA) { |
1069 | assert(OuterL->isLCSSAForm(*DT) && |
1070 | "Loops should be in LCSSA form after loop-unroll." ); |
1071 | } |
1072 | |
1073 | // TODO: That potentially might be compile-time expensive. We should try |
1074 | // to fix the loop-simplified form incrementally. |
1075 | simplifyLoop(L: OuterL, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA); |
1076 | } else { |
1077 | // Simplify loops for which we might've broken loop-simplify form. |
1078 | for (Loop *SubLoop : LoopsToSimplify) |
1079 | simplifyLoop(L: SubLoop, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA); |
1080 | } |
1081 | |
1082 | return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled |
1083 | : LoopUnrollResult::PartiallyUnrolled; |
1084 | } |
1085 | |
1086 | /// Given an llvm.loop loop id metadata node, returns the loop hint metadata |
1087 | /// node with the given name (for example, "llvm.loop.unroll.count"). If no |
1088 | /// such metadata node exists, then nullptr is returned. |
1089 | MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) { |
1090 | // First operand should refer to the loop id itself. |
1091 | assert(LoopID->getNumOperands() > 0 && "requires at least one operand" ); |
1092 | assert(LoopID->getOperand(0) == LoopID && "invalid loop id" ); |
1093 | |
1094 | for (const MDOperand &MDO : llvm::drop_begin(RangeOrContainer: LoopID->operands())) { |
1095 | MDNode *MD = dyn_cast<MDNode>(Val: MDO); |
1096 | if (!MD) |
1097 | continue; |
1098 | |
1099 | MDString *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0)); |
1100 | if (!S) |
1101 | continue; |
1102 | |
1103 | if (Name == S->getString()) |
1104 | return MD; |
1105 | } |
1106 | return nullptr; |
1107 | } |
1108 | |