1//===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements some loop unrolling utilities for loops with run-time
10// trip counts. See LoopUnroll.cpp for unrolling loops with compile-time
11// trip counts.
12//
13// The functions in this file are used to generate extra code when the
14// run-time trip count modulo the unroll factor is not 0. When this is the
15// case, we need to generate code to execute these 'left over' iterations.
16//
17// The current strategy generates an if-then-else sequence prior to the
18// unrolled loop to execute the 'left over' iterations before or after the
19// unrolled loop.
20//
21//===----------------------------------------------------------------------===//
22
23#include "llvm/ADT/Statistic.h"
24#include "llvm/Analysis/DomTreeUpdater.h"
25#include "llvm/Analysis/InstructionSimplify.h"
26#include "llvm/Analysis/LoopIterator.h"
27#include "llvm/Analysis/ScalarEvolution.h"
28#include "llvm/Analysis/ValueTracking.h"
29#include "llvm/IR/BasicBlock.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/MDBuilder.h"
32#include "llvm/IR/Module.h"
33#include "llvm/IR/ProfDataUtils.h"
34#include "llvm/Support/CommandLine.h"
35#include "llvm/Support/Debug.h"
36#include "llvm/Support/raw_ostream.h"
37#include "llvm/Transforms/Utils/BasicBlockUtils.h"
38#include "llvm/Transforms/Utils/Cloning.h"
39#include "llvm/Transforms/Utils/Local.h"
40#include "llvm/Transforms/Utils/LoopUtils.h"
41#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
42#include "llvm/Transforms/Utils/UnrollLoop.h"
43#include <cmath>
44
45using namespace llvm;
46
47#define DEBUG_TYPE "loop-unroll"
48
49STATISTIC(NumRuntimeUnrolled,
50 "Number of loops unrolled with run-time trip counts");
51static cl::opt<bool> UnrollRuntimeMultiExit(
52 "unroll-runtime-multi-exit", cl::init(Val: false), cl::Hidden,
53 cl::desc("Allow runtime unrolling for loops with multiple exits, when "
54 "epilog is generated"));
55static cl::opt<bool> UnrollRuntimeOtherExitPredictable(
56 "unroll-runtime-other-exit-predictable", cl::init(Val: false), cl::Hidden,
57 cl::desc("Assume the non latch exit block to be predictable"));
58
59// Probability that the loop trip count is so small that after the prolog
60// we do not enter the unrolled loop at all.
61// It is unlikely that the loop trip count is smaller than the unroll factor;
62// other than that, the choice of constant is not tuned yet.
63static const uint32_t UnrolledLoopHeaderWeights[] = {1, 127};
64// Probability that the loop trip count is so small that we skip the unrolled
65// loop completely and immediately enter the epilogue loop.
66// It is unlikely that the loop trip count is smaller than the unroll factor;
67// other than that, the choice of constant is not tuned yet.
68static const uint32_t EpilogHeaderWeights[] = {1, 127};
69
70/// Connect the unrolling prolog code to the original loop.
71/// The unrolling prolog code contains code to execute the
72/// 'extra' iterations if the run-time trip count modulo the
73/// unroll count is non-zero.
74///
75/// This function performs the following:
76/// - Create PHI nodes at prolog end block to combine values
77/// that exit the prolog code and jump around the prolog.
78/// - Add a PHI operand to a PHI node at the loop exit block
79/// for values that exit the prolog and go around the loop.
80/// - Branch around the original loop if the trip count is less
81/// than the unroll factor.
82///
83static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
84 BasicBlock *PrologExit,
85 BasicBlock *OriginalLoopLatchExit,
86 BasicBlock *PreHeader, BasicBlock *NewPreHeader,
87 ValueToValueMapTy &VMap, DominatorTree *DT,
88 LoopInfo *LI, bool PreserveLCSSA,
89 ScalarEvolution &SE) {
90 // Loop structure should be the following:
91 // Preheader
92 // PrologHeader
93 // ...
94 // PrologLatch
95 // PrologExit
96 // NewPreheader
97 // Header
98 // ...
99 // Latch
100 // LatchExit
101 BasicBlock *Latch = L->getLoopLatch();
102 assert(Latch && "Loop must have a latch");
103 BasicBlock *PrologLatch = cast<BasicBlock>(Val&: VMap[Latch]);
104
105 // Create a PHI node for each outgoing value from the original loop
106 // (which means it is an outgoing value from the prolog code too).
107 // The new PHI node is inserted in the prolog end basic block.
108 // The new PHI node value is added as an operand of a PHI node in either
109 // the loop header or the loop exit block.
110 for (BasicBlock *Succ : successors(BB: Latch)) {
111 for (PHINode &PN : Succ->phis()) {
112 // Add a new PHI node to the prolog end block and add the
113 // appropriate incoming values.
114 // TODO: This code assumes that the PrologExit (or the LatchExit block for
115 // prolog loop) contains only one predecessor from the loop, i.e. the
116 // PrologLatch. When supporting multiple-exiting block loops, we can have
117 // two or more blocks that have the LatchExit as the target in the
118 // original loop.
119 PHINode *NewPN = PHINode::Create(Ty: PN.getType(), NumReservedValues: 2, NameStr: PN.getName() + ".unr");
120 NewPN->insertBefore(InsertPos: PrologExit->getFirstNonPHIIt());
121 // Adding a value to the new PHI node from the original loop preheader.
122 // This is the value that skips all the prolog code.
123 if (L->contains(Inst: &PN)) {
124 // Succ is loop header.
125 NewPN->addIncoming(V: PN.getIncomingValueForBlock(BB: NewPreHeader),
126 BB: PreHeader);
127 } else {
128 // Succ is LatchExit.
129 NewPN->addIncoming(V: PoisonValue::get(T: PN.getType()), BB: PreHeader);
130 }
131
132 Value *V = PN.getIncomingValueForBlock(BB: Latch);
133 if (Instruction *I = dyn_cast<Instruction>(Val: V)) {
134 if (L->contains(Inst: I)) {
135 V = VMap.lookup(Val: I);
136 }
137 }
138 // Adding a value to the new PHI node from the last prolog block
139 // that was created.
140 NewPN->addIncoming(V, BB: PrologLatch);
141
142 // Update the existing PHI node operand with the value from the
143 // new PHI node. How this is done depends on if the existing
144 // PHI node is in the original loop block, or the exit block.
145 if (L->contains(Inst: &PN))
146 PN.setIncomingValueForBlock(BB: NewPreHeader, V: NewPN);
147 else
148 PN.addIncoming(V: NewPN, BB: PrologExit);
149 SE.forgetLcssaPhiWithNewPredecessor(L, V: &PN);
150 }
151 }
152
153 // Make sure that created prolog loop is in simplified form
154 SmallVector<BasicBlock *, 4> PrologExitPreds;
155 Loop *PrologLoop = LI->getLoopFor(BB: PrologLatch);
156 if (PrologLoop) {
157 for (BasicBlock *PredBB : predecessors(BB: PrologExit))
158 if (PrologLoop->contains(BB: PredBB))
159 PrologExitPreds.push_back(Elt: PredBB);
160
161 SplitBlockPredecessors(BB: PrologExit, Preds: PrologExitPreds, Suffix: ".unr-lcssa", DT, LI,
162 MSSAU: nullptr, PreserveLCSSA);
163 }
164
165 // Create a branch around the original loop, which is taken if there are no
166 // iterations remaining to be executed after running the prologue.
167 Instruction *InsertPt = PrologExit->getTerminator();
168 IRBuilder<> B(InsertPt);
169
170 assert(Count != 0 && "nonsensical Count!");
171
172 // If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1)
173 // This means %xtraiter is (BECount + 1) and all of the iterations of this
174 // loop were executed by the prologue. Note that if BECount <u (Count - 1)
175 // then (BECount + 1) cannot unsigned-overflow.
176 Value *BrLoopExit =
177 B.CreateICmpULT(LHS: BECount, RHS: ConstantInt::get(Ty: BECount->getType(), V: Count - 1));
178 // Split the exit to maintain loop canonicalization guarantees
179 SmallVector<BasicBlock *, 4> Preds(predecessors(BB: OriginalLoopLatchExit));
180 SplitBlockPredecessors(BB: OriginalLoopLatchExit, Preds, Suffix: ".unr-lcssa", DT, LI,
181 MSSAU: nullptr, PreserveLCSSA);
182 // Add the branch to the exit block (around the unrolled loop)
183 MDNode *BranchWeights = nullptr;
184 if (hasBranchWeightMD(I: *Latch->getTerminator())) {
185 // Assume loop is nearly always entered.
186 MDBuilder MDB(B.getContext());
187 BranchWeights = MDB.createBranchWeights(Weights: UnrolledLoopHeaderWeights);
188 }
189 B.CreateCondBr(Cond: BrLoopExit, True: OriginalLoopLatchExit, False: NewPreHeader,
190 BranchWeights);
191 InsertPt->eraseFromParent();
192 if (DT) {
193 auto *NewDom = DT->findNearestCommonDominator(A: OriginalLoopLatchExit,
194 B: PrologExit);
195 DT->changeImmediateDominator(BB: OriginalLoopLatchExit, NewBB: NewDom);
196 }
197}
198
199/// Assume, due to our position in the remainder loop or its guard, anywhere
200/// from 0 to \p N more iterations can possibly execute. Among such cases in
201/// the original loop (with loop probability \p OriginalLoopProb), what is the
202/// probability of executing at least one more iteration?
203static BranchProbability
204probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) {
205 // OriginalLoopProb == 1 would produce a division by zero in the calculation
206 // below. The problem is that case indicates an always infinite loop, but a
207 // remainder loop cannot be calculated at run time if the original loop is
208 // infinite as infinity % UnrollCount is undefined. We then choose
209 // probabilities indicating that all remainder loop iterations will always
210 // execute.
211 //
212 // Currently, the remainder loop here is an epilogue, which cannot be reached
213 // if the original loop is infinite, so the aforementioned choice is
214 // arbitrary.
215 //
216 // FIXME: Branch weights still need to be fixed in the case of prologues
217 // (issue #135812). In that case, the aforementioned choice seems reasonable
218 // for the goal of maintaining the original loop's block frequencies. That
219 // is, an infinite loop's initial iterations are not skipped, and the prologue
220 // loop body might have unique blocks that execute a finite number of times
221 // if, for example, the original loop body contains conditionals like i <
222 // UnrollCount.
223 if (OriginalLoopProb == BranchProbability::getOne())
224 return BranchProbability::getOne();
225
226 // Each of these variables holds the original loop's probability that the
227 // number of iterations it will execute is some m in the specified range.
228 BranchProbability ProbOne = OriginalLoopProb; // 1 <= m
229 BranchProbability ProbTooMany = ProbOne.pow(N: N + 1); // N + 1 <= m
230 BranchProbability ProbNotTooMany = ProbTooMany.getCompl(); // 0 <= m <= N
231 BranchProbability ProbOneNotTooMany = ProbOne - ProbTooMany; // 1 <= m <= N
232 return ProbOneNotTooMany / ProbNotTooMany;
233}
234
235/// Connect the unrolling epilog code to the original loop.
236/// The unrolling epilog code contains code to execute the
237/// 'extra' iterations if the run-time trip count modulo the
238/// unroll count is non-zero.
239///
240/// This function performs the following:
241/// - Update PHI nodes at the epilog loop exit
242/// - Create PHI nodes at the unrolling loop exit and epilog preheader to
243/// combine values that exit the unrolling loop code and jump around it.
244/// - Update PHI operands in the epilog loop by the new PHI nodes
245/// - At the unrolling loop exit, branch around the epilog loop if extra iters
246// (ModVal) is zero.
247/// - At the epilog preheader, add an llvm.assume call that extra iters is
248/// non-zero. If the unrolling loop exit is the predecessor, the above new
249/// branch guarantees that assumption. If the unrolling loop preheader is the
250/// predecessor, then the required first iteration from the original loop has
251/// yet to be executed, so it must be executed in the epilog loop. If we
252/// later unroll the epilog loop, that llvm.assume call somehow enables
253/// ScalarEvolution to compute a epilog loop maximum trip count, which enables
254/// eliminating the branch at the end of the final unrolled epilog iteration.
255///
256static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
257 BasicBlock *Exit, BasicBlock *PreHeader,
258 BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
259 ValueToValueMapTy &VMap, DominatorTree *DT,
260 LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE,
261 unsigned Count, AssumptionCache &AC,
262 BranchProbability OriginalLoopProb) {
263 BasicBlock *Latch = L->getLoopLatch();
264 assert(Latch && "Loop must have a latch");
265 BasicBlock *EpilogLatch = cast<BasicBlock>(Val&: VMap[Latch]);
266
267 // Loop structure should be the following:
268 //
269 // PreHeader
270 // NewPreHeader
271 // Header
272 // ...
273 // Latch
274 // NewExit (PN)
275 // EpilogPreHeader
276 // EpilogHeader
277 // ...
278 // EpilogLatch
279 // Exit (EpilogPN)
280
281 // Update PHI nodes at Exit.
282 for (PHINode &PN : NewExit->phis()) {
283 // PN should be used in another PHI located in Exit block as
284 // Exit was split by SplitBlockPredecessors into Exit and NewExit
285 // Basically it should look like:
286 // NewExit:
287 // PN = PHI [I, Latch]
288 // ...
289 // Exit:
290 // EpilogPN = PHI [PN, EpilogPreHeader], [X, Exit2], [Y, Exit2.epil]
291 //
292 // Exits from non-latch blocks point to the original exit block and the
293 // epilogue edges have already been added.
294 //
295 // There is EpilogPreHeader incoming block instead of NewExit as
296 // NewExit was split 1 more time to get EpilogPreHeader.
297 assert(PN.hasOneUse() && "The phi should have 1 use");
298 PHINode *EpilogPN = cast<PHINode>(Val: PN.use_begin()->getUser());
299 assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
300
301 Value *V = PN.getIncomingValueForBlock(BB: Latch);
302 Instruction *I = dyn_cast<Instruction>(Val: V);
303 if (I && L->contains(Inst: I))
304 // If value comes from an instruction in the loop add VMap value.
305 V = VMap.lookup(Val: I);
306 // For the instruction out of the loop, constant or undefined value
307 // insert value itself.
308 EpilogPN->addIncoming(V, BB: EpilogLatch);
309
310 assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
311 "EpilogPN should have EpilogPreHeader incoming block");
312 // Change EpilogPreHeader incoming block to NewExit.
313 EpilogPN->setIncomingBlock(i: EpilogPN->getBasicBlockIndex(BB: EpilogPreHeader),
314 BB: NewExit);
315 // Now PHIs should look like:
316 // NewExit:
317 // PN = PHI [I, Latch]
318 // ...
319 // Exit:
320 // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
321 }
322
323 // Create PHI nodes at NewExit (from the unrolling loop Latch) and at
324 // EpilogPreHeader (from PreHeader and NewExit). Update corresponding PHI
325 // nodes in epilog loop.
326 for (BasicBlock *Succ : successors(BB: Latch)) {
327 // Skip this as we already updated phis in exit blocks.
328 if (!L->contains(BB: Succ))
329 continue;
330
331 // Succ here appears to always be just L->getHeader(). Otherwise, how do we
332 // know its corresponding epilog block (from VMap) is EpilogHeader and thus
333 // EpilogPreHeader is the right incoming block for VPN, as set below?
334 // TODO: Can we thus avoid the enclosing loop over successors?
335 assert(Succ == L->getHeader() &&
336 "Expect the only in-loop successor of latch to be the loop header");
337
338 for (PHINode &PN : Succ->phis()) {
339 // Add new PHI nodes to the loop exit block.
340 PHINode *NewPN0 = PHINode::Create(Ty: PN.getType(), /*NumReservedValues=*/1,
341 NameStr: PN.getName() + ".unr");
342 NewPN0->insertBefore(InsertPos: NewExit->getFirstNonPHIIt());
343 // Add value to the new PHI node from the unrolling loop latch.
344 NewPN0->addIncoming(V: PN.getIncomingValueForBlock(BB: Latch), BB: Latch);
345
346 // Add new PHI nodes to EpilogPreHeader.
347 PHINode *NewPN1 = PHINode::Create(Ty: PN.getType(), /*NumReservedValues=*/2,
348 NameStr: PN.getName() + ".epil.init");
349 NewPN1->insertBefore(InsertPos: EpilogPreHeader->getFirstNonPHIIt());
350 // Add value to the new PHI node from the unrolling loop preheader.
351 NewPN1->addIncoming(V: PN.getIncomingValueForBlock(BB: NewPreHeader), BB: PreHeader);
352 // Add value to the new PHI node from the epilog loop guard.
353 NewPN1->addIncoming(V: NewPN0, BB: NewExit);
354
355 // Update the existing PHI node operand with the value from the new PHI
356 // node. Corresponding instruction in epilog loop should be PHI.
357 PHINode *VPN = cast<PHINode>(Val&: VMap[&PN]);
358 VPN->setIncomingValueForBlock(BB: EpilogPreHeader, V: NewPN1);
359 }
360 }
361
362 // In NewExit, branch around the epilog loop if no extra iters.
363 Instruction *InsertPt = NewExit->getTerminator();
364 IRBuilder<> B(InsertPt);
365 Value *BrLoopExit = B.CreateIsNotNull(Arg: ModVal, Name: "lcmp.mod");
366 assert(Exit && "Loop must have a single exit block only");
367 // Split the epilogue exit to maintain loop canonicalization guarantees
368 SmallVector<BasicBlock*, 4> Preds(predecessors(BB: Exit));
369 SplitBlockPredecessors(BB: Exit, Preds, Suffix: ".epilog-lcssa", DT, LI, MSSAU: nullptr,
370 PreserveLCSSA);
371 // Add the branch to the exit block (around the epilog loop)
372 MDNode *BranchWeights = nullptr;
373 if (OriginalLoopProb.isUnknown() &&
374 hasBranchWeightMD(I: *Latch->getTerminator())) {
375 // Assume equal distribution in interval [0, Count).
376 MDBuilder MDB(B.getContext());
377 BranchWeights = MDB.createBranchWeights(TrueWeight: 1, FalseWeight: Count - 1);
378 }
379 BranchInst *RemainderLoopGuard =
380 B.CreateCondBr(Cond: BrLoopExit, True: EpilogPreHeader, False: Exit, BranchWeights);
381 if (!OriginalLoopProb.isUnknown()) {
382 setBranchProbability(B: RemainderLoopGuard,
383 P: probOfNextInRemainder(OriginalLoopProb, N: Count - 1),
384 /*ForFirstTarget=*/true);
385 }
386 InsertPt->eraseFromParent();
387 if (DT) {
388 auto *NewDom = DT->findNearestCommonDominator(A: Exit, B: NewExit);
389 DT->changeImmediateDominator(BB: Exit, NewBB: NewDom);
390 }
391
392 // In EpilogPreHeader, assume extra iters is non-zero.
393 IRBuilder<> B2(EpilogPreHeader, EpilogPreHeader->getFirstNonPHIIt());
394 Value *ModIsNotNull = B2.CreateIsNotNull(Arg: ModVal, Name: "lcmp.mod");
395 AssumeInst *AI = cast<AssumeInst>(Val: B2.CreateAssumption(Cond: ModIsNotNull));
396 AC.registerAssumption(CI: AI);
397}
398
399/// Create a clone of the blocks in a loop and connect them together. A new
400/// loop will be created including all cloned blocks, and the iterator of the
401/// new loop switched to count NewIter down to 0.
402/// The cloned blocks should be inserted between InsertTop and InsertBot.
403/// InsertTop should be new preheader, InsertBot new loop exit.
404/// Returns the new cloned loop that is created.
405static Loop *CloneLoopBlocks(Loop *L, Value *NewIter,
406 const bool UseEpilogRemainder,
407 const bool UnrollRemainder, BasicBlock *InsertTop,
408 BasicBlock *InsertBot, BasicBlock *Preheader,
409 std::vector<BasicBlock *> &NewBlocks,
410 LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
411 DominatorTree *DT, LoopInfo *LI, unsigned Count,
412 std::optional<unsigned> OriginalTripCount,
413 BranchProbability OriginalLoopProb) {
414 StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
415 BasicBlock *Header = L->getHeader();
416 BasicBlock *Latch = L->getLoopLatch();
417 Function *F = Header->getParent();
418 LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
419 LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
420 Loop *ParentLoop = L->getParentLoop();
421 NewLoopsMap NewLoops;
422 NewLoops[ParentLoop] = ParentLoop;
423
424 // For each block in the original loop, create a new copy,
425 // and update the value map with the newly created values.
426 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
427 BasicBlock *NewBB = CloneBasicBlock(BB: *BB, VMap, NameSuffix: "." + suffix, F);
428 NewBlocks.push_back(x: NewBB);
429
430 addClonedBlockToLoopInfo(OriginalBB: *BB, ClonedBB: NewBB, LI, NewLoops);
431
432 VMap[*BB] = NewBB;
433 if (Header == *BB) {
434 // For the first block, add a CFG connection to this newly
435 // created block.
436 InsertTop->getTerminator()->setSuccessor(Idx: 0, BB: NewBB);
437 }
438
439 if (DT) {
440 if (Header == *BB) {
441 // The header is dominated by the preheader.
442 DT->addNewBlock(BB: NewBB, DomBB: InsertTop);
443 } else {
444 // Copy information from original loop to unrolled loop.
445 BasicBlock *IDomBB = DT->getNode(BB: *BB)->getIDom()->getBlock();
446 DT->addNewBlock(BB: NewBB, DomBB: cast<BasicBlock>(Val&: VMap[IDomBB]));
447 }
448 }
449
450 if (Latch == *BB) {
451 // For the last block, create a loop back to cloned head.
452 VMap.erase(Val: (*BB)->getTerminator());
453 // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count.
454 // Subtle: NewIter can be 0 if we wrapped when computing the trip count,
455 // thus we must compare the post-increment (wrapping) value.
456 BasicBlock *FirstLoopBB = cast<BasicBlock>(Val&: VMap[Header]);
457 BranchInst *LatchBR = cast<BranchInst>(Val: NewBB->getTerminator());
458 IRBuilder<> Builder(LatchBR);
459 PHINode *NewIdx =
460 PHINode::Create(Ty: NewIter->getType(), NumReservedValues: 2, NameStr: suffix + ".iter");
461 NewIdx->insertBefore(InsertPos: FirstLoopBB->getFirstNonPHIIt());
462 auto *Zero = ConstantInt::get(Ty: NewIdx->getType(), V: 0);
463 auto *One = ConstantInt::get(Ty: NewIdx->getType(), V: 1);
464 Value *IdxNext =
465 Builder.CreateAdd(LHS: NewIdx, RHS: One, Name: NewIdx->getName() + ".next");
466 Value *IdxCmp = Builder.CreateICmpNE(LHS: IdxNext, RHS: NewIter, Name: NewIdx->getName() + ".cmp");
467 MDNode *BranchWeights = nullptr;
468 if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) &&
469 hasBranchWeightMD(I: *LatchBR)) {
470 uint32_t ExitWeight;
471 uint32_t BackEdgeWeight;
472 if (Count >= 3) {
473 // Note: We do not enter this loop for zero-remainders. The check
474 // is at the end of the loop. We assume equal distribution between
475 // possible remainders in [1, Count).
476 ExitWeight = 1;
477 BackEdgeWeight = (Count - 2) / 2;
478 } else {
479 // Unnecessary backedge, should never be taken. The conditional
480 // jump should be optimized away later.
481 ExitWeight = 1;
482 BackEdgeWeight = 0;
483 }
484 MDBuilder MDB(Builder.getContext());
485 BranchWeights = MDB.createBranchWeights(TrueWeight: BackEdgeWeight, FalseWeight: ExitWeight);
486 }
487 BranchInst *RemainderLoopLatch =
488 Builder.CreateCondBr(Cond: IdxCmp, True: FirstLoopBB, False: InsertBot, BranchWeights);
489 if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
490 // Compute the total frequency of the original loop body from the
491 // remainder iterations. Once we've reached them, the first of them
492 // always executes, so its frequency and probability are 1.
493 double FreqRemIters = 1;
494 if (Count > 2) {
495 BranchProbability ProbReaching = BranchProbability::getOne();
496 for (unsigned N = Count - 2; N >= 1; --N) {
497 ProbReaching *= probOfNextInRemainder(OriginalLoopProb, N);
498 FreqRemIters += ProbReaching.toDouble();
499 }
500 }
501 // Solve for the loop probability that would produce that frequency.
502 // Sum(i=0..inf)(Prob^i) = 1/(1-Prob) = FreqRemIters.
503 BranchProbability Prob =
504 BranchProbability::getBranchProbability(Prob: 1 - 1 / FreqRemIters);
505 setBranchProbability(B: RemainderLoopLatch, P: Prob, /*ForFirstTarget=*/true);
506 }
507 NewIdx->addIncoming(V: Zero, BB: InsertTop);
508 NewIdx->addIncoming(V: IdxNext, BB: NewBB);
509 LatchBR->eraseFromParent();
510 }
511 }
512
513 // Change the incoming values to the ones defined in the preheader or
514 // cloned loop.
515 for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(Val: I); ++I) {
516 PHINode *NewPHI = cast<PHINode>(Val&: VMap[&*I]);
517 unsigned idx = NewPHI->getBasicBlockIndex(BB: Preheader);
518 NewPHI->setIncomingBlock(i: idx, BB: InsertTop);
519 BasicBlock *NewLatch = cast<BasicBlock>(Val&: VMap[Latch]);
520 idx = NewPHI->getBasicBlockIndex(BB: Latch);
521 Value *InVal = NewPHI->getIncomingValue(i: idx);
522 NewPHI->setIncomingBlock(i: idx, BB: NewLatch);
523 if (Value *V = VMap.lookup(Val: InVal))
524 NewPHI->setIncomingValue(i: idx, V);
525 }
526
527 Loop *NewLoop = NewLoops[L];
528 assert(NewLoop && "L should have been cloned");
529
530 if (OriginalTripCount && UseEpilogRemainder)
531 setLoopEstimatedTripCount(L: NewLoop, EstimatedTripCount: *OriginalTripCount % Count);
532
533 // Add unroll disable metadata to disable future unrolling for this loop.
534 if (!UnrollRemainder)
535 NewLoop->setLoopAlreadyUnrolled();
536 return NewLoop;
537}
538
539/// Returns true if we can profitably unroll the multi-exit loop L.
540static bool canProfitablyRuntimeUnrollMultiExitLoop(
541 Loop *L, const TargetTransformInfo *TTI,
542 SmallVectorImpl<BasicBlock *> &OtherExits, BasicBlock *LatchExit,
543 bool UseEpilogRemainder) {
544
545 // The main pain point with multi-exit loop unrolling is that once unrolled,
546 // we will not be able to merge all blocks into a straight line code.
547 // There are branches within the unrolled loop that go to the OtherExits.
548 // The second point is the increase in code size, but this is true
549 // irrespective of multiple exits.
550
551 // Note: Both the heuristics below are coarse grained. We are essentially
552 // enabling unrolling of loops that have a single side exit other than the
553 // normal LatchExit (i.e. exiting into a deoptimize block).
554 // The heuristics considered are:
555 // 1. low number of branches in the unrolled version.
556 // 2. high predictability of these extra branches.
557 // We avoid unrolling loops that have more than two exiting blocks. This
558 // limits the total number of branches in the unrolled loop to be atmost
559 // the unroll factor (since one of the exiting blocks is the latch block).
560 SmallVector<BasicBlock*, 4> ExitingBlocks;
561 L->getExitingBlocks(ExitingBlocks);
562 if (ExitingBlocks.size() > 2)
563 return false;
564
565 // Allow unrolling of loops with no non latch exit blocks.
566 if (OtherExits.size() == 0)
567 return true;
568
569 if (OtherExits.size() != 1)
570 return false;
571
572 // When UnrollRuntimeOtherExitPredictable is specified, we assume the other
573 // exit branch is predictable even if it has no deoptimize call.
574 if (UnrollRuntimeOtherExitPredictable)
575 return true;
576
577 // The second heuristic is that L has one exit other than the latchexit and
578 // that exit is highly unlikely.
579 if (TTI) {
580 BasicBlock *LatchBB = L->getLoopLatch();
581 assert(LatchBB && "Expected loop to have a latch");
582 BasicBlock *NonLatchExitingBlock =
583 (ExitingBlocks[0] == LatchBB) ? ExitingBlocks[1] : ExitingBlocks[0];
584 auto BranchProb =
585 llvm::getBranchProbability(Src: NonLatchExitingBlock, Dst: OtherExits[0]);
586 // If BranchProbability could not be extracted (returns unknown), then
587 // don't return and do the check for deopt block.
588 if (!BranchProb.isUnknown()) {
589 auto Threshold = TTI->getPredictableBranchThreshold().getCompl();
590 return BranchProb < Threshold;
591 }
592 }
593
594 // We know that deoptimize blocks are rarely taken, which also implies the
595 // branch leading to the deoptimize block is highly unlikely.
596 return OtherExits[0]->getPostdominatingDeoptimizeCall();
597 // TODO: These can be fine-tuned further to consider code size or deopt states
598 // that are captured by the deoptimize exit block.
599 // Also, we can extend this to support more cases, if we actually
600 // know of kinds of multiexit loops that would benefit from unrolling.
601}
602
603/// Calculate ModVal = (BECount + 1) % Count on the abstract integer domain
604/// accounting for the possibility of unsigned overflow in the 2s complement
605/// domain. Preconditions:
606/// 1) TripCount = BECount + 1 (allowing overflow)
607/// 2) Log2(Count) <= BitWidth(BECount)
608static Value *CreateTripRemainder(IRBuilder<> &B, Value *BECount,
609 Value *TripCount, unsigned Count) {
610 // Note that TripCount is BECount + 1.
611 if (isPowerOf2_32(Value: Count))
612 // If the expression is zero, then either:
613 // 1. There are no iterations to be run in the prolog/epilog loop.
614 // OR
615 // 2. The addition computing TripCount overflowed.
616 //
617 // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
618 // the number of iterations that remain to be run in the original loop is a
619 // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (a
620 // precondition of this method).
621 return B.CreateAnd(LHS: TripCount, RHS: Count - 1, Name: "xtraiter");
622
623 // As (BECount + 1) can potentially unsigned overflow we count
624 // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count.
625 Constant *CountC = ConstantInt::get(Ty: BECount->getType(), V: Count);
626 Value *ModValTmp = B.CreateURem(LHS: BECount, RHS: CountC);
627 Value *ModValAdd = B.CreateAdd(LHS: ModValTmp,
628 RHS: ConstantInt::get(Ty: ModValTmp->getType(), V: 1));
629 // At that point (BECount % Count) + 1 could be equal to Count.
630 // To handle this case we need to take mod by Count one more time.
631 return B.CreateURem(LHS: ModValAdd, RHS: CountC, Name: "xtraiter");
632}
633
634
635/// Insert code in the prolog/epilog code when unrolling a loop with a
636/// run-time trip-count.
637///
638/// This method assumes that the loop unroll factor is total number
639/// of loop bodies in the loop after unrolling. (Some folks refer
640/// to the unroll factor as the number of *extra* copies added).
641/// We assume also that the loop unroll factor is a power-of-two. So, after
642/// unrolling the loop, the number of loop bodies executed is 2,
643/// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch
644/// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
645/// the switch instruction is generated.
646///
647/// ***Prolog case***
648/// extraiters = tripcount % loopfactor
649/// if (extraiters == 0) jump Loop:
650/// else jump Prol:
651/// Prol: LoopBody;
652/// extraiters -= 1 // Omitted if unroll factor is 2.
653/// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
654/// if (tripcount < loopfactor) jump End:
655/// Loop:
656/// ...
657/// End:
658///
659/// ***Epilog case***
660/// extraiters = tripcount % loopfactor
661/// if (tripcount < loopfactor) jump LoopExit:
662/// unroll_iters = tripcount - extraiters
663/// Loop: LoopBody; (executes unroll_iter times);
664/// unroll_iter -= 1
665/// if (unroll_iter != 0) jump Loop:
666/// LoopExit:
667/// if (extraiters == 0) jump EpilExit:
668/// Epil: LoopBody; (executes extraiters times)
669/// extraiters -= 1 // Omitted if unroll factor is 2.
670/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
671/// EpilExit:
672
673bool llvm::UnrollRuntimeLoopRemainder(
674 Loop *L, unsigned Count, bool AllowExpensiveTripCount,
675 bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
676 LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
677 const TargetTransformInfo *TTI, bool PreserveLCSSA,
678 unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit,
679 Loop **ResultLoop, std::optional<unsigned> OriginalTripCount,
680 BranchProbability OriginalLoopProb) {
681 LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
682 LLVM_DEBUG(L->dump());
683 LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
684 : dbgs() << "Using prolog remainder.\n");
685
686 // Make sure the loop is in canonical form.
687 if (!L->isLoopSimplifyForm()) {
688 LLVM_DEBUG(dbgs() << "Not in simplify form!\n");
689 return false;
690 }
691
692 // Guaranteed by LoopSimplifyForm.
693 BasicBlock *Latch = L->getLoopLatch();
694 BasicBlock *Header = L->getHeader();
695
696 BranchInst *LatchBR = cast<BranchInst>(Val: Latch->getTerminator());
697
698 if (!LatchBR || LatchBR->isUnconditional()) {
699 // The loop-rotate pass can be helpful to avoid this in many cases.
700 LLVM_DEBUG(
701 dbgs()
702 << "Loop latch not terminated by a conditional branch.\n");
703 return false;
704 }
705
706 unsigned ExitIndex = LatchBR->getSuccessor(i: 0) == Header ? 1 : 0;
707 BasicBlock *LatchExit = LatchBR->getSuccessor(i: ExitIndex);
708
709 if (L->contains(BB: LatchExit)) {
710 // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
711 // targets of the Latch be an exit block out of the loop.
712 LLVM_DEBUG(
713 dbgs()
714 << "One of the loop latch successors must be the exit block.\n");
715 return false;
716 }
717
718 // These are exit blocks other than the target of the latch exiting block.
719 SmallVector<BasicBlock *, 4> OtherExits;
720 L->getUniqueNonLatchExitBlocks(ExitBlocks&: OtherExits);
721 // Support only single exit and exiting block unless multi-exit loop
722 // unrolling is enabled.
723 if (!L->getExitingBlock() || OtherExits.size()) {
724 // We rely on LCSSA form being preserved when the exit blocks are transformed.
725 // (Note that only an off-by-default mode of the old PM disables PreserveLCCA.)
726 if (!PreserveLCSSA)
727 return false;
728
729 // Priority goes to UnrollRuntimeMultiExit if it's supplied.
730 if (UnrollRuntimeMultiExit.getNumOccurrences()) {
731 if (!UnrollRuntimeMultiExit)
732 return false;
733 } else {
734 // Otherwise perform multi-exit unrolling, if either the target indicates
735 // it is profitable or the general profitability heuristics apply.
736 if (!RuntimeUnrollMultiExit &&
737 !canProfitablyRuntimeUnrollMultiExitLoop(
738 L, TTI, OtherExits, LatchExit, UseEpilogRemainder)) {
739 LLVM_DEBUG(dbgs() << "Multiple exit/exiting blocks in loop and "
740 "multi-exit unrolling not enabled!\n");
741 return false;
742 }
743 }
744 }
745 // Use Scalar Evolution to compute the trip count. This allows more loops to
746 // be unrolled than relying on induction var simplification.
747 if (!SE)
748 return false;
749
750 // Only unroll loops with a computable trip count.
751 // We calculate the backedge count by using getExitCount on the Latch block,
752 // which is proven to be the only exiting block in this loop. This is same as
753 // calculating getBackedgeTakenCount on the loop (which computes SCEV for all
754 // exiting blocks).
755 const SCEV *BECountSC = SE->getExitCount(L, ExitingBlock: Latch);
756 if (isa<SCEVCouldNotCompute>(Val: BECountSC)) {
757 LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
758 return false;
759 }
760
761 unsigned BEWidth = cast<IntegerType>(Val: BECountSC->getType())->getBitWidth();
762
763 // Add 1 since the backedge count doesn't include the first loop iteration.
764 // (Note that overflow can occur, this is handled explicitly below)
765 const SCEV *TripCountSC =
766 SE->getAddExpr(LHS: BECountSC, RHS: SE->getConstant(Ty: BECountSC->getType(), V: 1));
767 if (isa<SCEVCouldNotCompute>(Val: TripCountSC)) {
768 LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
769 return false;
770 }
771
772 BasicBlock *PreHeader = L->getLoopPreheader();
773 BranchInst *PreHeaderBR = cast<BranchInst>(Val: PreHeader->getTerminator());
774 SCEVExpander Expander(*SE, "loop-unroll");
775 if (!AllowExpensiveTripCount &&
776 Expander.isHighCostExpansion(Exprs: TripCountSC, L, Budget: SCEVExpansionBudget, TTI,
777 At: PreHeaderBR)) {
778 LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
779 return false;
780 }
781
782 // This constraint lets us deal with an overflowing trip count easily; see the
783 // comment on ModVal below.
784 if (Log2_32(Value: Count) > BEWidth) {
785 LLVM_DEBUG(
786 dbgs()
787 << "Count failed constraint on overflow trip count calculation.\n");
788 return false;
789 }
790
791 // Loop structure is the following:
792 //
793 // PreHeader
794 // Header
795 // ...
796 // Latch
797 // LatchExit
798
799 BasicBlock *NewPreHeader;
800 BasicBlock *NewExit = nullptr;
801 BasicBlock *PrologExit = nullptr;
802 BasicBlock *EpilogPreHeader = nullptr;
803 BasicBlock *PrologPreHeader = nullptr;
804
805 if (UseEpilogRemainder) {
806 // If epilog remainder
807 // Split PreHeader to insert a branch around loop for unrolling.
808 NewPreHeader = SplitBlock(Old: PreHeader, SplitPt: PreHeader->getTerminator(), DT, LI);
809 NewPreHeader->setName(PreHeader->getName() + ".new");
810 // Split LatchExit to create phi nodes from branch above.
811 NewExit = SplitBlockPredecessors(BB: LatchExit, Preds: {Latch}, Suffix: ".unr-lcssa", DT, LI,
812 MSSAU: nullptr, PreserveLCSSA);
813 // NewExit gets its DebugLoc from LatchExit, which is not part of the
814 // original Loop.
815 // Fix this by setting Loop's DebugLoc to NewExit.
816 auto *NewExitTerminator = NewExit->getTerminator();
817 NewExitTerminator->setDebugLoc(Header->getTerminator()->getDebugLoc());
818 // Split NewExit to insert epilog remainder loop.
819 EpilogPreHeader = SplitBlock(Old: NewExit, SplitPt: NewExitTerminator, DT, LI);
820 EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
821
822 // If the latch exits from multiple level of nested loops, then
823 // by assumption there must be another loop exit which branches to the
824 // outer loop and we must adjust the loop for the newly inserted blocks
825 // to account for the fact that our epilogue is still in the same outer
826 // loop. Note that this leaves loopinfo temporarily out of sync with the
827 // CFG until the actual epilogue loop is inserted.
828 if (auto *ParentL = L->getParentLoop())
829 if (LI->getLoopFor(BB: LatchExit) != ParentL) {
830 LI->removeBlock(BB: NewExit);
831 ParentL->addBasicBlockToLoop(NewBB: NewExit, LI&: *LI);
832 LI->removeBlock(BB: EpilogPreHeader);
833 ParentL->addBasicBlockToLoop(NewBB: EpilogPreHeader, LI&: *LI);
834 }
835
836 } else {
837 // If prolog remainder
838 // Split the original preheader twice to insert prolog remainder loop
839 PrologPreHeader = SplitEdge(From: PreHeader, To: Header, DT, LI);
840 PrologPreHeader->setName(Header->getName() + ".prol.preheader");
841 PrologExit = SplitBlock(Old: PrologPreHeader, SplitPt: PrologPreHeader->getTerminator(),
842 DT, LI);
843 PrologExit->setName(Header->getName() + ".prol.loopexit");
844 // Split PrologExit to get NewPreHeader.
845 NewPreHeader = SplitBlock(Old: PrologExit, SplitPt: PrologExit->getTerminator(), DT, LI);
846 NewPreHeader->setName(PreHeader->getName() + ".new");
847 }
848 // Loop structure should be the following:
849 // Epilog Prolog
850 //
851 // PreHeader PreHeader
852 // *NewPreHeader *PrologPreHeader
853 // Header *PrologExit
854 // ... *NewPreHeader
855 // Latch Header
856 // *NewExit ...
857 // *EpilogPreHeader Latch
858 // LatchExit LatchExit
859
860 // Calculate conditions for branch around loop for unrolling
861 // in epilog case and around prolog remainder loop in prolog case.
862 // Compute the number of extra iterations required, which is:
863 // extra iterations = run-time trip count % loop unroll factor
864 PreHeaderBR = cast<BranchInst>(Val: PreHeader->getTerminator());
865 IRBuilder<> B(PreHeaderBR);
866 Value *TripCount = Expander.expandCodeFor(SH: TripCountSC, Ty: TripCountSC->getType(),
867 I: PreHeaderBR);
868 Value *BECount;
869 // If there are other exits before the latch, that may cause the latch exit
870 // branch to never be executed, and the latch exit count may be poison.
871 // In this case, freeze the TripCount and base BECount on the frozen
872 // TripCount. We will introduce two branches using these values, and it's
873 // important that they see a consistent value (which would not be guaranteed
874 // if were frozen independently.)
875 if ((!OtherExits.empty() || !SE->loopHasNoAbnormalExits(L)) &&
876 !isGuaranteedNotToBeUndefOrPoison(V: TripCount, AC, CtxI: PreHeaderBR, DT)) {
877 TripCount = B.CreateFreeze(V: TripCount);
878 BECount =
879 B.CreateAdd(LHS: TripCount, RHS: Constant::getAllOnesValue(Ty: TripCount->getType()));
880 } else {
881 // If we don't need to freeze, use SCEVExpander for BECount as well, to
882 // allow slightly better value reuse.
883 BECount =
884 Expander.expandCodeFor(SH: BECountSC, Ty: BECountSC->getType(), I: PreHeaderBR);
885 }
886
887 Value * const ModVal = CreateTripRemainder(B, BECount, TripCount, Count);
888
889 Value *BranchVal =
890 UseEpilogRemainder ? B.CreateICmpULT(LHS: BECount,
891 RHS: ConstantInt::get(Ty: BECount->getType(),
892 V: Count - 1)) :
893 B.CreateIsNotNull(Arg: ModVal, Name: "lcmp.mod");
894 BasicBlock *RemainderLoop =
895 UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
896 BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
897 // Branch to either remainder (extra iterations) loop or unrolling loop.
898 MDNode *BranchWeights = nullptr;
899 if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) &&
900 hasBranchWeightMD(I: *Latch->getTerminator())) {
901 // Assume loop is nearly always entered.
902 MDBuilder MDB(B.getContext());
903 BranchWeights = MDB.createBranchWeights(Weights: EpilogHeaderWeights);
904 }
905 BranchInst *UnrollingLoopGuard =
906 B.CreateCondBr(Cond: BranchVal, True: RemainderLoop, False: UnrollingLoop, BranchWeights);
907 if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
908 // The original loop's first iteration always happens. Compute the
909 // probability of the original loop executing Count-1 iterations after that
910 // to complete the first iteration of the unrolled loop.
911 BranchProbability ProbOne = OriginalLoopProb;
912 BranchProbability ProbRest = ProbOne.pow(N: Count - 1);
913 setBranchProbability(B: UnrollingLoopGuard, P: ProbRest,
914 /*ForFirstTarget=*/false);
915 }
916 PreHeaderBR->eraseFromParent();
917 if (DT) {
918 if (UseEpilogRemainder)
919 DT->changeImmediateDominator(BB: EpilogPreHeader, NewBB: PreHeader);
920 else
921 DT->changeImmediateDominator(BB: PrologExit, NewBB: PreHeader);
922 }
923 Function *F = Header->getParent();
924 // Get an ordered list of blocks in the loop to help with the ordering of the
925 // cloned blocks in the prolog/epilog code
926 LoopBlocksDFS LoopBlocks(L);
927 LoopBlocks.perform(LI);
928
929 //
930 // For each extra loop iteration, create a copy of the loop's basic blocks
931 // and generate a condition that branches to the copy depending on the
932 // number of 'left over' iterations.
933 //
934 std::vector<BasicBlock *> NewBlocks;
935 ValueToValueMapTy VMap;
936
937 // Clone all the basic blocks in the loop. If Count is 2, we don't clone
938 // the loop, otherwise we create a cloned loop to execute the extra
939 // iterations. This function adds the appropriate CFG connections.
940 BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
941 BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
942 Loop *remainderLoop =
943 CloneLoopBlocks(L, NewIter: ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop,
944 InsertBot, Preheader: NewPreHeader, NewBlocks, LoopBlocks, VMap, DT,
945 LI, Count, OriginalTripCount, OriginalLoopProb);
946
947 // Insert the cloned blocks into the function.
948 F->splice(ToIt: InsertBot->getIterator(), FromF: F, FromBeginIt: NewBlocks[0]->getIterator(), FromEndIt: F->end());
949
950 // Now the loop blocks are cloned and the other exiting blocks from the
951 // remainder are connected to the original Loop's exit blocks. The remaining
952 // work is to update the phi nodes in the original loop, and take in the
953 // values from the cloned region.
954 for (auto *BB : OtherExits) {
955 // Given we preserve LCSSA form, we know that the values used outside the
956 // loop will be used through these phi nodes at the exit blocks that are
957 // transformed below.
958 for (PHINode &PN : BB->phis()) {
959 unsigned oldNumOperands = PN.getNumIncomingValues();
960 // Add the incoming values from the remainder code to the end of the phi
961 // node.
962 for (unsigned i = 0; i < oldNumOperands; i++){
963 auto *PredBB =PN.getIncomingBlock(i);
964 if (PredBB == Latch)
965 // The latch exit is handled separately, see connectX
966 continue;
967 if (!L->contains(BB: PredBB))
968 // Even if we had dedicated exits, the code above inserted an
969 // extra branch which can reach the latch exit.
970 continue;
971
972 auto *V = PN.getIncomingValue(i);
973 if (Instruction *I = dyn_cast<Instruction>(Val: V))
974 if (L->contains(Inst: I))
975 V = VMap.lookup(Val: I);
976 PN.addIncoming(V, BB: cast<BasicBlock>(Val&: VMap[PredBB]));
977 }
978 }
979#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
980 for (BasicBlock *SuccBB : successors(BB)) {
981 assert(!(llvm::is_contained(OtherExits, SuccBB) || SuccBB == LatchExit) &&
982 "Breaks the definition of dedicated exits!");
983 }
984#endif
985 }
986
987 // Update the immediate dominator of the exit blocks and blocks that are
988 // reachable from the exit blocks. This is needed because we now have paths
989 // from both the original loop and the remainder code reaching the exit
990 // blocks. While the IDom of these exit blocks were from the original loop,
991 // now the IDom is the preheader (which decides whether the original loop or
992 // remainder code should run) unless the block still has just the original
993 // predecessor (such as NewExit in the case of an epilog remainder).
994 if (DT && !L->getExitingBlock()) {
995 SmallVector<BasicBlock *, 16> ChildrenToUpdate;
996 // NB! We have to examine the dom children of all loop blocks, not just
997 // those which are the IDom of the exit blocks. This is because blocks
998 // reachable from the exit blocks can have their IDom as the nearest common
999 // dominator of the exit blocks.
1000 for (auto *BB : L->blocks()) {
1001 auto *DomNodeBB = DT->getNode(BB);
1002 for (auto *DomChild : DomNodeBB->children()) {
1003 auto *DomChildBB = DomChild->getBlock();
1004 if (!L->contains(L: LI->getLoopFor(BB: DomChildBB)) &&
1005 DomChildBB->getUniquePredecessor() != BB)
1006 ChildrenToUpdate.push_back(Elt: DomChildBB);
1007 }
1008 }
1009 for (auto *BB : ChildrenToUpdate)
1010 DT->changeImmediateDominator(BB, NewBB: PreHeader);
1011 }
1012
1013 // Loop structure should be the following:
1014 // Epilog Prolog
1015 //
1016 // PreHeader PreHeader
1017 // NewPreHeader PrologPreHeader
1018 // Header PrologHeader
1019 // ... ...
1020 // Latch PrologLatch
1021 // NewExit PrologExit
1022 // EpilogPreHeader NewPreHeader
1023 // EpilogHeader Header
1024 // ... ...
1025 // EpilogLatch Latch
1026 // LatchExit LatchExit
1027
1028 // Rewrite the cloned instruction operands to use the values created when the
1029 // clone is created.
1030 for (BasicBlock *BB : NewBlocks) {
1031 Module *M = BB->getModule();
1032 for (Instruction &I : *BB) {
1033 RemapInstruction(I: &I, VM&: VMap,
1034 Flags: RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
1035 RemapDbgRecordRange(M, Range: I.getDbgRecordRange(), VM&: VMap,
1036 Flags: RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
1037 }
1038 }
1039
1040 if (UseEpilogRemainder) {
1041 // Connect the epilog code to the original loop and update the
1042 // PHI functions.
1043 ConnectEpilog(L, ModVal, NewExit, Exit: LatchExit, PreHeader, EpilogPreHeader,
1044 NewPreHeader, VMap, DT, LI, PreserveLCSSA, SE&: *SE, Count, AC&: *AC,
1045 OriginalLoopProb);
1046
1047 // Update counter in loop for unrolling.
1048 // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count.
1049 // Subtle: TestVal can be 0 if we wrapped when computing the trip count,
1050 // thus we must compare the post-increment (wrapping) value.
1051 IRBuilder<> B2(NewPreHeader->getTerminator());
1052 Value *TestVal = B2.CreateSub(LHS: TripCount, RHS: ModVal, Name: "unroll_iter");
1053 BranchInst *LatchBR = cast<BranchInst>(Val: Latch->getTerminator());
1054 PHINode *NewIdx = PHINode::Create(Ty: TestVal->getType(), NumReservedValues: 2, NameStr: "niter");
1055 NewIdx->insertBefore(InsertPos: Header->getFirstNonPHIIt());
1056 B2.SetInsertPoint(LatchBR);
1057 auto *Zero = ConstantInt::get(Ty: NewIdx->getType(), V: 0);
1058 auto *One = ConstantInt::get(Ty: NewIdx->getType(), V: 1);
1059 Value *IdxNext = B2.CreateAdd(LHS: NewIdx, RHS: One, Name: NewIdx->getName() + ".next");
1060 auto Pred = LatchBR->getSuccessor(i: 0) == Header ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
1061 Value *IdxCmp = B2.CreateICmp(P: Pred, LHS: IdxNext, RHS: TestVal, Name: NewIdx->getName() + ".ncmp");
1062 NewIdx->addIncoming(V: Zero, BB: NewPreHeader);
1063 NewIdx->addIncoming(V: IdxNext, BB: Latch);
1064 LatchBR->setCondition(IdxCmp);
1065 } else {
1066 // Connect the prolog code to the original loop and update the
1067 // PHI functions.
1068 ConnectProlog(L, BECount, Count, PrologExit, OriginalLoopLatchExit: LatchExit, PreHeader,
1069 NewPreHeader, VMap, DT, LI, PreserveLCSSA, SE&: *SE);
1070 }
1071
1072 // If this loop is nested, then the loop unroller changes the code in the any
1073 // of its parent loops, so the Scalar Evolution pass needs to be run again.
1074 SE->forgetTopmostLoop(L);
1075
1076 // Verify that the Dom Tree and Loop Info are correct.
1077#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
1078 if (DT) {
1079 assert(DT->verify(DominatorTree::VerificationLevel::Full));
1080 LI->verify(*DT);
1081 }
1082#endif
1083
1084 // For unroll factor 2 remainder loop will have 1 iteration.
1085 if (Count == 2 && DT && LI && SE) {
1086 // TODO: This code could probably be pulled out into a helper function
1087 // (e.g. breakLoopBackedgeAndSimplify) and reused in loop-deletion.
1088 BasicBlock *RemainderLatch = remainderLoop->getLoopLatch();
1089 assert(RemainderLatch);
1090 SmallVector<BasicBlock *> RemainderBlocks(remainderLoop->getBlocks());
1091 breakLoopBackedge(L: remainderLoop, DT&: *DT, SE&: *SE, LI&: *LI, MSSA: nullptr);
1092 remainderLoop = nullptr;
1093
1094 // Simplify loop values after breaking the backedge
1095 const DataLayout &DL = L->getHeader()->getDataLayout();
1096 SmallVector<WeakTrackingVH, 16> DeadInsts;
1097 for (BasicBlock *BB : RemainderBlocks) {
1098 for (Instruction &Inst : llvm::make_early_inc_range(Range&: *BB)) {
1099 if (Value *V = simplifyInstruction(I: &Inst, Q: {DL, nullptr, DT, AC}))
1100 if (LI->replacementPreservesLCSSAForm(From: &Inst, To: V))
1101 Inst.replaceAllUsesWith(V);
1102 if (isInstructionTriviallyDead(I: &Inst))
1103 DeadInsts.emplace_back(Args: &Inst);
1104 }
1105 // We can't do recursive deletion until we're done iterating, as we might
1106 // have a phi which (potentially indirectly) uses instructions later in
1107 // the block we're iterating through.
1108 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
1109 }
1110
1111 // Merge latch into exit block.
1112 auto *ExitBB = RemainderLatch->getSingleSuccessor();
1113 assert(ExitBB && "required after breaking cond br backedge");
1114 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
1115 MergeBlockIntoPredecessor(BB: ExitBB, DTU: &DTU, LI);
1116 }
1117
1118 // Canonicalize to LoopSimplifyForm both original and remainder loops. We
1119 // cannot rely on the LoopUnrollPass to do this because it only does
1120 // canonicalization for parent/subloops and not the sibling loops.
1121 if (OtherExits.size() > 0) {
1122 // Generate dedicated exit blocks for the original loop, to preserve
1123 // LoopSimplifyForm.
1124 formDedicatedExitBlocks(L, DT, LI, MSSAU: nullptr, PreserveLCSSA);
1125 // Generate dedicated exit blocks for the remainder loop if one exists, to
1126 // preserve LoopSimplifyForm.
1127 if (remainderLoop)
1128 formDedicatedExitBlocks(L: remainderLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA);
1129 }
1130
1131 auto UnrollResult = LoopUnrollResult::Unmodified;
1132 if (remainderLoop && UnrollRemainder) {
1133 LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
1134 UnrollLoopOptions ULO;
1135 ULO.Count = Count - 1;
1136 ULO.Force = false;
1137 ULO.Runtime = false;
1138 ULO.AllowExpensiveTripCount = false;
1139 ULO.UnrollRemainder = false;
1140 ULO.ForgetAllSCEV = ForgetAllSCEV;
1141 assert(!getLoopConvergenceHeart(L) &&
1142 "A loop with a convergence heart does not allow runtime unrolling.");
1143 UnrollResult = UnrollLoop(L: remainderLoop, ULO, LI, SE, DT, AC, TTI,
1144 /*ORE*/ nullptr, PreserveLCSSA);
1145 }
1146
1147 if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
1148 *ResultLoop = remainderLoop;
1149 NumRuntimeUnrolled++;
1150 return true;
1151}
1152