1//===- LoopPeel.cpp -------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Loop Peeling Utilities.
10//===----------------------------------------------------------------------===//
11
12#include "llvm/Transforms/Utils/LoopPeel.h"
13#include "llvm/ADT/DenseMap.h"
14#include "llvm/ADT/MapVector.h"
15#include "llvm/ADT/SmallVector.h"
16#include "llvm/ADT/Statistic.h"
17#include "llvm/Analysis/Loads.h"
18#include "llvm/Analysis/LoopInfo.h"
19#include "llvm/Analysis/LoopIterator.h"
20#include "llvm/Analysis/ScalarEvolution.h"
21#include "llvm/Analysis/ScalarEvolutionExpressions.h"
22#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
23#include "llvm/Analysis/TargetTransformInfo.h"
24#include "llvm/IR/BasicBlock.h"
25#include "llvm/IR/Dominators.h"
26#include "llvm/IR/Function.h"
27#include "llvm/IR/InstrTypes.h"
28#include "llvm/IR/Instruction.h"
29#include "llvm/IR/Instructions.h"
30#include "llvm/IR/LLVMContext.h"
31#include "llvm/IR/MDBuilder.h"
32#include "llvm/IR/PatternMatch.h"
33#include "llvm/IR/ProfDataUtils.h"
34#include "llvm/Support/Casting.h"
35#include "llvm/Support/CheckedArithmetic.h"
36#include "llvm/Support/CommandLine.h"
37#include "llvm/Support/Debug.h"
38#include "llvm/Support/raw_ostream.h"
39#include "llvm/Transforms/Utils/BasicBlockUtils.h"
40#include "llvm/Transforms/Utils/Cloning.h"
41#include "llvm/Transforms/Utils/LoopSimplify.h"
42#include "llvm/Transforms/Utils/LoopUtils.h"
43#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
44#include "llvm/Transforms/Utils/ValueMapper.h"
45#include <algorithm>
46#include <cassert>
47#include <cstdint>
48#include <optional>
49
50using namespace llvm;
51using namespace llvm::PatternMatch;
52using namespace llvm::SCEVPatternMatch;
53
54#define DEBUG_TYPE "loop-peel"
55
56STATISTIC(NumPeeled, "Number of loops peeled");
57STATISTIC(NumPeeledEnd, "Number of loops peeled from end");
58
59namespace llvm {
60static cl::opt<unsigned> UnrollPeelCount(
61 "unroll-peel-count", cl::Hidden,
62 cl::desc("Set the unroll peeling count, for testing purposes"));
63
64static cl::opt<bool>
65 UnrollAllowPeeling("unroll-allow-peeling", cl::init(Val: true), cl::Hidden,
66 cl::desc("Allows loops to be peeled when the dynamic "
67 "trip count is known to be low."));
68
69static cl::opt<bool>
70 UnrollAllowLoopNestsPeeling("unroll-allow-loop-nests-peeling",
71 cl::init(Val: false), cl::Hidden,
72 cl::desc("Allows loop nests to be peeled."));
73
74static cl::opt<unsigned> UnrollPeelMaxCount(
75 "unroll-peel-max-count", cl::init(Val: 7), cl::Hidden,
76 cl::desc("Max average trip count which will cause loop peeling."));
77
78static cl::opt<unsigned> UnrollForcePeelCount(
79 "unroll-force-peel-count", cl::init(Val: 0), cl::Hidden,
80 cl::desc("Force a peel count regardless of profiling information."));
81
82static cl::opt<bool> DisableAdvancedPeeling(
83 "disable-advanced-peeling", cl::init(Val: false), cl::Hidden,
84 cl::desc(
85 "Disable advance peeling. Issues for convergent targets (D134803)."));
86
87static cl::opt<bool> EnablePeelingForIV(
88 "enable-peeling-for-iv", cl::init(Val: false), cl::Hidden,
89 cl::desc("Enable peeling to convert Phi nodes into IVs"));
90
91static const char *PeeledCountMetaData = "llvm.loop.peeled.count";
92
93extern cl::opt<bool> ProfcheckDisableMetadataFixes;
94} // namespace llvm
95
96// Check whether we are capable of peeling this loop.
97bool llvm::canPeel(const Loop *L) {
98 // Make sure the loop is in simplified form
99 if (!L->isLoopSimplifyForm())
100 return false;
101 if (!DisableAdvancedPeeling)
102 return true;
103
104 SmallVector<BasicBlock *, 4> Exits;
105 L->getUniqueNonLatchExitBlocks(ExitBlocks&: Exits);
106 // The latch must either be the only exiting block or all non-latch exit
107 // blocks have either a deopt or unreachable terminator or compose a chain of
108 // blocks where the last one is either deopt or unreachable terminated. Both
109 // deopt and unreachable terminators are a strong indication they are not
110 // taken. Note that this is a profitability check, not a legality check. Also
111 // note that LoopPeeling currently can only update the branch weights of latch
112 // blocks and branch weights to blocks with deopt or unreachable do not need
113 // updating.
114 return llvm::all_of(Range&: Exits, P: IsBlockFollowedByDeoptOrUnreachable);
115}
116
117namespace {
118
119// As a loop is peeled, it may be the case that Phi nodes become
120// loop-invariant (ie, known because there is only one choice).
121// For example, consider the following function:
122// void g(int);
123// void binary() {
124// int x = 0;
125// int y = 0;
126// int a = 0;
127// for(int i = 0; i <100000; ++i) {
128// g(x);
129// x = y;
130// g(a);
131// y = a + 1;
132// a = 5;
133// }
134// }
135// Peeling 3 iterations is beneficial because the values for x, y and a
136// become known. The IR for this loop looks something like the following:
137//
138// %i = phi i32 [ 0, %entry ], [ %inc, %if.end ]
139// %a = phi i32 [ 0, %entry ], [ 5, %if.end ]
140// %y = phi i32 [ 0, %entry ], [ %add, %if.end ]
141// %x = phi i32 [ 0, %entry ], [ %y, %if.end ]
142// ...
143// tail call void @_Z1gi(i32 signext %x)
144// tail call void @_Z1gi(i32 signext %a)
145// %add = add nuw nsw i32 %a, 1
146// %inc = add nuw nsw i32 %i, 1
147// %exitcond = icmp eq i32 %inc, 100000
148// br i1 %exitcond, label %for.cond.cleanup, label %for.body
149//
150// The arguments for the calls to g will become known after 3 iterations
151// of the loop, because the phi nodes values become known after 3 iterations
152// of the loop (ie, they are known on the 4th iteration, so peel 3 iterations).
153// The first iteration has g(0), g(0); the second has g(0), g(5); the
154// third has g(1), g(5) and the fourth (and all subsequent) have g(6), g(5).
155// Now consider the phi nodes:
156// %a is a phi with constants so it is determined after iteration 1.
157// %y is a phi based on a constant and %a so it is determined on
158// the iteration after %a is determined, so iteration 2.
159// %x is a phi based on a constant and %y so it is determined on
160// the iteration after %y, so iteration 3.
161// %i is based on itself (and is an induction variable) so it is
162// never determined.
163// This means that peeling off 3 iterations will result in being able to
164// remove the phi nodes for %a, %y, and %x. The arguments for the
165// corresponding calls to g are determined and the code for computing
166// x, y, and a can be removed.
167//
168// Similarly, there are cases where peeling makes Phi nodes loop-inductions
169// (i.e., the value is increased or decreased by a fixed amount on every
170// iteration). For example, consider the following function.
171//
172// #define N 100
173// void f(int a[], int b[]) {
174// int im = N - 1;
175// for (int i = 0; i < N; i++) {
176// a[i] = b[i] + b[im];
177// im = i;
178// }
179// }
180//
181// The IR of the loop will look something like the following.
182//
183// %i = phi i32 [ 0, %entry ], [ %i.next, %for.body ]
184// %im = phi i32 [ 99, %entry ], [ %i, %for.body ]
185// ...
186// %i.next = add nuw nsw i32 %i, 1
187// ...
188//
189// In this case, %im becomes a loop-induction variable by peeling 1 iteration,
190// because %i is a loop-induction one. The peeling count can be determined by
191// the same algorithm with loop-invariant case. Such peeling is profitable for
192// loop-vectorization.
193//
194// The PhiAnalyzer class calculates how many times a loop should be
195// peeled based on the above analysis of the phi nodes in the loop while
196// respecting the maximum specified.
197class PhiAnalyzer {
198public:
199 PhiAnalyzer(const Loop &L, unsigned MaxIterations, bool PeelForIV);
200
201 // Calculate the sufficient minimum number of iterations of the loop to peel
202 // such that phi instructions become determined (subject to allowable limits)
203 std::optional<unsigned> calculateIterationsToPeel();
204
205protected:
206 enum class PeelCounterType {
207 Invariant,
208 Induction,
209 };
210
211 using PeelCounterValue = std::pair<unsigned, PeelCounterType>;
212 using PeelCounter = std::optional<PeelCounterValue>;
213 const PeelCounter Unknown = std::nullopt;
214
215 // Add 1 respecting Unknown and return Unknown if result over MaxIterations
216 PeelCounter addOne(PeelCounter PC) const {
217 if (PC == Unknown)
218 return Unknown;
219 auto [Val, Ty] = *PC;
220 return (Val + 1 <= MaxIterations) ? PeelCounter({Val + 1, Ty}) : Unknown;
221 }
222
223 // Return a value representing zero for the given counter type.
224 PeelCounter makeZero(PeelCounterType Ty) const {
225 return PeelCounter({0, Ty});
226 }
227
228 // Calculate the number of iterations after which the given value becomes an
229 // invariant or an induction.
230 PeelCounter calculate(const Value &);
231
232 // Auxiliary function to calculate the number of iterations for a comparison
233 // instruction or a binary operator.
234 PeelCounter mergeTwoCounters(const Instruction &CmpOrBinaryOp,
235 const PeelCounterValue &LHS,
236 const PeelCounterValue &RHS) const;
237
238 // Returns true if the \p Phi is an induction in the target loop. This is a
239 // lightweight check and possible to detect an IV in some cases.
240 bool isInductionPHI(const PHINode *Phi) const;
241
242 const Loop &L;
243 const unsigned MaxIterations;
244 const bool PeelForIV;
245
246 // Map of Values to number of iterations to invariance or induction
247 SmallDenseMap<const Value *, PeelCounter> IterationsToInvarianceOrInduction;
248};
249
250PhiAnalyzer::PhiAnalyzer(const Loop &L, unsigned MaxIterations, bool PeelForIV)
251 : L(L), MaxIterations(MaxIterations), PeelForIV(PeelForIV) {
252 assert(canPeel(&L) && "loop is not suitable for peeling");
253 assert(MaxIterations > 0 && "no peeling is allowed?");
254}
255
256/// Test whether \p Phi is an induction variable. Although this can be
257/// determined using SCEV analysis, it is expensive to compute here. Instead,
258/// we perform cheaper checks that may not detect complex cases but are
259/// sufficient for some situations.
260bool PhiAnalyzer::isInductionPHI(const PHINode *Phi) const {
261 // Currently we only support a loop that has single latch.
262 BasicBlock *Latch = L.getLoopLatch();
263 if (Latch == nullptr)
264 return false;
265
266 Value *Cur = Phi->getIncomingValueForBlock(BB: Latch);
267 SmallPtrSet<Value *, 4> Visited;
268 bool VisitBinOp = false;
269
270 // Starting from the incoming value of the Phi, we follow the use-def chain.
271 // We consider Phi to be an IV if we can reach it again by traversing only
272 // add, sub, or cast instructions.
273 while (true) {
274 if (Cur == Phi)
275 break;
276
277 // Avoid infinite loop.
278 if (!Visited.insert(Ptr: Cur).second)
279 return false;
280
281 auto *I = dyn_cast<Instruction>(Val: Cur);
282 if (!I || !L.contains(Inst: I))
283 return false;
284
285 if (auto *Cast = dyn_cast<CastInst>(Val: I)) {
286 Cur = Cast->getOperand(i_nocapture: 0);
287 } else if (auto *BinOp = dyn_cast<BinaryOperator>(Val: I)) {
288 if (BinOp->getOpcode() != Instruction::Add &&
289 BinOp->getOpcode() != Instruction::Sub)
290 return false;
291 if (!isa<ConstantInt>(Val: BinOp->getOperand(i_nocapture: 1)))
292 return false;
293
294 VisitBinOp = true;
295 Cur = BinOp->getOperand(i_nocapture: 0);
296 } else {
297 return false;
298 }
299 }
300
301 // Ignore cases where no binary operations are visited.
302 return VisitBinOp;
303}
304
305/// When either \p LHS or \p RHS is an IV, the result of \p CmpOrBinaryOp is
306/// considered an IV only if it is an addition or a subtraction. Otherwise the
307/// result can be a value that is neither a loop-invariant nor an IV.
308///
309/// If both \p LHS and \p RHS are loop-invariants, then the result of
310/// \CmpOrBinaryOp is also a loop-invariant.
311PhiAnalyzer::PeelCounter
312PhiAnalyzer::mergeTwoCounters(const Instruction &CmpOrBinaryOp,
313 const PeelCounterValue &LHS,
314 const PeelCounterValue &RHS) const {
315 auto &[LVal, LTy] = LHS;
316 auto &[RVal, RTy] = RHS;
317 unsigned NewVal = std::max(a: LVal, b: RVal);
318
319 if (LTy == PeelCounterType::Induction || RTy == PeelCounterType::Induction) {
320 if (const auto *BinOp = dyn_cast<BinaryOperator>(Val: &CmpOrBinaryOp)) {
321 if (BinOp->getOpcode() == Instruction::Add ||
322 BinOp->getOpcode() == Instruction::Sub)
323 return PeelCounter({NewVal, PeelCounterType::Induction});
324 }
325 return Unknown;
326 }
327 return PeelCounter({NewVal, PeelCounterType::Invariant});
328}
329
330// This function calculates the number of iterations after which the value
331// becomes an invariant. The pre-calculated values are memorized in a map.
332// N.B. This number will be Unknown or <= MaxIterations.
333// The function is calculated according to the following definition:
334// Given %x = phi <Inputs from above the loop>, ..., [%y, %back.edge].
335// F(%x) = G(%y) + 1 (N.B. [MaxIterations | Unknown] + 1 => Unknown)
336// G(%y) = 0 if %y is a loop invariant
337// G(%y) = G(%BackEdgeValue) if %y is a phi in the header block
338// G(%y) = TODO: if %y is an expression based on phis and loop invariants
339// The example looks like:
340// %x = phi(0, %a) <-- becomes invariant starting from 3rd iteration.
341// %y = phi(0, 5)
342// %a = %y + 1
343// G(%y) = Unknown otherwise (including phi not in header block)
344PhiAnalyzer::PeelCounter PhiAnalyzer::calculate(const Value &V) {
345 // If we already know the answer, take it from the map.
346 // Otherwise, place Unknown to map to avoid infinite recursion. Such
347 // cycles can never stop on an invariant.
348 auto [I, Inserted] =
349 IterationsToInvarianceOrInduction.try_emplace(Key: &V, Args: Unknown);
350 if (!Inserted)
351 return I->second;
352
353 if (L.isLoopInvariant(V: &V))
354 // Loop invariant so known at start.
355 return (IterationsToInvarianceOrInduction[&V] =
356 makeZero(Ty: PeelCounterType::Invariant));
357 if (const PHINode *Phi = dyn_cast<PHINode>(Val: &V)) {
358 if (Phi->getParent() != L.getHeader()) {
359 // Phi is not in header block so Unknown.
360 assert(IterationsToInvarianceOrInduction[&V] == Unknown &&
361 "unexpected value saved");
362 return Unknown;
363 }
364
365 // If Phi is an induction, register it as a starting point.
366 if (PeelForIV && isInductionPHI(Phi))
367 return (IterationsToInvarianceOrInduction[&V] =
368 makeZero(Ty: PeelCounterType::Induction));
369
370 // We need to analyze the input from the back edge and add 1.
371 Value *Input = Phi->getIncomingValueForBlock(BB: L.getLoopLatch());
372 PeelCounter Iterations = calculate(V: *Input);
373 assert(IterationsToInvarianceOrInduction[Input] == Iterations &&
374 "unexpected value saved");
375 return (IterationsToInvarianceOrInduction[Phi] = addOne(PC: Iterations));
376 }
377 if (const Instruction *I = dyn_cast<Instruction>(Val: &V)) {
378 if (isa<CmpInst>(Val: I) || I->isBinaryOp()) {
379 // Binary instructions get the max of the operands.
380 PeelCounter LHS = calculate(V: *I->getOperand(i: 0));
381 if (LHS == Unknown)
382 return Unknown;
383 PeelCounter RHS = calculate(V: *I->getOperand(i: 1));
384 if (RHS == Unknown)
385 return Unknown;
386 return (IterationsToInvarianceOrInduction[I] =
387 mergeTwoCounters(CmpOrBinaryOp: *I, LHS: *LHS, RHS: *RHS));
388 }
389 if (I->isCast())
390 // Cast instructions get the value of the operand.
391 return (IterationsToInvarianceOrInduction[I] =
392 calculate(V: *I->getOperand(i: 0)));
393 }
394 // TODO: handle more expressions
395
396 // Everything else is Unknown.
397 assert(IterationsToInvarianceOrInduction[&V] == Unknown &&
398 "unexpected value saved");
399 return Unknown;
400}
401
402std::optional<unsigned> PhiAnalyzer::calculateIterationsToPeel() {
403 unsigned Iterations = 0;
404 for (auto &PHI : L.getHeader()->phis()) {
405 PeelCounter ToInvarianceOrInduction = calculate(V: PHI);
406 if (ToInvarianceOrInduction != Unknown) {
407 unsigned Val = ToInvarianceOrInduction->first;
408 assert(Val <= MaxIterations && "bad result in phi analysis");
409 Iterations = std::max(a: Iterations, b: Val);
410 if (Iterations == MaxIterations)
411 break;
412 }
413 }
414 assert((Iterations <= MaxIterations) && "bad result in phi analysis");
415 return Iterations ? std::optional<unsigned>(Iterations) : std::nullopt;
416}
417
418} // unnamed namespace
419
420// Try to find any invariant memory reads that will become dereferenceable in
421// the remainder loop after peeling. The load must also be used (transitively)
422// by an exit condition. Returns the number of iterations to peel off (at the
423// moment either 0 or 1).
424static unsigned peelToTurnInvariantLoadsDereferenceable(Loop &L,
425 DominatorTree &DT,
426 AssumptionCache *AC) {
427 // Skip loops with a single exiting block, because there should be no benefit
428 // for the heuristic below.
429 if (L.getExitingBlock())
430 return 0;
431
432 // All non-latch exit blocks must have an UnreachableInst terminator.
433 // Otherwise the heuristic below may not be profitable.
434 SmallVector<BasicBlock *, 4> Exits;
435 L.getUniqueNonLatchExitBlocks(ExitBlocks&: Exits);
436 if (any_of(Range&: Exits, P: [](const BasicBlock *BB) {
437 return !isa<UnreachableInst>(Val: BB->getTerminator());
438 }))
439 return 0;
440
441 // Now look for invariant loads that dominate the latch and are not known to
442 // be dereferenceable. If there are such loads and no writes, they will become
443 // dereferenceable in the loop if the first iteration is peeled off. Also
444 // collect the set of instructions controlled by such loads. Only peel if an
445 // exit condition uses (transitively) such a load.
446 BasicBlock *Header = L.getHeader();
447 BasicBlock *Latch = L.getLoopLatch();
448 SmallPtrSet<Value *, 8> LoadUsers;
449 const DataLayout &DL = L.getHeader()->getDataLayout();
450 for (BasicBlock *BB : L.blocks()) {
451 for (Instruction &I : *BB) {
452 // Calls that only access inaccessible memory can never alias with loads.
453 if (I.mayWriteToMemory() &&
454 !(isa<CallBase>(Val: I) &&
455 cast<CallBase>(Val&: I).onlyAccessesInaccessibleMemory()))
456 return 0;
457
458 if (LoadUsers.contains(Ptr: &I))
459 LoadUsers.insert_range(R: I.users());
460 // Do not look for reads in the header; they can already be hoisted
461 // without peeling.
462 if (BB == Header)
463 continue;
464 if (auto *LI = dyn_cast<LoadInst>(Val: &I)) {
465 Value *Ptr = LI->getPointerOperand();
466 if (DT.dominates(A: BB, B: Latch) && L.isLoopInvariant(V: Ptr) &&
467 !isDereferenceablePointer(V: Ptr, Ty: LI->getType(),
468 Q: SimplifyQuery(DL, &DT, AC, LI)))
469 LoadUsers.insert_range(R: I.users());
470 }
471 }
472 }
473 SmallVector<BasicBlock *> ExitingBlocks;
474 L.getExitingBlocks(ExitingBlocks);
475 if (any_of(Range&: ExitingBlocks, P: [&LoadUsers](BasicBlock *Exiting) {
476 return LoadUsers.contains(Ptr: Exiting->getTerminator());
477 }))
478 return 1;
479 return 0;
480}
481
482bool llvm::canPeelLastIteration(const Loop &L, ScalarEvolution &SE) {
483 const SCEV *BTC = SE.getBackedgeTakenCount(L: &L);
484 if (isa<SCEVCouldNotCompute>(Val: BTC))
485 return false;
486
487 // Check if the exit condition of the loop can be adjusted by the peeling
488 // codegen. For now, it must
489 // * exit via the latch,
490 // * the exit condition must be a NE/EQ compare of an induction with step
491 // of 1 and must only be used by the exiting branch.
492 BasicBlock *Latch = L.getLoopLatch();
493 Value *Inc;
494 Value *Bound;
495 CmpPredicate Pred;
496 BasicBlock *Succ1;
497 BasicBlock *Succ2;
498 return Latch && Latch == L.getExitingBlock() &&
499 match(V: Latch->getTerminator(),
500 P: m_Br(C: m_OneUse(SubPattern: m_ICmp(Pred, L: m_Value(V&: Inc), R: m_Value(V&: Bound))),
501 T: m_BasicBlock(V&: Succ1), F: m_BasicBlock(V&: Succ2))) &&
502 ((Pred == CmpInst::ICMP_EQ && Succ2 == L.getHeader()) ||
503 (Pred == CmpInst::ICMP_NE && Succ1 == L.getHeader())) &&
504 Bound->getType()->isIntegerTy() &&
505 SE.isLoopInvariant(S: SE.getSCEV(V: Bound), L: &L) &&
506 match(S: SE.getSCEV(V: Inc),
507 P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_scev_One(), L: m_SpecificLoop(L: &L)));
508}
509
510/// Returns true if the last iteration can be peeled off and the condition (Pred
511/// LeftAR, RightSCEV) is known at the last iteration and the inverse condition
512/// is known at the second-to-last.
513static bool shouldPeelLastIteration(Loop &L, CmpPredicate Pred,
514 const SCEVAddRecExpr *LeftAR,
515 const SCEV *RightSCEV, ScalarEvolution &SE,
516 const TargetTransformInfo &TTI) {
517 if (!canPeelLastIteration(L, SE))
518 return false;
519
520 const SCEV *BTC = SE.getBackedgeTakenCount(L: &L);
521 SCEVExpander Expander(SE, "loop-peel");
522 if (!SE.isKnownNonZero(S: BTC) &&
523 Expander.isHighCostExpansion(Exprs: BTC, L: &L, Budget: SCEVCheapExpansionBudget, TTI: &TTI,
524 At: L.getLoopPredecessor()->getTerminator()))
525 return false;
526
527 auto Guards = ScalarEvolution::LoopGuards::collect(L: &L, SE);
528 BTC = SE.applyLoopGuards(Expr: BTC, Guards);
529 RightSCEV = SE.applyLoopGuards(Expr: RightSCEV, Guards);
530 const SCEV *ValAtLastIter = LeftAR->evaluateAtIteration(It: BTC, SE);
531 const SCEV *ValAtSecondToLastIter = LeftAR->evaluateAtIteration(
532 It: SE.getMinusSCEV(LHS: BTC, RHS: SE.getOne(Ty: BTC->getType())), SE);
533
534 return SE.isKnownPredicate(Pred: ICmpInst::getInversePredicate(pred: Pred), LHS: ValAtLastIter,
535 RHS: RightSCEV) &&
536 SE.isKnownPredicate(Pred, LHS: ValAtSecondToLastIter, RHS: RightSCEV);
537}
538
539// Return the number of iterations to peel off from the beginning and end of the
540// loop respectively, that make conditions in the body true/false. For example,
541// if we peel 2 iterations off the loop below, the condition i < 2 can be
542// evaluated at compile time.
543//
544// for (i = 0; i < n; i++)
545// if (i < 2)
546// ..
547// else
548// ..
549// }
550static std::pair<unsigned, unsigned>
551countToEliminateCompares(Loop &L, unsigned MaxPeelCount, ScalarEvolution &SE,
552 const TargetTransformInfo &TTI) {
553 assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form");
554 unsigned DesiredPeelCount = 0;
555 unsigned DesiredPeelCountLast = 0;
556
557 // Do not peel the entire loop.
558 const SCEV *BE = SE.getConstantMaxBackedgeTakenCount(L: &L);
559 if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Val: BE))
560 MaxPeelCount =
561 std::min(a: (unsigned)SC->getAPInt().getLimitedValue() - 1, b: MaxPeelCount);
562
563 // Increase PeelCount while (IterVal Pred BoundSCEV) condition is satisfied;
564 // return true if inversed condition become known before reaching the
565 // MaxPeelCount limit.
566 auto PeelWhilePredicateIsKnown =
567 [&](unsigned &PeelCount, const SCEV *&IterVal, const SCEV *BoundSCEV,
568 const SCEV *Step, ICmpInst::Predicate Pred) {
569 while (PeelCount < MaxPeelCount &&
570 SE.isKnownPredicate(Pred, LHS: IterVal, RHS: BoundSCEV)) {
571 IterVal = SE.getAddExpr(LHS: IterVal, RHS: Step);
572 ++PeelCount;
573 }
574 return SE.isKnownPredicate(Pred: ICmpInst::getInversePredicate(pred: Pred), LHS: IterVal,
575 RHS: BoundSCEV);
576 };
577
578 const unsigned MaxDepth = 4;
579 std::function<void(Value *, unsigned)> ComputePeelCount =
580 [&](Value *Condition, unsigned Depth) -> void {
581 if (!Condition->getType()->isIntegerTy() || Depth >= MaxDepth)
582 return;
583
584 Value *LeftVal, *RightVal;
585 if (match(V: Condition, P: m_And(L: m_Value(V&: LeftVal), R: m_Value(V&: RightVal))) ||
586 match(V: Condition, P: m_Or(L: m_Value(V&: LeftVal), R: m_Value(V&: RightVal)))) {
587 ComputePeelCount(LeftVal, Depth + 1);
588 ComputePeelCount(RightVal, Depth + 1);
589 return;
590 }
591
592 CmpPredicate Pred;
593 if (!match(V: Condition, P: m_ICmp(Pred, L: m_Value(V&: LeftVal), R: m_Value(V&: RightVal))))
594 return;
595
596 const SCEV *LeftSCEV = SE.getSCEV(V: LeftVal);
597 const SCEV *RightSCEV = SE.getSCEV(V: RightVal);
598
599 // Do not consider predicates that are known to be true or false
600 // independently of the loop iteration.
601 if (SE.evaluatePredicate(Pred, LHS: LeftSCEV, RHS: RightSCEV))
602 return;
603
604 // Check if we have a condition with one AddRec and one non AddRec
605 // expression. Normalize LeftSCEV to be the AddRec.
606 if (!isa<SCEVAddRecExpr>(Val: LeftSCEV)) {
607 if (isa<SCEVAddRecExpr>(Val: RightSCEV)) {
608 std::swap(a&: LeftSCEV, b&: RightSCEV);
609 Pred = ICmpInst::getSwappedPredicate(pred: Pred);
610 } else
611 return;
612 }
613
614 const SCEVAddRecExpr *LeftAR = cast<SCEVAddRecExpr>(Val: LeftSCEV);
615
616 // Avoid huge SCEV computations in the loop below, make sure we only
617 // consider AddRecs of the loop we are trying to peel.
618 if (!LeftAR->isAffine() || LeftAR->getLoop() != &L)
619 return;
620 if (!(ICmpInst::isEquality(P: Pred) && LeftAR->hasNoSelfWrap()) &&
621 !SE.getMonotonicPredicateType(LHS: LeftAR, Pred))
622 return;
623
624 // Check if extending the current DesiredPeelCount lets us evaluate Pred
625 // or !Pred in the loop body statically.
626 unsigned NewPeelCount = DesiredPeelCount;
627
628 const SCEV *IterVal = LeftAR->evaluateAtIteration(
629 It: SE.getConstant(Ty: LeftSCEV->getType(), V: NewPeelCount), SE);
630
631 // If the original condition is not known, get the negated predicate
632 // (which holds on the else branch) and check if it is known. This allows
633 // us to peel of iterations that make the original condition false.
634 if (!SE.isKnownPredicate(Pred, LHS: IterVal, RHS: RightSCEV))
635 Pred = ICmpInst::getInversePredicate(pred: Pred);
636
637 const SCEV *Step = LeftAR->getStepRecurrence(SE);
638 if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, RightSCEV, Step,
639 Pred)) {
640 if (shouldPeelLastIteration(L, Pred, LeftAR, RightSCEV, SE, TTI))
641 DesiredPeelCountLast = 1;
642 return;
643 }
644
645 // However, for equality comparisons, that isn't always sufficient to
646 // eliminate the comparsion in loop body, we may need to peel one more
647 // iteration. See if that makes !Pred become unknown again.
648 const SCEV *NextIterVal = SE.getAddExpr(LHS: IterVal, RHS: Step);
649 if (ICmpInst::isEquality(P: Pred) &&
650 !SE.isKnownPredicate(Pred: ICmpInst::getInversePredicate(pred: Pred), LHS: NextIterVal,
651 RHS: RightSCEV) &&
652 !SE.isKnownPredicate(Pred, LHS: IterVal, RHS: RightSCEV) &&
653 SE.isKnownPredicate(Pred, LHS: NextIterVal, RHS: RightSCEV)) {
654 if (NewPeelCount >= MaxPeelCount)
655 return; // Need to peel one more iteration, but can't. Give up.
656 ++NewPeelCount; // Great!
657 }
658
659 DesiredPeelCount = std::max(a: DesiredPeelCount, b: NewPeelCount);
660 DesiredPeelCountLast = std::max(a: DesiredPeelCountLast, b: NewPeelCount);
661 };
662
663 auto ComputePeelCountMinMax = [&](MinMaxIntrinsic *MinMax) {
664 if (!MinMax->getType()->isIntegerTy())
665 return;
666 Value *LHS = MinMax->getLHS(), *RHS = MinMax->getRHS();
667 const SCEV *BoundSCEV, *IterSCEV;
668 if (L.isLoopInvariant(V: LHS)) {
669 BoundSCEV = SE.getSCEV(V: LHS);
670 IterSCEV = SE.getSCEV(V: RHS);
671 } else if (L.isLoopInvariant(V: RHS)) {
672 BoundSCEV = SE.getSCEV(V: RHS);
673 IterSCEV = SE.getSCEV(V: LHS);
674 } else
675 return;
676 const auto *AddRec = dyn_cast<SCEVAddRecExpr>(Val: IterSCEV);
677 // For simplicity, we support only affine recurrences.
678 if (!AddRec || !AddRec->isAffine() || AddRec->getLoop() != &L)
679 return;
680 const SCEV *Step = AddRec->getStepRecurrence(SE);
681 bool IsSigned = MinMax->isSigned();
682 // To minimize number of peeled iterations, we use strict relational
683 // predicates here.
684 ICmpInst::Predicate Pred;
685 if (SE.isKnownPositive(S: Step))
686 Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
687 else if (SE.isKnownNegative(S: Step))
688 Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
689 else
690 return;
691 // Check that AddRec is not wrapping.
692 if (!(IsSigned ? AddRec->hasNoSignedWrap() : AddRec->hasNoUnsignedWrap()))
693 return;
694 unsigned NewPeelCount = DesiredPeelCount;
695 const SCEV *IterVal = AddRec->evaluateAtIteration(
696 It: SE.getConstant(Ty: AddRec->getType(), V: NewPeelCount), SE);
697 if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, BoundSCEV, Step,
698 Pred)) {
699 if (shouldPeelLastIteration(L, Pred, LeftAR: AddRec, RightSCEV: BoundSCEV, SE, TTI))
700 DesiredPeelCountLast = 1;
701 return;
702 }
703 DesiredPeelCount = NewPeelCount;
704 };
705
706 for (BasicBlock *BB : L.blocks()) {
707 for (Instruction &I : *BB) {
708 if (SelectInst *SI = dyn_cast<SelectInst>(Val: &I))
709 ComputePeelCount(SI->getCondition(), 0);
710 if (MinMaxIntrinsic *MinMax = dyn_cast<MinMaxIntrinsic>(Val: &I))
711 ComputePeelCountMinMax(MinMax);
712 }
713
714 auto *BI = dyn_cast<CondBrInst>(Val: BB->getTerminator());
715 if (!BI)
716 continue;
717
718 // Ignore loop exit condition.
719 if (L.getLoopLatch() == BB)
720 continue;
721
722 ComputePeelCount(BI->getCondition(), 0);
723 }
724
725 return {DesiredPeelCount, DesiredPeelCountLast};
726}
727
728/// This "heuristic" exactly matches implicit behavior which used to exist
729/// inside getLoopEstimatedTripCount. It was added here to keep an
730/// improvement inside that API from causing peeling to become more aggressive.
731/// This should probably be removed.
732static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
733 BasicBlock *Latch = L->getLoopLatch();
734 if (!Latch)
735 return true;
736
737 CondBrInst *LatchBR = dyn_cast<CondBrInst>(Val: Latch->getTerminator());
738 if (!LatchBR || !L->isLoopExiting(BB: Latch))
739 return true;
740
741 assert((LatchBR->getSuccessor(0) == L->getHeader() ||
742 LatchBR->getSuccessor(1) == L->getHeader()) &&
743 "At least one edge out of the latch must go to the header");
744
745 SmallVector<BasicBlock *, 4> ExitBlocks;
746 L->getUniqueNonLatchExitBlocks(ExitBlocks);
747 return any_of(Range&: ExitBlocks, P: [](const BasicBlock *EB) {
748 return !EB->getTerminatingDeoptimizeCall();
749 });
750}
751
752
753// Return the number of iterations we want to peel off.
754void llvm::computePeelCount(Loop *L, unsigned LoopSize,
755 TargetTransformInfo::PeelingPreferences &PP,
756 unsigned TripCount, DominatorTree &DT,
757 ScalarEvolution &SE, const TargetTransformInfo &TTI,
758 AssumptionCache *AC, unsigned Threshold) {
759 assert(LoopSize > 0 && "Zero loop size is not allowed!");
760 // Save the PP.PeelCount value set by the target in
761 // TTI.getPeelingPreferences or by the flag -unroll-peel-count.
762 unsigned TargetPeelCount = PP.PeelCount;
763 PP.PeelCount = 0;
764 PP.PeelLast = false;
765 if (!canPeel(L))
766 return;
767
768 // Only try to peel innermost loops by default.
769 // The constraint can be relaxed by the target in TTI.getPeelingPreferences
770 // or by the flag -unroll-allow-loop-nests-peeling.
771 if (!PP.AllowLoopNestsPeeling && !L->isInnermost())
772 return;
773
774 // If the user provided a peel count, use that.
775 bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
776 if (UserPeelCount) {
777 LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
778 << " iterations.\n");
779 PP.PeelCount = UnrollForcePeelCount;
780 PP.PeelProfiledIterations = true;
781 return;
782 }
783
784 // Skip peeling if it's disabled.
785 if (!PP.AllowPeeling)
786 return;
787
788 // Check that we can peel at least one iteration.
789 if (2 * LoopSize > Threshold)
790 return;
791
792 unsigned AlreadyPeeled = 0;
793 if (auto Peeled = getOptionalIntLoopAttribute(TheLoop: L, Name: PeeledCountMetaData))
794 AlreadyPeeled = *Peeled;
795 // Stop if we already peeled off the maximum number of iterations.
796 if (AlreadyPeeled >= UnrollPeelMaxCount)
797 return;
798
799 // Pay respect to limitations implied by loop size and the max peel count.
800 unsigned MaxPeelCount = UnrollPeelMaxCount;
801 MaxPeelCount = std::min(a: MaxPeelCount, b: Threshold / LoopSize - 1);
802
803 // Start the max computation with the PP.PeelCount value set by the target
804 // in TTI.getPeelingPreferences or by the flag -unroll-peel-count.
805 unsigned DesiredPeelCount = TargetPeelCount;
806
807 // Here we try to get rid of Phis which become invariants or inductions after
808 // 1, 2, ..., N iterations of the loop. For this we compute the number for
809 // iterations after which every Phi is guaranteed to become an invariant or an
810 // induction, and try to peel the maximum number of iterations among these
811 // values, thus turning all those Phis into invariants or inductions.
812 if (MaxPeelCount > DesiredPeelCount) {
813 // Check how many iterations are useful for resolving Phis
814 auto NumPeels = PhiAnalyzer(*L, MaxPeelCount, EnablePeelingForIV)
815 .calculateIterationsToPeel();
816 if (NumPeels)
817 DesiredPeelCount = std::max(a: DesiredPeelCount, b: *NumPeels);
818 }
819
820 const auto &[CountToEliminateCmps, CountToEliminateCmpsLast] =
821 countToEliminateCompares(L&: *L, MaxPeelCount, SE, TTI);
822 DesiredPeelCount = std::max(a: DesiredPeelCount, b: CountToEliminateCmps);
823
824 if (DesiredPeelCount == 0)
825 DesiredPeelCount = peelToTurnInvariantLoadsDereferenceable(L&: *L, DT, AC);
826
827 if (DesiredPeelCount > 0) {
828 DesiredPeelCount = std::min(a: DesiredPeelCount, b: MaxPeelCount);
829 // Consider max peel count limitation.
830 assert(DesiredPeelCount > 0 && "Wrong loop size estimation?");
831 if (DesiredPeelCount + AlreadyPeeled <= UnrollPeelMaxCount) {
832 LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
833 << " iteration(s) to turn"
834 << " some Phis into invariants or inductions.\n");
835 PP.PeelCount = DesiredPeelCount;
836 PP.PeelProfiledIterations = false;
837 PP.PeelLast = false;
838 return;
839 }
840 }
841
842 if (CountToEliminateCmpsLast > 0) {
843 unsigned DesiredPeelCountLast =
844 std::min(a: CountToEliminateCmpsLast, b: MaxPeelCount);
845 // Consider max peel count limitation.
846 assert(DesiredPeelCountLast > 0 && "Wrong loop size estimation?");
847 if (DesiredPeelCountLast + AlreadyPeeled <= UnrollPeelMaxCount) {
848 LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
849 << " iteration(s) to turn"
850 << " some Phis into invariants.\n");
851 PP.PeelCount = DesiredPeelCountLast;
852 PP.PeelProfiledIterations = false;
853 PP.PeelLast = true;
854 return;
855 }
856 }
857
858 // Bail if we know the statically calculated trip count.
859 // In this case we rather prefer partial unrolling.
860 if (TripCount)
861 return;
862
863 // Do not apply profile base peeling if it is disabled.
864 if (!PP.PeelProfiledIterations)
865 return;
866 // If we don't know the trip count, but have reason to believe the average
867 // trip count is low, peeling should be beneficial, since we will usually
868 // hit the peeled section.
869 // We only do this in the presence of profile information, since otherwise
870 // our estimates of the trip count are not reliable enough.
871 if (L->getHeader()->getParent()->hasProfileData()) {
872 if (violatesLegacyMultiExitLoopCheck(L))
873 return;
874 std::optional<unsigned> EstimatedTripCount = getLoopEstimatedTripCount(L);
875 if (!EstimatedTripCount)
876 return;
877
878 LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is "
879 << *EstimatedTripCount << "\n");
880
881 std::optional<unsigned> TotalPeeled =
882 llvm::checkedAddUnsigned(LHS: *EstimatedTripCount, RHS: AlreadyPeeled);
883 if (TotalPeeled && *TotalPeeled <= MaxPeelCount) {
884 unsigned PeelCount = *EstimatedTripCount;
885 LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n");
886 PP.PeelCount = PeelCount;
887 return;
888 }
889 LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
890 LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
891 LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n");
892 LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
893 LLVM_DEBUG(dbgs() << "Max peel count by cost: "
894 << (Threshold / LoopSize - 1) << "\n");
895 }
896}
897
898/// Clones the body of the loop L, putting it between \p InsertTop and \p
899/// InsertBot.
900/// \param IterNumber The serial number of the iteration currently being
901/// peeled off.
902/// \param PeelLast Peel off the last iterations from \p L.
903/// \param ExitEdges The exit edges of the original loop.
904/// \param[out] NewBlocks A list of the blocks in the newly created clone
905/// \param[out] VMap The value map between the loop and the new clone.
906/// \param LoopBlocks A helper for DFS-traversal of the loop.
907/// \param LVMap A value-map that maps instructions from the original loop to
908/// instructions in the last peeled-off iteration.
909static void cloneLoopBlocks(
910 Loop *L, unsigned IterNumber, bool PeelLast, BasicBlock *InsertTop,
911 BasicBlock *InsertBot, BasicBlock *OrigPreHeader,
912 SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges,
913 SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
914 ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT,
915 LoopInfo *LI, ArrayRef<MDNode *> LoopLocalNoAliasDeclScopes,
916 ScalarEvolution &SE) {
917 BasicBlock *Header = L->getHeader();
918 BasicBlock *Latch = L->getLoopLatch();
919 BasicBlock *PreHeader = L->getLoopPreheader();
920
921 Function *F = Header->getParent();
922 LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
923 LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
924 Loop *ParentLoop = L->getParentLoop();
925
926 // For each block in the original loop, create a new copy,
927 // and update the value map with the newly created values.
928 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
929 BasicBlock *NewBB = CloneBasicBlock(BB: *BB, VMap, NameSuffix: ".peel", F);
930 NewBlocks.push_back(Elt: NewBB);
931
932 // If an original block is an immediate child of the loop L, its copy
933 // is a child of a ParentLoop after peeling. If a block is a child of
934 // a nested loop, it is handled in the cloneLoop() call below.
935 if (ParentLoop && LI->getLoopFor(BB: *BB) == L)
936 ParentLoop->addBasicBlockToLoop(NewBB, LI&: *LI);
937
938 VMap[*BB] = NewBB;
939
940 // If dominator tree is available, insert nodes to represent cloned blocks.
941 if (DT) {
942 if (Header == *BB)
943 DT->addNewBlock(BB: NewBB, DomBB: InsertTop);
944 else {
945 DomTreeNode *IDom = DT->getNode(BB: *BB)->getIDom();
946 // VMap must contain entry for IDom, as the iteration order is RPO.
947 DT->addNewBlock(BB: NewBB, DomBB: cast<BasicBlock>(Val&: VMap[IDom->getBlock()]));
948 }
949 }
950 }
951
952 {
953 // Identify what other metadata depends on the cloned version. After
954 // cloning, replace the metadata with the corrected version for both
955 // memory instructions and noalias intrinsics.
956 std::string Ext = (Twine("Peel") + Twine(IterNumber)).str();
957 cloneAndAdaptNoAliasScopes(NoAliasDeclScopes: LoopLocalNoAliasDeclScopes, NewBlocks,
958 Context&: Header->getContext(), Ext);
959 }
960
961 // Recursively create the new Loop objects for nested loops, if any,
962 // to preserve LoopInfo.
963 for (Loop *ChildLoop : *L) {
964 cloneLoop(L: ChildLoop, PL: ParentLoop, VM&: VMap, LI, LPM: nullptr);
965 }
966
967 // Hook-up the control flow for the newly inserted blocks.
968 // The new header is hooked up directly to the "top", which is either
969 // the original loop preheader (for the first iteration) or the previous
970 // iteration's exiting block (for every other iteration)
971 InsertTop->getTerminator()->setSuccessor(Idx: 0, BB: cast<BasicBlock>(Val&: VMap[Header]));
972
973 // Similarly, for the latch:
974 // The original exiting edge is still hooked up to the loop exit.
975 BasicBlock *NewLatch = cast<BasicBlock>(Val&: VMap[Latch]);
976 if (PeelLast) {
977 // This is the last iteration and we definitely will go to the exit. Just
978 // set both successors to InsertBot and let the branch be simplified later.
979 assert(IterNumber == 0 && "Only peeling a single iteration implemented.");
980 auto *LatchTerm = cast<CondBrInst>(Val: NewLatch->getTerminator());
981 LatchTerm->setSuccessor(idx: 0, NewSucc: InsertBot);
982 LatchTerm->setSuccessor(idx: 1, NewSucc: InsertBot);
983 } else {
984 auto *LatchTerm = cast<Instruction>(Val: NewLatch->getTerminator());
985 // The backedge now goes to the "bottom", which is either the loop's real
986 // header (for the last peeled iteration) or the copied header of the next
987 // iteration (for every other iteration)
988 for (unsigned idx = 0, e = LatchTerm->getNumSuccessors(); idx < e; ++idx) {
989 if (LatchTerm->getSuccessor(Idx: idx) == Header) {
990 LatchTerm->setSuccessor(Idx: idx, BB: InsertBot);
991 break;
992 }
993 }
994 }
995 if (DT)
996 DT->changeImmediateDominator(BB: InsertBot, NewBB: NewLatch);
997
998 // The new copy of the loop body starts with a bunch of PHI nodes
999 // that pick an incoming value from either the preheader, or the previous
1000 // loop iteration. Since this copy is no longer part of the loop, we
1001 // resolve this statically:
1002 if (PeelLast) {
1003 // For the last iteration, we introduce new phis for each header phi in
1004 // InsertTop, using the incoming value from the preheader for the original
1005 // preheader (when skipping the main loop) and the incoming value from the
1006 // latch for the latch (when continuing from the main loop).
1007 IRBuilder<> B(InsertTop, InsertTop->getFirstNonPHIIt());
1008 for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(Val: I); ++I) {
1009 PHINode *NewPHI = cast<PHINode>(Val&: VMap[&*I]);
1010 PHINode *PN = B.CreatePHI(Ty: NewPHI->getType(), NumReservedValues: 2);
1011 NewPHI->eraseFromParent();
1012 if (OrigPreHeader)
1013 PN->addIncoming(V: cast<PHINode>(Val: &*I)->getIncomingValueForBlock(BB: PreHeader),
1014 BB: OrigPreHeader);
1015
1016 PN->addIncoming(V: cast<PHINode>(Val: &*I)->getIncomingValueForBlock(BB: Latch),
1017 BB: Latch);
1018 VMap[&*I] = PN;
1019 }
1020 } else {
1021 // For the first iteration, we use the value from the preheader directly.
1022 // For any other iteration, we replace the phi with the value generated by
1023 // the immediately preceding clone of the loop body (which represents
1024 // the previous iteration).
1025 for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(Val: I); ++I) {
1026 PHINode *NewPHI = cast<PHINode>(Val&: VMap[&*I]);
1027 if (IterNumber == 0) {
1028 VMap[&*I] = NewPHI->getIncomingValueForBlock(BB: PreHeader);
1029 } else {
1030 Value *LatchVal = NewPHI->getIncomingValueForBlock(BB: Latch);
1031 Instruction *LatchInst = dyn_cast<Instruction>(Val: LatchVal);
1032 if (LatchInst && L->contains(Inst: LatchInst))
1033 VMap[&*I] = LVMap[LatchInst];
1034 else
1035 VMap[&*I] = LatchVal;
1036 }
1037 NewPHI->eraseFromParent();
1038 }
1039 }
1040
1041 // Fix up the outgoing values - we need to add a value for the iteration
1042 // we've just created. Note that this must happen *after* the incoming
1043 // values are adjusted, since the value going out of the latch may also be
1044 // a value coming into the header.
1045 for (auto Edge : ExitEdges)
1046 for (PHINode &PHI : Edge.second->phis()) {
1047 Value *LatchVal = PHI.getIncomingValueForBlock(BB: Edge.first);
1048 Instruction *LatchInst = dyn_cast<Instruction>(Val: LatchVal);
1049 if (LatchInst && L->contains(Inst: LatchInst))
1050 LatchVal = VMap[LatchVal];
1051 PHI.addIncoming(V: LatchVal, BB: cast<BasicBlock>(Val&: VMap[Edge.first]));
1052 SE.forgetLcssaPhiWithNewPredecessor(L, V: &PHI);
1053 }
1054
1055 // LastValueMap is updated with the values for the current loop
1056 // which are used the next time this function is called.
1057 for (auto KV : VMap)
1058 LVMap[KV.first] = KV.second;
1059}
1060
1061TargetTransformInfo::PeelingPreferences
1062llvm::gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
1063 const TargetTransformInfo &TTI,
1064 std::optional<bool> UserAllowPeeling,
1065 std::optional<bool> UserAllowProfileBasedPeeling,
1066 bool UnrollingSpecficValues) {
1067 TargetTransformInfo::PeelingPreferences PP;
1068
1069 // Set the default values.
1070 PP.PeelCount = 0;
1071 PP.AllowPeeling = true;
1072 PP.AllowLoopNestsPeeling = false;
1073 PP.PeelLast = false;
1074 PP.PeelProfiledIterations = true;
1075
1076 // Get the target specifc values.
1077 TTI.getPeelingPreferences(L, SE, PP);
1078
1079 // User specified values using cl::opt.
1080 if (UnrollingSpecficValues) {
1081 if (UnrollPeelCount.getNumOccurrences() > 0)
1082 PP.PeelCount = UnrollPeelCount;
1083 if (UnrollAllowPeeling.getNumOccurrences() > 0)
1084 PP.AllowPeeling = UnrollAllowPeeling;
1085 if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0)
1086 PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling;
1087 }
1088
1089 // User specifed values provided by argument.
1090 if (UserAllowPeeling)
1091 PP.AllowPeeling = *UserAllowPeeling;
1092 if (UserAllowProfileBasedPeeling)
1093 PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;
1094
1095 return PP;
1096}
1097
1098/// Peel off the first \p PeelCount iterations of loop \p L.
1099///
1100/// Note that this does not peel them off as a single straight-line block.
1101/// Rather, each iteration is peeled off separately, and needs to check the
1102/// exit condition.
1103/// For loops that dynamically execute \p PeelCount iterations or less
1104/// this provides a benefit, since the peeled off iterations, which account
1105/// for the bulk of dynamic execution, can be further simplified by scalar
1106/// optimizations.
1107void llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
1108 ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC,
1109 bool PreserveLCSSA, ValueToValueMapTy &LVMap) {
1110 assert(PeelCount > 0 && "Attempt to peel out zero iterations?");
1111 assert(canPeel(L) && "Attempt to peel a loop which is not peelable?");
1112 assert((!PeelLast || (canPeelLastIteration(*L, *SE) && PeelCount == 1)) &&
1113 "when peeling the last iteration, the loop must be supported and can "
1114 "only peel a single iteration");
1115
1116 LoopBlocksDFS LoopBlocks(L);
1117 LoopBlocks.perform(LI);
1118
1119 BasicBlock *Header = L->getHeader();
1120 BasicBlock *PreHeader = L->getLoopPreheader();
1121 BasicBlock *Latch = L->getLoopLatch();
1122 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitEdges;
1123 L->getExitEdges(ExitEdges);
1124
1125 // Remember dominators of blocks we might reach through exits to change them
1126 // later. Immediate dominator of such block might change, because we add more
1127 // routes which can lead to the exit: we can reach it from the peeled
1128 // iterations too.
1129 MapVector<BasicBlock *, BasicBlock *> NonLoopBlocksIDom;
1130 for (auto *BB : L->blocks()) {
1131 auto *BBDomNode = DT.getNode(BB);
1132 SmallVector<BasicBlock *, 16> ChildrenToUpdate;
1133 for (auto *ChildDomNode : BBDomNode->children()) {
1134 auto *ChildBB = ChildDomNode->getBlock();
1135 if (!L->contains(BB: ChildBB))
1136 ChildrenToUpdate.push_back(Elt: ChildBB);
1137 }
1138 // The new idom of the block will be the nearest common dominator
1139 // of all copies of the previous idom. This is equivalent to the
1140 // nearest common dominator of the previous idom and the first latch,
1141 // which dominates all copies of the previous idom.
1142 BasicBlock *NewIDom = DT.findNearestCommonDominator(A: BB, B: Latch);
1143 for (auto *ChildBB : ChildrenToUpdate)
1144 NonLoopBlocksIDom[ChildBB] = NewIDom;
1145 }
1146
1147 Function *F = Header->getParent();
1148
1149 // Set up all the necessary basic blocks.
1150 BasicBlock *InsertTop;
1151 BasicBlock *InsertBot;
1152 BasicBlock *NewPreHeader = nullptr;
1153 DenseMap<Instruction *, Value *> ExitValues;
1154 if (PeelLast) {
1155 // It is convenient to split the single exit block from the latch the
1156 // into 3 parts - two blocks to anchor the peeled copy of the loop body,
1157 // and a new final exit block.
1158
1159 // Peeling the last iteration transforms.
1160 //
1161 // PreHeader:
1162 // ...
1163 // Header:
1164 // LoopBody
1165 // If (cond) goto Header
1166 // Exit:
1167 //
1168 // into
1169 //
1170 // Header:
1171 // LoopBody
1172 // If (cond) goto Header
1173 // InsertTop:
1174 // LoopBody
1175 // If (!cond) goto InsertBot
1176 // InsertBot:
1177 // Exit:
1178 // ...
1179 BasicBlock *Exit = L->getExitBlock();
1180 for (PHINode &P : Exit->phis())
1181 ExitValues[&P] = P.getIncomingValueForBlock(BB: Latch);
1182
1183 const SCEV *BTC = SE->getBackedgeTakenCount(L);
1184
1185 InsertTop = SplitEdge(From: Latch, To: Exit, DT: &DT, LI);
1186 InsertBot = SplitBlock(Old: InsertTop, SplitPt: InsertTop->getTerminator(), DT: &DT, LI);
1187
1188 InsertTop->setName(Exit->getName() + ".peel.begin");
1189 InsertBot->setName(Exit->getName() + ".peel.next");
1190 NewPreHeader = nullptr;
1191
1192 // If the original loop may only execute a single iteration we need to
1193 // insert a trip count check and skip the original loop with the last
1194 // iteration peeled off if necessary. Either way, we must update branch
1195 // weights to maintain the loop body frequency.
1196 if (SE->isKnownNonZero(S: BTC)) {
1197 // We have just proven that, when reached, the original loop always
1198 // executes at least two iterations. Thus, we unconditionally execute
1199 // both the remaining loop's initial iteration and the peeled iteration.
1200 // But that increases the latter's frequency above its frequency in the
1201 // original loop. To maintain the total frequency, we compensate by
1202 // decreasing the remaining loop body's frequency to indicate one less
1203 // iteration.
1204 //
1205 // We use this formula to convert probability to/from frequency:
1206 // Sum(i=0..inf)(P^i) = 1/(1-P) = Freq.
1207 if (BranchProbability P = getLoopProbability(L); !P.isUnknown()) {
1208 // Trying to subtract one from an infinite loop is pointless, and our
1209 // formulas then produce division by zero, so skip that case.
1210 if (BranchProbability ExitP = P.getCompl(); !ExitP.isZero()) {
1211 double Freq = 1 / ExitP.toDouble();
1212 // No branch weights can produce a frequency of less than one given
1213 // the initial iteration, and our formulas produce a negative
1214 // probability if we try.
1215 assert(Freq >= 1.0 && "expected freq >= 1 due to initial iteration");
1216 double NewFreq = std::max(a: Freq - 1, b: 1.0);
1217 setLoopProbability(
1218 L, P: BranchProbability::getBranchProbability(Prob: 1 - 1 / NewFreq));
1219 }
1220 }
1221 } else {
1222 NewPreHeader = SplitEdge(From: PreHeader, To: Header, DT: &DT, LI);
1223 SCEVExpander Expander(*SE, "loop-peel");
1224
1225 Instruction *PreHeaderBR = PreHeader->getTerminator();
1226 Value *BTCValue =
1227 Expander.expandCodeFor(SH: BTC, Ty: BTC->getType(), I: PreHeaderBR);
1228 IRBuilder<> B(PreHeaderBR);
1229 Value *Cond =
1230 B.CreateICmpNE(LHS: BTCValue, RHS: ConstantInt::get(Ty: BTCValue->getType(), V: 0));
1231 auto *BI = B.CreateCondBr(Cond, True: NewPreHeader, False: InsertTop);
1232 SmallVector<uint32_t> Weights;
1233 auto *OrigLatchBr = Latch->getTerminator();
1234 auto HasBranchWeights = !ProfcheckDisableMetadataFixes &&
1235 extractBranchWeights(I: *OrigLatchBr, Weights);
1236 if (HasBranchWeights) {
1237 // The probability that the new guard skips the loop to execute just one
1238 // iteration is the original loop's probability of exiting at the latch
1239 // after any iteration. That should maintain the original loop body
1240 // frequency. Upon arriving at the loop, due to the guard, the
1241 // probability of reaching iteration i of the new loop is the
1242 // probability of reaching iteration i+1 of the original loop. The
1243 // probability of reaching the peeled iteration is 1, which is the
1244 // probability of reaching iteration 0 of the original loop.
1245 if (L->getExitBlock() == OrigLatchBr->getSuccessor(Idx: 0))
1246 std::swap(a&: Weights[0], b&: Weights[1]);
1247 setBranchWeights(I&: *BI, Weights, /*IsExpected=*/false);
1248 }
1249 PreHeaderBR->eraseFromParent();
1250
1251 // PreHeader now dominates InsertTop.
1252 DT.changeImmediateDominator(BB: InsertTop, NewBB: PreHeader);
1253 }
1254 } else {
1255 // It is convenient to split the preheader into 3 parts - two blocks to
1256 // anchor the peeled copy of the loop body, and a new preheader for the
1257 // "real" loop.
1258
1259 // Peeling the first iteration transforms.
1260 //
1261 // PreHeader:
1262 // ...
1263 // Header:
1264 // LoopBody
1265 // If (cond) goto Header
1266 // Exit:
1267 //
1268 // into
1269 //
1270 // InsertTop:
1271 // LoopBody
1272 // If (!cond) goto Exit
1273 // InsertBot:
1274 // NewPreHeader:
1275 // ...
1276 // Header:
1277 // LoopBody
1278 // If (cond) goto Header
1279 // Exit:
1280 //
1281 // Each following iteration will split the current bottom anchor in two,
1282 // and put the new copy of the loop body between these two blocks. That
1283 // is, after peeling another iteration from the example above, we'll
1284 // split InsertBot, and get:
1285 //
1286 // InsertTop:
1287 // LoopBody
1288 // If (!cond) goto Exit
1289 // InsertBot:
1290 // LoopBody
1291 // If (!cond) goto Exit
1292 // InsertBot.next:
1293 // NewPreHeader:
1294 // ...
1295 // Header:
1296 // LoopBody
1297 // If (cond) goto Header
1298 // Exit:
1299 //
1300 InsertTop = SplitEdge(From: PreHeader, To: Header, DT: &DT, LI);
1301 InsertBot = SplitBlock(Old: InsertTop, SplitPt: InsertTop->getTerminator(), DT: &DT, LI);
1302 NewPreHeader = SplitBlock(Old: InsertBot, SplitPt: InsertBot->getTerminator(), DT: &DT, LI);
1303
1304 InsertTop->setName(Header->getName() + ".peel.begin");
1305 InsertBot->setName(Header->getName() + ".peel.next");
1306 NewPreHeader->setName(PreHeader->getName() + ".peel.newph");
1307 }
1308
1309 Instruction *LatchTerm =
1310 cast<Instruction>(Val: cast<BasicBlock>(Val: Latch)->getTerminator());
1311
1312 // Identify what noalias metadata is inside the loop: if it is inside the
1313 // loop, the associated metadata must be cloned for each iteration.
1314 SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
1315 identifyNoAliasScopesToClone(BBs: L->getBlocks(), NoAliasDeclScopes&: LoopLocalNoAliasDeclScopes);
1316
1317 // For each peeled-off iteration, make a copy of the loop.
1318 ValueToValueMapTy VMap;
1319 for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {
1320 SmallVector<BasicBlock *, 8> NewBlocks;
1321
1322 cloneLoopBlocks(L, IterNumber: Iter, PeelLast, InsertTop, InsertBot,
1323 OrigPreHeader: NewPreHeader ? PreHeader : nullptr, ExitEdges, NewBlocks,
1324 LoopBlocks, VMap, LVMap, DT: &DT, LI,
1325 LoopLocalNoAliasDeclScopes, SE&: *SE);
1326
1327 // Remap to use values from the current iteration instead of the
1328 // previous one.
1329 remapInstructionsInBlocks(Blocks: NewBlocks, VMap);
1330
1331 if (Iter == 0) {
1332 if (PeelLast) {
1333 // Adjust the exit condition so the loop exits one iteration early.
1334 // For now we simply subtract one form the second operand of the
1335 // exit condition. This relies on the peel count computation to
1336 // check that this is actually legal. In particular, it ensures that
1337 // the first operand of the compare is an AddRec with step 1 and we
1338 // execute more than one iteration.
1339 auto *Cmp =
1340 cast<ICmpInst>(Val: L->getLoopLatch()->getTerminator()->getOperand(i: 0));
1341 IRBuilder B(Cmp);
1342 Cmp->setOperand(
1343 i_nocapture: 1, Val_nocapture: B.CreateSub(LHS: Cmp->getOperand(i_nocapture: 1),
1344 RHS: ConstantInt::get(Ty: Cmp->getOperand(i_nocapture: 1)->getType(), V: 1)));
1345 } else {
1346 // Update IDoms of the blocks reachable through exits.
1347 for (auto BBIDom : NonLoopBlocksIDom)
1348 DT.changeImmediateDominator(BB: BBIDom.first,
1349 NewBB: cast<BasicBlock>(Val&: LVMap[BBIDom.second]));
1350 }
1351 }
1352
1353#ifdef EXPENSIVE_CHECKS
1354 assert(DT.verify(DominatorTree::VerificationLevel::Fast));
1355#endif
1356
1357 // Remove Loop metadata from the latch branch instruction
1358 // because it is not the Loop's latch branch anymore.
1359 auto *LatchTermCopy = cast<Instruction>(Val&: VMap[LatchTerm]);
1360 LatchTermCopy->setMetadata(KindID: LLVMContext::MD_loop, Node: nullptr);
1361
1362 InsertTop = InsertBot;
1363 InsertBot = SplitBlock(Old: InsertBot, SplitPt: InsertBot->getTerminator(), DT: &DT, LI);
1364 InsertBot->setName(Header->getName() + ".peel.next");
1365
1366 F->splice(ToIt: InsertTop->getIterator(), FromF: F, FromBeginIt: NewBlocks[0]->getIterator(),
1367 FromEndIt: F->end());
1368 }
1369
1370 if (PeelLast) {
1371 // Now adjust users of the original exit values by replacing them with the
1372 // exit value from the peeled iteration and remove them.
1373 for (const auto &[P, E] : ExitValues) {
1374 Instruction *ExitInst = dyn_cast<Instruction>(Val: E);
1375 if (ExitInst && L->contains(Inst: ExitInst))
1376 P->replaceAllUsesWith(V: &*VMap[ExitInst]);
1377 else
1378 P->replaceAllUsesWith(V: E);
1379 P->eraseFromParent();
1380 }
1381 formLCSSA(L&: *L, DT, LI, SE);
1382 } else {
1383 // Now adjust the phi nodes in the loop header to get their initial values
1384 // from the last peeled-off iteration instead of the preheader.
1385 for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(Val: I); ++I) {
1386 PHINode *PHI = cast<PHINode>(Val&: I);
1387 Value *NewVal = PHI->getIncomingValueForBlock(BB: Latch);
1388 Instruction *LatchInst = dyn_cast<Instruction>(Val: NewVal);
1389 if (LatchInst && L->contains(Inst: LatchInst))
1390 NewVal = LVMap[LatchInst];
1391
1392 PHI->setIncomingValueForBlock(BB: NewPreHeader, V: NewVal);
1393 }
1394 }
1395
1396 // Update Metadata for count of peeled off iterations.
1397 unsigned AlreadyPeeled = 0;
1398 if (auto Peeled = getOptionalIntLoopAttribute(TheLoop: L, Name: PeeledCountMetaData))
1399 AlreadyPeeled = *Peeled;
1400 unsigned TotalPeeled = AlreadyPeeled + PeelCount;
1401 addStringMetadataToLoop(TheLoop: L, MDString: PeeledCountMetaData, V: TotalPeeled);
1402
1403 // Update metadata for the estimated trip count. The original branch weight
1404 // metadata is already correct for both the remaining loop and the peeled loop
1405 // iterations, so do not adjust it.
1406 //
1407 // For example, consider what happens when peeling 2 iterations from a loop
1408 // with an estimated trip count of 10 and inserting them before the remaining
1409 // loop. Each of the peeled iterations and each iteration in the remaining
1410 // loop still has the same probability of exiting the *entire original* loop
1411 // as it did when in the original loop, and thus it should still have the same
1412 // branch weights. The peeled iterations' non-zero probabilities of exiting
1413 // already appropriately reduce the probability of reaching the remaining
1414 // iterations just as they did in the original loop. Trying to also adjust
1415 // the remaining loop's branch weights to reflect its new trip count of 8 will
1416 // erroneously further reduce its block frequencies. However, in case an
1417 // analysis later needs to determine the trip count of the remaining loop
1418 // while examining it in isolation without considering the probability of
1419 // actually reaching it, we store the new trip count as separate metadata.
1420 if (auto EstimatedTripCount = getLoopEstimatedTripCount(L)) {
1421 unsigned EstimatedTripCountNew = *EstimatedTripCount;
1422 if (EstimatedTripCountNew < TotalPeeled)
1423 EstimatedTripCountNew = 0;
1424 else
1425 EstimatedTripCountNew -= TotalPeeled;
1426 setLoopEstimatedTripCount(L, EstimatedTripCount: EstimatedTripCountNew);
1427 }
1428
1429 if (Loop *ParentLoop = L->getParentLoop())
1430 L = ParentLoop;
1431
1432 // We modified the loop, update SE.
1433 SE->forgetTopmostLoop(L);
1434 SE->forgetBlockAndLoopDispositions();
1435
1436#ifdef EXPENSIVE_CHECKS
1437 // Finally DomtTree must be correct.
1438 assert(DT.verify(DominatorTree::VerificationLevel::Fast));
1439#endif
1440
1441 // FIXME: Incrementally update loop-simplify
1442 simplifyLoop(L, DT: &DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA);
1443
1444 NumPeeled++;
1445 NumPeeledEnd += PeelLast;
1446}
1447