//===- LoopPeel.cpp -------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Loop Peeling Utilities.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace llvm::SCEVPatternMatch;

#define DEBUG_TYPE "loop-peel"

STATISTIC(NumPeeled, "Number of loops peeled");
STATISTIC(NumPeeledEnd, "Number of loops peeled from end");

static cl::opt<unsigned> UnrollPeelCount(
    "unroll-peel-count", cl::Hidden,
    cl::desc("Set the unroll peeling count, for testing purposes"));

static cl::opt<bool>
    UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
                       cl::desc("Allows loops to be peeled when the dynamic "
                                "trip count is known to be low."));

static cl::opt<bool>
    UnrollAllowLoopNestsPeeling("unroll-allow-loop-nests-peeling",
                                cl::init(false), cl::Hidden,
                                cl::desc("Allows loop nests to be peeled."));

static cl::opt<unsigned> UnrollPeelMaxCount(
    "unroll-peel-max-count", cl::init(7), cl::Hidden,
    cl::desc("Max average trip count which will cause loop peeling."));

static cl::opt<unsigned> UnrollForcePeelCount(
    "unroll-force-peel-count", cl::init(0), cl::Hidden,
    cl::desc("Force a peel count regardless of profiling information."));

static cl::opt<bool> DisableAdvancedPeeling(
    "disable-advanced-peeling", cl::init(false), cl::Hidden,
    cl::desc(
        "Disable advanced peeling. Issues for convergent targets (D134803)."));

static const char *PeeledCountMetaData = "llvm.loop.peeled.count";
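// The peeled count is recorded as integer loop metadata, e.g. after peeling
// two iterations: !{!"llvm.loop.peeled.count", i32 2}.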

// Check whether we are capable of peeling this loop.
bool llvm::canPeel(const Loop *L) {
  // Make sure the loop is in simplified form.
  if (!L->isLoopSimplifyForm())
    return false;
  if (!DisableAdvancedPeeling)
    return true;

  SmallVector<BasicBlock *, 4> Exits;
  L->getUniqueNonLatchExitBlocks(Exits);
  // The latch must either be the only exiting block, or all non-latch exit
  // blocks must either have a deopt or unreachable terminator or form a chain
  // of blocks whose last one is deopt- or unreachable-terminated. Both deopt
  // and unreachable terminators are a strong indication they are not taken.
  // Note that this is a profitability check, not a legality check. Also note
  // that LoopPeeling currently can only update the branch weights of latch
  // blocks, and branch weights to blocks with deopt or unreachable do not
  // need updating.
  return llvm::all_of(Exits, IsBlockFollowedByDeoptOrUnreachable);
}

namespace {

// As a loop is peeled, it may be the case that Phi nodes become
// loop-invariant (i.e., known because there is only one choice).
// For example, consider the following function:
//   void g(int);
//   void binary() {
//     int x = 0;
//     int y = 0;
//     int a = 0;
//     for (int i = 0; i < 100000; ++i) {
//       g(x);
//       x = y;
//       g(a);
//       y = a + 1;
//       a = 5;
//     }
//   }
// Peeling 3 iterations is beneficial because the values for x, y and a
// become known. The IR for this loop looks something like the following:
//
//   %i = phi i32 [ 0, %entry ], [ %inc, %if.end ]
//   %a = phi i32 [ 0, %entry ], [ 5, %if.end ]
//   %y = phi i32 [ 0, %entry ], [ %add, %if.end ]
//   %x = phi i32 [ 0, %entry ], [ %y, %if.end ]
//   ...
//   tail call void @_Z1gi(i32 signext %x)
//   tail call void @_Z1gi(i32 signext %a)
//   %add = add nuw nsw i32 %a, 1
//   %inc = add nuw nsw i32 %i, 1
//   %exitcond = icmp eq i32 %inc, 100000
//   br i1 %exitcond, label %for.cond.cleanup, label %for.body
//
// The arguments for the calls to g will become known after 3 iterations
// of the loop, because the phi node values become known after 3 iterations
// of the loop (i.e., they are known on the 4th iteration, so peel 3
// iterations). The first iteration has g(0), g(0); the second has g(0), g(5);
// the third has g(1), g(5) and the fourth (and all subsequent) have g(6),
// g(5). Now consider the phi nodes:
//   %a is a phi with constants so it is determined after iteration 1.
//   %y is a phi based on a constant and %a so it is determined on
//     the iteration after %a is determined, so iteration 2.
//   %x is a phi based on a constant and %y so it is determined on
//     the iteration after %y, so iteration 3.
//   %i is based on itself (and is an induction variable) so it is
//     never determined.
// This means that peeling off 3 iterations will result in being able to
// remove the phi nodes for %a, %y, and %x. The arguments for the
// corresponding calls to g are determined and the code for computing
// x, y, and a can be removed.
//
// The PhiAnalyzer class calculates how many times a loop should be
// peeled based on the above analysis of the phi nodes in the loop while
// respecting the maximum specified.
class PhiAnalyzer {
public:
  PhiAnalyzer(const Loop &L, unsigned MaxIterations);

  // Calculate the sufficient minimum number of iterations of the loop to peel
  // such that phi instructions become determined (subject to allowable limits)
  std::optional<unsigned> calculateIterationsToPeel();

protected:
  using PeelCounter = std::optional<unsigned>;
  const PeelCounter Unknown = std::nullopt;

  // Add 1 respecting Unknown and return Unknown if result over MaxIterations
  PeelCounter addOne(PeelCounter PC) const {
    if (PC == Unknown)
      return Unknown;
    return (*PC + 1 <= MaxIterations) ? PeelCounter{*PC + 1} : Unknown;
  }

  // Calculate the number of iterations after which the given value
  // becomes an invariant.
  PeelCounter calculate(const Value &);

  const Loop &L;
  const unsigned MaxIterations;

  // Map of Values to number of iterations to invariance
  SmallDenseMap<const Value *, PeelCounter> IterationsToInvariance;
};

PhiAnalyzer::PhiAnalyzer(const Loop &L, unsigned MaxIterations)
    : L(L), MaxIterations(MaxIterations) {
  assert(canPeel(&L) && "loop is not suitable for peeling");
  assert(MaxIterations > 0 && "no peeling is allowed?");
}

// This function calculates the number of iterations after which the value
// becomes an invariant. The pre-calculated values are memoized in a map.
// N.B. This number will be Unknown or <= MaxIterations.
// The function is calculated according to the following definition:
// Given %x = phi <Inputs from above the loop>, ..., [%y, %back.edge].
//   F(%x) = G(%y) + 1 (N.B. [MaxIterations | Unknown] + 1 => Unknown)
//   G(%y) = 0 if %y is a loop invariant
//   G(%y) = G(%BackEdgeValue) if %y is a phi in the header block
//   G(%y) = TODO: if %y is an expression based on phis and loop invariants
//     The example looks like:
//       %x = phi(0, %a)  <-- becomes invariant starting from the 3rd iteration
//       %y = phi(0, 5)
//       %a = %y + 1
//   G(%y) = Unknown otherwise (including phi not in header block)
PhiAnalyzer::PeelCounter PhiAnalyzer::calculate(const Value &V) {
  // If we already know the answer, take it from the map.
  // Otherwise, place Unknown into the map to avoid infinite recursion. Such
  // cycles can never stop on an invariant.
  auto [I, Inserted] = IterationsToInvariance.try_emplace(&V, Unknown);
  if (!Inserted)
    return I->second;

  if (L.isLoopInvariant(&V))
    // Loop invariant so known at start.
    return (IterationsToInvariance[&V] = 0);
  if (const PHINode *Phi = dyn_cast<PHINode>(&V)) {
    if (Phi->getParent() != L.getHeader()) {
      // Phi is not in header block so Unknown.
      assert(IterationsToInvariance[&V] == Unknown && "unexpected value saved");
      return Unknown;
    }
    // We need to analyze the input from the back edge and add 1.
    Value *Input = Phi->getIncomingValueForBlock(L.getLoopLatch());
    PeelCounter Iterations = calculate(*Input);
    assert(IterationsToInvariance[Input] == Iterations &&
           "unexpected value saved");
    return (IterationsToInvariance[Phi] = addOne(Iterations));
  }
  if (const Instruction *I = dyn_cast<Instruction>(&V)) {
    if (isa<CmpInst>(I) || I->isBinaryOp()) {
      // Binary instructions get the max of the operands.
      PeelCounter LHS = calculate(*I->getOperand(0));
      if (LHS == Unknown)
        return Unknown;
      PeelCounter RHS = calculate(*I->getOperand(1));
      if (RHS == Unknown)
        return Unknown;
      return (IterationsToInvariance[I] = {std::max(*LHS, *RHS)});
    }
    if (I->isCast())
      // Cast instructions get the value of the operand.
      return (IterationsToInvariance[I] = calculate(*I->getOperand(0)));
  }
  // TODO: handle more expressions

  // Everything else is Unknown.
  assert(IterationsToInvariance[&V] == Unknown && "unexpected value saved");
  return Unknown;
}

std::optional<unsigned> PhiAnalyzer::calculateIterationsToPeel() {
  unsigned Iterations = 0;
  for (auto &PHI : L.getHeader()->phis()) {
    PeelCounter ToInvariance = calculate(PHI);
    if (ToInvariance != Unknown) {
      assert(*ToInvariance <= MaxIterations && "bad result in phi analysis");
      Iterations = std::max(Iterations, *ToInvariance);
      if (Iterations == MaxIterations)
        break;
    }
  }
  assert((Iterations <= MaxIterations) && "bad result in phi analysis");
  return Iterations ? std::optional<unsigned>(Iterations) : std::nullopt;
}

} // unnamed namespace

// Try to find any invariant memory reads that will become dereferenceable in
// the remainder loop after peeling. The load must also be used (transitively)
// by an exit condition. Returns the number of iterations to peel off (at the
// moment either 0 or 1).
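//
// An illustrative sketch (assumed shape, not from a specific test): in
//
//   header:
//     br label %check
//   check:
//     %c = load i32, ptr %p          ; %p loop-invariant, not known to be
//     %ok = icmp ult i32 %c, %bound  ; dereferenceable up front
//     br i1 %ok, label %latch, label %trap.exit  ; %trap.exit: unreachable
//   latch:
//     br i1 %cond, label %header, label %exit
//
// the branch in %check transitively uses the load, so peeling one iteration
// executes the load once up front, making it dereferenceable (and hoistable)
// in the remainder loop.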
static unsigned peelToTurnInvariantLoadsDereferenceable(Loop &L,
                                                        DominatorTree &DT,
                                                        AssumptionCache *AC) {
  // Skip loops with a single exiting block, because there should be no benefit
  // for the heuristic below.
  if (L.getExitingBlock())
    return 0;

  // All non-latch exit blocks must have an UnreachableInst terminator.
  // Otherwise the heuristic below may not be profitable.
  SmallVector<BasicBlock *, 4> Exits;
  L.getUniqueNonLatchExitBlocks(Exits);
  if (any_of(Exits, [](const BasicBlock *BB) {
        return !isa<UnreachableInst>(BB->getTerminator());
      }))
    return 0;

  // Now look for invariant loads that dominate the latch and are not known to
  // be dereferenceable. If there are such loads and no writes, they will
  // become dereferenceable in the loop if the first iteration is peeled off.
  // Also collect the set of instructions controlled by such loads. Only peel
  // if an exit condition uses (transitively) such a load.
  BasicBlock *Header = L.getHeader();
  BasicBlock *Latch = L.getLoopLatch();
  SmallPtrSet<Value *, 8> LoadUsers;
  const DataLayout &DL = L.getHeader()->getDataLayout();
  for (BasicBlock *BB : L.blocks()) {
    for (Instruction &I : *BB) {
      if (I.mayWriteToMemory())
        return 0;

      if (LoadUsers.contains(&I))
        LoadUsers.insert_range(I.users());
      // Do not look for reads in the header; they can already be hoisted
      // without peeling.
      if (BB == Header)
        continue;
      if (auto *LI = dyn_cast<LoadInst>(&I)) {
        Value *Ptr = LI->getPointerOperand();
        if (DT.dominates(BB, Latch) && L.isLoopInvariant(Ptr) &&
            !isDereferenceablePointer(Ptr, LI->getType(), DL, LI, AC, &DT))
          LoadUsers.insert_range(I.users());
      }
    }
  }
  SmallVector<BasicBlock *> ExitingBlocks;
  L.getExitingBlocks(ExitingBlocks);
  if (any_of(ExitingBlocks, [&LoadUsers](BasicBlock *Exiting) {
        return LoadUsers.contains(Exiting->getTerminator());
      }))
    return 1;
  return 0;
}

bool llvm::canPeelLastIteration(const Loop &L, ScalarEvolution &SE) {
  const SCEV *BTC = SE.getBackedgeTakenCount(&L);
  if (isa<SCEVCouldNotCompute>(BTC))
    return false;

  // Check if the exit condition of the loop can be adjusted by the peeling
  // codegen. For now, it must:
  //  * exit via the latch, and
  //  * the exit condition must be an EQ/NE compare of an induction with a
  //    step of 1, used only by the exiting branch.
  BasicBlock *Latch = L.getLoopLatch();
  Value *Inc;
  Value *Bound;
  CmpPredicate Pred;
  BasicBlock *Succ1;
  BasicBlock *Succ2;
  return Latch && Latch == L.getExitingBlock() &&
         match(Latch->getTerminator(),
               m_Br(m_OneUse(m_ICmp(Pred, m_Value(Inc), m_Value(Bound))),
                    m_BasicBlock(Succ1), m_BasicBlock(Succ2))) &&
         ((Pred == CmpInst::ICMP_EQ && Succ2 == L.getHeader()) ||
          (Pred == CmpInst::ICMP_NE && Succ1 == L.getHeader())) &&
         Bound->getType()->isIntegerTy() &&
         SE.isLoopInvariant(SE.getSCEV(Bound), &L) &&
         match(SE.getSCEV(Inc),
               m_scev_AffineAddRec(m_SCEV(), m_scev_One(), m_SpecificLoop(&L)));
}

/// Returns true if the last iteration can be peeled off, the inverse of the
/// condition (Pred LeftAR, RightSCEV) is known at the last iteration, and the
/// condition itself is known at the second-to-last iteration.
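///
/// For example (illustrative): with an induction {0,+,1}, a body compare
/// "iv != n - 1" and backedge-taken count n - 1 (for n >= 2), the inverse
/// condition (iv == n - 1) holds at the last iteration and the condition
/// itself at the second-to-last (iv == n - 2), so peeling off the last
/// iteration makes the compare invariant in both the loop and the peeled
/// copy.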
static bool shouldPeelLastIteration(Loop &L, CmpPredicate Pred,
                                    const SCEVAddRecExpr *LeftAR,
                                    const SCEV *RightSCEV, ScalarEvolution &SE,
                                    const TargetTransformInfo &TTI) {
  if (!canPeelLastIteration(L, SE))
    return false;

  const SCEV *BTC = SE.getBackedgeTakenCount(&L);
  SCEVExpander Expander(SE, L.getHeader()->getDataLayout(), "loop-peel");
  if (!SE.isKnownNonZero(BTC) &&
      Expander.isHighCostExpansion(BTC, &L, SCEVCheapExpansionBudget, &TTI,
                                   L.getLoopPredecessor()->getTerminator()))
    return false;

  auto Guards = ScalarEvolution::LoopGuards::collect(&L, SE);
  BTC = SE.applyLoopGuards(BTC, Guards);
  RightSCEV = SE.applyLoopGuards(RightSCEV, Guards);
  const SCEV *ValAtLastIter = LeftAR->evaluateAtIteration(BTC, SE);
  const SCEV *ValAtSecondToLastIter = LeftAR->evaluateAtIteration(
      SE.getMinusSCEV(BTC, SE.getOne(BTC->getType())), SE);

  return SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred),
                             ValAtLastIter, RightSCEV) &&
         SE.isKnownPredicate(Pred, ValAtSecondToLastIter, RightSCEV);
}

// Return the number of iterations to peel off from the beginning and end of
// the loop, respectively, that make conditions in the body known true/false.
// For example, if we peel 2 iterations off the loop below, the condition
// i < 2 can be evaluated at compile time.
//
//   for (i = 0; i < n; i++)
//     if (i < 2)
//       ..
//     else
//       ..
static std::pair<unsigned, unsigned>
countToEliminateCompares(Loop &L, unsigned MaxPeelCount, ScalarEvolution &SE,
                         const TargetTransformInfo &TTI) {
  assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form");
  unsigned DesiredPeelCount = 0;
  unsigned DesiredPeelCountLast = 0;

  // Do not peel the entire loop.
  const SCEV *BE = SE.getConstantMaxBackedgeTakenCount(&L);
  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(BE))
    MaxPeelCount =
        std::min((unsigned)SC->getAPInt().getLimitedValue() - 1, MaxPeelCount);

  // Increase PeelCount while the (IterVal Pred BoundSCEV) condition is
  // satisfied; return true if the inverse condition becomes known before
  // reaching the MaxPeelCount limit.
  auto PeelWhilePredicateIsKnown =
      [&](unsigned &PeelCount, const SCEV *&IterVal, const SCEV *BoundSCEV,
          const SCEV *Step, ICmpInst::Predicate Pred) {
        while (PeelCount < MaxPeelCount &&
               SE.isKnownPredicate(Pred, IterVal, BoundSCEV)) {
          IterVal = SE.getAddExpr(IterVal, Step);
          ++PeelCount;
        }
        return SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal,
                                   BoundSCEV);
      };
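
  // For example (illustrative), for an AddRec {0,+,1} and the condition
  // (IterVal ult 2), the helper advances IterVal from 0 to 2 while bumping
  // PeelCount by 2, and then returns true because (2 uge 2) is known
  // (provided MaxPeelCount allows peeling two iterations).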

  const unsigned MaxDepth = 4;
  std::function<void(Value *, unsigned)> ComputePeelCount =
      [&](Value *Condition, unsigned Depth) -> void {
    if (!Condition->getType()->isIntegerTy() || Depth >= MaxDepth)
      return;

    Value *LeftVal, *RightVal;
    if (match(Condition, m_And(m_Value(LeftVal), m_Value(RightVal))) ||
        match(Condition, m_Or(m_Value(LeftVal), m_Value(RightVal)))) {
      ComputePeelCount(LeftVal, Depth + 1);
      ComputePeelCount(RightVal, Depth + 1);
      return;
    }

    CmpPredicate Pred;
    if (!match(Condition, m_ICmp(Pred, m_Value(LeftVal), m_Value(RightVal))))
      return;

    const SCEV *LeftSCEV = SE.getSCEV(LeftVal);
    const SCEV *RightSCEV = SE.getSCEV(RightVal);

    // Do not consider predicates that are known to be true or false
    // independently of the loop iteration.
    if (SE.evaluatePredicate(Pred, LeftSCEV, RightSCEV))
      return;

    // Check if we have a condition with one AddRec and one non-AddRec
    // expression. Normalize LeftSCEV to be the AddRec.
    if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
      if (isa<SCEVAddRecExpr>(RightSCEV)) {
        std::swap(LeftSCEV, RightSCEV);
        Pred = ICmpInst::getSwappedPredicate(Pred);
      } else
        return;
    }

    const SCEVAddRecExpr *LeftAR = cast<SCEVAddRecExpr>(LeftSCEV);

    // Avoid huge SCEV computations in the loop below; make sure we only
    // consider AddRecs of the loop we are trying to peel.
    if (!LeftAR->isAffine() || LeftAR->getLoop() != &L)
      return;
    if (!(ICmpInst::isEquality(Pred) && LeftAR->hasNoSelfWrap()) &&
        !SE.getMonotonicPredicateType(LeftAR, Pred))
      return;

    // Check if extending the current DesiredPeelCount lets us evaluate Pred
    // or !Pred in the loop body statically.
    unsigned NewPeelCount = DesiredPeelCount;

    const SCEV *IterVal = LeftAR->evaluateAtIteration(
        SE.getConstant(LeftSCEV->getType(), NewPeelCount), SE);

    // If the original condition is not known, get the negated predicate
    // (which holds on the else branch) and check if it is known. This allows
    // us to peel off iterations that make the original condition false.
    if (!SE.isKnownPredicate(Pred, IterVal, RightSCEV))
      Pred = ICmpInst::getInversePredicate(Pred);

    const SCEV *Step = LeftAR->getStepRecurrence(SE);
    if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, RightSCEV, Step,
                                   Pred)) {
      if (shouldPeelLastIteration(L, Pred, LeftAR, RightSCEV, SE, TTI))
        DesiredPeelCountLast = 1;
      return;
    }

    // However, for equality comparisons, that isn't always sufficient to
    // eliminate the comparison in the loop body; we may need to peel one more
    // iteration. See if that makes !Pred become unknown again.
    const SCEV *NextIterVal = SE.getAddExpr(IterVal, Step);
    if (ICmpInst::isEquality(Pred) &&
        !SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), NextIterVal,
                             RightSCEV) &&
        !SE.isKnownPredicate(Pred, IterVal, RightSCEV) &&
        SE.isKnownPredicate(Pred, NextIterVal, RightSCEV)) {
      if (NewPeelCount >= MaxPeelCount)
        return; // Need to peel one more iteration, but can't. Give up.
      ++NewPeelCount; // Great!
    }

    DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount);
    DesiredPeelCountLast = std::max(DesiredPeelCountLast, NewPeelCount);
  };

  auto ComputePeelCountMinMax = [&](MinMaxIntrinsic *MinMax) {
    if (!MinMax->getType()->isIntegerTy())
      return;
    Value *LHS = MinMax->getLHS(), *RHS = MinMax->getRHS();
    const SCEV *BoundSCEV, *IterSCEV;
    if (L.isLoopInvariant(LHS)) {
      BoundSCEV = SE.getSCEV(LHS);
      IterSCEV = SE.getSCEV(RHS);
    } else if (L.isLoopInvariant(RHS)) {
      BoundSCEV = SE.getSCEV(RHS);
      IterSCEV = SE.getSCEV(LHS);
    } else
      return;
    const auto *AddRec = dyn_cast<SCEVAddRecExpr>(IterSCEV);
    // For simplicity, we support only affine recurrences.
    if (!AddRec || !AddRec->isAffine() || AddRec->getLoop() != &L)
      return;
    const SCEV *Step = AddRec->getStepRecurrence(SE);
    bool IsSigned = MinMax->isSigned();
    // To minimize the number of peeled iterations, we use strict relational
    // predicates here.
    ICmpInst::Predicate Pred;
    if (SE.isKnownPositive(Step))
      Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
    else if (SE.isKnownNegative(Step))
      Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
    else
      return;
    // Check that AddRec is not wrapping.
    if (!(IsSigned ? AddRec->hasNoSignedWrap() : AddRec->hasNoUnsignedWrap()))
      return;
    unsigned NewPeelCount = DesiredPeelCount;
    const SCEV *IterVal = AddRec->evaluateAtIteration(
        SE.getConstant(AddRec->getType(), NewPeelCount), SE);
    if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, BoundSCEV, Step,
                                   Pred)) {
      if (shouldPeelLastIteration(L, Pred, AddRec, BoundSCEV, SE, TTI))
        DesiredPeelCountLast = 1;
      return;
    }
    DesiredPeelCount = NewPeelCount;
  };
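
  // For example (illustrative): for umin(%iv, 4) with %iv = {0,+,1}, Step is
  // known positive, so Pred is ult; starting from a desired count of 0 the
  // helper peels 4 iterations, after which (%iv uge 4) is known and
  // umin(%iv, 4) folds to 4 in the remainder loop.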

  for (BasicBlock *BB : L.blocks()) {
    for (Instruction &I : *BB) {
      if (SelectInst *SI = dyn_cast<SelectInst>(&I))
        ComputePeelCount(SI->getCondition(), 0);
      if (MinMaxIntrinsic *MinMax = dyn_cast<MinMaxIntrinsic>(&I))
        ComputePeelCountMinMax(MinMax);
    }

    auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
    if (!BI || BI->isUnconditional())
      continue;

    // Ignore the loop exit condition.
    if (L.getLoopLatch() == BB)
      continue;

    ComputePeelCount(BI->getCondition(), 0);
  }

  return {DesiredPeelCount, DesiredPeelCountLast};
}

/// This "heuristic" exactly matches implicit behavior which used to exist
/// inside getLoopEstimatedTripCount. It was added here to keep an
/// improvement inside that API from causing peeling to become more aggressive.
/// This should probably be removed.
static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
  BasicBlock *Latch = L->getLoopLatch();
  if (!Latch)
    return true;

  BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
    return true;

  assert((LatchBR->getSuccessor(0) == L->getHeader() ||
          LatchBR->getSuccessor(1) == L->getHeader()) &&
         "At least one edge out of the latch must go to the header");

  SmallVector<BasicBlock *, 4> ExitBlocks;
  L->getUniqueNonLatchExitBlocks(ExitBlocks);
  return any_of(ExitBlocks, [](const BasicBlock *EB) {
    return !EB->getTerminatingDeoptimizeCall();
  });
}

// Compute the number of iterations we want to peel off and record it in
// PP.PeelCount.
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
                            TargetTransformInfo::PeelingPreferences &PP,
                            unsigned TripCount, DominatorTree &DT,
                            ScalarEvolution &SE, const TargetTransformInfo &TTI,
                            AssumptionCache *AC, unsigned Threshold) {
  assert(LoopSize > 0 && "Zero loop size is not allowed!");
  // Save the PP.PeelCount value set by the target in
  // TTI.getPeelingPreferences or by the flag -unroll-peel-count.
  unsigned TargetPeelCount = PP.PeelCount;
  PP.PeelCount = 0;
  PP.PeelLast = false;
  if (!canPeel(L))
    return;

  // Only try to peel innermost loops by default.
  // The constraint can be relaxed by the target in TTI.getPeelingPreferences
  // or by the flag -unroll-allow-loop-nests-peeling.
  if (!PP.AllowLoopNestsPeeling && !L->isInnermost())
    return;

  // If the user provided a peel count, use that.
  bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
  if (UserPeelCount) {
    LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
                      << " iterations.\n");
    PP.PeelCount = UnrollForcePeelCount;
    PP.PeelProfiledIterations = true;
    return;
  }

  // Skip peeling if it's disabled.
  if (!PP.AllowPeeling)
    return;

  // Check that we can peel at least one iteration.
  if (2 * LoopSize > Threshold)
    return;

  unsigned AlreadyPeeled = 0;
  if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
    AlreadyPeeled = *Peeled;
  // Stop if we already peeled off the maximum number of iterations.
  if (AlreadyPeeled >= UnrollPeelMaxCount)
    return;

  // Pay respect to limitations implied by loop size and the max peel count.
  unsigned MaxPeelCount = UnrollPeelMaxCount;
  MaxPeelCount = std::min(MaxPeelCount, Threshold / LoopSize - 1);

  // Start the max computation with the PP.PeelCount value set by the target
  // in TTI.getPeelingPreferences or by the flag -unroll-peel-count.
  unsigned DesiredPeelCount = TargetPeelCount;

  // Here we try to get rid of Phis which become invariants after 1, 2, ..., N
  // iterations of the loop. For this we compute the number of iterations after
  // which every Phi is guaranteed to become an invariant, and try to peel the
  // maximum number of iterations among these values, thus turning all those
  // Phis into invariants.
  if (MaxPeelCount > DesiredPeelCount) {
    // Check how many iterations are useful for resolving Phis.
    auto NumPeels = PhiAnalyzer(*L, MaxPeelCount).calculateIterationsToPeel();
    if (NumPeels)
      DesiredPeelCount = std::max(DesiredPeelCount, *NumPeels);
  }

  const auto &[CountToEliminateCmps, CountToEliminateCmpsLast] =
      countToEliminateCompares(*L, MaxPeelCount, SE, TTI);
  DesiredPeelCount = std::max(DesiredPeelCount, CountToEliminateCmps);

  if (DesiredPeelCount == 0)
    DesiredPeelCount = peelToTurnInvariantLoadsDereferenceable(*L, DT, AC);

  if (DesiredPeelCount > 0) {
    DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);
    // Consider the max peel count limitation.
    assert(DesiredPeelCount > 0 && "Wrong loop size estimation?");
    if (DesiredPeelCount + AlreadyPeeled <= UnrollPeelMaxCount) {
      LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
                        << " iteration(s) to turn"
                        << " some Phis into invariants.\n");
      PP.PeelCount = DesiredPeelCount;
      PP.PeelProfiledIterations = false;
      PP.PeelLast = false;
      return;
    }
  }

  if (CountToEliminateCmpsLast > 0) {
    unsigned DesiredPeelCountLast =
        std::min(CountToEliminateCmpsLast, MaxPeelCount);
    // Consider the max peel count limitation.
    assert(DesiredPeelCountLast > 0 && "Wrong loop size estimation?");
    if (DesiredPeelCountLast + AlreadyPeeled <= UnrollPeelMaxCount) {
      LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCountLast
                        << " iteration(s) from the end to eliminate"
                        << " compares.\n");
      PP.PeelCount = DesiredPeelCountLast;
      PP.PeelProfiledIterations = false;
      PP.PeelLast = true;
      return;
    }
  }

  // Bail if we know the statically calculated trip count.
  // In this case we rather prefer partial unrolling.
  if (TripCount)
    return;

  // Do not apply profile-based peeling if it is disabled.
  if (!PP.PeelProfiledIterations)
    return;
  // If we don't know the trip count, but have reason to believe the average
  // trip count is low, peeling should be beneficial, since we will usually
  // hit the peeled section.
  // We only do this in the presence of profile information, since otherwise
  // our estimates of the trip count are not reliable enough.
  if (L->getHeader()->getParent()->hasProfileData()) {
    if (violatesLegacyMultiExitLoopCheck(L))
      return;
    std::optional<unsigned> EstimatedTripCount = getLoopEstimatedTripCount(L);
    if (!EstimatedTripCount)
      return;

    LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is "
                      << *EstimatedTripCount << "\n");

    if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount) {
      unsigned PeelCount = *EstimatedTripCount;
      LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n");
      PP.PeelCount = PeelCount;
      return;
    }
    LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
    LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
    LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n");
    LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
    LLVM_DEBUG(dbgs() << "Max peel count by cost: "
                      << (Threshold / LoopSize - 1) << "\n");
  }
}

struct WeightInfo {
  // Weights for the current iteration.
  SmallVector<uint32_t> Weights;
  // Weights to subtract after each iteration.
  const SmallVector<uint32_t> SubWeights;
};

/// Update the branch weights of an exiting block of a peeled-off loop
/// iteration.
/// Let F be the weight of the edge that continues (falls through) into the
/// loop, and E the weight of the edge to an exit.
/// F/(F+E) is the probability of staying in the loop, and E/(F+E) the
/// probability of exiting.
/// Then, Estimated ExitCount = F / E.
/// For the I-th (counting from 0) peeled-off iteration we set the weights for
/// the peeled exit to (EC - I, 1). This gives a reasonable distribution: the
/// probability of taking the exit, 1/(EC-I), increases, while at the same
/// time the estimated exit count in the remainder loop decreases by I.
/// To avoid dealing with division rounding, we can simply multiply both parts
/// of the weights by E and use (F - I * E, E) as the weights.
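///
/// For example (illustrative), with latch weights (F, E) = (99, 1) the
/// estimated exit count is 99: the first peeled iteration keeps weights
/// (99, 1), the next gets (98, 1), and so on, while the same amount is
/// subtracted from the weights left for the remaining iterations.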
static void updateBranchWeights(Instruction *Term, WeightInfo &Info) {
  setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
  for (auto [Idx, SubWeight] : enumerate(Info.SubWeights))
    if (SubWeight != 0)
      // Don't set the probability of taking the edge from latch to loop header
      // to less than a 1:1 ratio (meaning Weight should not be lower than
      // SubWeight), as this could significantly reduce the loop's hotness,
      // which would be incorrect in the case of underestimating the trip
      // count.
      Info.Weights[Idx] =
          Info.Weights[Idx] > SubWeight
              ? std::max(Info.Weights[Idx] - SubWeight, SubWeight)
              : SubWeight;
}

/// Initialize the weights for all exiting blocks.
static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos,
                              Loop *L) {
  SmallVector<BasicBlock *> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  for (BasicBlock *ExitingBlock : ExitingBlocks) {
    Instruction *Term = ExitingBlock->getTerminator();
    SmallVector<uint32_t> Weights;
    if (!extractBranchWeights(*Term, Weights))
      continue;

    // See the comment on updateBranchWeights() for an explanation of what we
    // do here.
    uint32_t FallThroughWeights = 0;
    uint32_t ExitWeights = 0;
    for (auto [Succ, Weight] : zip(successors(Term), Weights)) {
      if (L->contains(Succ))
        FallThroughWeights += Weight;
      else
        ExitWeights += Weight;
    }

    // Don't try to update weights for the degenerate case.
    if (FallThroughWeights == 0)
      continue;

    SmallVector<uint32_t> SubWeights;
    for (auto [Succ, Weight] : zip(successors(Term), Weights)) {
      if (!L->contains(Succ)) {
        // Exit weights stay the same.
        SubWeights.push_back(0);
        continue;
      }

      // Subtract exit weights on each iteration, distributed across all
      // fallthrough edges.
      double W = (double)Weight / (double)FallThroughWeights;
      SubWeights.push_back((uint32_t)(ExitWeights * W));
    }

    WeightInfos.insert({Term, {std::move(Weights), std::move(SubWeights)}});
  }
}

/// Clones the body of the loop L, putting it between \p InsertTop and \p
/// InsertBot.
/// \param IterNumber The serial number of the iteration currently being
/// peeled off.
/// \param PeelLast Peel off the last iterations from \p L.
/// \param ExitEdges The exit edges of the original loop.
/// \param[out] NewBlocks A list of the blocks in the newly created clone.
/// \param[out] VMap The value map between the loop and the new clone.
/// \param LoopBlocks A helper for DFS-traversal of the loop.
/// \param LVMap A value-map that maps instructions from the original loop to
/// instructions in the last peeled-off iteration.
static void cloneLoopBlocks(
    Loop *L, unsigned IterNumber, bool PeelLast, BasicBlock *InsertTop,
    BasicBlock *InsertBot, BasicBlock *OrigPreHeader,
    SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges,
    SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
    ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT,
    LoopInfo *LI, ArrayRef<MDNode *> LoopLocalNoAliasDeclScopes,
    ScalarEvolution &SE) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlock *PreHeader = L->getLoopPreheader();

  Function *F = Header->getParent();
  LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
  Loop *ParentLoop = L->getParentLoop();

  // For each block in the original loop, create a new copy,
  // and update the value map with the newly created values.
  for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
    BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".peel", F);
    NewBlocks.push_back(NewBB);

    // If an original block is an immediate child of the loop L, its copy
    // is a child of a ParentLoop after peeling. If a block is a child of
    // a nested loop, it is handled in the cloneLoop() call below.
    if (ParentLoop && LI->getLoopFor(*BB) == L)
      ParentLoop->addBasicBlockToLoop(NewBB, *LI);

    VMap[*BB] = NewBB;

    // If the dominator tree is available, insert nodes to represent the
    // cloned blocks.
    if (DT) {
      if (Header == *BB)
        DT->addNewBlock(NewBB, InsertTop);
      else {
        DomTreeNode *IDom = DT->getNode(*BB)->getIDom();
        // VMap must contain an entry for IDom, as the iteration order is RPO.
        DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDom->getBlock()]));
      }
    }
  }

  {
    // Identify what other metadata depends on the cloned version. After
    // cloning, replace the metadata with the corrected version for both
    // memory instructions and noalias intrinsics.
    std::string Ext = (Twine("Peel") + Twine(IterNumber)).str();
    cloneAndAdaptNoAliasScopes(LoopLocalNoAliasDeclScopes, NewBlocks,
                               Header->getContext(), Ext);
  }

  // Recursively create the new Loop objects for nested loops, if any,
  // to preserve LoopInfo.
  for (Loop *ChildLoop : *L) {
    cloneLoop(ChildLoop, ParentLoop, VMap, LI, nullptr);
  }

  // Hook up the control flow for the newly inserted blocks.
  // The new header is hooked up directly to the "top", which is either
  // the original loop preheader (for the first iteration) or the previous
  // iteration's exiting block (for every other iteration).
  InsertTop->getTerminator()->setSuccessor(0, cast<BasicBlock>(VMap[Header]));

  // Similarly, for the latch:
  // The original exiting edge is still hooked up to the loop exit.
  BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
  if (PeelLast) {
    // This is the last iteration and we definitely will go to the exit. Just
    // set both successors to InsertBot and let the branch be simplified later.
    assert(IterNumber == 0 && "Only peeling a single iteration implemented.");
    auto *LatchTerm = cast<BranchInst>(NewLatch->getTerminator());
    LatchTerm->setSuccessor(0, InsertBot);
    LatchTerm->setSuccessor(1, InsertBot);
  } else {
    auto *LatchTerm = cast<Instruction>(NewLatch->getTerminator());
    // The backedge now goes to the "bottom", which is either the loop's real
    // header (for the last peeled iteration) or the copied header of the next
    // iteration (for every other iteration).
    for (unsigned idx = 0, e = LatchTerm->getNumSuccessors(); idx < e; ++idx) {
      if (LatchTerm->getSuccessor(idx) == Header) {
        LatchTerm->setSuccessor(idx, InsertBot);
        break;
      }
    }
  }
  if (DT)
    DT->changeImmediateDominator(InsertBot, NewLatch);

  // The new copy of the loop body starts with a bunch of PHI nodes
  // that pick an incoming value from either the preheader, or the previous
  // loop iteration. Since this copy is no longer part of the loop, we
  // resolve this statically:
  if (PeelLast) {
    // For the last iteration, we introduce new phis for each header phi in
    // InsertTop, using the incoming value from the preheader for the original
    // preheader (when skipping the main loop) and the incoming value from the
    // latch for the latch (when continuing from the main loop).
    IRBuilder<> B(InsertTop, InsertTop->getFirstNonPHIIt());
    for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
      PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
      PHINode *PN = B.CreatePHI(NewPHI->getType(), 2);
      NewPHI->eraseFromParent();
      if (OrigPreHeader)
        PN->addIncoming(cast<PHINode>(&*I)->getIncomingValueForBlock(PreHeader),
                        OrigPreHeader);

      PN->addIncoming(cast<PHINode>(&*I)->getIncomingValueForBlock(Latch),
                      Latch);
      VMap[&*I] = PN;
    }
  } else {
    // For the first iteration, we use the value from the preheader directly.
    // For any other iteration, we replace the phi with the value generated by
    // the immediately preceding clone of the loop body (which represents
    // the previous iteration).
    for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
      PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
      if (IterNumber == 0) {
        VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader);
      } else {
        Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch);
        Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
        if (LatchInst && L->contains(LatchInst))
          VMap[&*I] = LVMap[LatchInst];
        else
          VMap[&*I] = LatchVal;
      }
      NewPHI->eraseFromParent();
    }
  }

  // Fix up the outgoing values - we need to add a value for the iteration
  // we've just created. Note that this must happen *after* the incoming
  // values are adjusted, since the value going out of the latch may also be
  // a value coming into the header.
  for (auto Edge : ExitEdges)
    for (PHINode &PHI : Edge.second->phis()) {
      Value *LatchVal = PHI.getIncomingValueForBlock(Edge.first);
      Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
      if (LatchInst && L->contains(LatchInst))
        LatchVal = VMap[LatchVal];
      PHI.addIncoming(LatchVal, cast<BasicBlock>(VMap[Edge.first]));
      SE.forgetLcssaPhiWithNewPredecessor(L, &PHI);
    }

  // LastValueMap is updated with the values for the current loop
  // which are used the next time this function is called.
  for (auto KV : VMap)
    LVMap[KV.first] = KV.second;
}

TargetTransformInfo::PeelingPreferences
llvm::gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
                               const TargetTransformInfo &TTI,
                               std::optional<bool> UserAllowPeeling,
                               std::optional<bool> UserAllowProfileBasedPeeling,
                               bool UnrollingSpecificValues) {
  TargetTransformInfo::PeelingPreferences PP;

  // Set the default values.
  PP.PeelCount = 0;
  PP.AllowPeeling = true;
  PP.AllowLoopNestsPeeling = false;
  PP.PeelLast = false;
  PP.PeelProfiledIterations = true;

  // Get the target-specific values.
  TTI.getPeelingPreferences(L, SE, PP);

  // User-specified values using cl::opt.
  if (UnrollingSpecificValues) {
    if (UnrollPeelCount.getNumOccurrences() > 0)
      PP.PeelCount = UnrollPeelCount;
    if (UnrollAllowPeeling.getNumOccurrences() > 0)
      PP.AllowPeeling = UnrollAllowPeeling;
    if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0)
      PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling;
  }

  // User-specified values provided by argument.
  if (UserAllowPeeling)
    PP.AllowPeeling = *UserAllowPeeling;
  if (UserAllowProfileBasedPeeling)
    PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;

  return PP;
}

/// Peel off the first \p PeelCount iterations of loop \p L.
///
/// Note that this does not peel them off as a single straight-line block.
/// Rather, each iteration is peeled off separately, and needs to check the
/// exit condition.
/// For loops that dynamically execute \p PeelCount iterations or less
/// this provides a benefit, since the peeled off iterations, which account
/// for the bulk of dynamic execution, can be further simplified by scalar
/// optimizations.
bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
                    ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC,
                    bool PreserveLCSSA, ValueToValueMapTy &LVMap) {
  assert(PeelCount > 0 && "Attempt to peel out zero iterations?");
  assert(canPeel(L) && "Attempt to peel a loop which is not peelable?");
  assert((!PeelLast || (canPeelLastIteration(*L, *SE) && PeelCount == 1)) &&
         "when peeling the last iteration, the loop must be supported and can "
         "only peel a single iteration");

  LoopBlocksDFS LoopBlocks(L);
  LoopBlocks.perform(LI);

  BasicBlock *Header = L->getHeader();
  BasicBlock *PreHeader = L->getLoopPreheader();
  BasicBlock *Latch = L->getLoopLatch();
  SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitEdges;
  L->getExitEdges(ExitEdges);

  // Remember the dominators of blocks we might reach through exits so we can
  // change them later. The immediate dominator of such a block might change,
  // because we add more routes that can lead to the exit: we can now reach it
  // from the peeled iterations too.
  DenseMap<BasicBlock *, BasicBlock *> NonLoopBlocksIDom;
  for (auto *BB : L->blocks()) {
    auto *BBDomNode = DT.getNode(BB);
    SmallVector<BasicBlock *, 16> ChildrenToUpdate;
    for (auto *ChildDomNode : BBDomNode->children()) {
      auto *ChildBB = ChildDomNode->getBlock();
      if (!L->contains(ChildBB))
        ChildrenToUpdate.push_back(ChildBB);
    }
    // The new idom of the block will be the nearest common dominator
    // of all copies of the previous idom. This is equivalent to the
    // nearest common dominator of the previous idom and the first latch,
    // which dominates all copies of the previous idom.
    BasicBlock *NewIDom = DT.findNearestCommonDominator(BB, Latch);
    for (auto *ChildBB : ChildrenToUpdate)
      NonLoopBlocksIDom[ChildBB] = NewIDom;
  }

  Function *F = Header->getParent();

  // Set up all the necessary basic blocks.
  BasicBlock *InsertTop;
  BasicBlock *InsertBot;
  BasicBlock *NewPreHeader = nullptr;
  DenseMap<Instruction *, Value *> ExitValues;
  if (PeelLast) {
    // It is convenient to split the single exit block from the latch into
    // 3 parts - two blocks to anchor the peeled copy of the loop body, and a
    // new final exit block.

    // Peeling the last iteration transforms:
    //
    // PreHeader:
    // ...
    // Header:
    //   LoopBody
    //   If (cond) goto Header
    // Exit:
    //
    // into
    //
    // Header:
    //   LoopBody
    //   If (cond) goto Header
    // InsertTop:
    //   LoopBody
    //   If (!cond) goto InsertBot
    // InsertBot:
    // Exit:
    // ...
    BasicBlock *Exit = L->getExitBlock();
    for (PHINode &P : Exit->phis())
      ExitValues[&P] = P.getIncomingValueForBlock(Latch);

    const SCEV *BTC = SE->getBackedgeTakenCount(L);

    InsertTop = SplitEdge(Latch, Exit, &DT, LI);
    InsertBot = SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI);

    InsertTop->setName(Exit->getName() + ".peel.begin");
    InsertBot->setName(Exit->getName() + ".peel.next");
    NewPreHeader = nullptr;

    // If the original loop may only execute a single iteration we need to
    // insert a trip count check and skip the original loop with the last
    // iteration peeled off if necessary.
    if (!SE->isKnownNonZero(BTC)) {
      NewPreHeader = SplitEdge(PreHeader, Header, &DT, LI);
      SCEVExpander Expander(*SE, Latch->getDataLayout(), "loop-peel");

      BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
      Value *BTCValue =
          Expander.expandCodeFor(BTC, BTC->getType(), PreHeaderBR);
      IRBuilder<> B(PreHeaderBR);
      Value *Cond =
          B.CreateICmpNE(BTCValue, ConstantInt::get(BTCValue->getType(), 0));
      B.CreateCondBr(Cond, NewPreHeader, InsertTop);
      PreHeaderBR->eraseFromParent();

      // PreHeader now dominates InsertTop.
      DT.changeImmediateDominator(InsertTop, PreHeader);
    }
  } else {
    // It is convenient to split the preheader into 3 parts - two blocks to
    // anchor the peeled copy of the loop body, and a new preheader for the
    // "real" loop.

    // Peeling the first iteration transforms:
    //
    // PreHeader:
    // ...
    // Header:
    //   LoopBody
    //   If (cond) goto Header
    // Exit:
    //
    // into
    //
    // InsertTop:
    //   LoopBody
    //   If (!cond) goto Exit
    // InsertBot:
    // NewPreHeader:
    // ...
    // Header:
    //   LoopBody
    //   If (cond) goto Header
    // Exit:
    //
    // Each following iteration will split the current bottom anchor in two,
    // and put the new copy of the loop body between these two blocks. That
    // is, after peeling another iteration from the example above, we'll
    // split InsertBot, and get:
    //
    // InsertTop:
    //   LoopBody
    //   If (!cond) goto Exit
    // InsertBot:
    //   LoopBody
    //   If (!cond) goto Exit
    // InsertBot.next:
    // NewPreHeader:
    // ...
    // Header:
    //   LoopBody
    //   If (cond) goto Header
    // Exit:
    //
    InsertTop = SplitEdge(PreHeader, Header, &DT, LI);
    InsertBot = SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI);
    NewPreHeader = SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI);

    InsertTop->setName(Header->getName() + ".peel.begin");
    InsertBot->setName(Header->getName() + ".peel.next");
    NewPreHeader->setName(PreHeader->getName() + ".peel.newph");
  }

  Instruction *LatchTerm = Latch->getTerminator();

  // If we have branch weight information, we'll want to update it for the
  // newly created branches.
  DenseMap<Instruction *, WeightInfo> Weights;
  initBranchWeights(Weights, L);

  // Identify the noalias metadata that is inside the loop: the associated
  // metadata must be cloned for each iteration.
  SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
  identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes);

  // For each peeled-off iteration, make a copy of the loop.
  ValueToValueMapTy VMap;
  for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {
    SmallVector<BasicBlock *, 8> NewBlocks;

    cloneLoopBlocks(L, Iter, PeelLast, InsertTop, InsertBot,
                    NewPreHeader ? PreHeader : nullptr, ExitEdges, NewBlocks,
                    LoopBlocks, VMap, LVMap, &DT, LI,
                    LoopLocalNoAliasDeclScopes, *SE);

    // Remap to use values from the current iteration instead of the
    // previous one.
    remapInstructionsInBlocks(NewBlocks, VMap);

    if (Iter == 0) {
      if (PeelLast) {
        // Adjust the exit condition so the loop exits one iteration early.
        // For now we simply subtract one from the second operand of the
        // exit condition. This relies on the peel count computation to
        // check that this is actually legal. In particular, it ensures that
        // the first operand of the compare is an AddRec with step 1 and we
        // execute more than one iteration.
        auto *Cmp =
            cast<ICmpInst>(L->getLoopLatch()->getTerminator()->getOperand(0));
        IRBuilder B(Cmp);
        Cmp->setOperand(
            1, B.CreateSub(Cmp->getOperand(1),
                           ConstantInt::get(Cmp->getOperand(1)->getType(), 1)));
      } else {
        // Update the IDoms of the blocks reachable through exits.
        for (auto BBIDom : NonLoopBlocksIDom)
          DT.changeImmediateDominator(BBIDom.first,
                                      cast<BasicBlock>(LVMap[BBIDom.second]));
      }
    }

#ifdef EXPENSIVE_CHECKS
    assert(DT.verify(DominatorTree::VerificationLevel::Fast));
#endif

    for (auto &[Term, Info] : Weights) {
      auto *TermCopy = cast<Instruction>(VMap[Term]);
      updateBranchWeights(TermCopy, Info);
    }

    // Remove the loop metadata from the latch branch instruction
    // because it is not the loop's latch branch anymore.
    auto *LatchTermCopy = cast<Instruction>(VMap[LatchTerm]);
    LatchTermCopy->setMetadata(LLVMContext::MD_loop, nullptr);

    InsertTop = InsertBot;
    InsertBot = SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI);
    InsertBot->setName(Header->getName() + ".peel.next");

    F->splice(InsertTop->getIterator(), F, NewBlocks[0]->getIterator(),
              F->end());
  }

  if (PeelLast) {
    // Now adjust the users of the original exit values by replacing them with
    // the exit value from the peeled iteration, and remove the original phis.
    for (const auto &[P, E] : ExitValues) {
      Instruction *ExitInst = dyn_cast<Instruction>(E);
      if (ExitInst && L->contains(ExitInst))
        P->replaceAllUsesWith(&*VMap[ExitInst]);
      else
        P->replaceAllUsesWith(E);
      P->eraseFromParent();
    }
    formLCSSA(*L, DT, LI, SE);
  } else {
    // Now adjust the phi nodes in the loop header to get their initial values
    // from the last peeled-off iteration instead of the preheader.
    for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
      PHINode *PHI = cast<PHINode>(I);
      Value *NewVal = PHI->getIncomingValueForBlock(Latch);
      Instruction *LatchInst = dyn_cast<Instruction>(NewVal);
      if (LatchInst && L->contains(LatchInst))
        NewVal = LVMap[LatchInst];

      PHI->setIncomingValueForBlock(NewPreHeader, NewVal);
    }
  }

  for (const auto &[Term, Info] : Weights) {
    setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
  }

  // Update the metadata recording the count of peeled-off iterations.
  unsigned AlreadyPeeled = 0;
  if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
    AlreadyPeeled = *Peeled;
  addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount);

  if (Loop *ParentLoop = L->getParentLoop())
    L = ParentLoop;

  // We modified the loop, update SE.
  SE->forgetTopmostLoop(L);
  SE->forgetBlockAndLoopDispositions();

#ifdef EXPENSIVE_CHECKS
  // Finally, the DomTree must be correct.
  assert(DT.verify(DominatorTree::VerificationLevel::Fast));
#endif

  // FIXME: Incrementally update loop-simplify.
  simplifyLoop(L, &DT, LI, SE, AC, nullptr, PreserveLCSSA);

  NumPeeled++;
  NumPeeledEnd += PeelLast;

  return true;
}